import argparse
import re

# NOTE: `HeadedParentedTree`, `convert_ptb_tree`, and `depth` are helpers
# from the surrounding discourse-parsing project and are assumed to be
# importable from it; only the standard library is imported here.


def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('ptb_file', help='PTB MRG file')
    args = parser.parse_args()

    with open(args.ptb_file) as f:
        doc = re.sub(r'\s+', ' ', f.read()).strip()
        trees = [
            HeadedParentedTree.fromstring('( ({}'.format(x))
            for x in re.split(r'\(\s*\(', doc) if x
        ]

        for t in trees:
            convert_ptb_tree(t)
            print("\n\n{}".format(t.pformat()))
            for subtree in t.subtrees():
                print("{}{}\t{}".format(
                    ' '.join(['' for x in range(depth(subtree))]),
                    subtree.label(), subtree.head_word()))
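

# A minimal, self-contained sketch (an assumption, not part of the original
# module) of how the tree splitting in main() works: PTB MRG files
# concatenate trees as "( (S ...)) ( (S ...))", so splitting the flattened
# text on "( (" and re-prepending "( (" recovers one balanced tree per
# sentence. Plain nltk.tree.Tree stands in for the project's
# HeadedParentedTree here.
def _demo_split_mrg():
    from nltk.tree import Tree

    doc = ('( (S (NP (DT The) (NN cat)) (VP (VBZ sleeps)))) '
           '( (S (NP (PRP It)) (VP (VBZ purrs))))')
    doc = re.sub(r'\s+', ' ', doc).strip()
    trees = [Tree.fromstring('( ({}'.format(x))
             for x in re.split(r'\(\s*\(', doc) if x]
    for t in trees:
        print(t.pformat())


# The `parse` method below is excerpted from the project's shift-reduce
# RST parser class; the class definition itself is not included here.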
    def parse(self, doc_dict, gold_actions=None, make_features=True):
        '''
        `doc_dict` is a dictionary with EDU segments, parse trees, etc.
        See `convert_rst_discourse_tb.py`.

        If `gold_actions` is specified, then the parser will behave as if in
        training mode.

        If `make_features` and `gold_actions` are specified, then the parser
        will yield (action, features) tuples instead of trees
        (e.g., to produce training examples).
        This will have no effect if `gold_actions` is not provided.
        Disabling `make_features` can be useful for debugging and testing.
        '''

        doc_id = doc_dict["doc_id"]
        logging.info('RST parsing, doc_id = {}'.format(doc_id))

        states = []
        completetrees = []
        tagged_edus = extract_tagged_doc_edus(doc_dict)

        queue = self.initialize_edu_data(tagged_edus)

        # If there is only one item on the queue to start, then make it a
        # finished tree so that parsing will complete immediately.
        # TODO add a unit test for this.
        if len(queue) == 1:
            logging.warning('There was only one EDU to parse. A very simple' +
                            ' tree will be returned. doc_id = {}'
                            .format(doc_id))
            new_tree = Tree.fromstring("(ROOT)")
            new_tree.append(queue[0]['tree'])
            queue[0]['tree'] = new_tree

        # precompute syntax tree objects so this only needs to be done once
        if 'syntax_trees_objs' not in doc_dict \
                or len(doc_dict['syntax_trees_objs']) \
                != len(doc_dict['syntax_trees']):
            doc_dict['syntax_trees_objs'] = []
            for tree_str in doc_dict['syntax_trees']:
                doc_dict['syntax_trees_objs'].append(
                    HeadedParentedTree.fromstring(tree_str))

        # initialize the stack
        stack = []

        prevact = ShiftReduceAction(type="S", label="text")

        # insert an initial state on the state list
        tmp_state = {"prevact": prevact,
                     "ucnt": 0,
                     "score": 0.0,  # log probability
                     "nsteps": 0,
                     "stack": stack,
                     "queue": queue}
        states.append(tmp_state)

        # loop while there are states to process
        while states:
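            # Best-first search: sort by score, prune to the beam of
            # at most max_states states, and expand the best one next.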
            states.sort(key=itemgetter("score"), reverse=True)
            states = states[:self.max_states]

            cur_state = states.pop(0)  # should maybe replace this with a deque
            logging.debug(("cur_state prevact = {}:{}, score = {}," +
                           " num. states = {}, doc_id = {}")
                          .format(cur_state["prevact"].type,
                                  cur_state["prevact"].label,
                                  cur_state["score"], len(states), doc_id))

            # check if the current state corresponds to a complete tree
            if len(cur_state["queue"]) == 0 and len(cur_state["stack"]) == 1:
                tree = cur_state["stack"][-1]["tree"]
                assert tree.label() == 'ROOT'

                # collapse the binarized ("*") nodes in the output
                output_tree = ParentedTree.fromstring(tree.pformat())
                collapse_binarized_nodes(output_tree)

                completetrees.append({"tree": output_tree,
                                      "score": cur_state["score"]})
                logging.debug('complete tree found, doc_id = {}'
                              .format(doc_id))

                # stop if we have found enough trees
                if gold_actions is not None or (len(completetrees) >=
                                                self.n_best):
                    break

                # otherwise, move on to the next best state
                continue

            # extract features
            feats = self.mkfeats(cur_state, doc_dict)

            # Compute the possible actions given this state.
            # During training, print them out.
            # During parsing, score them according to the model and sort.
            scored_acts = []
            if gold_actions is not None:
                # take the next action from gold_actions
                act = gold_actions.pop(0) if gold_actions else None
                if act is None:
                    logging.error('Ran out of gold actions for state %s and '
                                  'gold_actions %s', cur_state, gold_actions)
                    break

                assert act.type != 'S' or act.label == "text"

                if make_features:
                    if not (act == cur_state["prevact"] and act.type == 'U'):
                        yield ('{}:{}'.format(act.type, act.label), feats)

                scored_acts.append(ScoredAction(act, 0.0))  # logprob
            else:
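                # Vectorize the bag of feature strings and score every
                # candidate action with the trained classifier.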
                vectorizer = self.model.feat_vectorizer
                examples = skll.data.ExamplesTuple(
                    None, None, vectorizer.transform(Counter(feats)),
                    vectorizer)
                scores = [np.log(x) for x in self.model.predict(examples)[0]]

                # Convert the string labels from the classifier back into
                # ShiftReduceAction objects and sort them by their scores
                scored_acts = sorted(zip(self._get_model_actions(),
                                         scores),
                                     key=itemgetter(1),
                                     reverse=True)

            # If parsing, verify the validity of the actions.
            if gold_actions is None:
                scored_acts = [x for x in scored_acts
                               if self.is_valid_action(x[0], cur_state)]
            else:
                for x in scored_acts:
                    assert self.is_valid_action(x[0], cur_state)

            # Don't exceed the maximum number of actions
            # to consider for a parser state.
            scored_acts = scored_acts[:self.max_acts]

            while scored_acts:
                if self.max_acts > 1:
                    # Make copies of the input queue and stack.
                    # This is not necessary if we are doing greedy parsing.
                    # Note that we do not need to make deep copies because
                    # the reduce actions do not modify the subtrees.  They
                    # only create new trees that have them as children.
                    # This ends up making something like a parse forest.
                    queue = list(cur_state["queue"])
                    stack = list(cur_state["stack"])
                prevact = cur_state["prevact"]

                action, score = scored_acts.pop(0)

                # Add the newly created state
                tmp_state = {"prevact": action,
                             "ucnt": cur_state["ucnt"],
                             "score": cur_state["score"] + score,
                             "nsteps": cur_state["nsteps"] + 1,
                             "stack": stack,
                             "queue": queue}
                self.process_action(action, tmp_state)

                states.append(tmp_state)

        if not completetrees:
            logging.warning('No complete trees found. doc_id = {}'
                            .format(doc_dict['doc_id']))

            # Default to a flat tree if there is no complete parse.
            new_tree = Tree.fromstring("(ROOT)")
            for i in range(len(tagged_edus)):
                tmp_child = Tree.fromstring('(text)')
                tmp_child.append(i)
                new_tree.append(tmp_child)
            completetrees.append({"tree": new_tree, "score": 0.0})

        if gold_actions is None or not make_features:
            for t in completetrees:
                yield t
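

# Hypothetical usage sketch for the two modes of `parse` above (the
# `rst_parser`, `doc_dict`, and `actions` objects are assumptions standing
# in for objects from the surrounding project; see
# convert_rst_discourse_tb.py):
#
#     # Training mode: yields (action, features) pairs for building
#     # classifier training examples.
#     for action_str, feats in rst_parser.parse(doc_dict,
#                                               gold_actions=actions):
#         print(action_str, len(feats))
#
#     # Parsing mode: yields {"tree": ..., "score": ...} dicts, best first.
#     for result in rst_parser.parse(doc_dict):
#         print(result["score"], result["tree"].pformat())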
def extract_segmentation_features(doc_dict):
    '''
    This extracts features for use in the discourse segmentation CRF. Note
    that the CRF++ template uses the features of the current word plus the
    two previous and two next words for each word.

    :param doc_dict: A dictionary of edu_start_indices, tokens, syntax_trees,
                     token_tree_positions, and pos_tags for a document, as
                     extracted by convert_rst_discourse_tb.py.
    :returns: a list of lists of lists of features (one feature list per word
              per sentence), and a list of lists of labels (one label per word
              per sentence)
    '''

    labels_doc = []
    feat_lists_doc = []

    if 'edu_start_indices' in doc_dict:
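        # (sentence index, token index) pairs at which a new EDU begins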
        edu_starts = {(x[0], x[1]) for x in doc_dict['edu_start_indices']}
    else:
        # if none available, just say the whole document is one EDU
        edu_starts = {(0, 0)}

    for sent_num, (sent_tokens, tree_str, sent_tree_positions, pos_tags) \
            in enumerate(zip(doc_dict['tokens'],
                             doc_dict['syntax_trees'],
                             doc_dict['token_tree_positions'],
                             doc_dict['pos_tags'])):

        labels_sent = []
        feat_lists_sent = []

        tree = HeadedParentedTree.fromstring(tree_str)
        for token_num, (token, tree_position, pos_tag) \
                in enumerate(zip(sent_tokens, sent_tree_positions, pos_tags)):
            feats = []
            label = 'B-EDU' if (sent_num, token_num) in edu_starts else 'C-EDU'

            # POS tags and words for lexicalized parse nodes
            # from 3.2 of Bach et al., 2012.
            # preterminal node for the current word
            node_w = tree[tree_position]
            # node for the word to the right
            node_r = (tree[sent_tree_positions[token_num + 1]]
                      if token_num + 1 < len(sent_tree_positions) else None)

            node_p, ancestor_w, ancestor_r = None, None, None
            node_p_parent, node_p_right_sibling = None, None
            if node_r:
                # parent node: the first (lowest) common ancestor of
                # node_w and node_r
                node_p = find_first_common_ancestor(node_w, node_r)
                node_p_treeposition = node_p.treeposition()
                node_p_len = len(node_p_treeposition)
                # child subtree of node_p that includes node_w
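                # (a treeposition is a tuple of child indices, so index
                # node_p_len selects the step just below node_p)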
                ancestor_w = node_p[node_w.treeposition()[node_p_len]]
                # child subtree of node_p that includes node_r
                ancestor_r = node_p[node_r.treeposition()[node_p_len]]
                node_p_parent = node_p.parent()
                node_p_right_sibling = node_p.right_sibling()

            # now make the list of features
            feats.append(token.lower())
            feats.append(pos_tag)
            feats.extend(
                parse_node_features([
                    node_p, ancestor_w, ancestor_r, node_p_parent,
                    node_p_right_sibling
                ]))

            feat_lists_sent.append(feats)
            labels_sent.append(label)
        feat_lists_doc.append(feat_lists_sent)
        labels_doc.append(labels_sent)

    return feat_lists_doc, labels_doc
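

# Hypothetical usage sketch (an assumption; `doc_dict` is a document
# dictionary as produced by convert_rst_discourse_tb.py). It shows the
# per-sentence, per-word shape of the returned CRF features and labels:
#
#     feat_lists, labels = extract_segmentation_features(doc_dict)
#     for sent_feats, sent_labels in zip(feat_lists, labels):
#         for word_feats, label in zip(sent_feats, sent_labels):
#             # word_feats[0] is the lowercased token, word_feats[1] its
#             # POS tag; label is 'B-EDU' or 'C-EDU'
#             print(label, word_feats[0], word_feats[1])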