import argparse
import logging
import re
from collections import Counter
from operator import itemgetter

import numpy as np
import skll.data
from nltk.tree import ParentedTree, Tree

# The following project-local helpers are assumed to be defined elsewhere in
# this package: HeadedParentedTree, ShiftReduceAction, ScoredAction,
# collapse_binarized_nodes, convert_ptb_tree, extract_tagged_doc_edus,
# find_first_common_ancestor, and parse_node_features.


def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('ptb_file', help='PTB MRG file')
    args = parser.parse_args()

    with open(args.ptb_file) as f:
        doc = re.sub(r'\s+', ' ', f.read()).strip()

    # Each tree in an MRG file starts with "( (", so split the document on
    # that pattern and restore the stripped prefix before parsing.
    trees = [HeadedParentedTree.fromstring('( ({}'.format(x))
             for x in re.split(r'\(\s*\(', doc) if x]

    for t in trees:
        convert_ptb_tree(t)
        print("\n\n{}".format(t.pformat()))
        for subtree in t.subtrees():
            # Indent each label by the subtree's depth, then print the
            # label and its head word.
            print("{}{}\t{}".format(
                ' '.join(['' for x in range(depth(subtree))]),
                subtree.label(),
                subtree.head_word()))
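# `main()` relies on a module-level `depth()` helper that is not shown here.
# A minimal sketch of what it might look like, assuming depth is simply the
# length of the subtree's tree position (so the root has depth 0):
def depth(subtree):
    """Return the depth of `subtree` within its containing tree."""
    return len(subtree.treeposition())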
def parse(self, doc_dict, gold_actions=None, make_features=True):
    '''
    `doc_dict` is a dictionary with EDU segments, parse trees, etc.
    See `convert_rst_discourse_tb.py`.

    If `gold_actions` is specified, then the parser will behave as if in
    training mode.

    If `make_features` and `gold_actions` are specified, then the parser
    will yield (action, features) tuples instead of trees (e.g., to
    produce training examples).  This will have no effect if
    `gold_actions` is not provided.  Disabling `make_features` can be
    useful for debugging and testing.
    '''
    doc_id = doc_dict["doc_id"]
    logging.info('RST parsing, doc_id = {}'.format(doc_id))

    states = []
    completetrees = []
    tagged_edus = extract_tagged_doc_edus(doc_dict)

    queue = self.initialize_edu_data(tagged_edus)

    # If there is only one item on the queue to start, then make it a
    # finished tree so that parsing will complete immediately.
    # TODO: add a unit test for this.
    if len(queue) == 1:
        logging.warning('There was only one EDU to parse. A very simple'
                        ' tree will be returned. doc_id = {}'
                        .format(doc_id))
        new_tree = Tree.fromstring("(ROOT)")
        new_tree.append(queue[0]['tree'])
        queue[0]['tree'] = new_tree

    # Precompute syntax tree objects so this only needs to be done once.
    if ('syntax_trees_objs' not in doc_dict
            or len(doc_dict['syntax_trees_objs'])
            != len(doc_dict['syntax_trees'])):
        doc_dict['syntax_trees_objs'] = []
        for tree_str in doc_dict['syntax_trees']:
            doc_dict['syntax_trees_objs'].append(
                HeadedParentedTree.fromstring(tree_str))

    # initialize the stack
    stack = []

    prevact = ShiftReduceAction(type="S", label="text")

    # insert an initial state on the state list
    tmp_state = {"prevact": prevact,
                 "ucnt": 0,
                 "score": 0.0,  # log probability
                 "nsteps": 0,
                 "stack": stack,
                 "queue": queue}
    states.append(tmp_state)

    # loop while there are states to process
    while states:
        # Keep only the best `max_states` states, best first.
        states.sort(key=itemgetter("score"), reverse=True)
        states = states[:self.max_states]

        cur_state = states.pop(0)  # should maybe replace this with a deque
        logging.debug('cur_state prevact = {}:{}, score = {},'
                      ' num. states = {}, doc_id = {}'
                      .format(cur_state["prevact"].type,
                              cur_state["prevact"].label,
                              cur_state["score"],
                              len(states),
                              doc_id))

        # check if the current state corresponds to a complete tree
        if len(cur_state["queue"]) == 0 and len(cur_state["stack"]) == 1:
            tree = cur_state["stack"][-1]["tree"]
            assert tree.label() == 'ROOT'

            # Collapse binary branching * rules in the output.
            # (`pformat` returns the tree as a string; in NLTK 3,
            # `pprint` prints it instead.)
            output_tree = ParentedTree.fromstring(tree.pformat())
            collapse_binarized_nodes(output_tree)

            completetrees.append({"tree": output_tree,
                                  "score": cur_state["score"]})
            logging.debug('complete tree found, doc_id = {}'.format(doc_id))

            # stop if we have found enough trees
            if gold_actions is not None \
                    or len(completetrees) >= self.n_best:
                break

            # otherwise, move on to the next best state
            continue

        # extract features
        feats = self.mkfeats(cur_state, doc_dict)

        # Compute the possible actions given this state.
        # During training, yield them (with features) as training examples.
        # During parsing, score them according to the model and sort.
        scored_acts = []
        if gold_actions is not None:
            # take the next action from gold_actions
            act = gold_actions.pop(0) if gold_actions else None
            if act is None:
                logging.error('Ran out of gold actions for state %s and'
                              ' gold_actions %s', cur_state, gold_actions)
                break

            assert act.type != 'S' or act.label == "text"

            if make_features:
                if not (act == cur_state["prevact"] and act.type == 'U'):
                    yield ('{}:{}'.format(act.type, act.label), feats)

            scored_acts.append(ScoredAction(act, 0.0))  # log probability
        else:
            vectorizer = self.model.feat_vectorizer
            examples = skll.data.ExamplesTuple(
                None, None, vectorizer.transform(Counter(feats)),
                vectorizer)
            scores = [np.log(x) for x in self.model.predict(examples)[0]]

            # Convert the string labels from the classifier back into
            # ShiftReduceAction objects and sort them by their scores.
            scored_acts = sorted(zip(self._get_model_actions(), scores),
                                 key=itemgetter(1),
                                 reverse=True)

        # If parsing, verify the validity of the actions.
        if gold_actions is None:
            scored_acts = [x for x in scored_acts
                           if self.is_valid_action(x[0], cur_state)]
        else:
            for x in scored_acts:
                assert self.is_valid_action(x[0], cur_state)

        # Don't exceed the maximum number of actions to consider for a
        # parser state.
        scored_acts = scored_acts[:self.max_acts]

        while scored_acts:
            if self.max_acts > 1:
                # Make copies of the input queue and stack.
                # This is not necessary if we are doing greedy parsing.
                # Note that we do not need to make deep copies because
                # the reduce actions do not modify the subtrees.  They
                # only create new trees that have them as children.
                # This ends up making something like a parse forest.
                queue = list(cur_state["queue"])
                stack = list(cur_state["stack"])
                prevact = cur_state["prevact"]

            action, score = scored_acts.pop(0)

            # add the newly created state
            tmp_state = {"prevact": action,
                         "ucnt": cur_state["ucnt"],
                         "score": cur_state["score"] + score,
                         "nsteps": cur_state["nsteps"] + 1,
                         "stack": stack,
                         "queue": queue}
            self.process_action(action, tmp_state)
            states.append(tmp_state)

    if not completetrees:
        logging.warning('No complete trees found. doc_id = {}'
                        .format(doc_dict['doc_id']))

        # Default to a flat tree if there is no complete parse.
        # Each (text) leaf holds the index of its EDU.
        new_tree = Tree.fromstring("(ROOT)")
        for i in range(len(tagged_edus)):
            tmp_child = Tree.fromstring('(text)')
            tmp_child.append(i)
            new_tree.append(tmp_child)
        completetrees.append({"tree": new_tree, "score": 0.0})

    if gold_actions is None or not make_features:
        for t in completetrees:
            yield t
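# A minimal usage sketch for `parse()` in decoding mode (no gold actions).
# This driver is hypothetical and not part of the original module:
# `rst_parser` is assumed to be an instance of the class defining `parse()`,
# already configured with a trained model.
def parse_documents(rst_parser, doc_dicts):
    best_trees = {}
    for doc_dict in doc_dicts:
        # `parse()` yields dicts with "tree" and "score" (log probability)
        # keys; keep the highest-scoring tree for each document.
        results = list(rst_parser.parse(doc_dict))
        best = max(results, key=lambda r: r["score"])
        best_trees[doc_dict["doc_id"]] = best["tree"]
    return best_trees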
def extract_segmentation_features(doc_dict):
    '''
    This extracts features for use in the discourse segmentation CRF.
    Note that the CRF++ template makes it so that the features for the
    current word and the 2 previous and 2 next words are used for each
    word.

    :param doc_dict: A dictionary of edu_start_indices, tokens,
                     syntax_trees, token_tree_positions, and pos_tags for
                     a document, as extracted by
                     convert_rst_discourse_tb.py.

    :returns: a list of lists of lists of features (one feature list per
              word per sentence), and a list of lists of labels (one
              label per word per sentence)
    '''
    labels_doc = []
    feat_lists_doc = []

    if 'edu_start_indices' in doc_dict:
        edu_starts = {(x[0], x[1]) for x in doc_dict['edu_start_indices']}
    else:
        # if none are available, just say the whole document is one EDU
        edu_starts = {(0, 0)}

    for sent_num, (sent_tokens, tree_str, sent_tree_positions, pos_tags) \
            in enumerate(zip(doc_dict['tokens'],
                             doc_dict['syntax_trees'],
                             doc_dict['token_tree_positions'],
                             doc_dict['pos_tags'])):
        labels_sent = []
        feat_lists_sent = []
        tree = HeadedParentedTree.fromstring(tree_str)

        for token_num, (token, tree_position, pos_tag) \
                in enumerate(zip(sent_tokens,
                                 sent_tree_positions,
                                 pos_tags)):
            feats = []
            label = ('B-EDU' if (sent_num, token_num) in edu_starts
                     else 'C-EDU')

            # POS tags and words for lexicalized parse nodes,
            # from Section 3.2 of Bach et al. (2012).

            # preterminal node for the current word
            node_w = tree[tree_position]

            # node for the word to the right
            node_r = (tree[sent_tree_positions[token_num + 1]]
                      if token_num + 1 < len(sent_tree_positions)
                      else None)

            # parent node
            node_p, ancestor_w, ancestor_r = None, None, None
            node_p_parent, node_p_right_sibling = None, None
            if node_r:
                node_p = find_first_common_ancestor(node_w, node_r)
                node_p_treeposition = node_p.treeposition()
                node_p_len = len(node_p_treeposition)

                # child subtree of node_p that includes node_w
                ancestor_w = node_p[node_w.treeposition()[node_p_len]]

                # child subtree of node_p that includes node_r
                ancestor_r = node_p[node_r.treeposition()[node_p_len]]

                node_p_parent = node_p.parent()
                node_p_right_sibling = node_p.right_sibling()

            # now make the list of features
            feats.append(token.lower())
            feats.append(pos_tag)
            feats.extend(parse_node_features([node_p,
                                              ancestor_w,
                                              ancestor_r,
                                              node_p_parent,
                                              node_p_right_sibling]))

            feat_lists_sent.append(feats)
            labels_sent.append(label)

        feat_lists_doc.append(feat_lists_sent)
        labels_doc.append(labels_sent)

    return feat_lists_doc, labels_doc
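# The docstring above references CRF++. As a hedged illustration (this helper
# is an assumption, not part of the original module), the nested lists
# returned by `extract_segmentation_features()` map directly onto the
# standard CRF++ training format: one token per line, tab-separated feature
# columns with the label in the last column, and a blank line between
# sentences.
def write_crfpp_training_file(feat_lists_doc, labels_doc, path):
    with open(path, 'w') as out_file:
        for feat_lists_sent, labels_sent in zip(feat_lists_doc, labels_doc):
            for feats, label in zip(feat_lists_sent, labels_sent):
                out_file.write('\t'.join(feats + [label]) + '\n')
            # blank line to separate sentences
            out_file.write('\n')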