def traverse(node):
    """Recursively convert a DOM element into a ``Tree``.

    A node whose first child is a ``w`` element becomes a one-leaf tree
    ``Tree(node.tagName, [(word, tag)])``. Any other node becomes a tree
    built from its converted children, after which the optional clitic
    and verb-part joining steps are applied.

    ``self`` (``_pos_map``, ``_join_clitics``, ``_join_verb_parts``,
    ``_tokenizer``) and ``Tree`` are read from the enclosing scope.

    Parameters
    ----------

    node: DOM element.
            Must expose ``childNodes`` and ``tagName``; ``w`` children must
            expose ``getAttribute``. Presumably an ``xml.dom.minidom``
            element -- confirm against the caller.

    Returns
    -------

    tree: Tree or None.
            ``None`` when ``node`` has no child nodes.
    """

    def extract_tags(W):
        """Collect the tag attributes of a ``w`` element in a fixed order."""
        # The first slot is always present; None stands in for a missing 'lc'.
        pos = [W.getAttribute('lc') or None]
        if W.getAttribute('clitic') in {'ezafe', 'pronominal', 'verb',
                                        'prep', 'adv', 'det'}:
            pos.append(W.getAttribute('clitic'))
        for name in ('ne_sort', 'n_type', 'ya_type', 'ke_type', 'type',
                     'kind'):
            value = W.getAttribute(name)
            if value:
                pos.append(value)
        return pos

    def clitic_join(tree, clitic):
        """Glue the clitic's word onto the right-most leaf below ``tree``."""
        # Walk down the last child until the last element is a leaf tuple.
        while isinstance(tree[-1], Tree):
            tree = tree[-1]
        if clitic[0][0][0] == 'ا':
            # NOTE(review): the prefix literal reads as an empty string here;
            # confirm that a zero-width character was not lost from it.
            clitic[0] = ('' + clitic[0][0], clitic[0][1])
        # The merged leaf takes the clitic's tag.
        tree[-1] = (tree[-1][0] + clitic[0][0], clitic[0][1])
        tree.set_label('CLITICS')

    def join_after_verb(tree):
        """Merge the second-to-last leaf's word into ``tree[1][0]``."""
        leaves = tree.leaves()
        if not (len(leaves) > 1
                and leaves[-1][0] in self._tokenizer.after_verbs
                and leaves[-2][0] in self._tokenizer.verbe):
            return
        tree[1][0] = (tree[0].leaves()[-1][0] + ' ' + tree[1][0][0],
                      tree[1][0][1])
        # Find the subtree that holds the second-to-last leaf and drop the
        # one-leaf tree wrapping it, since its word now lives in tree[1][0].
        path = tree.leaf_treeposition(len(leaves) - 2)
        leaf, label = tree.pos()[-2]
        holder = tree
        while len(path) > 2:
            holder = holder[path[0]]
            path = path[1:]
        holder.remove(Tree(label, [leaf]))

    if not node.childNodes:
        return None

    first = node.childNodes[0]
    if first.tagName == 'w':
        pos = extract_tags(first)
        # NOTE(review): the replacement literal may have lost a zero-width
        # character after 'می'; confirm against the intended orthography.
        word = first.childNodes[0].data.replace('می ', 'می')
        return Tree(node.tagName, [(word, self._pos_map(pos))])

    # For 'S' nodes the first two child nodes are not converted.
    childs = node.childNodes[2:] if node.tagName == 'S' else node.childNodes
    # Build a filtered copy instead of removing while iterating: in-place
    # removal skips elements and would also edit the DOM's own child list.
    childs = [child for child in childs if child.childNodes]
    tree = Tree(node.tagName, [traverse(child) for child in childs])

    if (self._join_clitics and len(tree) > 1
            and isinstance(tree[1], Tree)
            and tree[1].label() == 'CLITIC'
            and tree[1][0][1] not in {'P', 'V'}):
        clitic = tree[-1]
        # Hoist the host's children one level up, then attach the clitic.
        tree = Tree(tree.label(), list(tree[0]))
        clitic_join(tree, clitic)

    if (self._join_verb_parts and len(tree) > 1
            and isinstance(tree[1], Tree) and isinstance(tree[0], Tree)
            and tree[0].label() == 'AUX'
            and tree[0][0][0] in self._tokenizer.before_verbs):
        # Prefix the auxiliary's word to the following verb and drop AUX.
        tree[1][0] = (tree[0][0][0] + ' ' + tree[1][0][0], tree[1][0][1])
        tree.remove(tree[0])

    if self._join_verb_parts:
        # Deliberately applied twice in succession; presumably the second
        # pass covers a further trailing part -- confirm before reducing.
        join_after_verb(tree)
        join_after_verb(tree)

    return tree
def traverse(node):
    """Recursively convert a DOM element into a ``Tree``.

    A node whose first child is a ``w`` element becomes a one-leaf tree
    ``Tree(node.tagName, [(word, tag)])``. Any other node becomes a tree
    built from its converted children, after which the optional clitic
    and verb-part joining steps are applied.

    ``self`` (``_pos_map``, ``_join_clitics``, ``_join_verb_parts``,
    ``_tokenizer``) and ``Tree`` are read from the enclosing scope.

    Parameters
    ----------

    node: DOM element.
            Must expose ``childNodes`` and ``tagName``; ``w`` children must
            expose ``getAttribute``. Presumably an ``xml.dom.minidom``
            element -- confirm against the caller.

    Returns
    -------

    tree: Tree or None.
            ``None`` when ``node`` has no child nodes.
    """

    def extract_tags(W):
        """Collect the tag attributes of a ``w`` element in a fixed order."""
        # The first slot is always present; None stands in for a missing 'lc'.
        pos = [W.getAttribute('lc') or None]
        if W.getAttribute('clitic') in {'ezafe', 'pronominal', 'verb',
                                        'prep', 'adv', 'det'}:
            pos.append(W.getAttribute('clitic'))
        for name in ('ne_sort', 'n_type', 'ya_type', 'ke_type', 'type',
                     'kind'):
            value = W.getAttribute(name)
            if value:
                pos.append(value)
        return pos

    def clitic_join(tree, clitic):
        """Glue the clitic's word onto the right-most leaf below ``tree``."""
        # Walk down the last child until the last element is a leaf tuple.
        while isinstance(tree[-1], Tree):
            tree = tree[-1]
        if clitic[0][0][0] == 'ا':
            # NOTE(review): the prefix literal reads as an empty string here;
            # confirm that a zero-width character was not lost from it.
            clitic[0] = ('' + clitic[0][0], clitic[0][1])
        # The merged leaf takes the clitic's tag.
        tree[-1] = (tree[-1][0] + clitic[0][0], clitic[0][1])
        tree.set_label('CLITICS')

    def join_after_verb(tree):
        """Merge the second-to-last leaf's word into ``tree[1][0]``."""
        leaves = tree.leaves()
        if not (len(leaves) > 1
                and leaves[-1][0] in self._tokenizer.after_verbs
                and leaves[-2][0] in self._tokenizer.verbe):
            return
        tree[1][0] = (tree[0].leaves()[-1][0] + ' ' + tree[1][0][0],
                      tree[1][0][1])
        # Find the subtree that holds the second-to-last leaf and drop the
        # one-leaf tree wrapping it, since its word now lives in tree[1][0].
        path = tree.leaf_treeposition(len(leaves) - 2)
        leaf, label = tree.pos()[-2]
        holder = tree
        while len(path) > 2:
            holder = holder[path[0]]
            path = path[1:]
        holder.remove(Tree(label, [leaf]))

    if not node.childNodes:
        return None

    first = node.childNodes[0]
    if first.tagName == 'w':
        pos = extract_tags(first)
        # NOTE(review): the replacement literal may have lost a zero-width
        # character after 'می'; confirm against the intended orthography.
        word = first.childNodes[0].data.replace('می ', 'می')
        return Tree(node.tagName, [(word, self._pos_map(pos))])

    # For 'S' nodes the first two child nodes are not converted.
    childs = node.childNodes[2:] if node.tagName == 'S' else node.childNodes
    # Build a filtered copy instead of removing while iterating: in-place
    # removal skips elements and would also edit the DOM's own child list.
    childs = [child for child in childs if child.childNodes]
    tree = Tree(node.tagName, [traverse(child) for child in childs])

    if (self._join_clitics and len(tree) > 1
            and isinstance(tree[1], Tree)
            and tree[1].label() == 'CLITIC'
            and tree[1][0][1] not in {'P', 'V'}):
        clitic = tree[-1]
        # Hoist the host's children one level up, then attach the clitic.
        tree = Tree(tree.label(), list(tree[0]))
        clitic_join(tree, clitic)

    if (self._join_verb_parts and len(tree) > 1
            and isinstance(tree[1], Tree) and isinstance(tree[0], Tree)
            and tree[0].label() == 'AUX'
            and tree[0][0][0] in self._tokenizer.before_verbs):
        # Prefix the auxiliary's word to the following verb and drop AUX.
        tree[1][0] = (tree[0][0][0] + ' ' + tree[1][0][0], tree[1][0][1])
        tree.remove(tree[0])

    if self._join_verb_parts:
        # Deliberately applied twice in succession; presumably the second
        # pass covers a further trailing part -- confirm before reducing.
        join_after_verb(tree)
        join_after_verb(tree)

    return tree
def coref_replace(event_dict, key):
    """
    Function to replace pronouns with the referenced noun phrase. Iterates
    over each sentence in a news story and pulls coreference information
    from the applicable sentence, even if it is from another sentence. Also
    keeps track of any changes in indexes made by replacing pronouns, i.e.,
    the reference is longer than the pronoun so the tree index changes for
    future references. Filters coreferences on various dimensions to ensure
    only "good" coreferences are replaced. The default behavior is to do no
    replacement rather than a bad replacement. The function does not return
    a value, instead the event_dict is updated with the new parse tree
    containing the coref information.

    A coreference pair is skipped when the pronoun text and the reference
    text share a word (substring match in either direction), or when
    ``pronoun[4] - pronoun[3]`` is greater than 1.

    ``copy`` and ``Tree`` are read from the module scope.

    Parameters
    ----------

    event_dict: Dictionary.
                Dictionary of sentence information, such as produced by
                utilities.parse_sents(). ``event_dict[key]['sent_info']``
                must hold ``'sents'`` and, optionally, ``'coref_info'``.

    key: String.
            ID of the event or news story being processed.
    """
    #TODO: This could use some major refactoring.
    story_info = event_dict[key]['sent_info']
    if 'coref_info' not in story_info:
        return

    sent_info = story_info['sents']
    coref_info = story_info['coref_info']

    for sent in coref_info:
        for coref in coref_info[sent]['corefs']:
            pronoun = coref[0]
            ref = coref[1]

            # Filters: leave the sentence untouched rather than risk a bad
            # replacement.
            if any(word in ref[0] for word in pronoun[0].split()):
                continue
            if any(word in pronoun[0] for word in ref[0].split()):
                continue
            if pronoun[4] - pronoun[3] > 1:
                continue

            try:
                #Getting the stuff for pronouns
                if 'coref_tree' in sent_info[pronoun[1]]:
                    pronoun_sent = copy.deepcopy(
                        sent_info[pronoun[1]]['coref_tree'])
                else:
                    pronoun_sent = copy.deepcopy(
                        sent_info[pronoun[1]]['parse_tree'])
                    pronoun_sent = Tree(pronoun_sent)
                pro_shift = coref_info[pronoun[1]]['shift']

                #Getting stuff for the reference
                if 'coref_tree' in sent_info[ref[1]]:
                    coref_sent = sent_info[ref[1]]['coref_tree']
                else:
                    coref_sent = Tree(sent_info[ref[1]]['parse_tree'])
                ref_shift = coref_info[ref[1]]['shift']

                #Actually replacing the pronoun
                try:
                    pronoun_pos = pronoun_sent.leaf_treeposition(
                        pronoun[3] + pro_shift)

                    #Hunting for the right pronoun: try one leaf to the
                    #left, then one to the right, and record the drift.
                    if pronoun_sent[pronoun_pos] != pronoun[0]:
                        left_pos = pronoun_sent.leaf_treeposition(
                            pronoun[3] + (pro_shift - 1))
                        if pronoun_sent[left_pos] == pronoun[0]:
                            pronoun_pos = left_pos
                            coref_info[pronoun[1]]['shift'] -= 1
                        else:
                            right_pos = pronoun_sent.leaf_treeposition(
                                pronoun[3] + (pro_shift + 1))
                            if pronoun_sent[right_pos] == pronoun[0]:
                                pronoun_pos = right_pos
                                coref_info[pronoun[1]]['shift'] += 1
                            else:
                                # Pronoun not found: give up on the rest of
                                # this sentence's coreferences.
                                break

                    #Hunting for the right coref
                    original_coref_index = coref_sent.leaf_treeposition(
                        ref[3])[:-2]
                    original_text = ' '.join(
                        coref_sent[original_coref_index].leaves())
                    if ref[0] in original_text:
                        # Covers both the exact and the substring match.
                        coref_pos = original_coref_index
                    else:
                        # NOTE(review): the shifted position is used even
                        # when the reference text is not found there; the
                        # intent may have been to skip -- confirm.
                        coref_pos = coref_sent.leaf_treeposition(
                            ref[3] + ref_shift)[:-2]

                    #Found everything, now replace
                    coref_tree = Tree('COREF', [coref_sent[coref_pos]])
                    pronoun_sent[pronoun_pos[:-1]] = coref_tree
                except IndexError:
                    #TODO: Should this use the original sentence rather
                    #than possibly bad coreferences?
                    print("""Key {}, sentence {} has a problem with the corefencing. Breaking and moving on.\n""".format(key, sent))
                    break

                #Recording the shift length for the pronoun replacement
                if len(coref_tree.leaves()) > 2:
                    coref_info[pronoun[1]]['shift'] += coref_tree.height()
                coref_info[pronoun[1]]['errors'].append(False)

                # Only publish the rewritten tree while the sentence has no
                # recorded errors.
                if not any(coref_info[pronoun[1]]['errors']):
                    if pronoun_sent != sent_info[sent]['parse_tree']:
                        sent_info[sent]['coref_tree'] = pronoun_sent
            except RuntimeError as e:
                print('There was an error. {}'.format(e))
                coref_info[pronoun[1]]['errors'].append(True)