def conjunction(dep_graph: DependencyGraph):
    """
    Handle coordination ("I like apples, bananas and oranges":
    conj:and/or with punct) by finding every conjunction root and
    processing it, then handling head conjunctions.

    TODO: nested conjunctions are not handled yet; they should be
    processed bottom-up.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """

    def _has_conj_edge(edges):
        # True if any incident relation is a conj:* relation
        return any(rel.startswith("conj") for _, rel in edges)

    # a conjunction root has conj children but no conj parent
    conj_roots = [
        node for node in dep_graph.nodes()
        if not _has_conj_edge(dep_graph.parents(node))
        and _has_conj_edge(dep_graph.children(node))
    ]

    for conj_root in conj_roots:
        logger.debug("found the root of conjunction")
        logger.debug(str(conj_root))
        process_conjunction(dep_graph, conj_root)

    process_head_conj(dep_graph)
def parallel_list(dep_graph: DependencyGraph, oia_graph: OIAGraph,
                  context: UD2OIAContext):
    """
    Turn every "list" dependency into an OIA LIST predicate whose
    arguments are the list members in sentence order.

    :param dep_graph:
    :param oia_graph:
    :return:
    """
    phrases = []
    for head in dep_graph.nodes():
        members = [
            child for child, rel in dep_graph.children(
                head, filter=lambda n, l: "list" in l)
        ]
        if members:
            members.append(head)
            members.sort(key=lambda item: item.LOC)
            phrases.append(members)

    for members in phrases:
        list_pred = oia_graph.add_aux("LIST")
        # arguments are numbered from 1 in sentence order
        for position, member in enumerate(members, start=1):
            arg_node = oia_graph.add_words(member.position)
            oia_graph.add_argument(list_pred, arg_node, position)
def and_or_conjunction(dep_graph: DependencyGraph, oia_graph: OIAGraph,
                       context: UD2OIAContext):
    """
    Coordination: "I like apples, bananas and oranges" (conj:and/or with
    punct). Build one OIA predicate per conjunction head and attach each
    arg_con child as a numbered argument.

    :param dep_graph: dependency graph carrying arg_con relations
    :param oia_graph: OIA graph receiving the conjunction structure
    :param context: shared UD->OIA conversion context (unused here)
    :return: None
    """
    for node in dep_graph.nodes():
        conj_components = list(
            dep_graph.children(node,
                               filter=lambda n, l: l.startswith("arg_con")))

        if not conj_components:
            continue

        oia_conj_root_node = oia_graph.add_words(node.position)

        for child, rels in conj_components:
            oia_child_node = oia_graph.add_words(child.position)
            # rels is a dict-like mapping whose values carry the argument
            # index; dict views are not subscriptable in Python 3, so the
            # original `rels.values()[0]` raised TypeError — take the
            # first value via an iterator instead.
            arg_index = int(next(iter(rels.values())))

            oia_graph.add_argument(oia_conj_root_node, oia_child_node,
                                   arg_index)
def goeswith(dep_graph: DependencyGraph):
    """
    Merge words linked by "goeswith" (parts of one badly-tokenized word)
    into a single dependency node.

    :param dep_graph:
    :param oia_graph:
    :return:
    """
    phrases = []
    for head in dep_graph.nodes():
        parts = [
            child for child, rel in dep_graph.children(
                head, filter=lambda n, l: "goeswith" in l)
        ]
        if not parts:
            continue
        parts.append(head)
        parts.sort(key=lambda item: item.LOC)
        phrases.append(parts)

    for parts in phrases:
        # keep the last informative POS tag of the span; fall back to "X"
        upos = next((p.UPOS for p in reversed(parts) if p.UPOS != "X"), "X")

        merged = merge_dep_nodes(parts, UPOS=upos, LOC=parts[-1].LOC)
        dep_graph.replace_nodes(parts, merged)
def be_adp_phrase(dep_graph: DependencyGraph):
    """
    Merge a copular "be" with the adposition(s) of its head into one
    AUX node, e.g. "is for xxx" -> "is for".

    example: is for xxx
    this should be not applied:
    1. if xxx is adj, then be_adj_verb will be applied;
    2. if xxx is NOUN, then copula_phrase will be applied
    note that there may be multiple adp: the insurgency is out of the picture
    :param dep_graph:
    :param oia_graph:
    :return:
    """
    # pattern: X --cop--> be(AUX) and X --case--> ADP, i.e. "X is for ..."
    pattern = DependencyGraph()

    some_node = pattern.create_node()
    adp_node = pattern.create_node(UPOS="ADP")
    be_node = pattern.create_node(UPOS="AUX")

    pattern.add_dependency(some_node, be_node, r'cop')
    pattern.add_dependency(some_node, adp_node, r'case')

    verb_phrases = []

    for match in dep_graph.match(pattern):

        dep_be_node = match[be_node]
        dep_some_node = match[some_node]

        # collect every ADP case marker of the head
        # ("out of" contributes two ADP nodes)
        dep_adp_nodes = [
            n for n, l in dep_graph.children(
                dep_some_node,
                filter=lambda n, l: "case" in l and n.UPOS == "ADP")
        ]

        # every ADP must sit between "be" and the head ("is out of X");
        # otherwise this is not a be+adp predicate
        if not all(dep_be_node.LOC < x.LOC < dep_some_node.LOC
                   for x in dep_adp_nodes):
            continue

        pred = [dep_be_node] + dep_adp_nodes
        head = dep_be_node
        verb_phrases.append((dep_some_node, pred, head))

    for dep_some_node, verbs, root in verb_phrases:

        # any node already merged away means this match was handled
        if not all(dep_graph.get_node(v.ID) for v in verbs):
            continue  # has been processed

        verb_node = merge_dep_nodes(verbs, UPOS="AUX", LOC=root.LOC)

        # detach the old cop/case edges before splicing in the merged node
        for node in verbs:
            dep_graph.remove_dependency(dep_some_node, node)

        dep_graph.replace_nodes(verbs, verb_node)
        dep_graph.add_dependency(dep_some_node, verb_node, "cop")
def amod_obl(dep_graph: DependencyGraph):
    """
    Merge an adjective and the adposition of its oblique child into one
    preposition-like node, e.g. "more than", "successful by".

    Pattern: noun --amod--> adj --obl:X--> obl --case--> adp. The adj+adp
    pair becomes a single ADP node and the obl child is re-attached to
    the noun via "nmod:<merged form>".

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    pattern = DependencyGraph()
    noun_node = DependencyGraphNode(UPOS=r"NOUN|PRON")
    adj_node = DependencyGraphNode(UPOS="ADJ")
    adp_node = DependencyGraphNode(UPOS="ADP")
    obl_node = DependencyGraphNode()

    pattern.add_nodes([noun_node, adj_node, adp_node, obl_node])

    pattern.add_dependency(noun_node, adj_node, r'amod')
    pattern.add_dependency(adj_node, obl_node, r'obl:\w+')
    pattern.add_dependency(obl_node, adp_node, r'case')

    # NOTE: renamed from `more_than_pred`, which was previously rebound
    # to the merged node inside its own iteration loop (confusing
    # shadowing, though harmless at runtime).
    more_than_preds = []

    for match in dep_graph.match(pattern):

        dep_noun_node = match[noun_node]
        dep_adj_node = match[adj_node]
        dep_obl_node = match[obl_node]
        dep_adp_node = match[adp_node]

        obl_nodes = list(
            dep_graph.children(dep_adj_node,
                               filter=lambda n, l: "obl" in l))

        if len(obl_nodes) > 1:
            # similar in form to the one
            continue

        # the obl relation must actually carry the adposition's form
        if dep_adp_node.FORM not in dep_graph.get_dependency(
                dep_adj_node, dep_obl_node).values():
            continue

        # require the natural word order: noun < adj < adp < obl
        if dep_noun_node.LOC < dep_adj_node.LOC < dep_adp_node.LOC < dep_obl_node.LOC:
            more_than_preds.append(
                (dep_noun_node, dep_adj_node, dep_obl_node, dep_adp_node))

    for dep_noun_node, dep_adj_node, dep_obl_node, dep_adp_node in more_than_preds:

        nodes = [dep_adj_node, dep_adp_node]
        merged_pred = merge_dep_nodes(nodes, UPOS="ADP",
                                      LOC=dep_adp_node.LOC)

        dep_graph.remove_dependency(dep_noun_node, dep_adj_node)
        dep_graph.remove_dependency(dep_adj_node, dep_obl_node)
        dep_graph.replace_nodes([dep_adj_node, dep_adp_node], merged_pred)
        dep_graph.add_dependency(dep_noun_node, dep_obl_node,
                                 "nmod:" + merged_pred.FORM)
def det_adjv_phrase(dep_graph: DependencyGraph):
    """
    Merge "det + ADJ/ADV" spans (e.g. "the best") into a single NOUN
    node headed by the adjective/adverb.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    phrases = []

    for node in dep_graph.nodes(filter=lambda n: n.UPOS in {"ADJ", "ADV"}):

        # BUG FIX: materialize the chained relations. The original code
        # kept a generator here; the first any() below exhausted it, so
        # the second any() always saw an empty iterable and never fired.
        parent_rels = list(
            itertools.chain.from_iterable(
                rel for parent, rel in dep_graph.parents(node)))

        if any(rel in valid_adj_form for rel in parent_rels):
            continue

        # attached as a plain modifier: handled elsewhere
        if any(rel in {"amod", "advmod"} for rel in parent_rels):
            continue

        det = [
            n for n, l in dep_graph.children(node,
                                             filter=lambda n, l: l == "det")
        ]

        if not det:
            continue

        # take the determiner closest to the head
        det.sort(key=lambda x: x.LOC)
        det = det[-1]

        if det.LEMMA not in {"the", "a", "an", "some", "any", "all"}:
            continue

        root = node

        # everything between the det and the head belongs to the phrase
        np_elements = list(
            dep_graph.offsprings(
                root, filter=lambda n: det.LOC <= n.LOC <= root.LOC))

        # check the element should be continuous
        np_elements = sorted(np_elements, key=lambda x: x.LOC)

        phrases.append((np_elements, root))

    for np, root in phrases:
        noun_node = merge_dep_nodes(np, UPOS="NOUN", LOC=root.LOC)
        dep_graph.replace_nodes(np, noun_node)
def adverbial_clause(dep_graph: DependencyGraph, oia_graph: OIAGraph,
                     context: UD2OIAContext):
    """
    Convert adverbial clauses (advcl) into OIA structure.

    ##### run in order to catch it. advcl with mark (in order to) #####
    ##### he worked hard, replacing his feud. advcl without mark #####
    :param dep_graph:
    :param oia_graph:
    :return:
    """
    # pattern: verb --advcl--> modifier clause
    pattern = DependencyGraph()
    verb_node = pattern.create_node()
    modifier_node = pattern.create_node()
    pattern.add_dependency(verb_node, modifier_node, "advcl")

    for match in list(dep_graph.match(pattern)):
        dep_verb_node = match[verb_node]
        dep_modifier_node = match[modifier_node]

        # skip pairs already consumed by an earlier rule
        if context.is_processed(dep_verb_node, dep_modifier_node):
            continue

        oia_verb_node = oia_graph.add_words(dep_verb_node.position)
        oia_modifier_node = oia_graph.add_words(dep_modifier_node.position)

        logger.debug("adverbial clause: verb={0}, modifier={1}".format(
            dep_verb_node.position, dep_modifier_node.position))

        if oia_graph.has_relation(oia_verb_node, oia_modifier_node):
            continue

        mark = list(
            dep_graph.children(dep_modifier_node,
                               filter=lambda n, rel: "mark" in rel))

        if mark:
            # with a mark ("in order to"): the mark becomes the predicate
            # linking verb (arg 1, as modifier) and clause (arg 2)
            mark, rel = mark[0]

            pred_node = oia_graph.add_words(mark.position)
            if pred_node is None:
                continue

            # conjunction words are handled by coordination rules instead.
            # NOTE(review): pred_node was already added to oia_graph above
            # when this triggers — confirm that is intended.
            if mark.LEMMA in CONJUNCTION_WORDS[language]:
                continue

            oia_graph.add_argument(pred_node, oia_verb_node, 1, mod=True)
            oia_graph.add_argument(pred_node, oia_modifier_node, 2)

        else:
            # no mark: the clause directly modifies the verb
            oia_graph.add_mod(oia_modifier_node, oia_verb_node)
def be_not_phrase2(dep_graph: DependencyGraph):
    """Merge a "be" predicate with a "not" particle that is attached
    (via advmod) to one of its objects, yielding a single "be not" node.
    """
    be_not = []

    for pred_node in dep_graph.nodes():
        # only predicates whose lemma contains the word "be"
        if "be" not in pred_node.LEMMA.split(" "):
            continue

        objs = sorted(
            (child for child, rel in dep_graph.children(pred_node)
             if rel.startswith('obj')),
            key=lambda x: x.LOC)

        if not objs:
            continue

        for obj in objs:

            def _is_not_particle(n):
                # that conj is ommited
                return n.UPOS == "PART" and "not" in n.LEMMA.split(" ")

            negations = [
                n for n, l in dep_graph.children(
                    obj,
                    filter=lambda n, l: l == "advmod" and _is_not_particle(n))
            ]

            if not negations:
                continue

            assert len(negations) == 1
            be_not.append((pred_node, obj, negations[0]))

    for dep_be_node, dep_obj_node, dep_not_node in be_not:
        dep_graph.remove_dependency(dep_obj_node, dep_not_node, 'advmod')

        verb_node = merge_dep_nodes((dep_be_node, dep_not_node),
                                    UPOS=dep_be_node.UPOS,
                                    LOC=dep_be_node.LOC)
        dep_graph.replace_nodes([dep_be_node, dep_not_node], verb_node)
def complete_missing_case_mark(dep_graph: DependencyGraph, root, root_parents,
                               parallel_components, relation_to_conj,
                               case_marks):
    """
    For each conjoined component missing its required case/mark word,
    duplicate the matching mark from an earlier component and attach the
    copy to that component.

    :param dep_graph: dependency graph, modified in place
    :param root: conjunction root node (not used directly here)
    :param root_parents: parent nodes of the conjunction root
    :param parallel_components: conjoined nodes; sorted here by location
    :param relation_to_conj: maps parent ID to a
        (prefix, shared_prefix, required_mark) tuple — presumably built by
        the caller; required_mark aligns with parallel_components.
        TODO confirm the exact tuple layout against the caller.
    :param case_marks: pool of candidate mark nodes searched by find_mark
    :return: None
    """
    parallel_components.sort(key=lambda x: x.LOC)

    for parent in root_parents:

        prefix, shared_prefix, required_mark = relation_to_conj[parent.ID]

        if not required_mark:
            continue

        for index, (node, mark) in enumerate(
                zip(parallel_components, required_mark)):

            if mark is None:
                continue

            # the required mark may already be present as (part of) a
            # child's lemma
            is_exist = any(
                mark == child.LEMMA or mark in child.LEMMA.split(" ")
                for child, l in dep_graph.children(node))

            if is_exist:
                continue

            # look for the mark among the components before this one
            found_mark = find_mark(case_marks, parallel_components[:index],
                                   mark)

            if found_mark:
                mark_node, rel = found_mark
                # duplicate the mark node so each component owns its copy
                dup_case_mark = dep_graph.create_node(FORM=mark_node.FORM,
                                                      LEMMA=mark_node.LEMMA,
                                                      UPOS=mark_node.UPOS,
                                                      LOC=mark_node.LOC)
                # flag the copy as auxiliary (not an original token)
                dup_case_mark.aux = True

                dep_graph.add_dependency(node, dup_case_mark, rel)
            else:
                logger.warning("cannot find the mark, just add the relation")
def multi_words_mark(dep_graph: DependencyGraph):
    """
    Combine a multi-word marker into a single node, e.g. for
    "arise on to the", the "on to" should be combined.

    :param dep_graph:
    :param oia_graph:
    :return:
    """
    candidates = []
    for head in dep_graph.nodes():
        mark_nodes = []
        for child, rel in dep_graph.children(head,
                                             filter=lambda n, l: "mark" in l):
            mark_nodes.extend(dep_graph.offsprings(child))

        if not mark_nodes:
            continue

        if len(mark_nodes) > 1:
            # content words inside the span mean it is not a pure marker
            content_pos = {"NOUN", "NUM", "VERB", "ADJ", "ADV", "PRON"}
            if any(x.UPOS in content_pos for x in mark_nodes):
                continue

        mark_nodes.sort(key=lambda item: item.LOC)
        candidates.append((head, mark_nodes))

    for head, mark_nodes in candidates:
        # skip phrases whose members were already merged away
        if not all(dep_graph.get_node(x.ID) for x in mark_nodes):
            continue

        lo = mark_nodes[0].LOC
        hi = mark_nodes[-1].LOC

        # take every node inside the span, not only the mark children
        span = [n for n in dep_graph.nodes() if lo <= n.LOC <= hi]
        span.sort(key=lambda item: item.LOC)

        if any(x.UPOS in NOUN_UPOS for x in span):
            continue

        merged = merge_dep_nodes(span, UPOS=span[0].UPOS, LOC=span[0].LOC)

        for mark in span:
            dep_graph.remove_dependency(head, mark)

        dep_graph.replace_nodes(span, merged)
        dep_graph.add_dependency(head, merged, "mark")
def multi_words_cc(dep_graph: DependencyGraph):
    """
    Combine a multi-word coordinating conjunction (cc) into a single
    node, analogous to multi-word marks ("on to" becomes one node).

    :param dep_graph:
    :param oia_graph:
    :return:
    """
    cc_phrases = []
    for head in dep_graph.nodes():
        cc_nodes = []
        for child, rel in dep_graph.children(head,
                                             filter=lambda n, l: "cc" == l):
            cc_nodes.extend(dep_graph.offsprings(child))

        if not cc_nodes:
            continue

        if len(cc_nodes) > 1 and any(x.UPOS in {"NOUN", "NUM", "VERB"}
                                     for x in cc_nodes):
            continue

        cc_nodes.sort(key=lambda item: item.LOC)
        cc_phrases.append((head, cc_nodes))

    for head, cc_nodes in cc_phrases:
        lo = cc_nodes[0].LOC
        hi = cc_nodes[-1].LOC

        # widen to every node inside the span
        span = [n for n in dep_graph.nodes() if lo <= n.LOC <= hi]

        if any(x.UPOS in {"NOUN", "NUM", "VERB"} for x in span):
            continue

        if not all(dep_graph.get_node(x.ID) for x in span):
            continue

        merged = merge_dep_nodes(span, UPOS=span[0].UPOS, LOC=span[0].LOC)
        dep_graph.replace_nodes(span, merged)

        for cc in span:
            dep_graph.remove_dependency(head, cc)

        # the head itself may have been merged away by replace_nodes
        if dep_graph.get_node(head.ID):
            dep_graph.add_dependency(head, merged, "cc")
def noun_all(dep_graph: DependencyGraph):
    """
    Merge a nominal immediately followed by determiner "all"
    (e.g. "we all") into a single noun node.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    noun_all_phrase = []

    for root in dep_graph.nodes(filter=lambda x: x.UPOS in
                                {"NOUN", "PROPN", "PRON", "X", "NUM", "SYM"}):
        for child, rels in dep_graph.children(root):
            # "all" must be a det child sitting right after the nominal
            if "det" in rels and child.LEMMA == "all" and child.LOC == root.LOC + 1:
                noun_all_phrase.append((root, child))

    # NOTE: the loop variable was previously named `all`, shadowing the
    # builtin; renamed to all_node.
    for noun, all_node in noun_all_phrase:
        noun_node = merge_dep_nodes([noun, all_node],
                                    UPOS=noun.UPOS,
                                    LOC=noun.LOC)
        dep_graph.replace_nodes([noun, all_node], noun_node)
def to_verb(dep_graph: DependencyGraph):
    """
    Merge an infinitive marker "to" with the verb immediately following
    it ("to go") into a single verb node.

    :param dep_graph:
    :return:
    """
    pairs = []

    for verb in dep_graph.nodes(filter=lambda x: x.UPOS in {"VERB"}):
        # skip verbs already governed through a "to"-labeled relation
        if any("to" in rels.values()
               for parent, rels in dep_graph.parents(verb)):
            continue

        for child, rels in dep_graph.children(verb):
            adjacent_to = ("mark" in rels and child.LEMMA == "to"
                           and child.LOC == verb.LOC - 1)
            if adjacent_to and not (isinstance(child, DependencyGraphSuperNode)
                                    and child.is_conj):
                pairs.append((child, verb))

    for to_node, verb in pairs:
        merged = merge_dep_nodes([to_node, verb],
                                 UPOS=verb.UPOS,
                                 LOC=verb.LOC)
        dep_graph.replace_nodes([to_node, verb], merged)
def noun_phrase(dep_graph: DependencyGraph):
    """
    Detect noun phrases rooted at nominal nodes and merge each phrase
    into a single NOUN node, deduplicating sub-phrases.

    :param dep_graph:
    :param oia_graph:
    :return:
    """
    nouns = []
    # we first find np roots
    for root in dep_graph.nodes(
            filter=lambda x: x.UPOS in {"NOUN", "PROPN", "X", "NUM", "SYM"}):

        logger.debug("checking the node:")
        logger.debug(str(root))

        # relation labels coming into the root, normalized ("_" -> " ")
        parent_rels = set(
            itertools.chain.from_iterable(l.values()
                                          for n, l in dep_graph.parents(root)))
        parent_rels = set(rel.replace("_", " ") for rel in parent_rels)

        # case children whose form/lemma already appears in a parent
        # relation label are excluded from the phrase (they belong to the
        # relation, not to the noun phrase)
        escaped_case_node = set()
        if parent_rels:
            case_nodes = [
                x for x, l in dep_graph.children(
                    root, filter=lambda n, l: l == "case")
            ]
            for node in case_nodes:
                if node.LEMMA.lower() in parent_rels or node.FORM.lower(
                ) in parent_rels:
                    # lemma is for including
                    escaped_case_node.add(node)

        valid_np_children = [(n, l) for n, l in dep_graph.children(
            root,
            filter=lambda n, l: is_valid_np_child(dep_graph, root, l, n))
        ]

        logger.debug("noun_phrase: valid_np_children:")
        for node, l in valid_np_children:
            logger.debug(str(node))

        np_elements = [root]
        for n, l in valid_np_children:
            if n.UPOS == "ADP":
                continue
            # children after the root only join via tight relations
            if n.LOC > root.LOC and \
                    not any(l.startswith(x) for x in {"fixed", "compound",
                                                      "nummod", "nmod:tmod",
                                                      "flat", "nmod:npmod",
                                                      "dep"}):
                continue
            if n in escaped_case_node:
                continue
            # conjunction super-nodes are handled by coordination rules
            if isinstance(n, DependencyGraphSuperNode) and n.is_conj:
                continue

            offsprings = list(dep_graph.offsprings(n))

            # a subtree touching clausal/argument relations is not a pure
            # np component
            valid_np_component = True
            for x in offsprings:
                for parent, rels in dep_graph.parents(x):
                    if any(x in rels
                           for x in {"acl", "obl", "advcl", "subj", "obj"}):
                        valid_np_component = False
                        break
                if not valid_np_component:
                    break

            if valid_np_component:
                np_elements.extend(offsprings)

        logger.debug("noun_phrase: candidate np_elements:")
        for node in np_elements:
            logger.debug(str(node))

        det = [
            n for n, l in dep_graph.children(root,
                                             filter=lambda n, l: l == "det")
        ]
        det = [x for x in det if x.LOC <= root.LOC]
        det.sort(key=lambda x: x.LOC)

        if det:
            det = det[-1]
            # check the element should be continuous: drop anything
            # before the determiner
            np_elements = [x for x in np_elements if det.LOC <= x.LOC]
            logger.debug(
                "noun_phrase: det found, cut the nodes before the det")

        filtered_np_elements = sorted(list(np_elements), key=lambda x: x.LOC)

        # strip leading dashes/adpositions/conjunctions/punctuation
        changed = True
        while changed:
            changed = False

            if filtered_np_elements and filtered_np_elements[0].LEMMA in {
                    "-", "--"
            }:
                filtered_np_elements.pop(0)
                changed = True

            if filtered_np_elements and filtered_np_elements[0].UPOS in {
                    "ADP", "CCONJ", "PUNCT"
            }:
                filtered_np_elements.pop(0)
                changed = True

        if filtered_np_elements:
            nouns.append((set(filtered_np_elements), root))

    # a phrase fully contained in another is redundant and dropped
    sub_nouns = []
    for idx1, (phrase1, head1) in enumerate(nouns):
        for idx2, (phrase2, head2) in enumerate(nouns):
            if idx1 == idx2:
                continue
            phrasex, phrasey = (
                phrase1, phrase2) if len(phrase1) > len(phrase2) else (phrase2,
                                                                       phrase1)
            common = phrasex.intersection(phrasey)
            if not common:
                continue
            elif len(common) == len(phrasey):
                # node2 is a sub np of node1, delete
                sub_nouns.append(phrasey)
            else:
                # partial overlap should be impossible
                print("Phrase 1", [x.ID for x in phrase1])
                print("Phrase 2", [x.ID for x in phrase2])
                raise Exception("duplicate words found")

    for idx, (phrase, head) in enumerate(nouns):
        if phrase in sub_nouns:
            continue

        phrase = sorted(list(phrase), key=lambda x: x.LOC)

        # pull in quote characters immediately flanking the phrase
        for node in phrase:
            for child, _ in dep_graph.children(node):
                if child.LOC == phrase[0].LOC - 1 and child.LEMMA in {
                        "\"", "\'"
                }:
                    phrase.insert(0, child)
                if child.LOC == phrase[-1].LOC + 1 and child.LEMMA in {
                        "\"", "\'"
                }:
                    phrase.append(child)

        noun_node = merge_dep_nodes(phrase, UPOS=head.UPOS,
                                    LOC=phrase[-1].LOC)
        dep_graph.replace_nodes(phrase, noun_node)
def noun_of_noun(dep_graph: DependencyGraph):
    """
    Merge "noun of noun" constructions (noun1 --nmod:of--> noun2 with
    "of" as case) into a single nominal node.

    :param dep_graph:
    :return:
    """
    # pattern: noun1 --nmod:of--> noun2, noun2 --case--> "of"
    pattern = DependencyGraph()

    noun1_node = pattern.create_node(UPOS="NOUN|PROPN|PRON|X|NUM|SYM")
    of_node = pattern.create_node(LEMMA="of")
    noun2_node = pattern.create_node(UPOS="NOUN|PROPN|PRON|X|NUM|SYM")

    pattern.add_dependency(noun1_node, noun2_node, "nmod:of")
    pattern.add_dependency(noun2_node, of_node, "case")

    # maps an original node to the super-node it was merged into, so
    # chained matches ("a of b of c") follow the latest merge
    merged_map = dict()

    for match in list(dep_graph.match(pattern)):

        dep_noun1_node = match[noun1_node]
        if dep_noun1_node in merged_map:
            dep_noun1_node = merged_map[dep_noun1_node]

        dep_noun2_node = match[noun2_node]
        if dep_noun2_node in merged_map:
            dep_noun2_node = merged_map[dep_noun2_node]

        dep_of_node = match[of_node]

        if not all([dep_noun1_node, dep_noun2_node, dep_of_node]):
            # processed by others
            continue

        # noun2 participating in coordination or a relative clause is too
        # complex to merge safely
        involved_in_complex_structure = False
        for child, rel in dep_graph.children(dep_noun2_node):
            if "conj" in rel or "acl" in rel:
                involved_in_complex_structure = True
        for parent, rel in dep_graph.parents(dep_noun2_node):
            if "conj" in rel or "acl" in rel:
                involved_in_complex_structure = True

        if involved_in_complex_structure:
            continue

        if isinstance(dep_noun1_node,
                      DependencyGraphSuperNode) and dep_noun1_node.is_conj:
            continue

        if isinstance(dep_noun2_node,
                      DependencyGraphSuperNode) and dep_noun2_node.is_conj:
            continue

        dep_noun2_parents = [
            parent for parent, rel in dep_graph.parents(dep_noun2_node)
        ]

        # sanity check: if noun2 has a single parent it must be noun1
        if len(dep_noun2_parents) == 1:
            if dep_noun2_parents[0] != dep_noun1_node:
                logger.error("dep_noun1 {0} {1}".format(
                    dep_noun1_node.ID, dep_noun1_node.FORM))
                logger.error("dep_noun2 {0} {1}".format(
                    dep_noun2_node.ID, dep_noun2_node.FORM))
                logger.error("dep_noun2_parent {0} {1}".format(
                    dep_noun2_parents[0].ID, dep_noun2_parents[0].FORM))
                raise Exception("Noun of Noun failed")

        new_noun_nodes = [dep_noun1_node, dep_of_node, dep_noun2_node]

        new_noun = merge_dep_nodes(new_noun_nodes,
                                   UPOS=dep_noun1_node.UPOS,
                                   FEATS=dep_noun1_node.FEATS,
                                   LOC=dep_noun1_node.LOC)

        dep_graph.replace_nodes(new_noun_nodes, new_noun)

        # record merges so later matches resolve to the super-node
        for node in new_noun_nodes:
            merged_map[node] = new_noun

        logger.debug("node merged :" + " ".join(
            [dep_noun1_node.ID, dep_of_node.ID, dep_noun2_node.ID]))
def multi_word_fix_flat(dep_graph: DependencyGraph):
    """
    Merge fixed/flat/compound multi-word expressions into single nodes,
    processing the longest phrases first.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    fixed_rels = {"fixed", "flat", "compound"}

    phrases = []
    for node in dep_graph.nodes():
        # only start from the top of a fixed/flat/compound chain
        parents = [n for n, l in dep_graph.parents(
            node, filter=lambda n, l: any(x in l for x in fixed_rels))]
        if parents:
            continue

        phrase = []
        for n, l in dep_graph.children(
                node, filter=lambda n, l: any(x in l for x in fixed_rels)):
            phrase.extend(dep_graph.offsprings(n))

        if not phrase:
            continue

        phrase.append(node)
        if len(phrase) > 1:
            phrase.sort(key=lambda n: n.LOC)
            phrases.append((phrase, node))

    # merge the longest phrases first so contained ones get skipped below
    phrases.sort(key=lambda x: len(x[0]), reverse=True)

    for phrase, head in phrases:
        if not all(dep_graph.get_node(x.ID) for x in phrase):
            continue  # already been processed

        merging_nodes = set()
        min_loc = 10000
        max_loc = -1
        for child in phrase:
            if isinstance(child, DependencyGraphNode):
                min_loc = min(min_loc, child.LOC)
                # BUG FIX: this previously read max(min_loc, child.LOC),
                # which could shrink the span and drop trailing words
                max_loc = max(max_loc, child.LOC)
            elif isinstance(child, DependencyGraphSuperNode):
                min_loc = min(min_loc, min([x.LOC for x in child.nodes]))
                max_loc = max(max_loc, max([x.LOC for x in child.nodes]))
            merging_nodes.update(dep_graph.offsprings(child))

        merged_nodes = set(
            [n for n in merging_nodes if min_loc <= n.LOC <= max_loc])

        # pull in quote characters immediately flanking the span
        for node in merging_nodes:
            if node.LOC == min_loc - 1 and node.LEMMA in {"\"", "\'"}:
                merged_nodes.add(node)
            if node.LOC == max_loc + 1 and node.LEMMA in {"\"", "\'"}:
                merged_nodes.add(node)

        merged_nodes = list(merged_nodes)
        merged_nodes.sort(key=lambda x: x.LOC)

        logger.debug("multi_word_fix_flat: we are merging ")
        logger.debug("\n".join(str(node) for node in merged_nodes))
        logger.debug("with head \n" + str(head))

        new_node = merge_dep_nodes(merged_nodes, UPOS=head.UPOS, LOC=head.LOC)
        dep_graph.replace_nodes(merged_nodes, new_node)
def secondary_predicate(dep_graph: DependencyGraph):
    """
    detect the case of xcomp as a secondary predicate,
    and add implicit (be) node to make a predicate
    :param dep_graph:
    :return:
    """
    # pattern: pred --xcomp--> X (non-verb), X --nsubj--> S,
    #          pred --obj--> S  (the shared object/subject)
    pattern = DependencyGraph()
    pred_node = pattern.create_node()
    xcomp_node = pattern.create_node(UPOS=r'(?!VERB\b)\b\w+')
    xcomp_subj_node = pattern.create_node()

    pattern.add_dependency(pred_node, xcomp_node, "xcomp")
    pattern.add_dependency(xcomp_node, xcomp_subj_node, "nsubj")
    pattern.add_dependency(pred_node, xcomp_subj_node, "obj")

    for match in list(dep_graph.match(pattern)):
        dep_pred_node = match[pred_node]
        dep_xcomp_node = match[xcomp_node]
        dep_xcomp_subj_node = match[xcomp_subj_node]

        # the position of dep_xcomp_subj_node and dep_xcomp_node may be
        # reversed in questions:
        # I can't tell you how ominous I found Bush's performance in that
        # interview.
        if dep_pred_node.LOC < dep_xcomp_subj_node.LOC < dep_xcomp_node.LOC:
            # normal order: pred ... subj ... xcomp
            dep_graph.remove_dependency(dep_pred_node, dep_xcomp_node)
            dep_graph.remove_dependency(dep_pred_node, dep_xcomp_subj_node)
            dep_graph.remove_dependency(dep_xcomp_node, dep_xcomp_subj_node)

            if dep_xcomp_node.UPOS == "ADJ" or dep_xcomp_node.UPOS == "ADV":
                # fold "(be)" into the adjective/adverb itself
                # NOTE(review): a plain string is passed alongside a node
                # to merge_dep_nodes here — confirm merge_dep_nodes
                # accepts string members
                new_pred_nodes = ["(be)", dep_xcomp_node]
                dep_be_node = merge_dep_nodes(new_pred_nodes,
                                              UPOS="VERB",
                                              LOC=dep_xcomp_node.LOC)
                dep_graph.add_node(dep_be_node)

                dep_graph.add_dependency(dep_pred_node, dep_be_node, "obj")
                dep_graph.add_dependency(dep_be_node, dep_xcomp_subj_node,
                                         "nsubj")

                # move the xcomp node's children to the new (be) node
                for child, l in list(dep_graph.children(dep_xcomp_node)):
                    dep_graph.remove_dependency(dep_xcomp_node, child)
                    dep_graph.add_dependency(dep_be_node, child, l)

                dep_graph.remove_node(dep_xcomp_node)
            else:
                # insert a free-standing auxiliary (be) just before xcomp
                dep_be_node = dep_graph.create_node(
                    FORM="(be)",
                    LEMMA="(be)",
                    UPOS="VERB",
                    LOC=dep_xcomp_node.LOC - 0.5)
                dep_be_node.aux = True

                dep_graph.add_dependency(dep_pred_node, dep_be_node, "obj")
                dep_graph.add_dependency(dep_be_node, dep_xcomp_subj_node,
                                         "nsubj")
                dep_graph.add_dependency(dep_be_node, dep_xcomp_node, "obj")

        elif dep_xcomp_node.LOC < dep_pred_node.LOC:
            # question order: xcomp precedes the predicate
            dep_graph.remove_dependency(dep_pred_node, dep_xcomp_node)
            dep_graph.remove_dependency(dep_pred_node, dep_xcomp_subj_node)
            dep_graph.remove_dependency(dep_xcomp_node, dep_xcomp_subj_node)

            # in question, for example : how ominous
            # I can't tell you how ominous I found Bush's performance in
            # that interview.
            dep_be_node = dep_graph.create_node(
                FORM="(be)",
                LEMMA="(be)",
                UPOS="VERB",
                LOC=dep_xcomp_node.LOC - 0.5)
            dep_be_node.aux = True

            dep_graph.add_dependency(dep_pred_node, dep_be_node, "obj")
            dep_graph.add_dependency(dep_be_node, dep_xcomp_subj_node,
                                     "nsubj")

            if dep_xcomp_node.UPOS == "ADJ" or dep_xcomp_node.UPOS == "ADV":
                dep_graph.add_dependency(dep_be_node, dep_xcomp_node, "amod")
            else:
                dep_graph.add_dependency(dep_be_node, dep_xcomp_node, "obj")
def single_root(dep_graph: DependencyGraph, oia_graph: OIAGraph,
                context: UD2OIAContext):
    """
    Ensure the OIA graph has a single root: pick the best candidate
    among zero-in-degree nodes (closest to the dependency root, then a
    heuristic score), then redirect the remaining zero-in-degree nodes
    under it by reversing relations ("as:" prefix).

    :param dep_graph:
    :param oia_graph:
    :return:
    """
    in_degrees = [(node, oia_graph.g.in_degree(node.ID))
                  for node in oia_graph.nodes()]
    zero_degree_nodes = [n for n, degree in in_degrees if degree == 0]

    if len(zero_degree_nodes) == 0:
        return
    elif len(zero_degree_nodes) == 1:
        root = zero_degree_nodes[0]
    else:  # len(zero_degree_nodes) >= 2

        # rank candidates by distance to the dependency root
        dists_to_root = []
        for oia_node in zero_degree_nodes:

            # dependency nodes related to this candidate: its own span
            # plus the spans of its children
            related_dep_nodes = set()

            if isinstance(oia_node, OIAWordsNode):
                dep_node = dep_graph.get_node_by_spans(oia_node.spans)
                if dep_node:
                    if isinstance(dep_node, DependencyGraphNode):
                        related_dep_nodes.add(dep_node)
                    elif isinstance(dep_node, list):
                        for node in dep_node:
                            related_dep_nodes.add(node)
                    else:
                        logger.error("get_node_by_spans return type unknown.")

            children = [n for n, l in oia_graph.children(oia_node)]
            for child in children:
                if isinstance(child, OIAWordsNode):
                    dep_node = dep_graph.get_node_by_spans(child.spans)
                    if dep_node:
                        if isinstance(dep_node, DependencyGraphNode):
                            related_dep_nodes.add(dep_node)
                        elif isinstance(dep_node, list):
                            for node in dep_node:
                                related_dep_nodes.add(node)
                        else:
                            logger.error(
                                "get_node_by_spans return type unknown.")

            # the artificial node "0" points to the real dependency root
            dep_root = dep_graph.get_node("0")
            real_dep_root = next(n for n, l in dep_graph.children(dep_root))

            min_dist_to_root = min([
                len(
                    nx.shortest_path(dep_graph.g.to_undirected(),
                                     real_dep_root.ID, dep_node.ID))
                for dep_node in related_dep_nodes
            ])

            dists_to_root.append((oia_node, min_dist_to_root))

        dists_to_root.sort(key=lambda x: x[1])

        root_candidates = []
        min_dist = dists_to_root[0][1]
        for oia_node, dist in dists_to_root:
            if dist == min_dist:
                root_candidates.append(oia_node)

        if len(root_candidates) == 1:
            root = root_candidates[0]
        else:
            # tie-break by scoring: func relations dominate, then the
            # "strongest" punctuation between the candidate's children,
            # then connective-word / PARATAXIS bonuses
            scores = []
            score_map = {":": 40, "\"": 30, ";": 20, ",": 10, "(": -10}
            for cand in root_candidates:
                score = -100
                if any([
                        "func" in rel.label
                        for n, rel in oia_graph.children(cand)
                ]):
                    score = 100
                children = [n for n, l in oia_graph.children(cand)]
                dep_children = []
                for child in children:
                    if isinstance(child, OIAWordsNode):
                        dep_node = dep_graph.get_node_by_spans(child.spans)
                        if dep_node:
                            if isinstance(dep_node, DependencyGraphNode):
                                dep_children.append(dep_node)
                            elif isinstance(dep_node, list):
                                for node in dep_node:
                                    dep_children.append(node)
                            else:
                                logger.error(
                                    "get_node_by_spans return type unknown.")

                # check what between them
                dep_children.sort(key=lambda x: x.LOC)

                for node in dep_graph.nodes():
                    if node.LOC is None:
                        continue
                    if dep_children[0].LOC < node.LOC < dep_children[-1].LOC:
                        if node.FORM in score_map:
                            score = max(score, score_map[node.FORM])

                if isinstance(cand, OIAWordsNode):
                    dep_node = dep_graph.get_node_by_spans(cand.spans)
                    if dep_node:
                        if isinstance(dep_node, DependencyGraphNode):
                            if dep_node.LEMMA in IMPORTANT_CONNECTION_WORDS:
                                score += 8
                        elif isinstance(dep_node, list):
                            for node in dep_node:
                                if node.LEMMA in IMPORTANT_CONNECTION_WORDS:
                                    score += 8
                        else:
                            logger.error(
                                "get_node_by_spans return type unknown.")
                elif isinstance(cand, OIAAuxNode) and cand.label == "PARATAXIS":
                    score += 4

                scores.append((cand, score))

            scores.sort(key=lambda x: x[1], reverse=True)

            top_nodes = []
            for node, score in scores:
                if score == scores[0][1]:
                    top_nodes.append(node)

            if len(top_nodes) == 1:
                root = top_nodes[0]
            elif len(top_nodes) >= 3:
                # multiple top node found, merge them to one
                if all(
                        isinstance(node, OIAAuxNode)
                        and node.label == "PARATAXIS" for node in top_nodes):
                    next_nodes = []
                    for top in top_nodes:
                        for n, l in list(oia_graph.children(top)):
                            next_nodes.append(n)
                        oia_graph.remove_node(top)
                        for node in zero_degree_nodes:
                            if node.ID == top.ID:
                                zero_degree_nodes.remove(node)

                    root = oia_graph.add_aux("PARATAXIS")
                    oia_graph.add_node(root)

                    next_nodes.sort(key=lambda x: x.ID)
                    for index, second_node in enumerate(next_nodes):
                        oia_graph.add_argument(root, second_node, index)
                else:
                    logger.error(
                        "Deep intersection point, currently cannot process")
                    return
            else:  # len(top_nodes) == 2:
                # check who is prev, and who is next
                dep_tops = []
                for top in top_nodes:
                    if isinstance(top, OIAWordsNode):
                        dep_node = dep_graph.get_node_by_spans(top.spans)
                        if dep_node:
                            if isinstance(dep_node, DependencyGraphNode):
                                dep_tops.append((top, dep_node))
                            elif isinstance(dep_node, list):
                                for node in dep_node:
                                    dep_tops.append((top, node))
                            else:
                                logger.error(
                                    "get_node_by_spans return type unknown.")

                if not len(dep_tops) >= 1:
                    logger.error("Multiple AUX head ")
                    return

                # the earlier node in the sentence wins
                dep_tops.sort(key=lambda x: x[1].LOC)
                root = dep_tops[0][0]

    # root obtained, change other zero-in-degree node
    logger.debug("Root obtained ")
    logger.debug(root)

    for node in zero_degree_nodes:
        if root.ID == node.ID:
            continue

        # conjunction nodes: flip pred.arg.N edges to "as:pred.arg.N"
        if is_conj_node(node, dep_graph):
            for child, rel in list(oia_graph.children(node)):
                label = rel.label
                if "pred.arg." in label:
                    arg_no = label.split(".")[-1]
                    new_rel = "as:pred.arg." + arg_no
                    oia_graph.remove_relation(node, child)
                    oia_graph.add_relation(child, node, new_rel)
            continue

        # ref nodes: flip ref edges to "as:ref"
        ref_childs = [
            child for child, rel in oia_graph.children(node)
            if rel.label == "ref"
        ]
        if ref_childs:
            for child in ref_childs:
                oia_graph.remove_relation(node, child)
                oia_graph.add_relation(child, node, "as:ref")
            continue

    in_degrees = [(node, oia_graph.g.in_degree(node.ID))
                  for node in oia_graph.nodes()]
    zero_degree_nodes = [
        n for n, degree in in_degrees if degree == 0 and n.ID != root.ID
    ]

    # repeatedly attach remaining zero-in-degree nodes through their
    # highest intersection point with the root's offspring
    while len(zero_degree_nodes) > 0:

        logger.debug("we found zero_degree_nodes: ")
        for node in zero_degree_nodes:
            logger.debug(node)

        root_offsprings = set(oia_graph.offsprings(root))
        logger.debug("root offsprings :")
        for n in root_offsprings:
            logger.debug(n)

        intersections = []
        for node in zero_degree_nodes:
            node_offspring = set(oia_graph.offsprings(node))
            logger.debug("node offsprings :")
            for n in node_offspring:
                logger.debug(n)

            intersection = root_offsprings.intersection(node_offspring)

            logger.debug("we found {0} initial intersection :".format(
                len(intersection)))
            for n in intersection:
                logger.debug(n)

            if intersection:
                # the top intersection point has no parent inside the
                # intersection itself
                top_intersection_point = None
                parents_to_root = None
                parents_to_other = None
                for x in intersection:
                    parents = set([n for n, l in oia_graph.parents(x)])
                    if not parents.intersection(intersection):
                        top_intersection_point = x
                        parents_to_root = parents.intersection(
                            root_offsprings)
                        parents_to_other = parents.intersection(
                            node_offspring)
                        break

                if top_intersection_point is None:
                    logger.error("It seems we have a problem ")
                    continue

                logger.debug("we found a intersections: ")
                logger.debug(top_intersection_point)
                logger.debug("Its parents to root: ")
                for x in parents_to_root:
                    logger.debug(x)
                logger.debug("Its parents to other: ")
                for x in parents_to_other:
                    logger.debug(x)

                intersections.append(
                    (top_intersection_point, parents_to_root,
                     parents_to_other))

        if len(intersections) == 0:
            logger.error("seems we have disconnected compoenent")
            break

        for intersection_point, parents_to_root, parents_to_other in intersections:

            for node in parents_to_other:
                if isinstance(node, OIAAuxNode) and node.label == "LIST":
                    logger.error("lets see what happens for LIST")
                    if len(list(oia_graph.parents(node))) != 0:
                        logger.error(
                            "it seems different with what we have thought for LIST "
                        )
                    # flip the edge so LIST hangs below the intersection
                    relation = oia_graph.get_edge(node, intersection_point)
                    oia_graph.remove_relation(node, intersection_point)
                    oia_graph.add_relation(intersection_point, node,
                                           "as:" + relation.label)

                elif (isinstance(node, OIAAuxNode)
                      and node.label == "WHETHER"):

                    if len(list(oia_graph.parents(node))) != 0:
                        logger.error(
                            "it seems different with what we have thought for WHETHER "
                        )
                    # re-route the root-side parents to point at WHETHER
                    for parent in parents_to_root:
                        relation = oia_graph.get_edge(parent,
                                                      intersection_point)
                        oia_graph.remove_relation(parent, intersection_point)
                        oia_graph.add_relation(parent, node, relation.label)

                else:
                    relation = oia_graph.get_edge(node, intersection_point)
                    oia_graph.remove_relation(node, intersection_point)
                    oia_graph.add_relation(intersection_point, node,
                                           "as:" + relation.label)

        in_degrees = [(node, oia_graph.g.in_degree(node.ID))
                      for node in oia_graph.nodes()]
        zero_degree_nodes = [
            n for n, degree in in_degrees if degree == 0 and n.ID != root.ID
        ]
def acl_verb_obl_case(dep_graph: DependencyGraph):
    """
    Rewrite "ENTITY -acl-> VERB -obl:X-> OBJ -case-> CASE" (e.g.
    "something extracted by X") so that the verb and its case word(s) are
    merged into a single verb node and OBJ becomes its plain "obj".

    A match is skipped when:
      * the verb has more than one oblique child,
      * the verb already has an obj/comp child,
      * the case word's FORM is not among the obl relation's values,
      * the verb has more than one subject.

    :param dep_graph: dependency graph, mutated in place
    :return: None
    """
    pattern = DependencyGraph()
    subj_node = pattern.create_node()
    verb_node = pattern.create_node(UPOS="VERB")
    obj_node = pattern.create_node()
    case_node = pattern.create_node()

    pattern.add_dependency(subj_node, verb_node, r'acl')
    pattern.add_dependency(verb_node, obj_node, r'obl:\w*')
    pattern.add_dependency(obj_node, case_node, r'case')

    phrases = []

    for match in dep_graph.match(pattern):

        dep_subj_node = match[subj_node]
        dep_verb_node = match[verb_node]
        dep_obj_node = match[obj_node]
        dep_case_node = match[case_node]

        obl_nodes = [
            n for n, l in dep_graph.children(
                dep_verb_node, filter=lambda n, l: l.startswith("obl"))
        ]
        # ambiguous which oblique to absorb: skip multi-oblique verbs
        if len(obl_nodes) > 1:
            continue

        existing_obj_nodes = [
            n for n, l in dep_graph.children(
                dep_verb_node, filter=lambda n, l: "obj" in l or "comp" in l)
        ]
        # verb already has a direct object/complement: leave it alone
        if existing_obj_nodes:
            continue

        obl_rel = dep_graph.get_dependency(dep_verb_node, dep_obj_node)

        # the matched case word must be the one recorded in the obl:X label
        if dep_case_node.FORM not in obl_rel.values():
            continue

        # there are may be other cases, join them all
        # (only case words sitting between the verb and the object)
        dep_case_nodes = [
            n for n, l in dep_graph.children(dep_obj_node,
                                             filter=lambda n, l:
                                             l.startswith("case") and
                                             dep_verb_node.LOC < n.LOC < dep_obj_node.LOC)
        ]

        subjs = list(
            dep_graph.children(dep_verb_node,
                               filter=lambda n, l: "subj" in l))
        if len(subjs) > 1:
            continue

        phrases.append(
            (dep_subj_node, dep_verb_node, dep_obj_node, dep_case_nodes))

    # mutate after matching so the pattern iteration is not disturbed
    for dep_subj_node, dep_verb_node, dep_obj_node, dep_case_nodes in phrases:

        new_verb_phrase = [dep_verb_node] + dep_case_nodes

        # NOTE: was logging.debug (root logger); switched to the module
        # logger used everywhere else in this file for consistent output.
        logger.debug("acl_verb_obl_case: we are merging nodes")
        logger.debug("\n".join(str(node) for node in new_verb_phrase))

        new_verb_node = merge_dep_nodes(new_verb_phrase,
                                        UPOS=dep_verb_node.UPOS,
                                        LOC=dep_verb_node.LOC,
                                        FEATS=dep_verb_node.FEATS)

        logger.debug("acl_verb_obl_case: we obtain a new node")
        logger.debug(str(new_verb_node))

        dep_graph.remove_dependency(dep_verb_node, dep_obj_node)
        for node in dep_case_nodes:
            dep_graph.remove_dependency(dep_obj_node, node)

        dep_graph.replace_nodes(new_verb_phrase, new_verb_node)
        dep_graph.add_dependency(new_verb_node, dep_obj_node, "obj")
def amod_xcomp_to_acl(dep_graph: DependencyGraph):
    """
    Rewrite "NOUN -amod-> ADJ -xcomp-> VERB" into an acl clause headed by
    a merged "(be) ADJ [mark...]" verb node, with the xcomp verb demoted
    to its "obj".

    :param dep_graph: dependency graph, mutated in place
    :raises Exception: when more than one xcomp child carries mark words
        between the adjective and itself (unexpected situation)
    :return: None
    """
    pattern = DependencyGraph()
    noun_node = pattern.create_node(UPOS="NOUN")
    adj_node = pattern.create_node(UPOS="ADJ")
    verb_node = pattern.create_node(UPOS="VERB")

    pattern.add_dependency(noun_node, adj_node, r'amod')
    pattern.add_dependency(adj_node, verb_node, r"xcomp")

    for match in list(dep_graph.match(pattern)):

        dep_noun_node = match[noun_node]
        dep_verb_node = match[verb_node]
        dep_adj_node = match[adj_node]

        try:
            # verify all three nodes still exist in the graph
            [
                dep_graph.get_node(x.ID)
                for x in [dep_noun_node, dep_verb_node, dep_adj_node]
            ]
        except Exception:
            # has been processed by previous match
            continue

        xcomp_nodes = [
            n for n, l in dep_graph.children(
                dep_adj_node, filter=lambda n, l: l.startswith("xcomp"))
        ]

        # collect mark words ("to", "that", ...) sitting between the
        # adjective and each xcomp verb
        mark_nodes_list = []
        for dep_xcomp_node in xcomp_nodes:
            mark_nodes = [
                n for n, l in dep_graph.children(
                    dep_xcomp_node,
                    filter=lambda n, l: l.startswith("mark") and dep_adj_node.
                    LOC < n.LOC < dep_xcomp_node.LOC)
            ]
            if mark_nodes:
                mark_nodes_list.append(mark_nodes)

        if len(mark_nodes_list) > 1:
            raise Exception("Unexpected Situation Happened")

        new_verb_nodes = [dep_adj_node]
        if mark_nodes_list:
            mark_nodes = mark_nodes_list[0]
            new_verb_nodes.extend(mark_nodes)

        new_verb_nodes.sort(key=lambda x: x.LOC)
        # "(be)" is a synthetic token prefix; merge_dep_nodes apparently
        # accepts plain strings mixed with nodes -- TODO confirm
        new_verb_nodes = ["(be)"] + new_verb_nodes

        new_node = merge_dep_nodes(new_verb_nodes,
                                   UPOS="VERB",
                                   LOC=new_verb_nodes[-1].LOC,
                                   FEATS={"VerbForm": "Ger"})

        dep_graph.replace_nodes(new_verb_nodes, new_node)
        dep_graph.set_dependency(dep_noun_node, new_node, "acl")

        for dep_xcomp_node in xcomp_nodes:
            dep_graph.remove_dependency(dep_xcomp_node, new_node)
        dep_graph.set_dependency(new_node, dep_verb_node, "obj")
def adv_ccomp(dep_graph: DependencyGraph, oia_graph: OIAGraph,
              context: UD2OIAContext):
    """
    Turn an adverb that governs a ccomp/xcomp clause into an OIA predicate,
    with the clause as argument 2 and (when present) the advmod parent as
    a modifying argument 1.

    Phase 1 collects matches; phase 2 mutates the graphs, merging the
    adverb with intervening "case" words into one predicate node.

    :param dep_graph: dependency graph, mutated in place when the adverb
        is merged with case words
    :param oia_graph: OIA graph receiving predicate/argument nodes
    :param context: conversion context (unused here)
    :raises Exception: when the adverb has more than one advmod parent
    :return: None
    """
    pattern = DependencyGraph()
    # TODO: it seems that in UD labeling, adv is used instead of adj for noun
    # verb_node = pattern.create_node(UPOS="VERB|NOUN|PROPN")
    adv_node = pattern.create_node(UPOS="ADV|X|NOUN|PART")  # part is for "not"
    ccomp_node = pattern.create_node()

    # pattern.add_dependency(verb_node, adv_node, r'advmod')
    pattern.add_dependency(adv_node, ccomp_node, r"ccomp|xcomp")

    # phase 1: collect (subject, predicate-words, clause) triples
    patterns = []
    for match in dep_graph.match(pattern):

        # dep_verb_node = match[verb_node]
        dep_adv_node = match[adv_node]
        dep_ccomp_node = match[ccomp_node]

        # already converted by an earlier rule
        if oia_graph.has_relation(dep_adv_node, dep_ccomp_node):
            continue

        # case words between the adverb and the clause join the predicate
        dep_case_nodes = [
            n for n, l in dep_graph.children(dep_ccomp_node,
                                             filter=lambda n, l:
                                             "case" == l and dep_adv_node
                                             .LOC < n.LOC < dep_ccomp_node.LOC)
        ]

        if dep_case_nodes:
            # keep only the case words adjacent to the first one
            dep_case_nodes = continuous_component(dep_case_nodes,
                                                  dep_case_nodes[0])
            predicate_nodes = [dep_adv_node] + dep_case_nodes
            predicate_nodes.sort(key=lambda n: n.LOC)
        else:
            predicate_nodes = [dep_adv_node]

        # the advmod parent (if any) acts as the subject of the predicate
        dep_subj_nodes = [
            n for n, l in dep_graph.parents(dep_adv_node,
                                            filter=lambda n, l:
                                            "advmod" == l and n.UPOS in
                                            {"ADV", "X", "NOUN"})
        ]

        if len(dep_subj_nodes) > 1:
            raise Exception("Multiple subject")
        elif len(dep_subj_nodes) > 0:
            dep_subj_node = dep_subj_nodes[0]
        else:
            dep_subj_node = None

        patterns.append([dep_subj_node, predicate_nodes, dep_ccomp_node])

    # phase 2: apply the collected rewrites
    for dep_subj_node, predicate_nodes, dep_ccomp_node in patterns:

        if len(predicate_nodes) > 1:
            # merge adverb + case words into one synthetic ADV node
            new_pred_node = dep_graph.create_node(
                ID=" ".join([x.ID for x in predicate_nodes]),
                FORM=" ".join([x.FORM for x in predicate_nodes]),
                LEMMA=" ".join([x.LEMMA for x in predicate_nodes]),
                UPOS="ADV",
                LOC=predicate_nodes[0].LOC)
            new_pred_node.aux = True

            dep_graph.replace_nodes(predicate_nodes, new_pred_node)
            dep_graph.remove_dependency(dep_ccomp_node, new_pred_node)
        else:
            new_pred_node = predicate_nodes[0]

        oia_pred_node = oia_graph.add_words(new_pred_node.position)

        if dep_subj_node:
            oia_subj_node = oia_graph.add_words(dep_subj_node.position)
            oia_graph.add_argument(oia_pred_node, oia_subj_node, 1, mod=True)
        else:
            # NOTE(review): the ccomp clause is attached only when there is
            # no subject; with a subject the clause argument is never added
            # here -- confirm this is intentional
            oia_ccomp_node = oia_graph.add_words(dep_ccomp_node.position)
            oia_graph.add_argument(oia_pred_node, oia_ccomp_node, 2)
def adv_verb_modifier(dep_graph: DependencyGraph, oia_graph: OIAGraph,
                      context: UD2OIAContext):
    """
    Attach an adverbial modifier to its verb in the OIA graph.

    the adv before the verb should be processed by verb_phrase;
    this converter should process the adv after the verb,
    e.g. "verb1 in order to verb2".

    Three outcomes per match:
      * adverb has a case-less oblique child: the adverb becomes a
        2-argument predicate over the verb and the oblique;
      * adverb has a mark child: the mark becomes the predicate over the
        verb and the adverb;
      * otherwise: the adverb is added as a plain modifier of the verb.

    :param dep_graph: dependency graph (read only here)
    :param oia_graph: OIA graph receiving the new nodes/edges
    :param context: conversion context used to skip processed pairs
    :return: None
    """
    pattern = DependencyGraph()
    # TODO: it seems that in UD labeling, adv is used instead of adj for noun
    verb_node = DependencyGraphNode(
        UPOS="VERB|NOUN|PROPN|AUX|PRON")  # aux is for be word
    adv_node = DependencyGraphNode(UPOS="ADV|X|NOUN|ADJ|VERB")

    pattern.add_nodes([verb_node, adv_node])
    pattern.add_dependency(verb_node, adv_node, r'advmod')

    for match in dep_graph.match(pattern):

        dep_verb_node = match[verb_node]
        dep_adv_node = match[adv_node]

        # skip pairs another converter has already handled
        if context.is_processed(dep_verb_node, dep_adv_node):
            continue

        if oia_graph.has_relation(dep_verb_node, dep_adv_node):
            continue

        obl_children = [
            x for x, l in dep_graph.children(
                dep_adv_node, filter=lambda n, l: l.startswith("obl"))
        ]

        obl_node = None
        obl_has_case = False
        if len(obl_children) == 1:
            obl_node = obl_children[0]
            case_nodes = list(n for n, l in dep_graph.children(
                obl_node, filter=lambda n, l: "case" in l))
            if case_nodes:
                # if obl with case, let the oblique to process it
                obl_has_case = True

        mark_children = [
            x for x, l in dep_graph.children(
                dep_adv_node, filter=lambda n, l: l.startswith("mark"))
        ]

        oia_verb_node = oia_graph.add_words(dep_verb_node.position)
        oia_adv_node = oia_graph.add_words(dep_adv_node.position)

        if obl_node and not obl_has_case:
            # adverb acts as a predicate: adv(verb, oblique)
            # arg_nodes = list(dep_graph.offsprings(obl_node))
            # arg_nodes.sort(key=lambda x: x.LOC)
            # arg_words = [x.ID for x in arg_nodes]
            # head = obl_node.ID
            oia_arg_node = oia_graph.add_words(obl_node.position)
            oia_graph.add_argument(oia_adv_node, oia_verb_node, 1, mod=True)
            oia_graph.add_argument(oia_adv_node, oia_arg_node, 2)
        else:
            if mark_children:
                # mark word ("in order to", ...) becomes the predicate:
                # mark(verb, adv)
                mark_node = mark_children[0]
                oia_pred_node = oia_graph.add_words(mark_node.position)
                oia_graph.add_argument(oia_pred_node, oia_verb_node, 1,
                                       mod=True)
                oia_graph.add_argument(oia_pred_node, oia_adv_node, 2)
            else:
                # plain adverbial modification
                oia_graph.add_mod(oia_adv_node, oia_verb_node)
def simple_clause(dep_graph: DependencyGraph, oia_graph: OIAGraph, context: UD2OIAContext): """ :TODO badcase Attached is a new link :param dep_graph: :param oia_graph: :return: """ # for node in dep_graph.nodes(): # print('node:',node) for pred_node in dep_graph.nodes( filter=lambda x: x.UPOS in {"VERB", "ADJ", "NOUN", "AUX", "ADV"}): # ADJ is for "With the demand so high," # NOUN is for "X the best for Y" # AUX is for have in "I have a cat" # print('pred_node', pred_node) expl = None nsubj = None subj = None objs = [] for child, rel in dep_graph.children(pred_node): # print('child node:', child) # print('child rel:', rel) if ('nsubj' in rel or "csubj" in rel): # and ":xsubj" not in rel: nsubj = child elif rel.startswith('obj'): objs.append((child, 1)) elif rel.startswith('iobj'): objs.append((child, 0)) elif 'ccomp' in rel or "xcomp" in rel: # and child.UPOS == "VERB": objs.append((child, 2)) elif "expl" in rel: expl = child if nsubj: # if pred_node.LOC < nsubj.LOC: # # TODO: in what situation? 
# objs.insert(0, nsubj) # else: subj = nsubj if expl: # It VERB subj that # VERB subj it that if expl.LOC < pred_node.LOC: subj = expl objs.insert(0, (subj, -1)) else: # expl.LOC > pred_node.LOC: objs.insert(0, (expl, -1)) if not subj and not objs: continue pred_node = oia_graph.add_words(pred_node.position) if not pred_node: continue arg_index = 1 if subj is not None: if not oia_graph.has_relation(pred_node, subj): subj_node = oia_graph.add_words(subj.position) oia_graph.add_argument(pred_node, subj_node, arg_index) arg_index += 1 objs.sort(key=lambda x: x[1]) for obj, weight in objs: # print('obj:',obj) oia_obj_node = oia_graph.add_words(obj.position) # def __sconj_node(n): # # that conj is ommited # return (n.UPOS == "SCONJ" and n.LEMMA not in {"that"}) def __adv_question_node(n): return ((n.UPOS == "ADV" and n.LEMMA in {"when", "where", "how", "whether"})) # # def __pron_question_node(n): # return (n.UPOS == "PRON" and n.LEMMA in {"what", "who", "which"}) # def __interested_node2(n): # # that conj is ommited # return (n.UPOS == "PART") # sconj_nodes = [n for n, l in dep_graph.children(obj, # filter=lambda n,l: l == "mark" and __sconj_node(n))] adv_question_nodes = [ n for n, l in dep_graph.children( obj, filter=lambda n, l: l == "mark" and __adv_question_node(n)) ] # subj_question_nodes = [n for n, l in dep_graph.children(obj, # filter=lambda n,l: "subj" in l and __pron_question_node(n))] # # obj_question_nodes = [n for n, l in dep_graph.children(obj, # filter=lambda n, # l: ("obj" in l or "comp") in l and __pron_question_node( # n))] # nodes_of_interests2 = [n for n, l in dep_graph.children(obj, # filter=lambda n,l: l == "advmod" and __interested_node2(n))] # print('nodes_of_interests:', nodes_of_interests) # if nodes_of_interests2: # assert len(nodes_of_interests2) == 1 # interest_node = nodes_of_interests2[0] # oia_interest_node = oia_graph.add_word_with_head(interest_node.LOC) # oia_graph.add_argument(pred_node, oia_interest_node, arg_index) # # 
oia_graph.add_function(oia_interest_node, oia_obj_node) # arg_index += 1 # oia_graph.add_argument(oia_interest_node, oia_obj_node, arg_index) # arg_index += 1 if adv_question_nodes: assert len(adv_question_nodes) == 1 interest_node = adv_question_nodes[0] oia_interest_node = oia_graph.add_words(interest_node.position) oia_graph.add_argument(pred_node, oia_interest_node, arg_index) oia_graph.add_function(oia_interest_node, oia_obj_node) else: if not oia_graph.has_relation(pred_node, obj): oia_graph.add_argument(pred_node, oia_obj_node, arg_index) arg_index += 1 pattern = DependencyGraph() parent_pred = pattern.create_node() child_pred = pattern.create_node() question_word = pattern.create_node(LEMMA=r'what|who') pattern.add_dependency(parent_pred, child_pred, r'subj|nsubj|iobj|obj|xcomp|ccomp') pattern.add_dependency(parent_pred, question_word, r'subj|nsubj|iobj|obj|xcomp|ccomp') pattern.add_dependency(child_pred, question_word, r'subj|nsubj|iobj|obj|xcomp|ccomp') for match in dep_graph.match(pattern): dep_parent_pred, dep_child_pred, dep_question_word = [ match[x] for x in [parent_pred, child_pred, question_word] ] oia_parent_pred, oia_child_pred, oia_question_word = [ oia_graph.add_words(x.position) for x in [dep_parent_pred, dep_child_pred, dep_question_word] ] oia_question_word.is_func = True rel = oia_graph.get_edge(oia_child_pred, oia_question_word) oia_graph.remove_relation(oia_child_pred, oia_question_word) oia_graph.remove_relation(oia_parent_pred, oia_child_pred) oia_graph.add_relation(oia_question_word, oia_child_pred, "mod_by:" + rel.label)
def object_relative_clause(dep_graph: DependencyGraph, oia_graph: OIAGraph,
                           context: UD2OIAContext):
    """
    ##### Object-extracted/referred relative clause #####
    ##### the person that Andy knows #####

    Matches "entity -acl:relcl-> verb -subj-> subject", attaches the
    clause's subject as argument 1 and the modified entity as a modifying
    argument 2 of the clause verb, and resolves an optional "ref" pronoun
    (which/whom/...) between entity and verb.

    :param dep_graph: dependency graph (read only)
    :param oia_graph: OIA graph receiving the predication
    :param context: conversion context; marks handled pairs as processed
    :raises Exception: when the verb-ref relation is neither obj nor advmod
    :return: None
    """
    pattern = DependencyGraph()
    verb_node = DependencyGraphNode()
    entity_node = DependencyGraphNode()
    subj_node = DependencyGraphNode()

    pattern.add_nodes([verb_node, entity_node, subj_node])
    pattern.add_dependency(verb_node, subj_node, r'\w*subj\w*')
    pattern.add_dependency(entity_node, verb_node, r'\w*acl:relcl\w*')

    for match in dep_graph.match(pattern):

        dep_entity_node = match[entity_node]
        dep_subj_node = match[subj_node]
        dep_verb_node = match[verb_node]

        # relative pronouns as subjects are handled elsewhere
        if dep_subj_node.LEMMA in {"what", "who", "which", "that"}:
            continue

        logger.debug("we found a objective relative clause")
        logger.debug("entity: {0}".format(dep_entity_node))
        logger.debug("subject: {0}".format(dep_subj_node))
        logger.debug("verb: {0}".format(dep_verb_node))

        if context.is_processed(dep_entity_node, dep_verb_node):
            logger.debug("processed")
            continue

        context.processed(dep_verb_node, dep_subj_node)
        context.processed(dep_entity_node, dep_verb_node)

        oia_entity_node = oia_graph.add_words(dep_entity_node.position)
        oia_verb_node = oia_graph.add_words(dep_verb_node.position)
        oia_subj_node = oia_graph.add_words(dep_subj_node.position)

        if oia_graph.has_relation(oia_entity_node, oia_verb_node):
            logger.debug("has relation between entity and verb")
            continue

        oia_graph.add_argument(oia_verb_node, oia_subj_node, 1)

        def __valid_ref(n, l):
            # a "ref" child located between the entity and the clause verb
            return l == "ref" and dep_entity_node.LOC < n.LOC < dep_verb_node.LOC

        ref_nodes = list(n for n, l in dep_graph.children(dep_entity_node,
                                                          filter=__valid_ref))
        ref_nodes.sort(key=lambda x: x.LOC)

        if ref_nodes:
            ref_node = ref_nodes[-1]

            oia_ref_node = oia_graph.add_words(ref_node.position)
            oia_graph.add_ref(oia_entity_node, oia_ref_node)

            logger.debug("we are coping with ref between:")
            logger.debug(dep_verb_node)
            logger.debug(ref_node)

            ref_relation = dep_graph.get_dependency(dep_verb_node, ref_node)

            case_nodes = list(n for n, l in dep_graph.children(
                ref_node, filter=lambda n, l: "case" in l))
            case_nodes.sort(key=lambda x: x.LOC)

            if ref_relation:
                if case_nodes:
                    # with which xxxx, the with will become the root pred
                    case_node = case_nodes[-1]
                    oia_case_node = oia_graph.add_words(case_node.position)

                    oia_graph.add_argument(oia_case_node, oia_verb_node, 1,
                                           mod=True)
                    oia_graph.add_argument(oia_case_node, oia_ref_node, 2)
                    oia_graph.add_mod(oia_verb_node, oia_entity_node)
                else:
                    if "obj" in ref_relation:
                        oia_graph.add_argument(oia_verb_node, oia_ref_node, 2)
                    elif ref_relation == "advmod":
                        oia_graph.add_mod(oia_ref_node, oia_verb_node)
                    else:
                        raise Exception(
                            "unknown relation: {}".format(ref_relation))

        # oia_graph.add_argument(oia_verb_node, oia_entity_node, 2, mod=True)
        oia_graph.add_argument(oia_verb_node, oia_subj_node, 1)
        oia_graph.add_argument(oia_verb_node, oia_entity_node, 2, mod=True)

        rels = dep_graph.get_dependency(dep_entity_node, dep_verb_node)
        #if rels.endswith("obj"):
        # a ccomp child of the clause verb becomes argument 3
        for node, l in dep_graph.children(dep_verb_node):
            if l == "ccomp":
                oia_ccomp_node = oia_graph.add_words(node.position)
                oia_graph.add_argument(oia_verb_node, oia_ccomp_node, 3)
def parataxis(dep_graph: DependencyGraph, oia_graph: OIAGraph,
              context: UD2OIAContext):
    """
    Combine clauses joined by the UD "parataxis" relation into one OIA
    predication.

    For each node with parataxis children, the head and its children are
    ordered by location; for every adjacent pair a connective is searched
    for (an SCONJ/"so" adverb on the later clause, or a punctuation mark
    between the two).  With no connectives at all an auxiliary "PARATAXIS"
    predicate is used; otherwise the connectives form the predicate, and
    each clause is attached as a numbered argument.

    :param dep_graph: dependency graph (read only)
    :param oia_graph: OIA graph receiving the predication
    :param context: conversion context (unused here)
    :return: None
    """
    for head in list(dep_graph.nodes()):

        clauses = [
            child for child, rel in dep_graph.children(head)
            if "parataxis" == rel
        ]
        if not clauses:
            continue

        clauses.append(head)
        clauses.sort(key=lambda x: x.LOC)

        connectives = []
        for former, latter in more_itertools.pairwise(clauses):

            adv_links = [
                n for n, l in dep_graph.children(
                    latter,
                    filter=lambda n, l: "advmod" in l and
                    (former.LOC < n.LOC < latter.LOC) and
                    (n.UPOS == "SCONJ" or n.LEMMA in {"so"}))
            ]

            punct_links = [
                n for n, l in dep_graph.children(
                    head,
                    filter=lambda n, l: "punct" in l and n.FORM in
                    {":", ";", "--", ","} and
                    (former.LOC < n.LOC < latter.LOC))
            ]

            if adv_links:
                connective = adv_links[0]
                # dep_graph.remove_dependency(para, dep_con)
                # otherwise, the dep_con will be recovered by adv_modifier,
                # may cause further question
            elif punct_links:
                connective = punct_links[0]
            else:
                connective = None

            connectives.append(connective)

        if all(c is None for c in connectives):
            oia_pred_node = oia_graph.add_aux("PARATAXIS")
        elif len(connectives) == 1:
            oia_pred_node = oia_graph.add_words(connectives[0].position)
        else:
            # build a template: {1} conn {2} conn {3} ...
            position = ["{1}"]
            for i, connective in enumerate(connectives):
                if connective is not None:
                    position.extend(connective.position)
                position.append("{{{0}}}".format(i + 2))
            oia_pred_node = oia_graph.add_words(position)

        for idx, clause in enumerate(clauses):
            oia_arg_node = oia_graph.add_words(clause.position)
            oia_graph.add_argument(oia_pred_node, oia_arg_node, idx + 1)
def gradation(dep_graph: DependencyGraph):
    """
    TODO: do not match with the tech report, and the verb is not considered

    ##### Comparative #####
    ##### Periphrastic gradation #####
    ##### He runs faster than her #####
    ##### Martin is more intelligent than Donald #####
    ##### He is a nicer person than Tom #####
    She is more than a regular cook

    Merges the comparative marker ("more ... than" / "-er than") into a
    single ADP node and rewires the comparison object as an advcl/obl of
    the head, with the merged node as its mark/case.

    :param dep_graph: dependency graph, mutated in place
    :return: None
    """
    pattern = DependencyGraph()

    verb_node = pattern.create_node(UPOS="VERB|NOUN|PRON|PROPN|SYM")
    advj_node = pattern.create_node(UPOS="ADJ|ADV", FEATS={"Degree": "Cmp"})
    than_node = pattern.create_node(FORM="than")
    obj_node = pattern.create_node()

    pattern.add_dependency(verb_node, advj_node, r'advmod|amod')
    pattern.add_dependency(advj_node, obj_node,
                           r'\w*(nmod:than|obl:than|advcl:than)\w*')
    pattern.add_dependency(obj_node, than_node, r'\w*case|mark\w*')

    for match in list(dep_graph.match(pattern)):

        dep_verb_node = match[verb_node]
        dep_advj_node = match[advj_node]
        dep_than_node = match[than_node]
        dep_obj_node = match[obj_node]

        def __valid_mod(n, l):
            # an amod/advmod (e.g. "more") located before the comparative
            return (l == "amod" or l == "advmod") and in_interval(
                n, None, dep_advj_node)

        aux_node = list(dep_graph.children(dep_advj_node, filter=__valid_mod))

        if aux_node:
            aux_node = aux_node[0][0]
            offsprings = dep_graph.offsprings(aux_node)
            # NOTE(review): this merges "more ... than" but not the ADJ/ADV
            # itself -- confirm offsprings(aux) is the intended span
            more_than_nodes = offsprings + [dep_than_node]
        else:
            # was a tuple; unified to a list for consistency with the
            # branch above (both are fed to merge/replace the same way)
            more_than_nodes = [dep_advj_node, dep_than_node]

        dep_more_than_node = merge_dep_nodes(more_than_nodes,
                                             UPOS="ADP",
                                             LOC=dep_than_node.LOC)

        dep_graph.replace_nodes(more_than_nodes, dep_more_than_node)
        # drop stale links in both directions before rewiring
        dep_graph.remove_dependency(dep_obj_node, dep_more_than_node)
        dep_graph.remove_dependency(dep_more_than_node, dep_obj_node)
        dep_graph.remove_dependency(dep_verb_node, dep_more_than_node)

        if dep_verb_node.UPOS == "VERB":
            dep_graph.set_dependency(dep_verb_node, dep_obj_node,
                                     "advcl:" + dep_more_than_node.FORM)
            dep_graph.set_dependency(dep_obj_node, dep_more_than_node, "mark")
        else:
            dep_graph.set_dependency(dep_verb_node, dep_obj_node,
                                     "obl:" + dep_more_than_node.FORM)
            dep_graph.set_dependency(dep_obj_node, dep_more_than_node, "case")
def general_question(dep_graph: DependencyGraph, oia_graph: OIAGraph,
                     context: UD2OIAContext):
    """
    Detect yes/no (general) questions by subject/auxiliary inversion and
    attach an auxiliary "WHETHER" function node to the clause verb.

    A verb clause is a question when:
      * "there/here be" with "be" before "there/here", or
      * a be-verb located before its subject, or
      * an auxiliary located before the subject.
    Clauses containing wh-words anywhere in the verb's offspring are
    skipped (they are content questions, not general ones).

    :param dep_graph: dependency graph (read only)
    :param oia_graph: OIA graph receiving the WHETHER node
    :param context: conversion context (unused here)
    :return: None
    """
    for verb in dep_graph.nodes(filter=lambda n: n.UPOS == "VERB"):

        # wh-questions are excluded
        if any(
                any(x in n.LEMMA
                    for x in {"what", "how", "why", "when", "where"})
                for n in dep_graph.offsprings(verb)):
            continue

        parents = [n for n, _ in dep_graph.parents(verb)]

        # if not(len(parents) == 1 and parents[0].ID == "0"):
        #     continue

        # check subj and aux
        subj = None
        aux = None
        for child, rel in dep_graph.children(verb):
            if "subj" in rel:
                subj = child
            if "aux" in rel:
                aux = child

        is_be_verb = False
        if not isinstance(verb, DependencyGraphSuperNode):
            is_be_verb = verb.LEMMA == "be"
        else:
            assert isinstance(verb, DependencyGraphSuperNode)
            # for a merged verb, aux must live inside the super node
            assert aux is None
            for n in verb.nodes:
                if isinstance(n, DependencyGraphNode):
                    if n.LEMMA == "be":
                        is_be_verb = True
                    # print('verb.nodes:', str(" ".join(str(xx.LEMMA) for xx in verb.nodes)))
                    # print('is_be_verb222:', is_be_verb)
                    if n.UPOS == "AUX":
                        aux = n

        # print('is_be_verb:', is_be_verb)
        if aux is None and not is_be_verb:
            # cannot be a general question
            continue

        # an expletive ("there is ...") takes the subject role
        expl_child = [n for n, l in dep_graph.children(verb) if l == "expl"]
        if expl_child:
            assert len(expl_child) == 1
            subj = expl_child[0]

        if subj is None:
            logger.warning(
                "subject is none, cannot decide whether it is a question")
            continue

        # print('subj.LOC:', subj.LOC)
        # print('subj.LOC type:', type(subj.LOC))

        oia_verb_node = oia_graph.add_words(verb.position)

        is_there_be_verb = is_be_verb and ("there" in verb.LEMMA.split(' ')
                                           or "here" in verb.LEMMA.split(' '))

        is_question = False
        if is_there_be_verb:
            # "Is there ..." vs "There is ...": compare be/there positions
            assert isinstance(verb, DependencyGraphSuperNode)
            be_node = [n for n in verb.nodes if n.LEMMA == "be"][0]
            there_node = [
                n for n in verb.nodes
                if n.LEMMA == "there" or n.LEMMA == "here"
            ][0]
            # print('there_node:', there_node)
            if be_node.LOC < there_node.LOC:
                is_question = True
        elif (is_be_verb and verb.LOC < subj.LOC):
            is_question = True
        elif (aux is not None and aux.LOC < subj.LOC):
            is_question = True

        if is_question:
            # if aux is not None and aux.LEMMA == "do":
            #     oia_question_node = oia_graph.add_word_with_head(aux.LOC)
            # else:
            oia_question_node = oia_graph.add_aux("WHETHER")
            oia_graph.add_function(oia_question_node, oia_verb_node)
def advcl_mark_sconj(dep_graph: DependencyGraph, oia_graph: OIAGraph,
                     context: UD2OIAContext):
    """
    Convert an adverbial clause introduced by a conjunction mark
    (pred1 -advcl-> pred2 -mark-> sconj) into an OIA predication headed
    by the conjunction word.

    Only lemmas listed in CONJUNCTION_WORDS[language] are handled.  An
    "if" mark additionally absorbs a matching "then" on the main predicate
    into an "if {1} then {2}" template.  In every handled case the
    subordinate clause becomes argument 1 and the main clause argument 2.

    :param dep_graph: dependency graph (read only)
    :param oia_graph: OIA graph receiving the predication
    :param context: conversion context; marks handled pairs as processed
    :return: None
    """
    pattern = DependencyGraph()
    pred1_node = pattern.create_node()
    pred2_node = pattern.create_node()
    # sconj_node = pattern.create_node(UPOS="SCONJ")
    sconj_node = pattern.create_node()

    pattern.add_dependency(pred1_node, pred2_node, r'advcl\w*')
    # pattern.add_dependency(pred1_node, pred2_node, r'\w*')
    # pattern.add_dependency(pred2_node, sconj_node, r'mark|advmod')
    pattern.add_dependency(pred2_node, sconj_node, 'mark')

    for match in list(dep_graph.match(pattern)):

        dep_pred1_node = match[pred1_node]
        dep_pred2_node = match[pred2_node]
        dep_sconj_node = match[sconj_node]

        # advcl_rel = dep_graph.get_dependency(dep_pred1_node, dep_pred2_node)

        if dep_sconj_node.LEMMA not in CONJUNCTION_WORDS[language]:
            continue

        context.processed(dep_pred2_node, dep_sconj_node)
        context.processed(dep_pred1_node, dep_pred2_node)

        oia_pred1_node = oia_graph.add_words(dep_pred1_node.position)
        oia_pred2_node = oia_graph.add_words(dep_pred2_node.position)

        if dep_sconj_node.LEMMA == "if":
            # check whether there is "then"
            dep_then_nodes = [
                n for n, l in dep_graph.children(dep_pred1_node)
                if n.LEMMA == "then" and l == "advmod"
            ]

            if dep_then_nodes:
                assert len(dep_then_nodes) == 1
                dep_then_node = dep_then_nodes[0]
                context.processed(dep_pred1_node, dep_then_node)

                # build the "if {1} then {2}" template predicate
                if_then_position = dep_sconj_node.position + [
                    "{1}"
                ] + dep_then_node.position + ["{2}"]
                oia_condition_node = oia_graph.add_words(if_then_position)
            else:
                oia_condition_node = oia_graph.add_words(
                    dep_sconj_node.position)
        else:
            oia_condition_node = oia_graph.add_words(dep_sconj_node.position)

        # The guard above already guarantees the lemma is in
        # CONJUNCTION_WORDS[language], so the former inner re-check (whose
        # else-branch added mod=True arguments) was unreachable dead code
        # and has been removed; both branches now share these two calls.
        oia_graph.add_argument(oia_condition_node, oia_pred2_node, 1)
        oia_graph.add_argument(oia_condition_node, oia_pred1_node, 2)
def verb_phrase(dep_graph: DependencyGraph):
    """
    ##### Merging aux and cop with their head VERB #####
    Cases:

    Collect each VERB/AUX root together with its adverbial modifiers,
    compound parts and aux children, and merge the contiguous span into a
    single verb node.

    :param dep_graph: dependency graph, mutated in place
    :return: None
    """
    verb_phrases = []

    for node in dep_graph.nodes(filter=lambda x: x.UPOS in {"VERB", "AUX"}):

        if node.UPOS == "AUX":
            # an AUX that is itself the aux of another verb is merged
            # when that verb is processed
            parent = [
                n for n, l in dep_graph.parents(
                    node, filter=lambda n, l: l == "aux")
            ]
            if len(parent) > 0:
                continue

        # if "VerbForm" in node.FEATS and "Ger" in node.FEATS["VerbForm"]:
        #     continue

        if "Tense" in node.FEATS and "Past" in node.FEATS["Tense"]:
            # if the verb is before the noun, it will be processed by
            # noun_phrase and taken as a part of the noun
            parent = [
                n for n, l in dep_graph.parents(
                    node,
                    filter=lambda n, l: l == "amod" and node.LOC < n.LOC)
            ]
            if len(parent) > 0:
                continue

        # logger.debug("We are checking node {0}".format(node))

        root = node
        verbs = [root]
        for n, l in dep_graph.children(root):
            # skip children that also point back at the root
            if dep_graph.get_dependency(n, root):
                continue
            if n.LEMMA in {"so", "also", "why"}:
                continue
            if "advmod" in l:
                offsprings = list(dep_graph.offsprings(n))
                # an adverbial subtree containing predicates/pronouns is
                # a clause of its own, not part of the verb phrase
                if any(x.UPOS in {"VERB", "NOUN", "AUX", "PRON"}
                       for x in offsprings):
                    continue
                verbs.extend(offsprings)
            elif "compound" in l:
                verbs.append(n)

        # keep only words before the root, plus compound parts after it
        verbs = [
            x for x in verbs if x.LOC <= root.LOC
            or "compound" in dep_graph.get_dependency(root, x)
        ]

        # logger.debug("Verb: before continuous component ")
        # logger.debug("\n".join(str(verb) for verb in verbs))

        verbs = continuous_component(verbs, root)

        # add aux
        verbs.extend(n for n, l in dep_graph.children(root) if "aux" in l)

        # logger.debug("Verb: after continuous component ")
        # for verb in verbs:
        #     logger.debug(verb)

        verbs.sort(key=lambda x: x.LOC)

        last_loc = verbs[-1].LOC

        # next_node = dep_graph.get_node_by_loc(last_loc + 1)
        # if next_node and next_node.LEMMA == "not":
        #     verbs.append(next_node)

        if len(verbs) > 1:
            verb_phrases.append((verbs, root))

    # merge after collection so node iteration is not disturbed
    for verbs, root in verb_phrases:
        verb_node = merge_dep_nodes(verbs,
                                    UPOS="VERB",
                                    LOC=root.LOC,
                                    FEATS=root.FEATS)
        dep_graph.replace_nodes(verbs, verb_node)