def ccomp_mark_sconj(dep_graph: DependencyGraph):
    """
    Merge an "as" subordinator into the verb that governs its clause.

    Example: "See them as they are".

    Looks for VERB -ccomp-> X -mark-> SCONJ. When the SCONJ lemma is "as",
    the mark edge is removed and the sequence
    ``[verb, "{1}", sconj, "{2}"]`` is merged into one node that keeps the
    verb's UPOS and LOC and replaces the merged members in the graph.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    pattern = DependencyGraph()
    governor = pattern.create_node(UPOS="VERB")
    complement = pattern.create_node()
    subordinator = pattern.create_node(UPOS="SCONJ")
    pattern.add_dependency(governor, complement, r'ccomp')
    pattern.add_dependency(complement, subordinator, 'mark')

    # Materialise the matches first: the graph is edited inside the loop.
    for found in list(dep_graph.match(pattern)):
        verb = found[governor]
        clause_head = found[complement]
        sconj = found[subordinator]

        if sconj.LEMMA != "as":
            continue

        dep_graph.remove_dependency(clause_head, sconj)

        members = [verb, "{1}", sconj, "{2}"]
        merged = merge_dep_nodes(members, UPOS=verb.UPOS, LOC=verb.LOC)
        dep_graph.replace_nodes(members, merged)
def whose_noun(dep_graph: DependencyGraph):
    """
    Merge "whose" with the noun it possesses into a single noun node.

    Matches NOUN -nmod:poss-> OWNER -ref-> "whose". For every match the
    OWNER -> whose edge and the NOUN -nmod:poss-> OWNER edge are removed and
    ``[whose, noun]`` is replaced by one merged node that keeps the noun's
    UPOS and LOC.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    pattern = DependencyGraph()
    noun_node = pattern.create_node(UPOS="NOUN|PROPN|PRON|X|NUM|SYM")
    owner_node = pattern.create_node()
    whose_node = pattern.create_node(LEMMA="whose")
    pattern.add_dependency(noun_node, owner_node, "nmod:poss")
    pattern.add_dependency(owner_node, whose_node, "ref")

    # Collect first, rewrite afterwards, so matching never sees a half-edited graph.
    whose_noun_phrase = [
        (match[owner_node], match[whose_node], match[noun_node])
        for match in dep_graph.match(pattern)
    ]

    for owner, whose, noun in whose_noun_phrase:
        merged_noun = merge_dep_nodes([whose, noun],
                                      UPOS=noun.UPOS,
                                      LOC=noun.LOC)

        # The edges must be addressed through the matched graph node `owner`;
        # `owner_node` is only the placeholder that lives in the pattern graph.
        dep_graph.remove_dependency(owner, whose)
        dep_graph.remove_dependency(noun, owner, "nmod:poss")

        dep_graph.replace_nodes([whose, noun], merged_noun)
def ever_since(dep_graph: DependencyGraph):
    """
    Merge an "ever" that is directly followed by "since" into one node.

    An "ever" node qualifies when some "since" node has LOC equal to the
    "ever" LOC plus one. For each qualifying pair, every parent edge of
    "ever" whose relation contains 'advmod' is removed, then the pair is
    replaced by a merged node carrying the UPOS and LOC of "since".

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    ever_nodes, since_nodes = [], []
    for candidate in dep_graph.nodes():
        if candidate.LEMMA == "ever":
            ever_nodes.append(candidate)
        elif candidate.LEMMA == "since":
            since_nodes.append(candidate)

    if not (ever_nodes and since_nodes):
        return

    since_locs = [n.LOC for n in since_nodes]

    pairs = []
    stale_edges = []
    for ever_node in ever_nodes:
        try:
            partner_idx = since_locs.index(ever_node.LOC + 1)
        except ValueError:
            # no "since" right after this "ever"
            continue

        pairs.append((ever_node, since_nodes[partner_idx]))
        stale_edges.extend(
            (parent, ever_node, 'advmod')
            for parent, parent_rel in dep_graph.parents(ever_node)
            if 'advmod' in parent_rel)

    for source, target, label in stale_edges:
        dep_graph.remove_dependency(source, target, label)

    for ever_node, since_node in pairs:
        members = [ever_node, since_node]
        merged = merge_dep_nodes(members,
                                 UPOS=since_node.UPOS,
                                 LOC=since_node.LOC)
        dep_graph.replace_nodes(members, merged)
def separated_asas(dep_graph: DependencyGraph):
    """
    Equality comparison with a separated "as ... as": "A is as X a C as B".

    Matches NOUN -amod-> ADJ -advmod-> as1 -advcl:as-> OBJ -mark-> as2 and
    keeps only matches laid out as ``as1 < ADJ < NOUN < as2 < OBJ`` by LOC.
    Every node located between as1 and ADJ (inclusive), plus as2, is merged
    into a single ADJ node placed at as2's LOC. The edges between that merged
    node and OBJ / NOUN are removed and NOUN gets an ``acl:<merged FORM>``
    edge to OBJ.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    pattern = DependencyGraph()
    adj_node = DependencyGraphNode(UPOS="ADJ")
    noun_node = DependencyGraphNode(UPOS="NOUN")
    as1_node = DependencyGraphNode(FORM="as")
    as2_node = DependencyGraphNode(FORM="as")
    obj_node = DependencyGraphNode()
    pattern.add_nodes([noun_node, adj_node, as1_node, as2_node, obj_node])
    pattern.add_dependency(noun_node, adj_node, r'amod')
    pattern.add_dependency(adj_node, as1_node, r'\w*advmod\w*')
    pattern.add_dependency(as1_node, obj_node, r'\w*advcl:as\w*')
    pattern.add_dependency(obj_node, as2_node, r'mark')

    planned = []
    for found in dep_graph.match(pattern):
        noun = found[noun_node]
        adj = found[adj_node]
        first_as = found[as1_node]
        second_as = found[as2_node]
        compared = found[obj_node]

        in_order = (first_as.LOC < adj.LOC < noun.LOC < second_as.LOC
                    < compared.LOC)
        if not in_order:
            continue

        span = [
            n for n in dep_graph.nodes()
            if first_as.LOC <= n.LOC <= adj.LOC
        ]
        span.append(second_as)
        span = sorted(span, key=lambda n: n.LOC)

        merged = merge_dep_nodes(span, UPOS="ADJ", LOC=second_as.LOC)
        planned.append((span, merged, noun, compared))

    for span, merged, noun, compared in planned:
        dep_graph.replace_nodes(span, merged)
        dep_graph.remove_dependency(merged, compared)
        dep_graph.remove_dependency(noun, merged)
        dep_graph.add_dependency(noun, compared, "acl:" + merged.FORM)
def be_adp_phrase(dep_graph: DependencyGraph):
    """
    Merge a copula with the adposition(s) that follow it, e.g. "is for xxx".

    Not meant for the cases handled elsewhere:
      1. if xxx is an adjective, be_adj_verb applies;
      2. if xxx is a NOUN, copula_phrase applies.

    There may be several adpositions: "the insurgency is out of the picture".

    Matches X -cop-> AUX together with X -case-> ADP. All ADP "case" children
    of X must sit strictly between the AUX and X (by LOC); the AUX and those
    ADP nodes are then merged into one AUX node re-attached to X as "cop".

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    pattern = DependencyGraph()
    some_node = pattern.create_node()
    adp_node = pattern.create_node(UPOS="ADP")
    be_node = pattern.create_node(UPOS="AUX")
    pattern.add_dependency(some_node, be_node, r'cop')
    pattern.add_dependency(some_node, adp_node, r'case')

    def _is_adp_case(n, l):
        return "case" in l and n.UPOS == "ADP"

    planned = []
    for found in dep_graph.match(pattern):
        copula = found[be_node]
        anchor = found[some_node]

        adpositions = [
            child
            for child, _ in dep_graph.children(anchor, filter=_is_adp_case)
        ]

        misplaced = any(not (copula.LOC < adp.LOC < anchor.LOC)
                        for adp in adpositions)
        if misplaced:
            continue

        planned.append((anchor, [copula] + adpositions, copula))

    for anchor, members, head in planned:
        if not all(dep_graph.get_node(m.ID) for m in members):
            # has been processed
            continue

        merged = merge_dep_nodes(members, UPOS="AUX", LOC=head.LOC)

        for member in members:
            dep_graph.remove_dependency(anchor, member)

        dep_graph.replace_nodes(members, merged)
        dep_graph.add_dependency(anchor, merged, "cop")
def amod_obl(dep_graph: DependencyGraph):
    """
    Merge an adjective with the adposition of its oblique, e.g. "more than",
    "successful by".

    Matches NOUN/PRON -amod-> ADJ -obl:*-> OBL -case-> ADP. A match is kept
    when the ADJ has at most one "obl" child, the ADP FORM appears among the
    values of the ADJ -> OBL relation, and the LOC order is
    ``NOUN < ADJ < ADP < OBL``. ADJ and ADP are merged into one ADP node and
    the noun is linked to the oblique with ``nmod:<merged FORM>``.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    pattern = DependencyGraph()
    noun_node = DependencyGraphNode(UPOS=r"NOUN|PRON")
    adj_node = DependencyGraphNode(UPOS="ADJ")
    adp_node = DependencyGraphNode(UPOS="ADP")
    obl_node = DependencyGraphNode()
    pattern.add_nodes([noun_node, adj_node, adp_node, obl_node])
    pattern.add_dependency(noun_node, adj_node, r'amod')
    pattern.add_dependency(adj_node, obl_node, r'obl:\w+')
    pattern.add_dependency(obl_node, adp_node, r'case')

    candidates = []
    for found in dep_graph.match(pattern):
        noun = found[noun_node]
        adj = found[adj_node]
        oblique = found[obl_node]
        adp = found[adp_node]

        obliques_of_adj = list(
            dep_graph.children(adj, filter=lambda n, l: "obl" in l))
        if len(obliques_of_adj) > 1:
            # similar in form to the one
            continue

        relation_values = dep_graph.get_dependency(adj, oblique).values()
        if adp.FORM not in relation_values:
            continue

        if not (noun.LOC < adj.LOC < adp.LOC < oblique.LOC):
            continue

        candidates.append((noun, adj, oblique, adp))

    for noun, adj, oblique, adp in candidates:
        members = [adj, adp]
        merged = merge_dep_nodes(members, UPOS="ADP", LOC=adp.LOC)

        dep_graph.remove_dependency(noun, adj)
        dep_graph.remove_dependency(adj, oblique)
        dep_graph.replace_nodes([adj, adp], merged)
        dep_graph.add_dependency(noun, oblique, "nmod:" + merged.FORM)
def process_head_conj(dep_graph: DependencyGraph):
    """
    Re-attach a sentence-initial "and" / "but" as the head of its parent.

    If the node at LOC 0 has lemma "and" or "but", every parent that reaches
    it through exactly the "cc" relation loses that edge and instead becomes
    a child of the conjunction via "arg_conj:1".

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    leading = dep_graph.get_node_by_loc(0)
    if not leading:
        return
    if leading.LEMMA not in {"and", "but"}:
        return

    # Collect the parents before touching any edge.
    governors = []
    for parent, label in dep_graph.parents(leading):
        if l_is_cc(label):
            governors.append(parent)

    for governor in governors:
        dep_graph.remove_dependency(governor, leading)
        dep_graph.add_dependency(leading, governor, "arg_conj:1")


def l_is_cc(label):
    """Return True when *label* is exactly the "cc" relation."""
    return label == "cc"
def multi_word_sconj(dep_graph: DependencyGraph):
    """
    Merge a multi-word subordinating conjunction into a single mark node.

    Matches VERB -advcl:*-> VERB2 -mark-> SCONJ where the SCONJ lemma occurs
    among the values of the VERB -> VERB2 relation. When the offsprings of
    the SCONJ do not amount to exactly one node, they are sorted by LOC,
    merged into one node (UPOS/LOC of the SCONJ), and the advcl / mark edges
    are rebuilt with the merged lemma as the advcl subtype.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    pattern = DependencyGraph()
    verb_node = pattern.create_node(UPOS="VERB")
    verb2_node = pattern.create_node(UPOS="VERB")
    mark_node = pattern.create_node(UPOS="SCONJ")
    pattern.add_dependency(verb_node, verb2_node, r'advcl:\w*')
    pattern.add_dependency(verb2_node, mark_node, r'mark')

    planned = []
    for found in dep_graph.match(pattern):
        main_verb = found[verb_node]
        clause_verb = found[verb2_node]
        marker = found[mark_node]

        advcl_values = dep_graph.get_dependency(main_verb,
                                                clause_verb).values()
        if marker.LEMMA not in advcl_values:
            continue

        pieces = list(dep_graph.offsprings(marker))
        if len(pieces) == 1:
            # single-word marker, nothing to merge
            continue

        pieces.sort(key=lambda n: n.LOC)
        planned.append((main_verb, clause_verb, marker, pieces))

    for main_verb, clause_verb, marker, pieces in planned:
        presence = [dep_graph.get_node(p.ID) for p in pieces]
        if not all(presence):
            continue

        dep_graph.remove_dependency(clause_verb, marker)
        dep_graph.remove_dependency(main_verb, clause_verb)

        merged = merge_dep_nodes(pieces, UPOS=marker.UPOS, LOC=marker.LOC)
        dep_graph.replace_nodes(pieces, merged)

        dep_graph.add_dependency(main_verb, clause_verb,
                                 "advcl:" + merged.LEMMA)
        dep_graph.add_dependency(clause_verb, merged, "mark")
def multi_words_mark(dep_graph: DependencyGraph):
    """
    Merge multi-word markers into a single "mark" node.

    Example: in "arise on to", the "on to" should be combined.

    For every node, the offsprings of all children attached with a relation
    containing "mark" are gathered. Groups of more than one node that contain
    no NOUN/NUM/VERB/ADJ/ADV/PRON are queued. Each queued group is widened to
    every node lying between its first and last LOC; unless that span holds a
    UPOS from NOUN_UPOS, it is merged and re-attached to the node as "mark".

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    blocking_upos = {"NOUN", "NUM", "VERB", "ADJ", "ADV", "PRON"}

    queued = []
    for head in dep_graph.nodes():
        pieces = []
        mark_children = dep_graph.children(head,
                                           filter=lambda n, l: "mark" in l)
        for child, _ in mark_children:
            pieces.extend(dep_graph.offsprings(child))

        if len(pieces) <= 1:
            # empty or single-word marker: nothing to merge
            continue
        if any(p.UPOS in blocking_upos for p in pieces):
            continue

        pieces.sort(key=lambda n: n.LOC)
        queued.append((head, pieces))

    for head, pieces in queued:
        presence = [dep_graph.get_node(p.ID) for p in pieces]
        if not all(presence):
            continue

        first_loc = pieces[0].LOC
        last_loc = pieces[-1].LOC
        span = sorted(
            [n for n in dep_graph.nodes() if first_loc <= n.LOC <= last_loc],
            key=lambda n: n.LOC)

        if any(s.UPOS in NOUN_UPOS for s in span):
            continue

        merged = merge_dep_nodes(span, UPOS=span[0].UPOS, LOC=span[0].LOC)

        for member in span:
            dep_graph.remove_dependency(head, member)
        dep_graph.replace_nodes(span, merged)
        dep_graph.add_dependency(head, merged, "mark")
def acl_loop(dep_graph: DependencyGraph):
    """
    Break two-node cycles created by relative clauses.

    For every edge n1 -> n2 whose relation contains "acl:relcl", the reverse
    edge n2 -> n1 is removed when its relation contains any of "obl",
    "nsubj", "obj", "mark" or "advmod".

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    back_labels = ("obl", "nsubj", "obj", "mark", "advmod")

    # Iterate over a snapshot: edges are removed inside the loop, and mutating
    # the graph while walking a live edge iterator is unsafe.
    for n1, n2, deps in list(dep_graph.dependencies()):
        if "acl:relcl" not in deps:
            continue

        back_deps = dep_graph.get_dependency(n2, n1)
        if any(label in back_deps for label in back_labels):
            dep_graph.remove_dependency(n2, n1)
def multi_words_cc(dep_graph: DependencyGraph):
    """
    Merge multi-word coordinating conjunctions into a single "cc" node.

    For every node, the offsprings of all children attached with exactly the
    "cc" relation are gathered. Groups of more than one node that contain no
    NOUN/NUM/VERB are queued. Each queued group is widened to every node
    lying between its first and last LOC; unless that span contains a
    NOUN/NUM/VERB, it is merged (UPOS/LOC of the leftmost member) and, if the
    head node still exists, re-attached to it as "cc".

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    content_upos = {"NOUN", "NUM", "VERB"}

    cc_phrases = []
    for node in dep_graph.nodes():
        marks = []
        for n, l in dep_graph.children(node, filter=lambda n, l: "cc" == l):
            marks.extend(dep_graph.offsprings(n))

        if not marks:
            continue

        if len(marks) > 1:
            if any(x.UPOS in content_upos for x in marks):
                continue

            marks.sort(key=lambda n: n.LOC)
            cc_phrases.append((node, marks))

    for node, marks in cc_phrases:
        mark_min_loc = marks[0].LOC
        mark_max_loc = marks[-1].LOC

        # Sort the recomputed span by LOC (as multi_words_mark does) so the
        # merged node follows sentence order and marks[0] is the leftmost
        # member regardless of the graph's iteration order.
        marks = sorted(
            (n for n in dep_graph.nodes()
             if mark_min_loc <= n.LOC <= mark_max_loc),
            key=lambda n: n.LOC)

        if not marks:
            # every node of the span disappeared in an earlier merge
            continue

        if any(x.UPOS in content_upos for x in marks):
            continue

        if not all(dep_graph.get_node(x.ID) for x in marks):
            continue

        new_mark_node = merge_dep_nodes(marks,
                                        UPOS=marks[0].UPOS,
                                        LOC=marks[0].LOC)

        dep_graph.replace_nodes(marks, new_mark_node)
        for mark in marks:
            dep_graph.remove_dependency(node, mark)

        # The head may itself have been inside the merged span.
        if dep_graph.get_node(node.ID):
            dep_graph.add_dependency(node, new_mark_node, "cc")
def be_not_phrase(dep_graph: DependencyGraph):
    """
    Merge "be" with a following "not" that is attached to its object.

    Matches BE -*obj*-> OBJ -*advmod*-> NOT where the lemma of BE contains
    the word "be", the lemma of NOT contains the word "not", and the LOC
    order satisfies ``BE <= NOT <= OBJ``. The advmod edge is removed and BE
    and NOT are replaced by one merged node with BE's UPOS and LOC.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    pattern = DependencyGraph()

    be_node = pattern.create_node()  # contain the be verb
    obj_node = pattern.create_node()
    not_node = pattern.create_node()

    for pattern_node in (be_node, obj_node, not_node):
        pattern.add_node(pattern_node)

    pattern.add_dependency(be_node, obj_node, r'\w*obj\w*')
    pattern.add_dependency(obj_node, not_node, r'\w*advmod\w*')

    selected = []
    for found in dep_graph.match(pattern):
        copula = found[be_node]
        complement = found[obj_node]
        negation = found[not_node]

        if "be" not in copula.LEMMA.split(" "):
            continue
        if "not" not in negation.LEMMA.split(" "):
            continue

        negation_after_obj = negation.LOC > complement.LOC
        if negation_after_obj or (copula.LOC > negation.LOC):
            continue

        selected.append((copula, complement, negation))

    for copula, complement, negation in selected:
        dep_graph.remove_dependency(complement, negation, 'advmod')
        merged = merge_dep_nodes((copula, negation),
                                 UPOS=copula.UPOS,
                                 LOC=copula.LOC)
        dep_graph.replace_nodes([copula, negation], merged)
def be_not_phrase2(dep_graph: DependencyGraph):
    """
    Merge "be" with a "not" particle attached to one of its objects.

    For every node whose lemma contains the word "be", each child reached by
    a relation starting with 'obj' is inspected (in LOC order). If such an
    object has an "advmod" child that is a PART whose lemma contains "not",
    that edge is removed and "be" and "not" are replaced by a merged node
    with the UPOS and LOC of "be".

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    :raises AssertionError: if an object has more than one such "not" child
    """

    def _is_not_particle(n):
        # that conj is ommited
        return n.UPOS == "PART" and "not" in n.LEMMA.split(" ")

    def _negating_advmod(n, l):
        return l == "advmod" and _is_not_particle(n)

    selected = []
    for pred_node in dep_graph.nodes():
        if "be" not in pred_node.LEMMA.split(" "):
            continue

        objects = [
            child for child, rel in dep_graph.children(pred_node)
            if rel.startswith('obj')
        ]
        if not objects:
            continue

        for obj in sorted(objects, key=lambda n: n.LOC):
            negations = [
                n for n, _ in dep_graph.children(obj,
                                                 filter=_negating_advmod)
            ]
            if not negations:
                continue

            assert len(negations) == 1
            selected.append((pred_node, obj, negations[0]))

    for copula, complement, negation in selected:
        dep_graph.remove_dependency(complement, negation, 'advmod')
        merged = merge_dep_nodes((copula, negation),
                                 UPOS=copula.UPOS,
                                 LOC=copula.LOC)
        dep_graph.replace_nodes([copula, negation], merged)
def advp_phrase(dep_graph: DependencyGraph): """ :param dep_graph: :param oia_graph: :return: case: english-UD-12774 """ # return phrases = [] remove_rels = [] for node in dep_graph.nodes(filter=lambda n: n.UPOS in {"ADP"}): # is_root = True need_merge_node = set() # if str(node.FORM).lower() != 'after': # continue # print('advp node:', str(node.FORM)) for parent, rel in dep_graph.parents(node): if "case" in rel and \ any(node.FORM in l.values() or node.LEMMA in l.values() for x, l in dep_graph.parents(parent)): break remove_rel = False # we find neighborhood adjvs silibings = list(dep_graph.children(parent)) silibings.sort(key=lambda x: x[0].LOC) start_loc = -1 for child, ch_rel in reversed(silibings): # print(str(node.FORM)) if child.LOC >= node.LOC: start_loc = child.LOC continue if "advmod" in ch_rel and child.UPOS in { "ADJ", "ADV" } and child.LOC == start_loc - 1: # is_root = True need_merge_node.update( set(valid_adjv_element(child, dep_graph))) remove_rel = True start_loc = child.LOC # adjv_element = valid_adjv_element(child, dep_graph) if remove_rel: if 'case' in rel: remove_rels.append((parent, node, 'case')) if len(need_merge_node) == 0: continue need_merge_node.add(node) adjv_element = sorted(list(need_merge_node), key=lambda x: x.LOC) phrases.append((adjv_element, node)) for src, trg, rel in remove_rels: dep_graph.remove_dependency(src, trg, rel) for adjv_phrase, node in phrases: advp_node = merge_dep_nodes( adjv_phrase, # UPOS=node.UPOS, UPOS='ADV', LOC=node.LOC) # print("Noun detected", noun_node.ID) dep_graph.replace_nodes(adjv_phrase, advp_node)
def multi_words_case(dep_graph: DependencyGraph):
    """
    Merge multi-word case markers and rewrite the governing relation.

    Matches NOUN -<rel>:<case>-> X -case-> CASE. All offsprings of X's direct
    "case" children are gathered; if they amount to more than one node and
    the relation's case subtype occurs in their "_"-joined lemmas, the nodes
    are merged into one node (UPOS/LOC of the matched CASE node) and the
    NOUN -> X relation is replaced by ``<rel>:<space-joined lemmas>``.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    pattern = DependencyGraph()
    noun_node = DependencyGraphNode()
    x_node = DependencyGraphNode()
    case_node = DependencyGraphNode()
    pattern.add_node(noun_node)
    pattern.add_node(x_node)
    pattern.add_node(case_node)
    pattern.add_dependency(noun_node, x_node, r'\w*:\w*')
    pattern.add_dependency(x_node, case_node, r'\bcase\b')

    # Snapshot the matches: each iteration may rewrite the graph.
    for match in list(dep_graph.match(pattern)):
        multiword_cases = []

        dep_noun_node = match[noun_node]
        dep_x_node = match[x_node]
        dep_case_node = match[case_node]

        if not dep_graph.has_node(dep_case_node):
            # merged away by an earlier match
            continue

        direct_case_nodes = [
            n for n, l in dep_graph.children(dep_x_node,
                                             filter=lambda n, l: "case" == l)
        ]
        all_case_nodes = set()
        for node in direct_case_nodes:
            all_case_nodes.update(dep_graph.offsprings(node))

        if len(all_case_nodes) == 1:
            continue

        all_case_nodes = sorted(all_case_nodes, key=lambda n: n.LOC)

        logger.debug("multi case discovered")
        for node in all_case_nodes:
            logger.debug(str(node))

        joined_lemmas = "_".join([x.LEMMA for x in all_case_nodes])

        x_rel = dep_graph.get_dependency(dep_noun_node, dep_x_node)
        for rel in x_rel:
            if ":" not in rel:
                continue
            # Split on the LAST colon only: a label with several colons would
            # otherwise fail to unpack into exactly two parts.
            rel_str, case_str = rel.rsplit(":", 1)

            # some times, the rel only contains one word
            # Example :
            # that OBSF values within the extended trial balance may be
            # misstated due to data issues ( above and beyond existing
            # conversations with AA on model simplifications)
            if case_str in joined_lemmas:
                multiword_cases.append((dep_noun_node, dep_x_node,
                                        dep_case_node, all_case_nodes,
                                        rel_str))

        for dep_noun_node, dep_x_node, dep_case_node, case_nodes, rel_str in multiword_cases:
            logger.debug("we are merging:")
            for node in case_nodes:
                logger.debug(str(node))

            if not all([dep_graph.has_node(x) for x in case_nodes]):
                continue

            new_case_node = merge_dep_nodes(case_nodes,
                                            UPOS=dep_case_node.UPOS,
                                            LOC=dep_case_node.LOC)
            dep_graph.replace_nodes(case_nodes, new_case_node)
            dep_graph.remove_dependency(dep_noun_node, dep_x_node)
            dep_graph.add_dependency(
                dep_noun_node, dep_x_node,
                rel_str + ":" + " ".join([x.LEMMA for x in case_nodes]))
def amod_xcomp_to_acl(dep_graph: DependencyGraph):
    """
    Turn NOUN -amod-> ADJ -xcomp-> VERB into an "acl" predicate on the noun.

    The adjective, together with the "mark" children of its xcomp that lie
    between the adjective and that xcomp, is merged behind an implicit "(be)"
    into a VERB node (FEATS VerbForm=Ger). The noun is linked to the merged
    node with "acl" and the merged node to the matched verb with "obj".

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    :raises Exception: if more than one xcomp child contributes mark nodes
    """
    pattern = DependencyGraph()
    noun_node = pattern.create_node(UPOS="NOUN")
    adj_node = pattern.create_node(UPOS="ADJ")
    verb_node = pattern.create_node(UPOS="VERB")
    pattern.add_dependency(noun_node, adj_node, r'amod')
    pattern.add_dependency(adj_node, verb_node, r"xcomp")

    # Snapshot the matches: each iteration rewrites the graph.
    for match in list(dep_graph.match(pattern)):
        dep_noun_node = match[noun_node]
        dep_verb_node = match[verb_node]
        dep_adj_node = match[adj_node]

        # Skip matches whose nodes were consumed by a previous match. Other
        # passes in this file treat get_node() as returning a falsy value for
        # a missing node, so check the result as well as a possible exception
        # (its concrete type is not visible here, hence the broad except).
        try:
            still_present = all(
                dep_graph.get_node(x.ID)
                for x in (dep_noun_node, dep_verb_node, dep_adj_node))
        except Exception:
            still_present = False
        if not still_present:
            continue

        xcomp_nodes = [
            n for n, l in dep_graph.children(
                dep_adj_node, filter=lambda n, l: l.startswith("xcomp"))
        ]

        mark_nodes_list = []
        for dep_xcomp_node in xcomp_nodes:
            mark_nodes = [
                n for n, l in dep_graph.children(
                    dep_xcomp_node,
                    filter=lambda n, l: l.startswith("mark") and
                    dep_adj_node.LOC < n.LOC < dep_xcomp_node.LOC)
            ]
            if mark_nodes:
                mark_nodes_list.append(mark_nodes)

        if len(mark_nodes_list) > 1:
            raise Exception("Unexpected Situation Happened")

        new_verb_nodes = [dep_adj_node]
        if mark_nodes_list:
            new_verb_nodes.extend(mark_nodes_list[0])
            new_verb_nodes.sort(key=lambda x: x.LOC)

        # "(be)" goes first; the merged node sits at the LOC of the last
        # real member.
        new_verb_nodes = ["(be)"] + new_verb_nodes

        new_node = merge_dep_nodes(new_verb_nodes,
                                   UPOS="VERB",
                                   LOC=new_verb_nodes[-1].LOC,
                                   FEATS={"VerbForm": "Ger"})

        dep_graph.replace_nodes(new_verb_nodes, new_node)
        dep_graph.set_dependency(dep_noun_node, new_node, "acl")

        for dep_xcomp_node in xcomp_nodes:
            dep_graph.remove_dependency(dep_xcomp_node, new_node)

        dep_graph.set_dependency(new_node, dep_verb_node, "obj")
def acl_verb_obl_case(dep_graph: DependencyGraph):
    """
    Fold the case marker(s) of an oblique into an "acl" verb, e.g.
    "something extracted by".

    Matches SUBJ -acl-> VERB -obl:*-> OBJ -case-> CASE. A match is kept when
    the verb has a single "obl*" child, no child whose relation contains
    "obj" or "comp", at most one child whose relation contains "subj", and
    the CASE form occurs among the values of the VERB -> OBJ relation. The
    verb and all "case*" children of OBJ located between verb and OBJ are
    merged into one verb node that takes OBJ as "obj".

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    pattern = DependencyGraph()
    subj_node = pattern.create_node()
    verb_node = pattern.create_node(UPOS="VERB")
    obj_node = pattern.create_node()
    case_node = pattern.create_node()
    pattern.add_dependency(subj_node, verb_node, r'acl')
    pattern.add_dependency(verb_node, obj_node, r'obl:\w*')
    pattern.add_dependency(obj_node, case_node, r'case')

    planned = []
    for found in dep_graph.match(pattern):
        subject = found[subj_node]
        verb = found[verb_node]
        oblique = found[obj_node]
        case_marker = found[case_node]

        obliques = [
            n for n, _ in dep_graph.children(
                verb, filter=lambda n, l: l.startswith("obl"))
        ]
        if len(obliques) > 1:
            continue

        competing_objects = [
            n for n, _ in dep_graph.children(
                verb, filter=lambda n, l: "obj" in l or "comp" in l)
        ]
        if competing_objects:
            continue

        obl_rel = dep_graph.get_dependency(verb, oblique)
        if case_marker.FORM not in obl_rel.values():
            continue

        # there are may be other cases, join them all
        case_markers = [
            n for n, _ in dep_graph.children(
                oblique,
                filter=lambda n, l: l.startswith("case") and
                verb.LOC < n.LOC < oblique.LOC)
        ]

        subjects = list(
            dep_graph.children(verb, filter=lambda n, l: "subj" in l))
        if len(subjects) > 1:
            continue

        planned.append((subject, verb, oblique, case_markers))

    for subject, verb, oblique, case_markers in planned:
        members = [verb] + case_markers

        logging.debug("acl_verb_obl_case: we are merging nodes")
        logging.debug("\n".join(str(node) for node in members))

        merged = merge_dep_nodes(members,
                                 UPOS=verb.UPOS,
                                 LOC=verb.LOC,
                                 FEATS=verb.FEATS)

        logging.debug("acl_verb_obl_case: we obtain a new node")
        logging.debug(str(merged))

        dep_graph.remove_dependency(verb, oblique)
        for marker in case_markers:
            dep_graph.remove_dependency(oblique, marker)

        dep_graph.replace_nodes(members, merged)
        dep_graph.add_dependency(merged, oblique, "obj")
def adv_ccomp(dep_graph: DependencyGraph, oia_graph: OIAGraph,
              context: UD2OIAContext):
    """
    Emit OIA predicates for adverb-like nodes that govern a ccomp / xcomp.

    Matches ADV|X|NOUN|PART -ccomp|xcomp-> C (PART is for "not") and skips
    pairs already related in the OIA graph. "case" children of C located
    between the adverb and C are reduced with ``continuous_component`` and
    joined with the adverb into the predicate. If the adverb has an "advmod"
    parent tagged ADV/X/NOUN, that parent becomes argument 1 (mod=True);
    otherwise C becomes argument 2.

    :param dep_graph: dependency graph; predicate nodes may be merged in place
    :param oia_graph: OIA graph receiving the new words and arguments
    :param context: conversion context (not used in this function)
    :return: None
    :raises Exception: if the adverb has more than one qualifying parent
    """
    pattern = DependencyGraph()

    # TODO: it seems that in UD labeling, adv is used instead of adj for noun
    adv_node = pattern.create_node(UPOS="ADV|X|NOUN|PART")  # part is for "not"
    ccomp_node = pattern.create_node()
    pattern.add_dependency(adv_node, ccomp_node, r"ccomp|xcomp")

    collected = []
    for found in dep_graph.match(pattern):
        adverb = found[adv_node]
        complement = found[ccomp_node]

        if oia_graph.has_relation(adverb, complement):
            continue

        cases = [
            n for n, _ in dep_graph.children(
                complement,
                filter=lambda n, l: "case" == l and
                adverb.LOC < n.LOC < complement.LOC)
        ]

        if not cases:
            predicate_nodes = [adverb]
        else:
            cases = continuous_component(cases, cases[0])
            predicate_nodes = [adverb] + cases
            predicate_nodes.sort(key=lambda n: n.LOC)

        governors = [
            n for n, _ in dep_graph.parents(
                adverb,
                filter=lambda n, l: "advmod" == l and
                n.UPOS in {"ADV", "X", "NOUN"})
        ]

        if len(governors) > 1:
            raise Exception("Multiple subject")
        governor = governors[0] if len(governors) > 0 else None

        collected.append([governor, predicate_nodes, complement])

    for governor, predicate_nodes, complement in collected:
        if len(predicate_nodes) <= 1:
            pred = predicate_nodes[0]
        else:
            pred = dep_graph.create_node(
                ID=" ".join([x.ID for x in predicate_nodes]),
                FORM=" ".join([x.FORM for x in predicate_nodes]),
                LEMMA=" ".join([x.LEMMA for x in predicate_nodes]),
                UPOS="ADV",
                LOC=predicate_nodes[0].LOC)
            pred.aux = True

            dep_graph.replace_nodes(predicate_nodes, pred)
            dep_graph.remove_dependency(complement, pred)

        oia_pred_node = oia_graph.add_words(pred.position)

        if governor:
            oia_subj_node = oia_graph.add_words(governor.position)
            oia_graph.add_argument(oia_pred_node, oia_subj_node, 1, mod=True)
        else:
            oia_ccomp_node = oia_graph.add_words(complement.position)
            oia_graph.add_argument(oia_pred_node, oia_ccomp_node, 2)
def secondary_predicate(dep_graph: DependencyGraph):
    """
    Detect an xcomp acting as a secondary predicate and add an implicit
    "(be)" node so that it becomes a proper predicate.

    Matches PRED -xcomp-> X (X not a VERB), X -nsubj-> S and PRED -obj-> S.
    Two layouts are handled, by LOC:

    * ``PRED < S < X``: for ADJ/ADV, X is merged with "(be)" into a VERB
      node that inherits X's children; otherwise a separate "(be)" node is
      created just before X and takes X as "obj".
    * ``X < PRED`` (the order may be reversed in questions, e.g. "I can't
      tell you how ominous I found Bush's performance in that interview."):
      a separate "(be)" node is created and takes X as "amod" for ADJ/ADV,
      else as "obj".

    Matches in any other layout are left untouched.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    pattern = DependencyGraph()
    pred_node = pattern.create_node()
    xcomp_node = pattern.create_node(UPOS=r'(?!VERB\b)\b\w+')
    xcomp_subj_node = pattern.create_node()
    pattern.add_dependency(pred_node, xcomp_node, "xcomp")
    pattern.add_dependency(xcomp_node, xcomp_subj_node, "nsubj")
    pattern.add_dependency(pred_node, xcomp_subj_node, "obj")

    for found in list(dep_graph.match(pattern)):
        head = found[pred_node]
        comp = found[xcomp_node]
        subj = found[xcomp_subj_node]

        canonical = head.LOC < subj.LOC < comp.LOC
        fronted = (not canonical) and comp.LOC < head.LOC
        if not (canonical or fronted):
            continue

        # Both layouts start by detaching the original triangle of edges.
        dep_graph.remove_dependency(head, comp)
        dep_graph.remove_dependency(head, subj)
        dep_graph.remove_dependency(comp, subj)

        is_modifier = comp.UPOS in ("ADJ", "ADV")

        if canonical and is_modifier:
            merged_be = merge_dep_nodes(["(be)", comp],
                                        UPOS="VERB",
                                        LOC=comp.LOC)
            dep_graph.add_node(merged_be)
            dep_graph.add_dependency(head, merged_be, "obj")
            dep_graph.add_dependency(merged_be, subj, "nsubj")

            # Hand every child of the old xcomp node over to the merged node.
            for child, label in list(dep_graph.children(comp)):
                dep_graph.remove_dependency(comp, child)
                dep_graph.add_dependency(merged_be, child, label)

            dep_graph.remove_node(comp)
            continue

        # Remaining layouts use a free-standing "(be)" placed just before X.
        implicit_be = dep_graph.create_node(FORM="(be)",
                                            LEMMA="(be)",
                                            UPOS="VERB",
                                            LOC=comp.LOC - 0.5)
        implicit_be.aux = True

        dep_graph.add_dependency(head, implicit_be, "obj")
        dep_graph.add_dependency(implicit_be, subj, "nsubj")

        # in question, for example : how ominous
        comp_label = "amod" if (fronted and is_modifier) else "obj"
        dep_graph.add_dependency(implicit_be, comp, comp_label)
def gradation(dep_graph: DependencyGraph):
    """
    Comparative / periphrastic gradation.

    TODO: do not match with the tech report, and the verb is not considered

    Examples:
        He runs faster than her
        Martin is more intelligent than Donald
        He is a nicer person than Tom
        She is more than a regular cook

    Matches HEAD -advmod|amod-> CMP (ADJ/ADV with Degree=Cmp),
    CMP -(nmod|obl|advcl):than-> OBJ and OBJ -case|mark-> "than". If CMP has
    an amod/advmod child accepted by ``in_interval(n, None, CMP)``, that
    child's offsprings plus "than" are merged; otherwise CMP and "than" are
    merged. The merged ADP node (at the LOC of "than") is re-attached to OBJ
    as "mark" with HEAD -advcl:<FORM>-> OBJ when HEAD is a VERB, else as
    "case" with HEAD -obl:<FORM>-> OBJ.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    pattern = DependencyGraph()
    verb_node = pattern.create_node(UPOS="VERB|NOUN|PRON|PROPN|SYM")
    advj_node = pattern.create_node(UPOS="ADJ|ADV", FEATS={"Degree": "Cmp"})
    than_node = pattern.create_node(FORM="than")
    obj_node = pattern.create_node()
    pattern.add_dependency(verb_node, advj_node, r'advmod|amod')
    pattern.add_dependency(advj_node, obj_node,
                           r'\w*(nmod:than|obl:than|advcl:than)\w*')
    pattern.add_dependency(obj_node, than_node, r'\w*case|mark\w*')

    # Snapshot the matches: each iteration rewrites the graph.
    for match in list(dep_graph.match(pattern)):
        dep_verb_node = match[verb_node]
        dep_advj_node = match[advj_node]
        dep_than_node = match[than_node]
        dep_obj_node = match[obj_node]

        def __valid_mod(n, l):
            return (l == "amod" or l == "advmod") and in_interval(
                n, None, dep_advj_node)

        aux_children = list(
            dep_graph.children(dep_advj_node, filter=__valid_mod))

        if aux_children:
            aux_node = aux_children[0][0]
            # Materialise the offsprings before concatenating: other call
            # sites in this file treat offsprings() as a generic iterable,
            # and `iterable + list` fails for anything that is not a list.
            more_than_nodes = list(
                dep_graph.offsprings(aux_node)) + [dep_than_node]
        else:
            more_than_nodes = (dep_advj_node, dep_than_node)

        dep_more_than_node = merge_dep_nodes(more_than_nodes,
                                             UPOS="ADP",
                                             LOC=dep_than_node.LOC)

        dep_graph.replace_nodes(more_than_nodes, dep_more_than_node)
        dep_graph.remove_dependency(dep_obj_node, dep_more_than_node)
        dep_graph.remove_dependency(dep_more_than_node, dep_obj_node)
        dep_graph.remove_dependency(dep_verb_node, dep_more_than_node)

        if dep_verb_node.UPOS == "VERB":
            dep_graph.set_dependency(dep_verb_node, dep_obj_node,
                                     "advcl:" + dep_more_than_node.FORM)
            dep_graph.set_dependency(dep_obj_node, dep_more_than_node,
                                     "mark")
        else:
            dep_graph.set_dependency(dep_verb_node, dep_obj_node,
                                     "obl:" + dep_more_than_node.FORM)
            dep_graph.set_dependency(dep_obj_node, dep_more_than_node,
                                     "case")
def continuous_asas(dep_graph: DependencyGraph):
    """
    Merge a contiguous "as X as" into one adposition, e.g. "as far as I known".

    Two patterns share HEAD -advmod|amod-> ADV -advmod-> as1 and
    B -mark|case-> as2; they differ in whether B hangs from as1 (pattern 1)
    or from ADV (pattern 2) via advcl:as|obl:as|advmod. Matches must satisfy
    ``as1 < ADV < as2 < B`` by LOC. All nodes located from as1 through ADV,
    plus as2, are merged into an ADP node at ADV's LOC, which is re-attached
    to B as "mark" with HEAD -advcl:<FORM>-> B when HEAD is a VERB, else as
    "case" with HEAD -obl:<FORM>-> B.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    verb_node = DependencyGraphNode(UPOS="VERB|NOUN|PRON|PROPN")
    adv_node = DependencyGraphNode(UPOS="ADV|ADJ")
    as1_node = DependencyGraphNode(LEMMA="as")
    as2_node = DependencyGraphNode(LEMMA="as")
    # ADJ is for as soon as possible
    verb2_node = DependencyGraphNode(UPOS="VERB|ADJ|NOUN|PROPN|PRON")

    def _build_pattern(comparand_governor):
        # The two patterns differ only in which node governs the comparand.
        graph = DependencyGraph()
        graph.add_nodes([verb_node, adv_node, as1_node, as2_node, verb2_node])
        graph.add_dependency(verb_node, adv_node, r'advmod|amod')
        graph.add_dependency(adv_node, as1_node, r'\w*advmod\w*')
        graph.add_dependency(comparand_governor, verb2_node,
                             r'advcl:as|obl:as|advmod')
        graph.add_dependency(verb2_node, as2_node, r'mark|case')
        return graph

    pattern1 = _build_pattern(as1_node)
    pattern2 = _build_pattern(adv_node)

    all_matches = list(dep_graph.match(pattern1)) + list(
        dep_graph.match(pattern2))

    for found in all_matches:
        head = found[verb_node]
        degree = found[adv_node]
        first_as = found[as1_node]
        second_as = found[as2_node]
        comparand = found[verb2_node]

        ordered = (first_as.LOC < degree.LOC < second_as.LOC
                   < comparand.LOC)
        if not ordered:
            continue

        span = [
            n for n in dep_graph.nodes()
            if first_as.LOC <= n.LOC <= degree.LOC
        ]
        span.append(second_as)
        span.sort(key=lambda n: n.LOC)

        merged = merge_dep_nodes(span, UPOS="ADP", LOC=degree.LOC)
        dep_graph.replace_nodes(span, merged)

        dep_graph.remove_dependency(comparand, merged)
        dep_graph.remove_dependency(merged, comparand)
        dep_graph.remove_dependency(head, merged)

        if head.UPOS == "VERB":
            clause_label, marker_label = "advcl:", "mark"
        else:
            clause_label, marker_label = "obl:", "case"

        dep_graph.set_dependency(head, comparand, clause_label + merged.FORM)
        dep_graph.set_dependency(comparand, merged, marker_label)
def process_conjunction(dep_graph: DependencyGraph, root: DependencyGraphNode):
    """Rewrite the conjunction headed by ``root`` around a conjunction node.

    Steps, in order:

    1. Collect the children of ``root`` reached through a relation starting
       with ``"conj"``. A child that itself has such children is processed
       recursively first, and the conjunction node returned by that call
       takes its place among the parallel components.
    2. Sort the parallel components (``root`` included) by ``LOC``.
    3. Create the conjunction node with ``build_conjunction_node`` and look up,
       per parent of ``root``, the ``(prefix, shared_prefix, required_mark)``
       triple returned by ``get_relation_to_conj``.
    4. For each parent of ``root``, either attach the parent once to the
       conjunction node, or attach it to every component separately (see the
       comments in the loop for the exact condition).
    5. Remove the ``conj`` edges from ``root`` and link the conjunction node
       to every component with ``arg_conj:<index>``.

    :param dep_graph: the dependency graph, modified in place
    :param root: the head of the conjunction; must have at least one
        ``conj*`` child (enforced with ``assert``)
    :return: tuple ``(conj_node, parallel_components)``: the conjunction node
        and the LOC-sorted list of components it coordinates
    """
    conj_childs = [
        child for child, rels in dep_graph.children(
            root, filter=lambda n, l: l.startswith("conj"))
    ]

    # NOTE(review): this precondition is an assert, so it is not checked when
    # Python runs with -O.
    assert conj_childs

    parallel_components = [root]

    for child in conj_childs:
        # A child that has its own conj* children heads a nested conjunction.
        is_nest = any(
            grand_rels.startswith("conj")
            for grand_sun, grand_rels in dep_graph.children(child))
        if is_nest:
            logger.debug("nested conj is found ")
            logger.debug(str(child))

            conj_node, parallel_nodes = process_conjunction(dep_graph, child)
            logger.debug("conj_node is created ")
            logger.debug(str(conj_node))

            for node in parallel_nodes:
                logger.debug("Containing nodes ")
                logger.debug(str(node))
                # Snapshot the relations: they are removed inside the loop.
                rels = list(dep_graph.get_dependency(root, node))
                for rel in rels:
                    if rel.startswith("conj"):
                        logger.debug("remove dependency {0}".format(
                            (root.ID, node.ID, rel)))

                        # Re-point root's conj edge from the nested member to
                        # the nested conjunction node.
                        dep_graph.remove_dependency(root, node, rel)
                        dep_graph.add_dependency(root, conj_node, rel)

            # The nested conjunction node stands in for the child.
            child = conj_node

        parallel_components.append(child)

    parallel_components.sort(key=lambda x: x.LOC)

    # NOTE: a disabled (commented-out) branch used to sit here. When every
    # component was a noun it merged the components and their offspring
    # inside the covered LOC span into a single noun node, unless a component
    # had an "acl" child or the span contained an ADP/VERB/SCONJ/AUX node.

    # De-duplicated parents of root, in LOC order.
    root_parents = list(set(parent for parent, rels in dep_graph.parents(root)))
    root_parents.sort(key=lambda x: x.LOC)

    conj_node, with_arg_palceholder = build_conjunction_node(
        dep_graph, root, root_parents, parallel_components)

    relation_to_conj = get_relation_to_conj(dep_graph, root, root_parents,
                                            parallel_components)

    # For every component, remember its children attached with a relation
    # that matches "case", "mark" or "cc".
    # NOTE(review): `index` is unused in this loop.
    case_marks = dict()
    for index, node in enumerate(parallel_components):
        case_marks[node.ID] = [(n, l) for n, l in dep_graph.children(node)
                               if ("case" in l or "mark" in l or "cc" in l)]
    for key, values in case_marks.items():
        for v in values:
            logger.debug("case_marker = {} {} {}".format(
                key, v[0].ID, v[1].rels))

    logger.debug("relation_to_conj = {}".format(relation_to_conj))

    for parent in root_parents:

        prefix, shared_prefix, required_mark = relation_to_conj[parent.ID]
        # Attach the parent to the conjunction node as a whole when the
        # relation is a core argument / clausal complement, when no marks are
        # required, or when all components require the same mark.
        if any(x in prefix for x in {"subj", "obj", "ccomp", "xcomp"}) \
                or not required_mark or len(set(required_mark)) == 1:

            for node in parallel_components:
                dep_graph.remove_dependency(parent, node)

            relation = prefix

            if required_mark and len(set(required_mark)) == 1:
                # All components share one mark: append it to the relation
                # and move the mark node under the conjunction node.
                mark_lemma = list(set(required_mark))[0]

                relation += ":" + mark_lemma

                mark_node = find_mark(case_marks, parallel_components,
                                      mark_lemma)

                if mark_node:
                    # find_mark's truthy result unpacks to (node, relation).
                    mark_node, mark_rel = mark_node

                    # Remove and re-add the node to clear its dependencies.
                    dep_graph.remove_node(mark_node)
                    dep_graph.add_node(mark_node)

                    dep_graph.add_dependency(conj_node, mark_node, mark_rel)
                else:
                    logger.error("cannot find the mark node")

            dep_graph.add_dependency(parent, conj_node, relation)

        else:
            # Components need different marks: the parent is linked to each
            # component separately.
            # NOTE(review): the existing parent -> component edges are not
            # removed in this branch; presumably complete_missing_case_mark
            # deals with them -- confirm.
            complete_missing_case_mark(dep_graph, root, root_parents,
                                       parallel_components, relation_to_conj,
                                       case_marks)

            # NOTE(review): the condition above already sends a falsy
            # required_mark to the first branch, so this looks unreachable
            # unless complete_missing_case_mark empties the list in place.
            if not required_mark:
                required_mark = [None] * len(parallel_components)

            # NOTE(review): `index` is unused in this loop.
            for index, (node, mark) in enumerate(
                    zip(parallel_components, required_mark)):
                if mark:
                    rel = prefix + ":" + mark
                else:
                    rel = prefix

                # (a skip for relations starting with "conj" is disabled here)
                logger.debug("add dependency {0}".format(
                    (parent.ID, node.ID, rel)))

                dep_graph.add_dependency(parent, node, rel)

    for idx, node in enumerate(parallel_components):
        if node != root:
            rels = dep_graph.get_dependency(root, node)
            for rel in rels:
                if rel.startswith("conj"):
                    # Called without a relation label, once per conj*
                    # relation found between root and node.
                    dep_graph.remove_dependency(root, node)

        if with_arg_palceholder:
            index = idx + 1
        else:
            # Without a placeholder the numbering is reversed, e.g. in
            # "a, but b", b should be arg1 and a arg2.
            index = len(parallel_components) - idx

        dep_graph.add_dependency(conj_node, node, "arg_conj:{0}".format(index))

    return conj_node, parallel_components