def det_adjv_phrase(dep_graph: DependencyGraph):
    """
    Merge a determiner + ADJ/ADV span into a single NOUN node.

    When an ADJ/ADV head carries a plain determiner (e.g. "the best",
    "the poor"), the span from the determiner up to the head behaves as
    a noun phrase; its nodes are merged into one NOUN super node.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    phrases = []
    for node in dep_graph.nodes(filter=lambda n: n.UPOS in {"ADJ", "ADV"}):
        # Materialize the parent relation labels once.
        # BUG FIX: chain.from_iterable returns a one-shot iterator; the
        # first any() below used to exhaust it, so the second check always
        # saw an empty iterator and never triggered.
        parent_rels = list(itertools.chain.from_iterable(
            rel for parent, rel in dep_graph.parents(node)))

        # Already part of a recognised adjectival construction.
        if any(rel in valid_adj_form for rel in parent_rels):
            continue
        # Plain modifiers are handled by the noun/verb phrase rules.
        if any(rel in {"amod", "advmod"} for rel in parent_rels):
            continue

        det = [
            n for n, l in dep_graph.children(node,
                                             filter=lambda n, l: l == "det")
        ]
        if not det:
            continue
        # Keep the determiner closest to the head.
        det.sort(key=lambda x: x.LOC)
        det = det[-1]

        # Only plain determiners qualify.
        if det.LEMMA not in {"the", "a", "an", "some", "any", "all"}:
            continue

        root = node
        # Collect descendants located between the determiner and the head.
        np_elements = list(
            dep_graph.offsprings(
                root, filter=lambda n: det.LOC <= n.LOC <= root.LOC))

        np_elements = sorted(np_elements, key=lambda x: x.LOC)

        phrases.append((np_elements, root))

    # Merge after collection so the iteration above never sees a graph
    # that is being mutated.
    for np, root in phrases:
        noun_node = merge_dep_nodes(np, UPOS="NOUN", LOC=root.LOC)
        dep_graph.replace_nodes(np, noun_node)
def multi_word_sconj(dep_graph: DependencyGraph):
    """
    Merge a multi-word subordinating conjunction into one SCONJ node.

    Matches VERB --advcl:X--> VERB --mark--> SCONJ; when the mark's lemma
    appears in the advcl label and the mark has several offsprings
    (a multi-word conjunction such as "as if"), the offsprings are merged
    and the advcl label is rebuilt from the merged lemma.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    pattern = DependencyGraph()
    verb_node = pattern.create_node(UPOS="VERB")
    verb2_node = pattern.create_node(UPOS="VERB")
    mark_node = pattern.create_node(UPOS="SCONJ")
    pattern.add_dependency(verb_node, verb2_node, r'advcl:\w*')
    pattern.add_dependency(verb2_node, mark_node, r'mark')

    mark_phrases = []
    for match in dep_graph.match(pattern):
        dep_verb_node = match[verb_node]
        dep_verb2_node = match[verb2_node]
        dep_mark_node = match[mark_node]

        # Only handle marks whose lemma is already recorded in the
        # advcl label between the two verbs.
        if dep_mark_node.LEMMA not in dep_graph.get_dependency(
                dep_verb_node, dep_verb2_node).values():
            continue

        new_marks = list(dep_graph.offsprings(dep_mark_node))
        if len(new_marks) == 1:
            # single-word mark: nothing to merge
            continue
        new_marks.sort(key=lambda n: n.LOC)
        mark_phrases.append(
            (dep_verb_node, dep_verb2_node, dep_mark_node, new_marks))

    for (dep_verb_node, dep_verb2_node, dep_mark_node,
         new_marks) in mark_phrases:
        # Skip phrases whose nodes were consumed by an earlier merge.
        if not all([dep_graph.get_node(x.ID) for x in new_marks]):
            continue

        # Detach old edges before the merge, then re-add them against the
        # merged node with the refreshed label.
        dep_graph.remove_dependency(dep_verb2_node, dep_mark_node)
        dep_graph.remove_dependency(dep_verb_node, dep_verb2_node)

        new_mark_node = merge_dep_nodes(new_marks,
                                        UPOS=dep_mark_node.UPOS,
                                        LOC=dep_mark_node.LOC
                                        )
        dep_graph.replace_nodes(new_marks, new_mark_node)

        dep_graph.add_dependency(dep_verb_node, dep_verb2_node,
                                 "advcl:" + new_mark_node.LEMMA)
        dep_graph.add_dependency(dep_verb2_node, new_mark_node, "mark")
def multi_words_mark(dep_graph: DependencyGraph): """ arise on to the "on to" should be combined :param dep_graph: :param oia_graph: :return: """ # print('multi_words_mark') mark_phrases = [] for node in dep_graph.nodes(): marks = [] for n, l in dep_graph.children(node, filter=lambda n, l: "mark" in l): marks.extend(dep_graph.offsprings(n)) if not marks: continue # print('multi_words_mark marks:', marks) if len(marks) > 1: if any([x.UPOS in {"NOUN", "NUM", "VERB", "ADJ", "ADV", "PRON"} for x in marks]): continue marks.sort(key=lambda n: n.LOC) mark_phrases.append((node, marks)) for node, marks in mark_phrases: # print('multi_words_mark marks:', marks) if not all([dep_graph.get_node(x.ID) for x in marks]): continue mark_min_loc = marks[0].LOC mark_max_loc = marks[-1].LOC marks = [n for n in dep_graph.nodes() if mark_min_loc <= n.LOC <= mark_max_loc] marks.sort(key=lambda n: n.LOC) if any([x.UPOS in NOUN_UPOS for x in marks]): continue # print('marks:') # for nnnn in marks: # print(nnnn) new_mark_node = merge_dep_nodes(marks, UPOS=marks[0].UPOS, LOC=marks[0].LOC ) for mark in marks: dep_graph.remove_dependency(node, mark) dep_graph.replace_nodes(marks, new_mark_node) dep_graph.add_dependency(node, new_mark_node, "mark")
def multi_words_cc(dep_graph: DependencyGraph):
    """
    Combine multi-word coordinating conjunctions into a single node.

    Analogous to multi_words_mark, but for "cc" relations (e.g.
    "as well as" attached by cc).

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    mark_phrases = []
    for node in dep_graph.nodes():
        marks = []
        for n, l in dep_graph.children(node,
                                       filter=lambda n, l: "cc" == l):
            marks.extend(dep_graph.offsprings(n))
        if not marks:
            continue
        if len(marks) > 1:
            # Content words indicate this is not a pure conjunction.
            if any([x.UPOS in {"NOUN", "NUM", "VERB"} for x in marks]):
                continue
            marks.sort(key=lambda n: n.LOC)
            mark_phrases.append((node, marks))

    for node, marks in mark_phrases:
        mark_min_loc = marks[0].LOC
        mark_max_loc = marks[-1].LOC
        # Re-expand to every node in the LOC range so the merged span is
        # contiguous in the sentence.
        marks = [
            n for n in dep_graph.nodes()
            if mark_min_loc <= n.LOC <= mark_max_loc
        ]
        if any([x.UPOS in {"NOUN", "NUM", "VERB"} for x in marks]):
            continue
        # Skip phrases whose nodes were consumed by an earlier merge.
        if not all([dep_graph.get_node(x.ID) for x in marks]):
            continue
        new_mark_node = merge_dep_nodes(marks,
                                        UPOS=marks[0].UPOS,
                                        LOC=marks[0].LOC
                                        )
        dep_graph.replace_nodes(marks, new_mark_node)
        # NOTE(review): dependencies are removed AFTER replace_nodes here,
        # whereas multi_words_mark removes them before replacing — verify
        # remove_dependency is a safe no-op on already-replaced nodes.
        for mark in marks:
            dep_graph.remove_dependency(node, mark)
        # `node` itself may have been absorbed into the merged span.
        if dep_graph.get_node(node.ID):
            dep_graph.add_dependency(node, new_mark_node, "cc")
def nmod_without_case(dep_graph: DependencyGraph, oia_graph: OIAGraph,
                      context: UD2OIAContext):
    """
    Add an OIA mod relation for every nmod-linked pair that no case
    construction has already connected.

    :param dep_graph: source dependency graph
    :param oia_graph: OIA graph receiving the mod links
    :param context: conversion context (unused here)
    :return: None
    """
    pattern = DependencyGraph()
    head = pattern.create_node()
    mod = pattern.create_node()
    pattern.add_dependency(head, mod, r'\w*nmod\w*')

    for match in dep_graph.match(pattern):
        dep_head = match[head]
        dep_mod = match[mod]

        rels = dep_graph.get_dependency(dep_head, dep_mod)

        # "whose"-style possessive where the head sits under the modifier:
        # leave it for the possessive handling.
        poss_cycle = ("nmod:poss" in rels
                      and dep_head in set(dep_graph.offsprings(dep_mod)))
        if poss_cycle:
            continue

        # Already connected (possibly indirectly) in the OIA graph.
        if oia_graph.has_relation(dep_head, dep_mod, direct_link=False):
            continue

        oia_head = oia_graph.add_words(dep_head.position)
        oia_mod = oia_graph.add_words(dep_mod.position)
        oia_graph.add_mod(oia_mod, oia_head)
def noun_phrase(dep_graph: DependencyGraph):
    """
    Detect noun phrases and merge each into a single node.

    Pipeline: find NP roots by UPOS, collect their valid NP children
    (cut at determiners, trim leading punctuation/ADP/CCONJ), drop NPs
    fully contained in a larger NP, then merge each surviving phrase.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    nouns = []

    # we first find np roots
    for root in dep_graph.nodes(
            filter=lambda x: x.UPOS in {"NOUN", "PROPN", "X", "NUM", "SYM"}):

        logger.debug("checking the node:")
        logger.debug(str(root))

        # np_elements = valid_np_element(root, dep_graph)

        # Relation labels from all parents, with "_" normalised to spaces
        # so multi-word labels compare against lemmas/forms below.
        parent_rels = set(
            itertools.chain.from_iterable(l.values()
                                          for n, l in dep_graph.parents(root)))
        parent_rels = set(rel.replace("_", " ") for rel in parent_rels)

        # Case children whose lemma/form already appears in a parent
        # relation label are excluded from the phrase.
        escaped_case_node = set()
        if parent_rels:
            case_nodes = [
                x for x, l in dep_graph.children(
                    root, filter=lambda n, l: l == "case")
            ]
            for node in case_nodes:
                if node.LEMMA.lower() in parent_rels or node.FORM.lower(
                ) in parent_rels:
                    # lemma is for including
                    escaped_case_node.add(node)

        valid_np_children = [(n, l) for n, l in dep_graph.children(
            root, filter=lambda n, l: is_valid_np_child(dep_graph, root, l, n))
                             ]

        logger.debug("noun_phrase: valid_np_children:")
        for node, l in valid_np_children:
            logger.debug(str(node))

        np_elements = [root]
        for n, l in valid_np_children:
            if n.UPOS == "ADP":
                continue
            # Children after the root only join via a closed set of
            # compounding relations.
            if n.LOC > root.LOC and \
                    not any(l.startswith(x) for x in {"fixed", "compound",
                                                      "nummod", "nmod:tmod",
                                                      "flat", "nmod:npmod",
                                                      "dep"}):
                continue
            if n in escaped_case_node:
                continue
            # Conjunction super nodes keep their own structure.
            if isinstance(n, DependencyGraphSuperNode) and n.is_conj:
                continue

            offsprings = list(dep_graph.offsprings(n))

            # A child subtree containing clausal/argument relations is not
            # a simple NP component.
            valid_np_component = True
            for x in offsprings:
                for parent, rels in dep_graph.parents(x):
                    if any(x in rels
                           for x in {"acl", "obl", "advcl", "subj", "obj"}):
                        valid_np_component = False
                        break
                if not valid_np_component:
                    break

            if valid_np_component:
                np_elements.extend(offsprings)

        logger.debug("noun_phrase: candidate np_elements:")
        for node in np_elements:
            logger.debug(str(node))

        # Pick the determiner (if any) closest before the root and drop
        # everything located before it.
        det = [
            n for n, l in dep_graph.children(root,
                                             filter=lambda n, l: l == "det")
        ]
        det = [x for x in det if x.LOC <= root.LOC]
        det.sort(key=lambda x: x.LOC)
        if det:
            # raise Exception("noun phrase without det ")
            det = det[-1]
            # check the element should be continuous
            np_elements = [x for x in np_elements if det.LOC <= x.LOC]
            logger.debug(
                "noun_phrase: det found, cut the nodes before the det")

        filtered_np_elements = sorted(list(np_elements), key=lambda x: x.LOC)

        # if np_elements[-1].LOC - np_elements[0].LOC != len(np_elements) - 1:
        #     print ("root", root)
        #     for n in np_elements:
        #         print("np element", n.LOC, n)
        #     raise Exception("Bad Business Logic")

        # Strip leading dashes and leading ADP/CCONJ/PUNCT tokens until the
        # phrase starts with a content word.
        changed = True
        while changed:
            changed = False
            if filtered_np_elements and filtered_np_elements[0].LEMMA in {
                    "-", "--"
            }:
                filtered_np_elements.pop(0)
                changed = True
            if filtered_np_elements and filtered_np_elements[0].UPOS in {
                    "ADP", "CCONJ", "PUNCT"
            }:
                filtered_np_elements.pop(0)
                changed = True

        if filtered_np_elements:
            nouns.append((set(filtered_np_elements), root))

    # Phrases entirely contained in a larger phrase are dropped; partial
    # overlaps are a hard error.
    sub_nouns = []
    for idx1, (phrase1, head1) in enumerate(nouns):
        for idx2, (phrase2, head2) in enumerate(nouns):
            if idx1 == idx2:
                continue
            phrasex, phrasey = (
                phrase1, phrase2) if len(phrase1) > len(phrase2) else (phrase2,
                                                                       phrase1)
            common = phrasex.intersection(phrasey)
            if not common:
                continue
            elif len(common) == len(phrasey):
                # node2 is a sub np of node1, delete
                sub_nouns.append(phrasey)
            else:
                print("Phrase 1", [x.ID for x in phrase1])
                print("Phrase 2", [x.ID for x in phrase2])
                # return
                raise Exception("duplicate words found")

    for idx, (phrase, head) in enumerate(nouns):
        if phrase in sub_nouns:
            continue
        phrase = sorted(list(phrase), key=lambda x: x.LOC)

        # Absorb quotes that sit immediately before/after the phrase.
        for node in phrase:
            for child, _ in dep_graph.children(node):
                if child.LOC == phrase[0].LOC - 1 and child.LEMMA in {
                        "\"", "\'"
                }:
                    phrase.insert(0, child)
                if child.LOC == phrase[-1].LOC + 1 and child.LEMMA in {
                        "\"", "\'"
                }:
                    phrase.append(child)

        noun_node = merge_dep_nodes(phrase, UPOS=head.UPOS,
                                    LOC=phrase[-1].LOC)
        # print("Noun detected", noun_node.ID)
        dep_graph.replace_nodes(phrase, noun_node)
def general_question(dep_graph: DependencyGraph, oia_graph: OIAGraph,
                     context: UD2OIAContext):
    """
    Detect yes/no (general) questions and attach a WHETHER node.

    A question is recognised when a be-verb or an auxiliary precedes the
    subject (e.g. "Is he ...", "Does she ..."). Wh-questions are excluded
    up front.

    :param dep_graph: source dependency graph
    :param oia_graph: OIA graph receiving the WHETHER function node
    :param context: conversion context (unused here)
    :return: None
    """
    for verb in dep_graph.nodes(filter=lambda n: n.UPOS == "VERB"):

        # Skip wh-questions: any offspring containing a wh-word.
        if any(
                any(x in n.LEMMA for x in {"what", "how", "why", "when",
                                           "where"})
                for n in dep_graph.offsprings(verb)):
            continue

        parents = [n for n, _ in dep_graph.parents(verb)]

        # if not(len(parents) == 1 and parents[0].ID == "0"):
        #     continue

        # check subj and aux
        subj = None
        aux = None
        for child, rel in dep_graph.children(verb):
            if "subj" in rel:
                subj = child
            if "aux" in rel:
                aux = child

        # A super node may have absorbed the aux/be-verb; look inside it.
        is_be_verb = False
        if not isinstance(verb, DependencyGraphSuperNode):
            is_be_verb = verb.LEMMA == "be"
        else:
            assert isinstance(verb, DependencyGraphSuperNode)
            assert aux is None
            for n in verb.nodes:
                if isinstance(n, DependencyGraphNode):
                    if n.LEMMA == "be":
                        is_be_verb = True
                        # print('verb.nodes:', str(" ".join(str(xx.LEMMA) for xx in verb.nodes)))
                        # print('is_be_verb222:', is_be_verb)
                    if n.UPOS == "AUX":
                        aux = n

        # print('is_be_verb:', is_be_verb)
        if aux is None and not is_be_verb:
            # cannot be a general question
            continue

        # Expletive ("there is ...") takes the subject role.
        expl_child = [n for n, l in dep_graph.children(verb) if l == "expl"]
        if expl_child:
            assert len(expl_child) == 1
            subj = expl_child[0]

        if subj is None:
            logger.warning(
                "subject is none, cannot decide whether it is a question")
            continue

        # print('subj.LOC:', subj.LOC)
        # print('subj.LOC type:', type(subj.LOC))

        oia_verb_node = oia_graph.add_words(verb.position)

        is_there_be_verb = is_be_verb and ("there" in verb.LEMMA.split(' ')
                                           or "here" in verb.LEMMA.split(' '))

        is_question = False
        if is_there_be_verb:
            # "Is there ...?" — question iff "be" precedes "there"/"here".
            assert isinstance(verb, DependencyGraphSuperNode)
            be_node = [n for n in verb.nodes if n.LEMMA == "be"][0]
            there_node = [
                n for n in verb.nodes
                if n.LEMMA == "there" or n.LEMMA == "here"
            ][0]
            # print('there_node:', there_node)
            if be_node.LOC < there_node.LOC:
                is_question = True
        elif (is_be_verb and verb.LOC < subj.LOC):
            is_question = True
        elif (aux is not None and aux.LOC < subj.LOC):
            is_question = True

        if is_question:
            # if aux is not None and aux.LEMMA == "do":
            #     oia_question_node = oia_graph.add_word_with_head(aux.LOC)
            # else:
            oia_question_node = oia_graph.add_aux("WHETHER")

            oia_graph.add_function(oia_question_node, oia_verb_node)
def verb_phrase(dep_graph: DependencyGraph):
    """
    Merge aux/cop/advmod/compound satellites with their head VERB into a
    single verb node.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    verb_phrases = []
    for node in dep_graph.nodes(filter=lambda x: x.UPOS in {"VERB", "AUX"}):

        # An AUX that itself hangs off a verb by "aux" will be absorbed by
        # its head; do not treat it as a phrase root.
        if node.UPOS == "AUX":
            parent = [
                n for n, l in dep_graph.parents(
                    node, filter=lambda n, l: l == "aux")
            ]
            if len(parent) > 0:
                continue

        # if "VerbForm" in node.FEATS and "Ger" in node.FEATS["VerbForm"]:
        #     continue

        if "Tense" in node.FEATS and "Past" in node.FEATS["Tense"]:
            # if the verb is before the noun, it will be processed by
            # noun_phrase and taken as a part of the noun
            parent = [
                n for n, l in dep_graph.parents(
                    node,
                    filter=lambda n, l: l == "amod" and node.LOC < n.LOC)
            ]
            if len(parent) > 0:
                continue

        # logger.debug("We are checking node {0}".format(node))

        root = node
        verbs = [root]
        for n, l in dep_graph.children(root):
            # Mutual dependency: skip to avoid cycles.
            if dep_graph.get_dependency(n, root):
                continue
            if n.LEMMA in {"so", "also", "why"}:
                continue
            if "advmod" in l:
                offsprings = list(dep_graph.offsprings(n))
                # Adverbial subtrees containing content words stay separate.
                if any(x.UPOS in {"VERB", "NOUN", "AUX", "PRON"}
                       for x in offsprings):
                    continue
                verbs.extend(offsprings)
            elif "compound" in l:
                verbs.append(n)

        # Keep only material before the root, plus compound particles.
        verbs = [
            x for x in verbs if x.LOC <= root.LOC
            or "compound" in dep_graph.get_dependency(root, x)
        ]

        # logger.debug("Verb: before continuous component ")
        # logger.debug("\n".join(str(verb) for verb in verbs))

        verbs = continuous_component(verbs, root)

        # add aux
        verbs.extend(n for n, l in dep_graph.children(root) if "aux" in l)

        # logger.debug("Verb: after continuous component ")
        # for verb in verbs:
        #     logger.debug(verb)

        verbs.sort(key=lambda x: x.LOC)
        last_loc = verbs[-1].LOC
        # next_node = dep_graph.get_node_by_loc(last_loc + 1)
        # if next_node and next_node.LEMMA == "not":
        #     verbs.append(next_node)

        if len(verbs) > 1:
            verb_phrases.append((verbs, root))

    # Merge after collection so iteration never sees a mutating graph.
    for verbs, root in verb_phrases:
        verb_node = merge_dep_nodes(verbs,
                                    UPOS="VERB",
                                    LOC=root.LOC,
                                    FEATS=root.FEATS)
        dep_graph.replace_nodes(verbs, verb_node)
def gradation(dep_graph: DependencyGraph):
    """
    Rewrite comparative ("periphrastic gradation") constructions.

    TODO: do not match with the tech report, and the verb is not considered
    ##### Comparative #####
    ##### He runs faster than her #####
    ##### Martin is more intelligent than Donald #####
    ##### He is a nicer person than Tom #####
    She is more than a regular cook

    Merges the comparative word(s) with "than" into a single ADP node and
    re-links the compared object with an advcl/obl (+ mark/case) pair.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    pattern = DependencyGraph()
    verb_node = pattern.create_node(UPOS="VERB|NOUN|PRON|PROPN|SYM")
    advj_node = pattern.create_node(UPOS="ADJ|ADV", FEATS={"Degree": "Cmp"})
    than_node = pattern.create_node(FORM="than")
    obj_node = pattern.create_node()

    pattern.add_dependency(verb_node, advj_node, r'advmod|amod')
    pattern.add_dependency(advj_node, obj_node,
                           r'\w*(nmod:than|obl:than|advcl:than)\w*')
    pattern.add_dependency(obj_node, than_node, r'\w*case|mark\w*')

    for match in list(dep_graph.match(pattern)):
        dep_verb_node = match[verb_node]
        dep_advj_node = match[advj_node]
        dep_than_node = match[than_node]
        dep_obj_node = match[obj_node]

        def __valid_mod(n, l):
            # "more"/"less"-style modifier that appears before the ADJ/ADV
            return (l == "amod" or l == "advmod") and in_interval(
                n, None, dep_advj_node)

        aux_node = list(dep_graph.children(dep_advj_node, filter=__valid_mod))

        if aux_node:
            aux_node = aux_node[0][0]
            # BUG FIX: offsprings(...) is used elsewhere only via list()/set(),
            # so it may be an iterator; materialize before concatenating.
            offsprings = list(dep_graph.offsprings(aux_node))
            more_than_nodes = offsprings + [dep_than_node]
        else:
            more_than_nodes = [dep_advj_node, dep_than_node]

        # Fuse e.g. "more ... than" into one ADP node located at "than".
        dep_more_than_node = merge_dep_nodes(more_than_nodes,
                                             UPOS="ADP",
                                             LOC=dep_than_node.LOC)

        dep_graph.replace_nodes(more_than_nodes, dep_more_than_node)

        # Drop stale edges in both directions before re-linking.
        dep_graph.remove_dependency(dep_obj_node, dep_more_than_node)
        dep_graph.remove_dependency(dep_more_than_node, dep_obj_node)
        dep_graph.remove_dependency(dep_verb_node, dep_more_than_node)

        if dep_verb_node.UPOS == "VERB":
            dep_graph.set_dependency(dep_verb_node, dep_obj_node,
                                     "advcl:" + dep_more_than_node.FORM)
            dep_graph.set_dependency(dep_obj_node, dep_more_than_node, "mark")
        else:
            dep_graph.set_dependency(dep_verb_node, dep_obj_node,
                                     "obl:" + dep_more_than_node.FORM)
            dep_graph.set_dependency(dep_obj_node, dep_more_than_node, "case")
def multi_word_fix_flat(dep_graph: DependencyGraph):
    """
    Merge fixed/flat/compound chains into a single node.

    For every chain head (a node not itself attached by fixed/flat/compound),
    collect the chained children's subtrees, extend the span to the full
    LOC range (including adjacent quote characters), and merge into one
    node carrying the head's UPOS/LOC.

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    fixed_rels = {"fixed", "flat", "compound"}

    phrases = []
    for node in dep_graph.nodes():
        # Only start from chain heads.
        parents = [n for n, l in dep_graph.parents(
            node, filter=lambda n, l: any(x in l for x in fixed_rels))]
        if parents:
            continue

        phrase = []
        for n, l in dep_graph.children(
                node, filter=lambda n, l: any(x in l for x in fixed_rels)):
            phrase.extend(dep_graph.offsprings(n))
        if not phrase:
            continue
        phrase.append(node)
        if len(phrase) > 1:
            phrase.sort(key=lambda n: n.LOC)
            # min_loc = phrase[0].LOC
            # max_loc = phrase[-1].LOC
            # phrase = [n for n in dep_graph.nodes() if min_loc <= n.LOC <= max_loc]
            phrases.append((phrase, node))

    # Longer phrases first, so nested phrases get absorbed by the merge.
    phrases.sort(key=lambda x: len(x[0]), reverse=True)

    for phrase, head in phrases:
        if not all([dep_graph.get_node(x.ID) for x in phrase]):
            continue  # already been processed

        merging_nodes = set()
        min_loc = 10000
        max_loc = -1
        for child in phrase:
            if isinstance(child, DependencyGraphNode):
                min_loc = min(min_loc, child.LOC)
                # BUG FIX: was max(min_loc, child.LOC), which could shrink
                # max_loc when a super node with a larger span came first.
                max_loc = max(max_loc, child.LOC)
            elif isinstance(child, DependencyGraphSuperNode):
                min_loc = min(min_loc, min([x.LOC for x in child.nodes]))
                max_loc = max(max_loc, max([x.LOC for x in child.nodes]))
            merging_nodes.update(dep_graph.offsprings(child))

        merged_nodes = set(
            [n for n in merging_nodes if min_loc <= n.LOC <= max_loc])

        # Absorb quotes immediately surrounding the span.
        for node in merging_nodes:
            if node.LOC == min_loc - 1 and node.LEMMA in {"\"", "\'"}:
                merged_nodes.add(node)
            if node.LOC == max_loc + 1 and node.LEMMA in {"\"", "\'"}:
                merged_nodes.add(node)

        merged_nodes = list(merged_nodes)
        merged_nodes.sort(key=lambda x: x.LOC)

        logger.debug("multi_word_fix_flat: we are merging ")
        logger.debug("\n".join(str(node) for node in merged_nodes))
        logger.debug("with head \n" + str(head))

        new_node = merge_dep_nodes(merged_nodes, UPOS=head.UPOS, LOC=head.LOC)
        dep_graph.replace_nodes(merged_nodes, new_node)
def multi_words_case(dep_graph: DependencyGraph):
    """
    Merge multi-word case markers and refresh the enhanced relation label.

    :TODO add example case

    Matches noun --X:Y--> x --case--> case; when the case child expands to
    several words whose lemmas match the relation suffix, the case words
    are merged into one node and the noun→x relation label is rebuilt as
    "X:<joined lemmas>".

    :param dep_graph: dependency graph, rewritten in place
    :return: None
    """
    pattern = DependencyGraph()
    noun_node = DependencyGraphNode()
    x_node = DependencyGraphNode()
    case_node = DependencyGraphNode()
    pattern.add_node(noun_node)
    pattern.add_node(x_node)
    pattern.add_node(case_node)
    pattern.add_dependency(noun_node, x_node, r'\w*:\w*')
    pattern.add_dependency(x_node, case_node, r'\bcase\b')

    for match in list(dep_graph.match(pattern)):
        multiword_cases = []
        dep_noun_node = match[noun_node]
        dep_x_node = match[x_node]
        dep_case_node = match[case_node]

        # The case node may already have been merged away by a previous
        # iteration.
        if not dep_graph.has_node(dep_case_node):
            continue

        direct_case_nodes = [n for n, l in dep_graph.children(
            dep_x_node, filter=lambda n, l: "case" == l)]

        all_case_nodes = set()
        for node in direct_case_nodes:
            all_case_nodes.update(dep_graph.offsprings(node))

        if len(all_case_nodes) == 1:
            # single-word case: nothing to merge
            continue

        all_case_nodes = sorted(list(all_case_nodes), key=lambda n: n.LOC)

        logger.debug("multi case discovered")
        for node in all_case_nodes:
            logger.debug(str(node))

        # if len(case_nodes) > 2:
        #     raise Exception("multi_words_case: Unexpected Situation: nodes with more than two cases")

        x_rel = dep_graph.get_dependency(dep_noun_node, dep_x_node)
        for rel in x_rel:
            if ":" in rel:
                # ROBUSTNESS FIX: split only on the first colon so labels
                # containing several colons do not raise ValueError.
                rel_str, case_str = rel.split(":", 1)
                # some times, the rel only contains one word
                # Example :
                # that OBSF values within the extended trial balance may be
                # misstated due to data issues ( above and beyond existing
                # conversations with AA on model simplifications)
                if case_str in "_".join([x.LEMMA for x in all_case_nodes]):
                    multiword_cases.append((dep_noun_node, dep_x_node,
                                            dep_case_node, all_case_nodes,
                                            rel_str))

        for dep_noun_node, dep_x_node, dep_case_node, case_nodes, rel_str in multiword_cases:

            logger.debug("we are merging:")
            for node in case_nodes:
                logger.debug(str(node))

            # Skip if an earlier merge consumed any of the case nodes.
            if not all([dep_graph.has_node(x) for x in case_nodes]):
                continue

            new_case_node = merge_dep_nodes(case_nodes,
                                            UPOS=dep_case_node.UPOS,
                                            LOC=dep_case_node.LOC
                                            )
            dep_graph.replace_nodes(case_nodes, new_case_node)

            # Rebuild the enhanced relation with the merged case lemmas.
            dep_graph.remove_dependency(dep_noun_node, dep_x_node)
            dep_graph.add_dependency(
                dep_noun_node, dep_x_node,
                rel_str + ":" + " ".join([x.LEMMA for x in case_nodes]))