def ever_since(dep_graph: DependencyGraph):
    """Collapse adjacent "ever since" token pairs into one node.

    For every "ever" token immediately followed (by LOC) by a "since"
    token, any advmod dependency pointing at the "ever" token is removed,
    and the pair is replaced by a single merged node carrying the "since"
    token's UPOS and LOC.

    :param dep_graph: the dependency graph, rewritten in place
    """
    evers = [n for n in dep_graph.nodes() if n.LEMMA == "ever"]
    sinces = [n for n in dep_graph.nodes() if n.LEMMA == "since"]
    if not (evers and sinces):
        return

    since_by_loc = {n.LOC: n for n in sinces}

    pairs = []
    advmods_to_drop = []
    for ever in evers:
        follower = since_by_loc.get(ever.LOC + 1)
        if follower is None:
            continue
        pairs.append((ever, follower))
        advmods_to_drop.extend(
            (parent, ever)
            for parent, rel in dep_graph.parents(ever)
            if 'advmod' in rel)

    # remove edges first, then merge, so the merge sees a clean graph
    for parent, ever in advmods_to_drop:
        dep_graph.remove_dependency(parent, ever, 'advmod')

    for ever, since in pairs:
        merged = merge_dep_nodes([ever, since],
                                 UPOS=since.UPOS,
                                 LOC=since.LOC)
        dep_graph.replace_nodes([ever, since], merged)
def conjunction(dep_graph: DependencyGraph):
    """Locate coordination roots and rewrite each conjunction structure.

    A conjunction root is a node that has at least one conj child but no
    conj parent (e.g. "apples" in "I like apples, bananas and oranges",
    conj:and/or with punct). Each root is handed to process_conjunction;
    a sentence-initial conjunction word is handled by process_head_conj.

    TODO: currently cannot process nested conjunction. should process
    from bottom to up.

    :param dep_graph: the dependency graph, rewritten in place
    """

    conj_roots = []

    for node in dep_graph.nodes():

        has_conj_parent = False
        for _, rels in dep_graph.parents(node):
            if rels.startswith("conj"):
                has_conj_parent = True
                break
        if has_conj_parent:
            continue

        for _, rels in dep_graph.children(node):
            if rels.startswith("conj"):
                conj_roots.append(node)
                break

    for root in conj_roots:
        logger.debug("found the root of conjunction")
        logger.debug(str(root))

        process_conjunction(dep_graph, root)

    process_head_conj(dep_graph)
def det_adjv_phrase(dep_graph: DependencyGraph):
    """Merge a determiner-led ADJ/ADV phrase (e.g. "the best") into a NOUN.

    For each ADJ/ADV node that is not attached to its head through a
    valid adjective relation or amod/advmod, and whose last determiner
    child is one of {the, a, an, some, any, all}, the span from the
    determiner to the head is collapsed into a single NOUN super-node.

    :param dep_graph: the dependency graph, rewritten in place
    """
    phrases = []

    for node in dep_graph.nodes(filter=lambda n: n.UPOS in {"ADJ", "ADV"}):

        # Bug fix: this used to be a bare generator, which the first any()
        # below exhausted — the amod/advmod check then always saw an empty
        # sequence. Materialize the flattened relation labels once.
        parent_rels = list(
            itertools.chain.from_iterable(
                rel for parent, rel in dep_graph.parents(node)))

        if any(rel in valid_adj_form for rel in parent_rels):
            continue

        if any(rel in {"amod", "advmod"} for rel in parent_rels):
            continue

        det = [
            n for n, l in dep_graph.children(node,
                                             filter=lambda n, l: l == "det")
        ]

        if not det:
            continue

        # keep the determiner closest to the head
        det.sort(key=lambda x: x.LOC)
        det = det[-1]

        if det.LEMMA not in {"the", "a", "an", "some", "any", "all"}:
            continue

        root = node
        np_elements = list(
            dep_graph.offsprings(
                root, filter=lambda n: det.LOC <= n.LOC <= root.LOC))

        # the merged span is expected to be continuous in LOC order
        np_elements = sorted(np_elements, key=lambda x: x.LOC)

        phrases.append((np_elements, root))

    for np, root in phrases:
        noun_node = merge_dep_nodes(np, UPOS="NOUN", LOC=root.LOC)
        dep_graph.replace_nodes(np, noun_node)
def process_head_conj(dep_graph: DependencyGraph):
    """Promote a sentence-initial "and"/"but" to head its cc parents.

    If the very first token (LOC 0) is "and" or "but", every cc edge
    pointing at it is removed and replaced by an arg_conj:1 edge from the
    token back to its former parent.

    :param dep_graph: the dependency graph, rewritten in place
    """
    head = dep_graph.get_node_by_loc(0)
    if not head or head.LEMMA not in {"and", "but"}:
        return

    # collect first, then mutate, so iteration is not disturbed
    cc_parents = [parent for parent, label in dep_graph.parents(head)
                  if label == "cc"]
    for parent in cc_parents:
        dep_graph.remove_dependency(parent, head)
        dep_graph.add_dependency(head, parent, "arg_conj:1")
# Esempio n. 5
# 0
def adjv_phrase(dep_graph: DependencyGraph):
    """Merge runs of adjacent ADJ/ADV tokens into a single phrase node.

    For each ADJ/ADV node judged to head an adjective/adverb phrase, the
    ADJ/ADV elements immediately preceding it (contiguous by LOC) are
    merged with it into one super-node keeping the head's UPOS and LOC.

    :param dep_graph: the dependency graph, rewritten in place
    :return: None
    """
    phrases = []

    for node in dep_graph.nodes(filter=lambda n: n.UPOS in {"ADJ", "ADV"}):

        # Decide whether this node heads a phrase: an advmod edge from a
        # non-ADJ/ADV parent keeps it a head, while a valid adjective
        # relation to a parent demotes it to a dependent.
        # NOTE(review): the first branch re-assigns is_root = True, which
        # is already its value — possibly False was intended; confirm.
        is_root = True
        for parent, rel in dep_graph.parents(node):
            if "advmod" in rel and parent.UPOS not in {"ADJ", "ADV"}:
                is_root = True
                break
            # rel appears to be a set-like project type; 'intersect' is
            # not a builtin set method — presumably project API. Verify.
            elif rel.intersect(valid_adj_form):
                is_root = False

        if not is_root:
            continue

        # candidate modifiers of this head, sorted left-to-right
        adjv_element = valid_adjv_element(node, dep_graph)

        adjv_element = sorted(list(adjv_element), key=lambda x: x.LOC)

        # walk right-to-left collecting a contiguous ADJ/ADV run that
        # ends exactly at the head's position
        connected_components = [node]
        start_loc = node.LOC
        for child in reversed(adjv_element):
            # print(str(node.FORM))

            if child.UPOS in {"ADJ", "ADV"} and child.LOC == start_loc - 1:

                connected_components.append(child)
                start_loc = child.LOC

        connected_components.sort(key=lambda x: x.LOC)

        # only merge genuine multi-token phrases
        if len(connected_components) > 1:
            phrases.append((connected_components, node))

    for adjv_phrase, node in phrases:
        adjv_node = merge_dep_nodes(adjv_phrase, UPOS=node.UPOS, LOC=node.LOC)
        # print("Noun detected", noun_node.ID)
        dep_graph.replace_nodes(adjv_phrase, adjv_node)
# Esempio n. 6
# 0
def xcomp_verb(dep_graph: DependencyGraph):
    """Merge a "to" mark with its xcomp verb into a single VERB node.

    Matches pred --xcomp--> verb(VERB|AUX) --mark--> PART; when the PART
    is the word "to" occurring before the verb, the ("to", verb) pair is
    replaced by one merged VERB node at the verb's LOC.

    :param dep_graph: the dependency graph, rewritten in place
    """

    pattern = DependencyGraph()

    head = pattern.create_node()
    verb = pattern.create_node(UPOS="VERB|AUX")
    mark = pattern.create_node(UPOS="PART")

    pattern.add_dependency(head, verb, "xcomp")
    pattern.add_dependency(verb, mark, "mark")

    for match in list(dep_graph.match(pattern)):

        dep_head = match[head]  # matched but not needed for the merge
        dep_verb = match[verb]
        dep_mark = match[mark]

        # only the infinitive marker "to" is merged; other particles are
        # left untouched
        if dep_mark.LEMMA != "to":
            continue

        if dep_mark.LOC > dep_verb.LOC:
            raise Exception(
                "Unexpected Situation: xcomp mark after the xcomp verb")

        xcomp_parents = list(
            dep_graph.parents(dep_verb,
                              filter=lambda n, l: "xcomp" in l))

        if len(xcomp_parents) > 1:
            raise Exception(
                "Unexpected Situation: Multiple xcomp parents found")

        phrase = [dep_mark, dep_verb]
        merged = merge_dep_nodes(phrase, UPOS="VERB", LOC=dep_verb.LOC)
        dep_graph.replace_nodes(phrase, merged)
def det_of_noun(dep_graph: DependencyGraph):
    """Merge "any/some/all of <noun>" into a single node.

    any/some/all of noun
    Matches det --nmod:of--> noun where the noun carries an "of" case
    child; when the noun's only parent is the determiner, the three
    tokens are collapsed into one node keeping the determiner's
    UPOS/FEATS/LOC.

    :param dep_graph: the dependency graph, rewritten in place
    """
    pattern = DependencyGraph()
    det = pattern.create_node(UPOS="DET")
    of = pattern.create_node(LEMMA="of")
    noun = pattern.create_node(UPOS="NOUN|PROPN|PRON|X|NUM|SYM")

    pattern.add_dependency(det, noun, "nmod:of")
    pattern.add_dependency(noun, of, "case")

    for match in list(dep_graph.match(pattern)):

        dep_det = match[det]
        dep_noun = match[noun]
        dep_of = match[of]

        # any of the three may have been consumed by an earlier rule
        if not (dep_det and dep_noun and dep_of):
            continue

        # conjunction super-nodes are handled by the conjunction rules
        if isinstance(dep_noun, DependencyGraphSuperNode) and dep_noun.is_conj:
            continue

        noun_parents = [p for p, rel in dep_graph.parents(dep_noun)]
        if len(noun_parents) != 1:
            continue

        assert noun_parents[0] == dep_det

        pieces = [dep_det, dep_of, dep_noun]

        merged = merge_dep_nodes(pieces,
                                 UPOS=dep_det.UPOS,
                                 FEATS=dep_det.FEATS,
                                 LOC=dep_det.LOC)

        dep_graph.replace_nodes(pieces, merged)
# Esempio n. 8
# 0
def to_verb(dep_graph: DependencyGraph):
    """Merge an immediately preceding "to" mark into its VERB head.

    A verb qualifies when no parent relation already contains "to" and it
    has a "mark" child "to" sitting at LOC - 1 that is not a conjunction
    super-node; each such pair is merged keeping the verb's UPOS/LOC.

    :param dep_graph: the dependency graph, rewritten in place
    """
    pairs = []
    for verb in dep_graph.nodes(filter=lambda x: x.UPOS in {"VERB"}):
        # skip verbs already reached through a "to"-bearing relation
        has_to_parent = False
        for parent, rels in dep_graph.parents(verb):
            if "to" in rels.values():
                has_to_parent = True
                break
        if has_to_parent:
            continue

        for child, rels in dep_graph.children(verb):
            is_plain_to_mark = (
                "mark" in rels
                and child.LEMMA == "to"
                and child.LOC == verb.LOC - 1
                and not (isinstance(child, DependencyGraphSuperNode)
                         and child.is_conj))
            if is_plain_to_mark:
                pairs.append((child, verb))

    for to_node, verb in pairs:
        merged = merge_dep_nodes([to_node, verb],
                                 UPOS=verb.UPOS,
                                 LOC=verb.LOC)
        dep_graph.replace_nodes([to_node, verb], merged)
# Esempio n. 9
# 0
def fallback_sconj(dep_graph: DependencyGraph, oia_graph: OIAGraph,
                   context: UD2OIAContext):
    """Attach leftover subordinating conjunctions to the OIA graph.

    For each SCONJ word not yet present in the OIA graph whose lemma is a
    known connective, find its single "mark" parent and, unless that edge
    was already processed, add both words to the OIA graph with the
    parent as argument 1 of the conjunction word.

    :param dep_graph: the source dependency graph
    :param oia_graph: the OIA graph being built
    :param context: tracks dependency edges already consumed by other rules
    """

    for node in dep_graph.nodes():

        if oia_graph.has_word(node.position):
            continue

        if node.UPOS != "SCONJ" or node.LEMMA not in {
                "because", "so", "if", "then", "otherwise", "after", "before",
                "and", "or", "but"
        }:
            continue

        parents = [n for n, l in dep_graph.parents(node) if "mark" in l]

        if not parents:
            continue

        assert len(parents) == 1

        parent = parents[0]

        logger.debug("context = " + str(context.processed_edges))

        if context.is_processed(parent, node):
            continue

        oiar_node = oia_graph.add_words(parent.position)
        oia_sconj_node = oia_graph.add_words(node.position)

        # Bug fix: the original if/else on node.LEMMA in {"because", "if"}
        # had two byte-identical branches, so the conditional was dead
        # code; collapsed while preserving behavior.
        # TODO(review): the else branch was possibly meant to use argument
        # index 2 — confirm against the OIA spec before changing.
        oia_graph.add_argument(oia_sconj_node, oiar_node, 1)
# Esempio n. 10
# 0
def noun_of_noun(dep_graph: DependencyGraph):
    """Merge "noun1 of noun2" constructions into a single noun node.

    Matches noun1 --nmod:of--> noun2 with noun2 carrying an "of" case
    child; when neither noun is part of a conjunction or acl structure
    and noun2's only parent is noun1, the three tokens are collapsed
    into one node keeping noun1's UPOS/FEATS/LOC. Earlier merges are
    tracked so later matches see the replacement nodes.

    :param dep_graph: the dependency graph, rewritten in place
    :return: None
    """
    pattern = DependencyGraph()
    noun1_node = pattern.create_node(UPOS="NOUN|PROPN|PRON|X|NUM|SYM")
    of_node = pattern.create_node(LEMMA="of")
    noun2_node = pattern.create_node(UPOS="NOUN|PROPN|PRON|X|NUM|SYM")

    pattern.add_dependency(noun1_node, noun2_node, "nmod:of")
    pattern.add_dependency(noun2_node, of_node, "case")

    # maps an original node to the super-node that replaced it, so later
    # matches against stale nodes are redirected to the live merged node
    merged_map = dict()

    #    need_merge = []
    for match in list(dep_graph.match(pattern)):

        dep_noun1_node = match[noun1_node]
        if dep_noun1_node in merged_map:
            dep_noun1_node = merged_map[dep_noun1_node]

        dep_noun2_node = match[noun2_node]
        if dep_noun2_node in merged_map:
            dep_noun2_node = merged_map[dep_noun2_node]

        dep_of_node = match[of_node]

        if not all([dep_noun1_node, dep_noun2_node, dep_of_node]):
            # processed by others
            continue

        # skip noun2 nodes entangled in conjunction or relative-clause
        # (acl) structure — merging those would break other rules
        involved_in_complex_structure = False
        for child, rel in dep_graph.children(dep_noun2_node):
            if "conj" in rel or "acl" in rel:
                involved_in_complex_structure = True

        for parent, rel in dep_graph.parents(dep_noun2_node):
            if "conj" in rel or "acl" in rel:
                involved_in_complex_structure = True

        if involved_in_complex_structure:
            continue

        # conjunction super-nodes are handled by the conjunction rules
        if isinstance(dep_noun1_node,
                      DependencyGraphSuperNode) and dep_noun1_node.is_conj:
            continue

        if isinstance(dep_noun2_node,
                      DependencyGraphSuperNode) and dep_noun2_node.is_conj:
            continue

        dep_noun2_parents = [
            parent for parent, rel in dep_graph.parents(dep_noun2_node)
        ]
        # only merge when noun2 hangs exclusively off noun1
        if len(dep_noun2_parents) == 1:
            if dep_noun2_parents[0] != dep_noun1_node:
                logger.error("dep_noun1 {0} {1}".format(
                    dep_noun1_node.ID, dep_noun1_node.FORM))
                logger.error("dep_noun2 {0} {1}".format(
                    dep_noun2_node.ID, dep_noun2_node.FORM))
                logger.error("dep_noun2_parent {0} {1}".format(
                    dep_noun2_parents[0].ID, dep_noun2_parents[0].FORM))
                raise Exception("Noun of Noun failed")

            new_noun_nodes = [dep_noun1_node, dep_of_node, dep_noun2_node]
            # <<<<<<< HEAD

            new_noun = merge_dep_nodes(new_noun_nodes,
                                       UPOS=dep_noun1_node.UPOS,
                                       FEATS=dep_noun1_node.FEATS,
                                       LOC=dep_noun1_node.LOC)

            dep_graph.replace_nodes(new_noun_nodes, new_noun)
            # remember the replacement so later matches are redirected
            for node in new_noun_nodes:
                merged_map[node] = new_noun

            logger.debug("node merged :" + " ".join(
                [dep_noun1_node.ID, dep_of_node.ID, dep_noun2_node.ID]))
def adv_ccomp(dep_graph: DependencyGraph, oia_graph: OIAGraph,
              context: UD2OIAContext):
    """Turn adv --ccomp/xcomp--> clause patterns into OIA predicates.

    The adverb (optionally merged with case tokens sitting between it and
    the clause) becomes an OIA predicate; an advmod subject becomes its
    argument 1 (as modifier), otherwise the clause becomes argument 2.

    :param dep_graph: the source dependency graph, possibly rewritten
    :param oia_graph: the OIA graph being built
    :param context: pipeline context (unused here)
    :return: None
    """

    pattern = DependencyGraph()

    # TODO: it seems that in UD labeling, adv is used instead of adj for noun
    # verb_node = pattern.create_node(UPOS="VERB|NOUN|PROPN")
    adv_node = pattern.create_node(UPOS="ADV|X|NOUN|PART")  # part is for "not"
    ccomp_node = pattern.create_node()

    # pattern.add_dependency(verb_node, adv_node, r'advmod')
    pattern.add_dependency(adv_node, ccomp_node, r"ccomp|xcomp")

    patterns = []
    for match in dep_graph.match(pattern):

        # dep_verb_node = match[verb_node]
        dep_adv_node = match[adv_node]
        dep_ccomp_node = match[ccomp_node]

        # already connected in the OIA graph by an earlier rule
        if oia_graph.has_relation(dep_adv_node, dep_ccomp_node):
            continue

        # case tokens lying strictly between the adverb and the clause
        # head are folded into the predicate (e.g. "as for")
        dep_case_nodes = [
            n for n, l in
            dep_graph.children(dep_ccomp_node,
                               filter=lambda n, l: "case" == l and dep_adv_node
                               .LOC < n.LOC < dep_ccomp_node.LOC)
        ]

        if dep_case_nodes:
            # keep only the case tokens contiguous with the first one
            dep_case_nodes = continuous_component(dep_case_nodes,
                                                  dep_case_nodes[0])
            predicate_nodes = [dep_adv_node] + dep_case_nodes
            predicate_nodes.sort(key=lambda n: n.LOC)
        else:
            predicate_nodes = [dep_adv_node]

        # an advmod parent of the adverb acts as the predicate's subject
        dep_subj_nodes = [
            n for n, l in dep_graph.parents(dep_adv_node,
                                            filter=lambda n, l: "advmod" == l
                                            and n.UPOS in {"ADV", "X", "NOUN"})
        ]
        if len(dep_subj_nodes) > 1:
            raise Exception("Multiple subject")
        elif len(dep_subj_nodes) > 0:
            dep_subj_node = dep_subj_nodes[0]
        else:
            dep_subj_node = None

        patterns.append([dep_subj_node, predicate_nodes, dep_ccomp_node])

    # mutate the graphs only after all matches are collected
    for dep_subj_node, predicate_nodes, dep_ccomp_node in patterns:

        if len(predicate_nodes) > 1:

            # build a synthetic node spanning adverb + case tokens
            new_pred_node = dep_graph.create_node(
                ID=" ".join([x.ID for x in predicate_nodes]),
                FORM=" ".join([x.FORM for x in predicate_nodes]),
                LEMMA=" ".join([x.LEMMA for x in predicate_nodes]),
                UPOS="ADV",
                LOC=predicate_nodes[0].LOC)

            new_pred_node.aux = True

            dep_graph.replace_nodes(predicate_nodes, new_pred_node)

            dep_graph.remove_dependency(dep_ccomp_node, new_pred_node)

        else:
            new_pred_node = predicate_nodes[0]

        oia_pred_node = oia_graph.add_words(new_pred_node.position)

        if dep_subj_node:
            oia_subj_node = oia_graph.add_words(dep_subj_node.position)
            oia_graph.add_argument(oia_pred_node, oia_subj_node, 1, mod=True)

        else:
            oia_ccomp_node = oia_graph.add_words(dep_ccomp_node.position)
            oia_graph.add_argument(oia_pred_node, oia_ccomp_node, 2)
def general_question(dep_graph: DependencyGraph, oia_graph: OIAGraph,
                     context: UD2OIAContext):
    """Detect yes/no (general) questions and mark them in the OIA graph.

    A verb clause with no wh-word offspring is a question when its
    auxiliary or be-verb precedes the subject (including there/here-be
    inversion); such clauses get a WHETHER function node in the OIA graph.

    :param dep_graph: the source dependency graph
    :param oia_graph: the OIA graph being built
    :param context: pipeline context (unused here)
    :return: None
    """

    for verb in dep_graph.nodes(filter=lambda n: n.UPOS == "VERB"):

        # a wh-word anywhere under the verb means this is not a yes/no
        # question
        if any(
                any(x in n.LEMMA
                    for x in {"what", "how", "why", "when", "where"})
                for n in dep_graph.offsprings(verb)):
            continue

        parents = [n for n, _ in dep_graph.parents(verb)]

        # if not(len(parents) == 1 and parents[0].ID == "0"):
        #    continue
        # check subj and aux

        subj = None
        aux = None
        for child, rel in dep_graph.children(verb):
            if "subj" in rel:
                subj = child
            if "aux" in rel:
                aux = child

        is_be_verb = False

        if not isinstance(verb, DependencyGraphSuperNode):
            is_be_verb = verb.LEMMA == "be"
        else:
            # for merged verb phrases, look inside for "be" and an AUX
            assert isinstance(verb, DependencyGraphSuperNode)
            assert aux is None
            for n in verb.nodes:
                if isinstance(n, DependencyGraphNode):
                    if n.LEMMA == "be":
                        is_be_verb = True
                        # print('verb.nodes:', str(" ".join(str(xx.LEMMA) for xx in verb.nodes)))
                        # print('is_be_verb222:', is_be_verb)
                    if n.UPOS == "AUX":
                        aux = n
        # print('is_be_verb:', is_be_verb)
        if aux is None and not is_be_verb:
            # cannot be a general question
            continue

        # an expletive ("there is...") stands in for the subject
        expl_child = [n for n, l in dep_graph.children(verb) if l == "expl"]
        if expl_child:
            assert len(expl_child) == 1
            subj = expl_child[0]

        if subj is None:
            logger.warning(
                "subject is none, cannot decide whether it is a question")
            continue
        #        print('subj.LOC:', subj.LOC)
        #        print('subj.LOC type:', type(subj.LOC))
        oia_verb_node = oia_graph.add_words(verb.position)

        is_there_be_verb = is_be_verb and ("there" in verb.LEMMA.split(' ')
                                           or "here" in verb.LEMMA.split(' '))

        is_question = False

        if is_there_be_verb:

            # inversion test inside a merged "there be" node: question if
            # "be" comes before "there"/"here"
            assert isinstance(verb, DependencyGraphSuperNode)
            be_node = [n for n in verb.nodes if n.LEMMA == "be"][0]
            there_node = [
                n for n in verb.nodes
                if n.LEMMA == "there" or n.LEMMA == "here"
            ][0]
            # print('there_node:', there_node)
            if be_node.LOC < there_node.LOC:
                is_question = True

        elif (is_be_verb and verb.LOC < subj.LOC):

            is_question = True

        elif (aux is not None and aux.LOC < subj.LOC):

            is_question = True

        if is_question:
            # if aux is not None and aux.LEMMA == "do":
            #    oia_question_node = oia_graph.add_word_with_head(aux.LOC)
            # else:

            oia_question_node = oia_graph.add_aux("WHETHER")

            oia_graph.add_function(oia_question_node, oia_verb_node)
def process_conjunction(dep_graph: DependencyGraph, root: DependencyGraphNode):
    """Rewrite one coordination structure rooted at *root*.

    Collects the conj children of *root* (recursing into nested
    conjunctions first), builds a conjunction node over the parallel
    components, re-attaches the root's parents to either the conjunction
    node or the individual components depending on the relation and the
    components' case/mark markers, and finally links each component to
    the conjunction node with an arg_conj:N edge.

    :param dep_graph: the dependency graph, rewritten in place
    :param root: the conjunction root (has conj children, no conj parent)
    :return: (conj_node, parallel_components)
    """
    conj_childs = [
        child for child, rels in dep_graph.children(
            root, filter=lambda n, l: l.startswith("conj"))
    ]

    assert conj_childs

    # the root itself is the first parallel component
    parallel_components = [root]

    for child in conj_childs:

        # a conj child with its own conj children is a nested conjunction:
        # process it first and substitute its conjunction node
        is_nest = any(
            grand_rels.startswith("conj")
            for grand_sun, grand_rels in dep_graph.children(child))
        if is_nest:
            logger.debug("nested conj is found ")
            logger.debug(str(child))

            conj_node, parallel_nodes = process_conjunction(dep_graph, child)
            logger.debug("conj_node is created ")
            logger.debug(str(conj_node))

            # redirect root's conj edges from the inner components to the
            # new inner conjunction node
            for node in parallel_nodes:
                logger.debug("Containing nodes  ")
                logger.debug(str(node))
                rels = list(dep_graph.get_dependency(root, node))
                for rel in rels:
                    if rel.startswith("conj"):
                        logger.debug("remove dependency {0}".format(
                            (root.ID, node.ID, rel)))

                        dep_graph.remove_dependency(root, node, rel)
                        dep_graph.add_dependency(root, conj_node, rel)
            child = conj_node

        parallel_components.append(child)

    parallel_components.sort(key=lambda x: x.LOC)

    # if all(n.UPOS in NOUN_UPOS for n in parallel_components):
    #
    #     logger.debug("Processing all noun conjunction")
    #
    #     is_pure_noun = True
    #
    #     merging_noun_nodes = []
    #     min_loc = 10000
    #     max_loc = -1
    #     for child in parallel_components:
    #         if isinstance(child, DependencyGraphNode):
    #             min_loc = min(min_loc, child.LOC)
    #             max_loc = max(min_loc, child.LOC)
    #         elif isinstance(child, DependencyGraphSuperNode):
    #             min_loc = min(min_loc, min([x.LOC for x in child.nodes]))
    #             max_loc = max(max_loc, max([x.LOC for x in child.nodes]))
    #         merging_noun_nodes.extend(dep_graph.offsprings(child))
    #
    #         logger.debug("Checking acl for {0}".format(child))
    #         for n, l in dep_graph.children(child):
    #             logger.debug(n)
    #             logger.debug("label {0}".format(l))
    #             if "acl" in l:
    #                 is_pure_noun = False
    #                 break
    #
    #     if is_pure_noun:
    #         merging_noun_nodes = [n for n in merging_noun_nodes if min_loc <= n.LOC <= max_loc]
    #         is_pure_noun = not any(n.UPOS in {"ADP", "VERB", "SCONJ", "AUX"} for n in merging_noun_nodes)
    #
    #     if is_pure_noun:
    #         # merged_noun_nodes.sort(key=lambda x: x.LOC)
    #         for node in merging_noun_nodes:
    #             logger.debug("merging {0}".format(node))
    #
    #         new_noun = merge_dep_nodes(merging_noun_nodes, UPOS=root.UPOS, LOC=root.LOC)
    #         dep_graph.replace_nodes(merging_noun_nodes, new_noun)
    #
    #         return new_noun, []

    root_parents = list(set(parent
                            for parent, rels in dep_graph.parents(root)))
    root_parents.sort(key=lambda x: x.LOC)

    # ic(list(map(str, root_parents)))

    # NOTE: "palceholder" is a historical typo kept for consistency with
    # build_conjunction_node's return value
    conj_node, with_arg_palceholder = build_conjunction_node(
        dep_graph, root, root_parents, parallel_components)

    relation_to_conj = get_relation_to_conj(dep_graph, root, root_parents,
                                            parallel_components)

    # case/mark/cc children of each component, keyed by component ID
    case_marks = dict()
    for index, node in enumerate(parallel_components):
        case_marks[node.ID] = [(n, l) for n, l in dep_graph.children(node)
                               if ("case" in l or "mark" in l or "cc" in l)]
    for key, values in case_marks.items():
        for v in values:
            logger.debug("case_marker = {} {} {}".format(
                key, v[0].ID, v[1].rels))

    logger.debug("relation_to_conj = {}".format(relation_to_conj))

    for parent in root_parents:
        # ic(parent)

        prefix, shared_prefix, required_mark = relation_to_conj[parent.ID]
        # core relations, or components sharing a single mark, attach the
        # parent to the conjunction node as a whole ...
        if any(x in prefix for x in {"subj", "obj", "ccomp", "xcomp"}) \
                or not required_mark or len(set(required_mark)) == 1:

            for node in parallel_components:
                dep_graph.remove_dependency(parent, node)

            relation = prefix

            if required_mark and len(set(required_mark)) == 1:
                ## with same mark

                mark_lemma = list(set(required_mark))[0]

                relation += ":" + mark_lemma

                mark_node = find_mark(case_marks, parallel_components,
                                      mark_lemma)

                if mark_node:

                    mark_node, mark_rel = mark_node

                    dep_graph.remove_node(mark_node)
                    dep_graph.add_node(mark_node)  # clear the dependency

                    dep_graph.add_dependency(conj_node, mark_node, mark_rel)
                else:
                    logger.error("cannot find the mark node")

            dep_graph.add_dependency(parent, conj_node, relation)

        else:
            # ... otherwise each component keeps its own edge to the
            # parent, annotated with its own mark

            complete_missing_case_mark(dep_graph, root, root_parents,
                                       parallel_components, relation_to_conj,
                                       case_marks)

            if not required_mark:
                required_mark = [None] * len(parallel_components)

            for index, (node, mark) in enumerate(
                    zip(parallel_components, required_mark)):
                if mark:
                    rel = prefix + ":" + mark
                else:
                    rel = prefix

                # if rel.startswith("conj"):
                #    continue
                logger.debug("add dependency {0}".format(
                    (parent.ID, node.ID, rel)))

                dep_graph.add_dependency(parent, node, rel)

        # finally, link every component to the conjunction node
        for idx, node in enumerate(parallel_components):
            if node != root:
                rels = dep_graph.get_dependency(root, node)
                for rel in rels:
                    if rel.startswith("conj"):
                        dep_graph.remove_dependency(root, node)

            if with_arg_palceholder:
                index = idx + 1
            else:
                # a, but b, b should be the arg1 and a be the arg2
                index = len(parallel_components) - idx

            dep_graph.add_dependency(conj_node, node,
                                     "arg_conj:{0}".format(index))

    return conj_node, parallel_components
# Esempio n. 14
# 0
def multi_word_fix_flat(dep_graph: DependencyGraph):
    """Merge fixed/flat/compound multi-word expressions into one node.

    Finds the root of each fixed/flat/compound chain (a node with such
    children but no such parent), gathers the offsprings of the chain,
    and merges every node inside the chain's LOC span — plus any directly
    adjacent quote tokens — into a single node keeping the head's
    UPOS and LOC. Longer phrases are merged first so nested ones are
    absorbed.

    :param dep_graph: the dependency graph, rewritten in place
    """

    fixed_rels = {"fixed", "flat", "compound"}

    phrases = []

    for node in dep_graph.nodes():

        parents = [n for n, l in dep_graph.parents(node,
                                                   filter=lambda n, l: any(x in l for x in fixed_rels))]

        if parents:
            # not the root of a fixed/flat/compound chain
            continue

        phrase = []
        for n, l in dep_graph.children(node,
                                       filter=lambda n, l: any(x in l for x in fixed_rels)):
            phrase.extend(dep_graph.offsprings(n))

        if not phrase:
            continue

        phrase.append(node)

        if len(phrase) > 1:
            phrase.sort(key=lambda n: n.LOC)
            phrases.append((phrase, node))

    # longest first, so nested phrases are consumed by their container
    phrases.sort(key=lambda x: len(x[0]), reverse=True)

    for phrase, head in phrases:

        if not all([dep_graph.get_node(x.ID) for x in phrase]):
            continue  # already been processed

        # compute the LOC span covered by the phrase
        merging_nodes = set()
        min_loc = 10000
        max_loc = -1
        for child in phrase:
            if isinstance(child, DependencyGraphNode):
                min_loc = min(min_loc, child.LOC)
                # Bug fix: was max(min_loc, child.LOC), which could
                # understate the right edge of the span.
                max_loc = max(max_loc, child.LOC)
            elif isinstance(child, DependencyGraphSuperNode):
                min_loc = min(min_loc, min([x.LOC for x in child.nodes]))
                max_loc = max(max_loc, max([x.LOC for x in child.nodes]))
            merging_nodes.update(dep_graph.offsprings(child))

        merged_nodes = set([n for n in merging_nodes if min_loc <= n.LOC <= max_loc])
        # pull in quote tokens hugging the span on either side
        for node in merging_nodes:
            if node.LOC == min_loc - 1 and node.LEMMA in {"\"", "\'"}:
                merged_nodes.add(node)
            if node.LOC == max_loc + 1 and node.LEMMA in {"\"", "\'"}:
                merged_nodes.add(node)
        merged_nodes = sorted(merged_nodes, key=lambda x: x.LOC)

        logger.debug("multi_word_fix_flat: we are merging ")
        logger.debug("\n".join(str(node) for node in merged_nodes))
        logger.debug("with head \n" + str(head))
        new_node = merge_dep_nodes(merged_nodes, UPOS=head.UPOS, LOC=head.LOC)

        dep_graph.replace_nodes(merged_nodes, new_node)
# Esempio n. 15
# 0
def verb_phrase(dep_graph: DependencyGraph):
    """
    ##### Merging aux and cop with their head VERB #####

    Merge a verb with its contiguous advmod/compound modifiers and its
    aux children into a single VERB super-node carrying the root verb's
    LOC and FEATS.

    :param dep_graph: the dependency graph, rewritten in place
    :return: None
    """
    verb_phrases = []

    for node in dep_graph.nodes(filter=lambda x: x.UPOS in {"VERB", "AUX"}):

        # an AUX that serves as aux of some head will be merged into that
        # head instead of starting its own phrase
        if node.UPOS == "AUX":
            parent = [
                n for n, l in dep_graph.parents(node,
                                                filter=lambda n, l: l == "aux")
            ]
            if len(parent) > 0:
                continue

        #        if "VerbForm" in node.FEATS and "Ger" in node.FEATS["VerbForm"]:
        #            continue

        if "Tense" in node.FEATS and "Past" in node.FEATS["Tense"]:
            # if the verb is before the noun, it will be processed by noun_phrase and taken as a part of the noun
            parent = [
                n for n, l in dep_graph.parents(
                    node, filter=lambda n, l: l == "amod" and node.LOC < n.LOC)
            ]
            if len(parent) > 0:
                continue
        # logger.debug("We are checking node {0}".format(node))

        root = node
        verbs = [root]
        for n, l in dep_graph.children(root):
            # skip mutual (cyclic) dependencies
            if dep_graph.get_dependency(n, root):
                continue

            # discourse-like adverbs are kept out of the phrase
            if n.LEMMA in {"so", "also", "why"}:
                continue

            if "advmod" in l:
                # only absorb adverb subtrees free of verbal/nominal content
                offsprings = list(dep_graph.offsprings(n))
                if any(x.UPOS in {"VERB", "NOUN", "AUX", "PRON"}
                       for x in offsprings):
                    continue

                verbs.extend(offsprings)
            elif "compound" in l:
                verbs.append(n)

        # keep modifiers left of the root, plus compound children
        verbs = [
            x for x in verbs if x.LOC <= root.LOC
            or "compound" in dep_graph.get_dependency(root, x)
        ]

        # logger.debug("Verb: before continuous component ")
        # logger.debug("\n".join(str(verb) for verb in verbs))

        # restrict to the run of tokens contiguous with the root
        verbs = continuous_component(verbs, root)

        # add aux
        verbs.extend(n for n, l in dep_graph.children(root) if "aux" in l)

        # logger.debug("Verb: after continuous component ")
        # for verb in verbs:
        #    logger.debug(verb)

        verbs.sort(key=lambda x: x.LOC)
        last_loc = verbs[-1].LOC

        #        next_node = dep_graph.get_node_by_loc(last_loc + 1)
        #        if next_node and next_node.LEMMA == "not":
        #            verbs.append(next_node)

        if len(verbs) > 1:
            verb_phrases.append((verbs, root))

    for verbs, root in verb_phrases:
        verb_node = merge_dep_nodes(verbs,
                                    UPOS="VERB",
                                    LOC=root.LOC,
                                    FEATS=root.FEATS)

        dep_graph.replace_nodes(verbs, verb_node)
# Esempio n. 16
# 0
def noun_phrase(dep_graph: DependencyGraph):
    """Merge every noun phrase in ``dep_graph`` into a single super node.

    For each noun-like root (UPOS in NOUN/PROPN/X/NUM/SYM) the function
    gathers the child tokens that belong to the same noun phrase, discards
    phrases fully contained in a larger one, and replaces each surviving
    phrase with one merged node (``merge_dep_nodes`` + ``replace_nodes``).
    The graph is modified in place.

    :param dep_graph: dependency graph to rewrite in place
    :raises Exception: if two detected phrases partially overlap
        ("duplicate words found")
    """
    nouns = []
    # we first find np roots
    for root in dep_graph.nodes(
            filter=lambda x: x.UPOS in {"NOUN", "PROPN", "X", "NUM", "SYM"}):

        logger.debug("checking the node:")
        logger.debug(str(root))

        # np_elements = valid_np_element(root, dep_graph)
        # Relation labels on the root's incoming edges; underscores are
        # normalized to spaces so labels compare against token FORM/LEMMA.
        parent_rels = set(
            itertools.chain.from_iterable(l.values()
                                          for n, l in dep_graph.parents(root)))
        parent_rels = set(rel.replace("_", " ") for rel in parent_rels)

        # "case" children whose LEMMA/FORM matches an incoming relation label
        # are excluded from the phrase (presumably the marker is already
        # encoded on the edge — TODO confirm).
        escaped_case_node = set()
        if parent_rels:
            case_nodes = [
                x
                for x, l in dep_graph.children(root,
                                               filter=lambda n, l: l == "case")
            ]
            for node in case_nodes:
                if node.LEMMA.lower() in parent_rels or node.FORM.lower(
                ) in parent_rels:
                    # lemma is for including
                    escaped_case_node.add(node)

        valid_np_children = [(n, l) for n, l in dep_graph.children(
            root, filter=lambda n, l: is_valid_np_child(dep_graph, root, l, n))
                             ]
        logger.debug("noun_phrase: valid_np_children:")
        for node, l in valid_np_children:
            logger.debug(str(node))

        np_elements = [root]

        for n, l in valid_np_children:
            if n.UPOS == "ADP":
                continue
            # Children to the right of the root join only via a small set of
            # "tight" relations (fixed/compound/nummod/flat/...).
            if n.LOC > root.LOC and \
                    not any(l.startswith(x)
                            for x in {"fixed", "compound", "nummod",
                                      "nmod:tmod", "flat", "nmod:npmod", "dep"}):
                continue
            if n in escaped_case_node:
                continue

            # Skip merged conjunction super nodes.
            if isinstance(n, DependencyGraphSuperNode) and n.is_conj:
                continue

            # A child subtree joins the NP only if none of its tokens carries
            # a clausal/argument relation (acl/obl/advcl/subj/obj).
            offsprings = list(dep_graph.offsprings(n))
            valid_np_component = True

            for x in offsprings:
                for parent, rels in dep_graph.parents(x):
                    if any(x in rels
                           for x in {"acl", "obl", "advcl", "subj", "obj"}):
                        valid_np_component = False
                        break
                if not valid_np_component:
                    break
            if valid_np_component:
                np_elements.extend(offsprings)

        logger.debug("noun_phrase: candidate np_elements:")
        for node in np_elements:
            logger.debug(str(node))

        # Keep only the last determiner left of the root; elements before it
        # are cut from the phrase.
        det = [
            n for n, l in dep_graph.children(root,
                                             filter=lambda n, l: l == "det")
        ]
        det = [x for x in det if x.LOC <= root.LOC]
        det.sort(key=lambda x: x.LOC)

        if det:
            # raise Exception("noun phrase without det ")

            det = det[-1]
            # check the element should be continuous
            np_elements = [x for x in np_elements if det.LOC <= x.LOC]
            logger.debug(
                "noun_phrase: det found, cut the nodes before the det")

        filtered_np_elements = sorted(list(np_elements), key=lambda x: x.LOC)
        # if np_elements[-1].LOC - np_elements[0].LOC != len(np_elements) - 1:
        #     print ("root", root)
        #     for n in np_elements:
        #         print("np element", n.LOC, n)
        #     raise Exception("Bad Business Logic")
        # Strip leading dashes and leading ADP/CCONJ/PUNCT tokens until the
        # phrase starts with a content word.
        changed = True
        while changed:
            changed = False
            if filtered_np_elements and filtered_np_elements[0].LEMMA in {
                    "-", "--"
            }:
                filtered_np_elements.pop(0)
                changed = True
            if filtered_np_elements and filtered_np_elements[0].UPOS in {
                    "ADP", "CCONJ", "PUNCT"
            }:
                filtered_np_elements.pop(0)
                changed = True

        if filtered_np_elements:
            nouns.append((set(filtered_np_elements), root))

    # Drop phrases that are fully contained in another phrase; a partial
    # overlap between two phrases is treated as a logic error.
    sub_nouns = []
    for idx1, (phrase1, head1) in enumerate(nouns):
        for idx2, (phrase2, head2) in enumerate(nouns):
            if idx1 == idx2:
                continue

            phrasex, phrasey = (
                phrase1, phrase2) if len(phrase1) > len(phrase2) else (phrase2,
                                                                       phrase1)
            common = phrasex.intersection(phrasey)

            if not common:
                continue
            elif len(common) == len(phrasey):
                # node2 is a sub np of node1, delete
                sub_nouns.append(phrasey)
            else:
                print("Phrase 1", [x.ID for x in phrase1])
                print("Phrase 2", [x.ID for x in phrase2])
                # return
                raise Exception("duplicate words found")

    for idx, (phrase, head) in enumerate(nouns):

        if phrase in sub_nouns:
            continue

        phrase = sorted(list(phrase), key=lambda x: x.LOC)

        # Pull in quote characters immediately adjacent to the phrase.
        # NOTE(review): `phrase` is mutated while being iterated; insert(0,..)
        # shifts indices so elements may be revisited — confirm intended.
        for node in phrase:
            for child, _ in dep_graph.children(node):
                if child.LOC == phrase[0].LOC - 1 and child.LEMMA in {
                        "\"", "\'"
                }:
                    phrase.insert(0, child)
                if child.LOC == phrase[-1].LOC + 1 and child.LEMMA in {
                        "\"", "\'"
                }:
                    phrase.append(child)

        noun_node = merge_dep_nodes(phrase, UPOS=head.UPOS, LOC=phrase[-1].LOC)
        # print("Noun detected", noun_node.ID)
        dep_graph.replace_nodes(phrase, noun_node)
# Esempio n. 17  (scraping artifact — not code)
def advp_phrase(dep_graph: DependencyGraph):
    """Collapse adverbial phrases headed by an ADP into single ADV nodes.

    For each ADP token, siblings under every parent are scanned right to
    left; a chain of ``advmod`` ADJ/ADV tokens sitting immediately before
    the ADP is gathered (via ``valid_adjv_element``) and merged with the
    ADP into one node with UPOS ``ADV``.  When a merge happens, the
    ``case`` relation from the parent to the ADP is dropped.  The graph is
    modified in place.

    :param dep_graph: dependency graph to rewrite in place
    case: english-UD-12774
    """
    merge_plans = []
    dropped_case_rels = []

    for adp_node in dep_graph.nodes(filter=lambda n: n.UPOS in {"ADP"}):
        gathered = set()

        for parent, rel in dep_graph.parents(adp_node):
            # If this ADP is a case marker whose surface form is mirrored by
            # a relation label on the grandparent edge, stop looking at
            # further parents of this node.
            if "case" in rel and any(
                    adp_node.FORM in l.values() or adp_node.LEMMA in l.values()
                    for x, l in dep_graph.parents(parent)):
                break

            # Walk siblings from rightmost to leftmost, chaining advmod
            # ADJ/ADV tokens that sit in an unbroken run just left of the ADP.
            siblings = sorted(dep_graph.children(parent),
                              key=lambda pair: pair[0].LOC)
            merged_any = False
            boundary_loc = -1
            for child, child_rel in reversed(siblings):
                if child.LOC >= adp_node.LOC:
                    boundary_loc = child.LOC
                    continue
                if ("advmod" in child_rel
                        and child.UPOS in {"ADJ", "ADV"}
                        and child.LOC == boundary_loc - 1):
                    gathered.update(valid_adjv_element(child, dep_graph))
                    merged_any = True
                    boundary_loc = child.LOC

            if merged_any and 'case' in rel:
                dropped_case_rels.append((parent, adp_node, 'case'))

        if not gathered:
            continue

        gathered.add(adp_node)
        ordered = sorted(gathered, key=lambda x: x.LOC)
        merge_plans.append((ordered, adp_node))

    for src, trg, rel in dropped_case_rels:
        dep_graph.remove_dependency(src, trg, rel)

    for phrase, adp_node in merge_plans:
        advp_node = merge_dep_nodes(phrase, UPOS='ADV', LOC=adp_node.LOC)
        dep_graph.replace_nodes(phrase, advp_node)