Ejemplo n.º 1
0
    def find_best_partition(self, x, y, avail_attrs, depth):
        """Choose the best-scoring attribute from a random subset and build its node.

        Args:
            x: Sample matrix, indexed as ``x[:, feature_index]``.
            y: Targets passed unchanged to ``self.compute_score``.
            avail_attrs: Feature indices still available for splitting.
                NOTE: this sequence is shuffled in place.
            depth: Depth stored on the created node.

        Returns:
            A ``Node`` for the selected attribute with one child per value
            listed in ``self.attr_vals`` for that attribute. Each child
            receives the row indices of ``x`` where the attribute equals that
            value.

        Raises:
            ValueError: If there is no candidate attribute, or no candidate
                yields a score greater than negative infinity (for example
                every score is NaN).
        """
        # Randomly select F features from the available ones
        np.random.shuffle(avail_attrs)
        feat_ixs = avail_attrs[:self.n_features]

        # Compute the score for each attribute and keep the one with the
        # highest score. Starting from -inf (instead of a fixed sentinel such
        # as -100) guarantees that any finite score can win.
        best_feat_ix = None
        best_score = -np.inf
        for feat_ix in feat_ixs:
            score = self.compute_score(x[:, feat_ix], y)

            if score > best_score:
                best_feat_ix = feat_ix
                best_score = score

        # Feature index 0 is a valid choice, so test against None explicitly.
        if best_feat_ix is None:
            raise ValueError(
                "find_best_partition: no candidate attribute with a usable score")

        # Annotate this feature as selected in the tree creation (To measure the feature importance in the forest)
        self.feat_selected[best_feat_ix] = 1

        # Remove the attribute from the list of available attributes
        avail_attrs = [attr for attr in avail_attrs if attr != best_feat_ix]

        # Create the Node and add a child per value of the selected attribute
        out_node = Node(attribute=best_feat_ix,
                        avail_attrs=avail_attrs,
                        depth=depth,
                        children={})
        for val in self.attr_vals[best_feat_ix]:
            # Row indices of the samples whose selected attribute equals val.
            out_node.add_child(val,
                               np.argwhere(x[:, best_feat_ix] == val)[:, 0])

        return out_node
Ejemplo n.º 2
0
 def _make_named_entity(self, concept, literals):
     """Build the subgraph of a named entity.

     The returned root node carries ``concept`` and has two children: a
     "wiki" node whose label and tag are the literals joined with "_", and
     a "name" node holding one child per literal under the relations
     "op1", "op2", ... in the order given.
     """
     joined = "_".join(literals)
     wiki = Node(joined, tag=joined)

     names = Node("name")
     for position, text in enumerate(literals, start=1):
         names.add_child(Node(text, tag=text), f"op{position}")

     root = Node(concept)
     root.add_child(wiki, "wiki")
     root.add_child(names, "name")
     return root
Ejemplo n.º 3
0
 def _make_date_entity(self, date_relations, quantities):
     """Build a "date-entity" node with one child per (relation, quantity) pair.

     Relations and quantities are paired positionally with ``zip``, so any
     surplus items in the longer sequence are ignored. Each quantity is
     passed to ``Node`` as both of its positional arguments.
     """
     root = Node("date-entity")
     for relation, value in zip(date_relations, quantities):
         leaf = Node(value, value)
         root.add_child(leaf, relation)
     return root
Ejemplo n.º 4
0
def replace_named_entities(amr, sentence):
    """Collapse every named entity of an AMR into a single node and token.

    A named entity is any variable that has a "name" relation. For each one,
    the name variable, its "opN" literals and the optional "wiki" literal are
    removed from a deep copy of ``amr``, and the sentence tokens spanned by
    the literals are replaced by the concept of the entity root.

    Args:
        amr: Mapping ``variable -> {relation: [children]}`` that also exposes
            the attributes ``node_to_concepts`` and ``node_to_tokens``.
        sentence: The sentence, tokens separated by single spaces.

    Returns:
        A tuple ``(amr, sentence, metadata)``. When no named entity exists the
        original ``amr`` object (not a copy), the unchanged sentence and an
        empty list are returned. Otherwise ``metadata`` is a list sorted by
        first token index whose items are
        ``(root_var, name_var, literals, min_token, max_token, subgraph)``,
        where ``subgraph`` is a ``Node`` tree describing the removed part.

    Raises:
        ValueError: If an entity has no "opN" literal (``min`` of an empty
            list).

    Note:
        The alignment written for the entity root is an ``int``; all other
        alignments touched by this function become ``str``. This mirrors the
        historical behaviour.
    """
    amr_copy = copy.deepcopy(amr)

    # Pairs of (entity root variable, variable of its "name" child).
    name_nodes = [(k, amr_copy[k]["name"][0]) for k in amr_copy
                  if amr_copy[k] and "name" in amr_copy[k]]
    if not name_nodes:
        return amr, sentence, []

    def shift_alignment(t, threshold, width):
        # Alignments are either (index, variable) tuples or bare indices.
        # Everything at or after `threshold` moves left by `width`.
        if isinstance(t, tuple):
            if int(t[0]) < threshold:
                return t
            return (str(int(t[0]) - width), t[1])
        if int(t) < threshold:
            return str(t)
        return str(int(t) - width)

    # Collect, per entity, its literals, the token span they cover and a
    # Node subgraph (root -> name -> quoted literals) for later restoration.
    op_regexp = re.compile("^op([0-9])+$")
    named_entities = []
    for root_var, name_var in name_nodes:
        op_rel_list = amr_copy[name_var]
        literals = []
        name_node = Node("name")
        for op_rel in op_rel_list:
            if op_regexp.match(op_rel):
                literal = op_rel_list[op_rel][0]
                literals.append(literal)
                name_node.add_child(Node(None, "\"" + literal + "\""), op_rel)
        subgraph = Node(amr_copy.node_to_concepts[root_var])
        subgraph.add_child(name_node, "name")
        positions = [
            int(amr_copy.node_to_tokens[literal][0][0])
            for literal in literals
        ]
        named_entities.append((root_var, name_var, literals,
                               min(positions), max(positions), subgraph))

    # Remove name vars from node_to_concepts
    name_variables = {entity[1] for entity in named_entities}
    amr_copy.node_to_concepts = {
        key: value for key, value in amr_copy.node_to_concepts.items()
        if key not in name_variables
    }

    # Remove literals from node_to_tokens
    literals = {literal for entity in named_entities for literal in entity[2]}
    amr_copy.node_to_tokens = {
        key: value for key, value in amr_copy.node_to_tokens.items()
        if key not in literals
    }

    # Remove name vars and literals from the copied graph itself
    for key in literals:
        amr_copy.pop(key, None)
    for key in name_variables:
        amr_copy.pop(key, None)

    # Update name root vars to have no name and wiki children; the wiki
    # literal is kept on the entity's subgraph instead.
    for entity in named_entities:
        root_var = entity[0]
        if "wiki" in amr_copy[root_var]:
            wiki_content = amr_copy[root_var]["wiki"][0]
            amr_copy.pop(wiki_content, None)
            entity[5].add_child(Node(None, "\"" + wiki_content + "\""),
                                "wiki")
        amr_copy[root_var] = {
            key: value for key, value in amr_copy[root_var].items()
            if key != "name" and key != "wiki"
        }

    # Add name root vars in node_to_tokens and update incrementally the token
    # indices of the affected nodes, processing entities left to right.
    named_entities = sorted(named_entities, key=itemgetter(3))
    tokens = sentence.split(" ")
    total_displacement = 0
    for entity in named_entities:
        span_min, span_max = entity[3], entity[4]
        width = span_max - span_min
        # Alignments and tokens were already shifted by the entities handled
        # so far, so the span must be expressed in those displaced
        # coordinates. Comparing against the raw span_max would leave
        # alignments that follow this entity unshifted whenever an earlier
        # entity spanned several tokens.
        start = span_min - total_displacement
        end = span_max - total_displacement
        for n in amr_copy.node_to_tokens:
            amr_copy.node_to_tokens[n] = [
                shift_alignment(t, end, width)
                for t in amr_copy.node_to_tokens[n]
            ]
        amr_copy.node_to_tokens[entity[0]] = [start]
        tokens = (tokens[:start] +
                  [amr_copy.node_to_concepts[entity[0]]] +
                  tokens[end + 1:])
        total_displacement += width
    sentence_copy = " ".join(tokens)
    return amr_copy, sentence_copy, named_entities
Ejemplo n.º 5
0
def replace_date_entities(amr, sentence):
    """Collapse every "date-entity" of an AMR into a single node and token.

    For each variable whose concept is "date-entity", the literal children
    reached through date relations (children that are not themselves concept
    variables) are removed from a deep copy of ``amr``, and the sentence
    tokens they span are replaced by the concept of the entity.

    Args:
        amr: Mapping ``variable -> {relation: [children]}`` that also exposes
            the attributes ``node_to_concepts`` and ``node_to_tokens``.
        sentence: The sentence, tokens separated by single spaces.

    Returns:
        A tuple ``(amr, sentence, metadata)``. When no date entity exists the
        original ``amr`` object (not a copy), the unchanged sentence and an
        empty list are returned. Otherwise ``metadata`` is a list sorted by
        first token index whose items are
        ``(entity_var, literals, relations, min_token, max_token, subgraph)``,
        where ``subgraph`` is a ``Node`` tree describing the removed part.

    Raises:
        ValueError: If a date entity has no literal child under a date
            relation (``min`` of an empty list).

    Note:
        The alignment written for the entity is an ``int``; all other
        alignments touched by this function become ``str``. This mirrors the
        historical behaviour.
    """
    amr_copy = copy.deepcopy(amr)

    date_rels = [
        "calendar", "century", "day", "dayperiod", "decade", "month", "quant",
        "quarter", "season", "time", "time-of", "timezone", "unit", "weekday",
        "year"
    ]
    date_entity_vars = [
        k for k in amr_copy
        if amr_copy.node_to_concepts.get(k) == "date-entity"
    ]

    if not date_entity_vars:
        return amr, sentence, []

    def shift_alignment(t, threshold, width):
        # Alignments are either (index, variable) tuples or bare indices.
        # Everything at or after `threshold` moves left by `width`.
        if isinstance(t, tuple):
            if int(t[0]) < threshold:
                return t
            return (str(int(t[0]) - width), t[1])
        if int(t) < threshold:
            return str(t)
        return str(int(t) - width)

    # Collect, per entity, its literal children, their relations, the token
    # span they cover and a Node subgraph for later restoration.
    date_entities = []
    for entity_var in date_entity_vars:
        op_rel_list = amr_copy[entity_var]
        literals = []
        relations = []
        node = Node("date-entity")
        for op_rel in op_rel_list:
            if op_rel in date_rels:
                child = op_rel_list[op_rel][0]
                # Only children that are not concept variables are literals.
                if child not in amr_copy.node_to_concepts:
                    literals.append(child)
                    relations.append(op_rel)
                    node.add_child(Node(None, child), op_rel)
        positions = [
            int(amr_copy.node_to_tokens[literal][0][0])
            for literal in literals
        ]
        date_entities.append((entity_var, literals, relations,
                              min(positions), max(positions), node))

    # Remove literals from node_to_tokens
    literals = {literal for entity in date_entities for literal in entity[1]}
    amr_copy.node_to_tokens = {
        key: value for key, value in amr_copy.node_to_tokens.items()
        if key not in literals
    }

    # Remove literals from the copied graph itself
    for key in literals:
        amr_copy.pop(key, None)

    # Drop the collapsed relations from each date entity
    for entity in date_entities:
        amr_copy[entity[0]] = {
            key: value for key, value in amr_copy[entity[0]].items()
            if key not in entity[2]
        }

    # Add date entity vars in node_to_tokens and update incrementally the
    # token indices of the affected nodes, processing entities left to right.
    date_entities = sorted(date_entities, key=itemgetter(3))
    tokens = sentence.split(" ")
    total_displacement = 0
    for entity in date_entities:
        span_min, span_max = entity[3], entity[4]
        width = span_max - span_min
        # Alignments and tokens were already shifted by the entities handled
        # so far, so the span must be expressed in those displaced
        # coordinates. Comparing against the raw span_max would leave
        # alignments that follow this entity unshifted whenever an earlier
        # entity spanned several tokens.
        start = span_min - total_displacement
        end = span_max - total_displacement
        for n in amr_copy.node_to_tokens:
            amr_copy.node_to_tokens[n] = [
                shift_alignment(t, end, width)
                for t in amr_copy.node_to_tokens[n]
            ]
        amr_copy.node_to_tokens[entity[0]] = [start]
        tokens = (tokens[:start] +
                  [amr_copy.node_to_concepts[entity[0]]] +
                  tokens[end + 1:])
        total_displacement += width
    sentence_copy = " ".join(tokens)
    return amr_copy, sentence_copy, date_entities
Ejemplo n.º 6
0
def test_replace_named_entities():
    """Check replace_named_entities on a sentence with one named entity.

    The AMR for the sentence below is built by hand, with "NATO" as the only
    named entity (root variable 'm', name variable 'n'). The test asserts the
    rewritten sentence, the rewritten graph, the printed metadata subgraph
    and the individual metadata fields.
    """
    sentence = "The center will bolster NATO 's defenses against cyber attacks ."
    # Textual form of the AMR constructed below; it is not parsed or used by
    # the code visible in this test.
    amr_str = """(b / bolster-01~e.3 
      :ARG0 (c / center~e.1) 
      :ARG1 (d / defend-01~e.6 
            :ARG0~e.5 (m / military :wiki "NATO" 
                  :name (n / name :op1 "NATO"~e.4)) 
            :ARG1 m 
            :ARG3 (a / attack-01~e.9 
                  :medium (c2 / cyber~e.8))))"""
    # Input graph: concepts, token alignments, then one entry per variable
    # mapping relation -> list of children.
    amr: AMR = AMR()
    amr.node_to_concepts = {
        'c': 'center',
        'n': 'name',
        'm': 'military',
        'c2': 'cyber',
        'a': 'attack-01',
        'd': 'defend-01',
        'b': 'bolster-01'
    }
    amr.node_to_tokens = {
        'NATO': [('4', 'n')],
        'c2': ['8'],
        'a': ['9'],
        'c': ['1'],
        'd': ['6'],
        'b': ['3']
    }
    amr.relation_to_tokens = {'ARG0': [('5', 'd')]}
    amr['c'] = {}
    amr["NATO"] = {}
    amr['n'] = {'op1': ["NATO"]}
    amr['m'] = {'wiki': ["NATO"], 'name': ['n']}
    amr['c2'] = {}
    amr['a'] = {'medium': ['c2']}
    amr['d'] = {'ARG0': ['m'], 'ARG1': ['m'], 'ARG3': ['a']}
    amr['b'] = {'ARG0': ['c'], 'ARG1': ['d']}
    amr.reentrance_triples = [('d', 'ARG1', 'm')]
    amr.roots = ['b']
    generated_amr, generated_sentence, generated_metadata = replace_named_entities(
        amr, sentence)
    # Index 5 of a metadata tuple is the Node subgraph of the removed entity.
    generated_subgraph = generated_metadata[0][5]
    # "NATO" (token 4) is expected to be replaced by the concept of 'm'.
    expected_sentence = "The center will bolster military 's defenses against cyber attacks ."
    # Expected graph: the name variable 'n' and the "NATO" literal are gone
    # and 'm' has lost its "name" and "wiki" relations.
    expected_amr: AMR = AMR()
    expected_amr.node_to_concepts = {
        'c': 'center',
        'm': 'military',
        'c2': 'cyber',
        'a': 'attack-01',
        'd': 'defend-01',
        'b': 'bolster-01'
    }
    # NOTE(review): no alignment for 'm' is listed here; whether `==` on AMR
    # compares these attributes at all is not visible from this test - confirm.
    expected_amr.node_to_tokens = {
        'c2': ['8'],
        'a': ['9'],
        'c': ['1'],
        'd': ['6'],
        'b': ['3']
    }
    expected_amr.relation_to_tokens = {'ARG0': [('5', 'd')]}
    expected_amr['c'] = {}
    expected_amr['m'] = {}
    expected_amr['c2'] = {}
    expected_amr['a'] = {'medium': ['c2']}
    expected_amr['d'] = {'ARG0': ['m'], 'ARG1': ['m'], 'ARG3': ['a']}
    expected_amr['b'] = {'ARG0': ['c'], 'ARG1': ['d']}
    expected_amr.reentrance_triples = [('d', 'ARG1', 'm')]
    expected_amr.roots = ['b']
    # Expected metadata subgraph: military -> name -> op1 "NATO", plus a
    # wiki "NATO" child on the root.
    expected_subgraph: Node = Node('military')
    n = Node('name')
    op1_literal = Node(None, '\"NATO\"')
    wiki_literal = Node(None, '\"NATO\"')
    n.add_child(op1_literal, 'op1')
    expected_subgraph.add_child(n, 'name')
    expected_subgraph.add_child(wiki_literal, 'wiki')
    assert generated_sentence == expected_sentence
    assert generated_amr == expected_amr
    # Subgraphs are compared through their printed form.
    assert generated_subgraph.amr_print() == expected_subgraph.amr_print()
    # Metadata fields: root var, name var, literals, first and last token.
    assert generated_metadata[0][0] == 'm'
    assert generated_metadata[0][1] == 'n'
    assert generated_metadata[0][2] == ['NATO']
    assert generated_metadata[0][3] == 4
    assert generated_metadata[0][4] == 4