def find_best_partition(self, x, y, avail_attrs, depth): # Randomly select F features from the available ones np.random.shuffle(avail_attrs) feat_ixs = avail_attrs[:self.n_features] # Compute the score for each attribute and keep the one with the highest score best_score = -100 for feat_ix in feat_ixs: score = self.compute_score(x[:, feat_ix], y) if score > best_score: best_feat_ix = feat_ix best_score = score # Annotate this feature as selected in the tree creation (To measure the feature importance in the forest) self.feat_selected[best_feat_ix] = 1 # Remove the attribute from the list of available attributes avail_attrs = [attr for attr in avail_attrs if attr != best_feat_ix] # Create the Node and add a child per value of the selected attribute out_node = Node(attribute=best_feat_ix, avail_attrs=avail_attrs, depth=depth, children={}) for val in self.attr_vals[best_feat_ix]: out_node.add_child(val, np.argwhere(x[:, best_feat_ix] == val)[:, 0]) return out_node
def _make_named_entity(self, concept, literals):
    """Build a named-entity subgraph rooted at ``concept``.

    The root gets a ``wiki`` child whose value is the literals joined with
    underscores, and a ``name`` child holding one ``opN`` child per literal
    (numbered from 1, in the given order).

    Args:
        concept: Value used for the root node.
        literals: Sequence of strings making up the entity name.

    Returns:
        The root ``Node`` of the created subgraph.
    """
    joined = "_".join(literals)
    wiki = Node(joined, tag=joined)

    names = Node("name")
    for position, token in enumerate(literals, start=1):
        names.add_child(Node(token, tag=token), f"op{position}")

    entity = Node(concept)
    entity.add_child(wiki, "wiki")
    entity.add_child(names, "name")
    return entity
def _make_date_entity(self, date_relations, quantities):
    """Build a ``date-entity`` node with one child per (relation, quantity).

    Relations and quantities are paired positionally with ``zip``, so any
    surplus items in the longer sequence are ignored.

    Args:
        date_relations: Relation labels used to attach the children.
        quantities: Values; each is passed to ``Node`` as both arguments.

    Returns:
        The ``date-entity`` root ``Node``.
    """
    root = Node("date-entity")
    for relation, amount in zip(date_relations, quantities):
        leaf = Node(amount, amount)
        root.add_child(leaf, relation)
    return root
def replace_named_entities(amr, sentence):
    """Collapse every named entity into its root concept.

    Works on a deep copy of ``amr``. For each node with a ``name`` relation,
    the name variable, its ``opN`` literals and any ``wiki`` value are removed
    from the graph. The sentence tokens covered by the literals are replaced
    by the root concept, and token alignments are shifted left to match.

    Args:
        amr: Mapping ``variable -> {relation: [children]}`` that also exposes
            ``node_to_concepts`` and ``node_to_tokens`` attributes.
        sentence: Space-separated token string aligned with ``amr``.

    Returns:
        A tuple ``(amr, sentence, named_entities)``. ``named_entities`` is
        sorted by first token index and each item is
        ``(root_var, name_var, literals, first_token, last_token, subgraph)``,
        with token indices in the coordinates of the original sentence and
        ``subgraph`` the removed structure as a ``Node``. If the graph has no
        named entity, the original ``amr`` object (not a copy), the original
        sentence and an empty list are returned.
    """
    amr_copy = copy.deepcopy(amr)

    # Find all the nodes which have a :name relation along with the node
    # containing the "name" variable
    name_nodes = [(k, amr_copy[k]["name"][0]) for k in amr_copy
                  if amr_copy[k] and "name" in amr_copy[k]]
    if not name_nodes:
        return amr, sentence, []

    def shift_token(token, threshold, width):
        # Alignments are either (index, variable) tuples or bare indices.
        # Entries at or beyond the threshold move left by the collapsed width.
        if isinstance(token, tuple):
            if int(token[0]) < threshold:
                return token
            return (str(int(token[0]) - width), token[1])
        if int(token) < threshold:
            return str(token)
        return str(int(token) - width)

    # Compiled once instead of once per named entity.
    op_regexp = re.compile("^op([0-9])+$")

    # Collect, per entity: root var, name var, literals, first and last token
    # index of the literals, and the subgraph being removed.
    named_entities = []
    for root_var, name_var in name_nodes:
        op_rel_list = amr_copy[name_var]
        literals = []
        name_node = Node("name")
        for op_rel in op_rel_list:
            if op_regexp.match(op_rel):
                literal = op_rel_list[op_rel][0]
                literals.append(literal)
                name_node.add_child(Node(None, "\"" + literal + "\""), op_rel)
        root = Node(amr_copy.node_to_concepts[root_var])
        root.add_child(name_node, "name")
        token_ixs = [int(amr_copy.node_to_tokens[literal][0][0])
                     for literal in literals]
        named_entities.append((root_var, name_var, literals,
                               min(token_ixs), max(token_ixs), root))

    # Remove name vars from node_to_concepts
    name_variables = [entity[1] for entity in named_entities]
    name_variable_set = set(name_variables)
    amr_copy.node_to_concepts = {
        key: value for key, value in amr_copy.node_to_concepts.items()
        if key not in name_variable_set
    }

    # Remove literals from node_to_tokens
    literals = [lit for entity in named_entities for lit in entity[2]]
    literal_set = set(literals)
    amr_copy.node_to_tokens = {
        key: value for key, value in amr_copy.node_to_tokens.items()
        if key not in literal_set
    }

    # Remove name vars and literals from the graph itself
    for key in literals + name_variables:
        if key in amr_copy:
            amr_copy.pop(key)

    # Update name root vars to have no name and wiki children
    for entity in named_entities:
        root_var = entity[0]
        if "wiki" in amr_copy[root_var]:
            wiki_content = amr_copy[root_var]["wiki"][0]
            if wiki_content in amr_copy:
                amr_copy.pop(wiki_content)
            entity[5].add_child(Node(None, "\"" + wiki_content + "\""), "wiki")
        amr_copy[root_var] = {
            key: value for key, value in amr_copy[root_var].items()
            if key != "name" and key != "wiki"
        }

    # Add name root vars in node_to_tokens and update incrementally the token
    # indices of the affected nodes
    named_entities = sorted(named_entities, key=itemgetter(3))
    tokens = sentence.split(" ")
    total_displacement = 0
    for entity in named_entities:
        root_var, span_min, span_max = entity[0], entity[3], entity[4]
        width = span_max - span_min
        # Spans are in original-sentence coordinates, but the stored
        # alignments have already moved left by total_displacement, so the
        # comparison must use the shifted end of the span.
        threshold = span_max - total_displacement
        for n in amr_copy.node_to_tokens:
            amr_copy.node_to_tokens[n] = [
                shift_token(t, threshold, width)
                for t in amr_copy.node_to_tokens[n]
            ]
        # Stored as an int here; later iterations normalise it to str.
        amr_copy.node_to_tokens[root_var] = [span_min - total_displacement]
        tokens = (tokens[:span_min - total_displacement]
                  + [amr_copy.node_to_concepts[root_var]]
                  + tokens[span_max - total_displacement + 1:])
        total_displacement += width

    sentence_copy = " ".join(tokens)
    return amr_copy, sentence_copy, named_entities
def replace_date_entities(amr, sentence):
    """Collapse every ``date-entity`` node's literal children into the node.

    Works on a deep copy of ``amr``. For each variable whose concept is
    ``date-entity``, children reached through a date relation that are not
    themselves concept variables are treated as literals. They are removed
    from the graph and from the alignments, the sentence tokens they cover
    are replaced by the node's concept, and alignments are shifted left.

    Args:
        amr: Mapping ``variable -> {relation: [children]}`` that also exposes
            ``node_to_concepts`` and ``node_to_tokens`` attributes.
        sentence: Space-separated token string aligned with ``amr``.

    Returns:
        A tuple ``(amr, sentence, date_entities)``. ``date_entities`` is
        sorted by first token index and each item is
        ``(date_var, literals, relations, first_token, last_token, subgraph)``,
        with token indices in the coordinates of the original sentence and
        ``subgraph`` the removed structure as a ``Node``. If the graph has no
        date entity, the original ``amr`` object (not a copy), the original
        sentence and an empty list are returned.

    Raises:
        ValueError: If a date entity has no literal child, because
            ``min``/``max`` are then applied to an empty list.
    """
    amr_copy = copy.deepcopy(amr)
    date_rels = {
        "calendar", "century", "day", "dayperiod", "decade", "month", "quant",
        "quarter", "season", "time", "time-of", "timezone", "unit", "weekday",
        "year"
    }

    date_vars = [
        k for k in amr_copy
        if k in amr_copy.node_to_concepts
        and amr_copy.node_to_concepts[k] == "date-entity"
    ]
    if not date_vars:
        return amr, sentence, []

    def shift_token(token, threshold, width):
        # Alignments are either (index, variable) tuples or bare indices.
        # Entries at or beyond the threshold move left by the collapsed width.
        if isinstance(token, tuple):
            if int(token[0]) < threshold:
                return token
            return (str(int(token[0]) - width), token[1])
        if int(token) < threshold:
            return str(token)
        return str(int(token) - width)

    # Collect, per entity: date var, literals, their relations, first and last
    # token index of the literals, and the subgraph being removed.
    date_entities = []
    for date_var in date_vars:
        rel_to_children = amr_copy[date_var]
        literals = []
        relations = []
        node = Node("date-entity")
        for rel in rel_to_children:
            if rel in date_rels:
                child = rel_to_children[rel][0]
                # Children with a concept of their own stay in the graph;
                # only literal values are folded into the entity.
                if child not in amr_copy.node_to_concepts:
                    literals.append(child)
                    relations.append(rel)
                    node.add_child(Node(None, child), rel)
        token_ixs = [int(amr_copy.node_to_tokens[literal][0][0])
                     for literal in literals]
        date_entities.append((date_var, literals, relations,
                              min(token_ixs), max(token_ixs), node))

    # Remove literals from node_to_tokens and from the graph
    literals = [lit for entity in date_entities for lit in entity[1]]
    literal_set = set(literals)
    amr_copy.node_to_tokens = {
        key: value for key, value in amr_copy.node_to_tokens.items()
        if key not in literal_set
    }
    for key in literals:
        if key in amr_copy:
            amr_copy.pop(key)

    # Drop the folded relations from each date-entity node
    for entity in date_entities:
        amr_copy[entity[0]] = {
            key: value for key, value in amr_copy[entity[0]].items()
            if key not in entity[2]
        }

    # Add date-entity vars in node_to_tokens and update incrementally the
    # token indices of the affected nodes
    date_entities = sorted(date_entities, key=itemgetter(3))
    tokens = sentence.split(" ")
    total_displacement = 0
    for entity in date_entities:
        date_var, span_min, span_max = entity[0], entity[3], entity[4]
        width = span_max - span_min
        # Spans are in original-sentence coordinates, but the stored
        # alignments have already moved left by total_displacement, so the
        # comparison must use the shifted end of the span.
        threshold = span_max - total_displacement
        for n in amr_copy.node_to_tokens:
            amr_copy.node_to_tokens[n] = [
                shift_token(t, threshold, width)
                for t in amr_copy.node_to_tokens[n]
            ]
        # Stored as an int here; later iterations normalise it to str.
        amr_copy.node_to_tokens[date_var] = [span_min - total_displacement]
        tokens = (tokens[:span_min - total_displacement]
                  + [amr_copy.node_to_concepts[date_var]]
                  + tokens[span_max - total_displacement + 1:])
        total_displacement += width

    sentence_copy = " ".join(tokens)
    return amr_copy, sentence_copy, date_entities
def test_replace_named_entities():
    """The NATO name/wiki subgraph is folded into the 'military' node."""
    sentence = "The center will bolster NATO 's defenses against cyber attacks ."
    # Reference rendering of the graph built by hand below.
    amr_str = """(b / bolster-01~e.3 :ARG0 (c / center~e.1) :ARG1 (d / defend-01~e.6 :ARG0~e.5 (m / military :wiki "NATO" :name (n / name :op1 "NATO"~e.4)) :ARG1 m :ARG3 (a / attack-01~e.9 :medium (c2 / cyber~e.8))))"""

    # --- input graph -------------------------------------------------------
    amr: AMR = AMR()
    amr.node_to_concepts = {
        "c": "center",
        "n": "name",
        "m": "military",
        "c2": "cyber",
        "a": "attack-01",
        "d": "defend-01",
        "b": "bolster-01",
    }
    amr.node_to_tokens = {
        "NATO": [("4", "n")],
        "c2": ["8"],
        "a": ["9"],
        "c": ["1"],
        "d": ["6"],
        "b": ["3"],
    }
    amr.relation_to_tokens = {"ARG0": [("5", "d")]}
    input_edges = {
        "c": {},
        "NATO": {},
        "n": {"op1": ["NATO"]},
        "m": {"wiki": ["NATO"], "name": ["n"]},
        "c2": {},
        "a": {"medium": ["c2"]},
        "d": {"ARG0": ["m"], "ARG1": ["m"], "ARG3": ["a"]},
        "b": {"ARG0": ["c"], "ARG1": ["d"]},
    }
    for variable, relations in input_edges.items():
        amr[variable] = relations
    amr.reentrance_triples = [("d", "ARG1", "m")]
    amr.roots = ["b"]

    # --- run ---------------------------------------------------------------
    generated_amr, generated_sentence, generated_metadata = replace_named_entities(
        amr, sentence)
    entity = generated_metadata[0]
    generated_subgraph = entity[5]

    # --- expected graph ----------------------------------------------------
    expected_sentence = "The center will bolster military 's defenses against cyber attacks ."
    expected_amr: AMR = AMR()
    expected_amr.node_to_concepts = {
        "c": "center",
        "m": "military",
        "c2": "cyber",
        "a": "attack-01",
        "d": "defend-01",
        "b": "bolster-01",
    }
    expected_amr.node_to_tokens = {
        "c2": ["8"],
        "a": ["9"],
        "c": ["1"],
        "d": ["6"],
        "b": ["3"],
    }
    expected_amr.relation_to_tokens = {"ARG0": [("5", "d")]}
    expected_edges = {
        "c": {},
        "m": {},
        "c2": {},
        "a": {"medium": ["c2"]},
        "d": {"ARG0": ["m"], "ARG1": ["m"], "ARG3": ["a"]},
        "b": {"ARG0": ["c"], "ARG1": ["d"]},
    }
    for variable, relations in expected_edges.items():
        expected_amr[variable] = relations
    expected_amr.reentrance_triples = [("d", "ARG1", "m")]
    expected_amr.roots = ["b"]

    # --- expected metadata subgraph ----------------------------------------
    expected_subgraph: Node = Node("military")
    name_node = Node("name")
    op1_literal = Node(None, '"NATO"')
    wiki_literal = Node(None, '"NATO"')
    name_node.add_child(op1_literal, "op1")
    expected_subgraph.add_child(name_node, "name")
    expected_subgraph.add_child(wiki_literal, "wiki")

    # --- checks ------------------------------------------------------------
    assert generated_sentence == expected_sentence
    assert generated_amr == expected_amr
    assert generated_subgraph.amr_print() == expected_subgraph.amr_print()
    # root var, name var, literals, first token index, last token index
    expected_fields = ("m", "n", ["NATO"], 4, 4)
    for position, expected_value in enumerate(expected_fields):
        assert entity[position] == expected_value