# NOTE: these vectorizers also rely on the project's embedding_utils and
# graph_utils helpers and on the MAX_EDGES_PER_GRAPH, property_blacklist and
# calculate_order_conversion names defined elsewhere in this module.
import pdb

import numpy as np
import tqdm


def to_indices_with_real_entities_and_entity_nums_with_vertex_padding_and_entity_pair(
        graphs, word2idx, property2idx, max_sent_len, mode='train', **kwargs):
    """
    Vectorize graphs into per-graph matrices with up to MAX_EDGES_PER_GRAPH
    edges per row, returning the entity count per graph and the KB-id pair of
    each edge's entities in addition to the usual matrices.

    :param graphs: list of graph dicts with "tokens", "edgeSet" and "vertexSet"
    :param word2idx: mapping from token to word-embedding index
    :param property2idx: mapping from property KB id to relation index
    :param max_sent_len: maximum sentence length in tokens
    :return: sentences_matrix, entity_matrix, y_matrix, entity_cnt, entity_pair
    """
    graphs_to_process = []
    for g in graphs:
        if len(g['edgeSet']) > 0:
            if len(g['edgeSet']) <= MAX_EDGES_PER_GRAPH:
                graphs_to_process.append(g)
            else:
                continue  # here we discard these data points
                # Unreachable because of the `continue` above; retained from the
                # splitting variant in to_indices_with_real_entities_completely.
                for i in range(0, len(g['edgeSet']), MAX_EDGES_PER_GRAPH):
                    graphs_to_process.append({
                        "tokens": g["tokens"],
                        "edgeSet": g["edgeSet"][i:i + MAX_EDGES_PER_GRAPH]
                    })
    graphs = graphs_to_process
    sentences_matrix = np.zeros((len(graphs), max_sent_len), dtype="int32")
    entity_matrix = np.zeros((len(graphs), MAX_EDGES_PER_GRAPH, max_sent_len),
                             dtype="int8")
    y_matrix = np.zeros((len(graphs), MAX_EDGES_PER_GRAPH), dtype="int16")
    entity_cnt = []
    pos2id = dict()
    entity_pair = []
    for index, g in enumerate(tqdm.tqdm(graphs, ascii=True)):
        try:
            entity_cnt.append(len(g["vertexSet"]))
            # Map each entity's token-position tuple to its KB id so edges can
            # be resolved to entity-pair KB ids below.
            for i in g['vertexSet']:
                pos2id[tuple(i['tokenpositions'])] = i['kbID']
        except (KeyError, TypeError):
            continue  # skip graphs without a usable vertexSet
        token_ids = embedding_utils.get_idx_sequence(g["tokens"], word2idx)
        if len(token_ids) > max_sent_len:
            token_ids = token_ids[:max_sent_len]
        sentences_matrix[index, :len(token_ids)] = token_ids
        entity_pair_instance = []
        for j, edge in enumerate(g["edgeSet"][:MAX_EDGES_PER_GRAPH]):
            new_j = calculate_order_conversion(j, len(g["vertexSet"]))
            entity_matrix[index, new_j, :len(token_ids)] = \
                [m for _, m in graph_utils.get_entity_indexed_vector(
                    token_ids, edge, mode="mark-bi")]
            _, property_kbid, _ = graph_utils.edge_to_kb_ids(edge, g)
            property_kbid = property2idx.get(
                property_kbid, property2idx[embedding_utils.unknown])
            y_matrix[index, new_j] = property_kbid
            entity_pair_instance.append(
                (pos2id[tuple(edge['left'])], pos2id[tuple(edge['right'])]))
        entity_pair.append(entity_pair_instance)
    entity_cnt = np.array(entity_cnt, dtype=np.int32)
    return sentences_matrix, entity_matrix, y_matrix, entity_cnt, entity_pair
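# A small, self-contained illustration (toy values) of the pos2id lookup built
# above and reused below: each entity's token-position tuple becomes a key, so
# an edge's "left"/"right" spans can be resolved to the entities' KB ids.
def _pos2id_example():
    vertex_set = [{"tokenpositions": [0], "kbID": "Q64"},
                  {"tokenpositions": [5], "kbID": "Q183"}]
    pos2id = {tuple(v["tokenpositions"]): v["kbID"] for v in vertex_set}
    assert pos2id[(0,)] == "Q64" and pos2id[(5,)] == "Q183"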
def to_indices_and_entity_pair(graphs, word2idx, property2idx, max_sent_len,
                               replace_entities_with_unkown=False, mode='train',
                               **kwargs):
    """
    Vectorize graphs into one row per non-blacklisted edge, returning the KB-id
    pair of each edge's entities alongside the matrices.

    :param graphs: list of graph dicts with "tokens", "edgeSet" and "vertexSet"
    :param word2idx: mapping from token to word-embedding index
    :param property2idx: mapping from property KB id to relation index
    :param max_sent_len: maximum sentence length in tokens
    :param replace_entities_with_unkown: if True, replace entity tokens with
        the unknown-word index
    :return: [sentences_matrix, entity_matrix, y_matrix, entity_pair]
    """
    num_edges = len([
        e for g in graphs for e in g['edgeSet']
        if e['kbID'] not in property_blacklist
    ])
    print("Dataset number of edges: {}".format(num_edges))
    sentences_matrix = np.zeros((num_edges, max_sent_len), dtype="int32")
    entity_matrix = np.zeros((num_edges, max_sent_len), dtype="int8")
    y_matrix = np.zeros(num_edges, dtype="int16")
    index = 0
    entity_cnt = []  # collected for inspection only; not returned
    pos2id = dict()
    entity_pair = []
    for g in tqdm.tqdm(graphs, ascii=True):
        token_ids = embedding_utils.get_idx_sequence(g["tokens"], word2idx)
        try:
            entity_cnt.append(len(g["vertexSet"]))
            for i in g['vertexSet']:
                pos2id[tuple(i['tokenpositions'])] = i['kbID']
        except (KeyError, TypeError):
            continue  # skip graphs without a usable vertexSet
        if len(token_ids) > max_sent_len:
            token_ids = token_ids[:max_sent_len]
        for edge in g["edgeSet"]:
            if edge['kbID'] not in property_blacklist:
                # Optionally mask the tokens inside either entity span with the
                # unknown-word index.
                sentences_matrix[index, :len(token_ids)] = (
                    [word2idx[embedding_utils.unknown]
                     if i in edge["left"] + edge["right"] else t
                     for i, t in enumerate(token_ids)]
                    if replace_entities_with_unkown else token_ids)
                entity_matrix[index, :len(token_ids)] = \
                    [m for _, m in graph_utils.get_entity_indexed_vector(
                        token_ids, edge, mode="mark-bi")]
                if mode == "train":
                    _, property_kbid, _ = graph_utils.edge_to_kb_ids(edge, g)
                    property_kbid = property2idx.get(
                        property_kbid, property2idx[embedding_utils.unknown])
                    y_matrix[index] = property_kbid
                entity_pair.append((pos2id[tuple(edge['left'])],
                                    pos2id[tuple(edge['right'])]))
                index += 1
    return [sentences_matrix, entity_matrix, y_matrix, entity_pair]
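# Toy illustration (hypothetical indices) of the entity-masking branch in
# to_indices_and_entity_pair: tokens whose position falls inside either entity
# span are replaced by the unknown-word index before vectorization.
def _entity_masking_example():
    token_ids = [10, 11, 12, 13, 14]
    left, right = [0], [3, 4]  # toy entity token positions
    unk = 0                    # stands in for word2idx[embedding_utils.unknown]
    masked = [unk if i in left + right else t for i, t in enumerate(token_ids)]
    assert masked == [0, 11, 12, 0, 0]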
def to_indices_with_relative_positions_and_pcnn_mask_and_entity_pair(
        graphs, word2idx, property2idx, max_sent_len, position2idx, **kwargs):
    """
    Vectorize graphs into one row per edge with relative-position features and
    a three-piece PCNN mask, returning the KB-id pair of each edge's entities.
    """
    num_edges = len([e for g in graphs for e in g['edgeSet']])
    sentences_matrix = np.zeros((num_edges, max_sent_len), dtype="int32")
    entity_matrix = np.zeros((num_edges, 2, max_sent_len), dtype="int8")
    pcnn_mask = np.zeros((num_edges, 3, max_sent_len), dtype="float32")
    y_matrix = np.zeros(num_edges, dtype="int16")
    index = 0
    max_entity_index = max_sent_len - 1
    entity_pair = []
    pos2id = dict()
    for g in tqdm.tqdm(graphs, ascii=True):
        try:
            for i in g['vertexSet']:
                pos2id[tuple(i['tokenpositions'])] = i['kbID']
        except (KeyError, TypeError):
            continue  # skip graphs without a usable vertexSet
        token_ids = embedding_utils.get_idx_sequence(g["tokens"], word2idx)
        if len(token_ids) > max_sent_len:
            token_ids = token_ids[:max_sent_len]
        entity_pair_instance = []
        for edge in g["edgeSet"]:
            sentences_matrix[index, :len(token_ids)] = token_ids
            _, property_kbid, _ = graph_utils.edge_to_kb_ids(edge, g)
            try:
                property_kbid = property2idx.get(
                    property_kbid, property2idx[embedding_utils.unknown])
            except KeyError:
                # Debug hook: only triggers if the unknown property itself is
                # missing from property2idx.
                pdb.set_trace()
            entity_vector = graph_utils.get_entity_indexed_vector(
                token_ids, edge, mode="position")
            # Clamp relative positions into [-max_entity_index, max_entity_index]
            # so every offset has an entry in position2idx.
            entity_vector = [
                (max(-max_entity_index, min(max_entity_index, m1)),
                 max(-max_entity_index, min(max_entity_index, m2)))
                for _, m1, m2 in entity_vector
            ]
            entity_matrix[index, :, :len(token_ids)] = [
                [position2idx[m] for m, _ in entity_vector],
                [position2idx[m] for _, m in entity_vector],
            ]
            (pcnn_mask[index, 0, :len(token_ids)],
             pcnn_mask[index, 1, :len(token_ids)],
             pcnn_mask[index, 2, :len(token_ids)]) = \
                graph_utils.get_pcnn_mask(token_ids, edge)
            y_matrix[index] = property_kbid
            index += 1
            entity_pair_instance.append(
                (pos2id[tuple(edge['left'])], pos2id[tuple(edge['right'])]))
        entity_pair += entity_pair_instance
    return [sentences_matrix, entity_matrix, y_matrix, pcnn_mask, entity_pair]
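# Toy illustration of the relative-position clipping used above and in
# to_indices_with_relative_positions below: offsets are clamped into
# [-(max_sent_len - 1), max_sent_len - 1] before the position2idx lookup.
def _position_clipping_example():
    max_entity_index = 9  # toy value, i.e. max_sent_len - 1 for max_sent_len=10
    offsets = [-15, -3, 0, 4, 12]
    clamped = [max(-max_entity_index, min(max_entity_index, m)) for m in offsets]
    assert clamped == [-9, -3, 0, 4, 9]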
def to_indices_with_real_entities_completely(graphs, word2idx, property2idx,
                                             max_sent_len, mode='train',
                                             **kwargs):
    """
    Vectorize graphs into per-graph matrices, adding N/A relations for all
    entity pairs that have no relation in the dataset. Graphs with more than
    MAX_EDGES_PER_GRAPH edges are split into chunks.

    :param graphs: list of graph dicts with "tokens" and "edgeSet"
    :param word2idx: mapping from token to word-embedding index
    :param property2idx: mapping from property KB id to relation index
    :param max_sent_len: maximum sentence length in tokens
    :return: sentences_matrix, entity_matrix, y_matrix
    """
    graphs_to_process = []
    for g in graphs:
        if len(g['edgeSet']) > 0:
            if len(g['edgeSet']) <= MAX_EDGES_PER_GRAPH:
                graphs_to_process.append(g)
            else:
                # Split oversized edge sets into MAX_EDGES_PER_GRAPH-sized
                # chunks that share the same token sequence.
                for i in range(0, len(g['edgeSet']), MAX_EDGES_PER_GRAPH):
                    graphs_to_process.append({
                        "tokens": g["tokens"],
                        "edgeSet": g["edgeSet"][i:i + MAX_EDGES_PER_GRAPH]
                    })
    graphs = graphs_to_process
    sentences_matrix = np.zeros((len(graphs), max_sent_len), dtype="int32")
    entity_matrix = np.zeros((len(graphs), MAX_EDGES_PER_GRAPH, max_sent_len),
                             dtype="int8")
    y_matrix = np.zeros((len(graphs), MAX_EDGES_PER_GRAPH), dtype="int16")
    for index, g in enumerate(tqdm.tqdm(graphs, ascii=True)):
        token_ids = embedding_utils.get_idx_sequence(g["tokens"], word2idx)
        if len(token_ids) > max_sent_len:
            token_ids = token_ids[:max_sent_len]
        sentences_matrix[index, :len(token_ids)] = token_ids
        for j, edge in enumerate(g["edgeSet"][:MAX_EDGES_PER_GRAPH]):
            entity_matrix[index, j, :len(token_ids)] = \
                [m for _, m in graph_utils.get_entity_indexed_vector(
                    token_ids, edge, mode="mark-bi")]
            _, property_kbid, _ = graph_utils.edge_to_kb_ids(edge, g)
            property_kbid = property2idx.get(
                property_kbid, property2idx[embedding_utils.unknown])
            y_matrix[index, j] = property_kbid
    return sentences_matrix, entity_matrix, y_matrix
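# Toy illustration of the edgeSet chunking performed above: an oversized edge
# list is split into MAX_EDGES_PER_GRAPH-sized slices, each paired with the
# same token sequence.
def _edge_chunking_example():
    max_edges = 3  # stands in for the module-level MAX_EDGES_PER_GRAPH
    edge_set = list(range(7))
    chunks = [edge_set[i:i + max_edges]
              for i in range(0, len(edge_set), max_edges)]
    assert chunks == [[0, 1, 2], [3, 4, 5], [6]]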
def to_indices_with_relative_positions(graphs, word2idx, property2idx,
                                       max_sent_len, position2idx, **kwargs):
    """
    Vectorize graphs into one row per edge, encoding each token's clipped
    relative position to the two entities via position2idx.
    """
    num_edges = len([e for g in graphs for e in g['edgeSet']])
    sentences_matrix = np.zeros((num_edges, max_sent_len), dtype="int32")
    entity_matrix = np.zeros((num_edges, 2, max_sent_len), dtype="int8")
    y_matrix = np.zeros(num_edges, dtype="int16")
    index = 0
    max_entity_index = max_sent_len - 1
    for g in tqdm.tqdm(graphs, ascii=True):
        token_ids = embedding_utils.get_idx_sequence(g["tokens"], word2idx)
        if len(token_ids) > max_sent_len:
            token_ids = token_ids[:max_sent_len]
        for edge in g["edgeSet"]:
            sentences_matrix[index, :len(token_ids)] = token_ids
            _, property_kbid, _ = graph_utils.edge_to_kb_ids(edge, g)
            try:
                property_kbid = property2idx.get(
                    property_kbid, property2idx[embedding_utils.unknown])
            except KeyError:
                # Debug hook: only triggers if the unknown property itself is
                # missing from property2idx.
                pdb.set_trace()
            entity_vector = graph_utils.get_entity_indexed_vector(
                token_ids, edge, mode="position")
            # Clamp relative positions into [-max_entity_index, max_entity_index]
            # so every offset has an entry in position2idx.
            entity_vector = [
                (max(-max_entity_index, min(max_entity_index, m1)),
                 max(-max_entity_index, min(max_entity_index, m2)))
                for _, m1, m2 in entity_vector
            ]
            entity_matrix[index, :, :len(token_ids)] = [
                [position2idx[m] for m, _ in entity_vector],
                [position2idx[m] for _, m in entity_vector],
            ]
            y_matrix[index] = property_kbid
            index += 1
    return [sentences_matrix, entity_matrix, y_matrix]
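# A minimal usage sketch, assuming the project's embedding_utils/graph_utils
# are importable and behave as the functions above expect. The toy graph,
# vocabularies and position2idx construction here are illustrative assumptions
# that only mirror the fields these functions read ("tokens", "vertexSet",
# "edgeSet" with "left"/"right"/"kbID"); real data comes from the dataset
# loaders elsewhere in the project.
def _usage_sketch():
    max_sent_len = 10
    # position2idx must cover the clipped offset range used above.
    position2idx = {p: i for i, p in enumerate(
        range(-(max_sent_len - 1), max_sent_len))}
    word2idx = {embedding_utils.unknown: 0, "Berlin": 1, "is": 2, "the": 3,
                "capital": 4, "of": 5, "Germany": 6, ".": 7}
    property2idx = {embedding_utils.unknown: 0, "P1376": 1}
    toy_graph = {
        "tokens": ["Berlin", "is", "the", "capital", "of", "Germany", "."],
        "vertexSet": [{"tokenpositions": [0], "kbID": "Q64"},
                      {"tokenpositions": [5], "kbID": "Q183"}],
        "edgeSet": [{"left": [0], "right": [5], "kbID": "P1376"}],
    }
    sent_m, ent_m, y_m = to_indices_with_relative_positions(
        [toy_graph], word2idx, property2idx, max_sent_len, position2idx)
    print(sent_m.shape, ent_m.shape, y_m.shape)  # (1, 10) (1, 2, 10) (1,)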