Exemple #1
0
def to_indices_with_real_entities_and_entity_nums_with_vertex_padding_and_entity_pair(
        graphs, word2idx, property2idx, max_sent_len, mode='train', **kwargs):
    """
    Convert sentence graphs into padded index matrices plus entity-pair ids.

    Graphs with no edges, or with more than ``MAX_EDGES_PER_GRAPH`` edges,
    are discarded before any matrix is allocated. A remaining graph whose
    ``'vertexSet'`` is missing or malformed is skipped as a whole.

    NOTE(review): a skipped graph keeps its all-zero row in the three
    matrices but adds no entry to ``entity_cnt`` / ``entity_pair``, so those
    two outputs can be shorter than the matrices. Confirm callers expect it.

    :param graphs: iterable of dicts, each read through its ``'edgeSet'``,
        ``'tokens'`` and ``'vertexSet'`` keys.
    :param word2idx: vocabulary passed to ``embedding_utils.get_idx_sequence``.
    :param property2idx: mapping from property id to label index; it must
        contain ``embedding_utils.unknown``, which is used as the fallback.
    :param max_sent_len: width of the token axis; longer sentences are
        truncated to this length.
    :param mode: accepted but not read by this function.
    :param kwargs: accepted but not read by this function.
    :return: tuple ``(sentences_matrix, entity_matrix, y_matrix, entity_cnt,
        entity_pair)`` where

        * ``sentences_matrix`` is int32, ``(n_graphs, max_sent_len)``;
        * ``entity_matrix`` is int8,
          ``(n_graphs, MAX_EDGES_PER_GRAPH, max_sent_len)``;
        * ``y_matrix`` is int16, ``(n_graphs, MAX_EDGES_PER_GRAPH)``;
        * ``entity_cnt`` is an int32 array of vertex counts, one per
          processed graph;
        * ``entity_pair`` is a list with, per processed graph, a list of
          ``(left kbID, right kbID)`` tuples, one per edge.
    """
    # Graphs without edges or with too many edges are discarded.
    graphs = [
        g for g in graphs
        if 0 < len(g['edgeSet']) <= MAX_EDGES_PER_GRAPH
    ]

    sentences_matrix = np.zeros((len(graphs), max_sent_len), dtype="int32")
    entity_matrix = np.zeros((len(graphs), MAX_EDGES_PER_GRAPH, max_sent_len),
                             dtype="int8")
    y_matrix = np.zeros((len(graphs), MAX_EDGES_PER_GRAPH), dtype="int16")
    entity_cnt = []
    # NOTE(review): this lookup is shared by all graphs, so token positions
    # registered by an earlier graph stay visible to later ones.
    pos2id = dict()
    entity_pair = []

    for index, g in enumerate(tqdm.tqdm(graphs, ascii=True)):
        # Validate the whole vertex set before recording anything, so that a
        # malformed graph cannot leave entity_cnt and entity_pair out of step.
        try:
            vertices = g["vertexSet"]
            vertex_count = len(vertices)
            graph_pos2id = {
                tuple(vertex['tokenpositions']): vertex['kbID']
                for vertex in vertices
            }
        except (KeyError, TypeError):
            continue
        entity_cnt.append(vertex_count)
        pos2id.update(graph_pos2id)

        token_ids = embedding_utils.get_idx_sequence(g["tokens"], word2idx)
        token_ids = token_ids[:max_sent_len]
        sentences_matrix[index, :len(token_ids)] = token_ids

        entity_pair_instance = []
        for j, edge in enumerate(g["edgeSet"][:MAX_EDGES_PER_GRAPH]):
            # Map the edge's running index to its slot in the padded layout.
            new_j = calculate_order_conversion(j,
                                               vertex_count,
                                               PAD_ENT_CNT=MAX_NUM_NODES)
            entity_matrix[index, new_j, :len(token_ids)] = [
                marker
                for _, marker in graph_utils.get_entity_indexed_vector(
                    token_ids, edge, mode="mark-bi")
            ]
            _, property_kbid, _ = graph_utils.edge_to_kb_ids(edge, g)
            # Properties missing from property2idx fall back to "unknown".
            y_matrix[index, new_j] = property2idx.get(
                property_kbid, property2idx[embedding_utils.unknown])
            entity_pair_instance.append(
                (pos2id[tuple(edge['left'])], pos2id[tuple(edge['right'])]))
        entity_pair.append(entity_pair_instance)

    entity_cnt = np.array(entity_cnt, dtype=np.int32)

    return sentences_matrix, entity_matrix, y_matrix, entity_cnt, entity_pair
Exemple #2
0
def to_indices_with_real_entities_and_entity_nums_with_vertex_padding_and_negative_sampling(
        graphs,
        word2idx,
        property2idx,
        max_sent_len,
        idx2property,
        mode='train',
        **kwargs):
    """
    Convert sentence graphs into padded index matrices plus per-edge entity
    indices, surface forms, relation ids and entity-pair ids.

    Graphs with no edges, or with more than ``MAX_EDGES_PER_GRAPH`` edges,
    are discarded before any matrix is allocated. A remaining graph whose
    ``'vertexSet'`` is missing or malformed is skipped as a whole.

    NOTE(review): a skipped graph keeps its default-filled row in every
    array but adds no entry to ``entity_cnt`` / ``entity_pair``, so those
    two outputs can be shorter than the arrays. Confirm callers expect it.

    :param graphs: iterable of dicts, each read through its ``'edgeSet'``,
        ``'tokens'`` and ``'vertexSet'`` keys.
    :param word2idx: vocabulary passed to ``embedding_utils.get_idx_sequence``.
    :param property2idx: mapping from property id to label index; it must
        contain ``embedding_utils.unknown``, which is used as the fallback.
    :param max_sent_len: width of the token axis; longer sentences are
        truncated to this length.
    :param idx2property: indexable whose element ``0`` is the default value
        for every slot of ``relations``.
    :param mode: accepted but not read by this function.
    :param kwargs: must provide ``'entity2idx'``, which is forwarded to
        ``context_utils.get_entityIdx_from_tokens``.
    :return: tuple ``(sentences_matrix, entity_matrix, y_matrix, entity_cnt,
        entity_indices, entity_surface_forms, relations, entity_pair)`` where

        * ``sentences_matrix`` is int32, ``(n_graphs, max_sent_len)``;
        * ``entity_matrix`` is int8,
          ``(n_graphs, MAX_EDGES_PER_GRAPH, max_sent_len)``;
        * ``y_matrix`` is int16, ``(n_graphs, MAX_EDGES_PER_GRAPH)``;
        * ``entity_cnt`` is an int32 array of vertex counts, one per
          processed graph;
        * ``entity_indices`` is int32,
          ``(n_graphs, MAX_EDGES_PER_GRAPH, 2)``, initialised to ``-1``;
        * ``entity_surface_forms`` is an object array of the same shape
          holding the token lists of each edge's left/right entity;
        * ``relations`` is an object array,
          ``(n_graphs, MAX_EDGES_PER_GRAPH)``, of property ids;
        * ``entity_pair`` is a list with, per processed graph, a list of
          ``(left kbID, right kbID)`` tuples, one per edge.
    """
    # Graphs without edges or with too many edges are discarded.
    graphs = [
        g for g in graphs
        if 0 < len(g['edgeSet']) <= MAX_EDGES_PER_GRAPH
    ]

    sentences_matrix = np.zeros((len(graphs), max_sent_len), dtype="int32")
    entity_matrix = np.zeros((len(graphs), MAX_EDGES_PER_GRAPH, max_sent_len),
                             dtype="int8")
    y_matrix = np.zeros((len(graphs), MAX_EDGES_PER_GRAPH), dtype="int16")
    entity_cnt = []
    entity_indices = -1 * np.ones(
        (len(graphs), MAX_EDGES_PER_GRAPH, 2), dtype="int32")
    entity_surface_forms = np.empty((len(graphs), MAX_EDGES_PER_GRAPH, 2),
                                    dtype=object)
    # Default filler for slots that never receive an edge.
    entity_surface_forms[:, :, :] = [[[['ALL_ZEROS']]]]
    # NOTE(review): this lookup is shared by all graphs, so token positions
    # registered by an earlier graph stay visible to later ones.
    pos2id = dict()
    relations = np.empty((len(graphs), MAX_EDGES_PER_GRAPH), dtype=object)
    relations[:, :] = [[idx2property[0]]]
    entity_pair = []

    for index, g in enumerate(tqdm.tqdm(graphs, ascii=True)):
        # Validate the whole vertex set before recording anything, so that a
        # malformed graph cannot leave entity_cnt and entity_pair out of step.
        try:
            vertices = g["vertexSet"]
            vertex_count = len(vertices)
            graph_pos2id = {
                tuple(vertex['tokenpositions']): vertex['kbID']
                for vertex in vertices
            }
        except (KeyError, TypeError):
            continue
        entity_cnt.append(vertex_count)
        pos2id.update(graph_pos2id)

        vertex2qids = context_utils.get_vertex2Qid(g)
        token_ids = embedding_utils.get_idx_sequence(g["tokens"], word2idx)
        token_ids = token_ids[:max_sent_len]
        sentences_matrix[index, :len(token_ids)] = token_ids

        tokens = g['tokens']
        entity_pair_instance = []
        for j, edge in enumerate(g["edgeSet"][:MAX_EDGES_PER_GRAPH]):
            # Map the edge's running index to its slot in the padded layout.
            new_j = calculate_order_conversion(j,
                                               vertex_count,
                                               PAD_ENT_CNT=MAX_NUM_NODES)
            entity_matrix[index, new_j, :len(token_ids)] = [
                marker
                for _, marker in graph_utils.get_entity_indexed_vector(
                    token_ids, edge, mode="mark-bi")
            ]
            _, property_kbid, _ = graph_utils.edge_to_kb_ids(edge, g)
            # Keep the raw property id before it is mapped to a label index.
            relations[index, new_j] = property_kbid
            # Properties missing from property2idx fall back to "unknown".
            y_matrix[index, new_j] = property2idx.get(
                property_kbid, property2idx[embedding_utils.unknown])

            entity_indices[index, new_j, 0] = \
                context_utils.get_entityIdx_from_tokens(
                    tokens, edge['left'], vertex2qids, kwargs['entity2idx'])
            entity_indices[index, new_j, 1] = \
                context_utils.get_entityIdx_from_tokens(
                    tokens, edge['right'], vertex2qids, kwargs['entity2idx'])

            entity_surface_forms[index, new_j, 0] = [
                tokens[ti] for ti in edge['left']
            ]
            entity_surface_forms[index, new_j, 1] = [
                tokens[ti] for ti in edge['right']
            ]

            entity_pair_instance.append(
                (pos2id[tuple(edge['left'])], pos2id[tuple(edge['right'])]))
        entity_pair.append(entity_pair_instance)

    entity_cnt = np.array(entity_cnt, dtype=np.int32)

    return sentences_matrix, entity_matrix, y_matrix, entity_cnt, entity_indices, entity_surface_forms, relations, entity_pair