Beispiel #1
0
def to_indices_with_real_entities_and_entity_nums_with_vertex_padding_and_entity_pair(
        graphs, word2idx, property2idx, max_sent_len, mode='train', **kwargs):
    """
    Encode graphs as index matrices (one row per graph) and collect the
    knowledge-base ids of the two entities of every edge.

    Graphs with no edges, or with more than ``MAX_EDGES_PER_GRAPH`` edges,
    are discarded before encoding.

    :param graphs: iterable of graph dicts; this function reads the keys
        "tokens", "edgeSet" and "vertexSet" (each vertex provides
        "tokenpositions" and "kbID", each edge provides "left" and "right")
    :param word2idx: vocabulary mapping handed to
        ``embedding_utils.get_idx_sequence``
    :param property2idx: mapping from property kbID to label index; must
        contain ``embedding_utils.unknown``, which is the fallback label
    :param max_sent_len: number of columns of the sentence matrix; longer
        token sequences are truncated
    :param mode: accepted for signature compatibility, not used here
    :param kwargs: ignored
    :return: tuple ``(sentences_matrix, entity_matrix, y_matrix, entity_cnt,
        entity_pair)`` where the matrices have one row per kept graph,
        ``entity_cnt`` is an int32 array of vertex counts and ``entity_pair``
        holds, per encoded graph, a list of ``(left kbID, right kbID)`` tuples
    :raises KeyError: if an edge refers to token positions that were never
        registered by any vertex, or if ``property2idx`` lacks the unknown key
    """
    # Over-long graphs are dropped rather than split into chunks.
    graphs = [
        g for g in graphs
        if 0 < len(g['edgeSet']) <= MAX_EDGES_PER_GRAPH
    ]
    sentences_matrix = np.zeros((len(graphs), max_sent_len), dtype="int32")
    entity_matrix = np.zeros((len(graphs), MAX_EDGES_PER_GRAPH, max_sent_len),
                             dtype="int8")
    y_matrix = np.zeros((len(graphs), MAX_EDGES_PER_GRAPH), dtype="int16")
    entity_cnt = []
    # Maps a vertex's token positions to its kbID. The mapping is shared by
    # all graphs, so entries from earlier graphs stay visible.
    # NOTE(review): confirm that sharing across graphs is intended.
    pos2id = dict()
    entity_pair = []
    for index, g in enumerate(tqdm.tqdm(graphs, ascii=True)):
        try:
            entity_cnt.append(len(g["vertexSet"]))
            for vertex in g['vertexSet']:
                pos2id[tuple(vertex['tokenpositions'])] = vertex['kbID']
        except (KeyError, TypeError):
            # Graph without a usable vertex set: its matrix rows stay zero
            # and it adds nothing to entity_pair, so entity_cnt/entity_pair
            # can end up shorter than the matrices.
            continue
        token_ids = embedding_utils.get_idx_sequence(g["tokens"], word2idx)
        if len(token_ids) > max_sent_len:
            token_ids = token_ids[:max_sent_len]
        sentences_matrix[index, :len(token_ids)] = token_ids
        entity_pair_instance = []
        for j, edge in enumerate(g["edgeSet"][:MAX_EDGES_PER_GRAPH]):
            # Slot of this edge inside the padded per-graph layout, as decided
            # by calculate_order_conversion (defined elsewhere).
            new_j = calculate_order_conversion(j,
                                               len(g["vertexSet"]),
                                               PAD_ENT_CNT=MAX_NUM_NODES)
            entity_matrix[index, new_j, :len(token_ids)] = [
                marker for _, marker in graph_utils.get_entity_indexed_vector(
                    token_ids, edge, mode="mark-bi")
            ]
            _, property_kbid, _ = graph_utils.edge_to_kb_ids(edge, g)
            # Unseen properties fall back to the "unknown" label.
            y_matrix[index, new_j] = property2idx.get(
                property_kbid, property2idx[embedding_utils.unknown])
            entity_pair_instance.append(
                (pos2id[tuple(edge['left'])], pos2id[tuple(edge['right'])]))
        entity_pair.append(entity_pair_instance)
    entity_cnt = np.array(entity_cnt, dtype=np.int32)

    return sentences_matrix, entity_matrix, y_matrix, entity_cnt, entity_pair
Beispiel #2
0
def to_indices_with_real_entities_and_entity_nums(graphs,
                                                  word2idx,
                                                  property2idx,
                                                  max_sent_len,
                                                  mode='train',
                                                  **kwargs):
    """
    Encode graphs as index matrices (one row per graph) together with the
    number of vertices of each graph.

    Graphs with no edges, or with more than ``MAX_EDGES_PER_GRAPH`` edges,
    are discarded before encoding.

    :param graphs: iterable of graph dicts; this function reads the keys
        "tokens", "edgeSet" and "vertexSet"
    :param word2idx: vocabulary mapping handed to
        ``embedding_utils.get_idx_sequence``
    :param property2idx: mapping from property kbID to label index; must
        contain ``embedding_utils.unknown``, which is the fallback label
    :param max_sent_len: number of columns of the sentence matrix; longer
        token sequences are truncated
    :param mode: accepted for signature compatibility, not used here
    :param kwargs: ignored
    :return: tuple ``(sentences_matrix, entity_matrix, y_matrix, entity_cnt)``
        where the matrices have one row per kept graph and ``entity_cnt`` is
        an int32 array with the vertex count of each encoded graph
    :raises KeyError: if ``property2idx`` lacks the unknown key
    """
    # Over-long graphs are dropped rather than split into chunks.
    graphs = [
        g for g in graphs
        if 0 < len(g['edgeSet']) <= MAX_EDGES_PER_GRAPH
    ]
    sentences_matrix = np.zeros((len(graphs), max_sent_len), dtype="int32")
    entity_matrix = np.zeros((len(graphs), MAX_EDGES_PER_GRAPH, max_sent_len),
                             dtype="int8")
    y_matrix = np.zeros((len(graphs), MAX_EDGES_PER_GRAPH), dtype="int16")
    entity_cnt = []
    for index, g in enumerate(tqdm.tqdm(graphs, ascii=True)):
        try:
            entity_cnt.append(len(g["vertexSet"]))
        except (KeyError, TypeError):
            # Graph without a usable vertex set: its matrix rows stay zero
            # and entity_cnt ends up shorter than the matrices.
            continue
        token_ids = embedding_utils.get_idx_sequence(g["tokens"], word2idx)
        if len(token_ids) > max_sent_len:
            token_ids = token_ids[:max_sent_len]
        sentences_matrix[index, :len(token_ids)] = token_ids
        for j, edge in enumerate(g["edgeSet"][:MAX_EDGES_PER_GRAPH]):
            entity_matrix[index, j, :len(token_ids)] = [
                marker for _, marker in graph_utils.get_entity_indexed_vector(
                    token_ids, edge, mode="mark-bi")
            ]
            _, property_kbid, _ = graph_utils.edge_to_kb_ids(edge, g)
            # Unseen properties fall back to the "unknown" label.
            y_matrix[index, j] = property2idx.get(
                property_kbid, property2idx[embedding_utils.unknown])
    entity_cnt = np.array(entity_cnt, dtype=np.int32)

    return sentences_matrix, entity_matrix, y_matrix, entity_cnt
Beispiel #3
0
def to_indices_and_entity_pair(graphs,
                               word2idx,
                               property2idx,
                               max_sent_len,
                               replace_entities_with_unkown=False,
                               mode='train',
                               **kwargs):
    """
    Encode every non-blacklisted edge as its own row and collect the
    knowledge-base ids of the edge's two entities.

    :param graphs: re-iterable collection of graph dicts (it is traversed
        twice); this function reads the keys "tokens", "edgeSet" and
        "vertexSet" (each vertex provides "tokenpositions" and "kbID", each
        edge provides "kbID", "left" and "right")
    :param word2idx: vocabulary mapping handed to
        ``embedding_utils.get_idx_sequence``
    :param property2idx: mapping from property kbID to label index; must
        contain ``embedding_utils.unknown`` when ``mode == "train"``
    :param max_sent_len: number of columns of the matrices; longer token
        sequences are truncated
    :param replace_entities_with_unkown: if true, tokens at the positions of
        the edge's left/right entity are replaced by the unknown-word index
    :param mode: labels are written to ``y_matrix`` only when this is
        ``"train"``
    :param kwargs: ignored
    :return: list ``[sentences_matrix, entity_matrix, y_matrix, entity_pair]``;
        the arrays have one row per non-blacklisted edge in ``graphs`` and
        ``entity_pair`` is a flat list of ``(left kbID, right kbID)`` tuples
    :raises KeyError: if an edge refers to token positions that were never
        registered by any vertex
    """
    num_edges = sum(1 for g in graphs for e in g['edgeSet']
                    if e['kbID'] not in property_blacklist)
    print("Dataset number of edges: {}".format(num_edges))
    sentences_matrix = np.zeros((num_edges, max_sent_len), dtype="int32")
    entity_matrix = np.zeros((num_edges, max_sent_len), dtype="int8")
    y_matrix = np.zeros(num_edges, dtype="int16")
    index = 0
    # Maps a vertex's token positions to its kbID. The mapping is shared by
    # all graphs, so entries from earlier graphs stay visible.
    # NOTE(review): confirm that sharing across graphs is intended.
    pos2id = dict()
    entity_pair = []
    for g in tqdm.tqdm(graphs, ascii=True):
        token_ids = embedding_utils.get_idx_sequence(g["tokens"], word2idx)
        try:
            for vertex in g['vertexSet']:
                pos2id[tuple(vertex['tokenpositions'])] = vertex['kbID']
        except (KeyError, TypeError):
            # Graph without a usable vertex set: its edges are skipped, which
            # leaves that many all-zero rows at the end of the matrices.
            continue
        if len(token_ids) > max_sent_len:
            token_ids = token_ids[:max_sent_len]
        for edge in g["edgeSet"]:
            if edge['kbID'] in property_blacklist:
                continue
            if replace_entities_with_unkown:
                # Concatenate once per edge instead of once per token.
                entity_positions = edge["left"] + edge["right"]
                sentence_row = [
                    word2idx[embedding_utils.unknown]
                    if i in entity_positions else t
                    for i, t in enumerate(token_ids)
                ]
            else:
                sentence_row = token_ids
            sentences_matrix[index, :len(token_ids)] = sentence_row
            entity_matrix[index, :len(token_ids)] = [
                marker for _, marker in graph_utils.get_entity_indexed_vector(
                    token_ids, edge, mode="mark-bi")
            ]
            if mode == "train":
                _, property_kbid, _ = graph_utils.edge_to_kb_ids(edge, g)
                # Unseen properties fall back to the "unknown" label.
                y_matrix[index] = property2idx.get(
                    property_kbid, property2idx[embedding_utils.unknown])
            entity_pair.append((pos2id[tuple(edge['left'])],
                                pos2id[tuple(edge['right'])]))
            index += 1
    return [sentences_matrix, entity_matrix, y_matrix, entity_pair]
Beispiel #4
0
def to_indices_with_relative_positions_and_pcnn_mask_and_entity_pair(
        graphs, word2idx, property2idx, max_sent_len, position2idx, **kwargs):
    """
    Encode every edge as its own row with relative-position features, a
    three-channel PCNN mask and the knowledge-base ids of its two entities.

    :param graphs: re-iterable collection of graph dicts (it is traversed
        twice); this function reads the keys "tokens", "edgeSet" and
        "vertexSet" (each vertex provides "tokenpositions" and "kbID", each
        edge provides "left" and "right")
    :param word2idx: vocabulary mapping handed to
        ``embedding_utils.get_idx_sequence``
    :param property2idx: mapping from property kbID to label index; must
        contain ``embedding_utils.unknown``, which is the fallback label
    :param max_sent_len: number of columns of the matrices; longer token
        sequences are truncated and relative positions are clipped to
        ``[-(max_sent_len - 1), max_sent_len - 1]``
    :param position2idx: mapping from clipped relative position to index
    :param kwargs: ignored
    :return: list ``[sentences_matrix, entity_matrix, y_matrix, pcnn_mask,
        entity_pair]``; the arrays have one row per edge in ``graphs`` and
        ``entity_pair`` is a flat list of ``(left kbID, right kbID)`` tuples
    :raises KeyError: if ``property2idx`` lacks the unknown key (previously
        this dropped into ``pdb``), if a position is missing from
        ``position2idx``, or if an edge refers to token positions that were
        never registered by any vertex
    """
    num_edges = sum(len(g['edgeSet']) for g in graphs)
    sentences_matrix = np.zeros((num_edges, max_sent_len), dtype="int32")
    entity_matrix = np.zeros((num_edges, 2, max_sent_len), dtype="int8")
    pcnn_mask = np.zeros((num_edges, 3, max_sent_len), dtype="float32")
    y_matrix = np.zeros(num_edges, dtype="int16")
    index = 0
    max_entity_index = max_sent_len - 1

    def clip(offset):
        # Keep a relative position inside the representable range.
        if offset < -max_entity_index:
            return -max_entity_index
        if offset > max_entity_index:
            return max_entity_index
        return offset

    entity_pair = []
    # Maps a vertex's token positions to its kbID. The mapping is shared by
    # all graphs, so entries from earlier graphs stay visible.
    # NOTE(review): confirm that sharing across graphs is intended.
    pos2id = dict()
    for g in tqdm.tqdm(graphs, ascii=True):
        try:
            for vertex in g['vertexSet']:
                pos2id[tuple(vertex['tokenpositions'])] = vertex['kbID']
        except (KeyError, TypeError):
            # Graph without a usable vertex set: its edges are skipped, which
            # leaves that many all-zero rows at the end of the matrices.
            continue
        token_ids = embedding_utils.get_idx_sequence(g["tokens"], word2idx)
        if len(token_ids) > max_sent_len:
            token_ids = token_ids[:max_sent_len]
        for edge in g["edgeSet"]:
            sentences_matrix[index, :len(token_ids)] = token_ids
            _, property_kbid, _ = graph_utils.edge_to_kb_ids(edge, g)
            # Unseen properties fall back to the "unknown" label. A failure
            # here is a configuration error and is allowed to propagate
            # instead of opening an interactive debugger.
            property_kbid = property2idx.get(
                property_kbid, property2idx[embedding_utils.unknown])
            entity_vector = [
                (clip(m1), clip(m2))
                for _, m1, m2 in graph_utils.get_entity_indexed_vector(
                    token_ids, edge, mode="position")
            ]
            # Row 0: positions relative to the first entity, row 1: relative
            # to the second one (order as produced by graph_utils).
            entity_matrix[index, :, :len(token_ids)] = [
                [position2idx[m] for m, _ in entity_vector],
                [position2idx[m] for _, m in entity_vector],
            ]
            mask_0, mask_1, mask_2 = graph_utils.get_pcnn_mask(token_ids, edge)
            pcnn_mask[index, 0, :len(token_ids)] = mask_0
            pcnn_mask[index, 1, :len(token_ids)] = mask_1
            pcnn_mask[index, 2, :len(token_ids)] = mask_2
            y_matrix[index] = property_kbid
            index += 1
            entity_pair.append(
                (pos2id[tuple(edge['left'])], pos2id[tuple(edge['right'])]))
    return [sentences_matrix, entity_matrix, y_matrix, pcnn_mask, entity_pair]
Beispiel #5
0
def to_indices_with_real_entities_completely(graphs,
                                             word2idx,
                                             property2idx,
                                             max_sent_len,
                                             mode='train',
                                             **kwargs):
    """
    Encode graphs as index matrices, one row per graph, splitting graphs that
    have more than ``MAX_EDGES_PER_GRAPH`` edges into several rows.

    The original author describes this variant as the one that covers entity
    pairs without a relation (N/A). No such edges are added inside this
    function, so presumably the input already contains them - TODO confirm.

    :param graphs: iterable of graph dicts with the keys "tokens" and
        "edgeSet"; graphs without edges are dropped
    :param word2idx: vocabulary mapping handed to
        ``embedding_utils.get_idx_sequence``
    :param property2idx: mapping from property kbID to label index; must
        contain ``embedding_utils.unknown``, which is the fallback label
    :param max_sent_len: number of columns of the sentence matrix; longer
        token sequences are truncated
    :param mode: accepted for signature compatibility, not used here
    :param kwargs: ignored
    :return: tuple ``(sentences_matrix, entity_matrix, y_matrix)``
    """
    chunked = []
    for graph in graphs:
        edges = graph['edgeSet']
        edge_total = len(edges)
        if edge_total == 0:
            continue
        if edge_total <= MAX_EDGES_PER_GRAPH:
            chunked.append(graph)
            continue
        # Too many edges for one row: emit one reduced graph per chunk. The
        # chunks carry only the tokens and their slice of the edges.
        chunked.extend({
            "tokens": graph["tokens"],
            "edgeSet": edges[start:start + MAX_EDGES_PER_GRAPH]
        } for start in range(0, edge_total, MAX_EDGES_PER_GRAPH))

    row_count = len(chunked)
    sentences_matrix = np.zeros((row_count, max_sent_len), dtype="int32")
    entity_matrix = np.zeros((row_count, MAX_EDGES_PER_GRAPH, max_sent_len),
                             dtype="int8")
    y_matrix = np.zeros((row_count, MAX_EDGES_PER_GRAPH), dtype="int16")

    for row, graph in enumerate(tqdm.tqdm(chunked, ascii=True)):
        token_ids = embedding_utils.get_idx_sequence(graph["tokens"], word2idx)
        if len(token_ids) > max_sent_len:
            token_ids = token_ids[:max_sent_len]
        width = len(token_ids)
        sentences_matrix[row, :width] = token_ids
        for col, edge in enumerate(graph["edgeSet"][:MAX_EDGES_PER_GRAPH]):
            marked = graph_utils.get_entity_indexed_vector(token_ids,
                                                           edge,
                                                           mode="mark-bi")
            entity_matrix[row, col, :width] = [marker for _, marker in marked]
            _, property_kbid, _ = graph_utils.edge_to_kb_ids(edge, graph)
            # Unseen properties fall back to the "unknown" label.
            y_matrix[row, col] = property2idx.get(
                property_kbid, property2idx[embedding_utils.unknown])
    return sentences_matrix, entity_matrix, y_matrix
Beispiel #6
0
def to_indices_with_relative_positions(graphs, word2idx, property2idx,
                                       max_sent_len, position2idx, **kwargs):
    """
    Encode every edge as its own row with relative-position features.

    :param graphs: re-iterable collection of graph dicts (it is traversed
        twice); this function reads the keys "tokens" and "edgeSet"
    :param word2idx: vocabulary mapping handed to
        ``embedding_utils.get_idx_sequence``
    :param property2idx: mapping from property kbID to label index; must
        contain ``embedding_utils.unknown``, which is the fallback label
    :param max_sent_len: number of columns of the matrices; longer token
        sequences are truncated and relative positions are clipped to
        ``[-(max_sent_len - 1), max_sent_len - 1]``
    :param position2idx: mapping from clipped relative position to index
    :param kwargs: ignored
    :return: list ``[sentences_matrix, entity_matrix, y_matrix]`` with one
        row per edge in ``graphs``
    :raises KeyError: if ``property2idx`` lacks the unknown key (previously
        this dropped into ``pdb``) or a position is missing from
        ``position2idx``
    """
    num_edges = sum(len(g['edgeSet']) for g in graphs)
    sentences_matrix = np.zeros((num_edges, max_sent_len), dtype="int32")
    entity_matrix = np.zeros((num_edges, 2, max_sent_len), dtype="int8")
    y_matrix = np.zeros(num_edges, dtype="int16")
    index = 0
    max_entity_index = max_sent_len - 1

    def clip(offset):
        # Keep a relative position inside the representable range.
        if offset < -max_entity_index:
            return -max_entity_index
        if offset > max_entity_index:
            return max_entity_index
        return offset

    for g in tqdm.tqdm(graphs, ascii=True):
        token_ids = embedding_utils.get_idx_sequence(g["tokens"], word2idx)
        if len(token_ids) > max_sent_len:
            token_ids = token_ids[:max_sent_len]
        for edge in g["edgeSet"]:
            sentences_matrix[index, :len(token_ids)] = token_ids
            _, property_kbid, _ = graph_utils.edge_to_kb_ids(edge, g)
            # Unseen properties fall back to the "unknown" label. A failure
            # here is a configuration error and is allowed to propagate
            # instead of opening an interactive debugger.
            property_kbid = property2idx.get(
                property_kbid, property2idx[embedding_utils.unknown])
            entity_vector = [
                (clip(m1), clip(m2))
                for _, m1, m2 in graph_utils.get_entity_indexed_vector(
                    token_ids, edge, mode="position")
            ]
            # Row 0: positions relative to the first entity, row 1: relative
            # to the second one (order as produced by graph_utils).
            entity_matrix[index, :, :len(token_ids)] = [
                [position2idx[m] for m, _ in entity_vector],
                [position2idx[m] for _, m in entity_vector],
            ]

            y_matrix[index] = property_kbid
            index += 1
    return [sentences_matrix, entity_matrix, y_matrix]
Beispiel #7
0
def to_indices_with_real_entities_and_entity_nums_with_vertex_padding_and_negative_sampling(
        graphs,
        word2idx,
        property2idx,
        max_sent_len,
        idx2property,
        mode='train',
        **kwargs):
    """
    Encode graphs as index matrices (one row per graph) and additionally
    return, per edge slot, the entity indices, entity surface forms, raw
    relation ids and knowledge-base ids of the entity pair.

    Graphs with no edges, or with more than ``MAX_EDGES_PER_GRAPH`` edges,
    are discarded before encoding.

    :param graphs: iterable of graph dicts; this function reads the keys
        "tokens", "edgeSet" and "vertexSet" (each vertex provides
        "tokenpositions" and "kbID", each edge provides "left" and "right")
    :param word2idx: vocabulary mapping handed to
        ``embedding_utils.get_idx_sequence``
    :param property2idx: mapping from property kbID to label index; must
        contain ``embedding_utils.unknown``, which is the fallback label
    :param max_sent_len: number of columns of the sentence matrix; longer
        token sequences are truncated
    :param idx2property: indexable collection; element 0 is used as the
        default value of every slot of ``relations``
    :param mode: accepted for signature compatibility, not used here
    :param kwargs: must contain ``entity2idx``, which is forwarded to
        ``context_utils.get_entityIdx_from_tokens``
    :return: tuple ``(sentences_matrix, entity_matrix, y_matrix, entity_cnt,
        entity_indices, entity_surface_forms, relations, entity_pair)``
    :raises KeyError: if ``entity2idx`` is missing from ``kwargs``, if
        ``property2idx`` lacks the unknown key, or if an edge refers to token
        positions that were never registered by any vertex
    """
    # Over-long graphs are dropped rather than split into chunks.
    graphs = [
        g for g in graphs
        if 0 < len(g['edgeSet']) <= MAX_EDGES_PER_GRAPH
    ]
    sentences_matrix = np.zeros((len(graphs), max_sent_len), dtype="int32")
    entity_matrix = np.zeros((len(graphs), MAX_EDGES_PER_GRAPH, max_sent_len),
                             dtype="int8")
    y_matrix = np.zeros((len(graphs), MAX_EDGES_PER_GRAPH), dtype="int16")
    entity_cnt = []
    # -1 marks edge slots that were never filled.
    entity_indices = -1 * np.ones(
        (len(graphs), MAX_EDGES_PER_GRAPH, 2), dtype="int32")
    entity_surface_forms = np.empty((len(graphs), MAX_EDGES_PER_GRAPH, 2),
                                    dtype=object)
    # Placeholder value for edge slots that were never filled.
    entity_surface_forms[:, :, :] = [[[['ALL_ZEROS']]]]
    # Maps a vertex's token positions to its kbID. The mapping is shared by
    # all graphs, so entries from earlier graphs stay visible.
    # NOTE(review): confirm that sharing across graphs is intended.
    pos2id = dict()
    relations = np.empty((len(graphs), MAX_EDGES_PER_GRAPH), dtype=object)
    relations[:, :] = [[idx2property[0]]]
    entity_pair = []
    for index, g in enumerate(tqdm.tqdm(graphs, ascii=True)):
        try:
            entity_cnt.append(len(g["vertexSet"]))
            for vertex in g['vertexSet']:
                pos2id[tuple(vertex['tokenpositions'])] = vertex['kbID']
        except (KeyError, TypeError):
            # Graph without a usable vertex set: its rows keep their default
            # values and it adds nothing to entity_pair, so
            # entity_cnt/entity_pair can end up shorter than the matrices.
            continue
        vertex2qids = context_utils.get_vertex2Qid(g)
        token_ids = embedding_utils.get_idx_sequence(g["tokens"], word2idx)
        if len(token_ids) > max_sent_len:
            token_ids = token_ids[:max_sent_len]
        sentences_matrix[index, :len(token_ids)] = token_ids
        entity_pair_instance = []
        for j, edge in enumerate(g["edgeSet"][:MAX_EDGES_PER_GRAPH]):
            # Slot of this edge inside the padded per-graph layout, as decided
            # by calculate_order_conversion (defined elsewhere).
            new_j = calculate_order_conversion(j,
                                               len(g["vertexSet"]),
                                               PAD_ENT_CNT=MAX_NUM_NODES)
            entity_matrix[index, new_j, :len(token_ids)] = [
                marker for _, marker in graph_utils.get_entity_indexed_vector(
                    token_ids, edge, mode="mark-bi")
            ]
            _, property_kbid, _ = graph_utils.edge_to_kb_ids(edge, g)
            # Keep the raw property id before it is mapped to a label index.
            relations[index, new_j] = property_kbid
            # Unseen properties fall back to the "unknown" label.
            y_matrix[index, new_j] = property2idx.get(
                property_kbid, property2idx[embedding_utils.unknown])
            entity_indices[index, new_j,
                           0] = context_utils.get_entityIdx_from_tokens(
                               g['tokens'], edge['left'], vertex2qids,
                               kwargs['entity2idx'])
            entity_indices[index, new_j,
                           1] = context_utils.get_entityIdx_from_tokens(
                               g['tokens'], edge['right'], vertex2qids,
                               kwargs['entity2idx'])
            entity_surface_forms[index, new_j,
                                 0] = [g['tokens'][ti] for ti in edge['left']]
            entity_surface_forms[index, new_j, 1] = [
                g['tokens'][ti] for ti in edge['right']
            ]

            entity_pair_instance.append(
                (pos2id[tuple(edge['left'])], pos2id[tuple(edge['right'])]))
        entity_pair.append(entity_pair_instance)

    entity_cnt = np.array(entity_cnt, dtype=np.int32)

    return sentences_matrix, entity_matrix, y_matrix, entity_cnt, entity_indices, entity_surface_forms, relations, entity_pair