Example #1
def load_w2v_map(w2v_path):
    """
    Load embeddings stored in word2vec text format.
    :param w2v_path: path to a word2vec text file; the first line holds the
        vector count and dimensionality, each following line holds a word and
        its vector components
    :return: Embedder built from the loaded vocabulary and vectors
    """
    from SourceCodeTools.models.Embedder import Embedder
    import numpy as np

    embs = []
    w_map = dict()

    with open(w2v_path) as w2v:
        # header line: number of vectors and number of dimensions
        n_vectors, n_dims = map(int, w2v.readline().strip().split())
        for ind in range(n_vectors):
            line = w2v.readline()
            e = line.rstrip("\n").split(" ")

            # first field is the word, the rest are the vector components
            word = e[0]
            w_map[word] = len(w_map)

            assert len(e[1:]) == n_dims
            embs.append(list(map(float, e[1:])))

    return Embedder(w_map, np.array(embs))
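The expected input is the plain word2vec text format: a header line with the vector count and dimensionality, followed by one word and its vector components per line. A minimal usage sketch, with a made-up file name and toy values:

# Toy round trip: write a tiny word2vec-format file, then load it back
# with the function above. File name and values are made up for illustration.
with open("toy_w2v.txt", "w") as out:
    out.write("2 3\n")              # 2 vectors, 3 dimensions
    out.write("foo 0.1 0.2 0.3\n")
    out.write("bar 0.4 0.5 0.6\n")

embedder = load_w2v_map("toy_w2v.txt")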
Example #2
    def get_embeddings(self):
        # self.graph_model.g.nodes["function"].data.keys()
        nodes = self.graph_model.g.nodes
        node_embs = {
            ntype: self.node_embedder(node_type=ntype,
                                      node_ids=nodes[ntype].data['typed_id'],
                                      train_embeddings=False)
            for ntype in self.graph_model.g.ntypes
        }

        h = self.graph_model.inference(batch_size=256,
                                       device='cpu',
                                       num_workers=0,
                                       x=node_embs)

        original_id = []
        global_id = []
        embeddings = []
        for ntype in self.graph_model.g.ntypes:
            embeddings.append(h[ntype])
            original_id.extend(nodes[ntype].data['original_id'].tolist())
            global_id.extend(nodes[ntype].data['global_graph_id'].tolist())

        embeddings = torch.cat(embeddings, dim=0).detach().numpy()

        return [Embedder(dict(zip(original_id, global_id)), embeddings)]
Example #3
    def train_model(self):
        # graph_emb = load_pkl_emb(self.graph_emb_path) if self.graph_emb_path is not None else None

        typed_nodes = load_typed_nodes(self.type_ann_edges)

        decoder_mapping = RobertaTokenizer.from_pretrained(
            "microsoft/codebert-base").decoder
        tok_ids, words = zip(*decoder_mapping.items())
        vocab_mapping = dict(zip(words, tok_ids))
        batcher = self.get_batcher(self.train_data + self.test_data,
                                   self.batch_size,
                                   seq_len=self.seq_len,
                                   graphmap=None,
                                   wordmap=vocab_mapping,
                                   tagmap=None,
                                   class_weights=False,
                                   element_hash_size=1)

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = RobertaModel.from_pretrained("microsoft/codebert-base")
        model.to(device)

        node_ids = []
        embeddings = []

        for ind, batch in enumerate(tqdm(batcher)):
            # token_ids, graph_ids, labels, class_weights, lengths = b
            token_ids = torch.LongTensor(batch["tok_ids"])
            lens = torch.LongTensor(batch["lens"])

            token_ids[token_ids == len(vocab_mapping)] = vocab_mapping["<unk>"]

            def get_length_mask(target, lens):
                mask = torch.arange(target.size(1)).to(
                    target.device)[None, :] < lens[:, None]
                return mask

            mask = get_length_mask(token_ids, lens)
            with torch.no_grad():
                # move inputs to the same device as the model
                embs = model(input_ids=token_ids.to(device),
                             attention_mask=mask.to(device))

            for s_emb, s_repl in zip(embs.last_hidden_state,
                                     batch["replacements"]):
                unique_repls = set(list(s_repl))
                repls_for_ann = [r for r in unique_repls if r in typed_nodes]

                for r in repls_for_ann:
                    position = s_repl.index(r)
                    if position > 512:
                        continue
                    node_ids.append(r)
                    # keep the vector on CPU so it can be stacked and
                    # converted to numpy below
                    embeddings.append(s_emb[position].cpu())

        all_embs = torch.stack(embeddings, dim=0).numpy()
        embedder = Embedder(dict(zip(node_ids, range(len(node_ids)))),
                            all_embs)
        pickle.dump(embedder,
                    open("codebert_embeddings.pkl", "wb"),
                    fix_imports=False)
        print(node_ids)
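As an aside, the get_length_mask helper above builds a boolean padding mask by broadcasting a position index against the sequence lengths. A small self-contained sketch with made-up values:

import torch

def get_length_mask(target, lens):
    # same helper as above: True where a position is within the sequence length
    return torch.arange(target.size(1)).to(target.device)[None, :] < lens[:, None]

target = torch.zeros(2, 4)           # batch of 2 sequences padded to length 4
lens = torch.LongTensor([2, 4])      # true length of each sequence
print(get_length_mask(target, lens))
# tensor([[ True,  True, False, False],
#         [ True,  True,  True,  True]])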
Example #4
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("entities")
    parser.add_argument("vectors")
    parser.add_argument("output")

    args = parser.parse_args()

    entities = read_entities(args.entities)
    vectors = read_vectors(args.vectors)

    embedder = Embedder(dict(zip(entities, range(len(entities)))), vectors)
    with open(args.output, "wb") as sink:
        pickle.dump(embedder, sink)
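read_entities and read_vectors are defined elsewhere in the project and are not shown here. A rough sketch of what they might look like, under the assumption (mine, not the project's) that entities are listed one identifier per line and vectors are whitespace-separated floats, one row per entity:

import numpy as np

def read_entities(path):
    # hypothetical: one entity identifier per line
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]

def read_vectors(path):
    # hypothetical: one whitespace-separated float vector per line
    return np.loadtxt(path)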
Example #5
def load_w2v_map(w2v_path):
    # variant of the loader in Example #1; np and Embedder are assumed to be
    # imported at module level in the original source file
    embs = []
    w_map = dict()

    with open(w2v_path) as w2v:
        n_vectors, n_dims = map(int, w2v.readline().strip().split())
        for ind in range(n_vectors):
            e = w2v.readline().strip().split()

            word = e[0]
            w_map[word] = len(w_map)

            embs.append(list(map(float, e[1:])))

    return Embedder(w_map, np.array(embs))
Example #6
# splits = get_train_val_test_indices(nodes.index)
from SourceCodeTools.code.data.sourcetrail.sourcetrail_types import node_types
splits = SourceGraphDataset.get_train_val_test_indices(
    nodes.query(f"type_backup == '{node_types[4096]}'").index)

# nodes, edges, held = SourceGraphDataset.holdout(nodes, edges, 0.001)
# nodes['label'] = nodes['type']

# from SourceCodeTools.code.data.dataset.Dataset import create_train_val_test_masks
# def add_splits(nodes, splits):
#     nodes['train_mask'] = False
#     nodes.loc[nodes.index[splits[0]], 'train_mask'] = True
#     nodes['val_mask'] = False
#     nodes.loc[nodes.index[splits[1]], 'val_mask'] = True
#     nodes['test_mask'] = False
#     nodes.loc[nodes.index[splits[2]], 'test_mask'] = True

emb = Embedder(ent_map, new_embs)

if not os.path.isdir(args.out_path):
    os.mkdir(args.out_path)

torch.save({"splits": splits}, os.path.join(args.out_path, "state_dict.pt"))

SourceGraphDataset.create_train_val_test_masks(nodes, *splits)

nodes.to_csv(os.path.join(args.out_path, "nodes.csv"), index=False)
edges.to_csv(os.path.join(args.out_path, "edges.csv"), index=False)
# held.to_csv(os.path.join(args.out_path,  "held.csv"), index=False)

pickle.dump([emb], open(os.path.join(args.out_path, "embeddings.pkl"), "wb"))
Example #7
    def get_embeddings(self, id_maps):
        return [Embedder(id_maps, e) for e in self.get_layers()]
Example #8
nodes, edges = load_data(nodes_path, edges_path)

# splits = get_train_val_test_indices(nodes.index)
from SourceCodeTools.code.data.sourcetrail.sourcetrail_types import node_types
splits = SourceGraphDataset.get_train_val_test_indices(
    nodes.query(f"type_backup == '{node_types[4096]}'").index)

id_map, vecs = load_w2v(emb_path)

nodes['global_graph_id'] = nodes['id'].apply(lambda x: id_map[x])

# nodes, edges, held = SourceGraphDataset.holdout(nodes, edges, 0.001)
# nodes['label'] = nodes['type']

# emb = Embedder.load_word2vec(emb_path)
emb = Embedder(id_map, vecs)

if not os.path.isdir(out_path):
    os.mkdir(out_path)

torch.save({"splits": splits}, os.path.join(out_path, "state_dict.pt"))

# from SourceCodeTools.code.data.dataset.Dataset import create_train_val_test_masks
SourceGraphDataset.create_train_val_test_masks(nodes, *splits)

nodes.to_csv(os.path.join(out_path, "nodes.csv"), index=False)
edges.to_csv(os.path.join(out_path, "edges.csv"), index=False)
# held.to_csv(os.path.join(out_path,  "held.csv"), index=False)

pickle.dump([emb], open(os.path.join(out_path, "embeddings.pkl"), "wb"))