Example #1
    def buckets_from_pretrained_embeddings(self, pretrained_path, n_buckets):

        from SourceCodeTools.nlp.embed.fasttext import load_w2v_map
        from SourceCodeTools.nlp import token_hasher
        pretrained = load_w2v_map(pretrained_path)

        import numpy as np

        embs_init = np.random.randn(n_buckets, pretrained.n_dims).astype(np.float32)

        for word in pretrained.keys():
            ind = token_hasher(word, n_buckets)
            embs_init[ind, :] = pretrained[word]

        def op_embedding(op_tokens):
            embedding = None
            for token in op_tokens:
                token_emb = pretrained.get(token, None)
                if token_emb is None:
                    # skip subword tokens missing from the pretrained vocabulary
                    continue
                if embedding is None:
                    embedding = token_emb
                else:
                    embedding = embedding + token_emb
            return embedding

        python_ops_to_bpe = self._op_tokens()
        for op, op_tokens in python_ops_to_bpe.items():
            op_emb = op_embedding(op_tokens)
            if op_emb is not None:
                op_ind = token_hasher(op, n_buckets)
                embs_init[op_ind, :] = op_emb

        return embs_init
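
Example #1 seeds a hash-bucket embedding matrix from pretrained vectors: every known word is hashed into one of n_buckets rows, and Python operator names are written into their buckets as sums of their subword embeddings, so any token, seen or unseen, resolves to some row at lookup time. A minimal sketch of that lookup, assuming token_hasher behaves like a deterministic string hash modulo the bucket count (the md5-based token_hasher_stub below is an illustrative stand-in, not the library's implementation):

import hashlib

import numpy as np


def token_hasher_stub(token, n_buckets):
    # Illustrative stand-in: any stable string hash modulo the bucket count.
    return int(hashlib.md5(token.encode("utf-8")).hexdigest(), 16) % n_buckets


n_buckets, n_dims = 5000, 100
embs_init = np.random.randn(n_buckets, n_dims).astype(np.float32)

# Any token, even one never seen during pretraining, maps to some bucket row.
vector = embs_init[token_hasher_stub("requests.get", n_buckets), :]
print(vector.shape)  # (100,)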

Example #2

    def __init__(
            self, data, batch_size: int, seq_len: int,
            wordmap: Dict[str, int], *, graphmap: Optional[Dict[str, int]], tagmap: Optional[TagMap] = None,
            mask_unlabeled_declarations=True,
            class_weights=False, element_hash_size=1000, len_sort=True, tokenizer="spacy", no_localization=False
    ):

        self.create_cache()

        self.data = sorted(data, key=lambda x: len(x[0])) if len_sort else data
        self.batch_size = batch_size
        self.seq_len = seq_len
        self.class_weights = None
        self.mask_unlabeled_declarations = mask_unlabeled_declarations
        self.tokenizer = tokenizer
        if tokenizer == "codebert":
            self.vocab = spacy.blank("en").vocab
        self.no_localization = no_localization

        self.nlp = create_tokenizer(tokenizer)
        if tagmap is None:
            self.tagmap = tag_map_from_sentences(list(zip(*[self.prepare_sent(sent) for sent in data]))[1])
        else:
            self.tagmap = tagmap

        self.graphpad = len(graphmap) if graphmap is not None else None
        self.wordpad = len(wordmap)
        self.tagpad = self.tagmap["O"]
        self.prefpad = element_hash_size
        self.suffpad = element_hash_size

        self.graphmap_func = (lambda g: graphmap.get(g, len(graphmap))) if graphmap is not None else None
        self.wordmap_func = lambda w: wordmap.get(w, len(wordmap))
        self.tagmap_func = lambda t: self.tagmap.get(t, self.tagmap["O"])
        self.prefmap_func = lambda w: token_hasher(w[:3], element_hash_size)
        self.suffmap_func = lambda w: token_hasher(w[-3:], element_hash_size)

        self.mask_unlblpad = 1.
        if mask_unlabeled_declarations:
            self.mask_unlbl_func = lambda t: 1 if t == "O" else 0
        else:
            self.mask_unlbl_func = lambda t: 1.

        self.classwpad = 1.
        if class_weights:
            self.class_weights = ClassWeightNormalizer()
            self.class_weights.init(list(zip(*[self.prepare_sent(sent) for sent in data]))[1])
            self.classw_func = lambda t: self.class_weights.get(t, self.classwpad)
        else:
            self.classw_func = lambda t: 1.
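
The lambdas above encode each token as integer features: a word id with an out-of-vocabulary fallback, a tag id defaulting to "O", and three-character prefix and suffix buckets obtained by hashing. A rough sketch of how a single token would be encoded under those rules, using hypothetical wordmap/tagmap contents and the same illustrative stand-in for token_hasher as in the sketch after Example #1:

import hashlib


def token_hasher_stub(token, n_buckets):
    # Illustrative stand-in for token_hasher: a stable hash modulo the table size.
    return int(hashlib.md5(token.encode("utf-8")).hexdigest(), 16) % n_buckets


# Hypothetical vocabularies standing in for wordmap / tagmap.
wordmap = {"def": 0, "return": 1}
tagmap = {"O": 0, "Callable": 1}
element_hash_size = 1000

token, tag = "tokenizer", "Callable"
word_id = wordmap.get(token, len(wordmap))    # unknown words fall back to the padding index
tag_id = tagmap.get(tag, tagmap["O"])
prefix_id = token_hasher_stub(token[:3], element_hash_size)
suffix_id = token_hasher_stub(token[-3:], element_hash_size)
print(word_id, tag_id, prefix_id, suffix_id)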

Example #3

    def init_subwords(self, elements, num_buckets, max_len):
        names = elements['dst']
        reprs = names.map(lambda x: char_ngram_window(x, self.gram_size)) \
            .map(lambda grams: (token_hasher(g, num_buckets) for g in grams)) \
            .map(lambda int_grams: np.fromiter(int_grams, dtype=np.int32)) \
            .map(lambda parts: create_fixed_length(parts, max_len, 0))

        self.name2repr = dict(zip(names, reprs))

        self.embed = nn.Embedding(num_buckets, self.emb_size, padding_idx=0)
        self.norm = nn.LayerNorm(self.emb_size)
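
This variant splits each name into character n-grams, hashes every n-gram into one of num_buckets ids, and pads or truncates the id sequence to max_len. The helpers below are hypothetical stand-ins for char_ngram_window and create_fixed_length (assumed behavior, not the library's code), just to make the per-name pipeline concrete:

import numpy as np


def char_ngram_window_stub(name, gram_size=3):
    # Sliding character window, e.g. "load" -> ["loa", "oad"].
    return [name[i:i + gram_size] for i in range(max(1, len(name) - gram_size + 1))]


def create_fixed_length_stub(parts, max_len, pad):
    # Truncate or right-pad the id sequence to exactly max_len entries.
    out = np.full(max_len, pad, dtype=np.int32)
    out[:min(len(parts), max_len)] = parts[:max_len]
    return out


grams = char_ngram_window_stub("load_data")
ids = np.fromiter((hash(g) % 200000 for g in grams), dtype=np.int32)  # hash() stands in for token_hasher
print(create_fixed_length_stub(ids, max_len=10, pad=0))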

Example #4

    def init_subwords(self, elements, num_buckets, max_len):
        from SourceCodeTools.nlp.embed.bpe import load_bpe_model, make_tokenizer
        tokenize = make_tokenizer(load_bpe_model(self.tokenizer_path))

        names = elements['dst']
        reprs = names.map(tokenize) \
            .map(lambda tokens: (token_hasher(t, num_buckets) for t in tokens)) \
            .map(lambda int_tokens: np.fromiter(int_tokens, dtype=np.int32))\
            .map(lambda parts: create_fixed_length(parts, max_len, 0))

        self.name2repr = dict(zip(names, reprs))

        self.embed = nn.Embedding(num_buckets, self.emb_size, padding_idx=0)
        self.norm = nn.LayerNorm(self.emb_size)
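
In both init_subwords variants the stored id arrays are later pushed through the bucket embedding table. A rough usage sketch, assuming name2repr already holds a fixed-length id array for a name and that subword embeddings are simply averaged and layer-normalized (the pooling is illustrative, not necessarily what the model does downstream):

import numpy as np
import torch
import torch.nn as nn

num_buckets, emb_size, max_len = 200000, 100, 10
embed = nn.Embedding(num_buckets, emb_size, padding_idx=0)
norm = nn.LayerNorm(emb_size)

# Hypothetical precomputed entry of the kind init_subwords stores in name2repr.
name2repr = {"load_data": np.array([17, 4023, 0, 0, 0, 0, 0, 0, 0, 0], dtype=np.int64)}

ids = torch.LongTensor(name2repr["load_data"]).unsqueeze(0)  # (1, max_len)
pooled = norm(embed(ids).mean(dim=1))                        # pool over subword slots (padding rows included, for brevity)
print(pooled.shape)  # torch.Size([1, 100])
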
Example #5
    def forward(self, list_inputs, dsttype):  # pylint: disable=unused-argument
        if len(list_inputs) == 1:
            return list_inputs[0]
        key = value = th.stack(list_inputs)  #.squeeze(dim=1)
        query = self.query_emb(
            th.LongTensor([token_hasher(dsttype, self.num_query_buckets)]).to(
                self.att.in_proj_bias.device)).unsqueeze(0).repeat(
                    1, key.shape[1], 1)
        # query = self.query_emb[token_hasher(dsttype, self.num_query_buckets)].unsqueeze(0).repeat(1, key.shape[1], 1)
        if self.use_checkpoint:
            att_out, att_w = checkpoint.checkpoint(self.do_stuff, query, key,
                                                   value, self.dummy_tensor)
        else:
            att_out, att_w = self.do_stuff(query, key, value)
        # att_out, att_w = self.att(query, key, value)
        # return att_out.mean(0)#.unsqueeze(1)
        return att_out.mean(0).unsqueeze(1)
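
Example #5 aggregates a list of per-relation inputs with multi-head attention, selecting a learned query vector by hashing the destination node type into num_query_buckets. A simplified, self-contained sketch of the same idea with plain torch.nn modules and illustrative sizes (gradient checkpointing and the module's do_stuff wrapper are omitted):

import torch
import torch.nn as nn

emb_size, num_query_buckets, num_relations, batch = 64, 10, 3, 5
query_emb = nn.Embedding(num_query_buckets, emb_size)
att = nn.MultiheadAttention(emb_size, num_heads=4)

# One (batch, emb_size) tensor per incoming relation type.
list_inputs = [torch.randn(batch, emb_size) for _ in range(num_relations)]
key = value = torch.stack(list_inputs)                            # (num_relations, batch, emb_size)

dst_bucket = hash("function") % num_query_buckets                 # stands in for token_hasher(dsttype, ...)
query = query_emb(torch.LongTensor([dst_bucket])).unsqueeze(0).repeat(1, batch, 1)  # (1, batch, emb_size)

att_out, att_w = att(query, key, value)                           # att_out: (1, batch, emb_size)
print(att_out.mean(0).unsqueeze(1).shape)                         # torch.Size([5, 1, 64])
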
Example #6
    def _get_embedding_from_node_info(self, keys, node_info, masked=None):
        idxs = []

        if isinstance(masked, dict):
            new_masked = set()
            for ntype, nids in masked.items():
                for nid in nids:
                    new_masked.add((ntype, nid))
            masked = new_masked

        for key in keys:
            if key not in node_info or (masked is not None and key in masked):
                # if key in node_info and key not in masked:
                idxs.append(self.n_buckets)
            else:
                real_type, name = node_info[key]
                idxs.append(token_hasher(name, self.n_buckets))

        return self.buckets(torch.LongTensor(idxs))
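
Example #6 resolves each key either to a hash bucket of its node name or, when the node is unknown or masked, to a dedicated overflow row at index n_buckets, which implies self.buckets holds at least n_buckets + 1 rows. A small sketch of that lookup logic with a hypothetical node_info and the same kind of illustrative hash as before:

import torch
import torch.nn as nn

n_buckets, emb_size = 1000, 50
buckets = nn.Embedding(n_buckets + 1, emb_size)     # last row reserved for unknown/masked keys

node_info = {("node", 1): ("function", "load_data"),
             ("node", 2): ("module", "requests")}
masked = {("node", 2)}

idxs = []
for key in [("node", 1), ("node", 2), ("node", 3)]:
    if key not in node_info or key in masked:
        idxs.append(n_buckets)                      # overflow bucket
    else:
        _, name = node_info[key]
        idxs.append(hash(name) % n_buckets)         # stands in for token_hasher(name, n_buckets)

print(buckets(torch.LongTensor(idxs)).shape)        # torch.Size([3, 50])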