def buckets_from_pretrained_embeddings(self, pretrained_path, n_buckets):
    from SourceCodeTools.nlp.embed.fasttext import load_w2v_map
    from SourceCodeTools.nlp import token_hasher
    import numpy as np

    pretrained = load_w2v_map(pretrained_path)

    # Start from random vectors; overwrite any bucket that a pretrained
    # vocabulary entry hashes into.
    embs_init = np.random.randn(n_buckets, pretrained.n_dims).astype(np.float32)
    for word in pretrained.keys():
        ind = token_hasher(word, n_buckets)
        embs_init[ind, :] = pretrained[word]

    def op_embedding(op_tokens):
        # Sum the pretrained embeddings of the operator's subword tokens,
        # skipping tokens missing from the pretrained vocabulary (the
        # original code would raise a TypeError on a missing token once a
        # partial sum existed).
        embedding = None
        for token in op_tokens:
            token_emb = pretrained.get(token, None)
            if token_emb is None:
                continue
            if embedding is None:
                embedding = token_emb
            else:
                embedding = embedding + token_emb
        return embedding

    python_ops_to_bpe = self._op_tokens()
    for op, op_tokens in python_ops_to_bpe.items():
        op_emb = op_embedding(op_tokens)
        if op_emb is not None:
            op_ind = token_hasher(op, n_buckets)
            embs_init[op_ind, :] = op_emb

    return embs_init
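# Hedged sketch: `token_hasher` is imported from SourceCodeTools.nlp but not
# shown in this snippet. A minimal, hypothetical stand-in consistent with its
# call sites (string -> stable bucket index) could look like this:
import hashlib

def token_hasher(token, n_buckets):
    # Fold a stable digest of the token into [0, n_buckets); unlike the
    # builtin hash(), the result does not depend on PYTHONHASHSEED.
    return int(hashlib.md5(token.encode("utf-8")).hexdigest(), 16) % n_buckets

# The returned matrix can then seed a trainable embedding table, e.g.:
#   embs_init = model.buckets_from_pretrained_embeddings(pretrained_path, 100000)
#   table = nn.Embedding.from_pretrained(torch.from_numpy(embs_init), freeze=False)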
def __init__(
        self, data, batch_size: int, seq_len: int,
        wordmap: Dict[str, int], *, graphmap: Optional[Dict[str, int]],
        tagmap: Optional[TagMap] = None, mask_unlabeled_declarations=True,
        class_weights=False, element_hash_size=1000, len_sort=True,
        tokenizer="spacy", no_localization=False
):
    self.create_cache()

    # Sorting by sentence length keeps batches of similar lengths together.
    self.data = sorted(data, key=lambda x: len(x[0])) if len_sort else data
    self.batch_size = batch_size
    self.seq_len = seq_len
    self.class_weights = None
    self.mask_unlabeled_declarations = mask_unlabeled_declarations
    self.tokenizer = tokenizer
    if tokenizer == "codebert":
        self.vocab = spacy.blank("en").vocab
    self.no_localization = no_localization
    self.nlp = create_tokenizer(tokenizer)

    if tagmap is None:
        # Infer the tag map from the labels of the prepared sentences.
        self.tagmap = tag_map_from_sentences(
            list(zip(*[self.prepare_sent(sent) for sent in data]))[1]
        )
    else:
        self.tagmap = tagmap

    # Padding indices; out-of-vocabulary entries map to the last index.
    self.graphpad = len(graphmap) if graphmap is not None else None
    self.wordpad = len(wordmap)
    self.tagpad = self.tagmap["O"]
    self.prefpad = element_hash_size
    self.suffpad = element_hash_size

    self.graphmap_func = (lambda g: graphmap.get(g, len(graphmap))) if graphmap is not None else None
    self.wordmap_func = lambda w: wordmap.get(w, len(wordmap))
    self.tagmap_func = lambda t: self.tagmap.get(t, self.tagmap["O"])
    # Prefix/suffix features are hashed from the first/last three characters.
    self.prefmap_func = lambda w: token_hasher(w[:3], element_hash_size)
    self.suffmap_func = lambda w: token_hasher(w[-3:], element_hash_size)

    self.mask_unlblpad = 1.
    if mask_unlabeled_declarations:
        self.mask_unlbl_func = lambda t: 1. if t == "O" else 0.
    else:
        self.mask_unlbl_func = lambda t: 1.

    self.classwpad = 1.
    if class_weights:
        self.class_weights = ClassWeightNormalizer()
        self.class_weights.init(
            list(zip(*[self.prepare_sent(sent) for sent in data]))[1]
        )
        self.classw_func = lambda t: self.class_weights.get(t, self.classwpad)
    else:
        self.classw_func = lambda t: 1.
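# Hedged usage sketch: the closures built above turn each token of a prepared
# sentence into integer features. Hypothetical example, assuming `batcher` is a
# constructed instance and `tokens`/`tags` come from `prepare_sent`:
#   word_ids = [batcher.wordmap_func(t) for t in tokens]  # OOV -> len(wordmap)
#   pref_ids = [batcher.prefmap_func(t) for t in tokens]  # hashed 3-char prefix
#   suff_ids = [batcher.suffmap_func(t) for t in tokens]  # hashed 3-char suffix
#   tag_ids = [batcher.tagmap_func(t) for t in tags]      # unknown tag -> "O"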
def init_subwords(self, elements, num_buckets, max_len):
    names = elements['dst']
    # Represent every name as a fixed-length vector of hashed
    # character-n-gram ids, padded/truncated to max_len with 0.
    reprs = names \
        .map(lambda x: char_ngram_window(x, self.gram_size)) \
        .map(lambda grams: (token_hasher(g, num_buckets) for g in grams)) \
        .map(lambda int_grams: np.fromiter(int_grams, dtype=np.int32)) \
        .map(lambda parts: create_fixed_length(parts, max_len, 0))

    self.name2repr = dict(zip(names, reprs))

    self.embed = nn.Embedding(num_buckets, self.emb_size, padding_idx=0)
    self.norm = nn.LayerNorm(self.emb_size)
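# Hedged sketch: `char_ngram_window` and `create_fixed_length` are used above
# but not defined in this snippet. Hypothetical stand-ins whose behavior is
# assumed from the call sites:
import numpy as np

def char_ngram_window(x, gram_size):
    # Sliding window of character n-grams, e.g. "name", 3 -> ["nam", "ame"].
    return [x[i:i + gram_size] for i in range(len(x) - gram_size + 1)]

def create_fixed_length(parts, max_len, pad_value):
    # Truncate or right-pad an integer array to exactly max_len entries.
    out = np.full((max_len,), pad_value, dtype=np.int32)
    out[:min(len(parts), max_len)] = parts[:max_len]
    return out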
def init_subwords(self, elements, num_buckets, max_len):
    from SourceCodeTools.nlp.embed.bpe import load_bpe_model, make_tokenizer
    tokenize = make_tokenizer(load_bpe_model(self.tokenizer_path))

    names = elements['dst']
    # Same fixed-length hashed representation as the character-n-gram
    # variant, but over BPE tokens instead of n-grams.
    reprs = names \
        .map(tokenize) \
        .map(lambda tokens: (token_hasher(t, num_buckets) for t in tokens)) \
        .map(lambda int_tokens: np.fromiter(int_tokens, dtype=np.int32)) \
        .map(lambda parts: create_fixed_length(parts, max_len, 0))

    self.name2repr = dict(zip(names, reprs))

    self.embed = nn.Embedding(num_buckets, self.emb_size, padding_idx=0)
    self.norm = nn.LayerNorm(self.emb_size)
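# Hedged sketch: with either subword scheme (character n-grams or BPE), a
# name's fixed-length id vector is looked up and embedded the same way.
# Hypothetical example, assuming "parse_file" occurs in elements['dst']:
#   ids = torch.from_numpy(self.name2repr["parse_file"]).long()  # (max_len,)
#   emb = self.norm(self.embed(ids))                             # (max_len, emb_size)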
def forward(self, list_inputs, dsttype):  # pylint: disable=unused-argument
    if len(list_inputs) == 1:
        return list_inputs[0]

    # Stack the per-relation messages into (num_relations, batch, emb_size);
    # they serve as both keys and values.
    key = value = th.stack(list_inputs)

    # The query is a learned embedding chosen by hashing the destination
    # node type, repeated across the batch dimension.
    query = self.query_emb(
        th.LongTensor([token_hasher(dsttype, self.num_query_buckets)]).to(
            self.att.in_proj_bias.device
        )
    ).unsqueeze(0).repeat(1, key.shape[1], 1)

    if self.use_checkpoint:
        att_out, att_w = checkpoint.checkpoint(
            self.do_stuff, query, key, value, self.dummy_tensor
        )
    else:
        att_out, att_w = self.do_stuff(query, key, value)

    return att_out.mean(0).unsqueeze(1)
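# Hedged sketch: `do_stuff` is not shown here. Since `self.att` exposes
# `in_proj_bias`, it is plausibly an nn.MultiheadAttention, and the wrapper
# would look roughly like this; the dummy tensor exists only so that
# torch.utils.checkpoint receives an input with requires_grad=True and does
# not detach the backward graph:
def do_stuff(self, query, key, value, dummy_tensor=None):
    att_out, att_w = self.att(query, key, value)
    return att_out, att_w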
def _get_embedding_from_node_info(self, keys, node_info, masked=None):
    idxs = []

    if isinstance(masked, dict):
        # Flatten {node_type: [node_ids]} into a set of (node_type, node_id)
        # pairs for O(1) membership tests against the keys.
        new_masked = set()
        for ntype, nids in masked.items():
            for nid in nids:
                new_masked.add((ntype, nid))
        masked = new_masked

    for key in keys:
        if key not in node_info or (masked is not None and key in masked):
            # Unknown or masked nodes map to the reserved last bucket.
            idxs.append(self.n_buckets)
        else:
            real_type, name = node_info[key]
            idxs.append(token_hasher(name, self.n_buckets))

    return self.buckets(torch.LongTensor(idxs))
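# Hedged usage sketch with hypothetical data: `node_info` maps (type, id) keys
# to (real_type, name) pairs; masked or unknown keys fall into the reserved
# bucket at index self.n_buckets:
#   node_info = {("function", 1): ("function", "main"),
#                ("module", 2): ("module", "os")}
#   keys = [("function", 1), ("module", 2), ("class", 3)]  # last key unknown
#   embs = model._get_embedding_from_node_info(
#       keys, node_info, masked={"module": [2]})  # masks ("module", 2)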