def prepare_language_vocabulary(args):
    language, (tokens_key, vocab_size, type_) = args
    print(f'Building vocabulary for {language} {type_}')
    docs = utils.load_cached_docs(language, 'train')
    tokens = utils.flatten(preprocess_query_tokens(
        utils.flatten(doc[tokens_key] for doc in docs)))
    vocabulary = BpeVocabulary(vocab_size=vocab_size, pct_bpe=shared.VOCABULARY_PCT_BPE)
    vocabulary.fit(Counter(tokens))
    utils.cache_vocabulary(vocabulary, language, type_)
    print(f'Done building vocabulary for {language} {type_}')
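Packing the arguments into a single tuple suggests this function is meant to be mapped over a process pool. A hypothetical driver; the language list and vocabulary size here are made up for illustration:

from multiprocessing import Pool

languages = ['python', 'java', 'go']  # hypothetical language list
jobs = [(language, ('query_tokens', 10000, 'query')) for language in languages]
with Pool() as pool:
    pool.map(prepare_language_vocabulary, jobs)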
def prepare_query_vocabulary(self, vocabulary_size: int, pct_bpe: float):
    if self.verbose:
        print('Preparing query vocabulary')
    corpora = utils.flatten(
        self.data_manager.get_preprocessed_language_corpus(
            language, set_=shared.DataSet.TRAIN)
        for language in self.languages)
    tokens = utils.flatten(doc['query_tokens'] for doc in corpora)
    vocabulary = build_vocabulary(tokens, vocabulary_size, pct_bpe)
    self.data_manager.save_query_vocabulary(vocabulary)
def extract_sub_tokens(token):
    sub_tokens = re.split('[._]', token)
    sub_tokens = [IDENTIFIER_CAMEL_CASE_SPLIT.sub(r' \1', sub_token).split(' ')
                  if IDENTIFIER_TOKEN_REGEX.match(sub_token) else [sub_token]
                  for sub_token in sub_tokens]
    return [sub_token.lower() for sub_token in utils.flatten(sub_tokens)
            if len(sub_token.strip()) > 0]
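The two identifier regexes are not defined in this excerpt; the definitions below are an assumption (mirroring the usual CodeSearchNet-style preprocessing) so the behavior can be demonstrated:

import re

# Assumed definitions, not taken from this excerpt:
IDENTIFIER_TOKEN_REGEX = re.compile('[_a-zA-Z][_a-zA-Z0-9]*')
IDENTIFIER_CAMEL_CASE_SPLIT = re.compile('(?<=[a-z0-9])([A-Z])')

extract_sub_tokens('request.headerValue')  # -> ['request', 'header', 'value']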
def pad_encode_seqs(seqs: shared.TokensGenerator,
                    max_length: int,
                    vocabulary: BpeVocabulary,
                    preprocess_tokens_fn) -> np.ndarray:
    encoded_seqs = vocabulary.transform(
        (utils.flatten(preprocess_tokens_fn(seq)) for seq in seqs),
        fixed_length=max_length)
    return np.array(list(encoded_seqs))
def get_query_tokens(docstring_tokens: List[str], identifier: str):
    query_tokens = list(utils.flatten(preprocess_query_tokens(docstring_tokens)))
    if len(query_tokens) > 0:
        return query_tokens
    # Fall back to the function name when the docstring yields no usable tokens
    if identifier and len(identifier) >= shared.MIN_FUNC_NAME_QUERY_LENGTH:
        return extract_sub_tokens(identifier)
    return []
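A quick illustration of the fallback; the exact token values are hypothetical, since preprocess_query_tokens is defined elsewhere:

get_query_tokens(['Sorts', 'the', 'input', 'list'], 'sort_items')
# -> preprocessed docstring tokens, e.g. ['sorts', 'the', 'input', 'list']
get_query_tokens([], 'sort_items')
# -> sub-tokens of the identifier instead, e.g. ['sort', 'items']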
def pad_encode_seqs(
        preprocess_tokens_fn: Callable[[Iterable[str]], shared.TokensGenerator],
        seqs: shared.TokensGenerator,
        max_length: int,
        language: str,
        type_: str) -> np.ndarray:
    bpe = utils.load_cached_vocabulary(language, type_)
    encoded_seqs = bpe.transform(
        (utils.flatten(preprocess_tokens_fn(seq)) for seq in seqs),
        fixed_length=max_length)
    return np.array(list(encoded_seqs))
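A hypothetical call site, using functools.partial to bind the language argument of preprocess_code_tokens; the CODE_MAX_SEQ_LENGTH constant is assumed by analogy with QUERY_MAX_SEQ_LENGTH used below:

from functools import partial

encoded = pad_encode_seqs(
    partial(preprocess_code_tokens, 'python'),
    (doc['code_tokens'] for doc in docs),
    shared.CODE_MAX_SEQ_LENGTH,  # assumed constant
    'python',
    'code')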
def prepare_language_vocabulary(self, language: str, vocabulary_size: int, pct_bpe: float):
    if self.verbose:
        print(f'Preparing language vocabulary: {language}')
    corpus = self.data_manager.get_preprocessed_language_corpus(
        language, set_=shared.DataSet.TRAIN)
    tokens = utils.flatten(doc['code_tokens'] for doc in corpus)
    vocabulary = build_vocabulary(tokens, vocabulary_size, pct_bpe)
    self.data_manager.save_language_vocabulary(vocabulary, language)
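build_vocabulary itself is not part of this excerpt; a minimal sketch, assuming the same BpeVocabulary API exercised in prepare_language_vocabulary above:

from collections import Counter
from typing import Iterable

def build_vocabulary(tokens: Iterable[str], vocabulary_size: int,
                     pct_bpe: float) -> BpeVocabulary:
    # Count token frequencies, then fit a part-word, part-BPE vocabulary
    vocabulary = BpeVocabulary(vocab_size=vocabulary_size, pct_bpe=pct_bpe)
    vocabulary.fit(Counter(tokens))
    return vocabulary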
def evaluate_mrr(model: CodeSearchNN,
                 language_code_seqs: Dict[str, np.ndarray],
                 language_query_seqs: Dict[str, np.ndarray],
                 device: torch.device,
                 batch_size: int = 1000):
    mrrs_per_language = {}
    for language in language_code_seqs.keys():
        code_seqs = np_to_torch(language_code_seqs[language], device)
        query_seqs = np_to_torch(language_query_seqs[language], device)
        mrrs_per_language[language] = get_language_mrrs(
            model, language, code_seqs, query_seqs, batch_size=batch_size)
    mean_mrr = np.mean(list(utils.flatten(mrrs_per_language.values())))
    mean_mrr_per_language = {language: np.mean(values)
                             for language, values in mrrs_per_language.items()}
    return mean_mrr, mean_mrr_per_language
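get_language_mrrs is defined elsewhere; a minimal sketch of the per-batch MRR it is assumed to compute, where the i-th query's correct match is the i-th code snippet in the batch:

import torch

def batch_mrr(code_embeddings: torch.Tensor, query_embeddings: torch.Tensor) -> float:
    # similarity[i, j]: score of query i against code snippet j
    similarity = query_embeddings @ code_embeddings.T
    correct_scores = similarity.diagonal()
    # Rank of each correct match = count of row scores at least as high as it
    ranks = (similarity >= correct_scores.unsqueeze(-1)).sum(dim=-1)
    return (1.0 / ranks.float()).mean().item()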
def preprocess_doc(doc, language: str):
    identifier = doc['identifier']
    docstring_tokens = doc['docstring_tokens']
    code_tokens = doc['code_tokens']
    return {
        # identifier and url are needed for evaluation
        'identifier': identifier,
        'url': doc.get('url'),
        'query_tokens': get_query_tokens(docstring_tokens, identifier),
        'code_tokens': list(utils.flatten(preprocess_code_tokens(language, code_tokens))),
    }
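The shape of the result, with hypothetical input values:

preprocess_doc({
    'identifier': 'HttpClient.get',
    'url': 'https://github.com/org/repo/blob/main/client.py',  # hypothetical
    'docstring_tokens': ['Send', 'a', 'GET', 'request'],
    'code_tokens': ['def', 'get', '(', 'self', ',', 'url', ')', ':'],
}, 'python')
# -> {'identifier': 'HttpClient.get', 'url': '...', 'query_tokens': [...], 'code_tokens': [...]}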
def extract_sub_tokens(token):
    # Skip string literals (plain, raw, and f-strings)
    if len(token) > 0 and (token[0] in ['\'', '"']
                           or token[:2] in ['r\'', 'r"', 'f\'', 'f"']):
        return [token]
    sub_tokens = re.split('[._]', token)
    sub_tokens = [
        IDENTIFIER_CAMEL_CASE_SPLIT.sub(r' \1', sub_token).split(' ')
        if IDENTIFIER_TOKEN_REGEX.match(sub_token) else [sub_token]
        for sub_token in sub_tokens
    ]
    return [
        sub_token.strip() for sub_token in utils.flatten(sub_tokens)
        if len(sub_token.strip()) > 0
    ]
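Under the regex definitions assumed earlier, this variant preserves case and leaves string literals intact:

extract_sub_tokens('"keep me whole"')      # -> ['"keep me whole"']
extract_sub_tokens('request.headerValue')  # -> ['request', 'header', 'Value']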
def pad_encode_query(query: str, language: str) -> np.ndarray:
    seq = query.split(' ')
    bpe = utils.load_cached_vocabulary(language, 'query')
    encoded_seq = bpe.transform(
        (utils.flatten(preprocess_query_tokens(seq_)) for seq_ in [seq]),
        fixed_length=shared.QUERY_MAX_SEQ_LENGTH)
    return np.array(list(encoded_seq)[0])
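Hypothetical usage at search time:

encoded = pad_encode_query('read a csv file into a dataframe', 'python')
encoded.shape  # (shared.QUERY_MAX_SEQ_LENGTH,)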