def prepare_language_vocabulary(args):
    language, (tokens_key, vocab_size, type_) = args
    print(f'Building vocabulary for {language} {type_}')

    docs = utils.load_cached_docs(language, 'train')
    tokens = utils.flatten(preprocess_query_tokens(
        utils.flatten(doc[tokens_key] for doc in docs)))

    vocabulary = BpeVocabulary(
        vocab_size=vocab_size, pct_bpe=shared.VOCABULARY_PCT_BPE)
    vocabulary.fit(Counter(tokens))
    utils.cache_vocabulary(vocabulary, language, type_)

    print(f'Done building vocabulary for {language} {type_}')
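The packed `args` tuple suggests this function is meant to be fanned out over a process pool, one `(language, config)` pair per worker. A minimal driver sketch, assuming that pattern; the `shared.CODE_VOCABULARY_SIZE` and `shared.QUERY_VOCABULARY_SIZE` constants are hypothetical (only `shared.VOCABULARY_PCT_BPE` appears in the function above):

from itertools import product
from multiprocessing import Pool

# Hypothetical per-type configs: (tokens_key, vocab_size, type_).
VOCABULARY_CONFIGS = (
    ('code_tokens', shared.CODE_VOCABULARY_SIZE, 'code'),
    ('query_tokens', shared.QUERY_VOCABULARY_SIZE, 'query'),
)

with Pool() as pool:
    # Each mapped item is (language, (tokens_key, vocab_size, type_)),
    # matching the unpacking at the top of prepare_language_vocabulary.
    pool.map(prepare_language_vocabulary,
             product(shared.LANGUAGES, VOCABULARY_CONFIGS))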
def emit_ndcg_model_predictions(use_wandb=False):
    build_code_embeddings()
    queries = utils.get_evaluation_queries()

    predictions = []
    for language in shared.LANGUAGES:
        print(f'Evaluating {language}')

        evaluation_docs = [{'url': doc['url'], 'identifier': doc['identifier']}
                           for doc in utils.load_cached_docs(language, 'evaluation')]
        code_embeddings = utils.load_cached_code_embeddings(language)

        model = utils.load_cached_model_weights(language, train_model.get_model())
        query_embedding_predictor = train_model.get_query_embedding_predictor(model)
        query_seqs = prepare_data.pad_encode_seqs(
            prepare_data.preprocess_query_tokens,
            (line.split(' ') for line in queries),
            shared.QUERY_MAX_SEQ_LENGTH,
            language,
            'query')
        query_embeddings = query_embedding_predictor.predict(query_seqs)

        # TODO: Query annoy index
        nn = NearestNeighbors(n_neighbors=100, metric='cosine', n_jobs=-1)
        nn.fit(code_embeddings)
        _, nearest_neighbor_indices = nn.kneighbors(query_embeddings)

        for query_idx, query in enumerate(queries):
            for query_nearest_code_idx in nearest_neighbor_indices[query_idx, :]:
                predictions.append({
                    'query': query,
                    'language': language,
                    'identifier': evaluation_docs[query_nearest_code_idx]['identifier'],
                    'url': evaluation_docs[query_nearest_code_idx]['url'],
                })

        del evaluation_docs
        gc.collect()

    df_predictions = pd.DataFrame(
        predictions, columns=['query', 'language', 'identifier', 'url'])
    if use_wandb:
        save_path = os.path.join(wandb.run.dir, 'model_predictions.csv')
    else:
        save_path = '../model_predictions.csv'
    df_predictions.to_csv(save_path, index=False)
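The TODO marks the brute-force sklearn search as a placeholder for an Annoy index. A hedged sketch of what the swap could look like; the function name and parameters are assumptions, and in practice the index would likely be built once and cached alongside the embeddings rather than rebuilt per call:

import numpy as np
from annoy import AnnoyIndex

def query_annoy_index(code_embeddings, query_embeddings,
                      n_neighbors=100, n_trees=10):
    # 'angular' is Annoy's cosine-flavored metric, matching
    # metric='cosine' in the sklearn search above.
    index = AnnoyIndex(code_embeddings.shape[1], 'angular')
    for i, embedding in enumerate(code_embeddings):
        index.add_item(i, embedding)
    index.build(n_trees)
    # Return an array so nearest_neighbor_indices[query_idx, :]
    # in the loop above keeps working unchanged.
    return np.array([index.get_nns_by_vector(q, n_neighbors)
                     for q in query_embeddings])

With that in place, the loop body would read nearest_neighbor_indices = query_annoy_index(code_embeddings, query_embeddings), with no other changes.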
def prepare_set_seqs(args):
    language, set_ = args
    print(f'Building sequences for {language} {set_}')

    # Prepare code seqs
    code_seqs = (doc['code_tokens']
                 for doc in utils.load_cached_docs(language, set_))
    padded_encoded_code_seqs = pad_encode_seqs(
        preprocess_code_tokens, code_seqs, shared.CODE_MAX_SEQ_LENGTH,
        language, 'code')

    # Prepare query seqs
    query_seqs = (doc['query_tokens']
                  for doc in utils.load_cached_docs(language, set_))
    padded_encoded_query_seqs = pad_encode_seqs(
        preprocess_query_tokens, query_seqs, shared.QUERY_MAX_SEQ_LENGTH,
        language, 'query')

    # Check for invalid sequences
    padded_encoded_code_seqs, padded_encoded_query_seqs = keep_valid_seqs(
        padded_encoded_code_seqs, padded_encoded_query_seqs)

    utils.cache_seqs(padded_encoded_code_seqs, language, set_, 'code')
    utils.cache_seqs(padded_encoded_query_seqs, language, set_, 'query')
    print(f'Done building sequences for {language} {set_}')
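Like prepare_language_vocabulary, the packed `args` makes this pool-friendly. A sketch of a driver; the 'valid' and 'test' split names are assumptions (only 'train' and 'evaluation' appear verbatim in this code):

from itertools import product
from multiprocessing import Pool

with Pool() as pool:
    # One (language, set_) pair per worker.
    pool.map(prepare_set_seqs,
             product(shared.LANGUAGES, ('train', 'valid', 'test')))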
def prepare_evaluation_seqs(language):
    print(f'Building evaluation sequences for {language}')
    evaluation_docs = utils.load_cached_docs(language, 'evaluation')

    evaluation_code_seqs = (doc['function_tokens'] for doc in evaluation_docs)
    evaluation_padded_encoded_code_seqs = pad_encode_seqs(
        preprocess_code_tokens, evaluation_code_seqs,
        shared.CODE_MAX_SEQ_LENGTH, language, 'code')
    utils.cache_seqs(evaluation_padded_encoded_code_seqs, language,
                     'evaluation', 'code')

    # Just to be safe
    del evaluation_code_seqs
    gc.collect()
    print(f'Done building evaluation sequences for {language}')
def handle(self, *args, **options):
    models.CodeDocument.objects.all().delete()

    batch_size = 500
    for language in shared.LANGUAGES:
        print(f'Importing {language} code documents')

        code_docs = []
        for idx, doc in enumerate(
                utils.load_cached_docs(language, 'evaluation')):
            code_doc = models.CodeDocument(
                code=doc['function'],
                code_hash=hashlib.sha1(
                    doc['function'].encode('utf-8')).hexdigest(),
                url=doc['url'],
                language=language,
                repo=doc['nwo'],
                file_path=doc['path'],
                identifier=doc['identifier'],
                embedded_row_index=idx,
            )
            code_docs.append(code_doc)

        models.CodeDocument.objects.bulk_create(code_docs,
                                                batch_size=batch_size)
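Because embedded_row_index records each document's row in that language's cached embedding matrix, hits from the nearest-neighbor search can be joined back to database rows. A hypothetical lookup helper, not part of the code shown:

def get_code_documents_for_neighbors(language, neighbor_indices):
    # neighbor_indices: row positions returned by a nearest-neighbor
    # search over the cached code embeddings for this language.
    return models.CodeDocument.objects.filter(
        language=language,
        embedded_row_index__in=list(neighbor_indices),
    )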
import sys

from sklearn.neighbors import NearestNeighbors

from code_search import shared
from code_search import utils
from code_search import prepare_data
from code_search import train_model

query = sys.argv[1]

for language in shared.LANGUAGES:
    print(f'Evaluating {language}')

    evaluation_docs = [{'url': doc['url'], 'identifier': doc['identifier']}
                       for doc in utils.load_cached_docs(language, 'evaluation')]
    print('Read the docs')

    code_embeddings = utils.load_cached_code_embeddings(language)

    query_seqs = prepare_data.pad_encode_seqs(
        prepare_data.preprocess_query_tokens,
        (line.split(' ') for line in [query]),
        shared.QUERY_MAX_SEQ_LENGTH,
        language,
        'query')

    model = utils.load_cached_model_weights(language, train_model.get_model())
    query_embedding_predictor = train_model.get_query_embedding_predictor(model)
    query_embeddings = query_embedding_predictor.predict(query_seqs)

    # TODO: Replace with annoy index
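    # Assumed continuation (the script breaks off at the TODO above): a
    # brute-force cosine search mirroring emit_ndcg_model_predictions,
    # until the annoy index lands. n_neighbors=10 and the print format
    # are assumptions, not part of the original script.
    nn = NearestNeighbors(n_neighbors=10, metric='cosine', n_jobs=-1)
    nn.fit(code_embeddings)
    _, nearest_neighbor_indices = nn.kneighbors(query_embeddings)

    for code_idx in nearest_neighbor_indices[0, :]:
        doc = evaluation_docs[code_idx]
        print(doc['identifier'], doc['url'])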