Example #1
0
def prepare_language_vocabulary(args):
    """Build and cache a BPE vocabulary for one (language, token-type) pair.

    *args* is a single tuple ``(language, (tokens_key, vocab_size, type_))``
    so the function can be handed directly to ``multiprocessing`` map helpers.
    """
    language, (tokens_key, vocab_size, type_) = args
    print(f'Building vocabulary for {language} {type_}')

    docs = utils.load_cached_docs(language, 'train')
    # Flatten all docs' token lists into one stream, preprocess, flatten again.
    # NOTE(review): preprocess_query_tokens is applied regardless of type_ —
    # presumably intentional for vocabulary building, but worth confirming.
    raw_tokens = utils.flatten(doc[tokens_key] for doc in docs)
    token_stream = utils.flatten(preprocess_query_tokens(raw_tokens))
    token_counts = Counter(token_stream)

    vocabulary = BpeVocabulary(vocab_size=vocab_size, pct_bpe=shared.VOCABULARY_PCT_BPE)
    vocabulary.fit(token_counts)
    utils.cache_vocabulary(vocabulary, language, type_)

    print(f'Done building vocabulary for {language} {type_}')
Example #2
0
def emit_ndcg_model_predictions(use_wandb=False):
    """Predict the 100 nearest code documents for every evaluation query in
    every language and write all rows to a ``model_predictions.csv`` file.

    When *use_wandb* is true the CSV goes into the current wandb run
    directory; otherwise it is written to ``../model_predictions.csv``.
    """
    build_code_embeddings()
    queries = utils.get_evaluation_queries()

    predictions = []
    for language in shared.LANGUAGES:
        print(f'Evaluating {language}')

        # Keep only the fields that end up in the prediction rows.
        evaluation_docs = [{
            'url': doc['url'],
            'identifier': doc['identifier']
        } for doc in utils.load_cached_docs(language, 'evaluation')]

        code_embeddings = utils.load_cached_code_embeddings(language)

        model = utils.load_cached_model_weights(language,
                                                train_model.get_model())
        query_embedding_predictor = train_model.get_query_embedding_predictor(
            model)
        query_seqs = prepare_data.pad_encode_seqs(
            prepare_data.preprocess_query_tokens,
            (line.split(' ') for line in queries), shared.QUERY_MAX_SEQ_LENGTH,
            language, 'query')
        query_embeddings = query_embedding_predictor.predict(query_seqs)

        # TODO: Query annoy index
        knn = NearestNeighbors(n_neighbors=100, metric='cosine', n_jobs=-1)
        knn.fit(code_embeddings)
        _, neighbor_indices = knn.kneighbors(query_embeddings)

        # kneighbors returns one row of indices per query, in query order.
        for query, code_indices in zip(queries, neighbor_indices):
            for code_idx in code_indices:
                matched_doc = evaluation_docs[code_idx]
                predictions.append({
                    'query': query,
                    'language': language,
                    'identifier': matched_doc['identifier'],
                    'url': matched_doc['url'],
                })

        # Release the per-language docs before the next iteration's load.
        del evaluation_docs
        gc.collect()

    df_predictions = pd.DataFrame(
        predictions, columns=['query', 'language', 'identifier', 'url'])
    save_path = os.path.join(
        wandb.run.dir,
        'model_predictions.csv') if use_wandb else '../model_predictions.csv'
    df_predictions.to_csv(save_path, index=False)
Example #3
0
def prepare_set_seqs(args):
    """Encode, pad, validate and cache the code and query sequences for one
    (language, set) pair; *args* is a ``(language, set_)`` tuple."""
    language, set_ = args
    print(f'Building sequences for {language} {set_}')

    # Code sequences (docs are re-loaded per side so each generator gets a
    # fresh iteration over the cached documents).
    code_token_stream = (
        doc['code_tokens'] for doc in utils.load_cached_docs(language, set_))
    padded_encoded_code_seqs = pad_encode_seqs(
        preprocess_code_tokens, code_token_stream,
        shared.CODE_MAX_SEQ_LENGTH, language, 'code')

    # Query sequences
    query_token_stream = (
        doc['query_tokens'] for doc in utils.load_cached_docs(language, set_))
    padded_encoded_query_seqs = pad_encode_seqs(
        preprocess_query_tokens, query_token_stream,
        shared.QUERY_MAX_SEQ_LENGTH, language, 'query')

    # Drop any pair where either side encoded to an invalid sequence.
    padded_encoded_code_seqs, padded_encoded_query_seqs = keep_valid_seqs(
        padded_encoded_code_seqs, padded_encoded_query_seqs)

    utils.cache_seqs(padded_encoded_code_seqs, language, set_, 'code')
    utils.cache_seqs(padded_encoded_query_seqs, language, set_, 'query')

    print(f'Done building sequences for {language} {set_}')
Example #4
0
def prepare_evaluation_seqs(language):
    """Encode, pad and cache the evaluation code sequences for *language*."""
    print(f'Building evaluation sequences for {language}')

    docs = utils.load_cached_docs(language, 'evaluation')
    code_token_stream = (doc['function_tokens'] for doc in docs)

    padded_code_seqs = pad_encode_seqs(
        preprocess_code_tokens, code_token_stream,
        shared.CODE_MAX_SEQ_LENGTH, language, 'code')
    utils.cache_seqs(padded_code_seqs, language, 'evaluation', 'code')

    # Just to be safe: drop the generator and reclaim memory eagerly.
    del code_token_stream
    gc.collect()

    print(f'Done building evaluation sequences for {language}')
    def handle(self, *args, **options):
        """Reimport every cached evaluation document into the CodeDocument
        table, replacing whatever is currently stored.

        The list index of each document becomes ``embedded_row_index`` so
        database rows line up with rows of the cached embedding matrices.
        """
        # Start from an empty table so row indices stay consistent.
        models.CodeDocument.objects.all().delete()

        batch_size = 500
        for language in shared.LANGUAGES:
            print(f'Importing {language} code documents')
            code_docs = [
                models.CodeDocument(
                    code=doc['function'],
                    code_hash=hashlib.sha1(
                        doc['function'].encode('utf-8')).hexdigest(),
                    url=doc['url'],
                    language=language,
                    repo=doc['nwo'],
                    file_path=doc['path'],
                    identifier=doc['identifier'],
                    embedded_row_index=row_idx,
                )
                for row_idx, doc in enumerate(
                    utils.load_cached_docs(language, 'evaluation'))
            ]
            models.CodeDocument.objects.bulk_create(code_docs,
                                                    batch_size=batch_size)
Example #6
0
from sklearn.neighbors import NearestNeighbors

from code_search import shared
from code_search import utils
from code_search import prepare_data
from code_search import train_model

query = sys.argv[1]

for language in shared.LANGUAGES:
    print(f'Evaluating {language}')

    evaluation_docs = [{
        'url': doc['url'],
        'identifier': doc['identifier']
    } for doc in utils.load_cached_docs(language, 'evaluation')]
    print('Read the docs')

    code_embeddings = utils.load_cached_code_embeddings(language)

    query_seqs = prepare_data.pad_encode_seqs(
        prepare_data.preprocess_query_tokens,
        (line.split(' ') for line in [query]), shared.QUERY_MAX_SEQ_LENGTH,
        language, 'query')

    model = utils.load_cached_model_weights(language, train_model.get_model())
    query_embedding_predictor = train_model.get_query_embedding_predictor(
        model)
    query_embeddings = query_embedding_predictor.predict(query_seqs)

    # TODO: Replace with annoy index