Example #1
import gc
import os

import pandas as pd
import wandb
from sklearn.neighbors import NearestNeighbors

import prepare_data
import shared
import train_model
import utils


def emit_ndcg_model_predictions(use_wandb=False):
    # Embed every evaluation code snippet, rank the candidates for each
    # evaluation query per language, and write the predictions to CSV.
    build_code_embeddings()
    queries = utils.get_evaluation_queries()

    predictions = []
    for language in shared.LANGUAGES:
        print(f'Evaluating {language}')

        evaluation_docs = [{
            'url': doc['url'],
            'identifier': doc['identifier']
        } for doc in utils.load_cached_docs(language, 'evaluation')]

        code_embeddings = utils.load_cached_code_embeddings(language)

        model = utils.load_cached_model_weights(language,
                                                train_model.get_model())
        query_embedding_predictor = train_model.get_query_embedding_predictor(
            model)
        query_seqs = prepare_data.pad_encode_seqs(
            prepare_data.preprocess_query_tokens,
            (line.split(' ') for line in queries), shared.QUERY_MAX_SEQ_LENGTH,
            language, 'query')
        query_embeddings = query_embedding_predictor.predict(query_seqs)

        # Brute-force cosine kNN over this language's cached code embeddings.
        # TODO: Query annoy index
        nn = NearestNeighbors(n_neighbors=100, metric='cosine', n_jobs=-1)
        nn.fit(code_embeddings)
        _, nearest_neighbor_indices = nn.kneighbors(query_embeddings)

        for query_idx, query in enumerate(queries):
            for query_nearest_code_idx in nearest_neighbor_indices[query_idx, :]:
                nearest_doc = evaluation_docs[query_nearest_code_idx]
                predictions.append({
                    'query': query,
                    'language': language,
                    'identifier': nearest_doc['identifier'],
                    'url': nearest_doc['url'],
                })

        # Free the per-language docs before moving on to the next language.
        del evaluation_docs
        gc.collect()

    df_predictions = pd.DataFrame(
        predictions, columns=['query', 'language', 'identifier', 'url'])
    save_path = os.path.join(
        wandb.run.dir,
        'model_predictions.csv') if use_wandb else '../model_predictions.csv'
    df_predictions.to_csv(save_path, index=False)


def build_code_embeddings():
    # Cache an embedding vector for every evaluation code snippet, per language.
    for language in shared.LANGUAGES:
        print(f'Building {language} code embeddings')
        model = utils.load_cached_model_weights(language,
                                                train_model.get_model())
        code_embedding_predictor = train_model.get_code_embedding_predictor(
            model)

        evaluation_code_seqs = utils.load_cached_seqs(language, 'evaluation',
                                                      'code')
        code_embedding = code_embedding_predictor.predict(evaluation_code_seqs)

        utils.cache_code_embeddings(code_embedding, language)
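
The TODO comments above and in Example #5 mention replacing the brute-force sklearn lookup with an Annoy index, and Example #3 already queries one via utils.load_cached_ann. As a rough, non-authoritative sketch of the missing build step (the n_trees value, the save path and the helper name are assumptions, not part of the original project):

from annoy import AnnoyIndex

def build_code_ann_index(code_embeddings, n_trees=10):
    # Annoy's 'angular' metric gives the same ranking as the metric='cosine'
    # used with sklearn's NearestNeighbors above. (Sketch only.)
    index = AnnoyIndex(code_embeddings.shape[1], 'angular')
    for item_id, vector in enumerate(code_embeddings):
        index.add_item(item_id, vector)
    index.build(n_trees)
    return index

# Assumed usage: build and save one index per language so that
# utils.load_cached_ann (Example #3) can load it later.
# index = build_code_ann_index(utils.load_cached_code_embeddings(language))
# index.save(f'{language}_code.ann')  # path and naming scheme are assumptions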
Example #3
def get_nearest_query_neighbors_per_language(query):
    # Embed the query once per language and look up its nearest code snippets
    # in that language's cached Annoy index.
    nearest_neighbors_per_language = {}
    for language in shared.LANGUAGES:
        query_seq = prepare_data.pad_encode_query(query, language)

        model = utils.load_cached_model_weights(language,
                                                train_model.get_model())
        query_embedding_predictor = train_model.get_query_embedding_predictor(
            model)
        query_embedding = query_embedding_predictor.predict(
            query_seq.reshape(1, -1))[0, :]

        ann = utils.load_cached_ann(language)
        nearest_neighbors_per_language[language] = ann.get_nns_by_vector(
            query_embedding, RESULTS_PER_LANGUAGE, include_distances=True)
    return nearest_neighbors_per_language
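
A possible way to consume the result, mapping the Annoy item indices back to the cached evaluation docs the way Examples #1 and #5 do. This assumes each per-language index was built in the same order as utils.load_cached_docs(language, 'evaluation'); the query string is made up:

results = get_nearest_query_neighbors_per_language('read a csv file into a dataframe')
for language, (indices, distances) in results.items():
    evaluation_docs = utils.load_cached_docs(language, 'evaluation')
    for doc_idx, distance in zip(indices, distances):
        # Smaller angular distance means a closer match.
        print(language, evaluation_docs[doc_idx]['identifier'],
              evaluation_docs[doc_idx]['url'], distance)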
Example #4
def evaluate_language_mean_mrr(language):
    # Mean MRR for one language on both the validation and test splits.
    model = utils.load_cached_model_weights(language, train_model.get_model())

    valid_code_seqs = utils.load_cached_seqs(language, 'valid', 'code')
    valid_query_seqs = utils.load_cached_seqs(language, 'valid', 'query')
    valid_mean_mrr = evaluate_model_mean_mrr(model, valid_code_seqs,
                                             valid_query_seqs)

    test_code_seqs = utils.load_cached_seqs(language, 'test', 'code')
    test_query_seqs = utils.load_cached_seqs(language, 'test', 'query')
    test_mean_mrr = evaluate_model_mean_mrr(model, test_code_seqs,
                                            test_query_seqs)

    print(
        f'Evaluating {language} - Valid Mean MRR: {valid_mean_mrr}, Test Mean MRR: {test_mean_mrr}'
    )
    return valid_mean_mrr, test_mean_mrr
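
evaluate_model_mean_mrr itself is not shown in these examples. For reference, a minimal NumPy sketch of the usual batch-wise mean MRR, assuming each query's matching code snippet sits at the same row index and the model has already produced the two embedding matrices; this is not necessarily the project's exact implementation:

import numpy as np

def mean_mrr_from_embeddings(code_embeddings, query_embeddings):
    # Cosine similarity between every query and every code snippet.
    code_norm = code_embeddings / np.linalg.norm(code_embeddings, axis=1, keepdims=True)
    query_norm = query_embeddings / np.linalg.norm(query_embeddings, axis=1, keepdims=True)
    similarities = query_norm @ code_norm.T
    # Rank of each query's own snippet = 1 + number of distractors scoring higher.
    correct_scores = np.diag(similarities)
    ranks = 1 + np.sum(similarities > correct_scores[:, None], axis=1)
    return float(np.mean(1.0 / ranks))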
Example #5
import sys

import prepare_data
import shared
import train_model
import utils
from sklearn.neighbors import NearestNeighbors

# Ad-hoc search script: take a natural-language query from the command line
# and print the URLs of its nearest code snippets in every language.
query = sys.argv[1]

for language in shared.LANGUAGES:
    print(f'Evaluating {language}')

    evaluation_docs = [{
        'url': doc['url'],
        'identifier': doc['identifier']
    } for doc in utils.load_cached_docs(language, 'evaluation')]
    print('Read the docs')

    code_embeddings = utils.load_cached_code_embeddings(language)

    query_seqs = prepare_data.pad_encode_seqs(
        prepare_data.preprocess_query_tokens,
        (line.split(' ') for line in [query]), shared.QUERY_MAX_SEQ_LENGTH,
        language, 'query')

    model = utils.load_cached_model_weights(language, train_model.get_model())
    query_embedding_predictor = train_model.get_query_embedding_predictor(
        model)
    query_embeddings = query_embedding_predictor.predict(query_seqs)

    # TODO: Replace with annoy index
    nn = NearestNeighbors(n_neighbors=3, metric='cosine', n_jobs=-1)
    nn.fit(code_embeddings)
    _, nearest_neighbor_indices = nn.kneighbors(query_embeddings)

    for query_nearest_code_idx in nearest_neighbor_indices[0, :]:
        print(evaluation_docs[query_nearest_code_idx]['url'])
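
The TODO in this script points at the same replacement as in Example #1: instead of fitting sklearn's NearestNeighbors on every run, the cached Annoy index that Example #3 loads could be queried directly. A minimal drop-in sketch for the block inside the per-language loop, assuming the index items were added in the same order as utils.load_cached_docs(language, 'evaluation'):

# Hypothetical replacement for the NearestNeighbors block in the loop above;
# `language`, `query_embeddings` and `evaluation_docs` are the loop's variables.
ann = utils.load_cached_ann(language)
for query_nearest_code_idx in ann.get_nns_by_vector(query_embeddings[0], 3):
    print(evaluation_docs[query_nearest_code_idx]['url'])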