def test_vectorizer_query(self):
     """Check selected entries of the BM25 query vector for a fixed query.

     Builds a ``BM25Vectorizer`` over ``self.index_path`` with ``min_df=5``
     and asserts the values found in row 0 at three column indices.
     """
     query_vector = BM25Vectorizer(self.index_path, min_df=5).get_query_vector(
         'this is a query to test query vector')

     # column index -> expected value in row 0 of the query vector
     expected_by_column = {2703: 2, 3078: 1, 3204: 1}
     for column, expected in expected_by_column.items():
         self.assertEqual(query_vector[0, column], expected)
 def test_bm25_vectorizer_train(self):
     """Fit logistic regression on BM25 vectors and check its probabilities.

     Four labelled documents are vectorized and used to fit a default
     ``LogisticRegression``; the class probabilities predicted for two
     further documents are compared with reference values to 8 places.
     """
     bm25 = BM25Vectorizer(self.index_path, min_df=5)

     # (document id, label) pairs used for fitting
     labelled_docs = [
         ('CACM-0239', 1),
         ('CACM-0440', 1),
         ('CACM-3168', 0),
         ('CACM-3169', 0),
     ]
     x_train = bm25.get_vectors([doc_id for doc_id, _ in labelled_docs])
     y_train = [label for _, label in labelled_docs]
     x_test = bm25.get_vectors(['CACM-0634', 'CACM-3134'])

     model = LogisticRegression()
     model.fit(x_train, y_train)
     probabilities = model.predict_proba(x_test)

     # One row per test document, one entry per predict_proba column.
     expected = [
         (0.4629749, 0.5370251),
         (0.48288416, 0.51711584),
     ]
     for doc_idx, class_probs in enumerate(expected):
         for class_idx, want in enumerate(class_probs):
             self.assertAlmostEqual(
                 want, probabilities[doc_idx][class_idx], places=8)
def rank(new_qrels: str, base: str, tmp_base: str, qrels_path: str, lucene_index_path: str,
         R: List[int], score_path: str, alpha: float, clf_type: ClassifierType,
         vec_type: VectorizerType, tag: str):
    """Rerank a base run with a per-topic classifier and record its scores.

    For every topic returned by ``get_topics_from_qrun(base)`` a fresh
    classifier is fitted on the vectors of the training documents returned by
    ``get_X_Y_from_qrels_by_topic(qrels_path, topic, R)``. The documents of
    the base run for that topic are then scored with ``predict_proba``
    (column 1), both score lists are passed through ``normalize`` and
    combined as ``alpha * classifier + (1 - alpha) * base``. Results are
    written as ``{topic} Q0 {doc_id} {rank} {score} {tag}`` lines. Topics
    without training documents are copied over from the base run via
    ``get_lines_by_topic`` after all reranked topics.

    Finally ``evaluate(new_qrels, output_path)`` is called and its two
    results are stored as ``{"map": ..., "ndcg": ...}`` JSON in
    ``score_path``.

    Args:
        new_qrels: Qrels argument forwarded to ``evaluate``.
        base: Path of the base run; its last ``/`` component names the
            output file. ``.specter`` is appended for the SPECTER vectorizer.
        tmp_base: Directory under which the ``runs`` folder is placed.
        qrels_path: Qrels used to obtain training documents and labels;
            ``.specter`` is appended for the SPECTER vectorizer.
        lucene_index_path: Index path handed to the TF-IDF / BM25 vectorizers.
        R: Forwarded to ``get_X_Y_from_qrels_by_topic`` (presumably the
            relevance grades to train on -- confirm against that helper);
            its items are also concatenated into the output file name.
        score_path: File that receives the JSON with the evaluation scores.
        alpha: Weight of the classifier score in the interpolation.
        clf_type: Classifier to use (NB, LR or SVM).
        vec_type: Vectorizer to use (TFIDF, SPECTER or BM25).
        tag: Run tag written as the last column of every output line.

    Raises:
        SystemExit: With status 1 if ``vec_type`` or ``clf_type`` is not
            supported. Both are checked before the output file is opened.
    """
    # build output path
    base_str = base.split('/')[-1]
    R_str = ''.join(str(i) for i in R)
    run_name = (f'{base_str}.{ClassifierStr[clf_type]}.{VectorizerStr[vec_type]}'
                f'.R{R_str}.A{alpha}.txt')
    runs_dir = f'{tmp_base}/runs'
    # Unless the working directory already ends with 'integrations', the run
    # is written below 'integrations/'.
    if not os.getcwd().endswith('integrations'):
        runs_dir = f'integrations/{runs_dir}'
    output_path = f'{runs_dir}/{run_name}'
    print(f'Output -> {output_path}')
    # Create the directory the run is actually written to; a plain
    # 'mkdir -p runs' only matches it when tmp_base resolves to the cwd.
    os.makedirs(runs_dir, exist_ok=True)

    if vec_type == VectorizerType.TFIDF:
        vectorizer = TfidfVectorizer(lucene_index_path, min_df=5)
    elif vec_type == VectorizerType.SPECTER:
        base += '.specter'
        qrels_path += '.specter'
        vectorizer = SpecterVectorizer()
    elif vec_type == VectorizerType.BM25:
        vectorizer = BM25Vectorizer(lucene_index_path, min_df=5)
    else:
        print('invalid vectorizer')
        raise SystemExit(1)

    # Pick the classifier factory up front so an unsupported type fails
    # before the output file is created or truncated.
    if clf_type == ClassifierType.NB:
        make_classifier = MultinomialNB
    elif clf_type == ClassifierType.LR:
        make_classifier = LogisticRegression
    elif clf_type == ClassifierType.SVM:
        def make_classifier():
            return SVC(kernel='linear', probability=True)
    else:
        print('ClassifierType not supported')
        raise SystemExit(1)

    skipped_topics = set()
    # The context manager guarantees the run file is flushed and closed
    # before evaluation, and also when an error interrupts the loop.
    with open(output_path, 'w') as run_file:
        for topic in get_topics_from_qrun(base):
            train_docs, train_labels = get_X_Y_from_qrels_by_topic(qrels_path, topic, R)
            if len(train_docs) == 0:
                print(f'[topic][{topic}] skipped')
                skipped_topics.add(topic)
                continue

            print(f'[topic][{topic}] eligible train docs {len(train_docs)}')

            # A new, unfitted classifier per topic.
            clf = make_classifier()
            clf.fit(vectorizer.get_vectors(train_docs), train_labels)

            test_docs, base_scores = get_docs_from_qrun_by_topic(base, topic)
            print(f'[topic][{topic}] eligible test docs {len(test_docs)}')
            test_vectors = vectorizer.get_vectors(test_docs)

            # Column 1 of predict_proba is used as the classifier score.
            rank_scores = [row[1] for row in clf.predict_proba(test_vectors)]

            rank_scores = normalize(rank_scores)
            base_scores = normalize(base_scores)

            preds = [a * alpha + b * (1 - alpha) for a, b in zip(rank_scores, base_scores)]
            # NOTE(review): sort_dual_list presumably returns best-first order -- confirm.
            preds, docs = sort_dual_list(preds, test_docs)

            for position, (score, doc_id) in enumerate(zip(preds, docs), start=1):
                run_file.write(f'{topic} Q0 {doc_id} {position} {score} {tag}\n')

        # Topics without training data keep their base-run lines.
        for topic in sort_str_topics_list(list(skipped_topics)):
            lines = get_lines_by_topic(base, topic, tag)
            print(f'Copying over skipped topic {topic} with {len(lines)} lines')
            for line in lines:
                run_file.write(f'{line}\n')

    map_score, ndcg_score = evaluate(new_qrels, output_path)
    with open(score_path, 'w') as outfile:
        json.dump({'map': map_score, 'ndcg': ndcg_score}, outfile)
 def test_bm25_vectorizer(self):
     """Check two entries of the un-normalized BM25 document vectors.

     Vectorizes two documents with ``norm=None`` and compares one cell per
     row against reference values to 8 decimal places.
     """
     doc_vectors = BM25Vectorizer(self.index_path, min_df=5).get_vectors(
         ['CACM-0239', 'CACM-0440'], norm=None)

     # ((row, column), expected value)
     expected_cells = (
         ((0, 190), 1.7513844966888428),
         ((1, 391), 0.03765463829040527),
     )
     for cell, expected in expected_cells:
         self.assertAlmostEqual(doc_vectors[cell], expected, places=8)