Example no. 1
import gensim  # provides KeyedVectors for loading the pre-trained word2vec model
# file_processor and word2vec are assumed project-local helper modules (not shown here)


def run_validation(validation_file_path, w2vec_model_file_path, nn_model, result_path, threshold,
                   vector_dimension):
    # load the pre-trained word2vec embeddings
    w2vec_model = gensim.models.KeyedVectors.load_word2vec_format(w2vec_model_file_path)

    lines = file_processor.read_lines(validation_file_path)
    lines.pop(0)  # removing column header
    qid_queries = get_queries(lines)
    counter = 0
    total_queries = len(qid_queries)
    for qid in qid_queries:
        counter += 1
        print("Started->", counter)
        query = qid_queries[qid]
        pid_passage_relevance_tuple = file_processor.get_candidate_passages_relevance_by_qid(lines, qid)
        scored_passage = []
        for ppr in pid_passage_relevance_tuple:
            passage = ppr["passage"]
            relevancy = float(ppr["relevancy"])
            feature_vector = word2vec.build_feature_vector(query, passage, w2vec_model, vector_dimension)
            score = get_probability_score(nn_model, feature_vector)
            pre_relevancy = 1.0 if score >= threshold else 0.0
            scored_passage.append(
                {"qid": qid, "pid": ppr["pid"], "rank": 0, "score": score, "relevancy": relevancy,
                 "pre_relevancy": pre_relevancy, "assigment_name": "A1", "algorithm_name": "NN"})
        sorted_passage = set_rank_by_score(scored_passage)
        file_processor.write_scored_passage(sorted_passage, result_path)
        # stop after the top 250 queries
        if counter >= 250:
            break
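
The ranking helper set_rank_by_score is project code and is not shown in this example. A minimal sketch, assuming it only orders the scored passages by descending model score and fills in the 1-based rank field:

def set_rank_by_score(scored_passage):
    # hypothetical sketch: sort by score (highest first) and assign 1-based ranks
    ranked = sorted(scored_passage, key=lambda p: p["score"], reverse=True)
    for position, passage in enumerate(ranked, start=1):
        passage["rank"] = position
    return ranked
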
Example no. 2
def get_scored_passage_from_result(result_file_path):
    # parse the tab-separated result file produced by file_processor.write_scored_passage
    result_lines = file_processor.read_lines(result_file_path)
    scored_passage = []
    for line in result_lines:
        elements = line.split("\t")
        qid = elements[0]
        pid = elements[2]
        rank = int(elements[3])
        score = float(elements[4])
        relevancy = float(elements[6])
        scored_passage.append({
            "qid": qid,
            "pid": pid,
            "rank": rank,
            "score": score,
            "relevancy": relevancy
        })
    return scored_passage
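
A small usage sketch for the parser above: group the parsed rows by query and count the relevant passages in the top 10 ranks. The result path is an assumed placeholder.

from collections import defaultdict

scored = get_scored_passage_from_result("../result/nn_result.tsv")  # assumed path
by_query = defaultdict(list)
for row in scored:
    by_query[row["qid"]].append(row)

for qid, rows in by_query.items():
    top = sorted(rows, key=lambda r: r["rank"])[:10]
    hits = sum(1 for r in top if r["relevancy"] > 0)
    print(qid, "relevant in top 10:", hits, "of", len(top))
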
Example no. 3
def run_bm25_model(validation_file_path, result_file_path):
    # score each query's candidate passages with BM25 and write the ranked results
    lines = file_processor.read_lines(validation_file_path)
    lines.pop(0)  # removing column header
    qid_queries = get_queries(lines)
    counter = 0
    total_queries = len(qid_queries)
    for qid in qid_queries:
        counter += 1
        query = qid_queries[qid]
        pid_passage_relevance_tuple = file_processor.get_candidate_passages_relevance_by_qid(
            lines, qid)
        all_pid_passage = get_pid_passage_pair(pid_passage_relevance_tuple)
        relevant_pid_passage = get_relevant_pid_passage_pair(
            pid_passage_relevance_tuple)
        scored_passage = get_scored_passage_by_query(qid, query,
                                                     all_pid_passage,
                                                     relevant_pid_passage)
        print("Completed->  " + str(counter) + "  -> out of: " +
              str(total_queries))
        file_processor.write_ranked_passage(scored_passage, result_file_path)
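
The helper get_scored_passage_by_query is project code and not shown; a minimal BM25 sketch, assuming the candidates arrive as a dict mapping pid to passage text, whitespace tokenisation, and the common defaults k1 = 1.2, b = 0.75:

import math

def bm25_scores_sketch(query, pid_passage, k1=1.2, b=0.75):
    # hypothetical sketch: classic BM25 over one query's candidate passages
    docs = {pid: passage.lower().split() for pid, passage in pid_passage.items()}
    n_docs = len(docs)
    avg_len = sum(len(tokens) for tokens in docs.values()) / n_docs
    scores = {}
    for pid, tokens in docs.items():
        score = 0.0
        for term in set(query.lower().split()):
            df = sum(1 for doc in docs.values() if term in doc)
            if df == 0:
                continue
            idf = math.log((n_docs - df + 0.5) / (df + 0.5) + 1)
            tf = tokens.count(term)
            score += idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * len(tokens) / avg_len))
        scores[pid] = score
    return scores
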
Example no. 4
import file_util.file_processor as file_processor
import index.index_builder as index_builder

file_path = "../dataset/candidate_passages_top1000.tsv"

# read the candidate passage collection
lines = file_processor.read_lines(file_path)

# candidate passages for query id 1113437
pid_passage_pair = file_processor.get_candidate_passages_by_qid(
    lines, "1113437")

# build an inverted index over those candidate passages
index = index_builder.build_inverted_index(pid_passage_pair)
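
index_builder.build_inverted_index is project code whose return type is not shown; a minimal sketch, assuming pid_passage_pair maps each pid to its passage text and the index maps each term to the set of pids containing it:

def build_inverted_index_sketch(pid_passage_pair):
    # hypothetical sketch: map each lower-cased whitespace token to the pids that contain it
    inverted_index = {}
    for pid, passage in pid_passage_pair.items():
        for term in set(passage.lower().split()):
            inverted_index.setdefault(term, set()).add(pid)
    return inverted_index

# posting-list lookup for an arbitrary example term:
# pids_with_term = build_inverted_index_sketch(pid_passage_pair).get("coffee", set())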