Example #1
def main():
    qp = QueryParser(filename="../text/domain_keywords_20150617.txt")
    cp = CorpusParser(filename=candidate_text_path)  # candidate_text_path: defined elsewhere in the module
    qp.parse()
    queries = qp.get_queries()
    cp.parse()
    corpus = cp.get_corpus()
    proc = QueryProcessor(queries, corpus)
    results = proc.run()
    qid = 0
    topn_idx = []
    for result in results:
        sorted_x = sorted(result.items(),
                          key=operator.itemgetter(1),
                          reverse=True)
        index = 0
        for i in sorted_x:
            tmp = (qid, i[0], i[1], texts[int(i[0]) - 1])  # texts: defined elsewhere in the module
            # print('{:>1}\t{:>4}\t{:>12}\t{}'.format(*tmp))
            index += 1
            topn_idx.append(i[0])
        qid += 1

    labels = read_candidate_label()
    for topn in range(10, 101, 10):
        precision(topn_idx, labels, topn=topn)
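Example #1 depends on precision and read_candidate_label helpers that are not shown. A minimal sketch of what a compatible precision@N helper could look like, assuming topn_idx is a ranked list of document ids and labels is a set (or dict) of ids judged relevant; both structures are assumptions, since neither appears above:

def precision(ranked_ids, labels, topn=10):
    # fraction of the first `topn` ranked ids that are judged relevant
    hits = sum(1 for doc_id in ranked_ids[:topn] if doc_id in labels)
    score = hits / float(topn)
    print('precision@{} = {:.4f}'.format(topn, score))
    return score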
Example #2
def main():
    qp = QueryParser(filename=r'..\text\query.txt')
    cp = CorpusParser(filename=r'..\text\corpus.txt')
    qp.parse()
    queries = qp.get_queries()
    cp.parse()
    corpus = cp.get_corpus()
    proc = QueryProcessor(queries, corpus)

    for k1_value in range(12, 18):
        k1 = k1_value / 10.0
        for k2 in range(100, 600, 100):
            for b_value in range(3, 8):
                b = b_value / 10.0
                print('k1={} k2={} b={}'.format(k1, k2, b))
                result_path = 'E:\\GridSearchBM25Component\\' + str(
                    k1) + ' ' + str(k2) + ' ' + str(b) + ' ' + 'Result.txt'
                out = open(result_path, 'w')

                results = proc.run(k1, k2, b)
                qid = 0
                for result in results:
                    sorted_x = sorted(result.items(),
                                      key=operator.itemgetter(1),
                                      reverse=True)
                    index = 0
                    # the top-K cutoff can be set by slicing sorted_x here
                    for i in sorted_x:
                        out.write(
                            str(qid) + '\t' + str(index) + '\t' + i[0] + '\t' +
                            str(i[1]) + '\n')
                        out.flush()
                        index += 1
                    qid += 1
                out.close()
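The k1, k2 and b values swept by this grid search are the parameters of the classic Robertson BM25 weighting. For orientation, the per-term score in that formulation looks like the sketch below; this is the textbook version, not necessarily what this QueryProcessor implements internally:

import math

def bm25_term_score(n, f, qf, N, dl, avdl, k1=1.2, k2=100, b=0.75):
    # n: docs containing the term, f: term frequency in the document,
    # qf: term frequency in the query, N: corpus size,
    # dl: document length, avdl: average document length
    K = k1 * ((1 - b) + b * dl / avdl)
    idf = math.log((N - n + 0.5) / (n + 0.5))
    return idf * ((k1 + 1) * f / (K + f)) * ((k2 + 1) * qf / (k2 + qf))

Here b controls document-length normalisation, k1 saturates the document term frequency, and k2 saturates the query term frequency, which is why those three are the natural grid-search axes.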
Example #3
def main():
    qp = QueryParser(
        filename='C:\\Users\\jrlimingyang\\PycharmProjects\\chatbot-version2\\cache\\query.txt')
    cp = CorpusParser(
        filename='C:\\Users\\jrlimingyang\\PycharmProjects\\chatbot-version2\\cache\\corpus.txt')
    qp.parse()
    queries = qp.get_queries()
    cp.parse()
    corpus = cp.get_corpus()
    proc = QueryProcessor(queries, corpus)
    results = proc.run()
    qid = 0
    for result in results:
        sorted_x = sorted(result.items(),
                          key=operator.itemgetter(1),
                          reverse=True)
        index = 0
        for i in sorted_x[:10]:
            tmp = (qid, qid, i[0], index, i[1])
            print('{:>1}\tQ{:>1}\t{:>4}\t{:>2}\t{:>12}\tL-BM25'.format(*tmp))
            index += 1
        qid += 1
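The tab-separated columns printed here (query id, a Q-prefixed query field, document id, rank index, score, run tag) follow the TREC run-file convention, which is why most of the examples below emit the same six fields.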
Example #4
def main():
    #qp = QueryParser(filename='../text/queries.txt')
    qp = QueryParser(filename='../text/query_documents2.txt')
    #cp = CorpusParser(filename='../text/corpus.txt')
    #cp = CorpusParser(filename='../text/candidate_methodbody_documents_only.txt')
    cp = CorpusParser(
        filename='../text/candidate_methodbody_apitext_documents_only.txt')
    qp.parse()
    queries = qp.get_queries()
    cp.parse()
    corpus = cp.get_corpus()
    proc = QueryProcessor(queries, corpus)
    results = proc.run()
    qid = 0
    print(len(results))
    for result in results:
        #print result
        #sorted_x = sorted(result.iteritems(), key=operator.itemgetter(1))
        sorted_x = sorted(result.items(),
                          key=operator.itemgetter(1),
                          reverse=True)
        #sorted_x.reverse()
        index = 0
        print(sorted_x)
        print(len(sorted_x))
        for i in sorted_x[:100]:
            tmp = (qid, i[0], index, i[1])
            print('{:>1}\tQ0\t{:>4}\t{:>2}\t{:>12}\tNH-BM25'.format(*tmp))
            index += 1
        qid += 1
Example #5
def main():
    qp = QueryParser(filename='../text/quer.txt')
    qp.parse()
    cp = CorpusParser(filename='../text/corps.txt')
    queries = qp.get_queries()
    cp.parse()
    corpus = cp.get_corpus()
    proc = QueryProcessor(queries, corpus)
    results, originalResults = proc.run()
    qid = 0
    total = 0  # denominator counts only queries that were actually scored
    correct = 0
    for result in results:
        sorted_x = sorted(result.items(),
                          key=operator.itemgetter(1),
                          reverse=True)
        index = 0
        #print sorted_x[0]
        try:
            actualOutput = sorted_x[0][0]
            expected = originalResults[qid]
            actualOutput = actualOutput.split("-")[0].strip()
            expected = expected.split("-")[0].strip()
            #print actualOutput,expected
            if actualOutput == expected:
                correct += 1
            total += 1
        except Exception as e:
            pass

        qid += 1

    print("Accuracy"),
    print((correct / float(total)) * 100)
Example #6
def main():
    qp = QueryParser(filename=r'..\text\query.txt')
    cp = CorpusParser(filename=r'..\text\corpus.txt')
    qp.parse()
    queries = qp.get_queries()
    cp.parse()
    corpus = cp.get_corpus()
    proc = QueryProcessor(queries, corpus)

    k1 = 1.2
    k2 = 100
    b = 0.5
    result_path = r'.\Result.txt'
    out = open(result_path, 'w')

    results = proc.run(k1, k2, b)
    qid = 0
    for result in results:
        sorted_x = sorted(result.items(),
                          key=operator.itemgetter(1),
                          reverse=True)
        index = 0
        # the top-K cutoff can be set by slicing sorted_x here
        for i in sorted_x:
            out.write(
                str(qid) + '\t' + str(index) + '\t' + i[0] + '\t' + str(i[1]) +
                '\n')
            out.flush()
            index += 1
        qid += 1
        print(qid)
    out.close()
Example #7
def main():
    with open('../pickles/dev_question_candidates.pkl', 'rb') as pickle_in:
        question_candidates = pickle.load(pickle_in)

    # qp = QueryParser(filename='../text/queries.txt')
    # cp = CorpusParser(filename='../text/corpus.txt')
    qp = QueryParser(filename='../text/queries-fiqa.txt')
    cp = CorpusParser(filename='../text/corpus-fiqa.txt')
    qp.parse(isCustomFormat=True)
    queries = qp.get_queries()
    cp.parse()
    corpus = cp.get_corpus()
    proc = QueryProcessor(queries, corpus, dev_candidates=question_candidates)
    results = proc.run(isCustomFormat=True)
    # qid = 0

    result_dict = dict()
    for result, qid in results:
        sorted_x = sorted(result.items(),
                          key=operator.itemgetter(1),
                          reverse=True)
        # sorted_x.reverse()
        index = 0
        result_dict[qid] = []
        for i in sorted_x[:100]:
            tmp = (qid, i[0], index, i[1])
            print('{:>1}\tQ0\t{:>6}\t{:>2}\t{:>12}\tNH-BM25'.format(*tmp))
            index += 1
            result_dict[qid].append(i[0])
        # qid += 1
    print(result_dict)
    with open('dev_candidate_after_bm25', 'wb') as pickle_out:
        pickle.dump(result_dict, pickle_out)
Example #8
def main(queryPath, corpusPath, resultPath, k1Value, k2Value, bValue):
    qp = QueryParser(queryPath)
    cp = CorpusParser(corpusPath)
    qp.parse()
    queries = qp.get_queries()
    cp.parse()
    corpus = cp.get_corpus()
    proc = QueryProcessor(queries, corpus)

    k1 = k1Value
    k2 = k2Value
    b = bValue
    result_path = resultPath
    out = open(result_path, 'w')

    results = proc.run(k1, k2, b)
    qid = 0
    for result in results:
        sorted_x = sorted(result.items(),
                          key=operator.itemgetter(1),
                          reverse=True)
        index = 0
        # the top-K cutoff can be set by slicing sorted_x here
        for i in sorted_x:
            out.write(str(qid) + '\t' + str(index) + '\t' + i[0] + '\t' +
                      str(i[1]) + '\n')
            out.flush()
            index += 1
        qid += 1
        print(qid)
    out.close()
Example #9
def bm25_sort(queries, corpus):
    proc = QueryProcessor(queries, corpus)
    results = proc.run()
    for result in results:
        sorted_x = sorted(result.items(),
                          key=operator.itemgetter(1),
                          reverse=True)
        # NOTE: this return sits inside the loop, so only the
        # first query's ranking is ever produced
        return dict(sorted_x)
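Because the return sits inside the loop, bm25_sort only ranks the first query. A variant that keeps every query's ranking could look like this; it is hypothetical and assumes the same operator import used throughout these examples:

def bm25_sort_all(queries, corpus):
    proc = QueryProcessor(queries, corpus)
    # one score-sorted dict per query, in query order
    return [dict(sorted(result.items(),
                        key=operator.itemgetter(1),
                        reverse=True))
            for result in proc.run()]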
Example #10
def main(passage_path, query_path, output_path, stop_words_path, w2v_path):
    
    # read the query file and build the query table
    qp = QueryParser(query_path, w2v_path, stop_words_path)
    qp.parse()
    queries = qp.get_queries()  # yields qid -> query
    print("Read queries end.")
    # read the passage file
    cp = CorpusParser(passage_path, stop_words_path)
    cp.parse()
    corpus = cp.get_corpus()  # yields qid -> {pid -> passage}
    print("Build passage pools end.")
    print("Build passage pools end.")
    
    # score each query against its own candidate pool with BM25
    results = {}
    i = 1
    for qid in queries:
        print("No. %s query has been scored..." % i)
        proc = QueryProcessor(queries[qid], corpus[qid])
        results[qid] = proc.run_query()  # returns a dict of pid -> score
        i += 1

    # sort, then write out
    res_string = ""
    for qid in results:
        # sort each result set by score
        sorted_res = sorted(results[qid].items(),
                            key=lambda kv: (kv[1], kv[0]),
                            reverse=True)
        rank = 1
        last_score = 0  # score of the last ranked passage
        for i in sorted_res:  # (pid, score) pairs
            res_string += std_out(qid, i[0], rank, i[1])
            rank += 1
        # walk all pids; any passage not yet written is appended at the end
        if len(sorted_res) > 0:  # guard against a pool so small (e.g. 5) that nothing matched
            last_score = sorted_res[len(sorted_res) - 1][1]
            pids = np.array(sorted_res)[:, 0]
            for pid in corpus[qid]:
                if pid not in pids:
                    last_score -= 0.1  # each unmatched passage drops another 0.1
                    res_string += std_out(qid, pid, rank, last_score)
                    rank += 1
        else:
            for pid in corpus[qid]:
                last_score -= 0.1  # each unmatched passage drops another 0.1
                res_string += std_out(qid, pid, rank, last_score)
                rank += 1

    with open(output_path, "w") as f:
        f.write(res_string)
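Example #10 formats each result line through a std_out helper that is not shown. Judging from the call sites it takes (qid, pid, rank, score); a hypothetical TREC-run-style implementation might be:

def std_out(qid, pid, rank, score, tag='BM25'):
    # one run-file line: qid Q0 pid rank score tag
    return '{}\tQ0\t{}\t{}\t{:.4f}\t{}\n'.format(qid, pid, rank, score, tag)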
Example #11
def start_bm25():
    # print "i was HERE!!!"
    thresh = open('/home/deep/TwitterTrend/text/Threshold.txt')
    th = []
    for each in thresh:
        th.append(each.strip())

    prof = open('/home/deep/TwitterTrend/text/ProfileName.txt')
    pro = []
    for each in prof:
        pro.append(each.strip())

    qp = QueryParser(filename='text/query.txt')
    cp = CorpusParser(filename='corpus111.txt')
    qp.parse()
    queries = qp.get_queries()
    #print "Q is ",queries
    cp.parse()
    corpus = cp.get_corpus()
    #print "c is ", corpus
    proc = QueryProcessor(queries, corpus)
    results = proc.run()
    print(results)
    qid = 1
    bm25_output_list = []
    for result in results:
        threshold = th[qid - 1]
        profile_name = pro[qid - 1]
        print(threshold)
        print(profile_name)
        sorted_x = sorted(result.items(),
                          key=operator.itemgetter(1),
                          reverse=True)
        index = 0
        for i in sorted_x[:100]:
            tweet_text = cp.get_text(i[0])
            tmp = (qid, i[0], index, i[1])
            # print str(tmp)
            #print "yaay"
            text = '{:>1}, {:>4}, {:>2}, {:>12}'.format(
                *tmp) + ", " + tweet_text
            flag = threshold_check(i[1], threshold)
            if flag:
                push.push(profile_name, i[1], tweet_text)
                bm25_output_list.append(text)
            index += 1
        qid += 1

    with open('ss_bm25_output.txt', 'a') as f:
        for item in bm25_output_list:
            f.write(item + ' \n')
    rparser.seperate_store_result()
Example #12
def main():
    qp = QueryParser(filename='../text/queries.txt')
    cp = CorpusParser(filename='../text/corpus.txt')
    qp.parse()
    queries = qp.get_queries()
    cp.parse()
    corpus = cp.get_corpus()
    proc = QueryProcessor(queries, corpus)
    results = proc.run()
    qid = 0
    for result in results:
        sorted_x = sorted(result.items(),
                          key=operator.itemgetter(1),
                          reverse=True)
        index = 0
        for i in sorted_x[:100]:
            tmp = (qid, i[0], index, i[1])
            print('{:>1}\tQ0\t{:>4}\t{:>2}\t{:>12}\tNH-BM25'.format(*tmp))
            index += 1
        qid += 1
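Most of the examples share this exact skeleton: parse the queries, parse the corpus, score with QueryProcessor, then sort each per-query result dict by score. The parsers themselves are never shown; a minimal sketch compatible with the calls above, assuming one whitespace-tokenised query per line (an assumption, since the file format is not given), would be:

class QueryParser:
    def __init__(self, filename):
        self.filename = filename
        self.queries = []

    def parse(self):
        # one query per line, tokenised on whitespace
        with open(self.filename) as f:
            self.queries = [line.split() for line in f if line.strip()]

    def get_queries(self):
        return self.queries

A CorpusParser compatible with these examples would follow the same parse()/get_corpus() pattern over the document file.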
Example #13
def main():
    qp = QueryParser(filename='./data/queries.txt')
    cp = CorpusParser(filename='./data/corpus.txt')
    qp.parse()
    queries = qp.get_queries()
    cp.parse()
    corpus = cp.get_corpus()
    proc = QueryProcessor(queries, corpus)
    results = proc.run()
    qid = 0
    for result in results:
        sorted_x = sorted(result.items(),
                          key=operator.itemgetter(1),
                          reverse=True)
        index = 0
        for i in sorted_x[:10]:
            tmp = (qid, i[0], index, i[1])
            print('{:>1}\tQ0\t{:>4}\t{:>2}\t{:>12}\tL-BM25'.format(*tmp))
            index += 1
        qid += 1
Example #14
def main():
    qp = QueryParser('../text/queries.txt')
    cp = CorpusParser('../text/ptwiki-v2.trec.xml', '../text/stopwords.txt')
    queries = qp.queries
    corpus = cp.corpus
    proc = QueryProcessor(queries, corpus)
    results = proc.run()
    qid = 0
    for result in results:
        sorted_x = sorted(result.items(),
                          key=operator.itemgetter(1),
                          reverse=True)
        index = 0
        print("Query: {}".format(' '.join(queries[qid])))
        for i in sorted_x[:10]:
            doc_num = i[0]
            doc_bm25_score = i[1]
            tmp = (index, doc_num, doc_bm25_score)
            print('{:>4}\t{:>2}\t{:>12}'.format(*tmp))
            index += 1
        qid += 1
Example #16
def main():

    course_index_chinese_dict = chinese_course_dict('../text/course.csv')
    # print(course_index_chinese_dict)

    qp = QueryParser(filename='../text/job_phrase.txt')
    # qp = QueryParser(filename='../text/mini_job_phrase.txt')
    qp.parse()
    queries = qp.get_queries()
    print(len(queries))
    # print(queries)
    # print(len(queries))

    cp = CorpusParser(filename='../text/course_phrase.txt')
    cp.parse()
    corpus = cp.get_corpus()
    # print(corpus)
    proc = QueryProcessor(queries, corpus)
    # print(proc.index.index)
    # print(proc.dlt.table.keys())

    results = proc.run()
    # print(results[0][725])
    print(len(results))

    # #top 60=1
    # with open('bm25_weaksupervision.csv', 'w', encoding='utf-8') as writer:
    #     for result in results:
    #         sorted_x = sorted(result.items(), key=operator.itemgetter(1))
    #         sorted_x.reverse()
    #         for i in sorted_x[:60]:
    #             writer.write(str(i[0])+',')
    #         writer.write('\n')

    # evaluate
    hr_20, ndcg_20, hr_10, ndcg_10, hr_5, ndcg_5, f_map, f_mrr = evaluate_ranking_output(
        test_filename='../text/job_course_99neg_1pos.txt',
        ranking_results=results)
    print(
        'hr5 = %.4f, ndcg5 = %.4f, hr20 = %.4f, ndcg20 = %.4f, hr10 = %.4f, ndcg10 = %.4f, map = %.4f, mrr = %.4f'
        % (hr_5, ndcg_5, hr_20, ndcg_20, hr_10, ndcg_10, f_map, f_mrr))
Example #17
def rankpapers():
    qp = QueryParser(filename='../data/queriesProtonbeam.txt')
    cp = CorpusParser(filename='../data/CorpusProtonbeam.txt')
    qp.parse()
    queries = qp.get_queries()

    cp.parse()
    corpus = cp.get_corpus()

    proc = QueryProcessor(queries, corpus)
    results = proc.run()
    qid = 0
    for result in results:
        sorted_x = sorted(result.items(),
                          key=operator.itemgetter(1),
                          reverse=True)

        index = 0
        # maxScore=sorted_x[0][1]
        for i in sorted_x:
            tmp = (qid, i[0], index, i[1])
            Orpapers[i[0]] = i[1]
            print('{:>1}\tQ0\t{:>4}\t{:>2}\t{:>12}\tNH-BM25'.format(*tmp))
            index += 1
        qid += 1
Example #18
def eval(index_file, query_text, qrels, n):
    qrys = cranqry.loadCranQry(query_text)
    queries = {}
    for q in qrys:
        queries[q] = qrys[q].text
    query_ids = list(queries.keys())
    query_ids.sort()
    query_ids_ints = []
    for k in range(0, len(query_ids)):
        query_ids_ints.append(int(query_ids[k]))
    set1 = set()
    while len(set1) != n:  # generating n random queries
        set1.add(random.choice(query_ids_ints))
    selected_queries = list(set1)
    docs = set()
    qrels = {}  # NOTE: shadows the qrels argument; relevance is re-read below

    f = open("qrels.text", "r")  # parsing relevant queries (qrels.text)
    l = f.readline()
    while l:
        j = l.split(" ")
        if query_ids_ints[int(j[0]) - 1] in qrels.keys():
            qrels[query_ids_ints[int(j[0]) - 1]].append(int(j[1]))
        else:
            qrels[query_ids_ints[int(j[0]) - 1]] = [int(j[1])]
        l = f.readline()
    cranqryobj = cranqry.loadCranQry(query_text)
    dict_query = {}
    for q in cranqryobj:
        # match queries in query.text with qrels.text
        dict_query[int(q)] = cranqryobj[q].text
    indexObject = index.InvertedIndex()
    items = indexObject.load(index_file)
    vector_ndcg_score = {}
    vector_score_dict = {}
    for q in selected_queries:
        print(q)
        query_raw = dict_query[q]
        QPobj = QueryProcessor(query_raw, items, index_file)
        QPobj.preprocessing()
        # fetch the first 10 documents for the query using the vector model
        result_list = QPobj.vectorQuery(10)
        # fetch documents for the query using the boolean model
        boolean_result_list = QPobj.booleanQuery()
        print("Boolean query result : ", boolean_result_list)
        ndcg_boolean = 0
        truth_list = qrels[q]
        boolean_output_list = []
        rank_doc_list = list(map(lambda x: int(x[0]), result_list))
        # relevant documents for the query
        print("Relevant documents for this query : ", truth_list)
        # document result list for the vector model
        print("Vector model result : ", rank_doc_list)
        vector_score_list = []
        for id in boolean_result_list:  # calculating the predicted scores for boolean model
            if int(id) in truth_list:
                boolean_output_list.append(1)
            else:
                boolean_output_list.append(0)
        boolean_score_list = []
        if len(boolean_score_list) < 10:
            boolean_score_list = boolean_output_list
            while len(boolean_score_list) != 10:
                boolean_score_list.append(0)
        elif len(boolean_score_list) > 10:
            for i in range(0, 10):
                boolean_score_list[i] = boolean_output_list[i]
        for id in rank_doc_list:  # calculating the predicted scores for vector model

            if id in truth_list:
                vector_score_list.append(1)
            else:
                vector_score_list.append(0)
        vector_score_dict[q] = vector_score_list
        truth_score_list = []
        # calculate the ground-truth scores for the vector model
        for i in range(0, len(vector_score_list)):
            truth_score_list.append(vector_score_list[i])
        truth_score_list.sort(reverse=True)

        boolean_truth_score_list = []
        # calculate the ground-truth scores for the boolean model
        for i in range(0, len(boolean_score_list)):
            boolean_truth_score_list.append(boolean_score_list[i])
        boolean_truth_score_list.sort(reverse=True)
        print("Vector model ground_truth list is:\n", truth_score_list)
        print("Vector ranking score list is:\n", vector_score_list)
        print("Boolean model ground_truth list is:\n",
              boolean_truth_score_list)
        print("Boolean model score list is:\n", boolean_score_list)
        vector_ndcg_score[q] = [
            ndcg_score(np.array(boolean_truth_score_list),
                       np.array(boolean_score_list)),
            ndcg_score(np.array(truth_score_list), np.array(vector_score_list))
        ]
    # compute ndcg scores for the boolean and vector models
    # over all the randomly generated queries
    vector_list = []
    boolean_list = []
    for qu in vector_ndcg_score:
        vector_list.append(vector_ndcg_score[qu][1])
        boolean_list.append(vector_ndcg_score[qu][0])

    print("ndcg score of boolean and vector models for all the queries:\n",
          vector_ndcg_score)
    print("ndcg scores list for boolean model for all the queries:\n",
          boolean_list)
    print("ndcg scores list for vector model for all the queries:\n",
          vector_list)
    # calculate p-values for the boolean and vector models using
    # the Wilcoxon signed-rank test and Welch's t-test
    p_value_wilcoxon = stats.wilcoxon(np.array(boolean_list),
                                      np.array(vector_list))
    p_value_ttest = stats.ttest_ind(np.array(boolean_list),
                                    np.array(vector_list),
                                    equal_var=False)
    print("wilcoxon test p value is:", p_value_wilcoxon[1])
    print("ttest p value is :", p_value_ttest[1])
Example #19
import cran
import query
from cranqry import loadCranQry
from index import InvertedIndex, test
from query import QueryProcessor

print("***************Test Cases Running for Index File****************")
invertedobj = InvertedIndex()
test(invertedobj)

print("***************Test Cases Running for Query File****************")
# load documents
inputdocument = cran.CranFile("cran.all")
# load the index file saved at from part 1
index = InvertedIndex().load("index_file")
# load query processed files
queries = loadCranQry("query.text")

qp = QueryProcessor(queries, index, inputdocument, 29)
query.test(qp)

qp = QueryProcessor(queries, index, inputdocument, 29)
qp.vectorQuery(3)
Example #20
def main():
    # qp = QueryParser(filename='../text/queries.txt')
    # cp = CorpusParser(filename='../text/corpus.txt')
    # cp = CorpusParser(filename='../text/comments.txt')
    # kw = KeywordParser(filename='default_db_name')
    # ar = ArticleParser(filename='../text/articles.txt')
    # kwt = KeywordTypeParser(filename='../text/hatetype.txt')

    run_results_file = '../results/run_results.txt'  # this is the file used to write master activity
    #qp = QueryParser(filename='../text/queries.txt')
    qp = QueryParser(db_name=default_db_name)
    cp = CorpusParser(db_name=default_db_name)
    kw = KeywordParser(db_name=default_db_name)
    ar = ArticleParser(db_name=default_db_name)
    kwt = KeywordTypeParser(db_name=default_db_name)


    qp.parse()
    queries = qp.get_queries()

    cp.parse()
    corpus = cp.get_corpus()

    kw.parse()
    keywords = kw.get_keywords()
    #print('keywords retrieved successfull')
    #print('printing keywords')
    #for key, value in keywords.items():
        #print(key, value)

    kwt.parse()
    keyword_types = kwt.get_keywords()

    ar.parse()
    articles = ar.get_articles()
    run_date = datetime.datetime.now()


    proc = QueryProcessor(queries, corpus, keywords, keyword_types, run_date, run_results_file, articles, default_db_name)
    results = proc.run()
    qid = 0
    data = {}

    for result in results:
        sorted_x = sorted(result.items(),
                          key=operator.itemgetter(1),
                          reverse=True)
        index = 0
        j = 0
        for i in sorted_x[:100]:
            tmp = (qid, i[0], index, i[1])
            # todo: add lookup to the original article and add to output
            # print('{:>1}\tQ0\t{:>4}\t{:>2}\t{:>12}\tNH-BM25'.format(*tmp))
            j += 1
            score = i[1]
            docid = i[0]
            ##  title = articles[int(i[0])]['title'].rstrip()
            ##  pub_url = articles[int(i[0])]['pub_url'].rstrip()
            ##  pub_date = articles[int(i[0])]['pub_date'].rstrip()
            ##  source = articles[int(i[0])]['source'].rstrip()
            ##  data.update({'docId': i[0], 'rank_score': j, 'Score': score,
            ##               'source': source, 'title': title,
            ##               'pub_date': pub_date})

            ##  out_string = (docid + ', ' + str(j) + ', ' + str(round(score, 4)) +
            ##                ', "' + title + '", "' + source + '", "' +
            ##                pub_date + '", "' + pub_url)
            # print(out_string)
            # with open('../results/rankings.csv', 'a') as f:
            #     f.write(out_string)
            index += 1
        qid += 1
Example #21
def eval():

    # Algorithm:
    # Pick N random samples from query.txt
    # Get top 10 results from bool query for each rnd query
    # Get top 10 results from vector query for each rnd query
    # Compute NDCG btn bool query results and qrels.txt
    # Compute NDCG btn vector query results and qrels.txt
    # Get p-value btn bool and vector

    # Get the query collection
    qc = loadCranQry(query_path)
    poss_queries = list(qc)

    # Load up the inverted index
    ii = InvertedIndex()
    ii.load(index_file)

    # Load up the document collection
    cf = CranFile("cran.all")

    # Get ground-truth results from qrels.txt
    with open(qrels_path) as f:
        qrels = f.readlines()

    # Index qrels into a dict
    qrel_dict = {}
    for qrel in qrels:
        qrel_split = qrel.split()
        if int(qrel_split[0]) in qrel_dict:
            qrel_dict[int(qrel_split[0])].append(int(qrel_split[1]))
        else:
            qrel_dict[int(qrel_split[0])] = [int(qrel_split[1])]

    # Run over N random queries, collecting NDCGs
    bool_ndcgs = []
    vector_ndcgs = []
    for _ in range(n):
        # Get random query ID
        query_id = choice(poss_queries)

        # Get the query
        if 0 < int(query_id) < 10:
            query_id = '00' + str(int(query_id))
        elif 9 < int(query_id) < 100:
            query_id = '0' + str(int(query_id))
        try:
            query = qc[query_id].text
        except KeyError:
            print("Invalid query id", query_id)
            return

        # Initialize the query processor
        qp = QueryProcessor(query, ii, cf)

        # Run bool query
        bool_result = qp.booleanQuery()[:10]

        # Run vector query
        vector_result = qp.vectorQuery(10)

        # Pull top 10 ground-truth results from qrels dict
        gt_results = qrel_dict[poss_queries.index(query_id) + 1][:10]

        # Compute NDCG for bool query
        # NOTE: There is no weighting on the bool query, so give all an even 1
        truth_vector = list(map(lambda x: x in gt_results, bool_result))
        bool_ndcg = ndcg_score(truth_vector, [1] * len(truth_vector),
                               k=len(truth_vector))

        # Compute NDCG for vector query
        vector_docs = []
        vector_scores = []
        for v in vector_result:
            vector_docs.append(v[0])
            vector_scores.append(v[1])
        truth_vector = list(map(lambda x: x in gt_results, vector_docs))
        vector_ndcg = ndcg_score(truth_vector,
                                 vector_scores,
                                 k=len(truth_vector))

        # Accumulate NDCGs
        bool_ndcgs.append(bool_ndcg)
        vector_ndcgs.append(vector_ndcg)

    # Average out score lists
    bool_avg = sum(bool_ndcgs) / len(bool_ndcgs)
    vector_avg = sum(vector_ndcgs) / len(vector_ndcgs)

    # Present averages and p-values
    print("Boolean NDCG average:", bool_avg)
    print("Vector NDCG average:", vector_avg)
    if n > 19:
        print("Wilcoxon p-value:", wilcoxon(bool_ndcgs, vector_ndcgs).pvalue)
    else:
        print("Wilcoxon p-value: Sample size too small to be significant")
    print("T-Test p-value:", ttest_ind(bool_ndcgs, vector_ndcgs).pvalue)
Example #22
def eval(testOn):
    k = 10  # k the number of top k pairs of (docID, similarity) to get from vectorQuery
    dictQ_ID = []
    indexFile = sys.argv[1]  # e.g. "src/Data/tempFile"
    queryText = sys.argv[2]
    qrelsText = sys.argv[3]
    dictOfQuery = {}
    dictQrelsText = {}
    docCollection = CranFile('./CranfieldDataset/cran.all')
    NDCGScoreBool = []
    numberOfQueries = int(sys.argv[4])
    NDCGScoreVector = []
    #indexFile           = "src/Data/tempFile"
    #queryText           = 'src/CranfieldDataset/query.text'
    #qrelsText           = 'src/CranfieldDataset/qrels.text'
    #numberOfQueries     = 50
    numberOfTimeToLoop = 5

    #Loads Files
    listOfQueryRelsMaping = readFile(qrelsText)
    queryFile = loadCranQry(queryText)

    #Data Need
    for i in range(numberOfTimeToLoop):

        # get a random query
        dictOfQuery = getRandomQuery(queryFile, numberOfQueries)
        if testOn:
            assert len(dictOfQuery) == numberOfQueries, \
                "error getting random queries"

        # Return all query
        # dictOfQuery = getAllDataItems(queryFile)
        # if testOn:
        #     assert len(dictOfQuery) == 225, "Error are getting random query"

        # get the list of query results from qrels.text
        dictQrelsText = getResultsFrom_QrelsFile(listOfQueryRelsMaping,
                                                 dictOfQuery)
        if testOn:
            assert len(dictQrelsText) == numberOfQueries, \
                "number of queries too large"

        start = timer()
        queryProcessor = QueryProcessor(
            "", indexFile,
            docCollection.docs)  # this is an extremely expensive process
        end = timer()

        if testOn:
            print("Time for creating QueryProcessor:", end - start)
        countDoc = 0
        start = timer()

        dictQ_ID = []
        for qid, queryText in dictOfQuery.items():
            countDoc += 1

            dictQ_ID.append(qid)

            if testOn:
                print("QID:", qid)
            start = timer()
            queryProcessor.loadQuery(queryText)
            end = timer()
            if testOn:
                print("Time for Load:", end - start)
                print("qrels: ", dictQrelsText[qid])

            start = timer()
            # data would need to look like [12, 14, 78, 141, 486, 746, 172, 573, 1003]
            docIDs = queryProcessor.booleanQuery()
            #docIDs_1 = queryProcessor.booleanQuery_1()
            end = timer()
            if testOn:
                print("Time for booleanQuery:", end - start)

            start = timer()
            # for k=3 the data looks like
            # [[625,0.8737006126353902],[401,0.8697643788341478],[943,0.8424991316663082]]
            listOfDocIDAndSimilarity = queryProcessor.vectorQuery(k)
            #vectorQueryDict[qid] = dictOfDocIDAndSimilarity
            end = timer()
            if testOn:
                print("Time for vectorQuery:", end - start)
                print("booleanQuery:", docIDs)

            #For Boolean part
            start = timer()
            yTrue = []
            yScore = []
            for docID in docIDs:
                yScore.append(1)
                if docID in dictQrelsText[qid]:
                    yTrue.append(1)
                else:
                    yTrue.append(0)
            yTrue.sort(reverse=True)
            score = metrics.ndcg_score(yTrue[:k], yScore[:k], k, "exponential")
            if math.isnan(score):
                NDCGScoreBool.append(0)
            else:
                NDCGScoreBool.append(score)
            end = timer()
            if testOn:
                print("Time for  Boolean ndcg:", end - start)

            #For Vector part
            start = timer()
            yTrue = []
            yScore = []
            if testOn:
                print("vectorQuery:", listOfDocIDAndSimilarity)
            for docID_Score in listOfDocIDAndSimilarity:
                yScore.append(float(docID_Score[1]))
                if docID_Score[0] in dictQrelsText[qid]:
                    yTrue.append(1)
                else:
                    yTrue.append(0)
            yTrue.sort(reverse=True)
            score = metrics.ndcg_score(yTrue[:k], yScore[:k], k, "exponential")
            if math.isnan(score):
                NDCGScoreVector.append(0)
            else:
                NDCGScoreVector.append(score)
            end = timer()
            if testOn:
                print("Time for  Vector ndcg:", end - start)
        print("\nRunning Querys iteration:(", str(i + 1), ")\n", dictQ_ID)

        if testOn:
            for QID, boolScore, vectorScore in zip(dictQ_ID, NDCGScoreBool,
                                                   NDCGScoreVector):
                print("QID", QID, "Boolean Model:", boolScore, "Vector Model",
                      vectorScore)

    print("\nThe Length Of Both NDCG Score is: ", len(NDCGScoreBool), "==",
          len(NDCGScoreVector))

    print('\nThe Avg NDCG Score')
    vectorAvg = avg(NDCGScoreVector)
    BoolAvg = avg(NDCGScoreBool)
    print("Avg NDCG Score for Bool:", BoolAvg, "\nAvg NDCG Score for Vector:",
          vectorAvg)
    end = timer()
    if testOn:
        print("\n\nTime for running ", countDoc, " queries:", end - start)

    print('\nThe P-Value')
    p_va_ttest = stats.ttest_ind(NDCGScoreBool, NDCGScoreVector)
    p_va_wilcoxon = stats.wilcoxon(NDCGScoreBool, NDCGScoreVector)
    print("T-Test P-value: ", p_va_ttest)
    print("Wilcoxon P-value: ", p_va_wilcoxon)
    print('Done')
Example #23
def main():
    # qp = QueryParser(filename='../text/queries.txt')
    # cp = CorpusParser(filename='../text/corpus.txt')
    # cp = CorpusParser(filename='../text/comments.txt')
    # kw = KeywordParser(filename='../text/'+sys.argv[1])
    # ar = ArticleParser(filename='../text/articles.txt')
    # kwt = KeywordTypeParser(filename='../text/hatetype.txt')

    run_results_file = '../results/run_results.txt'  # this is the file used to write master activity
    qp = QueryParser(filename='../text/queries.txt')
    cp = CorpusParser(filename='../text/comments.txt')
    kw = KeywordParser(filename='../text/weights-chen.txt')
    ar = ArticleParser(filename='../text/articles.txt')
    kwt = KeywordTypeParser(filename='../text/hatetype.txt')

    qp.parse()
    queries = qp.get_queries()

    cp.parse()
    corpus = cp.get_corpus()

    kw.parse()
    keywords = kw.get_keywords()

    kwt.parse()
    keyword_types = kwt.get_keywords()

    ar.parse()
    articles = ar.get_articles()
    run_date = datetime.datetime.now()

    proc = QueryProcessor(queries, corpus, keywords, keyword_types, run_date,
                          run_results_file, articles)
    results = proc.run()
    qid = 0
    data = {}

    for result in results:
        sorted_x = sorted(result.items(),
                          key=operator.itemgetter(1),
                          reverse=True)
        index = 0
        j = 0
        for i in sorted_x[:100]:
            tmp = (qid, i[0], index, i[1])
            # todo: add lookup to the original article and add to output
            # print('{:>1}\tQ0\t{:>4}\t{:>2}\t{:>12}\tNH-BM25'.format(*tmp))
            j += 1
            score = i[1]
            docid = i[0]
            title = articles[int(i[0])]['title'].rstrip()
            pub_url = articles[int(i[0])]['pub_url'].rstrip()
            pub_date = articles[int(i[0])]['pub_date'].rstrip()
            source = articles[int(i[0])]['source'].rstrip()
            data.update({
                'docId': i[0],
                'rank_score': j,
                'Score': score,
                'source': source,
                'title': title,
                'pub_date': pub_date
            })

            out_string = docid + ', ' + str(j) + ', ' + str(
                round(score, 4)
            ) + ', "' + title + '", "' + source + '", "' + pub_date + '", "' + pub_url
            print(out_string)
            # with open('../results/rankings.csv', 'a') as f:
            #     f.write(out_string)
            index += 1
        qid += 1
    print(
        '\n**The application has finished: You may view the results and supporting files in '
        'the ../results directory for this run.\nEach query in the /text/query.txt file '
        'will generate one directory with the format YYYYMMDDHHMMSSQ# with # being the query #.\n'
        'Inside the directory you will find the following files:\n\n'
        'xxxxx.category - This file contains a record for each document showing the most prevalent category, the number\n'
        '\t\tof occurrences, if the document contained terms considered threatening and all additional categories and '
        'their counts\nxxxxx.details - This file contains all documents, each term found and count and the weight applied '
        'to the specific term found.  This is supporting for the analysis on how a document was ranked\n'
        'xxxxx.query - This file contains the terms used for this query\nxxxxx.rank - this contains a list of all '
        'documents that were ranked including their score, the source, title, date\nxxxxx.weights - this file '
        'contains all of the terms found and the weights in effect at the time of the run.'
    )