Example #1
    def __init__(self, query, index, collection):
        ''' index is the inverted index; collection is the document collection'''
        self.raw_query = query
        self.index = index
        self.docs_fname = collection

        self.cf = cran.CranFile('../CranfieldDataset/cran.all')
        self.nDocs = len(self.cf.docs)
def indexingCranfield(cran_all_file, index_file):
    # ToDo: indexing the Cranfield dataset and save the index to a file
    # command line usage: "python index.py cran.all index_file"
    # the index is saved to index_file
    cf = cran.CranFile(cran_all_file)
    indexobj = InvertedIndex()
    for doc in cf.docs:
        indexobj.indexDoc(doc)
    indexobj.sort()
    indexobj.save(index_file)
    print(indexobj.items)  # debug: dump the in-memory index
    print("Done")
def indexingCranfield():
    #ToDo: indexing the Cranfield dataset and save the index to a file
    # command line usage: "python index.py cran.all index_file"
    # the index is saved to index_file
    cf = cran.CranFile('cran.all')
    for doc in cf.docs:
        # this variant only prints each document; it does not build or save an index
        print(doc.docID, doc.body)

    print('Done')
Example #4
def indexingCranfield(data_file, indexfile):
    #ToDo: indexing the Cranfield dataset and save the index to a file
    # command line usage: "python index.py cran.all index_file"
    # the index is saved to index_file
    #first load the document file
    inputdocument = cran.CranFile(data_file)

    # create an InvertedIndex object,
    # iterate over the documents to build the index,
    # then compute idf for each term
    invertedobj = InvertedIndex()
    for doc in inputdocument.docs:
        invertedobj.indexDoc(doc)
    for term in invertedobj.items:
        invertedobj.idf(term)
    invertedobj.save(indexfile)
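The usage comment above assumes the indexer is driven as "python index.py cran.all index_file". A minimal sketch of a matching command-line entry point, assuming this function lives in index.py (the argument handling here is illustrative, not the original script):

import sys

# hypothetical entry point for index.py; argument handling is an assumption
if __name__ == "__main__":
    if len(sys.argv) != 3:
        print('usage: python index.py cran.all index_file')
        sys.exit(1)
    indexingCranfield(sys.argv[1], sys.argv[2])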
def query(index_file, algorithm, query_file, query_id):
    ''' the main query processing program, using QueryProcessor'''

    # ToDo: the commandline usage: "echo query_string | python query.py index_file processing_algorithm"
    # processing_algorithm: 0 for booleanQuery and 1 for vectorQuery
    # for booleanQuery, the program will print the total number of documents and the list of document IDs
    # for vectorQuery, the program will output the top 3 most similar documents
    query_file = cranqry.loadCranQry(query_file)  # loading file
    index_items = InvertedIndex()
    index_items = index_items.load(index_file)
    cran_file = cran.CranFile('cran.all')
    query_verify = QueryProcessor(query_file, index_items, cran_file.docs)
    query_verify.preprocessing()
    results = None
    if algorithm == '0':  # if algorithm is 0 it represents boolean model
        results = query_verify.booleanQuery(query_id)
    elif algorithm == '1':  # if algorithm is 1 it is vector model
        results = query_verify.vectorQuery(3, query_id)
    print(results)
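The ToDo comment describes an echo-over-stdin interface, but as written the function selects a query by ID from the query file. A minimal sketch of an entry point matching this signature, assuming the function lives in query.py (the argument order is an assumption):

import sys

# hypothetical entry point for query.py; argument order is an assumption
if __name__ == "__main__":
    index_file, algorithm, query_file, query_id = sys.argv[1:5]
    query(index_file, algorithm, query_file, query_id)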
Example #6
def indexingCranfield():
    #ToDo: indexing the Cranfield dataset and save the index to a file
    # command line usage: "python index.py cran.all index_file"
    # the index is saved to index_file

    #input parameters from command line
    cran_file = sys.argv[1]
    index_file = sys.argv[2]

    cf = cran.CranFile(cran_file)  #instantiation of CranFile class

    II = InvertedIndex()  # InvertedIndex class instantiated
    for doc in cf.docs:
        II.indexDoc(doc)  # index each document

    # save the index to an output file via the 'save' method of 'InvertedIndex'
    II.save(index_file)

    print('Done')
Example #7
def indexingCranfield():
    #ToDo: indexing the Cranfield dataset and save the index to a file
    # command line usage: "python index.py cran.all index_file"
    # the index is saved to index_file

    # read arguments from the command line
    file = sys.argv[1]
    filename = sys.argv[2]

    filename += ".p"

    # create object
    i = InvertedIndex()

    cf = cran.CranFile(file)

    print(datetime.datetime.now())
    for docs in cf.docs:
        i.indexDoc(docs)  #call indexDoc to create index for each doc

    #save the index to disk
    i.save(filename)

    print(datetime.datetime.now())
Example #8
def query(index_file, model_type, query_file, query_id):
    ''' the main query processing program, using QueryProcessor'''

    # ToDo: the commandline usage: "echo query_string | python query.py index_file processing_algorithm"
    # processing_algorithm: 0 for booleanQuery and 1 for vectorQuery
    # for booleanQuery, the program will print the total number of documents and the list of document IDs
    # for vectorQuery, the program will output the top 3 most similar documents
    #load documents
    inputdocument = cran.CranFile("cran.all")
    # load the index file saved in part 1
    index = InvertedIndex().load(index_file)
    # load the processed query file
    queries = loadCranQry(query_file)

    qp = QueryProcessor(queries, index, inputdocument, query_id)

    if model_type == 0:
        Booleanres = qp.booleanQuery()
        print(Booleanres)
    elif model_type == 1:
        vectorres = qp.vectorQuery(3)
        print(vectorres)
    elif model_type == 2:
        qp.BatchEvaluation()
Example #9
import cran
import query
from cranqry import loadCranQry
from index import InvertedIndex, test
from query import QueryProcessor

print("***************Test Cases Running for Index File****************")
invertedobj = InvertedIndex()
test(invertedobj)

print("***************Test Cases Running for Query File****************")
# load documents
inputdocument = cran.CranFile("cran.all")
# load the index file saved in part 1
index = InvertedIndex().load("index_file")
# load the processed query file
queries = loadCranQry("query.text")

qp = QueryProcessor(queries, index, inputdocument, 29)
query.test(qp)

qp = QueryProcessor(queries, index, inputdocument, 29)
qp.vectorQuery(3)
# -*- coding: utf-8 -*-
"""
Created on Sat Mar  2 14:18:51 2019

@author: alekh
"""

from nltk.tokenize import word_tokenize
import cran
import util


def tokenise(doc):
    # tokenize a single document body and keep only alphabetic tokens
    tokens = word_tokenize(doc)
    alpha_tokens = [token for token in tokens if token.isalpha()]
    return alpha_tokens


if __name__ == "__main__":
    cf = cran.CranFile("cran.all")
    documents = []
    tokens = []
    for doc in cf.docs:
        documents.append(doc.body)

    #print(documents)
    for doc in documents:
        tokens.append(util.tokenize(doc))
    print(tokens)
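The script relies on util.tokenize(doc), which is not shown here. Based on the tokenise helper defined above, a plausible stand-in might look like the following sketch (the lowercasing step is an assumption):

from nltk.tokenize import word_tokenize

# hypothetical stand-in for util.tokenize; the real util module may differ
def tokenize(text):
    # lowercase, split into word tokens, and keep alphabetic tokens only
    return [t for t in word_tokenize(text.lower()) if t.isalpha()]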
Example #11
def totalDocuments():
    #total number of documents
    cf = cran.CranFile(sys.argv[1])
    return len(cf.docs)
Example #12
def eval(index_file, query_file, qrels_File, number_of_queries):
    # read the query file and the index file
    # ToDo
    queries = loadCranQry(query_file)
    queries_id_list = [str(int(x)) for x in queries.keys()]
    #print(queries_id_list)
    # read qrels.text
    qrels_dict = process_querls_file(qrels_File, queries_id_list)
    inputdocument = cran.CranFile("cran.all")
    # load the index file saved in part 1
    index = InvertedIndex().load(index_file)
    qp = QueryProcessor(queries, index, inputdocument, number_of_queries)
    queries_id_list_int = [int(x) for x in qrels_dict.keys()]
    queries_id_ls = [int(x) for x in queries.keys()]
    #IdeaVectorsforQuery_ids={}
    sumbooleanNADC = []
    sumvectorNADC = []
    with open('Evaluation_search.csv', 'w') as f:
        f.write("%s,%s,%s,%s\n" % ("Iteration", "AverageNDCG-booleanModel",
                                   "AverageNDCG-vectorModel", "P-value"))
        for i in range(0, 5):
            vectorNADC = []
            booleanNADC = []
            intersection_queries = list(
                set(queries_id_list_int) & set(queries_id_ls))
            random_query_id_list = random.sample(queries_id_list_int,
                                                 number_of_queries)
            #random_query_id_list=[153, 18]
            #print(random_query_id_list)
            for q_id in random_query_id_list:
                print("Processing for Query ID ::", q_id)
                qp.querynumber = q_id
                #boolean_res=qp.booleanQuery()
                vector_top3 = qp.vectorQuery(5)
                #vector_top3=[('12',0.34),('746',0.33),('875',0.24)]
                #print(boolean_res)
                print("Output for Vector Model Result::", vector_top3)
                if len(vector_top3) < 1:
                    vectorNADC.append(0)
                else:
                    vector_label = [x[0] for x in vector_top3]
                    score = [x[1] for x in vector_top3]
                    print("DocumentIDs of Vector Model Result:: ",
                          vector_label)
                    print("Scores of Vector Model Result::", score)
                    true_label = vector_label.copy()
                    query_id = str(q_id)
                    for x in vector_label:
                        #str_x="{0:0=3d}".format(x)
                        ind = vector_label.index(x)
                        if (x in qrels_dict.get(query_id)):
                            true_label[ind] = 1
                        else:
                            true_label[ind] = 0
                    if len(true_label) < 5:
                        true_label.extend([0] * (5 - len(true_label)))
                    print("Actual Vector:: ", true_label)
                    print("Predicted Vector:: ", score)
                    if sum(true_label) == 0:
                        vectorNADC.append(0)
                    else:
                        ndcg = metrics.ndcg_score(true_label, score, 5)
                        print("Calculated ndcg for Vector::", ndcg)
                        vectorNADC.append(ndcg)
                boolean_res = qp.booleanQuery()
                print("output of boolean_res:: ", boolean_res)
                if len(boolean_res) < 1:
                    booleanNADC.append(0)
                else:
                    score = [1] * len(boolean_res)
                    if len(score) < 5:
                        score.extend([0] * (5 - len(score)))
                    true_label = boolean_res.copy()
                    query_id = str(q_id)
                    for x in boolean_res:
                        ind = boolean_res.index(x)
                        if (x in qrels_dict.get(query_id)):
                            true_label[ind] = 1
                        else:
                            true_label[ind] = 0
                    if len(true_label) < 5:
                        true_label.extend([0] * (5 - len(true_label)))
                    print("Actual boolean:: ", true_label)
                    print("Predicted boolean:: ", score)
                    if sum(true_label) == 0:
                        booleanNADC.append(0)
                    else:
                        ndcg = metrics.ndcg_score(true_label, score, 5)
                        print("Calculated ndcg for Boolean::", ndcg)
                        booleanNADC.append(ndcg)
            print("Calculated NADC sum for all queries", vectorNADC)
            avergae_vectorNADC = float(sum(vectorNADC) / number_of_queries)
            print("Calculated NADC sum for all queries", booleanNADC)
            avergae_booleanNADC = float(sum(booleanNADC) / number_of_queries)
            print("Avergae NADC Vector::", avergae_vectorNADC)
            print("Avergae NADC boolean::", avergae_booleanNADC)
            p_value = scipy.stats.wilcoxon(vectorNADC,
                                           booleanNADC,
                                           zero_method='wilcox',
                                           correction=False)
            print(i, str(avergae_booleanNADC), str(avergae_vectorNADC),
                  str(p_value[1]))
            p = "%.20f" % float(str(p_value[1]))
            print('P value for all the queries processed is:', p)
            f.write("%s,%s,%s,%s\n" % (i + 1, str(avergae_booleanNADC),
                                       str(avergae_vectorNADC), str(p)))
    print('Done')
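The calls to metrics.ndcg_score above pass a flat relevance list, a flat score list, and a cutoff k; the metrics module itself is not shown. A minimal NDCG@k helper with the same call shape might look like this sketch (an assumption, not the actual metrics module):

import math

# hypothetical ndcg_score with the call shape used above: (true_labels, scores, k)
def ndcg_score(true_labels, scores, k):
    # rank documents by predicted score and compute DCG@k over their true relevance
    order = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]
    dcg = sum(true_labels[i] / math.log2(rank + 2) for rank, i in enumerate(order))
    # ideal DCG: relevance labels sorted from most to least relevant
    ideal = sorted(true_labels, reverse=True)[:k]
    idcg = sum(rel / math.log2(rank + 2) for rank, rel in enumerate(ideal))
    return dcg / idcg if idcg > 0 else 0.0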
Example #13
def VectorCompare():
     queries = loadCranQry("query.text")
     queries_id_list=[str(int(x)) for x in queries.keys()]
     inputdocument = cran.CranFile("cran.all")
     # load the index file saved in part 1
     index = InvertedIndex().load("index_file")
     qp = QueryProcessor(queries, index, inputdocument, 10)
     # read qrels.text
     qrels_dict = process_querls_file("qrels.text", queries_id_list)
     #IdeaVectorsforQuery_ids={}
     # NDCG scores for the two vector-model runs (default and variant)
     query_ids = [4, 29, 53, 58, 100]
     vectorNADC1 = []
     vectorNADC2 = []
     for q_id in query_ids:
         qp.querynumber = q_id
         # boolean_res=qp.booleanQuery()
         vector_top3 = qp.vectorQuery(5)
         vector2_top3=qp.vectorQuery(5,True)
         # vector_top3=[('12',0.34),('746',0.33),('875',0.24)]
         # print(boolean_res)
         print("Output for Vector Model Result::", vector_top3)
         if len(vector_top3) < 1:
             vectorNADC1.append(0)
         else:
             vector_label = [x[0] for x in vector_top3]
             score = [x[1] for x in vector_top3]
             print("DocumentIDs of Vector Model Result:: ", vector_label)
             print("Scores of Vector Model Result::", score)
             true_label = vector_label.copy()
             query_id = str(q_id)
             for x in vector_label:
                 # str_x="{0:0=3d}".format(x)
                 ind = vector_label.index(x)
                 if (x in qrels_dict.get(query_id)):
                     true_label[ind] = 1
                 else:
                     true_label[ind] = 0
             if len(true_label) < 5:
                 true_label.extend([0] * (5 - len(true_label)))
             print("Actual Vector:: ", true_label)
             print("Predicted Vector:: ", score)
             if sum(true_label) == 0:
                 vectorNADC1.append(0)
             else:
                 ndcg = metrics.ndcg_score(true_label, score, 5)
                 print("Calculated ndcg for Vector::", ndcg)
                 vectorNADC1.append(ndcg)
         if len(vector2_top3) < 1:
             vectorNADC2.append(0)
         else:
             vector_label = [x[0] for x in vector2_top3]
             score = [x[1] for x in vector2_top3]
             print("DocumentIDs of Vector Model Result:: ", vector_label)
             print("Scores of Vector Model Result::", score)
             true_label = vector_label.copy()
             query_id = str(q_id)
             for x in vector_label:
                 # str_x="{0:0=3d}".format(x)
                 ind = vector_label.index(x)
                 if (x in qrels_dict.get(query_id)):
                     true_label[ind] = 1
                 else:
                     true_label[ind] = 0
             if len(true_label) < 5:
                 true_label.extend([0] * (5 - len(true_label)))
             print("Actual Vector:: ", true_label)
             print("Predicted Vector:: ", score)
             if sum(true_label) == 0:
                 vectorNADC2.append(0)
             else:
                 ndcg = metrics.ndcg_score(true_label, score, 5)
                 print("Calculated ndcg for Vector::", ndcg)
                 vectorNADC2.append(ndcg)
     print("Calculated NADC sum for all queries", vectorNADC1)
     avergae_vectorNADC = float(sum(vectorNADC1) / 5)
     print("Calculated NADC sum for all queries", vectorNADC2)
     avergae_vectorNADC2 = float(sum(vectorNADC2) / 5)
     print("Avergae NADC Vector::", avergae_vectorNADC)
     print("Avergae NADC boolean::", avergae_vectorNADC2)
     print(vectorNADC1)
     print(vectorNADC2)
     p_value = scipy.stats.wilcoxon(vectorNADC1, vectorNADC2, zero_method='wilcox', correction=False)
     p = "%.20f" % float(str(p_value[1]))
     print('P value for all the queries processed is:', p)
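Both eval and VectorCompare depend on process_querls_file, which is not shown. Assuming qrels.text follows the usual Cranfield layout of one "query_id doc_id relevance" triple per line, a sketch of such a parser might be:

# hypothetical reconstruction of process_querls_file; the real helper may differ
def process_querls_file(qrels_file, query_id_list):
    qrels = {}
    with open(qrels_file) as f:
        for line in f:
            parts = line.split()
            if len(parts) < 2:
                continue
            query_id, doc_id = parts[0], parts[1]
            if query_id in query_id_list:
                qrels.setdefault(query_id, []).append(doc_id)
    return qrels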