def __init__(self, query, index, collection):
    """index is the inverted index; collection is the document collection"""
    # Keep the raw query text plus the handles the processor will use.
    self.raw_query = query
    self.index = index
    self.docs_fname = collection
    # Load the full Cranfield corpus so the document count is available.
    self.cf = cran.CranFile('../CranfieldDataset/cran.all')
    self.nDocs = len(self.cf.docs)
def indexingCranfield(cran_all_file, index_file):
    """Index the Cranfield dataset and save the inverted index to a file.

    Command line usage: "python index.py cran.all index_file";
    the index is saved to ``index_file``.
    """
    cf = cran.CranFile(cran_all_file)
    indexobj = InvertedIndex()
    for doc in cf.docs:
        indexobj.indexDoc(doc)
    # Sort postings before persisting so the saved index is ordered.
    indexobj.sort()
    indexobj.save(index_file)
    # Fix: the original dumped the entire index dict to stdout (debug
    # leftover) and printed "Done" before sorting/saving had happened.
    print("Done")
def indexingCranfield():
    """Dump each Cranfield document's ID and body to stdout.

    Command line usage: "python index.py cran.all index_file".
    NOTE(review): despite the name, this variant only prints the
    documents; it builds and saves no index.
    """
    cf = cran.CranFile('cran.all')
    for doc in cf.docs:
        # Fix: Python 3 print function (original used Python 2 print
        # statements, a syntax error under Python 3 like the rest of
        # this file uses).
        print(doc.docID, doc.body)
    print('Done')
def indexingCranfield(data_file, indexfile):
    """Build the inverted index over the Cranfield collection and save it.

    Command line usage: "python index.py cran.all index_file";
    the index is written to ``indexfile``.
    """
    # Load the raw document collection first.
    inputdocument = cran.CranFile(data_file)
    invertedobj = InvertedIndex()
    # Index every document, then compute idf for each indexed term.
    for document in inputdocument.docs:
        invertedobj.indexDoc(document)
    for term in invertedobj.items:
        invertedobj.idf(term)
    invertedobj.save(indexfile)
def query(index_file, algorithm, query_file, query_id):
    """The main query processing program, using QueryProcessor.

    Commandline usage: "echo query_string | python query.py index_file processing_algorithm"
    processing_algorithm: 0 for booleanQuery and 1 for vectorQuery.
    For booleanQuery the program prints the total number of documents and
    the list of document IDs; for vectorQuery it outputs the top 3 most
    similar documents.
    """
    query_file = cranqry.loadCranQry(query_file)  # parsed queries keyed by id
    index_items = InvertedIndex()
    index_items = index_items.load(index_file)
    # NOTE(review): corpus path is hard-coded here; confirm callers run
    # from the directory containing cran.all.
    cran_file = cran.CranFile('cran.all')
    query_verify = QueryProcessor(query_file, index_items, cran_file.docs)
    query_verify.preprocessing()
    results = None
    # Fix: normalise so both the strings '0'/'1' (command line) and the
    # ints 0/1 select a model; the original's string-only comparison
    # silently printed None for int arguments.
    algorithm = str(algorithm)
    if algorithm == '0':  # boolean model
        results = query_verify.booleanQuery(query_id)
    elif algorithm == '1':  # vector model
        results = query_verify.vectorQuery(3, query_id)
    print(results)
def indexingCranfield():
    """Index the Cranfield dataset and save the index to a file.

    Command line usage: "python index.py cran.all index_file";
    the input corpus path and output file are read from sys.argv.
    """
    cran_file = sys.argv[1]   # path to cran.all
    index_file = sys.argv[2]  # where the serialized index is written
    cf = cran.CranFile(cran_file)
    II = InvertedIndex()
    # Index every document; indexDoc mutates the index in place, so the
    # original's list of its return values served no purpose and is gone.
    for doc in cf.docs:
        II.indexDoc(doc)
    II.save(index_file)
    # Fix: Python 3 print function (original used a Python 2 print
    # statement, a syntax error under Python 3).
    print('Done')
def indexingCranfield():
    """Index the Cranfield dataset and save the index to a ``.p`` file.

    Command line usage: "python index.py cran.all index_file";
    the index is saved to "<index_file>.p".
    """
    # Read the corpus path and output name from the command line.
    source_path = sys.argv[1]
    out_name = sys.argv[2] + ".p"
    indexer = InvertedIndex()
    corpus = cran.CranFile(source_path)
    # Timestamps bracket the run so indexing duration is visible.
    print(datetime.datetime.now())
    for document in corpus.docs:
        indexer.indexDoc(document)  # build the index for each doc
    # Persist the finished index to disk.
    indexer.save(out_name)
    print(datetime.datetime.now())
def query(index_file, model_type, query_file, query_id):
    """The main query processing program, using QueryProcessor.

    Commandline usage: "echo query_string | python query.py index_file processing_algorithm"
    processing_algorithm: 0 for booleanQuery (prints the document IDs),
    1 for vectorQuery (prints the top 3 most similar documents),
    2 for batch evaluation.
    """
    # Load the document collection and the index built in part 1.
    inputdocument = cran.CranFile("cran.all")
    index = InvertedIndex().load(index_file)
    # Load the processed query file.
    queries = loadCranQry(query_file)
    qp = QueryProcessor(queries, index, inputdocument, query_id)
    # Fix: accept "0"/"1"/"2" from the command line as well as bare ints;
    # the original int-only comparisons silently matched nothing when a
    # string argument was passed.
    model_type = int(model_type)
    if model_type == 0:
        Booleanres = qp.booleanQuery()
        print(Booleanres)
    if model_type == 1:
        vectorres = qp.vectorQuery(3)
        print(vectorres)
    if model_type == 2:
        qp.BatchEvaluation()
import cran
import query
from cranqry import loadCranQry
from index import InvertedIndex, test
from query import QueryProcessor

# --- Exercise the indexing code ---
print("***************Test Cases Running for Index File****************")
invertedobj = InvertedIndex()
test(invertedobj)

# --- Exercise the query code ---
print("***************Test Cases Running for Query File****************")
inputdocument = cran.CranFile("cran.all")   # load documents
index = InvertedIndex().load("index_file")  # index saved in part 1
queries = loadCranQry("query.text")         # processed queries
qp = QueryProcessor(queries, index, inputdocument, 29)
query.test(qp)
# Fresh processor for the vector-model run on the same query id.
qp = QueryProcessor(queries, index, inputdocument, 29)
qp.vectorQuery(3)
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 2 14:18:51 2019

@author: alekh
"""
from nltk.tokenize import word_tokenize

import cran
import util


def tokenise(documents):
    """Tokenize one document body, keeping only purely alphabetic tokens.

    Fix: the original called word_tokenize on the undefined name ``doc``
    (NameError at call time); it now tokenizes its ``documents`` argument.
    """
    tokens = word_tokenize(documents)
    alpha_tokens = [token for token in tokens if token.isalpha()]
    return alpha_tokens


if __name__ == "__main__":
    cf = cran.CranFile("cran.all")
    documents = []
    tokens = []
    # Collect every document body from the corpus.
    for doc in cf.docs:
        documents.append(doc.body)
    # Tokenize each body with the shared util helper (not tokenise above).
    for doc in documents:
        tokens.append(util.tokenize(doc))
    print(tokens)
def totalDocuments():
    """Return the number of documents in the collection named on the command line."""
    collection = cran.CranFile(sys.argv[1])
    return len(collection.docs)
def eval(index_file, query_file, qrels_File, number_of_queries):
    """Compare boolean vs. vector retrieval with NDCG over random query samples.

    Runs 5 iterations; each iteration samples ``number_of_queries`` query ids
    from the qrels file, scores the top-5 results of both models against the
    relevance judgements, and writes per-iteration average NDCG plus a
    Wilcoxon signed-rank p-value to Evaluation_search.csv.
    """
    # Load the processed queries and normalise their ids to plain strings.
    queries = loadCranQry(query_file)
    queries_id_list = [str(int(x)) for x in queries.keys()]
    # Parse the qrels file into {query_id: [relevant doc ids]}.
    qrels_dict = process_querls_file(qrels_File, queries_id_list)
    inputdocument = cran.CranFile("cran.all")
    # Load the index file saved from part 1.
    index = InvertedIndex().load(index_file)
    qp = QueryProcessor(queries, index, inputdocument, number_of_queries)
    queries_id_list_int = [int(x) for x in qrels_dict.keys()]
    queries_id_ls = [int(x) for x in queries.keys()]
    sumbooleanNADC = []  # NOTE(review): never appended to below
    sumvectorNADC = []   # NOTE(review): never appended to below
    with open('Evaluation_search.csv', 'w') as f:
        # CSV header row.
        f.write("%s,%s,%s,%s\n" % ("Iteration", "AverageNDCG-booleanModel", "AverageNDCG-vectorModel", "P-value"))
        for i in range(0, 5):
            vectorNADC = []
            booleanNADC = []
            # NOTE(review): computed but unused -- the sampling below draws
            # from the qrels ids only, not from this intersection.
            intersection_queries = list(set(queries_id_list_int) & set(queries_id_ls))
            random_query_id_list = random.sample(queries_id_list_int, number_of_queries)
            for q_id in random_query_id_list:
                print("Processing for Query ID ::", q_id)
                qp.querynumber = q_id
                # Top-5 documents from the vector model as (doc_id, score) pairs.
                vector_top3 = qp.vectorQuery(5)
                print("Output for Vector Model Result::", vector_top3)
                if (vector_top3.__len__() < 1):
                    # No results at all -> NDCG contribution of 0.
                    vectorNADC.append(0)
                else:
                    vector_label = [x[0] for x in vector_top3]  # ranked doc ids
                    score = [x[1] for x in vector_top3]         # similarity scores
                    print("DocumentIDs of Vector Model Result:: ", vector_label)
                    print("Scores of Vector Model Result::", score)
                    # Build the binary relevance vector: 1 where the ranked
                    # doc appears in the qrels judgements for this query.
                    true_label = vector_label.copy()
                    query_id = str(q_id)
                    for x in vector_label:
                        ind = vector_label.index(x)
                        if (x in qrels_dict.get(query_id)):
                            true_label[ind] = 1
                        else:
                            true_label[ind] = 0
                    if true_label.__len__() < 5:
                        # NOTE(review): pads relative to 10, not 5, so short
                        # result lists yield label vectors longer than the
                        # score vector -- confirm whether 5 was intended.
                        len_val = 10 - (true_label.__len__())
                        true_label.extend([0] * len_val)
                    print("Actual Vector:: ", true_label)
                    print("Predicted Vector:: ", score)
                    if sum(true_label) == 0:
                        # No relevant documents retrieved: NDCG is 0.
                        vectorNADC.append(0)
                    else:
                        ndcg = metrics.ndcg_score(true_label, score, 5)
                        print("Calculated ndcg for Vector::", ndcg)
                        vectorNADC.append(ndcg)
                # --- Boolean model for the same query ---
                boolean_res = qp.booleanQuery()
                print("output of boolean_res:: ", boolean_res)
                if boolean_res.__len__() < 1:
                    booleanNADC.append(0)
                else:
                    # Boolean hits carry no ranking: every hit scores 1,
                    # padded with 0s up to length 5.
                    score = [1] * len(boolean_res)
                    if (score.__len__() < 5):
                        leng = 5 - (score.__len__())
                        score.extend([0] * leng)
                    true_label = boolean_res.copy()
                    query_id = str(q_id)
                    for x in boolean_res:
                        ind = boolean_res.index(x)
                        if (x in qrels_dict.get(query_id)):
                            true_label[ind] = 1
                        else:
                            true_label[ind] = 0
                    if true_label.__len__() < 5:
                        # NOTE(review): same 10-vs-5 padding question as above.
                        len_val = 10 - (true_label.__len__())
                        true_label.extend([0] * len_val)
                    print("Actual boolean:: ", true_label)
                    print("Predicted boolean:: ", score)
                    if sum(true_label) == 0:
                        booleanNADC.append(0)
                    else:
                        ndcg = metrics.ndcg_score(true_label, score, 5)
                        print("Calculated ndcg for Boolean::", ndcg)
                        booleanNADC.append(ndcg)
            # Per-iteration averages and paired significance test.
            print("Calculated NADC sum for all queries", vectorNADC)
            avergae_vectorNADC = float(sum(vectorNADC) / number_of_queries)
            print("Calculated NADC sum for all queries", booleanNADC)
            avergae_booleanNADC = float(sum(booleanNADC) / number_of_queries)
            print("Avergae NADC Vector::", avergae_vectorNADC)
            print("Avergae NADC boolean::", avergae_booleanNADC)
            # Wilcoxon signed-rank test between the two per-query NDCG samples.
            p_value = scipy.stats.wilcoxon(vectorNADC, booleanNADC, zero_method='wilcox', correction=False)
            print(i, str(avergae_booleanNADC), str(avergae_vectorNADC), str(p_value[1]))
            p = "%.20f" % float(str(p_value[1]))
            print('P value for all the queries processed is:', p)
            f.write("%s,%s,%s,%s\n" % (i + 1, str(avergae_booleanNADC), str(avergae_vectorNADC), str(p)))
    print('Done')
def VectorCompare():
    """Compare two vectorQuery variants (default vs. flag=True) with NDCG.

    Evaluates five fixed query ids, scores each variant's top-5 results
    against qrels.text, and reports average NDCG for each variant plus a
    Wilcoxon signed-rank p-value between them.
    """
    queries = loadCranQry("query.text")
    queries_id_list = [str(int(x)) for x in queries.keys()]
    inputdocument = cran.CranFile("cran.all")
    # Load the index file saved from part 1.
    index = InvertedIndex().load("index_file")
    qp = QueryProcessor(queries, index, inputdocument, 10)
    queries_id_list = [str(int(x)) for x in queries.keys()]
    # Parse qrels.text into {query_id: [relevant doc ids]}.
    qrels_dict = process_querls_file("qrels.text", queries_id_list)
    sumbooleanNADC = []  # NOTE(review): never used below
    sumvectorNADC = []   # NOTE(review): never used below
    vectorNADC1 = []
    booleanNADC2 = []    # NOTE(review): never used below
    # Fixed sample of query ids to evaluate.
    query_id = [4, 29, 53, 58, 100]
    vectorNADC1 = []
    vectorNADC2 = []
    for q_id in query_id:
        qp.querynumber = q_id
        # Variant 1: default vector query; variant 2: vectorQuery(5, True).
        vector_top3 = qp.vectorQuery(5)
        vector2_top3 = qp.vectorQuery(5, True)
        print("Output for Vector Model Result::", vector_top3)
        if (vector_top3.__len__() < 1):
            vectorNADC1.append(0)
        else:
            vector_label = [x[0] for x in vector_top3]  # ranked doc ids
            score = [x[1] for x in vector_top3]         # similarity scores
            print("DocumentIDs of Vector Model Result:: ", vector_label)
            print("Scores of Vector Model Result::", score)
            # Binary relevance labels from qrels.
            true_label = vector_label.copy()
            # NOTE(review): rebinds the name of the list being iterated;
            # harmless in Python (the iterator is already bound) but confusing.
            query_id = str(q_id)
            for x in vector_label:
                ind = vector_label.index(x)
                if (x in qrels_dict.get(query_id)):
                    true_label[ind] = 1
                else:
                    true_label[ind] = 0
            if true_label.__len__() < 5:
                # NOTE(review): pads relative to 10 though k=5 -- confirm.
                len_val = 10 - (true_label.__len__())
                true_label.extend([0] * len_val)
            print("Actual Vector:: ", true_label)
            print("Predicted Vector:: ", score)
            if sum(true_label) == 0:
                vectorNADC1.append(0)
            else:
                ndcg = metrics.ndcg_score(true_label, score, 5)
                print("Calculated ndcg for Vector::", ndcg)
                vectorNADC1.append(ndcg)
        # --- Second variant, same scoring procedure ---
        if (vector2_top3.__len__() < 1):
            vectorNADC2.append(0)
        else:
            vector_label = [x[0] for x in vector2_top3]
            score = [x[1] for x in vector2_top3]
            print("DocumentIDs of Vector Model Result:: ", vector_label)
            print("Scores of Vector Model Result::", score)
            true_label = vector_label.copy()
            query_id = str(q_id)
            for x in vector_label:
                ind = vector_label.index(x)
                if (x in qrels_dict.get(query_id)):
                    true_label[ind] = 1
                else:
                    true_label[ind] = 0
            if true_label.__len__() < 5:
                # NOTE(review): same 10-vs-5 padding question as above.
                len_val = 10 - (true_label.__len__())
                true_label.extend([0] * len_val)
            print("Actual Vector:: ", true_label)
            print("Predicted Vector:: ", score)
            if sum(true_label) == 0:
                vectorNADC2.append(0)
            else:
                ndcg = metrics.ndcg_score(true_label, score, 5)
                print("Calculated ndcg for Vector::", ndcg)
                vectorNADC2.append(ndcg)
    # Averages over the five fixed queries and significance test.
    print("Calculated NADC sum for all queries", vectorNADC1)
    avergae_vectorNADC = float(sum(vectorNADC1) / 5)
    print("Calculated NADC sum for all queries", vectorNADC2)
    avergae_vectorNADC2 = float(sum(vectorNADC2) / 5)
    print("Avergae NADC Vector::", avergae_vectorNADC)
    print("Avergae NADC boolean::", avergae_vectorNADC2)
    print(vectorNADC1)
    print(vectorNADC2)
    # Paired Wilcoxon signed-rank test between the two variants' NDCG lists.
    p_value = scipy.stats.wilcoxon(vectorNADC1, vectorNADC2, zero_method='wilcox', correction=False)
    p = "%.20f" % float(str(p_value[1]))
    print('P value for all the queries processed is:', p)