def query(indexfilename, processingalgorithm, queryfilename, queryid, numresults=3):
    '''The main query processing program, using QueryProcessor.

    Command-line usage: "echo query_string | python query.py index_file processing_algorithm"
    processing_algorithm: '0' for booleanQuery and '1' for vectorQuery.
    For booleanQuery, the program will print the total number of documents and
    the list of document IDs; for vectorQuery, the program will output the top
    `numresults` (default 3) most similar documents.

    Args:
        indexfilename: path to a previously saved inverted index.
        processingalgorithm: '0' (boolean model) or '1' (vector model).
        queryfilename: path to the Cranfield query file (query.text).
        queryid: id of the query to run.
        numresults: number of results for the vector model (default 3).

    Returns:
        The result list from the chosen model, or None if
        processingalgorithm is neither '0' nor '1'.
    '''
    qrys = loadCranQry(queryfilename)
    # InvertedIndex.load() returns the restored index object, so no separate
    # throwaway instance is needed.
    loadiindex = InvertedIndex().load(indexfilename)
    cf = CranFile('cran.all')
    queryProcessor = QueryProcessor(qrys, loadiindex, cf.docs, numresults)

    # BUG FIX: the original left `results` unbound (NameError on return) when
    # processingalgorithm was neither '0' nor '1'; default it to None.
    results = None
    if processingalgorithm == '0':
        queryProcessor.preprocessing()
        queryProcessor.queryId = queryid
        results = queryProcessor.booleanQuery()
    elif processingalgorithm == '1':
        queryProcessor.queryId = queryid
        results = queryProcessor.vectorQuery(queryProcessor.numofresults)
    return results
def indexingCranfield():
    """Index the Cranfield dataset and persist the inverted index to a file.

    Command-line usage: "python index.py cran.all index_file"; the finished
    index is saved to index_file (sys.argv[2]).
    """
    # Load the cran.all collection named on the command line.
    collection = CranFile(sys.argv[1])

    # Build the index document by document, reporting progress as we go.
    index = InvertedIndex()
    for document in collection.docs:
        print("Indexing document {}".format(document.docID))
        index.indexDoc(document)
    print("\nTotal documents indexed: {}".format(index.nDocs))

    # Sort, then persist to the requested location.
    index.sort()
    index.save(sys.argv[2])
    print('Done')
def query():
    '''The main query processing program, using QueryProcessor.

    Command-line usage:
        python query.py <index-file-path> <processing-algorithm> <query.txt path> <query-id>
    processing_algorithm: 0 for booleanQuery and 1 for vectorQuery.
    For booleanQuery the program prints the matching document IDs; for
    vectorQuery it prints the top 3 most similar documents.
    '''
    # Ensure args are valid.
    # BUG FIX: the original used identity comparisons on ints
    # ("len(argv) is not 5", "int(...) is 0", "int(...) is 1"); that only
    # works by the CPython small-int caching accident — use ==/!= instead.
    if len(argv) != 5:
        print(
            "Syntax: python query.py <index-file-path> <processing-algorithm> <query.txt path> <query-id>"
        )
        return

    # Grab arguments
    index_file_loc = argv[1]
    processing_algo = argv[2]
    query_file_path = argv[3]
    query_id = argv[4]

    # Grab index file to restore II
    ii = InvertedIndex()
    ii.load(index_file_loc)

    # Get the document collection
    cf = CranFile("cran.all")

    # Get the query collection
    qc = loadCranQry(query_file_path)

    # Normalize the query id to the zero-padded 3-digit form used as keys in
    # query.text (e.g. 1 -> "001", 50 -> "050"); replaces the manual
    # range-by-range padding of the original.
    query_id = str(int(query_id)).zfill(3)
    try:
        query = qc[query_id].text
    except KeyError:
        print("Invalid query id", query_id)
        return

    # Initialize a query processor
    qp = QueryProcessor(query, ii, cf)

    # Do query
    if int(processing_algo) == 0:
        result = qp.booleanQuery()
        if result:
            # BUG FIX: reuse the already-computed result instead of running
            # booleanQuery() a second time just to print it.
            print("Results:", ", ".join(str(x) for x in result))
        else:
            print("Results: None")
    elif int(processing_algo) == 1:
        result = qp.vectorQuery(k=3)
        print("Results:")
        for r in result:
            print("Doc", r[0], "Score", r[1])
    else:
        print("Invalid processing algorithm",
              processing_algo + ". Use 0 (boolean) or 1 (vector).")
def indexingCranfield():
    """Index the Cranfield dataset and save the inverted index to a file.

    Command-line usage: "python index.py cran.all index_file"; the index is
    saved to index_file.
    """
    # Ensure args are valid
    if len(argv) != 3:
        print("Syntax: python index.py <cran.all path> <index-save-location>")
        return

    # Grab arguments
    file_to_index, save_location = argv[1], argv[2]

    # Index every document in the collection
    print("Indexing documents from", file_to_index + "...")
    collection = CranFile(file_to_index)
    index = InvertedIndex()
    for document in collection.docs:
        index.indexDoc(document)

    # Sort index before saving
    index.sort()

    # Compute tf-idf vector representations for each doc
    index.compute_tfidf()

    # Save off index
    index.save(save_location)
    print("Index saved to", save_location + "!")
def test(): ''' test your code thoroughly. put the testing cases here''' # Import the cran.all collection cf = CranFile(sys.argv[1]) # Instantiate an invertedIndex invertedIndex_1 = InvertedIndex() # Index the first 2 documents for i in range(2): print("Indexing document {}\n".format(cf.docs[i].docID)) invertedIndex_1.indexDoc(cf.docs[i], "test") # Check number of document indexed print("# of documents indexed: {}".format(invertedIndex_1.nDocs)) # Check number of terms indexed print("# of terms indexed: {}\n".format( len([item for item in invertedIndex_1.items.iterkeys()]))) # Sort the invertedIndex invertedIndex_1.sort() # Check the posting list, term frequency, and IDF print( "== Statistics for the term 'lift' BEFORE saving the index to disk (invertedIndex_1) ==" ) print("Posting list:\t{}".format( invertedIndex_1.find("lift").sorted_postings)) print("Positions:\t{}".format( invertedIndex_1.find("lift").posting[1].positions)) print("TF:\t\t{}".format( invertedIndex_1.find("lift").posting[1].term_freq())) print("IDF:\t\t{}\n".format(round(invertedIndex_1.idf("lift"), 5))) # Save the invertedIndex invertedIndex_1.save(sys.argv[2]) # Instantiate a new invertedIndex invertedIndex_2 = InvertedIndex() # Load the invertedIndex invertedIndex_2.load(sys.argv[2]) # Check the posting list, term frequency, and IDF print( "== Statistics for the term 'lift' AFTER loading the index from disk (invertedIndex_2) ==" ) print("Posting list:\t{}".format( invertedIndex_2.find("lift").sorted_postings)) print("Positions:\t{}".format( invertedIndex_2.find("lift").posting["1"].positions)) print("TF:\t\t{}".format( invertedIndex_2.find("lift").posting["1"].term_freq())) print("IDF:\t\t{}\n".format(round(invertedIndex_2.idf("lift"), 5))) print('Pass')
def indexingCranfield():
    '''Index the Cranfield dataset and save the inverted index to a file.

    Command-line usage: "python index.py cran.all index_file"; the index is
    saved to index_file (sys.argv[2]).
    '''
    input_filename = sys.argv[1]   # e.g. "cran.all"
    ouput_filename = sys.argv[2]   # e.g. "Index.json"

    # Create the Cranfield-collection and inverted-index objects.
    cf = CranFile(input_filename)
    x = InvertedIndex()

    # Iterate over the collection and index every document.
    # BUG FIX: the original guarded the body with "if i < 1" — a debugging
    # leftover that indexed only the first document of the whole collection.
    for doc in cf.docs:
        x.indexDoc(doc)

    # Save the index to file.
    # BUG FIX: the original called x.save(dictionary, ouput_filename) and also
    # updated `collectionfile.docs`; both `dictionary` and `collectionfile`
    # are unbound in this function and raised NameError at runtime. Saving
    # takes only the output path, matching the other indexer variants.
    x.save(ouput_filename)
    print("index created")
def indexingCranfield():
    """Index the Cranfield dataset and save the index to a file.

    Command-line usage: "python index.py cran.all index_file"; the index is
    saved to index_file via InvertedIndex.storeData().
    """
    source_path = sys.argv[1]
    target_path = sys.argv[2]
    # Sample local paths kept from the original for debugging:
    #   src/CranfieldDataset/cran.all -> src/Data/tempFile
    #   ./CranfieldDataset/cran.all   -> ./Data/tempFile

    indexer = InvertedIndex()
    collection = CranFile(source_path)

    # Feed every document of the collection into the index.
    for document in collection.docs:
        indexer.indexDoc(document)

    # Persist the finished index.
    indexer.storeData(target_path)
    print("Done")
def indexingCranfield(collectionname, indexfilename):
    '''Index the Cranfield dataset and save the index to a file.

    Command-line usage: "python index.py cran.all index_file".

    Args:
        collectionname: path to the Cranfield collection file (cran.all).
        indexfilename: path the inverted index is saved to.
    '''
    cf = CranFile(collectionname)
    iindex = InvertedIndex()
    for doc in cf.docs:
        iindex.indexDoc(doc)

    # Stop-word removal: drop every indexed term that appears in the local
    # "stopwords" file (one word per line).
    with open("stopwords") as f:
        for line in f:
            term = line.strip()  # strip once instead of three times per line
            if term in iindex.items:
                del iindex.items[term]

    # Touch idf for every remaining term — presumably this caches the idf
    # value inside the index before it is saved; TODO confirm against
    # InvertedIndex.idf, since the return value is discarded here.
    for terms in iindex.items:
        iindex.idf(terms)

    iindex.save(indexfilename)
    # BUG FIX: corrected the user-facing message typo "builded" -> "built".
    print("Index built successfully")
def eval():
    '''Compare boolean vs. vector retrieval with NDCG over random queries.

    Algorithm:
      - Pick N random samples from query.txt
      - Get top 10 results from bool query for each rnd query
      - Get top 10 results from vector query for each rnd query
      - Compute NDCG btn bool query results and qrels.txt
      - Compute NDCG btn vector query results and qrels.txt
      - Get p-value btn bool and vector

    NOTE(review): relies on module-level names query_path, index_file,
    qrels_path and n; the function name also shadows the builtin eval() —
    renaming would break callers, so it is kept.
    '''
    # Get the query collection
    qc = loadCranQry(query_path)
    poss_queries = list(qc)

    # Load up the inverted index
    ii = InvertedIndex()
    ii.load(index_file)

    # Load up the document collection
    cf = CranFile("cran.all")

    # Get ground-truth results from qrels.txt
    with open(qrels_path) as f:
        qrels = f.readlines()

    # Index qrels into a dict: query number -> list of relevant doc IDs
    qrel_dict = {}
    for qrel in qrels:
        qrel_split = qrel.split()
        qrel_dict.setdefault(int(qrel_split[0]), []).append(int(qrel_split[1]))

    # Run over N random queries, collecting NDCGs
    bool_ndcgs = []
    vector_ndcgs = []
    for _ in range(n):
        # Get random query ID and zero-pad it to the 3-digit key form used
        # by query.text (e.g. 1 -> "001").
        query_id = str(int(choice(poss_queries))).zfill(3)
        try:
            query = qc[query_id].text
        except KeyError:
            print("Invalid query id", query_id)
            return

        # Initialize the query processor
        qp = QueryProcessor(query, ii, cf)

        # Run bool query
        bool_result = qp.booleanQuery()[:10]

        # Run vector query
        vector_result = qp.vectorQuery(10)

        # Pull top 10 ground-truth results from qrels dict
        gt_results = qrel_dict[poss_queries.index(query_id) + 1][:10]

        # Compute NDCG for bool query
        # NOTE: There is no weighting on the bool query, so give all an even 1
        truth_vector = list(map(lambda x: x in gt_results, bool_result))
        bool_ndcg = ndcg_score(truth_vector, [1] * len(truth_vector),
                               k=len(truth_vector))

        # Compute NDCG for vector query
        vector_docs = [v[0] for v in vector_result]
        vector_scores = [v[1] for v in vector_result]
        truth_vector = list(map(lambda x: x in gt_results, vector_docs))
        vector_ndcg = ndcg_score(truth_vector, vector_scores,
                                 k=len(truth_vector))

        # Accumulate NDCGs
        bool_ndcgs.append(bool_ndcg)
        vector_ndcgs.append(vector_ndcg)

    # Average out score lists.
    # BUG FIX: the original accumulated with "for bool in ..." and
    # "for vector in ...", shadowing the builtins bool/vector-like names;
    # sum()/len() is both safe and clearer.
    bool_avg = sum(bool_ndcgs) / len(bool_ndcgs)
    vector_avg = sum(vector_ndcgs) / len(vector_ndcgs)

    # Present averages and p-values
    print("Boolean NDCG average:", bool_avg)
    print("Vector NDCG average:", vector_avg)
    if n > 19:
        print("Wilcoxon p-value:", wilcoxon(bool_ndcgs, vector_ndcgs).pvalue)
    else:
        print("Wilcoxon p-value: Sample size too small to be significant")
    print("T-Test p-value:", ttest_ind(bool_ndcgs, vector_ndcgs).pvalue)
def query():
    ''' the main query processing program, using QueryProcessor

    Command-line usage: "echo query_string | python query.py index_file processing_algorithm"
    processing_algorithm: 0 for booleanQuery and 1 for vectorQuery
    for booleanQuery, the program will print the total number of documents and the list of document IDs
    for vectorQuery, the program will output the top 3 most similar documents

    Model selection "2" is a benchmarking mode: sys.argv[4] is then reused as
    the NUMBER of random queries, and boolean/vector models are timed over 5
    runs each.
    '''
    # Sample local values kept for debugging:
    #ndexFile = "src/Data/tempFile"
    #model_selection = "0"
    #queryText = 'src/CranfieldDataset/query.text'
    #query_id = "226"
    docCollection = CranFile('CranfieldDataset/cran.all')
    indexFile = sys.argv[1]
    model_selection = sys.argv[2]
    queryText = sys.argv[3]
    query_id = sys.argv[4]
    query_id = str(query_id).zfill(3)  # need for number 001 or 050
    queryTest = ""
    queryFile = loadCranQry(queryText)  #Data Need
    # Modes 0/1 need the text of one specific query; mode 2 draws its own
    # random queries below, so the lookup is skipped there.
    if not model_selection == '2':
        queryTuple = queryFile[query_id]
        if query_id == queryTuple.qid:
            queryTest = queryTuple.text
    queryProcessor = QueryProcessor(queryTest, indexFile, docCollection.docs)
    if model_selection == "0":
        # Boolean model: print the match count and the doc IDs.
        docIDs = queryProcessor.booleanQuery()
        print("Boolean")
        print("Total number of documents is:",
              str(len(docIDs)) + "\nTheir DocIDs our:" + str(docIDs))
    elif model_selection == "1":
        # Vector model: print the top 3 (docID, similarity) pairs.
        print("Vector")
        print(queryProcessor.vectorQuery(3))
    elif model_selection == "2":
        # Benchmark mode: time both models over several runs of random queries.
        numberOfTimeToLoop = 5
        numberOfQueries = int(query_id)  # argv[4] is a count in this mode
        k = 10
        bresults = []  # per-run boolean-model wall times
        vresults = []  # per-run vector-model wall times
        #Data Need
        for _ in range(numberOfTimeToLoop):
            #get list of Query result from qrel.txt
            dictOfQuery = getRandomQuery(queryFile, numberOfQueries)
            queryProcessor = QueryProcessor(
                "", indexFile,
                docCollection.docs)  # This is an extremely expensive process\
            start = timer()
            for __, queryText in dictOfQuery.items():
                queryProcessor.loadQuery(queryText)
                #docIDs = queryProcessor.booleanQuery()
                queryProcessor.booleanQuery()
            end = timer()
            # print("Run:",i+1, "\nTime for boolean model on Query (",numberOfQueries,") \nTime:", end - start, "\n")
            bresults.append(end - start)
            start = timer()
            for __, queryText in dictOfQuery.items():
                #listOfDocIDAndSimilarity = queryProcessor.vectorQuery(k)
                queryProcessor.vectorQuery(k)
            end = timer()
            # print("Run:",i+1, "\nTime for Vector model on Query (",numberOfQueries,") \nTime:", end - start, "\n")
            vresults.append(end - start)
        # Print a run-header row followed by one timing row per model.
        print("Model\t\tRun:" +
              '\t\t\tRun:'.join(map(str, range(numberOfTimeToLoop + 1)[1:])))
        print()
        print("Boolean Model: \t" + '\t'.join(map(str, bresults)))
        print()
        print("Vector Model: \t" + '\t'.join(map(str, vresults)))
        print()
def eval(testOn):
    '''Evaluate boolean vs. vector retrieval with NDCG and report p-values.

    Runs `numberOfTimeToLoop` (5) iterations; each iteration draws
    sys.argv[4] random queries, runs both models on each, scores them with
    metrics.ndcg_score against qrels, and finally prints averages plus
    t-test / Wilcoxon p-values over the accumulated scores.

    Args:
        testOn: when truthy, enables assertions and verbose timing output.

    NOTE(review): shadows the builtin eval(); relies on module-level helpers
    readFile, getRandomQuery, getResultsFrom_QrelsFile, avg, timer, metrics,
    stats — confirm their import block is present in the full file.
    '''
    k = 10  # k the number of top k pairs of (docID, similarity) to get from vectorQuery
    dictQ_ID = []
    indexFile = sys.argv[1]  #v "src/Data/tempFile"
    queryText = sys.argv[2]
    qrelsText = sys.argv[3]
    dictOfQuery = {}
    dictQrelsText = {}
    docCollection = CranFile('./CranfieldDataset/cran.all')
    NDCGScoreBool = []
    numberOfQueries = int(sys.argv[4])
    NDCGScoreVector = []
    #indexFile = "src/Data/tempFile"
    #queryText = 'src/CranfieldDataset/query.text'
    #qrelsText = 'src/CranfieldDataset/qrels.text'
    #numberOfQueries = 50
    numberOfTimeToLoop = 5
    #Loads Files
    listOfQueryRelsMaping = readFile(qrelsText)
    queryFile = loadCranQry(queryText)
    #Data Need
    for i in range(numberOfTimeToLoop):
        #Get random Queiry
        dictOfQuery = getRandomQuery(queryFile, numberOfQueries)
        if testOn:
            assert len(dictOfQuery
                       ) == numberOfQueries, "Error are getting random query"
        # Return all query
        # dictOfQuery = getAllDataItems(queryFile)
        # if testOn:
        #     assert len(dictOfQuery) == 225, "Error are getting random query"
        #get list of Query result from qrel.txt
        dictQrelsText = getResultsFrom_QrelsFile(listOfQueryRelsMaping,
                                                 dictOfQuery)
        if testOn:
            assert len(dictQrelsText
                       ) == numberOfQueries, "Error number Of Queries to large"
        start = timer()
        queryProcessor = QueryProcessor(
            "", indexFile,
            docCollection.docs)  # This is an extremely expensive process\
        end = timer()
        if testOn:
            print("Time for creating QueryProcessor:", end - start)
        countDoc = 0
        start = timer()
        dictQ_ID = []
        for qid, queryText in dictOfQuery.items():
            countDoc += 1
            dictQ_ID.append(qid)
            if testOn:
                print("QID:", qid)
            start = timer()
            queryProcessor.loadQuery(queryText)
            end = timer()
            if testOn:
                print("Time for Load:", end - start)
                print("qrels: ", dictQrelsText[qid])
            start = timer()
            docIDs = queryProcessor.booleanQuery(
            )  # data would need to be like this [12, 14, 78, 141, 486, 746, 172, 573, 1003]
            #docIDs_1 = queryProcessor.booleanQuery_1()
            end = timer()
            if testOn:
                print("Time for booleanQuery:", end - start)
            start = timer()
            listOfDocIDAndSimilarity = queryProcessor.vectorQuery(
                k
            )  # data need to look like k=3 [[625,0.8737006126353902],[401,0.8697643788341478],[943,0.8424991316663082]]
            #vectorQueryDict[qid] = dictOfDocIDAndSimilarity
            end = timer()
            if testOn:
                print("Time for vectorQuery:", end - start)
                print("booleanQuery:", docIDs)
            #For Boolean part
            # Boolean hits all score 1; relevance labels come from qrels.
            start = timer()
            yTrue = []
            yScore = []
            for docID in docIDs:
                yScore.append(1)
                if docID in dictQrelsText[qid]:
                    yTrue.append(1)
                else:
                    yTrue.append(0)
            # Sorting labels descending makes the ideal-DCG denominator.
            yTrue.sort(reverse=True)
            score = metrics.ndcg_score(yTrue[:k], yScore[:k], k, "exponential")
            # NaN appears for empty/all-zero relevance; count it as 0.
            if math.isnan(score):
                NDCGScoreBool.append(0)
            else:
                NDCGScoreBool.append(score)
            end = timer()
            if testOn:
                print("Time for Boolean ndcg:", end - start)
            #For Vector part
            start = timer()
            yTrue = []
            yScore = []
            if testOn:
                print("vectorQuery:", listOfDocIDAndSimilarity)
            for docID_Score in listOfDocIDAndSimilarity:
                yScore.append(float(docID_Score[1]))
                if docID_Score[0] in dictQrelsText[qid]:
                    yTrue.append(1)
                else:
                    yTrue.append(0)
            yTrue.sort(reverse=True)
            score = metrics.ndcg_score(yTrue[:k], yScore[:k], k, "exponential")
            if math.isnan(score):
                NDCGScoreVector.append(0)
            else:
                NDCGScoreVector.append(score)
            end = timer()
            if testOn:
                print("Time for Vector ndcg:", end - start)
        print("\nRunning Querys iteration:(", str(i + 1), ")\n", dictQ_ID)
    # Summary over ALL accumulated iterations.
    # NOTE(review): indentation of this tail is ambiguous in the original
    # paste — it may have been inside the outer loop; confirm intended scope.
    if testOn:
        for QID, boolScore, vectorScore in zip(dictQ_ID, NDCGScoreBool,
                                               NDCGScoreVector):
            print("QID", QID, "Boolean Model:", boolScore, "Vector Model",
                  vectorScore)
    print("\nThe Length Of Both NDCG Score is: ", len(NDCGScoreBool), "==",
          len(NDCGScoreVector))
    print('\nThe Avg NDCG Score')
    vectorAvg = avg(NDCGScoreVector)
    BoolAvg = avg(NDCGScoreBool)
    print("Avg NDCG Score for Bool:", BoolAvg,
          "\nAvg NDCG Score for Vector:", vectorAvg)
    end = timer()
    if testOn:
        print("\n\nTime for running ", countDoc, " queries:", end - start)
    print('\nThe P-Value')
    p_va_ttest = stats.ttest_ind(NDCGScoreBool, NDCGScoreVector)
    p_va_wilcoxon = stats.wilcoxon(NDCGScoreBool, NDCGScoreVector)
    print("T-Test P-value: ", p_va_ttest)
    print("Wilcoxon P-value: ", p_va_wilcoxon)
    print('Done')
print("Total number of retrieved document for search is", len(Bresult)) print(Bresult) BoolenQueryResultDic.append({qid: Bresult}) else: print("Vector Query TF-IDF calculation in progress") Topk, k = qprocessorobj.vectorQuery(3) #print("vector",qid,qrys[qid].text) print("Top", k, "(DocID Similarity)", Topk[:k]) ''' ************this below code is reused in batch_eval also*******************''' input_filename = "cran.all" ouput_filename = sys.argv[1] #"index_file" #sys.argv[2] Queryfile = "query.text" #sys.argv[3]#"query.text" '''creating object for cranefile and collection file and inverted index class,postings class''' cf = CranFile(input_filename) collectionfile = Collection() indexobj = InvertedIndex() 'iterating over cran file for document id' for i, doc in enumerate(cf.docs): collectionfile.docs.update({doc.docID: doc}) postingobj = Posting(doc.docID) '''reading index file which is stored while creating index''' with open(ouput_filename, "r") as invertedindex: InvertedIndex.items = json.load(invertedindex) 'formatting the query id in qrel.text and finding common query id in qrery.text' qidlist = {} qrys = loadCranQry(Queryfile) for position, q in enumerate(qrys): qidlist[q] = position + 1 'Below Variables are used for batch_eval.py file'
def test(index_loc, cran_loc, qrels_loc):
    ''' test your code thoroughly. put the testing cases here

    Runs print-style checks against a previously saved index: tf/idf sanity,
    boolean-query semantics (AND/OR/NOT, parens, order, associativity), and
    vector-query precision spot checks against qrels ground truth.

    Args:
        index_loc: path to a saved inverted-index file.
        cran_loc: path to the cran.all document collection.
        qrels_loc: path to the qrels ground-truth file.
    '''
    ##### SETUP ITEMS #####
    # Grab index file to restore II
    ii = InvertedIndex()
    ii.load(index_loc)

    # Get the document collection
    cf = CranFile(cran_loc)

    # Get ground-truth results from qrels.txt
    with open(qrels_loc) as f:
        qrels = f.readlines()

    # Index qrels into a dict: query number -> list of relevant doc IDs
    qrel_dict = {}
    for qrel in qrels:
        qrel_split = qrel.split()
        if int(qrel_split[0]) in qrel_dict:
            qrel_dict[int(qrel_split[0])].append(int(qrel_split[1]))
        else:
            qrel_dict[int(qrel_split[0])] = [int(qrel_split[1])]

    ##### INITIAL TEST ITEMS #####
    print("TESTS BASED ON SUGGESTED TESTING POINTS")

    # Ensure tf is correct
    # Find a random word and check TF value against what is manually done
    posting_list = ii.find("experiment").posting
    tf_vector = []
    for posting in posting_list:
        # term_freq() must equal the number of recorded positions
        tf_vector.append(len(posting_list[posting].positions) \
            == posting_list[posting].term_freq())
    print("TF is computed correctly:", all(tf_vector))

    # Ensure idf is correct: idf = log10(N / df)
    print("IDF is computed correctly:", log10(ii.nDocs / len(posting_list)) \
        == ii.idf("experiment"))

    # As both tf and idf are correct, and tf-idf is a product of the two,
    # it is reasonable to assume tf-idf is computed correctly

    ##### BOOL QUERY TESTS #####
    # Here, I use very specific boolean queries to ensure that a
    # limited number of documents are returned
    print("\nBOOL QUERY TESTS")

    # Ensure that the exact title of doc 8 matches for doc 8
    doc8 = "measurements of the effect of two-dimensional and three-dimensional roughness elements on boundary layer transition"
    qp1 = QueryProcessor(doc8, ii, cf)
    print("Bool query matches on exact title:", qp1.booleanQuery() == [8])

    # Ensure that bool query matches very specific AND query
    qp2 = QueryProcessor("hugoniot and infinitesimally", ii, cf)
    print(
        "Bool query matches on specific AND query ('hugoniot and infinitesimally'):",
        qp2.booleanQuery() == [329])

    # Test that an OR query is handled properly
    # Both gravel and stagnation have completely distinct postings lists.
    # OR should merge them.
    gravel_postings = ii.find("gravel").sorted_postings[:]
    stag_postings = ii.find("stagnat").sorted_postings[:]
    gravel_postings.extend(stag_postings)
    qp3 = QueryProcessor("gravel or stagnation", ii, cf)
    print("Bool query successfully handles OR ('gravel or stagnation'):",
          qp3.booleanQuery() == sorted(gravel_postings))

    # Test that NOT is handled properly
    # The posting list for "diameter" is a subset of "slipstream" postings
    # (oddly enough). To test this works, do "slipstream and not diameter"
    # and we should get slipstream's postings minus those of diameter.
    slip_postings = ii.find("slipstream").sorted_postings[:]
    diam_postings = ii.find("diamet").sorted_postings[:]
    slip_not_diam = [t for t in slip_postings if t not in diam_postings]
    print("Bool query successfully handles NOT ('slipstream and not diameter'):",
          QueryProcessor("slipstream and not diameter", ii, cf).booleanQuery() \
          == slip_not_diam)

    # Ensure AND/OR order doesn't matter
    print("Bool query can handle query regardless of AND order ('a and b' = 'b and a'):",
          QueryProcessor("slipstream and diameter", ii, cf).booleanQuery() \
          == QueryProcessor("diameter and slipstream", ii, cf).booleanQuery())
    print("Bool query can handle query regardless of OR order ('a or b' = 'b or a'):",
          QueryProcessor("slipstream or diameter", ii, cf).booleanQuery() \
          == QueryProcessor("diameter or slipstream", ii, cf).booleanQuery())

    # Ensure that the presence of parens does not change query results
    print("Bool query can handle query regardless of parens ('slipstream and diameter'):",
          QueryProcessor("slipstream and diameter", ii, cf).booleanQuery() \
          == QueryProcessor("(slipstream and diameter)", ii, cf).booleanQuery())

    # Ensure parentheses do not change order of processing for AND-AND and OR-OR queries
    print("Bool query AND is accociative ('(a and b) and c' = 'a and (b and c)'):",
          QueryProcessor("(slipstream and diameter) and thrust", ii, cf).booleanQuery() \
          == QueryProcessor("slipstream and (diameter and thrust)", ii, cf).booleanQuery())
    print("Bool query OR is accociative ('(a or b) or c' = 'a or (b or c)'):",
          QueryProcessor("(slipstream or diameter) or thrust", ii, cf).booleanQuery() \
          == QueryProcessor("slipstream or (diameter or thrust)", ii, cf).booleanQuery())

    # Ensure parentheses properly group items
    # Tested by doing the query "manually" by adding/orring the correct terms
    part_one = QueryProcessor("conduction and cylinder and gas", ii,
                              cf).booleanQuery()
    part_two = QueryProcessor("radiation and gas", ii, cf).booleanQuery()
    part_one.extend(part_two)
    expected_result = QueryProcessor("hugoniot", ii, cf).booleanQuery()
    expected_result.extend(part_one)
    print("Bool query parens successfully group conflicting operators:",
          QueryProcessor("(conduction and cylinder and gas) or (radiation and gas) or hugoniot", ii, cf).booleanQuery() \
          == sorted(list(set(expected_result))))

    ##### VECTOR QUERY TESTS #####
    # For this, just ensure that most of the results are in the expected list
    print("\nVECTOR QUERY TESTS")

    # Ensure vector query can match on exact title
    print("Vector query matches on exact title:",
          qp1.vectorQuery(1)[0][0] == 8)

    # Try a few example queries from query.text
    # As long as one-fifth of t-10 are in gt_result, call it a pass
    # Note that queries with larger answer sets were chosen to
    # ensure there were enough to get to one-fifth of ten
    qc = loadCranQry("query.text")
    poss_queries = list(qc)

    # Query 001
    result = QueryProcessor(qc["001"].text, ii, cf).vectorQuery(10)
    gt_result = qrel_dict[poss_queries.index("001") + 1]
    correct_vector = list(map(lambda x: x in gt_result,
                              [x[0] for x in result]))
    print("Vector query is at least one-fifth correct for query 001:",
          sum(correct_vector) > 2)

    # Query 128
    result = QueryProcessor(qc["128"].text, ii, cf).vectorQuery(10)
    gt_result = qrel_dict[poss_queries.index("128") + 1]
    correct_vector = list(map(lambda x: x in gt_result,
                              [x[0] for x in result]))
    print("Vector query is at least one-fifth correct for query 128:",
          sum(correct_vector) > 2)

    # Query 226
    result = QueryProcessor(qc["226"].text, ii, cf).vectorQuery(10)
    gt_result = qrel_dict[poss_queries.index("226") + 1]
    correct_vector = list(map(lambda x: x in gt_result,
                              [x[0] for x in result]))
    print("Vector query is at least one-fifth correct for query 226:",
          sum(correct_vector) > 2)

    # Query 196
    result = QueryProcessor(qc["196"].text, ii, cf).vectorQuery(10)
    gt_result = qrel_dict[poss_queries.index("196") + 1]
    correct_vector = list(map(lambda x: x in gt_result,
                              [x[0] for x in result]))
    print("Vector query is at least one-fifth correct for query 196:",
          sum(correct_vector) > 2)

    # Query 291
    result = QueryProcessor(qc["291"].text, ii, cf).vectorQuery(10)
    gt_result = qrel_dict[poss_queries.index("291") + 1]
    correct_vector = list(map(lambda x: x in gt_result,
                              [x[0] for x in result]))
    print("Vector query is at least one-fifth correct for query 291:",
          sum(correct_vector) > 2)
def test():
    ''' test your code thoroughly. put the testing cases here

    Asserts that indexing cran.all reproduces known-good fixtures: the exact
    idf of "experiment" and "opportun", the posting-list length and
    per-document term frequencies of "experiment" and "bifurc", and that
    save / storeData / loadData round-trips preserve the idf.
    '''
    # Expected per-document term frequencies for the term "experiment",
    # keyed by docID string (ground-truth fixture).
    dictTest_experiment = {
        '1': 3, '11': 1, '12': 1, '16': 1, '17': 1, '19': 1, '25': 1, '29': 1,
        '30': 2, '35': 1, '37': 1, '41': 1, '42': 1, '43': 1, '47': 1,
        '52': 2, '53': 1, '58': 1, '69': 1, '70': 1, '74': 2, '78': 2,
        '84': 3, '99': 2, '101': 1, '103': 1, '112': 1, '115': 1, '121': 1,
        '123': 3, '131': 1, '137': 1, '140': 1, '142': 1, '154': 1, '156': 1,
        '167': 1, '168': 1, '170': 1, '171': 2, '173': 2, '176': 1, '179': 2,
        '183': 1, '184': 1, '186': 3, '187': 1, '188': 1, '189': 2, '191': 1,
        '195': 3, '197': 2, '202': 1, '203': 1, '206': 2, '207': 2, '212': 1,
        '216': 1, '220': 1, '222': 1, '225': 2, '227': 1, '230': 1, '234': 4,
        '245': 1, '251': 1, '256': 3, '257': 1, '262': 1, '271': 3, '273': 1,
        '277': 1, '282': 1, '283': 1, '286': 1, '287': 1, '289': 1, '294': 1,
        '295': 1, '304': 1, '307': 1, '329': 2, '330': 2, '334': 2, '338': 1,
        '339': 2, '344': 3, '345': 1, '346': 3, '347': 1, '354': 1, '360': 1,
        '369': 1, '370': 1, '372': 3, '377': 1, '397': 1, '409': 1, '411': 2,
        '413': 2, '418': 1, '420': 2, '421': 1, '423': 2, '427': 1, '433': 1,
        '435': 1, '439': 1, '441': 2, '442': 3, '443': 1, '453': 1, '455': 2,
        '462': 1, '464': 1, '467': 1, '484': 3, '494': 2, '496': 1, '497': 2,
        '498': 1, '501': 1, '503': 1, '504': 1, '505': 1, '511': 1, '517': 1,
        '518': 2, '519': 1, '520': 2, '522': 3, '536': 1, '540': 1, '544': 3,
        '549': 2, '552': 2, '553': 1, '558': 2, '563': 1, '567': 1, '569': 2,
        '572': 4, '588': 1, '595': 1, '600': 1, '606': 1, '610': 1, '632': 1,
        '634': 1, '635': 1, '636': 1, '644': 1, '645': 1, '649': 1, '658': 1,
        '662': 2, '663': 2, '666': 2, '670': 1, '675': 1, '678': 1, '679': 1,
        '685': 3, '688': 4, '689': 2, '694': 1, '704': 2, '712': 1, '713': 1,
        '717': 1, '720': 1, '725': 1, '728': 1, '729': 1, '739': 1, '740': 1,
        '743': 1, '753': 1, '760': 3, '764': 1, '766': 4, '767': 3, '772': 3,
        '781': 2, '790': 1, '801': 3, '802': 1, '806': 1, '816': 2, '820': 1,
        '823': 2, '825': 1, '827': 2, '829': 1, '830': 1, '836': 5, '844': 1,
        '845': 2, '846': 1, '847': 2, '856': 4, '857': 3, '858': 3, '863': 2,
        '866': 2, '867': 1, '869': 1, '878': 2, '881': 1, '887': 1, '891': 2,
        '907': 1, '911': 2, '912': 2, '923': 1, '924': 1, '927': 3, '928': 3,
        '932': 2, '935': 2, '946': 2, '950': 4, '951': 1, '954': 2, '955': 1,
        '959': 1, '961': 1, '964': 1, '965': 1, '973': 1, '974': 1, '984': 2,
        '986': 4, '996': 1, '997': 4, '999': 1, '1006': 1, '1008': 1,
        '1016': 1, '1019': 3, '1028': 1, '1039': 3, '1040': 1, '1045': 1,
        '1046': 1, '1049': 2, '1051': 1, '1062': 3, '1066': 4, '1069': 1,
        '1070': 1, '1074': 3, '1075': 3, '1076': 1, '1078': 1, '1080': 1,
        '1081': 1, '1082': 1, '1083': 1, '1092': 1, '1097': 3, '1098': 2,
        '1110': 1, '1112': 1, '1118': 2, '1122': 2, '1125': 1, '1127': 1,
        '1145': 1, '1146': 1, '1151': 1, '1153': 1, '1155': 2, '1156': 3,
        '1158': 1, '1159': 2, '1160': 2, '1161': 2, '1167': 2, '1171': 1,
        '1172': 1, '1177': 1, '1185': 1, '1186': 1, '1187': 1, '1192': 1,
        '1195': 1, '1196': 2, '1198': 1, '1199': 1, '1204': 3, '1205': 1,
        '1209': 3, '1212': 1, '1213': 2, '1214': 3, '1216': 1, '1218': 2,
        '1220': 1, '1222': 1, '1225': 3, '1227': 1, '1228': 1, '1230': 1,
        '1231': 1, '1234': 1, '1237': 1, '1261': 1, '1262': 1, '1263': 2,
        '1264': 2, '1268': 1, '1269': 2, '1277': 2, '1290': 1, '1298': 1,
        '1302': 2, '1310': 1, '1314': 2, '1317': 1, '1319': 1, '1324': 1,
        '1337': 3, '1338': 2, '1339': 1, '1341': 1, '1352': 2, '1363': 2,
        '1364': 2, '1369': 1, '1372': 1, '1374': 1, '1378': 1, '1384': 1,
        '1390': 1, '1392': 1, '1396': 1, '1397': 1
    }
    # Ground truth for the (much rarer) stemmed term "bifurc".
    dictTest_bifurc = {'957': 1, '1232': 1}
    filePath = "src/CranfieldDataset/cran.all"
    fileName = "src/Data/Test.json"
    fileNameO = "src/Data/TestPickle"
    #filePath = "./CranfieldDataset/cran.all"
    #fileName = "./Data/tempFile.json"
    invertedIndexer = InvertedIndex()
    data = CranFile(filePath)
    # Build the full index over all 1400 documents.
    for doc in data.docs:
        invertedIndexer.indexDoc(doc)
    #TF-IDF TEST
    TEMP = invertedIndexer.idf("experiment")
    Temp1 = invertedIndexer.idf("opportun")
    # idf values are compared through their string form against fixed fixtures.
    t = str(TEMP)
    t2 = str(Temp1)
    assert t == "0.6172", " Wrong idf."
    assert t2 == "2.8451", " Wrong idf."
    assert len(invertedIndexer.find("experiment").get_posting_list()
               ) == 338, "Worng Lenght for experiment term find does not work."
    assert invertedIndexer.get_total_number_Doc(
    ) == 1400, "Worng total nubmer of Doc in Corpus"
    # Every posting for "experiment" must match the fixture tf values.
    for docID, post in invertedIndexer.find(
            "experiment").get_posting_list().items():
        assert docID in dictTest_experiment and post.term_freq(
        ) == dictTest_experiment[docID], "For Term experiment wrong value"
    dictTest_bifurc = {'957': 1, '1232': 1}
    for docID, post in invertedIndexer.find(
            "bifurc").get_posting_list().items():
        assert docID in dictTest_bifurc and post.term_freq(
        ) == dictTest_bifurc[docID], "For Term experiment wrong value"
    # Round-trip checks: JSON save, pickle store, then reload and re-check idf.
    invertedIndexer.save(fileName)
    assert path.exists(fileName), "error in saving json data."
    invertedIndexer.storeData(fileNameO)
    assert path.exists(fileNameO), "error in saving json data."
    Temp = invertedIndexer.loadData(fileNameO)
    idfScore = Temp.idf("experiment")
    assert str(idfScore) == "0.6172", " Error in Load the picle file."
    print("test Passed")
def vectorQuery(self, k):
    ''' vector query processing, using the cosine similarity.

    Builds a TF-IDF weighted vector for the query whose id matches
    self.queryId and, for every document in cran.all, a TF-IDF weighted
    document vector; scores each document by the sum of the element-wise
    products (document vector is length-normalized, query vector is not —
    the query-normalization code below is commented out), then returns the
    docIDs of the k highest-scoring documents.

    :param k: number of top (highest cosine score) docIDs to return
    :return: list of up to k docIDs, ranked by descending score

    NOTE(review): assumes self.raw_query maps query-id -> object with a
    .text attribute, self.index.items maps term -> {'idf': ..., 'posting':
    {docID: {'termfreq': ...}}}, and self.intermediateResultVectorQuery is
    a dict initialized elsewhere (presumably in __init__) — confirm.
    NOTE(review): re-opens the "stopwords" file once per token, and mutates
    the token list while an index cursor walks it — fragile but reproduced
    as-is.
    '''
    #ToDo: return top k pairs of (docID, similarity), ranked by their cosine similarity with the query in the descending order
    # You can use term frequency or TFIDF to construct the vectors
    #constructing document vector for document 1
    vectorResult = []
    cf = CranFile('cran.all')  # full document collection, read from disk
    documentVector = {}  # term -> tf-idf weight for the current document
    queryVector = {}     # term -> tf-idf weight for the query
    ps = PorterStemmer()
    finalResult = {}  # NOTE(review): never used below
    for q in self.raw_query:
        # Only the query whose id matches self.queryId is processed.
        if q == self.queryId:
            query_tokens = []
            stemmed_query_tokens = []  # NOTE(review): never used below
            # print(q, self.raw_query[q].text)
            # query_tokens = re.split(" ", self.raw_query[q].text.replace('\n', ' '))
            query_tokens = word_tokenize(self.raw_query[q].text)
            query_tokens = [element.lower() for element in query_tokens];
            # Spell-correct each token in place (correction() is an external helper).
            tempcounter = 0
            while tempcounter < len(query_tokens):
                query_tokens[tempcounter] = correction(query_tokens[tempcounter]);
                tempcounter = tempcounter + 1
            ps = PorterStemmer()
            # Stem each token, then drop it if it is a stopword.
            # The "stopwords" file is re-opened on every iteration; when a
            # token is removed the cursor is stepped back so the element
            # shifted into this slot is not skipped.
            temp = 0
            querytokentemp = 0
            while temp < len(query_tokens):
                query_tokens[temp] = ps.stem(query_tokens[temp])
                querytokentemp = querytokentemp + 1
                with open("stopwords") as f:
                    for line in f:
                        if line.strip() == query_tokens[temp]:
                            query_tokens.remove(line.strip())
                            temp = temp - 1
                temp = temp + 1
            #block to calculate query vector start
            # weight = idf * (1 + log10(term frequency in the query));
            # terms not present in the index get weight 0.
            temp2 = 0
            while temp2 < len(query_tokens):
                if query_tokens[temp2] in self.index.items:
                    wordfreq = [query_tokens.count(query_tokens[temp2])]
                    # print(wordfreq)
                    queryVector[query_tokens[temp2]] = (self.index.items[query_tokens[temp2]].get('idf') )* (1 + math.log( wordfreq[0] , 10))
                    temp2 = temp2 + 1
                else:
                    queryVector[query_tokens[temp2]] = 0;
                    temp2 = temp2 + 1
            #block to calculate query vector end
            docidScorepair = {}  # docID -> similarity score
            for doc in cf.docs:
                # print(doc.docID, doc.title, doc.body)
                # print("generating document vector here")
                # Tokenize title + body, lowercase, strip stopwords, stem —
                # same pipeline as the query above.
                titletoken = word_tokenize(doc.title)
                bodytoken = word_tokenize(doc.body)
                tokens = titletoken + bodytoken
                tokens = [element.lower() for element in tokens];
                temp3 = 0
                while temp3 < len(tokens):
                    with open("stopwords") as f:
                        for line in f:
                            if line.strip() == tokens[temp3]:
                                tokens.remove(line.strip())
                                temp3 = temp3 - 1
                    temp3 = temp3 + 1
                temp = 0
                while temp < len(tokens):
                    tokens[temp] = ps.stem(tokens[temp])
                    temp = temp + 1
                # Document weight = (1 + log10(tf in this doc)) * idf,
                # looked up from the loaded inverted index.
                temp2 = 0
                while temp2 < len(tokens):
                    if tokens[temp2] in self.index.items:
                        documentVector[tokens[temp2]] = (1 + math.log(self.index.items[tokens[temp2]].get('posting').get(doc.docID).get('termfreq'),10)) * (self.index.items[tokens[temp2]].get('idf'))
                        temp2 = temp2 + 1
                    else:
                        documentVector[tokens[temp2]] = 0;
                        temp2 = temp2 + 1
                # print('document vector complete')
                #print(documentVector)
                # without normalization
                #normalize query vector and document vector start
                normalizequeryvectorcounter = 0   # NOTE(review): unused
                queryVectornormalized = []        # NOTE(review): unused
                # Query-vector normalization deliberately disabled:
                # sumofsquaresquery = 0
                # for z in queryVector:
                #     sumofsquaresquery = sumofsquaresquery + np.multiply(queryVector[z] , queryVector[z])
                # sumofsquaresquery = 1 / math.sqrt(sumofsquaresquery)
                # for r in queryVector:
                #     queryVector[r] = queryVector[r] * sumofsquaresquery
                # Length-normalize the document vector; bare except guards
                # the all-zero vector (division by sqrt(0)).
                sumofsquaresdocument = 0
                for l in documentVector:
                    sumofsquaresdocument = sumofsquaresdocument + np.multiply(documentVector[l], documentVector[l])
                try:
                    sumofsquaresdocument = 1 / math.sqrt(sumofsquaresdocument)
                except:
                    sumofsquaresdocument = 0
                for h in documentVector:
                    documentVector[h] = documentVector[h] * sumofsquaresdocument
                #noramlize ends
                # Element-wise product of query and document weights; terms
                # absent from the document contribute 0.
                cosineVector = queryVector.copy()
                for u in queryVector:
                    if u in documentVector:
                        cosineVector[u] = np.multiply(documentVector[u], queryVector[u])
                    else:
                        #below line is wrong
                        # cosineVector[k] = queryVector[k]
                        cosineVector[u] = 0
                # print ("query vector -->")
                # print(queryVector)
                # print ("document vector -->")
                # print( documentVector)
                # print ("cosine vector -->")
                # print(cosineVector)
                # print ("****************************")
                # document score: dot product of the two vectors
                docidScorepair[doc.docID] = sum(cosineVector.values())
                #end of document score
                self.intermediateResultVectorQuery[q] = docidScorepair
                cosineVector = {}
                #end without normalization
                documentVector = {}
                queryVector = {}
            # print(query_tokens)
            # Rank documents by score and keep the top k.
            counterObject = Counter(self.intermediateResultVectorQuery[q])
            high = counterObject.most_common(k)
            # print('*** query id ***'+q + "***** query text *****" +self.raw_query[q].text)
            if k == 3:
                # Matches the assignment requirement to show the top-3 pairs.
                print(high)
            vectorResult = [i[0] for i in counterObject.most_common(k)]
            # print(vectorResult)
    return vectorResult
# command line usage: "python index.py cran.all index_file" # the index is saved to index_file coll={} collect= Collection() #creating object invertindex=InvertedIndex() #adding all documents to collection class for docu in cf.docs: coll={docu.docID:[docu.title,docu.author,docu.body]} collect.docs.update(coll) #invertindex.docs.update(coll) for docu in cf.docs: invertindex.indexDoc(docu) #save to json file invertindex.save(indexfile) # load from json file invertindex.load(indexfile) if __name__ == '__main__': #input cran file crfile =sys.argv[1] #output index file indexfile=sys.argv[2] cf = CranFile (crfile) indexingCranfield() test()
def test():
    ''' test your code thoroughly. put the testing cases here

    Smoke-tests the InvertedIndex, Posting, and IndexItem classes against
    the Cranfield collection, printing a human-checkable result line per
    case. Requires "cran.all" and "stopwords" in the working directory and
    writes "index.pkl" as a save/load round-trip fixture.
    '''
    ####### TEST CASES FOR INVERTED INDEX CLASS #######
    # Get all documents from cran.all--let Cranfile object handle this
    cf = CranFile("cran.all")

    # Build an inverted index object
    ii = InvertedIndex()

    # Index one document
    ii.indexDoc(cf.docs[0])

    # The first term should be "experiment" (verified by printing contents of II)
    # We want to ensure that find() finds it
    index_item = ii.find("experiment")
    print("Result of find:", index_item.term, index_item.posting)

    # Next, sort to ensure that it works
    ii.sort()
    print("Sorted!")

    # Get the IDF of the term "experiment"
    # Following the formula from our slides, this should be 0
    print("IDF:", ii.idf("experiment"))

    # Add back in the rest of Cranfield dataset
    for doc in cf.docs[1:]:
        ii.indexDoc(doc)

    # Re-do find now that we have more things in the index
    index_item = ii.find("experiment")
    print("Result of find:", index_item.term, index_item.posting)

    # Ensure sort works on larger index
    ii.sort()
    print("Sorted!")

    # Calculate IDF with larger index
    print("IDF:", ii.idf("experiment"))

    # Get the tfidf dict
    ii.compute_tfidf()

    # Save off our index
    ii.save("index.pkl")

    # Read back in the index, ensure they are the same
    ii_from_file = InvertedIndex()
    ii_from_file.load("index.pkl")

    # Cannot determine if the actual items are equal objects,
    # so just ensure the stats are the same
    # print("Load matches saved items:", ii.items == ii_from_file.items)
    print("Load matches saved number of docs:", ii.nDocs == ii_from_file.nDocs)
    print("Load matches saved IDF for 'experiment':",
          ii.idf("experiment") == ii_from_file.idf("experiment"))
    print("Load matches saved find term for 'experiment':",
          ii.find("experiment").term == ii_from_file.find("experiment").term)
    print(
        "Load matches saved find posting for 'experiment':",
        str(ii.find("experiment").posting) == str(
            ii_from_file.find("experiment").posting))

    ####### TEST CASES FOR POSTING CLASS #######
    # Create test posting
    p = Posting(docID=1)

    # Test adding a position
    p.append(3)
    print("Position appended to posting:", p.positions == [3])

    # Add position out of order, ensure sort works
    p.append(1)
    print("Append is initially out-of-order:", p.positions == [3, 1])
    p.sort()
    print("Sort correctly sorts postings:", p.positions == [1, 3])

    # Ensure we can merge in new postings
    to_merge = [4, 5, 6]
    p.merge(to_merge)
    print("Merge correctly merges:", p.positions == [1, 3, 4, 5, 6])

    # Ensure term frequency is counted correctly
    print("Term frequency correctly counts postings:", p.term_freq() == 5)

    ####### TEST CASES FOR INDEX ITEM CLASS #######
    # Create index item
    iitem = IndexItem("abc")

    # Add value to index item
    iitem.add(0, 40)
    print("Document added to item:", 0 in iitem.posting)
    # FIX: was `type(iitem.posting[0]) == type(Posting(5))`; isinstance is
    # the idiomatic (and subclass-safe) type check.
    print("Posting created for document in item:",
          isinstance(iitem.posting[0], Posting))

    ####### ADDITIONAL TEST CASES #######
    print("\nTHE FOLLOWING ARE BASED ON THE GIVEN TEST QUESTIONS")

    # Act on the assumption all words are stemmed
    # This should be done in the tokenize part of util
    # The idea was to re-stem all words and ensure they equal the words
    # in the index, but some double-stemmings differ anyway.

    # Ensure stopwords were removed
    from nltk.stem.porter import PorterStemmer
    with open("stopwords") as f:
        stopwords = f.readlines()
    s = PorterStemmer()
    # FIX: the original tested membership in ii.items.items(), whose
    # elements are (term, IndexItem) tuples — a stemmed stopword string
    # could never match, so the check passed vacuously. Test against the
    # dict's keys instead.
    stopword_vector = [s.stem(word.strip()) in ii.items for word in stopwords]
    print("All stopwords removed from index:", not any(stopword_vector))

    # Print number of terms in dict--Dr. Chen can ensure this is right
    print("Number of terms in dictionary:", len(ii.items))

    # Print average size of postings--Dr. Chen can ensure this makes sense
    # FIX: renamed accumulator from `sum`, which shadowed the builtin.
    total_positions = 0
    posting_count = 0
    for item in ii.items.values():
        for posting in item.posting.values():
            total_positions += len(posting.positions)
            posting_count += 1
    print("Average posting length:", total_positions / posting_count)
def eval(indexfilename, queryfilename, queryrefilename, numberofrandomqueries):
    # ToDo
    '''Evaluate boolean vs. vector retrieval with NDCG over random queries.

    Samples `numberofrandomqueries` query ids from query.text, runs both the
    vector ('1') and boolean ('0') modes of the sibling query() function for
    each, converts the returned docID lists into binary relevance vectors
    using the judgments in `queryrefilename`, computes an NDCG score per
    query per mode, and finally prints a Wilcoxon signed-rank comparison of
    the two score lists.

    :param indexfilename: path of the saved inverted index, passed to query()
    :param queryfilename: path of the query file, passed to query()
    :param queryrefilename: path of the relevance-judgment file
        (whitespace-separated lines: query-number doc-id ...)
    :param numberofrandomqueries: how many queries to sample (max 225)
    :raises Exception: if more than 225 queries are requested

    NOTE(review): loads "query.text", "index_file.pickle" and "cran.all" by
    hard-coded name regardless of the parameters — confirm intended.
    '''
    actual = []  # NOTE(review): unused
    #
    if numberofrandomqueries > 225:
        raise Exception('please enter query count less than or equal to 225')
    qrys = loadCranQry("query.text")
    validqueries = []
    querycounter = 0
    for q in qrys:
        validqueries.append(int(q))
    loadiindex = InvertedIndex()
    loadiindex = loadiindex.load("index_file.pickle")
    # print("index loaded")
    cf = CranFile('cran.all')
    #QueryProcessor.numberofresult =10
    #qp = QueryProcessor(qrys,loadiindex,cf.docs,10)
    # Build query-number -> [relevant doc ids] from the judgment file.
    # NOTE(review): the file handle from open() is never closed.
    queryRelevence = dict()
    for line in open(queryrefilename):
        fields = line.split(" ")
        fields[0] = '%0*d' % (3, int(fields[0]))  # zero-pad to 3 digits
        if fields[0] in queryRelevence:
            # and let's extract the data:
            queryRelevence[fields[0]].append(fields[1])
        else:
            # create a new array in this slot
            queryRelevence[fields[0]] = [fields[1]]
    # Re-key the judgments by the actual query ids from query.text
    # (positional pairing — assumes both are in the same order).
    replacecounter = 0
    queryRelevenceUpdated = {}
    for k in queryRelevence:
        queryRelevenceUpdated['%0*d' % (3, int(
            validqueries[replacecounter]))] = queryRelevence.get(k)
        replacecounter = replacecounter + 1
    # relevent = list(queryRelevence.keys())
    # relevent = list(map(int, relevent))
    #samplespace = np.intersect1d(relevent, validqueries)
    list_of_random_items = random.sample(validqueries, numberofrandomqueries)
    tempcounter2 = 0
    booleanndcg = []  # per-query NDCG for boolean retrieval
    vectorndcg = []   # per-query NDCG for vector retrieval
    while tempcounter2 < numberofrandomqueries:
        list_of_random_items[tempcounter2] = '%0*d' % (
            3, int(list_of_random_items[tempcounter2]))
        print('query for which ndcg is calculated ' +
              str(list_of_random_items[tempcounter2]))
        y = str(list_of_random_items[tempcounter2])  # NOTE(review): unused
        # Top-10 vector-mode results for this query.
        vectorresult = query(indexfilename, '1', queryfilename,
                             str(list_of_random_items[tempcounter2]), 10)
        # vectorresult = ['573', '51', '944', '878', '12', '486', '875', '879', '746', '665']
        # print(vectorresult)
        # Replace each returned docID with 1 (relevant) / 0 (not).
        tempcounter = 0
        for z in vectorresult:
            if z in queryRelevenceUpdated[str(
                    list_of_random_items[tempcounter2])]:
                vectorresult[tempcounter] = 1
            else:
                vectorresult[tempcounter] = 0
            tempcounter = tempcounter + 1
        #print(vectorresult)
        # Ideal ranking = same relevance values sorted descending.
        idealvectorresult = vectorresult.copy()
        idealvectorresult.sort(reverse=True)
        #print(idealvectorresult)
        if sum(idealvectorresult) == 0:
            # no relevant docs retrieved -> NDCG defined as 0
            ndcgscore = 0
        else:
            ndcgscore = ndcg_score(idealvectorresult, vectorresult)
        # print(ndcgscore)
        vectorndcg.append(ndcgscore)
        tempcounter3 = 0
        # Same evaluation for boolean mode.
        booleanqueryresult = query(indexfilename, '0', queryfilename,
                                   str(list_of_random_items[tempcounter2]), 10)
        #booleanqueryresult = ['462','462','462','462','462','462','462','462','462']
        booleanquery = booleanqueryresult.copy()
        for g in booleanquery:
            if g in queryRelevenceUpdated[str(
                    list_of_random_items[tempcounter2])]:
                booleanquery[tempcounter3] = 1
            else:
                booleanquery[tempcounter3] = 0
            tempcounter3 = tempcounter3 + 1
        #print(booleanquery)
        # Pad the boolean relevance vector out to length 10.
        tempcounter4 = len(booleanquery)
        while tempcounter4 < 10:
            booleanquery.append(0)
            tempcounter4 = tempcounter4 + 1
        # Ideal boolean vector: one 1 per judged-relevant doc, capped at 10.
        idealbooleanresult = []
        for i in range(0, 10):
            if i < len(queryRelevenceUpdated[str(
                    list_of_random_items[tempcounter2])]):
                idealbooleanresult.append(1)
            else:
                idealbooleanresult.append(0)
        idealbooleanresult.sort(reverse=True)
        if sum(booleanquery) == 0:
            ndcgscoreboolean = 0
        else:
            # NOTE(review): argument order here (actual, ideal) is the
            # reverse of the vector call above (ideal, actual). One of the
            # two is presumably wrong — confirm against the ndcg_score
            # helper's signature (scikit-learn's takes y_true first).
            ndcgscoreboolean = ndcg_score(booleanquery, idealbooleanresult)
        booleanndcg.append(ndcgscoreboolean)
        tempcounter2 = tempcounter2 + 1
    # Paired significance test between the two NDCG samples.
    print('P value for all the queries processed is:')
    print(
        scipy.stats.wilcoxon(vectorndcg,
                             booleanndcg,
                             zero_method='wilcox',
                             correction=False))
    print('Done')