def query():
    ''' the main query processing program, using QueryProcessor'''
    # ToDo: the commandline usage: "echo query_string | python query.py index_file processing_algorithm"
    # processing_algorithm: 0 for booleanQuery and 1 for vectorQuery
    # for booleanQuery, the program will print the total number of documents and the list of document IDs
    # for vectorQuery, the program will output the top 3 most similar documents

    # Ensure args are valid
    if len(argv) != 5:
        print("Syntax: python query.py <index-file-path> <processing-algorithm> <query.txt path> <query-id>")
        return

    # Grab arguments
    index_file_loc = argv[1]
    processing_algo = argv[2]
    query_file_path = argv[3]
    query_id = argv[4]

    # Load the index file to restore the inverted index
    ii = InvertedIndex()
    ii.load(index_file_loc)

    # Get the document collection
    cf = CranFile("cran.all")

    # Get the query collection
    qc = loadCranQry(query_file_path)

    # Zero-pad the query ID to three digits (e.g. "7" -> "007")
    if 0 < int(query_id) < 10:
        query_id = '00' + str(int(query_id))
    elif 9 < int(query_id) < 100:
        query_id = '0' + str(int(query_id))

    # Get the query
    try:
        query = qc[query_id].text
    except KeyError:
        print("Invalid query id", query_id)
        return

    # Initialize a query processor
    qp = QueryProcessor(query, ii, cf)

    # Do query
    if int(processing_algo) == 0:
        result = qp.booleanQuery()
        if result:
            print("Results:", ", ".join(str(x) for x in result))
        else:
            print("Results: None")
    elif int(processing_algo) == 1:
        result = qp.vectorQuery(k=3)
        print("Results:")
        for r in result:
            print("Doc", r[0], "Score", r[1])
    else:
        print("Invalid processing algorithm",
              processing_algo + ". Use 0 (boolean) or 1 (vector).")
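# A minimal sketch (not part of the original script) showing how query() above
# might be wired up as a command-line entry point. It assumes the module-level
# names the function already relies on (argv from sys, InvertedIndex, CranFile,
# loadCranQry, QueryProcessor) are imported elsewhere in the file.
if __name__ == '__main__':
    # Example invocation (file names here are illustrative only):
    #   python query.py index_file.pickle 1 query.text 7
    query()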
def test_InvertedIndex():
    data = ("a,b,c,d,e,f\n"
            "g,h,i,j,k,l\n"
            "z,x\n"
            "z,x\n"
            "z,x,y\n"
            "z,x,y,i\n")
    index = InvertedIndex()
    index.load(data)
    assert (index.support({Item("a")}) == 1 / 6)
    assert (index.support({Item("b")}) == 1 / 6)
    assert (index.support({Item("c")}) == 1 / 6)
    assert (index.support({Item("d")}) == 1 / 6)
    assert (index.support({Item("e")}) == 1 / 6)
    assert (index.support({Item("f")}) == 1 / 6)
    assert (index.support({Item("h")}) == 1 / 6)
    assert (index.support({Item("i")}) == 2 / 6)
    assert (index.support({Item("j")}) == 1 / 6)
    assert (index.support({Item("k")}) == 1 / 6)
    assert (index.support({Item("l")}) == 1 / 6)
    assert (index.support({Item("z")}) == 4 / 6)
    assert (index.support({Item("x")}) == 4 / 6)
    assert (index.support({Item("y")}) == 2 / 6)
    sup_zx = index.support({Item("z"), Item("x")})
    assert (sup_zx == 4 / 6)
    sup_zxy = index.support({Item("z"), Item("x"), Item("y")})
    assert (sup_zxy == 2 / 6)
    sup_zxyi = index.support({Item("z"), Item("x"), Item("y"), Item("i")})
    assert (sup_zxyi == 1 / 6)
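# For context, support() as exercised above is the fraction of transactions
# that contain every item in the set. A minimal reference implementation
# (the project's InvertedIndex may compute it differently):
def support(transactions, itemset):
    # Count transactions that contain the whole itemset, divide by the total.
    hits = sum(1 for t in transactions if itemset.issubset(t))
    return hits / len(transactions)

transactions = [set("abcdef"), set("ghijkl"), {"z", "x"},
                {"z", "x"}, {"z", "x", "y"}, {"z", "x", "y", "i"}]
assert support(transactions, {"z", "x", "y"}) == 2 / 6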
def test_apriori():
    data = ("a,b,c,d,e,f\n"
            "g,h,i,j,k,l\n"
            "z,x\n"
            "z,x\n"
            "z,x,y\n"
            "z,x,y,i\n")
    expectedItemSets = {
        ItemSet("i"): 2 / 6,
        ItemSet("z"): 4 / 6,
        ItemSet("x"): 4 / 6,
        ItemSet("y"): 2 / 6,
        ItemSet("xz"): 4 / 6,
        ItemSet("yz"): 2 / 6,
        ItemSet("xy"): 2 / 6,
        ItemSet("xyz"): 2 / 6
    }

    index = InvertedIndex()
    index.load(data)
    itemsets = apriori(index, 2 / 6)
    assert (set(expectedItemSets.keys()) == set(itemsets))
    for itemset in itemsets:
        assert (expectedItemSets[itemset] == index.support(itemset))
    print("Itemsets={}".format([i for i in itemsets if len(i) > 1]))

    # (antecedent, consequent, confidence, lift, support)
    expectedRules = {
        (frozenset({Item("x"), Item("y")}), frozenset({Item("z")}), 1, 1.5, 1 / 3),
        (frozenset({Item("x")}), frozenset({Item("y")}), 0.5, 1.5, 1 / 3),
        (frozenset({Item("x")}), frozenset({Item("z"), Item("y")}), 0.5, 1.5, 1 / 3),
        (frozenset({Item("x")}), frozenset({Item("z")}), 1, 1.5, 2 / 3),
        (frozenset({Item("y")}), frozenset({Item("x")}), 1, 1.5, 1 / 3),
        (frozenset({Item("y")}), frozenset({Item("z"), Item("x")}), 1, 1.5, 1 / 3),
        (frozenset({Item("y")}), frozenset({Item("z")}), 1, 1.5, 1 / 3),
        (frozenset({Item("z"), Item("x")}), frozenset({Item("y")}), 0.5, 1.5, 1 / 3),
        (frozenset({Item("z"), Item("y")}), frozenset({Item("x")}), 1, 1.5, 1 / 3),
        (frozenset({Item("z")}), frozenset({Item("x"), Item("y")}), 0.5, 1.5, 1 / 3),
        (frozenset({Item("z")}), frozenset({Item("x")}), 1, 1.5, 2 / 3),
        (frozenset({Item("z")}), frozenset({Item("y")}), 0.5, 1.5, 1 / 3),
    }
    rules = set(generate_rules(itemsets, 0, 0, index))
    for (antecedent, consequent, confidence, lift, support) in rules:
        print("{}, {} conf={:.4f}, {:.4f}, {:.4f}".format(
            antecedent, consequent, confidence, lift, support))
    assert (rules == expectedRules)
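# For reference, the confidence and lift values asserted above follow directly
# from the supports in the six-transaction data set. A quick sanity check using
# the standard definitions (helper names here are illustrative only):
def confidence(sup_union, sup_antecedent):
    # conf(A -> B) = support(A ∪ B) / support(A)
    return sup_union / sup_antecedent

def lift(sup_union, sup_antecedent, sup_consequent):
    # lift(A -> B) = support(A ∪ B) / (support(A) * support(B))
    return sup_union / (sup_antecedent * sup_consequent)

# Rule {x} -> {z}: support({x,z}) = 4/6, support({x}) = 4/6, support({z}) = 4/6
assert confidence(4 / 6, 4 / 6) == 1.0
assert lift(4 / 6, 4 / 6, 4 / 6) == 1.5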
def booleanQuery(self):
    """ boolean query processing; note that a query like "A B C" is transformed
    to "A AND B AND C" for retrieving posting lists and merging them """
    ivObj = InvertedIndex()
    ivObj.load(self.filename)
    index_item = ivObj.items[self.tokens[0]]

    # Get the doc ids from the sorted postings in the same order.
    docs = index_item.get_sorted_doc_ids()
    for token in self.tokens:
        index_item = ivObj.items[token]
        # Intersect the current doc set with the postings for this token.
        docs = index_item.intersection(docs)
    return docs
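# Hedged sketch of the two IndexItem helpers that booleanQuery() above assumes
# but does not define here. The project's real class may look different; this
# just illustrates one plausible shape, where `posting` maps docID -> Posting.
class IndexItem:
    def __init__(self, term):
        self.term = term
        self.posting = {}  # docID -> Posting

    def get_sorted_doc_ids(self):
        # All documents containing this term, in ascending docID order.
        return sorted(self.posting.keys())

    def intersection(self, docs):
        # Keep only the candidate docIDs that also contain this term,
        # preserving the order of the incoming list.
        return [doc_id for doc_id in docs if doc_id in self.posting]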
def query(indexfilename, processingalgorithm, queryfilename, queryid, numresults=3):
    ''' the main query processing program, using QueryProcessor'''
    # ToDo: the commandline usage: "echo query_string | python query.py index_file processing_algorithm"
    # processing_algorithm: 0 for booleanQuery and 1 for vectorQuery
    # for booleanQuery, the program will print the total number of documents and the list of document IDs
    # for vectorQuery, the program will output the top 3 most similar documents
    qrys = loadCranQry(queryfilename)
    # for q in qrys:
    #     print(q, qrys[q].text)
    loadiindex = InvertedIndex()
    loadiindex = loadiindex.load(indexfilename)
    # print("index loaded")
    cf = CranFile('cran.all')
    queryProcessor = QueryProcessor(qrys, loadiindex, cf.docs, numresults)
    results = None
    if processingalgorithm == '0':
        queryProcessor.preprocessing()
        queryProcessor.queryId = queryid
        results = queryProcessor.booleanQuery()
    elif processingalgorithm == '1':
        queryProcessor.queryId = queryid
        results = queryProcessor.vectorQuery(queryProcessor.numofresults)
    return results
def vectorQuery(self, k):
    """ vector query processing, using the cosine similarity. """
    # ToDo: return top k pairs of (docID, similarity), ranked by their cosine
    # similarity with the query in descending order
    # You can use term frequency or TFIDF to construct the vectors
    result = {}
    ivObj = InvertedIndex()
    ivObj.load(self.filename)  # load the InvertedIndex
    doc_set = set()
    term_idf_list = []
    for term in self.tokens:
        # For every query term present in the index, collect the document IDs
        # containing it and the term's tf-idf weight in the query vector.
        if term in self.index:
            doc_set = doc_set.union(set(self.index[term].posting.keys()))
            term_idf_list.append(ivObj.idf(term) * 1.0 / len(self.tokens))

    doc_list = list(doc_set)
    for docID in doc_list:
        # Calculate tf-idf weights for each candidate document. Only terms that
        # are in the index are included, so the document vector stays aligned
        # with the query vector built above.
        for term in self.tokens:
            if term in self.index:
                if docID in result.keys():
                    result[docID].append(ivObj.tfidf(term, docID))
                else:
                    result[docID] = [ivObj.tfidf(term, docID)]

    score_dict = {}
    term_idf_list_np = np.array(self.unitVector(term_idf_list))  # unit vector for the query
    for docID in doc_list:
        unit_result = self.unitVector(result[docID])  # unit vector for each document
        unit_np = np.array(unit_result)
        score_dict[docID] = np.dot(term_idf_list_np, unit_np)  # dot product of query and document

    score_list = score_dict.items()
    final = sorted(score_list, key=itemgetter(1), reverse=True)
    similarity = final[:k]  # guard against having fewer than k candidate documents
    return similarity  # list of (docID, cosine similarity) in order of ranking
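# Hedged sketch of the unitVector() helper that vectorQuery() above calls but
# does not define here; it is intended to sit on the same class. It normalizes
# a weight vector to unit length so that the dot product of two unit vectors
# equals their cosine similarity.
def unitVector(self, weights):
    vec = np.array(weights, dtype=float)
    norm = np.linalg.norm(vec)
    if norm == 0.0:
        return vec  # avoid division by zero for all-zero vectors
    return vec / norm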
def setup_ranker():
    global ranker
    text_processor = TextProcessor()
    docs = []
    index = InvertedIndex.load(INDEX_FOLDER, "inverted_index")
    articles = select(article.id for article in Article)
    for article_id in articles:
        article = Article[article_id]
        docs.append(
            AbstractAndArticle(article, _read_file(article.processed_abstract_path)))
    ranker = TfIdf(index, text_processor, docs, VECTORS_PER_FILE, VECTORS_SAVE_FOLDER)
def build_index():
    if not os.path.exists(INDEX_FOLDER):
        os.mkdir(INDEX_FOLDER)
    index = InvertedIndex.load(INDEX_FOLDER, InvertedIndex.NAME)
    if index:
        logging.debug("Index is successfully loaded")
        return
    logging.debug("Building index...")
    articles = select(article.id for article in Article)[:]
    index = InvertedIndex()
    IndexBuilder(processes=1).build(index, articles)
    logging.debug("Saving index...")
    index.save(INDEX_FOLDER)
def query(index_file, algorithm, query_file, query_id):
    ''' the main query processing program, using QueryProcessor'''
    # ToDo: the commandline usage: "echo query_string | python query.py index_file processing_algorithm"
    # processing_algorithm: 0 for booleanQuery and 1 for vectorQuery
    # for booleanQuery, the program will print the total number of documents and the list of document IDs
    # for vectorQuery, the program will output the top 3 most similar documents
    query_file = cranqry.loadCranQry(query_file)  # load the query file
    index_items = InvertedIndex()
    index_items = index_items.load(index_file)
    cran_file = cran.CranFile('cran.all')
    query_verify = QueryProcessor(query_file, index_items, cran_file.docs)
    query_verify.preprocessing()
    results = None
    if algorithm == '0':    # algorithm 0 selects the boolean model
        results = query_verify.booleanQuery(query_id)
    elif algorithm == '1':  # algorithm 1 selects the vector model
        results = query_verify.vectorQuery(3, query_id)
    print(results)
def run_rank():
    text_processor = TextProcessor()
    docs = []
    index = InvertedIndex.load(INDEX_FOLDER, "inverted_index")
    articles = select(article.id for article in Article)
    for article_id in articles:
        article = Article[article_id]
        docs.append(AbstractAndArticle(article,
                                       _read_file(article.processed_abstract_path)))
    ranker = TfIdf(index, text_processor, docs,
                   vectors_per_file=VECTORS_PER_FILE,
                   vectors_save_folder=VECTORS_SAVE_FOLDER)
    while True:
        query = input("Enter query: ")
        top_ids = ranker.rank(query, 5)
        for article_id in top_ids:
            article = Article[article_id]
            print(article.title, article.document.url)
def eval():
    # Algorithm:
    #   Pick N random samples from query.txt
    #   Get top 10 results from bool query for each rnd query
    #   Get top 10 results from vector query for each rnd query
    #   Compute NDCG btn bool query results and qrels.txt
    #   Compute NDCG btn vector query results and qrels.txt
    #   Get p-value btn bool and vector

    # Get the query collection
    qc = loadCranQry(query_path)
    poss_queries = list(qc)

    # Load up the inverted index
    ii = InvertedIndex()
    ii.load(index_file)

    # Load up the document collection
    cf = CranFile("cran.all")

    # Get ground-truth results from qrels.txt
    with open(qrels_path) as f:
        qrels = f.readlines()

    # Index qrels into a dict
    qrel_dict = {}
    for qrel in qrels:
        qrel_split = qrel.split()
        if int(qrel_split[0]) in qrel_dict:
            qrel_dict[int(qrel_split[0])].append(int(qrel_split[1]))
        else:
            qrel_dict[int(qrel_split[0])] = [int(qrel_split[1])]

    # Run over N random queries, collecting NDCGs
    bool_ndcgs = []
    vector_ndcgs = []
    for _ in range(n):
        # Get random query ID
        query_id = choice(poss_queries)

        # Zero-pad the query ID to three digits
        if 0 < int(query_id) < 10:
            query_id = '00' + str(int(query_id))
        elif 9 < int(query_id) < 100:
            query_id = '0' + str(int(query_id))

        # Get the query
        try:
            query = qc[query_id].text
        except KeyError:
            print("Invalid query id", query_id)
            return

        # Initialize the query processor
        qp = QueryProcessor(query, ii, cf)

        # Run bool query
        bool_result = qp.booleanQuery()[:10]

        # Run vector query
        vector_result = qp.vectorQuery(10)

        # Pull top 10 ground-truth results from qrels dict
        gt_results = qrel_dict[poss_queries.index(query_id) + 1][:10]

        # Compute NDCG for bool query
        # NOTE: There is no weighting on the bool query, so give all an even 1
        truth_vector = list(map(lambda x: x in gt_results, bool_result))
        bool_ndcg = ndcg_score(truth_vector, [1] * len(truth_vector),
                               k=len(truth_vector))

        # Compute NDCG for vector query
        vector_docs = []
        vector_scores = []
        for v in vector_result:
            vector_docs.append(v[0])
            vector_scores.append(v[1])
        truth_vector = list(map(lambda x: x in gt_results, vector_docs))
        vector_ndcg = ndcg_score(truth_vector, vector_scores,
                                 k=len(truth_vector))

        # Accumulate NDCGs
        bool_ndcgs.append(bool_ndcg)
        vector_ndcgs.append(vector_ndcg)

    # Average out score lists (avoids shadowing the built-in name "bool")
    bool_avg = sum(bool_ndcgs) / len(bool_ndcgs)
    vector_avg = sum(vector_ndcgs) / len(vector_ndcgs)

    # Present averages and p-values
    print("Boolean NDCG average:", bool_avg)
    print("Vector NDCG average:", vector_avg)
    if n > 19:
        print("Wilcoxon p-value:", wilcoxon(bool_ndcgs, vector_ndcgs).pvalue)
    else:
        print("Wilcoxon p-value: Sample size too small to be significant")
    print("T-Test p-value:", ttest_ind(bool_ndcgs, vector_ndcgs).pvalue)
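# Hedged sketch of an NDCG helper with the signature eval() above expects
# (relevance labels, predicted scores, cut-off k). The project's real
# ndcg_score may differ; this follows the standard DCG / ideal-DCG definition.
from math import log2

def ndcg_score(relevance, scores, k=10):
    # Order the relevance labels by descending predicted score.
    ranked = [rel for _, rel in sorted(zip(scores, relevance),
                                       key=lambda pair: pair[0], reverse=True)]
    dcg = sum(rel / log2(i + 2) for i, rel in enumerate(ranked[:k]))
    ideal = sum(rel / log2(i + 2)
                for i, rel in enumerate(sorted(relevance, reverse=True)[:k]))
    return dcg / ideal if ideal > 0 else 0.0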
def main():
    #########
    # SETUP #
    #########

    # Get input args
    newsgroups_root_dir = argv[1]
    feat_def_path = argv[2]
    class_def_path = argv[3]
    training_data_path = argv[4]

    # Generate index
    # index_newsgroups(newsgroups_root_dir, "idx_save.pkl")
    ii = InvertedIndex()
    ii.load("idx_save.pkl")

    # Write out feature/term pairs to feat_def_path
    feature_id = 0
    with open(feat_def_path, 'w') as outf:
        for item in ii.items:
            outf.write(str(feature_id) + " " + str(item) + "\n")
            feature_id += 1

    # Read back in the feature/term pairs for later
    with open(feat_def_path, 'r') as inf:
        ft_pairs = inf.readlines()

    # Put the ft_pairs into a dictionary for quick lookup
    ft_dict = {}
    for pair in ft_pairs:
        ft_dict[pair.split()[1].strip()] = pair.split()[0]

    # Map the different newsgroups to a given class
    # This is fairly manual...
    with open(class_def_path, 'w') as outf:
        for dir in listdir(newsgroups_root_dir):
            outf.write(class_def_helper(dir) + " " + dir + "\n")

    ############################
    # TRAINING DATA GENERATION #
    ############################

    # Create the training data
    # For each document:
    #   Find its containing folder, and extract class from class def
    #   For each term in document
    #     Compute tfidf, tf or idf
    current_file_id = 1
    with open(training_data_path + ".TFIDF", 'w') as outf:
        # Compute tf-idf
        # Go through each document in newsgroups dir
        for root, _, files in walk(newsgroups_root_dir):
            # Find and write out the class label
            local_dir = root.split(sep)[-1]

            # For each file...
            for file in files:
                outf.write(class_def_helper(local_dir) + " ")
                print(root, file)

                # Get the words from the doc
                stemmed_token_list = preprocess_doc(root + sep + file)

                # Put all the info into a set (for uniqueness)
                data_set = set()

                # Now that we've re-done all that, find idfs
                for word in stemmed_token_list:
                    # Skip blank stopwords
                    if word == "":
                        continue

                    # Get the term ID
                    # outf.write(ft_dict[word] + ":")

                    # Calculate and write out TF-IDF
                    # Note current_file_id is our doc_id
                    tf = ii.find(word).posting[current_file_id].term_freq()
                    idf = ii.idf(word)
                    # outf.write(str(log10(1 + tf) * idf) + " ")
                    data_set.add(ft_dict[word] + ":" + str(log10(1 + tf) * idf))

                # Write newline to signify end of file
                # outf.write("\n")
                outf.write(" ".join(
                    sorted(data_set, key=lambda x: int(x.split(':')[0]))) + "\n")
                outf.flush()

                # Increment our current doc
                current_file_id += 1

    current_file_id = 1
    with open(training_data_path + ".TF", 'w') as outf:
        # Compute tf
        # Go through each document in newsgroups dir
        for root, _, files in walk(newsgroups_root_dir):
            # Find and write out the class label
            local_dir = root.split(sep)[-1]

            # For each file...
            for file in files:
                outf.write(class_def_helper(local_dir) + " ")
                print(root, file)

                # Get the words from the doc
                stemmed_token_list = preprocess_doc(root + sep + file)

                # Put all the info into a set (for uniqueness)
                data_set = set()

                # Now that we've re-done all that, find idfs
                for word in stemmed_token_list:
                    # Skip blank stopwords
                    if word == "":
                        continue

                    # Get the term ID
                    # outf.write(ft_dict[word] + ":")

                    # Write the TF
                    # Note current_file_id is our doc_id
                    # outf.write(str(ii.find(word).posting[
                    #     current_file_id].term_freq()) + " ")
                    data_set.add(ft_dict[word] + ":" + str(
                        ii.find(word).posting[current_file_id].term_freq()))

                # Write newline to signify end of file
                # outf.write("\n")
                outf.write(" ".join(
                    sorted(data_set, key=lambda x: int(x.split(':')[0]))) + "\n")
                # outf.flush()

                # Increment our current doc
                current_file_id += 1

    current_file_id = 1
    with open(training_data_path + ".IDF", 'w') as outf:
        # Compute idf
        # Go through each document in newsgroups dir
        for root, _, files in walk(newsgroups_root_dir):
            # Find and write out the class label
            local_dir = root.split(sep)[-1]

            # For each file...
            for file in files:
                outf.write(class_def_helper(local_dir) + " ")
                print(root, file)

                # Get the words from the doc
                stemmed_token_list = preprocess_doc(root + sep + file)

                # Put all the info into a set (for uniqueness)
                data_set = set()

                # Now that we've re-done all that, find idfs
                for word in stemmed_token_list:
                    # Skip blank stopwords
                    if word == "":
                        continue

                    # Get the term ID
                    # outf.write(ft_dict[word] + ":" + str(ii.idf(word))
                    #            + " ")
                    data_set.add(ft_dict[word] + ":" + str(ii.idf(word)))

                # Write newline to signify end of file
                outf.write(" ".join(
                    sorted(data_set, key=lambda x: int(x.split(':')[0]))) + "\n")
def test_apriori():
    data = ("a,b,c,d,e,f\n"
            "g,h,i,j,k,l\n"
            "z,x\n"
            "z,x\n"
            "z,x,y\n"
            "z,x,y,i\n")
    expectedItemSets = {ItemSet("i"): 2 / 6,
                        ItemSet("z"): 4 / 6,
                        ItemSet("x"): 4 / 6,
                        ItemSet("y"): 2 / 6,
                        ItemSet("xz"): 4 / 6,
                        ItemSet("yz"): 2 / 6,
                        ItemSet("xy"): 2 / 6,
                        ItemSet("xyz"): 2 / 6}

    index = InvertedIndex()
    index.load(data)
    itemsets = apriori(index, 2 / 6)
    assert(len(itemsets) == len(expectedItemSets))
    for itemset in itemsets:
        assert(frozenset(itemset) in expectedItemSets)
    for itemset in itemsets:
        assert(expectedItemSets[frozenset(itemset)] == index.support(itemset))
    print("Itemsets={}".format([i for i in itemsets if len(i) > 1]))

    def itemize(a):
        return list(map(item_id, a))

    # (antecedent, consequent, confidence, lift, support)
    rx = [
        (['y'], ['x'], 1.0, 1.5, 0.3333333333333333),
        (['x'], ['y'], 0.5, 1.5, 0.3333333333333333),
        (['y'], ['z'], 1.0, 1.5, 0.3333333333333333),
        (['z'], ['y'], 0.5, 1.5, 0.3333333333333333),
        (['x'], ['z'], 1.0, 1.5, 0.6666666666666666),
        (['z'], ['x'], 1.0, 1.5, 0.6666666666666666),
        (['x', 'y'], ['z'], 1.0, 1.5, 0.3333333333333333),
        (['z', 'y'], ['x'], 1.0, 1.5, 0.3333333333333333),
        (['z', 'x'], ['y'], 0.5, 1.5, 0.3333333333333333),
        (['y'], ['z', 'x'], 1.0, 1.5, 0.3333333333333333),
        (['x'], ['z', 'y'], 0.5, 1.5, 0.3333333333333333),
        (['z'], ['x', 'y'], 0.5, 1.5, 0.3333333333333333)
    ]
    expectedRules = list(map(lambda a: (itemize(a[0]), itemize(a[1]),
                                        a[2], a[3], a[4]), rx))

    itemset_counts = dict(map(lambda i: (tuple(i), index.count(i)), itemsets))
    rules = generate_rules(
        itemsets, itemset_counts, index.num_transactions, 0, 0)

    def deitemize(a):
        return list(map(item_str, a))

    p = list(map(lambda a: (deitemize(a[0]), deitemize(a[1]),
                            a[2], a[3], a[4]), rules))
    print("rules")
    print(p)

    for (antecedent, consequent, confidence, lift, support) in rules:
        print("{}, {} conf={:.4f}, {:.4f}, {:.4f}".
              format(antecedent, consequent, confidence, lift, support))
    assert(len(rules) == len(expectedRules))
    for i in range(len(rules)):
        assert(expectedRules[i] in rules)
def eval(indexfilename, queryfilename, queryrefilename, numberofrandomqueries):
    # ToDo
    actual = []
    if numberofrandomqueries > 225:
        raise Exception('please enter query count less than or equal to 225')
    qrys = loadCranQry("query.text")
    validqueries = []
    querycounter = 0
    for q in qrys:
        validqueries.append(int(q))

    loadiindex = InvertedIndex()
    loadiindex = loadiindex.load("index_file.pickle")
    # print("index loaded")
    cf = CranFile('cran.all')
    # QueryProcessor.numberofresult = 10
    # qp = QueryProcessor(qrys, loadiindex, cf.docs, 10)
    queryRelevence = dict()
    for line in open(queryrefilename):
        fields = line.split(" ")
        fields[0] = '%0*d' % (3, int(fields[0]))
        if fields[0] in queryRelevence:
            # and let's extract the data:
            queryRelevence[fields[0]].append(fields[1])
        else:
            # create a new array in this slot
            queryRelevence[fields[0]] = [fields[1]]
    replacecounter = 0
    queryRelevenceUpdated = {}
    for k in queryRelevence:
        queryRelevenceUpdated['%0*d' % (3, int(
            validqueries[replacecounter]))] = queryRelevence.get(k)
        replacecounter = replacecounter + 1
    # relevent = list(queryRelevence.keys())
    # relevent = list(map(int, relevent))
    # samplespace = np.intersect1d(relevent, validqueries)
    list_of_random_items = random.sample(validqueries, numberofrandomqueries)
    tempcounter2 = 0
    booleanndcg = []
    vectorndcg = []
    while tempcounter2 < numberofrandomqueries:
        list_of_random_items[tempcounter2] = '%0*d' % (
            3, int(list_of_random_items[tempcounter2]))
        print('query for which ndcg is calculated ' +
              str(list_of_random_items[tempcounter2]))
        y = str(list_of_random_items[tempcounter2])
        vectorresult = query(indexfilename, '1', queryfilename,
                             str(list_of_random_items[tempcounter2]), 10)
        # vectorresult = ['573', '51', '944', '878', '12', '486', '875', '879', '746', '665']
        # print(vectorresult)
        tempcounter = 0
        for z in vectorresult:
            if z in queryRelevenceUpdated[str(list_of_random_items[tempcounter2])]:
                vectorresult[tempcounter] = 1
            else:
                vectorresult[tempcounter] = 0
            tempcounter = tempcounter + 1
        # print(vectorresult)
        idealvectorresult = vectorresult.copy()
        idealvectorresult.sort(reverse=True)
        # print(idealvectorresult)
        if sum(idealvectorresult) == 0:
            ndcgscore = 0
        else:
            ndcgscore = ndcg_score(idealvectorresult, vectorresult)
        # print(ndcgscore)
        vectorndcg.append(ndcgscore)

        tempcounter3 = 0
        booleanqueryresult = query(indexfilename, '0', queryfilename,
                                   str(list_of_random_items[tempcounter2]), 10)
        # booleanqueryresult = ['462', '462', '462', '462', '462', '462', '462', '462', '462']
        booleanquery = booleanqueryresult.copy()
        for g in booleanquery:
            if g in queryRelevenceUpdated[str(list_of_random_items[tempcounter2])]:
                booleanquery[tempcounter3] = 1
            else:
                booleanquery[tempcounter3] = 0
            tempcounter3 = tempcounter3 + 1
        # print(booleanquery)
        tempcounter4 = len(booleanquery)
        while tempcounter4 < 10:
            booleanquery.append(0)
            tempcounter4 = tempcounter4 + 1
        idealbooleanresult = []
        for i in range(0, 10):
            if i < len(queryRelevenceUpdated[str(list_of_random_items[tempcounter2])]):
                idealbooleanresult.append(1)
            else:
                idealbooleanresult.append(0)
        idealbooleanresult.sort(reverse=True)
        if sum(booleanquery) == 0:
            ndcgscoreboolean = 0
        else:
            ndcgscoreboolean = ndcg_score(booleanquery, idealbooleanresult)
        booleanndcg.append(ndcgscoreboolean)
        tempcounter2 = tempcounter2 + 1
    print('P value for all the queries processed is:')
    print(scipy.stats.wilcoxon(vectorndcg, booleanndcg,
                               zero_method='wilcox', correction=False))
    print('Done')
def test(index_loc, cran_loc, qrels_loc):
    ''' test your code thoroughly. put the testing cases here'''

    ##### SETUP ITEMS #####
    # Grab index file to restore II
    ii = InvertedIndex()
    ii.load(index_loc)

    # Get the document collection
    cf = CranFile(cran_loc)

    # Get ground-truth results from qrels.txt
    with open(qrels_loc) as f:
        qrels = f.readlines()

    # Index qrels into a dict
    qrel_dict = {}
    for qrel in qrels:
        qrel_split = qrel.split()
        if int(qrel_split[0]) in qrel_dict:
            qrel_dict[int(qrel_split[0])].append(int(qrel_split[1]))
        else:
            qrel_dict[int(qrel_split[0])] = [int(qrel_split[1])]

    ##### INITIAL TEST ITEMS #####
    print("TESTS BASED ON SUGGESTED TESTING POINTS")

    # Ensure tf is correct
    # Find a random word and check TF value against what is manually done
    posting_list = ii.find("experiment").posting
    tf_vector = []
    for posting in posting_list:
        tf_vector.append(len(posting_list[posting].positions)
                         == posting_list[posting].term_freq())
    print("TF is computed correctly:", all(tf_vector))

    # Ensure idf is correct
    print("IDF is computed correctly:",
          log10(ii.nDocs / len(posting_list)) == ii.idf("experiment"))

    # As both tf and idf are correct, and tf-idf is a product of the two,
    # it is reasonable to assume tf-idf is computed correctly

    ##### BOOL QUERY TESTS #####
    # Here, I use very specific boolean queries to ensure that a
    # limited number of documents are returned
    print("\nBOOL QUERY TESTS")

    # Ensure that the exact title of doc 8 matches for doc 8
    doc8 = "measurements of the effect of two-dimensional and three-dimensional roughness elements on boundary layer transition"
    qp1 = QueryProcessor(doc8, ii, cf)
    print("Bool query matches on exact title:", qp1.booleanQuery() == [8])

    # Ensure that bool query matches very specific AND query
    qp2 = QueryProcessor("hugoniot and infinitesimally", ii, cf)
    print("Bool query matches on specific AND query ('hugoniot and infinitesimally'):",
          qp2.booleanQuery() == [329])

    # Test that an OR query is handled properly
    # Both gravel and stagnation have completely distinct postings lists.
    # OR should merge them.
    gravel_postings = ii.find("gravel").sorted_postings[:]
    stag_postings = ii.find("stagnat").sorted_postings[:]
    gravel_postings.extend(stag_postings)
    qp3 = QueryProcessor("gravel or stagnation", ii, cf)
    print("Bool query successfully handles OR ('gravel or stagnation'):",
          qp3.booleanQuery() == sorted(gravel_postings))

    # Test that NOT is handled properly
    # The posting list for "diameter" is a subset of "slipstream" postings
    # (oddly enough). To test this works, do "slipstream and not diameter"
    # and we should get slipstream's postings minus those of diameter.
    slip_postings = ii.find("slipstream").sorted_postings[:]
    diam_postings = ii.find("diamet").sorted_postings[:]
    slip_not_diam = [t for t in slip_postings if t not in diam_postings]
    print("Bool query successfully handles NOT ('slipstream and not diameter'):",
          QueryProcessor("slipstream and not diameter", ii, cf).booleanQuery()
          == slip_not_diam)

    # Ensure AND/OR order doesn't matter
    print("Bool query can handle query regardless of AND order ('a and b' = 'b and a'):",
          QueryProcessor("slipstream and diameter", ii, cf).booleanQuery()
          == QueryProcessor("diameter and slipstream", ii, cf).booleanQuery())
    print("Bool query can handle query regardless of OR order ('a or b' = 'b or a'):",
          QueryProcessor("slipstream or diameter", ii, cf).booleanQuery()
          == QueryProcessor("diameter or slipstream", ii, cf).booleanQuery())

    # Ensure that the presence of parens does not change query results
    print("Bool query can handle query regardless of parens ('slipstream and diameter'):",
          QueryProcessor("slipstream and diameter", ii, cf).booleanQuery()
          == QueryProcessor("(slipstream and diameter)", ii, cf).booleanQuery())

    # Ensure parentheses do not change order of processing for AND-AND and OR-OR queries
    print("Bool query AND is associative ('(a and b) and c' = 'a and (b and c)'):",
          QueryProcessor("(slipstream and diameter) and thrust", ii, cf).booleanQuery()
          == QueryProcessor("slipstream and (diameter and thrust)", ii, cf).booleanQuery())
    print("Bool query OR is associative ('(a or b) or c' = 'a or (b or c)'):",
          QueryProcessor("(slipstream or diameter) or thrust", ii, cf).booleanQuery()
          == QueryProcessor("slipstream or (diameter or thrust)", ii, cf).booleanQuery())

    # Ensure parentheses properly group items
    # Tested by doing the query "manually" by adding/orring the correct terms
    part_one = QueryProcessor("conduction and cylinder and gas", ii, cf).booleanQuery()
    part_two = QueryProcessor("radiation and gas", ii, cf).booleanQuery()
    part_one.extend(part_two)
    expected_result = QueryProcessor("hugoniot", ii, cf).booleanQuery()
    expected_result.extend(part_one)
    print("Bool query parens successfully group conflicting operators:",
          QueryProcessor("(conduction and cylinder and gas) or (radiation and gas) or hugoniot",
                         ii, cf).booleanQuery()
          == sorted(list(set(expected_result))))

    ##### VECTOR QUERY TESTS #####
    # For this, just ensure that most of the results are in the expected list
    print("\nVECTOR QUERY TESTS")

    # Ensure vector query can match on exact title
    print("Vector query matches on exact title:", qp1.vectorQuery(1)[0][0] == 8)

    # Try a few example queries from query.text
    # As long as one-fifth of the top 10 are in gt_result, call it a pass
    # Note that queries with larger answer sets were chosen to
    # ensure there were enough to get to one-fifth of ten
    qc = loadCranQry("query.text")
    poss_queries = list(qc)

    # Query 001
    result = QueryProcessor(qc["001"].text, ii, cf).vectorQuery(10)
    gt_result = qrel_dict[poss_queries.index("001") + 1]
    correct_vector = list(map(lambda x: x in gt_result, [x[0] for x in result]))
    print("Vector query is at least one-fifth correct for query 001:",
          sum(correct_vector) > 2)

    # Query 128
    result = QueryProcessor(qc["128"].text, ii, cf).vectorQuery(10)
    gt_result = qrel_dict[poss_queries.index("128") + 1]
    correct_vector = list(map(lambda x: x in gt_result, [x[0] for x in result]))
    print("Vector query is at least one-fifth correct for query 128:",
          sum(correct_vector) > 2)

    # Query 226
    result = QueryProcessor(qc["226"].text, ii, cf).vectorQuery(10)
    gt_result = qrel_dict[poss_queries.index("226") + 1]
    correct_vector = list(map(lambda x: x in gt_result, [x[0] for x in result]))
    print("Vector query is at least one-fifth correct for query 226:",
          sum(correct_vector) > 2)

    # Query 196
    result = QueryProcessor(qc["196"].text, ii, cf).vectorQuery(10)
    gt_result = qrel_dict[poss_queries.index("196") + 1]
    correct_vector = list(map(lambda x: x in gt_result, [x[0] for x in result]))
    print("Vector query is at least one-fifth correct for query 196:",
          sum(correct_vector) > 2)

    # Query 291
    result = QueryProcessor(qc["291"].text, ii, cf).vectorQuery(10)
    gt_result = qrel_dict[poss_queries.index("291") + 1]
    correct_vector = list(map(lambda x: x in gt_result, [x[0] for x in result]))
    print("Vector query is at least one-fifth correct for query 291:",
          sum(correct_vector) > 2)