def query():
    '''the main query processing program, using QueryProcessor'''
    # Command-line usage: "echo query_string | python query.py index_file processing_algorithm"
    # processing_algorithm: 0 for booleanQuery and 1 for vectorQuery
    # for booleanQuery, the program prints the total number of documents and the list of document IDs
    # for vectorQuery, the program outputs the top 3 most similar documents

    # Ensure args are valid
    if len(argv) != 5:
        print("Syntax: python query.py <index-file-path> <processing-algorithm> <query.txt path> <query-id>")
        return

    # Grab arguments
    index_file_loc = argv[1]
    processing_algo = argv[2]
    query_file_path = argv[3]
    query_id = argv[4]

    # Grab index file to restore II
    ii = InvertedIndex()
    ii.load(index_file_loc)

    # Get the document collection
    cf = CranFile("cran.all")

    # Get the query collection
    qc = loadCranQry(query_file_path)

    # Zero-pad the query ID to three digits
    if 0 < int(query_id) < 10:
        query_id = '00' + str(int(query_id))
    elif 9 < int(query_id) < 100:
        query_id = '0' + str(int(query_id))
    try:
        query = qc[query_id].text
    except KeyError:
        print("Invalid query id", query_id)
        return

    # Initialize a query processor
    qp = QueryProcessor(query, ii, cf)

    # Do query
    if int(processing_algo) == 0:
        result = qp.booleanQuery()
        if result:
            print("Results:", ", ".join(str(x) for x in result))
        else:
            print("Results: None")
    elif int(processing_algo) == 1:
        result = qp.vectorQuery(k=3)
        print("Results:")
        for r in result:
            print("Doc", r[0], "Score", r[1])
    else:
        print("Invalid processing algorithm", processing_algo + ". Use 0 (boolean) or 1 (vector).")
def query(indexfilename, processingalgorithm, queryfilename, queryid, numresults=3):
    '''the main query processing program, using QueryProcessor'''
    # Command-line usage: "echo query_string | python query.py index_file processing_algorithm"
    # processing_algorithm: 0 for booleanQuery and 1 for vectorQuery
    # for booleanQuery, the program prints the total number of documents and the list of document IDs
    # for vectorQuery, the program outputs the top 3 most similar documents
    qrys = loadCranQry(queryfilename)
    # for q in qrys:
    #     print(q, qrys[q].text)
    loadiindex = InvertedIndex()
    loadiindex = loadiindex.load(indexfilename)
    # print("index loaded")
    cf = CranFile('cran.all')
    queryProcessor = QueryProcessor(qrys, loadiindex, cf.docs, numresults)
    results = None
    if processingalgorithm == '0':
        queryProcessor.preprocessing()
        queryProcessor.queryId = queryid
        results = queryProcessor.booleanQuery()
    if processingalgorithm == '1':
        queryProcessor.queryId = queryid
        results = queryProcessor.vectorQuery(queryProcessor.numofresults)
    return results
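# A minimal usage sketch for the query() entry point above. The index file name,
# query file name, and query ID below are hypothetical placeholders; the exact
# artifact names and the query-ID format depend on how the index was built and
# on the QueryProcessor implementation.
if __name__ == '__main__':
    # Boolean model (algorithm '0'), assuming "index_file" and "query.text"
    # exist in the working directory.
    boolean_hits = query("index_file", '0', "query.text", "001")
    print("Boolean results:", boolean_hits)

    # Vector model (algorithm '1'), asking for the top 3 documents.
    vector_hits = query("index_file", '1', "query.text", "001", numresults=3)
    print("Vector results:", vector_hits)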
def crawl(self):
    """
    Performs a crawl process on each pending page.

    Args:
        None
    Returns:
        None
    """
    index = InvertedIndex()
    # Iterate over a copy so that removing visited URLs does not skip entries.
    for url in list(self.urls):
        if url not in self.crawled_urls:
            # build page
            response = requests.get(f"{self.url}{url}")
            page_object = BeautifulSoup(response.content, "html.parser")

            # begin crawling
            print(f"Crawling: {url}")
            self.links(page_object)
            page = f"{self.url}{url}"
            words = self.words(page_object)
            index.create_index(page, words)

            # mark page as visited
            self.crawled_urls.append(url)
            self.urls.remove(url)
            time.sleep(5)
def test_apriori():
    data = ("a,b,c,d,e,f\n"
            "g,h,i,j,k,l\n"
            "z,x\n"
            "z,x\n"
            "z,x,y\n"
            "z,x,y,i\n")
    expectedItemSets = {ItemSet("i"): 2 / 6,
                        ItemSet("z"): 4 / 6,
                        ItemSet("x"): 4 / 6,
                        ItemSet("y"): 2 / 6,
                        ItemSet("xz"): 4 / 6,
                        ItemSet("yz"): 2 / 6,
                        ItemSet("xy"): 2 / 6,
                        ItemSet("xyz"): 2 / 6}

    index = InvertedIndex()
    index.load(data)
    itemsets = apriori(index, 2 / 6)
    assert set(expectedItemSets.keys()) == set(itemsets)
    for itemset in itemsets:
        assert expectedItemSets[itemset] == index.support(itemset)
    print("Itemsets={}".format([i for i in itemsets if len(i) > 1]))

    # (antecedent, consequent, confidence, lift, support)
    expectedRules = {
        (frozenset({Item("x"), Item("y")}), frozenset({Item("z")}), 1, 1.5, 1 / 3),
        (frozenset({Item("x")}), frozenset({Item("y")}), 0.5, 1.5, 1 / 3),
        (frozenset({Item("x")}), frozenset({Item("z"), Item("y")}), 0.5, 1.5, 1 / 3),
        (frozenset({Item("x")}), frozenset({Item("z")}), 1, 1.5, 2 / 3),
        (frozenset({Item("y")}), frozenset({Item("x")}), 1, 1.5, 1 / 3),
        (frozenset({Item("y")}), frozenset({Item("z"), Item("x")}), 1, 1.5, 1 / 3),
        (frozenset({Item("y")}), frozenset({Item("z")}), 1, 1.5, 1 / 3),
        (frozenset({Item("z"), Item("x")}), frozenset({Item("y")}), 0.5, 1.5, 1 / 3),
        (frozenset({Item("z"), Item("y")}), frozenset({Item("x")}), 1, 1.5, 1 / 3),
        (frozenset({Item("z")}), frozenset({Item("x"), Item("y")}), 0.5, 1.5, 1 / 3),
        (frozenset({Item("z")}), frozenset({Item("x")}), 1, 1.5, 2 / 3),
        (frozenset({Item("z")}), frozenset({Item("y")}), 0.5, 1.5, 1 / 3),
    }

    rules = set(generate_rules(itemsets, 0, 0, index))
    for (antecedent, consequent, confidence, lift, support) in rules:
        print("{}, {} conf={:.4f}, {:.4f}, {:.4f}".format(
            antecedent, consequent, confidence, lift, support))
    assert rules == expectedRules
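# The rule metrics asserted above follow the usual association-rule definitions:
# confidence(A => C) = support(A u C) / support(A), and lift(A => C) =
# confidence(A => C) / support(C). A minimal self-contained sketch of those
# formulas over plain Python sets (independent of the ItemSet/InvertedIndex
# classes used by the tests) might look like this:
def rule_metrics(transactions, antecedent, consequent):
    """Return (support, confidence, lift) for the rule antecedent => consequent."""
    n = len(transactions)
    union = antecedent | consequent
    sup_union = sum(1 for t in transactions if union <= t) / n
    sup_ante = sum(1 for t in transactions if antecedent <= t) / n
    sup_cons = sum(1 for t in transactions if consequent <= t) / n
    confidence = sup_union / sup_ante
    lift = confidence / sup_cons
    return sup_union, confidence, lift

# Example on the same toy data: the rule x => z has support 2/3, confidence 1.0
# and lift 1.5, matching the expected tuple in the test.
transactions = [set("abcdef"), set("ghijkl"), {"z", "x"}, {"z", "x"},
                {"z", "x", "y"}, {"z", "x", "y", "i"}]
print(rule_metrics(transactions, {"x"}, {"z"}))  # (0.666..., 1.0, 1.5)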
def __init__(self, query, index_file, collection):
    '''index is the inverted index; collection is the document collection'''
    self.raw_query = query
    self.index = InvertedIndex()
    self.index = self.index.loadData(index_file)
    self.docs = collection
    self.tokenizer = Tokenizer(
        known_words=set(self.index.get_items_inverted().keys()))
    if self.raw_query:
        self.processed_query = self.preprocessing(self.raw_query)
def booleanQuery(self):
    '''boolean query processing; note that a query like "A B C" is transformed
    to "A AND B AND C" for retrieving posting lists and merging them'''
    # ToDo: return a list of docIDs
    PostingDict = {}  # stores query term -> postings pairs read from the index file
    boolen = []       # stores the list of docIDs for each query-term key
    booleanResult = set()
    tempDic = {}
    QueryDic = {}
    for qterm in Queryterm:
        plist = InvertedIndex.getPostingsList(qterm)
        # since every term in the inverted index is unique, add the
        # qterm:postings pair to the postings dictionary
        PostingDict.update({qterm: plist})
    for qterms in PostingDict.keys():
        tempDic[qterms] = len(PostingDict[qterms])
    for qterms, cf in tempDic.items():
        if cf > 0:
            if cf < 300:
                QueryDic[qterms] = cf
    # if the query contains only a single word, report the result read
    # directly from the inverted index file
    if len(QueryDic) == 1:
        for key in QueryDic.keys():
            booleanResult = PostingDict[key]
        if not booleanResult:
            print("Given query has no matched document", ''.join(Query))
        else:
            print("Result of the search query ", booleanResult)
    else:
        keylist = list(QueryDic.keys())
        # iterate over the query terms as keys and intersect their postings
        # lists to find the postings that contain all query terms
        for key in QueryDic.keys():
            # add the postings list of each query term
            boolen.append(sorted(PostingDict[key], key=int))
        # the intersection becomes the boolean result set
        booleanResult = set.intersection(*map(set, boolen))
        # if the full intersection is empty, fall back to pairwise
        # intersection of adjacent query terms
        if booleanResult == set():
            for i in range(len(QueryDic) - 1):
                if not i == len(QueryDic) - 1:
                    p1 = PostingDict[keylist[i]]
                    p2 = PostingDict[keylist[i + 1]]
                    temp = InvertedIndex.mergeList(p1, p2)
                    # if the pairwise merge is non-empty, add it to the
                    # boolean result set
                    if not temp == set():
                        booleanResult.update(temp)
    return sorted(booleanResult, key=int)
def booleanQuery(self): """ boolean query processing; note that a query like "A B C" is transformed to "A AND B AND C" for retrieving posting lists and merge them""" ivObj = InvertedIndex() ivObj.load(self.filename) index_item = ivObj.items[self.tokens[0]] # Get the doc ids from the sorted postings in the same order. docs = index_item.get_sorted_doc_ids() for token in self.tokens: index_item = ivObj.items[token] # Find intersection between the current docs and the index_item for the current token. docs = index_item.intersection(docs) return docs
def test_stress():
    datasets = [
        ("datasets/UCI-zoo.csv", 0.3),
        ("datasets/mushroom.csv", 0.4),
        # ("datasets/BMS-POS.csv", 0.05),
        # ("datasets/kosarak.csv", 0.05),
    ]

    for (csvFilePath, min_support) in datasets:
        # Run Apriori and FP-Growth and assert both have the same results.
        print("Running Apriori for {}".format(csvFilePath))
        start = time.time()
        index = InvertedIndex()
        index.load_csv(csvFilePath)
        apriori_itemsets = apriori(index, min_support)
        apriori_duration = time.time() - start
        print("Apriori complete. Generated {} itemsets in {:.2f} seconds".format(
            len(apriori_itemsets), apriori_duration))

        print("Running FPTree for {}".format(csvFilePath))
        start = time.time()
        with open(csvFilePath, newline='') as csvfile:
            test_transactions = list(csv.reader(csvfile))
            fptree_itemsets = mine_fp_tree(test_transactions, min_support)
        fptree_duration = time.time() - start
        print("fp_growth complete. Generated {} itemsets in {:.2f} seconds".format(
            len(fptree_itemsets), fptree_duration))

        if set(fptree_itemsets) == set(apriori_itemsets):
            print("SUCCESS({}): Apriori and fptree results match".format(csvFilePath))
        else:
            print("FAIL({}): Apriori and fptree results differ!".format(csvFilePath))
        assert set(fptree_itemsets) == set(apriori_itemsets)

        if apriori_duration > fptree_duration:
            print("FPTree was faster by {:.2f} seconds".format(
                apriori_duration - fptree_duration))
        else:
            print("Apriori was faster by {:.2f} seconds".format(
                fptree_duration - apriori_duration))
        print("")
def build_index():
    if not os.path.exists(INDEX_FOLDER):
        os.mkdir(INDEX_FOLDER)

    index = InvertedIndex.load(INDEX_FOLDER, InvertedIndex.NAME)
    if index:
        logging.debug("Index is successfully loaded")
        return

    logging.debug("Building index...")
    articles = select(article.id for article in Article)[:]
    index = InvertedIndex()
    IndexBuilder(processes=1).build(index, articles)

    logging.debug("Saving index...")
    index.save(INDEX_FOLDER)
def query(index_file, algorithm, query_file, query_id):
    '''the main query processing program, using QueryProcessor'''
    # Command-line usage: "echo query_string | python query.py index_file processing_algorithm"
    # processing_algorithm: 0 for booleanQuery and 1 for vectorQuery
    # for booleanQuery, the program prints the total number of documents and the list of document IDs
    # for vectorQuery, the program outputs the top 3 most similar documents
    query_file = cranqry.loadCranQry(query_file)  # load the query file
    index_items = InvertedIndex()
    index_items = index_items.load(index_file)
    cran_file = cran.CranFile('cran.all')
    query_verify = QueryProcessor(query_file, index_items, cran_file.docs)
    query_verify.preprocessing()
    results = None
    if algorithm == '0':  # algorithm 0 selects the boolean model
        results = query_verify.booleanQuery(query_id)
    elif algorithm == '1':  # algorithm 1 selects the vector model
        results = query_verify.vectorQuery(3, query_id)
    print(results)
def setup_ranker():
    global ranker
    text_processor = TextProcessor()
    docs = []
    index = InvertedIndex.load(INDEX_FOLDER, "inverted_index")
    articles = select(article.id for article in Article)
    for article_id in articles:
        article = Article[article_id]
        docs.append(
            AbstractAndArticle(article,
                               _read_file(article.processed_abstract_path)))

    ranker = TfIdf(index, text_processor, docs, VECTORS_PER_FILE,
                   VECTORS_SAVE_FOLDER)
def vectorQuery(self, k): """ vector query processing, using the cosine similarity. """ # ToDo: return top k pairs of (docID, similarity), ranked by their cosine similarity with the query in the descending order # You can use term frequency or TFIDF to construct the vectors result = {} ivObj = InvertedIndex() ivObj.load(self.filename) # loading the InvertedIndex doc_set = set() term_idf_list = [] for term in self.tokens: # for every term in the query finding the document IDs where the term is present if term in self.index: doc_set = doc_set.union(set(self.index[term].posting.keys())) term_idf_list.append( ivObj.idf(term) * 1.0 / len(self.tokens)) # calculating tf-idf weights for query doc_list = list(doc_set) for docID in doc_list: # Calculating tf-idf weights for the above documents for term in self.tokens: if term in self.index: if docID in result.keys(): result[docID].append(ivObj.tfidf(term, docID)) else: result[docID] = [ivObj.tfidf(term, docID)] else: if docID in result.keys(): result[docID].append(0.0) else: result[docID] = [0.0] score_dict = {} term_idf_list_np = np.array(self.unitVector( term_idf_list)) # calculating unit vector for each document for docID in doc_list: unit_result = self.unitVector(result[docID]) unit_np = np.array(unit_result) score_dict[docID] = np.dot( term_idf_list_np, unit_np) # dot product for query and each document score_list = score_dict.items() final = sorted(score_list, key=itemgetter(1), reverse=True) similarity = [] for i in range(0, k): similarity.append(final[i]) return similarity # list of (docID,cosine similarity) in order of ranking
def run_rank():
    text_processor = TextProcessor()
    docs = []
    index = InvertedIndex.load(INDEX_FOLDER, "inverted_index")
    articles = select(article.id for article in Article)
    for article_id in articles:
        article = Article[article_id]
        docs.append(AbstractAndArticle(article,
                                       _read_file(article.processed_abstract_path)))

    ranker = TfIdf(index, text_processor, docs,
                   vectors_per_file=VECTORS_PER_FILE,
                   vectors_save_folder=VECTORS_SAVE_FOLDER)

    while True:
        query = input("Enter query: ")
        top_ids = ranker.rank(query, 5)
        for article_id in top_ids:
            article = Article[article_id]
            print(article.title, article.document.url)
def query(index_file, model_type, query_file, query_id):
    '''the main query processing program, using QueryProcessor'''
    # Command-line usage: "echo query_string | python query.py index_file processing_algorithm"
    # processing_algorithm: 0 for booleanQuery and 1 for vectorQuery
    # for booleanQuery, the program prints the total number of documents and the list of document IDs
    # for vectorQuery, the program outputs the top 3 most similar documents

    # load documents
    inputdocument = cran.CranFile("cran.all")

    # load the index file saved in part 1
    index = InvertedIndex().load(index_file)

    # load the processed query file
    queries = loadCranQry(query_file)

    qp = QueryProcessor(queries, index, inputdocument, query_id)

    if model_type == 0:
        Booleanres = qp.booleanQuery()
        print(Booleanres)
    if model_type == 1:
        vectorres = qp.vectorQuery(3)
        print(vectorres)
    if model_type == 2:
        qp.BatchEvaluation()
def main(): ######### # SETUP # ######### # Get input args newsgroups_root_dir = argv[1] feat_def_path = argv[2] class_def_path = argv[3] training_data_path = argv[4] # Generate index #index_newsgroups(newsgroups_root_dir, "idx_save.pkl") ii = InvertedIndex() ii.load("idx_save.pkl") # Write out feature/term pairs to feat_def_path feature_id = 0 with open(feat_def_path, 'w') as outf: for item in ii.items: outf.write(str(feature_id) + " " + str(item) + "\n") feature_id += 1 # Read back in the feature/term pairs for later with open(feat_def_path, 'r') as inf: ft_pairs = inf.readlines() # Put the ft_pairs into a dictionary for quick lookup ft_dict = {} for pair in ft_pairs: ft_dict[pair.split()[1].strip()] = pair.split()[0] # Map the different newsgroups to a given class # This is fairly manual... with open(class_def_path, 'w') as outf: for dir in listdir(newsgroups_root_dir): outf.write(class_def_helper(dir) + " " + dir + "\n") ############################ # TRAINING DATA GENERATION # ############################ # Create the training data # For each document: # Find its containing folder, and extract class from class def # For each term in document # Compute tfidf, tf or idf current_file_id = 1 with open(training_data_path + ".TFIDF", 'w') as outf: # Compute tf-idf # Go through each document in newsgroups dir for root, _, files in walk(newsgroups_root_dir): # Find and write out the class label local_dir = root.split(sep)[-1] # For each file... for file in files: outf.write(class_def_helper(local_dir) + " ") print(root, file) # Get the words from the doc stemmed_token_list = preprocess_doc(root + sep + file) # Put all the info into a set (for uniqueness) data_set = set() # Now that we've re-done all that, find idfs for word in stemmed_token_list: # Skip blank stopwords if word == "": continue # Get the term ID #outf.write(ft_dict[word] + ":") # Calculate and write out TF-IDF # Note current_file_id is our doc_id tf = ii.find(word).posting[current_file_id].term_freq() idf = ii.idf(word) #outf.write(str(log10(1 + tf) * idf) + " ") data_set.add(ft_dict[word] + ":" + str(log10(1 + tf) * idf)) # Write newline to signify end of file #outf.write("\n") outf.write(" ".join( sorted(data_set, key=lambda x: int(x.split(':')[0]))) + "\n") outf.flush() # Increment our current doc current_file_id += 1 current_file_id = 1 with open(training_data_path + ".TF", 'w') as outf: # Compute tf # Go through each document in newsgroups dir for root, _, files in walk(newsgroups_root_dir): # Find and write out the class label local_dir = root.split(sep)[-1] # For each file... 
for file in files: outf.write(class_def_helper(local_dir) + " ") print(root, file) # Get the words from the doc stemmed_token_list = preprocess_doc(root + sep + file) # Put all the info into a set (for uniqueness) data_set = set() # Now that we've re-done all that, find idfs for word in stemmed_token_list: # Skip blank stopwords if word == "": continue # Get the term ID #outf.write(ft_dict[word] + ":") # Write the TF # Note current_file_id is our doc_id # outf.write(str(ii.find(word).posting[ # current_file_id].term_freq()) + " ") data_set.add(ft_dict[word] + ":" + str( ii.find(word).posting[current_file_id].term_freq())) # Write newline to signify end of file # outf.write("\n") outf.write(" ".join( sorted(data_set, key=lambda x: int(x.split(':')[0]))) + "\n") # outf.flush() # Increment our current doc current_file_id += 1 current_file_id = 1 with open(training_data_path + ".IDF", 'w') as outf: # Compute idf # Go through each document in newsgroups dir for root, _, files in walk(newsgroups_root_dir): # Find and write out the class label local_dir = root.split(sep)[-1] # For each file... for file in files: outf.write(class_def_helper(local_dir) + " ") print(root, file) # Get the words from the doc stemmed_token_list = preprocess_doc(root + sep + file) # Put all the info into a set (for uniqueness) data_set = set() # Now that we've re-done all that, find idfs for word in stemmed_token_list: # Skip blank stopwords if word == "": continue # Get the term ID #outf.write(ft_dict[word] + ":" + str(ii.idf(word)) # + " ") data_set.add(ft_dict[word] + ":" + str(ii.idf(word))) # Write newline to signify end of file outf.write(" ".join( sorted(data_set, key=lambda x: int(x.split(':')[0]))) + "\n")
class QueryProcessor: ## # # @param self # @param query # @param index # @param collection # @return None # @brief The constructor. # This process is extremely expensive because it loads the entire pickle object into memory. # If we are only executing this for one query it is fine but if we are doing it # for the evaluation used the load query instead # @exception None documented yet ## def __init__(self, query, index_file, collection): ''' index is the inverted index; collection is the document collection''' self.raw_query = query self.index = InvertedIndex() self.index = self.index.loadData(index_file) self.docs = collection self.tokenizer = Tokenizer( known_words=set(self.index.get_items_inverted().keys())) if self.raw_query: self.processed_query = self.preprocessing(self.raw_query) ## # @brief This method is used to load the next query for evaluation # @param self # @param query # @return None # @exception None ## def loadQuery(self, query): self.raw_query = query self.processed_query = self.preprocessing(self.raw_query) ## # @brief This method is used to load the next query for evaluation # @param self # @param raw_query # @return None # @exception None ## def preprocessing(self, raw_query): ''' apply the same preprocessing steps used by indexing, also use the provided spelling corrector. Note that spelling corrector should be applied before stopword removal and stemming (why?)''' return self.tokenizer.transpose_document_tokenized_stemmed_spelling( raw_query) ## # @brief This method does the boolean query processing # @param self # @return results:list[docID] # @bug Fixed # @exception None ## def booleanQuery(self): ''' boolean query processing; note that a query like "A B C" is transformed to "A AND B AND C" for retrieving posting lists and merge them''' ''' This method would likely be faster due to the use of hashes, but I wanted to do what was shown in the slides from functools import reduce docs = [set(self.index[w]) for w in self.processed_query] docs.sort(key=len) # notice it is still smart to order by size return reduce(set.intersection,docs) ''' if len(self.processed_query) == 0: return [] ## checks that all of our query words are in the index, if not return [] ## for w in self.processed_query: if not w in self.index.get_items_inverted(): return [] ## checks if we only have 1 term in the query and returns its posting list if we do ## if len(self.processed_query) == 1: return list(self.index.get_items_inverted()[ self.processed_query[0]].get_posting_list().keys()) #### document_ids is a list of lists containing only document ids #### document_ids = [ list(self.index.get_items_inverted()[w].get_posting_list().keys()) for w in self.processed_query ] # by sorting so that we start with the shortest list of documents we get a potential speed up document_ids.sort(key=len) results = document_ids[0] ## iterates through each query word and does the intersection of docids from its posting list with all those before it ## ## could be done faster if index was implemented as set or some other hash data structure for p in document_ids[1:]: intermediate = [] i, j = 0, 0 while i < len(results) and j < len(p): if int(results[i]) < int(p[j]): i += 1 elif int(results[i]) > int(p[j]): j += 1 else: intermediate.append(p[j]) j += 1 i += 1 results = intermediate ## checks if we have already found terms totally disjoint from one another if len(results) == 0: return results return results ## # @brief This method compute cosine similarity for two vectors # @param self # @param vec1 # @param vec2 # 
@return score cosine: int # @exception None ## def cosine_similarity(self, vec1, vec2): # "compute cosine similarity: (vec1*vec2)/(||vec1||*||vec2||)" AA, AB, BB = 0, 0, 0 for i in range(len(vec1)): x = vec1[i] y = vec2[i] AA += x * x BB += y * y AB += x * y return round(AB / math.sqrt(AA * BB), 4) ## # @brief This method compute vector model # @param self # @param k # @return cosines: dict{docID: score} # @bug Fixed # @exception ValueError ## def vectorQuery(self, k): ''' vector query processing, using the cosine similarity. ''' #ToDo: return top k pairs of (docID, similarity), ranked by their cosine similarity with the query in the descending order # You can use term frequency or TFIDF to construct the vectors if len(self.processed_query) == 0: all_docids = set() for _, v in self.index.get_items_inverted().items(): all_docids.update(v.get_posting_list().keys()) return [(str(id), 0) for id in sorted(list(map(int, all_docids)))[:k]] query_words = list(set(self.processed_query)) idfs = [self.index.idf(w) for w in query_words] # undefined behavior from document on what to do if k is larger than the corpus try: if k > self.index.get_total_number_Doc(): raise ValueError('k is greater than number of documents') except ValueError as err: print(err.args) return # below we define behavior if none of the words in the query are in any documents # this behavior was not defined in instructions so no documents seems most appropriate # if you used google and got 0 cosine it would return 0 documents even if you wanted the 50 most relevant if set(idfs) == {0}: all_docids = set() for _, v in self.index.get_items_inverted().items(): all_docids.update(v.get_posting_list().keys()) return [(str(id), 0) for id in sorted(list(map(int, all_docids)))[:k]] # removes any words that have 0 idf as that means they didn't appear in the corpus, means save memory # probably not necessary to turn it into lists, and may actually be more appropriate to leave as tuples idfs, query_words = map( list, zip(*[i for i in list(zip(idfs, query_words)) if not i[0] == 0])) #Calculates tfs of relevant words query_term_counter = Counter(self.processed_query) query_tf_vector = [ round(math.log10(query_term_counter[w] + 1), 4) for w in query_words ] #Other way of doing tf #query_tf_vector = [round(1 + math.log10(query_term_counter[w]),4) if query_term_counter[w] > 0 else 0 for w in query_words] ### NCC change if a term in a quiry does not appear in our inverted index Forget/Discount term #### postings should be a list of lists which contains word postings postings = [ self.index.get_items_inverted()[w].get_posting_list() for w in query_words if w in self.index.get_items_inverted() ] document_ids = set().union(*postings) document_tfs = {d: [0] * len(query_words) for d in document_ids} for inx, term in enumerate(postings): for document_id, posting in term.items(): #log normalization document_tfs[document_id][inx] = math.log10( posting.term_freq() + 1) #Other # tf = posting.term_freq() # if tf > 0 : # tf = 1 + math.log10(tf) # else: # tf = 0 # document_tfs[document_id][inx] = tf query_tfidf = np.multiply(query_tf_vector, idfs) cosines = Counter({ d: self.cosine_similarity(query_tfidf, np.multiply(d_tf, idfs)) for d, d_tf in document_tfs.items() }) # this has to be a list as dict are not sorted... 
# need a consistent ordering of documents when multiple documents have the same score we first sort on score then docid, very slow # if we know k or know the number of documents we could use numpy to preallocate memory which means we would not have to use append and could just use copy temp_k = k scores = sorted(list(set(cosines.values())), reverse=True) ret = [] for s in scores: docs_with_score_s = sorted( [int(d) for d, v in cosines.items() if v == s]) if len(docs_with_score_s) >= temp_k: docs_with_score_s = docs_with_score_s[:temp_k] ret.extend([(str(d), s) for d in docs_with_score_s]) temp_k = 0 break else: temp_k = temp_k - len(docs_with_score_s) ret.extend([(str(d), s) for d in docs_with_score_s]) if not temp_k == 0: all_docids = set() for _, v in self.index.get_items_inverted().items(): all_docids.update(v.get_posting_list().keys()) ret.extend([(str(j), 0) for j in sorted( list(map(int, all_docids.difference({i[0] for i in ret}))))[:temp_k] ]) return ret
def eval(index_file, query_file, qrels_File, number_of_queries): #read queryfile,indexfile # ToDo queries = loadCranQry(query_file) queries_id_list = [str(int(x)) for x in queries.keys()] #print(queries_id_list) #read querls.txt qrels_dict = process_querls_file(qrels_File, queries_id_list) inputdocument = cran.CranFile("cran.all") # load the index file saved at from part 1 index = InvertedIndex().load(index_file) qp = QueryProcessor(queries, index, inputdocument, number_of_queries) queries_id_list_int = [int(x) for x in qrels_dict.keys()] queries_id_ls = [int(x) for x in queries.keys()] #IdeaVectorsforQuery_ids={} sumbooleanNADC = [] sumvectorNADC = [] with open('Evaluation_search.csv', 'w') as f: f.write("%s,%s,%s,%s\n" % ("Iteration", "AverageNDCG-booleanModel", "AverageNDCG-vectorModel", "P-value")) for i in range(0, 5): vectorNADC = [] booleanNADC = [] intersection_queries = list( set(queries_id_list_int) & set(queries_id_ls)) random_query_id_list = random.sample(queries_id_list_int, number_of_queries) #random_query_id_list=[153, 18] #print(random_query_id_list) for q_id in random_query_id_list: print("Processing for Query ID ::", q_id) qp.querynumber = q_id #boolean_res=qp.booleanQuery() vector_top3 = qp.vectorQuery(5) #vector_top3=[('12',0.34),('746',0.33),('875',0.24)] #print(boolean_res) print("Output for Vector Model Result::", vector_top3) if (vector_top3.__len__() < 1): vectorNADC.append(0) else: vector_label = [x[0] for x in vector_top3] score = [x[1] for x in vector_top3] print("DocumentIDs of Vector Model Result:: ", vector_label) print("Scores of Vector Model Result::", score) true_label = vector_label.copy() query_id = str(q_id) for x in vector_label: #str_x="{0:0=3d}".format(x) ind = vector_label.index(x) if (x in qrels_dict.get(query_id)): true_label[ind] = 1 else: true_label[ind] = 0 if true_label.__len__() < 5: len_val = 10 - (true_label.__len__()) true_label.extend([0] * len_val) print("Actual Vector:: ", true_label) print("Predicted Vector:: ", score) if sum(true_label) == 0: vectorNADC.append(0) else: ndcg = metrics.ndcg_score(true_label, score, 5) print("Calculated ndcg for Vector::", ndcg) vectorNADC.append(ndcg) boolean_res = qp.booleanQuery() print("output of boolean_res:: ", boolean_res) if boolean_res.__len__() < 1: booleanNADC.append(0) else: score = [1] * len(boolean_res) if (score.__len__() < 5): leng = 5 - (score.__len__()) score.extend([0] * leng) true_label = boolean_res.copy() query_id = str(q_id) for x in boolean_res: ind = boolean_res.index(x) if (x in qrels_dict.get(query_id)): true_label[ind] = 1 else: true_label[ind] = 0 if true_label.__len__() < 5: len_val = 10 - (true_label.__len__()) true_label.extend([0] * len_val) print("Actual boolean:: ", true_label) print("Predicted boolean:: ", score) if sum(true_label) == 0: booleanNADC.append(0) else: ndcg = metrics.ndcg_score(true_label, score, 5) print("Calculated ndcg for Boolean::", ndcg) booleanNADC.append(ndcg) print("Calculated NADC sum for all queries", vectorNADC) avergae_vectorNADC = float(sum(vectorNADC) / number_of_queries) print("Calculated NADC sum for all queries", booleanNADC) avergae_booleanNADC = float(sum(booleanNADC) / number_of_queries) print("Avergae NADC Vector::", avergae_vectorNADC) print("Avergae NADC boolean::", avergae_booleanNADC) p_value = scipy.stats.wilcoxon(vectorNADC, booleanNADC, zero_method='wilcox', correction=False) print(i, str(avergae_booleanNADC), str(avergae_vectorNADC), str(p_value[1])) p = "%.20f" % float(str(p_value[1])) print('P value for all the queries 
processed is:', p) f.write("%s,%s,%s,%s\n" % (i + 1, str(avergae_booleanNADC), str(avergae_vectorNADC), str(p))) print('Done')
BoolenQueryResultDic.append({qid: Bresult}) else: print("Vector Query TF-IDF calculation in progress") Topk, k = qprocessorobj.vectorQuery(3) #print("vector",qid,qrys[qid].text) print("Top", k, "(DocID Similarity)", Topk[:k]) ''' ************this below code is reused in batch_eval also*******************''' input_filename = "cran.all" ouput_filename = sys.argv[1] #"index_file" #sys.argv[2] Queryfile = "query.text" #sys.argv[3]#"query.text" '''creating object for cranefile and collection file and inverted index class,postings class''' cf = CranFile(input_filename) collectionfile = Collection() indexobj = InvertedIndex() 'iterating over cran file for document id' for i, doc in enumerate(cf.docs): collectionfile.docs.update({doc.docID: doc}) postingobj = Posting(doc.docID) '''reading index file which is stored while creating index''' with open(ouput_filename, "r") as invertedindex: InvertedIndex.items = json.load(invertedindex) 'formatting the query id in qrel.text and finding common query id in qrery.text' qidlist = {} qrys = loadCranQry(Queryfile) for position, q in enumerate(qrys): qidlist[q] = position + 1 'Below Variables are used for batch_eval.py file' BoolenQueryResultDic = [] VectorResult = []
import cran
import query
from cranqry import loadCranQry
from index import InvertedIndex, test
from query import QueryProcessor

print("***************Test Cases Running for Index File****************")
invertedobj = InvertedIndex()
test(invertedobj)

print("***************Test Cases Running for Query File****************")
# load documents
inputdocument = cran.CranFile("cran.all")

# load the index file saved in part 1
index = InvertedIndex().load("index_file")

# load the processed query file
queries = loadCranQry("query.text")

qp = QueryProcessor(queries, index, inputdocument, 29)
query.test(qp)
qp = QueryProcessor(queries, index, inputdocument, 29)
qp.vectorQuery(3)
def extractfeature(self, directoryOfNewsgroup, featureDefinitionFile, classDefinitionFile, trainingDataFile): iindexObject = InvertedIndex() invertedIndex = iindexObject.indexingCranfield(directoryOfNewsgroup) f = open(featureDefinitionFile, "w") counter = 0 for x in invertedIndex.items.keys(): counter = counter + 1 formattedData = str(counter) + " " + x + "\n" f.write(formattedData) self.termIdLookup[x] = counter f.close() #as per the proejct requirement hardcoding the class files here and outputting classDefinitiontuple = ("1 comp.graphics", "1 comp.os.ms-windows.misc", "1 comp.sys.ibm.pc.hardware", "1 comp.sys.mac.hardware", "1 comp.windows.x", "2 rec.autos", "2 rec.motorcycles", "2 rec.sport.baseball", "2 rec.sport.hockey", "3 sci.crypt", "3 sci.electronics", "3 sci.med", "3 sci.space", "4 misc.forsale", "5 talk.politics.misc", "5 talk.politics.guns", "5 talk.politics.mideast", "6 talk.religion.misc", "6 alt.atheism", "6 soc.religion.christian") classfile = open(classDefinitionFile, "w") for x in classDefinitiontuple: classfile.write(x + "\n") classfile.close() #end of hardcoded class files print('tf start') libsvmtf = {} if os.path.exists("training_data_file.TF"): os.remove("training_data_file.TF") newsgroup = self.getNewsGroupFile(directoryOfNewsgroup) for x in invertedIndex.items.keys(): for postingobject in invertedIndex.items.get(x).posting.keys(): #libsvmtf.setdefault(invertedIndex.items.get(x).posting.get(postingobject).docID, []).append('\t') libsvmtf.setdefault( invertedIndex.items.get(x).posting.get( postingobject).docID, []).append(self.getKeysByValue(self.termIdLookup, x)) libsvmtf.setdefault( invertedIndex.items.get(x).posting.get( postingobject).docID, []).append(':') #libsvmtf.setdefault(invertedIndex.items.get(x).posting.get(postingobject).docID, []).append('\t') libsvmtf.setdefault( invertedIndex.items.get(x).posting.get( postingobject).docID, []).append( round( invertedIndex.items.get(x).posting.get( postingobject).termfreq, 5)) # libsvmtf.setdefault(invertedIndex.items.get(x).posting.get(postingobject).docID, []).append('\t') for x in libsvmtf: tfdata = '' libsvmtffile = open("training_data_file.TF", "a") if x in newsgroup.class1items1: classid = 1 if x in newsgroup.class1items2: classid = 2 if x in newsgroup.class1items3: classid = 3 if x in newsgroup.class1items4: classid = 4 if x in newsgroup.class1items5: classid = 5 if x in newsgroup.class1items6: classid = 6 # print('\t '.join(libsvmtf)) #tfdata = str(x) +" : "+str(''.join(str(libsvmtf[x]).split(",")))[1:-1] + "\n" # for row in reader: # read a row as {column1: value1, column2: value2,...} # for (k, v) in row.items(): # go over each column name and value # columns[k].append(v) # append the value into the appropriate list # saved_column = df.column_name # you can also use df['column_name'] #print (str(tempstr).split("'","")) tfdata = str(classid) + " " + str(''.join( str(libsvmtf[x]).split(",")))[1:-1] + "\n" tfdata = str.replace(tfdata, " ':' ", ":") print(tfdata) libsvmtffile.write(tfdata) libsvmtffile.close() print('tf complete') print('idf start') libsvmidf = {} if os.path.exists("training_data_file.IDF"): os.remove("training_data_file.IDF") newsgroup = self.getNewsGroupFile(directoryOfNewsgroup) for x in invertedIndex.items.keys(): for postingobject in invertedIndex.items.get(x).posting.keys(): # libsvmtf.setdefault(invertedIndex.items.get(x).posting.get(postingobject).docID, []).append('\t') libsvmidf.setdefault( invertedIndex.items.get(x).posting.get( postingobject).docID, 
[]).append(self.getKeysByValue(self.termIdLookup, x)) libsvmidf.setdefault( invertedIndex.items.get(x).posting.get( postingobject).docID, []).append(':') # libsvmtf.setdefault(invertedIndex.items.get(x).posting.get(postingobject).docID, []).append('\t') libsvmidf.setdefault( invertedIndex.items.get(x).posting.get( postingobject).docID, []).append(invertedIndex.items.get(x).idf) # libsvmtf.setdefault(invertedIndex.items.get(x).posting.get(postingobject).docID, []).append('\t') for x in libsvmidf: idfdata = '' libsvmidffile = open("training_data_file.IDF", "a") if x in newsgroup.class1items1: classid = 1 if x in newsgroup.class1items2: classid = 2 if x in newsgroup.class1items3: classid = 3 if x in newsgroup.class1items4: classid = 4 if x in newsgroup.class1items5: classid = 5 if x in newsgroup.class1items6: classid = 6 # print('\t '.join(libsvmtf)) # tfdata = str(x) +" : "+str(''.join(str(libsvmtf[x]).split(",")))[1:-1] + "\n" # for row in reader: # read a row as {column1: value1, column2: value2,...} # for (k, v) in row.items(): # go over each column name and value # columns[k].append(v) # append the value into the appropriate list # saved_column = df.column_name # you can also use df['column_name'] idfdata = str(classid) + " " + str(''.join( str(libsvmidf[x]).split(",")))[1:-1] + "\n" idfdata = str.replace(idfdata, " ':' ", ":") # print(idfdata) libsvmidffile.write(idfdata) libsvmidffile.close() print('idf complete') print('TF-idf start') libsvmtfidf = {} if os.path.exists("training_data_file.TFIDF"): os.remove("training_data_file.TFIDF") newsgroup = self.getNewsGroupFile(directoryOfNewsgroup) for x in invertedIndex.items.keys(): for postingobject in invertedIndex.items.get(x).posting.keys(): # libsvmtf.setdefault(invertedIndex.items.get(x).posting.get(postingobject).docID, []).append('\t') libsvmtfidf.setdefault( invertedIndex.items.get(x).posting.get( postingobject).docID, []).append(self.getKeysByValue(self.termIdLookup, x)) libsvmtfidf.setdefault( invertedIndex.items.get(x).posting.get( postingobject).docID, []).append(':') # libsvmtf.setdefault(invertedIndex.items.get(x).posting.get(postingobject).docID, []).append('\t') libsvmtfidf.setdefault( invertedIndex.items.get(x).posting.get( postingobject).docID, []).append( invertedIndex.items.get(x).posting.get( postingobject).termfreq * invertedIndex.items.get(x).idf) # libsvmtf.setdefault(invertedIndex.items.get(x).posting.get(postingobject).docID, []).append('\t') for x in libsvmtfidf: tfidfdata = '' libsvmtfidffile = open("training_data_file.TFIDF", "a") if x in newsgroup.class1items1: classid = 1 if x in newsgroup.class1items2: classid = 2 if x in newsgroup.class1items3: classid = 3 if x in newsgroup.class1items4: classid = 4 if x in newsgroup.class1items5: classid = 5 if x in newsgroup.class1items6: classid = 6 # print('\t '.join(libsvmtf)) # tfdata = str(x) +" : "+str(''.join(str(libsvmtf[x]).split(",")))[1:-1] + "\n" # for row in reader: # read a row as {column1: value1, column2: value2,...} # for (k, v) in row.items(): # go over each column name and value # columns[k].append(v) # append the value into the appropriate list # saved_column = df.column_name # you can also use df['column_name'] tfidfdata = str(classid) + " " + str(''.join( str(libsvmtfidf[x]).split(",")))[1:-1] + "\n" tfidfdata = str.replace(tfidfdata, " ':' ", ":") # print(tfidfdata) libsvmtfidffile.write(tfidfdata) libsvmtfidffile.close() print('TF-idf complete')
# print(v) # print(doc.class_name) if class_name in v: class_label = k train_dict.update({(docid + class_name): [class_label, {term_id: term_val}]}) # write to file with open(training_file_tfidf, "w") as train_obj: for doc, val in train_dict.items(): x = '' classid = str(train_dict[doc][0]) for i in val[1:]: for k, v in i.items(): x = x + " " + str(k) + ":" + str(v) tfidfdata = classid + "\t" + x + "\n" train_obj.write(tfidfdata) print("training data file.tfidf generated successfully") if __name__ == '__main__': '''class_defn_file("class_sample_file") index_obj = InvertedIndex() iindex = index_obj.indexingCranfield("sample_newsgroup") feature_defn_file(iindex,"feature_sample_file") training_file_idf("feature_sample_file", "class_sample_file", "training_sample_file.tf", "training_sample_file.idf", "training_sample_file.tfidf", iindex)''' class_defn_file("class_definition_file") index_obj = InvertedIndex() iindex = index_obj.indexingCranfield("mini_newsgroups") feature_defn_file(iindex, "feature_definition_file") training_file("feature_definition_file", "class_definition_file", "training_data_file.tf", "training_data_file.idf", "training_data_file.tfidf", iindex)
def main(ref, k):
    # Build the index
    index = InvertedIndex(ref, k)
    index.prepare_disk()
    index.build()
def test_InvertedIndex():
    data = ("a,b,c,d,e,f\n"
            "g,h,i,j,k,l\n"
            "z,x\n"
            "z,x\n"
            "z,x,y\n"
            "z,x,y,i\n")

    index = InvertedIndex()
    index.load(data)
    assert index.support({Item("a")}) == 1 / 6
    assert index.support({Item("b")}) == 1 / 6
    assert index.support({Item("c")}) == 1 / 6
    assert index.support({Item("d")}) == 1 / 6
    assert index.support({Item("e")}) == 1 / 6
    assert index.support({Item("f")}) == 1 / 6
    assert index.support({Item("h")}) == 1 / 6
    assert index.support({Item("i")}) == 2 / 6
    assert index.support({Item("j")}) == 1 / 6
    assert index.support({Item("k")}) == 1 / 6
    assert index.support({Item("l")}) == 1 / 6
    assert index.support({Item("z")}) == 4 / 6
    assert index.support({Item("x")}) == 4 / 6
    assert index.support({Item("y")}) == 2 / 6

    sup_zx = index.support({Item("z"), Item("x")})
    assert sup_zx == 4 / 6

    sup_zxy = index.support({Item("z"), Item("x"), Item("y")})
    assert sup_zxy == 2 / 6

    sup_zxyi = index.support({Item("z"), Item("x"), Item("y"), Item("i")})
    assert sup_zxyi == 1 / 6
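# The support values asserted above amount to: support(S) = |transactions
# containing every item in S| / |all transactions|. With an inverted index that
# maps each item to the set of transaction ids containing it, that is just an
# intersection of posting sets. A minimal sketch (plain dict/set, not the
# project's InvertedIndex/Item classes) is shown below.
def support(postings, num_transactions, itemset):
    """postings: dict item -> set of transaction ids; itemset: iterable of items."""
    tid_sets = [postings.get(item, set()) for item in itemset]
    if not tid_sets:
        return 0.0
    common = set.intersection(*tid_sets)
    return len(common) / num_transactions

# e.g. for the six toy transactions above (0-indexed transaction ids):
# postings = {"z": {2, 3, 4, 5}, "x": {2, 3, 4, 5}, "y": {4, 5}, "i": {1, 5}, ...}
# support(postings, 6, {"z", "x", "y"}) == 2 / 6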
# -*- coding: utf-8 -*-
import sys
import argparse
import os.path
from index import InvertedIndex
from ui import Fen
from PyQt5.QtWidgets import QApplication

parser = argparse.ArgumentParser()
parser.add_argument("d", type=str, help="Path to the corpus")
parser.add_argument("s", type=str, help="Path to the file containing the stopwords")
args = parser.parse_args()

if os.path.isfile(args.d) and os.path.isfile(args.s):
    print("Indexing the corpus: \n Please wait ...!!")
    index = InvertedIndex(args.d, args.s)
    monApp = QApplication(sys.argv)
    fenetre = Fen(index)
    print("Indexing finished: you can start searching")
    print("----------------------------------------------------------")
    sys.exit(monApp.exec_())
else:
    print("ERROR: the document(s) you specified do not exist")
def eval(): # Algorithm: # Pick N random samples from query.txt # Get top 10 results from bool query for each rnd query # Get top 10 results from vector query for each rnd query # Compute NDCG btn bool query results and qrels.txt # Compute NDCG btn vector query results and qrels.txt # Get p-value btn bool and vector # Get the query collection qc = loadCranQry(query_path) poss_queries = list(qc) # Load up the inverted index ii = InvertedIndex() ii.load(index_file) # Load up the document collection cf = CranFile("cran.all") # Get ground-truth results from qrels.txt with open(qrels_path) as f: qrels = f.readlines() # Index qrels into a dict qrel_dict = {} for qrel in qrels: qrel_split = qrel.split() if int(qrel_split[0]) in qrel_dict: qrel_dict[int(qrel_split[0])].append(int(qrel_split[1])) else: qrel_dict[int(qrel_split[0])] = [int(qrel_split[1])] # Run over N random queries, collecting NDCGs bool_ndcgs = [] vector_ndcgs = [] for _ in range(n): # Get random query ID query_id = choice(poss_queries) # Get the query if 0 < int(query_id) < 10: query_id = '00' + str(int(query_id)) elif 9 < int(query_id) < 100: query_id = '0' + str(int(query_id)) try: query = qc[query_id].text except KeyError: print("Invalid query id", query_id) return # Initialize the query processor qp = QueryProcessor(query, ii, cf) # Run bool query bool_result = qp.booleanQuery()[:10] # Run vector query vector_result = qp.vectorQuery(10) # Pull top 10 ground-truth results from qrels dict gt_results = qrel_dict[poss_queries.index(query_id) + 1][:10] # Compute NDCG for bool query # NOTE: There is no weighting on the bool query, so give all an even 1 truth_vector = list(map(lambda x: x in gt_results, bool_result)) bool_ndcg = ndcg_score(truth_vector, [1] * len(truth_vector), k=len(truth_vector)) # Compute NDCG for vector query vector_docs = [] vector_scores = [] for v in vector_result: vector_docs.append(v[0]) vector_scores.append(v[1]) truth_vector = list(map(lambda x: x in gt_results, vector_docs)) vector_ndcg = ndcg_score(truth_vector, vector_scores, k=len(truth_vector)) # Accumulate NDCGs bool_ndcgs.append(bool_ndcg) vector_ndcgs.append(vector_ndcg) # Average out score lists bool_avg = 0 for bool in bool_ndcgs: bool_avg += bool bool_avg /= len(bool_ndcgs) vector_avg = 0 for vector in vector_ndcgs: vector_avg += vector vector_avg /= len(vector_ndcgs) # Present averages and p-values print("Boolean NDCG average:", bool_avg) print("Vector NDCG average:", vector_avg) if n > 19: print("Wilcoxon p-value:", wilcoxon(bool_ndcgs, vector_ndcgs).pvalue) else: print("Wilcoxon p-value: Sample size too small to be significant") print("T-Test p-value:", ttest_ind(bool_ndcgs, vector_ndcgs).pvalue)
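# The evaluation above leans on an ndcg_score(relevances, scores, k) helper
# whose implementation lives elsewhere in the project. As a reference, a
# minimal self-contained NDCG@k sketch over binary relevance labels (the 0/1
# truth vectors built above) could look like the following; it may differ in
# detail from the helper actually imported here.
import math

def dcg_at_k(relevances, k):
    """DCG@k = sum over ranks i (starting at 1) of rel_i / log2(i + 1)."""
    return sum(rel / math.log2(i + 2) for i, rel in enumerate(relevances[:k]))

def ndcg_at_k(relevances, scores, k):
    """Rank documents by score, then normalize DCG by the ideal (sorted) ordering."""
    ranked = [rel for _, rel in sorted(zip(scores, relevances),
                                       key=lambda p: p[0], reverse=True)]
    ideal = sorted(relevances, reverse=True)
    idcg = dcg_at_k(ideal, k)
    return dcg_at_k(ranked, k) / idcg if idcg > 0 else 0.0

# e.g. ndcg_at_k([1, 0, 1, 0, 0], [0.9, 0.8, 0.7, 0.2, 0.1], k=5)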
def test(index_loc, cran_loc, qrels_loc): ''' test your code thoroughly. put the testing cases here''' ##### SETUP ITEMS ##### # Grab index file to restore II ii = InvertedIndex() ii.load(index_loc) # Get the document collection cf = CranFile(cran_loc) # Get ground-truth results from qrels.txt with open(qrels_loc) as f: qrels = f.readlines() # Index qrels into a dict qrel_dict = {} for qrel in qrels: qrel_split = qrel.split() if int(qrel_split[0]) in qrel_dict: qrel_dict[int(qrel_split[0])].append(int(qrel_split[1])) else: qrel_dict[int(qrel_split[0])] = [int(qrel_split[1])] ##### INITIAL TEST ITEMS ##### print("TESTS BASED ON SUGGESTED TESTING POINTS") # Ensure tf is correct # Find a random word and check TF value against what is manually done posting_list = ii.find("experiment").posting tf_vector = [] for posting in posting_list: tf_vector.append(len(posting_list[posting].positions) \ == posting_list[posting].term_freq()) print("TF is computed correctly:", all(tf_vector)) # Ensure idf is correct print("IDF is computed correctly:", log10(ii.nDocs / len(posting_list)) \ == ii.idf("experiment")) # As both tf and idf are correct, and tf-idf is a product of the two, # it is reasonable to assume tf-idf is computed correctly ##### BOOL QUERY TESTS ##### # Here, I use very specific boolean queries to ensure that a # limited number of documents are returned print("\nBOOL QUERY TESTS") # Ensure that the exact title of doc 8 matches for doc 8 doc8 = "measurements of the effect of two-dimensional and three-dimensional roughness elements on boundary layer transition" qp1 = QueryProcessor(doc8, ii, cf) print("Bool query matches on exact title:", qp1.booleanQuery() == [8]) # Ensure that bool query matches very specific AND query qp2 = QueryProcessor("hugoniot and infinitesimally", ii, cf) print( "Bool query matches on specific AND query ('hugoniot and infinitesimally'):", qp2.booleanQuery() == [329]) # Test that an OR query is handled properly # Both gravel and stagnation have completely distinct postings lists. # OR should merge them. gravel_postings = ii.find("gravel").sorted_postings[:] stag_postings = ii.find("stagnat").sorted_postings[:] gravel_postings.extend(stag_postings) qp3 = QueryProcessor("gravel or stagnation", ii, cf) print("Bool query successfully handles OR ('gravel or stagnation'):", qp3.booleanQuery() == sorted(gravel_postings)) # Test that NOT is handled properly # The posting list for "diameter" is a subset of "slipstream" postings # (oddly enough). To test this works, do "slipstream and not diameter" # and we chould get slipstream's postings minus those of diameter. 
slip_postings = ii.find("slipstream").sorted_postings[:] diam_postings = ii.find("diamet").sorted_postings[:] slip_not_diam = [t for t in slip_postings if t not in diam_postings] print("Bool query successfully handles NOT ('slipstream and not diameter'):", QueryProcessor("slipstream and not diameter", ii, cf).booleanQuery() \ == slip_not_diam) # Ensure AND/OR order doesn't matter print("Bool query can handle query regardless of AND order ('a and b' = 'b and a'):", QueryProcessor("slipstream and diameter", ii, cf).booleanQuery() \ == QueryProcessor("diameter and slipstream", ii, cf).booleanQuery()) print("Bool query can handle query regardless of OR order ('a or b' = 'b or a'):", QueryProcessor("slipstream or diameter", ii, cf).booleanQuery() \ == QueryProcessor("diameter or slipstream", ii, cf).booleanQuery()) # Ensure that the presence of parens does not change query results print("Bool query can handle query regardless of parens ('slipstream and diameter'):", QueryProcessor("slipstream and diameter", ii, cf).booleanQuery() \ == QueryProcessor("(slipstream and diameter)", ii, cf).booleanQuery()) # Ensure parentheses do not change order of processing for AND-AND and OR-OR queries print("Bool query AND is accociative ('(a and b) and c' = 'a and (b and c)'):", QueryProcessor("(slipstream and diameter) and thrust", ii, cf).booleanQuery() \ == QueryProcessor("slipstream and (diameter and thrust)", ii, cf).booleanQuery()) print("Bool query OR is accociative ('(a or b) or c' = 'a or (b or c)'):", QueryProcessor("(slipstream or diameter) or thrust", ii, cf).booleanQuery() \ == QueryProcessor("slipstream or (diameter or thrust)", ii, cf).booleanQuery()) # Ensure parentheses properly group items # Tested by doing the query "manually" by adding/orring the correct terms part_one = QueryProcessor("conduction and cylinder and gas", ii, cf).booleanQuery() part_two = QueryProcessor("radiation and gas", ii, cf).booleanQuery() part_one.extend(part_two) expected_result = QueryProcessor("hugoniot", ii, cf).booleanQuery() expected_result.extend(part_one) print("Bool query parens successfully group conflicting operators:", QueryProcessor("(conduction and cylinder and gas) or (radiation and gas) or hugoniot", ii, cf).booleanQuery() \ == sorted(list(set(expected_result)))) ##### VECTOR QUERY TESTS ##### # For this, just ensure that most of the results are in the expected list print("\nVECTOR QUERY TESTS") # Ensure vector query can match on exact title print("Vector query matches on exact title:", qp1.vectorQuery(1)[0][0] == 8) # Try a few example queries from query.text # As long as one-fifth of t-10 are in gt_result, call it a pass # Note that queries with larger answer sets were chosen to # ensure there were enough to get to one-fifth of ten qc = loadCranQry("query.text") poss_queries = list(qc) # Query 001 result = QueryProcessor(qc["001"].text, ii, cf).vectorQuery(10) gt_result = qrel_dict[poss_queries.index("001") + 1] correct_vector = list(map(lambda x: x in gt_result, [x[0] for x in result])) print("Vector query is at least one-fifth correct for query 001:", sum(correct_vector) > 2) # Query 128 result = QueryProcessor(qc["128"].text, ii, cf).vectorQuery(10) gt_result = qrel_dict[poss_queries.index("128") + 1] correct_vector = list(map(lambda x: x in gt_result, [x[0] for x in result])) print("Vector query is at least one-fifth correct for query 128:", sum(correct_vector) > 2) # Query 226 result = QueryProcessor(qc["226"].text, ii, cf).vectorQuery(10) gt_result = qrel_dict[poss_queries.index("226") + 1] 
correct_vector = list(map(lambda x: x in gt_result, [x[0] for x in result])) print("Vector query is at least one-fifth correct for query 226:", sum(correct_vector) > 2) # Query 196 result = QueryProcessor(qc["196"].text, ii, cf).vectorQuery(10) gt_result = qrel_dict[poss_queries.index("196") + 1] correct_vector = list(map(lambda x: x in gt_result, [x[0] for x in result])) print("Vector query is at least one-fifth correct for query 196:", sum(correct_vector) > 2) # Query 291 result = QueryProcessor(qc["291"].text, ii, cf).vectorQuery(10) gt_result = qrel_dict[poss_queries.index("291") + 1] correct_vector = list(map(lambda x: x in gt_result, [x[0] for x in result])) print("Vector query is at least one-fifth correct for query 291:", sum(correct_vector) > 2)
def eval(indexfilename, queryfilename, queryrefilename, numberofrandomqueries): # ToDo actual = [] # if numberofrandomqueries > 225: raise Exception('please enter query count less than or equal to 225') qrys = loadCranQry("query.text") validqueries = [] querycounter = 0 for q in qrys: validqueries.append(int(q)) loadiindex = InvertedIndex() loadiindex = loadiindex.load("index_file.pickle") # print("index loaded") cf = CranFile('cran.all') #QueryProcessor.numberofresult =10 #qp = QueryProcessor(qrys,loadiindex,cf.docs,10) queryRelevence = dict() for line in open(queryrefilename): fields = line.split(" ") fields[0] = '%0*d' % (3, int(fields[0])) if fields[0] in queryRelevence: # and let's extract the data: queryRelevence[fields[0]].append(fields[1]) else: # create a new array in this slot queryRelevence[fields[0]] = [fields[1]] replacecounter = 0 queryRelevenceUpdated = {} for k in queryRelevence: queryRelevenceUpdated['%0*d' % (3, int( validqueries[replacecounter]))] = queryRelevence.get(k) replacecounter = replacecounter + 1 # relevent = list(queryRelevence.keys()) # relevent = list(map(int, relevent)) #samplespace = np.intersect1d(relevent, validqueries) list_of_random_items = random.sample(validqueries, numberofrandomqueries) tempcounter2 = 0 booleanndcg = [] vectorndcg = [] while tempcounter2 < numberofrandomqueries: list_of_random_items[tempcounter2] = '%0*d' % ( 3, int(list_of_random_items[tempcounter2])) print('query for which ndcg is calculated ' + str(list_of_random_items[tempcounter2])) y = str(list_of_random_items[tempcounter2]) vectorresult = query(indexfilename, '1', queryfilename, str(list_of_random_items[tempcounter2]), 10) # vectorresult = ['573', '51', '944', '878', '12', '486', '875', '879', '746', '665'] # print(vectorresult) tempcounter = 0 for z in vectorresult: if z in queryRelevenceUpdated[str( list_of_random_items[tempcounter2])]: vectorresult[tempcounter] = 1 else: vectorresult[tempcounter] = 0 tempcounter = tempcounter + 1 #print(vectorresult) idealvectorresult = vectorresult.copy() idealvectorresult.sort(reverse=True) #print(idealvectorresult) if sum(idealvectorresult) == 0: ndcgscore = 0 else: ndcgscore = ndcg_score(idealvectorresult, vectorresult) # print(ndcgscore) vectorndcg.append(ndcgscore) tempcounter3 = 0 booleanqueryresult = query(indexfilename, '0', queryfilename, str(list_of_random_items[tempcounter2]), 10) #booleanqueryresult = ['462','462','462','462','462','462','462','462','462'] booleanquery = booleanqueryresult.copy() for g in booleanquery: if g in queryRelevenceUpdated[str( list_of_random_items[tempcounter2])]: booleanquery[tempcounter3] = 1 else: booleanquery[tempcounter3] = 0 tempcounter3 = tempcounter3 + 1 #print(booleanquery) tempcounter4 = len(booleanquery) while tempcounter4 < 10: booleanquery.append(0) tempcounter4 = tempcounter4 + 1 idealbooleanresult = [] for i in range(0, 10): if i < len(queryRelevenceUpdated[str( list_of_random_items[tempcounter2])]): idealbooleanresult.append(1) else: idealbooleanresult.append(0) idealbooleanresult.sort(reverse=True) if sum(booleanquery) == 0: ndcgscoreboolean = 0 else: ndcgscoreboolean = ndcg_score(booleanquery, idealbooleanresult) booleanndcg.append(ndcgscoreboolean) tempcounter2 = tempcounter2 + 1 print('P value for all the queries processed is:') print( scipy.stats.wilcoxon(vectorndcg, booleanndcg, zero_method='wilcox', correction=False)) print('Done')
def vectorQuery(self, k): ''' vector query processing, using the cosine similarity. ''' #ToDo: return top k pairs of (docID, similarity), ranked by their cosine similarity with the query in the descending order # You can use term frequency or TFIDF to construct the vectors 'Finding TF and IDF of Queryterms and saving the result to TF.json and IDF.json file' termfrequency, IDF = postingobj.term_freq(collectionfile, Queryterm) 'Saving TF,IDF of document for given query' indexobj.save(termfrequency, "TF.json") indexobj.save(IDF, "IDF.json") TF_filename = open("TF.json") TF = json.load(TF_filename) IDF_filename = open("IDF.json") IDF = json.load(IDF_filename) QueryDict = {} Qlen = len(Query) Querytf = {} Querytfidf = {} tempdic = {} DocSim = [] '''processing each query term and calculating TF-IDF of query and passing document and query vector to cosine function to calculate cosine similarity''' for term in Queryterm: plist = InvertedIndex.getPostingsList(term) QueryDict.update({term: plist}) if term not in Querytf.keys(): Querytf[term] = 1 else: Querytf[term] = Querytf[term] + 1 for qterms, posting in QueryDict.items(): for pos in posting: for IDFword in IDF: if qterms == IDFword: if qterms not in Querytfidf.keys(): '''calculating tf of query using query token frequency in query to the total query tokens''' tf = Querytf[qterms] '''calculating td-idf of query where idf of word in query is 1+log(N/n) where N total documents and n is number of documents that contain the term ''' Querytfidf[qterms] = {pos: tf * (1 + IDF[IDFword])} else: Querytfidf[qterms].update( {pos: (tf) * (1 + IDF[IDFword])}) TFwordValues = TF[qterms] '''calculating TF*IDF of document and converting it to vector''' for TFdoc, TFvalues in TFwordValues.items(): for IDFword in IDF: if qterms == IDFword and TFdoc == pos: if qterms not in tempdic.keys(): tempdic[qterms] = { TFdoc: (TFvalues) * IDF[IDFword] } else: tempdic[qterms].update( {TFdoc: TFvalues * IDF[IDFword]}) 'converting Query tf -idf dictionary to matrix/vector' Querymatrix = pd.DataFrame(Querytfidf) 'converting document tf-idf dictionary to matrix/vector' DocTFIDFmatrix = pd.DataFrame(data=tempdic) 'processing the matrix/vector to make feasible for cosine function ' for Qpos, Dpos in zip(list(Querymatrix.index), list(DocTFIDFmatrix.index)): if Qpos == Dpos: Q = np.array(Querymatrix.loc[Qpos]) where_are_NaNs = np.isnan(Q) Q[where_are_NaNs] = 0 D = np.array(DocTFIDFmatrix.loc[Dpos]) where_are_NaNs = np.isnan(D) D[where_are_NaNs] = 0 cosine = QueryProcessor.cosine_similaritys(Q, D) DocSim.append((int(Qpos), cosine)) VectorID = sorted(DocSim, key=lambda x: x[1], reverse=True) TopID = sorted(DocSim[:10], key=lambda x: x[1], reverse=True) #print(VectorID) VectorResult.append({qid: VectorID}) return TopID, k
def test_apriori(): data = ("a,b,c,d,e,f\n" "g,h,i,j,k,l\n" "z,x\n" "z,x\n" "z,x,y\n" "z,x,y,i\n") expectedItemSets = {ItemSet("i"): 2 / 6, ItemSet("z"): 4 / 6, ItemSet("x"): 4 / 6, ItemSet("y"): 2 / 6, ItemSet("xz"): 4 / 6, ItemSet("yz"): 2 / 6, ItemSet("xy"): 2 / 6, ItemSet("xyz"): 2 / 6} index = InvertedIndex() index.load(data) itemsets = apriori(index, 2 / 6) assert(len(itemsets) == len(expectedItemSets)) for itemset in itemsets: assert(frozenset(itemset) in expectedItemSets) for itemset in itemsets: assert(expectedItemSets[frozenset(itemset)] == index.support(itemset)) print("Itemsets={}".format([i for i in itemsets if len(i) > 1])) def itemize(a): return list(map(item_id, a)) # (antecedent, consequent, confidence, lift, support) rx = [ (['y'], ['x'], 1.0, 1.5, 0.3333333333333333), (['x'], ['y'], 0.5, 1.5, 0.3333333333333333), (['y'], ['z'], 1.0, 1.5, 0.3333333333333333), (['z'], ['y'], 0.5, 1.5, 0.3333333333333333), (['x'], ['z'], 1.0, 1.5, 0.6666666666666666), (['z'], ['x'], 1.0, 1.5, 0.6666666666666666), (['x', 'y'], ['z'], 1.0, 1.5, 0.3333333333333333), (['z', 'y'], ['x'], 1.0, 1.5, 0.3333333333333333), (['z', 'x'], ['y'], 0.5, 1.5, 0.3333333333333333), (['y'], ['z', 'x'], 1.0, 1.5, 0.3333333333333333), (['x'], ['z', 'y'], 0.5, 1.5, 0.3333333333333333), (['z'], ['x', 'y'], 0.5, 1.5, 0.3333333333333333) ] expectedRules = list(map(lambda a: (itemize(a[0]), itemize(a[1]), a[2], a[3], a[4]), rx)) itemset_counts = dict(map(lambda i: (tuple(i), index.count(i)), itemsets)) rules = generate_rules( itemsets, itemset_counts, index.num_transactions, 0, 0) def deitemize(a): return list(map(item_str, a)) p = list(map(lambda a: (deitemize(a[0]), deitemize(a[1]), a[2], a[3], a[4]), rules)) print("rules") print(p) for (antecedent, consequent, confidence, lift, support) in rules: print("{}, {} conf={:.4f}, {:.4f}, {:.4f}". format(antecedent, consequent, confidence, lift, support)) assert(len(rules) == len(expectedRules)) for i in range(len(rules)): assert(expectedRules[i] in rules)
def VectorCompare(): queries = loadCranQry("query.text") queries_id_list=[str(int(x)) for x in queries.keys()] inputdocument = cran.CranFile("cran.all") # load the index file saved at from part 1 index = InvertedIndex().load("index_file") qp = QueryProcessor(queries, index, inputdocument, 10) queries_id_list=[str(int(x)) for x in queries.keys()] #print(queries_id_list) #read querls.txt qrels_dict=process_querls_file("qrels.text",queries_id_list) #IdeaVectorsforQuery_ids={} sumbooleanNADC=[] sumvectorNADC=[] vectorNADC1 = [] booleanNADC2 = [] # random_query_id_list=[153, 18] # print(random_query_id_list) query_id = [4 , 29, 53, 58, 100] vectorNADC1=[] vectorNADC2=[] for q_id in query_id: qp.querynumber = q_id # boolean_res=qp.booleanQuery() vector_top3 = qp.vectorQuery(5) vector2_top3=qp.vectorQuery(5,True) # vector_top3=[('12',0.34),('746',0.33),('875',0.24)] # print(boolean_res) print("Output for Vector Model Result::", vector_top3) if (vector_top3.__len__() < 1): vectorNADC1.append(0) else: vector_label = [x[0] for x in vector_top3] score = [x[1] for x in vector_top3] print("DocumentIDs of Vector Model Result:: ", vector_label) print("Scores of Vector Model Result::", score) true_label = vector_label.copy() query_id = str(q_id) for x in vector_label: # str_x="{0:0=3d}".format(x) ind = vector_label.index(x) if (x in qrels_dict.get(query_id)): true_label[ind] = 1 else: true_label[ind] = 0 if true_label.__len__() < 5: len_val = 10 - (true_label.__len__()) true_label.extend([0] * len_val) print("Actual Vector:: ", true_label) print("Predicted Vector:: ", score) if sum(true_label) == 0: vectorNADC1.append(0) else: ndcg = metrics.ndcg_score(true_label, score, 5) print("Calculated ndcg for Vector::", ndcg) vectorNADC1.append(ndcg) if (vector2_top3.__len__() < 1): vectorNADC2.append(0) else: vector_label = [x[0] for x in vector2_top3] score = [x[1] for x in vector2_top3] print("DocumentIDs of Vector Model Result:: ", vector_label) print("Scores of Vector Model Result::", score) true_label = vector_label.copy() query_id = str(q_id) for x in vector_label: # str_x="{0:0=3d}".format(x) ind = vector_label.index(x) if (x in qrels_dict.get(query_id)): true_label[ind] = 1 else: true_label[ind] = 0 if true_label.__len__() < 5: len_val = 10 - (true_label.__len__()) true_label.extend([0] * len_val) print("Actual Vector:: ", true_label) print("Predicted Vector:: ", score) if sum(true_label) == 0: vectorNADC2.append(0) else: ndcg = metrics.ndcg_score(true_label, score, 5) print("Calculated ndcg for Vector::", ndcg) vectorNADC2.append(ndcg) print("Calculated NADC sum for all queries", vectorNADC1) avergae_vectorNADC = float(sum(vectorNADC1) / 5) print("Calculated NADC sum for all queries", vectorNADC2) avergae_vectorNADC2 = float(sum(vectorNADC2) / 5) print("Avergae NADC Vector::", avergae_vectorNADC) print("Avergae NADC boolean::", avergae_vectorNADC2) print(vectorNADC1) print(vectorNADC2) p_value = scipy.stats.wilcoxon(vectorNADC1, vectorNADC2, zero_method='wilcox', correction=False) p = "%.20f" % float(str(p_value[1])) print('P value for all the queries processed is:', p)