def findSimilar(self, link, limit):
    # Read the text of the new link using the Crawler.
    # The constructor is called with empty placeholder arguments.
    crawler = Crawler.Crawler('', 0, 0, 0)
    self.limit = limit
    file = open("Data/page%d.txt" % self.limit, 'w')
    try:
        self.title, text = crawler.getText(link)
        # Combine the list of strings into a single string.
        text = ''.join(text)
        for t in text:
            file.write(t)
        file.close()
    except:
        print("Link is not accessible")
        file.close()
        sys.exit(0)
    indexer = Indexer.Indexer()
    indexer.start()
    cosineSimilarity = indexer.getCosineSimilarity()
    linksId = [i for i in range(self.limit)]
    linksIdSorted = [x for _, x in sorted(zip(cosineSimilarity, linksId), reverse=True)]
    return cosineSimilarity, linksIdSorted
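
# A minimal usage sketch for findSimilar above. The enclosing class name (SimilarPages),
# its no-argument constructor, and the URL are assumptions made only for illustration.
# The method returns the cosine-similarity scores and the stored page ids sorted by
# descending similarity.
finder = SimilarPages()
scores, ranked_ids = finder.findSimilar("https://example.com/article", 10)
print("Most similar stored page id:", ranked_ids[0])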
def run_index():
    # Run an entire index build.
    global docs_path
    global postings_path
    global is_stemming
    global indexer
    global dict_cache_path
    try:
        # Check validation conditions.
        if (not check_corpus_directory(docs_path.get())) or (not check_postings_directory(postings_path.get())):
            return
        result = tkMessageBox.askquestion("Run Index", "Are you sure?\nDon't worry if the GUI"
                                          " is stuck or not responding - it is working", icon='warning')
        if result != 'yes':
            return
        print('START TIME - ' + time.strftime("%H:%M:%S"))
        start_time = datetime.now()
        # Reset the current memory of the project.
        if (globs.main_dictionary is not None) and (bool(globs.main_dictionary)):
            globs.main_dictionary.clear()
        if (globs.cache is not None) and (bool(globs.cache)):
            globs.cache.clear()
        if (globs.documents_dict is not None) and (bool(globs.documents_dict)):
            globs.documents_dict.clear()
        # Start indexing.
        globs.stop_words = load_stop_words(docs_path.get())
        indexer = Indexer.Indexer(postings_path.get(), is_stemming.get())
        read_file = ReadFile.ReadFile(get_corpus_dir(docs_path.get()), indexer, globs.constants,
                                      globs.stop_words, is_stemming.get())
        read_file.index_folder()
        globs.num_of_documents = len(read_file.documents_dict)
        globs.documents_dict = read_file.documents_dict
        del read_file
        indexer.unite_temp_postings()
        globs.main_dictionary = indexer.main_dict
        indexer.build_document_weight(globs.documents_dict)
        # To print stats, uncomment this:
        # with open('{}{}'.format('stats', 'stem' if is_stemming.get() else ''), 'w') as my_stats_file:
        #     my_stats_file.write('term,tf,df\n')
        #     for key, val in main_dictionary.iteritems():
        #         my_stats_file.write('{},{},{}\n'.format(key, val.tf, val.df))
        globs.cache = indexer.cache_dict
        globs.average_doc_size = globs.average_doc_size / globs.num_of_documents
        dict_cache_path = postings_path
        print('END TIME - ' + time.strftime("%H:%M:%S"))
        end_time = datetime.now()
        print_stats_at_end_of_indexing(end_time - start_time)
    except Exception as err:
        tkMessageBox.showinfo('ERROR', err)
        traceback.print_exc(file=stdout)
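
# A hedged sketch of wiring run_index to a button in the Tkinter GUI. The window setup below
# is an assumption (the original GUI code is not shown here); only the command=run_index hookup
# is the point. Python 2 style imports are used to match the tkMessageBox usage above.
from Tkinter import Tk, Button

gui_root = Tk()
Button(gui_root, text="Run Index", command=run_index).pack()
gui_root.mainloop()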
import Tokenizer
import Indexer

# (modifiedToken, documentId) pairs
pairs = []
docFiles = ["Almeida Garrett - Viagens na Minha Terra.txt",
            "Eça de Queirós - A Cidade e as Serras.txt"]

for docId in range(len(docFiles)):
    tokenizer = Tokenizer.Tokenizer(docFiles[docId])
    tokenizer.createTokens()
    pairs += [(token, docId) for token in tokenizer.getTokens()]

indexer = Indexer.Indexer(pairs, len(docFiles))
indexer.indexTerms(None)

print("Most frequent terms:")
for docId in range(len(docFiles)):
    print(docFiles[docId] + "\n\t" + str(indexer.getFreqTerms()[docId]))

print("\nMost frequent terms with 4 or more letters:")
indexer.indexTerms(4)
for docId in range(len(docFiles)):
    print(docFiles[docId] + "\n\t" + str(indexer.getFreqTerms()[docId]))
from Indexer import *
from QueryProcessor import QueryProcessor
import numpy as np
import math
import time
import json
from nltk.stem import PorterStemmer
from string import ascii_lowercase

if __name__ == "__main__":
    index = Indexer()
    # index.start_index()
    query = input("Enter query: ")
    start_time = time.time()  # record the time the search started
    qp = QueryProcessor()
    urlid = qp.search(query.lower())
    temp = []
    if not urlid:
        print('No URL found for the given query')
    else:
        with open('doc_id.json', 'r') as url_id:
            url_dict = json.load(url_id, strict=False)
        index = 1
        for i in urlid:
            try:
                if index > 20:
                    break
                result_str = "#%3d: %s" % (index, url_dict[str(i)])
                print(result_str)
                index += 1
            except KeyError:
                # Skip ids that are missing from doc_id.json.
                continue
def main(argv):
    collectionFile = ''
    tokenizerType = ''
    queriesFile = ''
    rankType = ''
    start = []
    end = []

    try:
        opts, args = getopt.getopt(argv, "hf:t:q:r:",
                                   ["collectionFile=", "tokenizerType=", "queriesFilePath=", "rankType="])
    except getopt.GetoptError:
        print('main.py -f <collectionFile> -t <tokenizerType: 0 - Simple, 1 - Better> -q <queriesFilePath> '
              '-r <rankType: 0 - TF-IDF, 1 - BM25>')
        sys.exit()

    if len(opts) != 4:
        print('main.py -f <collectionFile> -t <tokenizerType: 0 - Simple, 1 - Better> -q <queriesFilePath> '
              '-r <rankType: 0 - TF-IDF, 1 - BM25>')
        sys.exit()

    for opt, arg in opts:
        if opt == '-h':
            print('main.py -f <collectionFile> -t <tokenizerType: 0 - Simple, 1 - Better> -q <queriesFilePath> '
                  '-r <rankType: 0 - TF-IDF, 1 - BM25>')
            sys.exit()
        elif opt in ("-f", "--collectionFile"):
            if not path.exists(arg):
                print('Incorrect path to collection file.')
                sys.exit()
            collectionFile = arg
        elif opt in ("-t", "--tokenizerType"):
            if arg != '0' and arg != '1':
                print('Incorrect tokenizer type. Simple tokenizer: 0, Better tokenizer: 1.')
                sys.exit()
            tokenizerType = arg
        elif opt in ("-q", "--queriesFilePath"):
            if not path.exists(arg):
                print('Incorrect path to queries file.')
                sys.exit()
            queriesFile = arg
        elif opt in ("-r", "--rankType"):
            if arg != '0' and arg != '1':
                print('Incorrect rank type. TF-IDF: 0, BM25: 1.')
                sys.exit()
            rankType = arg

    # Indexer
    (Indexer(collectionFile, tokenizerType)).writeIndexToFile('index')

    f = open(queriesFile, 'r')
    queries = f.read().splitlines()
    f.close()

    scores = []
    if tokenizerType == '0':  # simple
        tokenizer = Tokenizer.SimpleTokenizer('')
    else:  # better
        tokenizer = Tokenizer.BetterTokenizer('')

    for query in queries:
        # Query operations
        tokenizer.changeText(query)
        queryTerms = tokenizer.getTerms()

        # Searcher
        documentsInfo, avgDocLen = Searcher.searchDocuments(queryTerms, 'index')

        # Ranker
        ranker = Ranker(documentsInfo, avgDocLen)

        # Start time (latency purposes)
        start.append(timer())

        if rankType == '0':  # tf-idf
            scores += [ranker.lnc_ltc()]
        else:  # BM25
            scores += [ranker.bm25(1.2, 0.75)]

        # End time (latency purposes)
        end.append(timer())

    # Evaluation
    Evaluation.getResults('./data/queries.relevance.txt', queries, scores, start, end)
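
# A minimal entry-point sketch for the driver above (an assumption; the original module may
# already wire this up elsewhere, and sys is assumed to be imported since main() calls sys.exit()).
# The example command mirrors the options parsed in main(); the file paths are hypothetical:
#
#   python main.py -f ./data/collection.csv -t 1 -q ./data/queries.txt -r 1
#
if __name__ == "__main__":
    main(sys.argv[1:])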
def main(argv):
    # --------------------------------------- HANDLING PROGRAM INPUT -----------------------------------------------
    collectionFile = ''
    tokenizerType = ''
    queriesFile = ''
    rankType = ''
    storePos = ''
    proximity = ''

    try:
        opts, args = getopt.getopt(argv, "hf:t:q:r:p:b:",
                                   ["collectionFile=", "tokenizerType=", "queriesFilePath=", "rankType=",
                                    "storePositions=", "proximityBoost="])
    except getopt.GetoptError:
        print('main.py -f <collectionFile> -t <tokenizerType: 0 - Simple, 1 - Better> -q <queriesFilePath> '
              '-r <rankType: 0 - TF-IDF, 1 - BM25> -p <storePositions: 0 - No, 1 - Yes> '
              '-b <proximityBoost: 0 - No, 1 - Yes>')
        sys.exit()

    if len(opts) != 6:
        print('main.py -f <collectionFile> -t <tokenizerType: 0 - Simple, 1 - Better> -q <queriesFilePath> '
              '-r <rankType: 0 - TF-IDF, 1 - BM25> -p <storePositions: 0 - No, 1 - Yes> '
              '-b <proximityBoost: 0 - No, 1 - Yes>')
        sys.exit()

    for opt, arg in opts:
        if opt == '-h':
            print('main.py -f <collectionFile> -t <tokenizerType: 0 - Simple, 1 - Better> -q <queriesFilePath> '
                  '-r <rankType: 0 - TF-IDF, 1 - BM25> -p <storePositions: 0 - No, 1 - Yes> '
                  '-b <proximityBoost: 0 - No, 1 - Yes>')
            sys.exit()
        elif opt in ("-f", "--collectionFile"):
            if not path.exists(arg):
                print('Incorrect path to collection file.')
                sys.exit()
            collectionFile = arg
        elif opt in ("-t", "--tokenizerType"):
            if arg != '0' and arg != '1':
                print('Incorrect tokenizer type. Simple tokenizer: 0, Better tokenizer: 1.')
                sys.exit()
            tokenizerType = arg
        elif opt in ("-q", "--queriesFilePath"):
            if not path.exists(arg):
                print('Incorrect path to queries file.')
                sys.exit()
            queriesFile = arg
        elif opt in ("-r", "--rankType"):
            if arg != '0' and arg != '1':
                print('Incorrect rank type. TF-IDF: 0, BM25: 1.')
                sys.exit()
            rankType = arg
        elif opt in ("-p", "--storePositions"):
            if arg != '0' and arg != '1':
                print('\nIncorrect store positions choice. No: 0, Yes: 1.')
                sys.exit()
            storePos = arg
        elif opt in ("-b", "--proximityBoost"):
            if arg != '0' and arg != '1':
                print('\nIncorrect proximity boost choice. No: 0, Yes: 1.')
                sys.exit()
            proximity = arg

    # ----------------------------------------------- INDEXER ------------------------------------------------------
    indexer = Indexer(collectionFile, tokenizerType, True if storePos == '1' else False)

    start = timeit.default_timer()
    indexer.index()
    stop = timeit.default_timer()
    print('Indexing total time - {} tokenizer: {} min and {} seconds'.format(
        "simple" if tokenizerType == "0" else "better", (stop - start) // 60, (stop - start) % 60))

    f = open(queriesFile, 'r')
    queries = f.read().splitlines()
    f.close()

    scores = []
    if tokenizerType == '0':  # simple
        tokenizer = Tokenizer.SimpleTokenizer('')
    else:  # better
        tokenizer = Tokenizer.BetterTokenizer('')

    start_queries = []
    end_queries = []
    time_searcher = 0
    time_ranker = 0

    for query in queries:
        # --------------------------------------- QUERY OPERATIONS -------------------------------------------------
        tokenizer.changeText(query)
        # queryTerms, queryTermsPositions = tokenizer.getTerms(withPositions=True if storePos == '1' else False)
        queryTerms = tokenizer.getTerms(withPositions=False)

        # ------------------------------------------- SEARCHER -----------------------------------------------------
        start = timeit.default_timer()
        documentsInfo, avgDocLen = Searcher.searchDocuments(queryTerms, 'index',
                                                            True if storePos == '1' else False)
        stop = timeit.default_timer()
        time_searcher = time_searcher + stop - start

        # -------------------------------------------- RANKER ------------------------------------------------------
        start = timeit.default_timer()
        ranker = Ranker(documentsInfo, avgDocLen)

        # Start time (latency purposes)
        start_queries.append(timer())

        if rankType == '0':  # tf-idf
            if proximity == '1':  # proximity boost
                scores += [ranker.proximity_boost(ranker.lnc_ltc(), queryTerms)]
            else:
                scores += [ranker.lnc_ltc()]
        else:  # BM25
            if proximity == '1':  # proximity boost
                scores += [ranker.proximity_boost(ranker.bm25(1.2, 0.75), queryTerms)]
            else:
                scores += [ranker.bm25(1.2, 0.75)]

        stop = timeit.default_timer()
        time_ranker = time_ranker + stop - start

        # End time (latency purposes)
        end_queries.append(timer())

    print('Searching time for all queries: {} min and {} seconds'.format(time_searcher // 60, time_searcher % 60))
    print('Ranking time for all queries: {} min and {} seconds'.format(time_ranker // 60, time_ranker % 60))

    # Evaluation
    Evaluation.getResults('./data/queries.relevance.txt', queries, scores, start_queries, end_queries)
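
# A minimal entry-point sketch for the extended driver above (an assumption; the original project
# may already provide it, and sys is assumed to be imported since main() calls sys.exit()).
# The example command enables positional indexing and the proximity boost; the file paths are
# hypothetical:
#
#   python main.py -f ./data/collection.csv -t 1 -q ./data/queries.txt -r 1 -p 1 -b 1
#
if __name__ == "__main__":
    main(sys.argv[1:])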
def get_context_data(self, **kwargs):
    context = super(SearchView, self).get_context_data(**kwargs)
    # {'view': <googlesearch.views.SearchView object at 0x1036cd0d0>}
    results = []
    try:
        results = []
        index = Indexer()
        query = self.request.GET.get('q', '')
        start_time = time.time()  # record the time the search started
        qp = QueryProcessor()
        urlid = qp.search(query.lower())
        temp = []
        if not urlid:
            print('No URL found for the given query')
        else:
            with open('doc_id.json', 'r') as url_id:
                url_dict = json.load(url_id, strict=False)
            index = 1
            for i in urlid:
                try:
                    if index > 20:
                        break
                    result_str = "#%3d: %s" % (index, url_dict[str(i)])
                    results.append((result_str, url_dict[str(i)]))
                    index += 1
                except:
                    pass
        total_time = time.time() - start_time  # total time taken to complete the search
        # time_str = "The search took %f seconds" % (total_time)
        # print(time_str)
        # results = SearchResults(results)
        pages = self.calculate_pages()
    except:
        print("Error occurred")
        page = 1
        pages = [0, 1, 2]
        # Defaults returned on error (total_time is not defined on this path).
        context.update({
            'items': [],
            'total_results': 0,
            'current_page': 0,
            'prev_page': 0,
            'next_page': 0,
            'search_terms': self.request.GET.get('q', ''),
            'error': results,
            'total_time': 0,
        })
        return context

    context.update({
        'items': results,
        'total_results': 20,
        'current_page': pages[1],
        'prev_page': pages[0],
        'next_page': pages[2],
        'search_terms': self.request.GET.get('q', ''),
        'total_time': total_time,
    })
    return context
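
# A hedged sketch of exposing SearchView in a Django URLconf. The URL pattern, name, and the
# assumption of Django 2.0+ (for django.urls.path) are illustrative only; SearchView is the
# class the method above belongs to.
from django.urls import path

urlpatterns = [
    path('search/', SearchView.as_view(), name='search'),
]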
def __init__(self, directory, load_file=False, stop_word_path=None):
    self.indexer = Indexer.Indexer(directory, load_file, stop_word_path)
    self.inverted_index = self.indexer.inverted_idx
    self.stop_list = self.indexer.stop_words
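
# A minimal usage sketch for the wrapper above. The enclosing class name (SearchEngine) and
# the corpus/stop-word paths are assumptions; the attribute names come from the constructor.
if __name__ == "__main__":
    engine = SearchEngine("./corpus", load_file=False, stop_word_path="./stopwords.txt")
    print("Inverted index holds", len(engine.inverted_index), "terms")
    print("Loaded", len(engine.stop_list), "stop words")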
def corpusReader():
    # Choose the file to read and open it.
    # filename_input = "OneDrive_1_9-26-2019/2004_TREC_ASCII_MEDLINE_1"
    filename_input = Interact.openFile()
    fi = open(filename_input, 'r', encoding="latin-1")
    # filename_input = "OneDrive_1_9-26-2019/2004_TREC_ASCII_MEDLINE_2"
    # fi2 = open(filename_input, 'r', encoding="latin-1")

    # Open the file to write the results (not strictly needed).
    filename_output = "output.txt"
    try:
        fo = open(filename_output, 'w')
    except:
        print("File not found!")

    # Read the file and send it to the identifier reader in separate documents.
    doc = ""
    var = False
    idx = Indexer()
    token = Tokenizer()
    start = time.time()

    for line in fi:
        if line.strip() == "":
            # Here ends a document.
            # Call IdentifierReader on the read lines and find identifiers (PMID and TI).
            docdict = IdentifierReader.identReader(doc)
            # Basic tokenizer.
            tokenizer_dict = token.tokenizer(docdict)
            # Improved tokenizer with Porter stemmer.
            tok_dict = ImprovedTokenizer.improvedTokenizer(tokenizer_dict)
            # Indexing.
            indexed_dict = idx.indexer(tok_dict)
            var = False
            doc = ""
            continue
        if line[4] == '-':
            key = line.split("-", 1)
            # Beginning of a document.
            if key[0] == "PMID":
                var = True
        if var:
            doc += line

    # Read the 2nd file.
    # for line in fi2:
    #     if line.strip() == "":
    #         # Here ends a document.
    #         # Call IdentifierReader on the read lines and find identifiers (PMID and TI).
    #         docdict = IdentifierReader.identReader(doc)
    #         # Basic tokenizer.
    #         tokenizer_dict = token.tokenizer(docdict)
    #         # Improved tokenizer with Porter stemmer.
    #         tok_dict = ImprovedTokenizer.improvedTokenizer(tokenizer_dict)
    #         # Indexing.
    #         indexed_dict = idx.indexer(tok_dict)
    #         var = False
    #         doc = ""
    #         continue
    #     if line[4] == '-':
    #         key = line.split("-", 1)
    #         if key[0] == "PMID":
    #             var = True
    #     # Here starts a document.
    #     if var:
    #         doc += line

    # Write the results to the output.txt file.
    print("Writing in file\n")
    indexed_dict = idx.sort_indexer(indexed_dict)
    for i in indexed_dict:
        tmp = ""
        fo.write(i)
        for j in indexed_dict[i]:
            tmp = tmp + "," + j + ":" + str(indexed_dict[i][j])
        fo.write(tmp + "\n")
    end = time.time()

    # To answer question 4:
    # the first ten terms (in alphabetical order) that appear in only one document.
    doc_freq_1 = []
    high_doc_freq = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0}
    current_min = 0
    count1 = 0
    count2 = 0
    for i in indexed_dict:
        if (len(indexed_dict[i]) == 1) and count1 <= 9:
            doc_freq_1.append(i)
            count1 += 1
        if len(indexed_dict[i]) > current_min:
            term_to_replace = [k for k, h in high_doc_freq.items() if h == current_min]
            high_doc_freq.pop(term_to_replace[0])
            high_doc_freq[i] = len(indexed_dict[i])
            current_min = min(list(high_doc_freq.values()))

    print("RESULTS")
    print("Time to run: ", end - start)
    print("Vocabulary size: ", len(indexed_dict))
    print("Doc frequency 1: ", doc_freq_1)
    print("Highest doc frequency: ", high_doc_freq)
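
# A small sketch of parsing one line of the output.txt format produced above. The sample line
# is made up for illustration; the format follows the write loop: the term, then comma-separated
# PMID:frequency pairs.
sample = "algorithm,15068001:2,15068942:1"
term, *postings = sample.split(",")
posting_counts = {pmid: int(freq) for pmid, freq in (p.split(":") for p in postings)}
print(term, posting_counts)  # algorithm {'15068001': 2, '15068942': 1}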
        count += 1


def answer_phrase_queries(phrase_string):
    query = Query.Query()
    results = query.phrase_query(phrase_string)
    count = 1
    for score, file_name in results:
        print("Choice number: ", count, " --> File: ", file_name, "Score = ", score)
        count += 1


def answer_text_queries(query_string):
    query = Query.Query()
    results = query.text_query(query_string)
    count = 1
    for score, file_name in results:
        print("Choice number: ", count, " --> File: ", file_name, "Score = ", score)
        count += 1


if __name__ == "__main__":
    print("Aloha!")
    path_to_text_corpus = "/home/nikhil/Desktop/Text-Search-Engine/text_corpus"
    indexer = Indexer.Indexer()
    print("Indexer object created!")
    # indexer.build_index(path_to_text_corpus)
    print("Index building success!!!")
    # listen_for_queries()
    root = Tk()
    root.geometry("400x300")
    app = gui.Window(root)
    root.mainloop()
parser = argparse.ArgumentParser(description='This is the project4 driver.')
parser.add_argument('original', help='Original text file name.')
parser.add_argument('preprocessed', help='Preprocessed file name for indexing.')
parser.add_argument('--map', dest='mapType', help='Map type used for the multimap.')
parser.add_argument('--index', dest='indexFile', help='File for the indexed output.')
args = parser.parse_args()

myIndexer = Indexer(args)
myIndexer.index()

run = True
while run:
    toSearch = input("Enter a word to search for: ")
    toSearch = toSearch.strip('\n')
    myIndexer.search(toSearch)
    quit = input("Quit? (y/n): ")
    if quit == 'y':
        run = False

if args.indexFile:
    myIndexer.writeIndex()
else:
    pass
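
# A hedged sketch of driving the same Indexer programmatically, bypassing the interactive loop
# above. argparse.Namespace mimics the parsed arguments; every file name and the search term are
# hypothetical, and only attributes used by the driver (original, preprocessed, mapType, indexFile)
# are set.
import argparse

fake_args = argparse.Namespace(original='original.txt', preprocessed='preprocessed.txt',
                               mapType='hash', indexFile='index.out')
batch_indexer = Indexer(fake_args)
batch_indexer.index()
batch_indexer.search('retrieval')
batch_indexer.writeIndex()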