def endElement(self, name):
    if name == "page":
        # This is where the processing for the article starts.
        self.current_article.processArticle()
        Indexer.doIndex(self.current_article, self.outfile)
    to_store = ""
    if len(self.current_lines):
        try:
            to_store = (''.join(self.current_lines)).strip()
        except Exception as e:
            print "exception occurred at id={0} and title={1}\n {2}".format(
                self.current_article.id, self.current_article.title, e)
    if name == "title":
        self.current_article.title = to_store
    if name == "id":
        if self.parent_element == "page":
            self.current_article.id = to_store
    if name == "text":
        self.current_article.text = to_store
    self.elements.pop()
    if self.elements:
        self.current_element = self.parent_element
        if len(self.elements) == 1:
            self.parent_element = ""
        else:
            self.parent_element = self.elements[-1]
    else:
        self.current_element = ""
def main(): print("--- RETRIEVER ---") print("Select") print("1 for BM25") print("2 for tf-idf") print("3 for Query Likelihood Model") user_choice = input("Enter your choice: ") if user_choice not in ["1", "2", "3"]: print("\nInvalid input. Aborting . .") sys.exit() # sets "RETRIEVAL_MODEL" to the user chosen model. set_retrieval_model(user_choice) # Create a directory to save the results. # and overwrite existing run file, if any os.makedirs(RUN_OUTPUTS_DIR,exist_ok=True) global RETRIEVAL_MODEL output_file = os.path.join(RUN_OUTPUTS_DIR,RETRIEVAL_MODEL+"Run.txt") if os.path.exists(output_file): os.remove(output_file) # Generate the unigram index. # By default, not performing stopping. # So send False Indexer.unigram_index(False) # Fetch the index generated. global INVERTED_INDEX INVERTED_INDEX = Indexer.INVERTED_INDEX global DOC_TOKEN_COUNT DOC_TOKEN_COUNT = Indexer.DOC_TOKEN_COUNT # Read all queries. queries = extract_queries_from_file() global QUERY_ID for query in queries: QUERY_ID +=1 # Dictionary of query term frequency query_term_freq = query_term_freq_map(query) # Fetch the inverted indexes corresponding to the terms # in the query. fetched_index = query_matching_index(query_term_freq) # Compute ranking scores of all docs for this query. doc_scores = compute_doc_scores(fetched_index, query_term_freq) # Write results to a textfile. output_to_file(doc_scores,QUERY_ID) print("Completed Retrieval for query : " + query) print("End of Retrieval.")
def Get_UL_Coord(contour, pad=10):
    """Get the upper left coordinate of the contour."""
    dists = []
    for c in contour:
        dists.append(distbetween(c[0][0], c[0][1], 0, 0))
    # Look up the closest point once instead of recomputing it per coordinate.
    idx = Indexer.get_index_of_min(dists)[0]
    return (contour[idx][0][0] - pad, contour[idx][0][1] - pad)
def data_set_Path(corpus_path, index_path):
    global __stopwords_path
    global __corpus_path
    global __index_path
    __stopwords_path = corpus_path + "\\stop_words.txt"
    Parser.set_stop_words_file(__stopwords_path)
    __corpus_path = corpus_path
    __index_path = index_path
    Indexer.set_path_to_postiong_files(__index_path)
def Get_LR_Coord(contour, imgXmax, imgYmax, pad=10):
    """Get the lower right coordinate of the contour."""
    dists = []
    for c in contour:
        dists.append(distbetween(c[0][0], c[0][1], imgXmax, imgYmax))
    idx = Indexer.get_index_of_min(dists)[0]
    return (contour[idx][0][0] + pad, contour[idx][0][1] + pad)
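# The distbetween helper used by the two coordinate functions above is not
# shown in this section. A minimal sketch, assuming plain Euclidean distance:
import math

def distbetween(x1, y1, x2, y2):
    # Straight-line distance between (x1, y1) and (x2, y2).
    return math.hypot(x2 - x1, y2 - y1)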
def main(): print("--- RETRIEVAL WITH SNIPPETS ---") # Generate the unigram index. # By default, not performing stopping. # So send False Indexer.unigram_index(False) # Fetch the index generated. global INVERTED_INDEX INVERTED_INDEX = Indexer.INVERTED_INDEX global DOC_TOKEN_COUNT DOC_TOKEN_COUNT = Indexer.DOC_TOKEN_COUNT # Read all queries. queries = extract_queries_from_file() global QUERY_ID for query in queries: QUERY_ID +=1 # Dictionary of query term frequency query_term_freq = query_term_freq_map(query) # Fetch the inverted indexes corresponding to the terms # in the query. fetched_index = query_matching_index(query_term_freq) # Compute BM25 score for this query. doc_scores = BM25_score(fetched_index, query_term_freq) # Extract top 100 docs, for which we need # to generate snippets. sorted_scores = [(k, doc_scores[k]) for k in sorted(doc_scores, key=doc_scores.get, reverse = True)] for i in range(min(len(sorted_scores),100)): k,v = sorted_scores[i] doc_list.append(k) ''' top100 = Counter(doc_scores).most_common(100) doc_list = [] for k,v in top100: doc_list.append(k) ''' # Snippet generator prints snippets # with query term highlighting, onto the console. SnippetGenerator.snippet_generator(doc_list,query, INVERTED_INDEX) print("Completed Retrieval for query : " + query) print("End of Retrieval with Snippets.")
def index_retri():
    Indexer.parse_corpus()
    Indexer.build()
    number_of_terms1 = Indexer.number_of_terms1
    index_dict = Indexer.index_dict
    parsed_queries = Retrieval.read_query_doc()
    query_id = 1
    for query in parsed_queries:
        Retrieval.tf_idf(index_dict, number_of_terms1, query, query_id)
        query_id += 1
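# Retrieval.tf_idf itself is not shown in this section. A minimal sketch of
# tf-idf scoring over an index shaped as term -> {doc_id: term_frequency};
# the function name and the unsmoothed idf are illustrative assumptions:
import math

def tf_idf_sketch(index_dict, total_docs, query_terms):
    scores = {}
    for term in query_terms:
        postings = index_dict.get(term, {})
        if not postings:
            continue
        idf = math.log(total_docs / float(len(postings)))
        for doc_id, tf in postings.items():
            scores[doc_id] = scores.get(doc_id, 0.0) + tf * idf
    return scores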
def reset(param=None):
    global __corpus_path, __index_path, __stem_suffix
    if param == "Queries":
        Parser.reset()
        __corpus_path = ""
    else:
        ReadFile.reset()
        Parser.reset()
        Indexer.reset()
        remove_index_files()
        __stem_suffix = ''
        __corpus_path = ""
        __index_path = ""
def task3a(model, raw_corpus_directory):
    project_directory = os.getcwd()
    p = Indexer.Parser()
    corpus_directory = p.build_corpus(raw_corpus_directory, stopped=True)
    output_directory = os.path.join(project_directory, "output")
    I = Indexer.InvertedIndexer(corpus_directory)
    I.ngram_indexer(1)  # builds a unigram index for each word
    # Create a Retriever, which contains the different retrieval models.
    r = Retriever.Retriever(corpus_directory, I, project_directory)
    os.chdir(raw_corpus_directory)
    os.chdir(os.pardir)
    f = open('cacm.query.txt', 'r')
    soup = BeautifulSoup(f.read(), 'html.parser')
    f.close()
    f_stop_words = open('common_words.txt', 'r')
    stop_words_list = f_stop_words.readlines()
    stop_words = [i.strip() for i in stop_words_list]
    f_stop_words.close()
    file_name = os.path.join(output_directory, 'task3a_' + model + '.txt')
    f = open(file_name, 'w')  # open file for writing results
    for _ in range(64):
        # Extract the query number and query text.
        query_no = (soup.find('docno')).text.encode('utf-8')
        (soup.find('docno')).decompose()
        query = (soup.find('doc')).text.encode('utf-8')
        (soup.find('doc')).decompose()
        r.process_query(query, stopped=True, stopwords=stop_words)  # parse the query
        # Retrieve relevant documents and their scores.
        docs_and_scores = r.get_scores_for_docs(model, int(query_no))
        docs = docs_and_scores[0]
        scores = docs_and_scores[1]
        # Save the top 100 results in run-file format.
        for i in range(100):
            f.write(str(query_no)
                    + " Q0 "
                    + str(docs[i]) + ' '
                    + str(i + 1) + " "
                    + str(scores[i]) + " "
                    + model + "\n")
    f.close()
def main(): print("--- PSEUDO RELEVANCE RETRIEVER ---") # Create a directory to save the results. # and overwrite existing run file, if any os.makedirs(RUN_OUTPUTS_DIR,exist_ok=True) output_file = os.path.join(RUN_OUTPUTS_DIR,"PseudoRelBM25Run.txt") if os.path.exists(output_file): os.remove(output_file) # Generate the unigram index. # By default, not performing stopping. # So send False Indexer.unigram_index(False) # Fetch the index generated. global INVERTED_INDEX INVERTED_INDEX = Indexer.INVERTED_INDEX global DOC_TOKEN_COUNT DOC_TOKEN_COUNT = Indexer.DOC_TOKEN_COUNT # Read all queries. queries = extract_queries_from_file() global QUERY_ID for query in queries: QUERY_ID +=1 # Dictionary of query term frequency query_term_freq = query_term_freq_map(query) # Fetch the inverted indexes corresponding to the terms # in the query. fetched_index = query_matching_index(query_term_freq) # Compute BM25 score for this query term. doc_scores = BM25_score(fetched_index, query_term_freq) # Update doc scores using Pseudo-relevance doc_scores = pseudo_relevance(query,doc_scores) # Write results to a textfile. output_to_file(doc_scores,QUERY_ID) print("Completed Retrieval for query : " + query) print("End of Pseudo Relevance retrieval.")
def findSimilar(self, link, limit):
    # Call the Crawler's getText function to read the new link;
    # use the constructor with empty arguments.
    crawler = Crawler.Crawler('', 0, 0, 0)
    self.limit = limit
    f = open("Data/page%d.txt" % self.limit, 'w')
    try:
        self.title, text = crawler.getText(link)
        # Combine the list of strings into a single string.
        text = ''.join(text)
        for t in text:
            f.write(t)
        f.close()
    except:
        print("Link is not accessible")
        f.close()
        sys.exit(0)
    indexer = Indexer.Indexer()
    indexer.start()
    cosineSimilarity = indexer.getCosineSimilarity()
    linksId = [i for i in range(self.limit)]
    linksIdSorted = [x for _, x in sorted(zip(cosineSimilarity, linksId), reverse=True)]
    return cosineSimilarity, linksIdSorted
def __init__(self):
    # Initialize the Controller object. First initialize the configuration
    # system; the configuration is saved in the appdata system.
    self.configuration = Configuration.Configuration()
    self.crawler = Indexer.FileCrawler()
    self.sep = os.sep
def task3b(model):
    output_directory = os.path.join(os.getcwd(), "output")
    stemmed_corpus = get_stemmed_corpus()
    f = open('cacm_stem.query.txt', 'r')
    stemmed_queries = f.readlines()
    f.close()
    I = Indexer.InvertedIndexer('')
    I.stemmed_indexer(stemmed_corpus)
    r = Retriever.Retriever('', I, os.getcwd())
    file_name = os.path.join(output_directory, 'task3b_' + model + '_stemmed.txt')
    f = open(file_name, 'w')
    query_no = [12, 13, 19, 23, 24, 25, 50]
    q_iter = 0
    for each_query in stemmed_queries:
        r.process_query(each_query)
        docs_and_scores = r.get_scores_for_docs(model, query_no[q_iter])
        # Save the top 100 results in run-file format.
        docs = docs_and_scores[0]
        scores = docs_and_scores[1]
        for i in range(100):
            f.write(str(query_no[q_iter])
                    + " Q0 "
                    + str(docs[i]) + ' '
                    + str(i + 1) + " "
                    + str(scores[i]) + " "
                    + model + "\n")
        q_iter += 1
    f.close()
def localIndexer():
    # List every file under LOCAL_DIR (tree -Nif prints full paths, one per line).
    d = local("tree -Nif %s" % LOCAL_DIR, True)
    filelist = d.split('\n')
    for path in filelist[1:-2]:
        path = _strEncode(path)
        index = path.replace(LOCAL_DIR, '')
        # Directories produce empty output; files produce their md5sum.
        r = local("test -d %s || md5sum %s" % (path, path), True)
        if not r:
            md5sum = 0
            lastModification = 0
            isDir = 1
        else:
            md5sum = r.split(' ')[0]
            lastModification = int(local("stat -c %Y " + path, True))
            isDir = 0
        Indexer.addLocal(index, md5sum, lastModification, isDir)
def remoteIndexer():
    # Same as localIndexer, but over the remote host via run().
    d = run("tree -Nif %s" % REMOTE_DIR)
    filelist = d.split('\r\n')
    for path in filelist[1:-2]:
        path = _strEncode(path)
        index = path.replace(REMOTE_DIR, '')
        r = run("test -d %s || md5sum %s" % (path, path))
        if not r:
            md5sum = 0
            lastModification = 0
            isDir = 1
        else:
            md5sum = r.split(' ')[0]
            lastModification = int(run("stat -c %Y " + path))
            isDir = 0
        Indexer.addRemote(index, md5sum, lastModification, isDir)
def main():
    global stop_words
    global parsed_queries
    Indexer.read_stop_words()
    # Generate the unigrams from the Lucene results for k=10 or 20.
    Retrieval.read_query_doc()
    stop_words = Indexer.stop_words
    parsed_queries = Retrieval.parsed_queries
    # Evaluate the Dice coefficient on the search query with n=8 or 6.
    query_id = 0
    for query in parsed_queries:
        query_id += 1
        result = generateUnigramDocFreq(stop_words, query_id)
        runSearchQuery(query.lower(), result, 8)
    Retrieval.write_queries_to_file("expanded-queries", expanded_queries)
def read_query_doc():
    global parsed_queries
    fr = open('test-collection/cacm.query.txt', 'r')
    queries = fr.read()
    fr.close()
    while queries.find('<DOC>') != -1:
        query = queries[queries.find('</DOCNO>') + 8:queries.find('</DOC>')]
        query = query.strip().replace('\n', " ")
        new_query = Indexer.transformText(query, True, True)
        parsed_queries.append(new_query)
        queries = queries[queries.find('</DOC>') + 6:]
    write_queries_to_file("parsed-queries", parsed_queries)
def GetClothColor(hsv, search_width=45):
    """
    Find the most common HSV values in the image.
    In a well lit image, this will be the cloth.
    """
    # Hue channel (0) ranges over [0, 180) in OpenCV.
    hist = cv2.calcHist([hsv], [0], None, [180], [0, 180])
    h_max = Indexer.get_index_of_max(hist)[0]

    hist = cv2.calcHist([hsv], [1], None, [256], [0, 256])
    s_max = Indexer.get_index_of_max(hist)[0]

    hist = cv2.calcHist([hsv], [2], None, [256], [0, 256])
    v_max = Indexer.get_index_of_max(hist)[0]

    # Define the range of the cloth color in HSV.
    lower_color = np.array([h_max - search_width, s_max - search_width, v_max - search_width])
    upper_color = np.array([h_max + search_width, s_max + search_width, v_max + search_width])

    return lower_color, upper_color
def run_index():
    # Run an entire index build.
    global docs_path
    global postings_path
    global is_stemming
    global indexer
    global dict_cache_path
    try:
        # Check validation conditions.
        if (not check_corpus_directory(docs_path.get())) or (not check_postings_directory(postings_path.get())):
            return
        result = tkMessageBox.askquestion("Run Index",
                                          "Are you sure?\n Don't worry if the GUI"
                                          " is stuck or not responding - it is working",
                                          icon='warning')
        if result != 'yes':
            return
        print('START TIME - ' + time.strftime("%H:%M:%S"))
        start_time = datetime.now()

        # Reset the current memory of the project.
        if (globs.main_dictionary is not None) and (bool(globs.main_dictionary)):
            globs.main_dictionary.clear()
        if (globs.cache is not None) and (bool(globs.cache)):
            globs.cache.clear()
        if (globs.documents_dict is not None) and (bool(globs.documents_dict)):
            globs.documents_dict.clear()

        # Start indexing.
        globs.stop_words = load_stop_words(docs_path.get())
        indexer = Indexer.Indexer(postings_path.get(), is_stemming.get())
        read_file = ReadFile.ReadFile(get_corpus_dir(docs_path.get()), indexer,
                                      globs.constants, globs.stop_words, is_stemming.get())
        read_file.index_folder()
        globs.num_of_documents = len(read_file.documents_dict)
        globs.documents_dict = read_file.documents_dict
        del read_file
        indexer.unite_temp_postings()
        globs.main_dictionary = indexer.main_dict
        indexer.build_document_weight(globs.documents_dict)

        # To print stats, uncomment this:
        # with open('{}{}'.format('stats', 'stem' if is_stemming.get() else ''), 'w') as my_stats_file:
        #     my_stats_file.write('term,tf,df\n')
        #     for key, val in main_dictionary.iteritems():
        #         my_stats_file.write('{},{},{}\n'.format(key, val.tf, val.df))

        globs.cache = indexer.cache_dict
        globs.average_doc_size = globs.average_doc_size / globs.num_of_documents
        dict_cache_path = postings_path
        print('END TIME - ' + time.strftime("%H:%M:%S"))
        end_time = datetime.now()
        print_stats_at_end_of_indexing(end_time - start_time)
    except Exception as err:
        tkMessageBox.showinfo('ERROR', err)
        traceback.print_exc(file=stdout)
def MaskTableBed(contours):
    """
    Mask out the table bed, assuming that it will be the biggest contour.
    """
    # The largest area should be the table bed.
    areas = []
    for c in contours:
        areas.append(cv2.contourArea(c))
    # Return the contour that delineates the table bed.
    largest_contour = Indexer.get_index_of_max(areas)
    return contours[largest_contour[0]]
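# A minimal sketch of how GetClothColor, MaskTableBed, and the coordinate
# helpers above might be chained; the input image path is a placeholder, and
# the two-value cv2.findContours return assumes OpenCV 4.x:
import cv2
import numpy as np

img = cv2.imread("table.jpg")                      # hypothetical input image
hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
lower_color, upper_color = GetClothColor(hsv)
mask = cv2.inRange(hsv, lower_color, upper_color)  # binary mask of the cloth
contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
table_bed = MaskTableBed(contours)
ul = Get_UL_Coord(table_bed)                       # padded bounding corners
lr = Get_LR_Coord(table_bed, img.shape[1], img.shape[0])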
def change_index(request):
    ans = URL.objects.all()
    cur_page = []
    for i in ans:
        cur_page.append((str(i.link), str(i.description)))
    if request.GET.get('request') == 'page':
        num_page = int(request.GET.get('num'))
        return HttpResponse(json.dumps({'urls': cur_page[5 * (num_page - 1): num_page * 5]}),
                            content_type="application/json")
    elif request.GET.get('request') == 'save':
        name = base64.b16encode(request.GET.get('url'))
        f = open('/home/katrin/databasetemp/' + name, "w")
        f.write(request.GET.get('txt'))
        f.close()
        try:
            d.start('/home/katrin/databasetemp/')
        except Exception, e:
            print e
        URL.objects.filter(link=request.GET.get('url')).delete()
        new_link = URL(link=request.GET.get('url'), description=request.GET.get('txt'))
        new_link.save()
        return HttpResponse(json.dumps('ok'), content_type="application/json")
def checkInIndexFileWordMap(term):
    # Binary-search the sorted bucket keys for the greatest key <= term,
    # then scan only that index part file for the term.
    pos = bisect.bisect(sortedIndexFileWordMapKeys, term)
    if pos > 0:
        pos = pos - 1
    key = sortedIndexFileWordMapKeys[pos]
    index = indexFileWordMap[key]
    with bz2.BZ2File("{0}.index{1}.bz2".format(infile, index), 'rb', compresslevel=9) as ipartF:
        for line in ipartF:
            if line.startswith("{0}=".format(term)):
                parts = line.strip().split("=")
                if len(parts) == 2:
                    ffo = Indexer.getFOFromLine(parts[1])
                    return ffo
    return {}
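# Self-contained demo of the bucket lookup idea above: each sorted key marks
# the first term stored in one index part file, and bisect picks the bucket
# whose range covers the query term. The sample data is illustrative only.
import bisect

sorted_keys = ["apple", "giraffe", "ocean"]        # first term in each part file
word_map = {"apple": 0, "giraffe": 1, "ocean": 2}  # key -> part-file number

def bucket_for(term):
    pos = bisect.bisect(sorted_keys, term)
    if pos > 0:
        pos -= 1
    return word_map[sorted_keys[pos]]

assert bucket_for("monkey") == 1   # "giraffe" <= "monkey" < "ocean"
assert bucket_for("zebra") == 2    # past the last key -> last bucket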
def parallel():
    remote_indexes = Indexer.allRemote()
    for r in remote_indexes:
        l = Indexer.oneLocal(r.index)
        if not l:
            if r.isDir == 1:
                local_dir_to_create.append(r.index)
            else:
                files_from_remote.append(r.index)
        else:
            if r.isDir == 0 and r.md5sum != l.md5sum and r.lastModification > l.lastModification:
                files_from_remote.append(r.index)
    local_indexes = Indexer.allLocal()
    for l in local_indexes:
        r = Indexer.oneRemote(l.index)
        if not r:
            if l.isDir == 1:
                remote_dir_to_create.append(l.index)
            else:
                files_from_local.append(l.index)
        else:
            if l.isDir == 0 and l.md5sum != r.md5sum and l.lastModification > r.lastModification:
                files_from_local.append(l.index)
def Main(cp, ip, to_stem):
    global __corpus_path
    global __index_path
    global doc
    global __stem_suffix
    create_city_db()
    Parser.stem = to_stem
    if to_stem is True:
        __stem_suffix = '_stem'
    else:
        __stem_suffix = ''
    start = time.time()
    data_set_Path(cp, ip)
    Indexer.create_posting_files(__stem_suffix)
    counter = 0
    for root, dirs, files in os.walk(__corpus_path):
        for file in files:
            if stop:
                # Clear the program's memory and remove the posting files and dictionary.
                reset()
                return
            if str(file) != 'stop_words.txt':
                ReadFile.takeDocsInfoFromOneFile(str(pathlib.PurePath(root, file)))
                dic_of_one_file = Parser.parse(dic_to_parse)
                sorted_dictionary = collections.OrderedDict(sorted(dic_of_one_file.items()))
                Indexer.merge_dictionaries(sorted_dictionary)
                dic_to_parse.clear()
                counter += 1
                if counter == 100:
                    Indexer.SaveAndMergePostings()
                    counter = 0
    Indexer.SaveAndMergePostings()
    saveCityDictionaryToDisk(ip)
    saveMainDictionaryToDisk(ip)
    cleanDocsYeshuyot()
    saveDocumentDictionaryToDisk(ip)
    saveLangListToDisk(ip)
    saveStopWordsDictToDisk(ip)
    # Ranker data.
    createAndSaveAvdlToDisk(ip)
    end2 = time.time()
    time_final = str((end2 - start) / 60)
    print("time of program: " + time_final)
    sendInfoToGUI(time_final)
def main():
    global stop_words
    stop_words = Indexer.read_stop_words()
    print "Parsing the corpus"
    parse_corpus()
    print "Creating the index"
    build()
    print "Parsing the queries"
    queries = parse_queries()
    Retrieval.write_queries_to_file("stopped-queries", queries)
    query_id = 1
    for query in queries:
        tf_idf(index_dict, number_of_terms1, query, query_id)
        query_id += 1
    query_id = 1
    for query in queries:
        BM25(index_dict, query, query_id)
        query_id += 1
def build_engine():
    global stemmer
    global dist_path
    global corpus_path
    global root
    global w
    global in_process
    global term_dictionary_in_corpus
    reset_lists_on_screen_language()
    reset_lists_on_screen_city()
    reset_lists_on_screen_query()
    total = datetime.datetime.now()
    term_dictionary = {}
    num_of_post = Indexer.start_read(corpus_path, dist_path, term_dictionary, stemmer)
    Indexer.extract_entity(term_dictionary, dist_path, num_of_post)
    num_of_doc = Indexer.merge_document_dictionary(dist_path, num_of_post, stemmer)
    t1 = threading.Thread(target=fill_language_list)
    t1.start()
    t2 = threading.Thread(target=fill_city_list)
    t2.start()
    Indexer.new_merge(dist_path, term_dictionary, num_of_post, stemmer)
    sort_key = sorted(term_dictionary)
    sort_term_dictionary = {}
    for key in sort_key:
        sort_term_dictionary[key] = term_dictionary[key]
    if stemmer:
        f = open(dist_path + "/term_dictionary_stemmer.pkl", "wb+")
    else:
        f = open(dist_path + "/term_dictionary.pkl", "wb+")
    pickle.dump(sort_term_dictionary, f, pickle.HIGHEST_PROTOCOL)
    term_dictionary_in_corpus = sort_term_dictionary
    f.close()
    num_of_terms = len(sort_term_dictionary)
    total_time = str(datetime.datetime.now() - total)
    showinfo('info',
             'The number of documents in corpus: ' + str(num_of_doc) + '\r\n'
             + 'The number of terms in the corpus: ' + str(num_of_terms) + '\r\n'
             + 'The total time: ' + total_time)
    bord_state('normal')
def main():
    global number_of_terms1
    global index_dict
    print "Select 1 for BM25"
    print "Select 2 for tf-idf"
    print "Select 3 for query likelihood"
    print "Select 4 for BM25 with stemmed corpus and queries"
    print "Select 5 for tf-idf with stemmed corpus and queries"
    choice = input("Choose retrieval model: ")
    if choice == 4 or choice == 5:
        Indexer.build_stem()
    else:
        print "Parsing the corpus ... "
        Indexer.parse_corpus()
        print "Building the index ... "
        Indexer.build()
        number_of_terms1 = Indexer.number_of_terms1
        index_dict = Indexer.index_dict
    print "Reading queries from query doc ..."
    read_query_doc()
    read_query_doc_stem()
    query_id = 1
    if choice == 4:
        for query in parsed_queries_stem:
            BM25(index_dict, query, query_id)
            query_id += 1
    if choice == 5:
        for query in parsed_queries_stem:
            tf_idf(index_dict, number_of_terms1, query, query_id)
            query_id += 1
    # Only the unstemmed models run over the unstemmed queries.
    if choice in (1, 2, 3):
        for query in parsed_queries:
            if choice == 1:
                BM25(index_dict, query, query_id)
            elif choice == 2:
                tf_idf(index_dict, number_of_terms1, query, query_id)
            else:
                QLM(index_dict, query, query_id)
            query_id += 1
def open_mosaic(image_set_directory, target_image, output_image):
    Indexer.run_index(image_set_directory)
    Stitcher.run_stitcher(target_image, 20, 'rgb', output_image)
def index_creation():
    Parser.main()
    Indexer.unigram_index()
    Indexer.output_index_to_file("unigram_index")
with open("Outputs/" + model + ".txt", "a+") as out_file: sorted_scores = [ (k, doc_scores[k]) for k in sorted(doc_scores, key=doc_scores.get, reverse=True) ] for i in range(1, min(len(sorted_scores), 101)): doc, score = sorted_scores[i] rank += 1 out_file.write( str(q_id) + " Q0 " + doc + " " + str(rank) + " " + str(score) + " " + model + "\n") if __name__ == '__main__': Indexer.unigram_index(True) INVERTED_INDEX = Indexer.INVERTED_INDEX DOC_TOKEN_COUNT = Indexer.DOC_TOKEN_COUNT models = ["StoppedIndexWithBM25", "StoppedIndexWithTFIDF"] for model in models: OUTPUT_FILE = "Outputs/" + model + ".txt" if os.path.exists(OUTPUT_FILE): os.remove(OUTPUT_FILE) query_file = open("cacm.query.txt", 'r') queries = [] query = "" for line in query_file.readlines(): if line == "\n": continue
sys.setdefaultencoding('utf-8')


def to_url(request):
    s = request.GET.get('text')
    s = s.split(" ")
    r = []
    for i in s:
        if i != " " and i != "":
            r.append(i)
    a = Crawler.Crawler(0, 1, r, '/home/katrin/databasetemp/')
    try:
        a.downloadPages()
    except Exception, e:
        pass
    try:
        d.start('/home/katrin/databasetemp/')
    except Exception, e:
        print e
    return HttpResponse('Done it')


def index_url(request):
    return render(request, 'search/index_url.html')


def find_res(request):
    s = Search_str.Search(request.GET.get('text'))
    ans = s.start()
    return HttpResponse(json.dumps({'urls': ans}), content_type="application/json")


def urls(request):
    return render(request, 'search/urls.html')
sorted_scores = [(k, doc_scores[k])
                 for k in sorted(doc_scores, key=doc_scores.get, reverse=True)]
# Write the top 100 documents; ranks start at 1.
for i in range(min(len(sorted_scores), 100)):
    doc, score = sorted_scores[i]
    rank += 1
    model = "PseudoRelevanceWithKL_Divergence"
    out_file.write(str(q_id) + " Q0 " + doc + " " + str(rank) + " "
                   + str(score) + " " + model + "\n")


if __name__ == '__main__':
    Indexer.unigram_index(False)
    INVERTED_INDEX = Indexer.INVERTED_INDEX
    DOC_TOKEN_COUNT = Indexer.DOC_TOKEN_COUNT
    if os.path.exists(OUTPUT_FILE):
        os.remove(OUTPUT_FILE)
    query_file = open("cacm.query.txt", 'r')
    queries = []
    query = ""
    for line in query_file.readlines():
        if line == "\n":
            continue
        if line.startswith("<DOCNO>") or line.startswith("<DOC>"):
            continue
        if line.startswith("</DOC>"):
            queries.append(query.strip().lower())
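# The sort-then-slice idiom above fully sorts the score map to take the top
# 100. An equivalent, often cheaper alternative is heapq.nlargest keyed on the
# score; shown as a sketch with illustrative data:
import heapq

doc_scores = {"doc1": 3.2, "doc2": 7.1, "doc3": 5.4}  # illustrative scores
top_docs = heapq.nlargest(2, doc_scores.items(), key=lambda kv: kv[1])
assert top_docs == [("doc2", 7.1), ("doc3", 5.4)]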
elif self.CurrentData == "id": global count if count%3 == 0: self.id = content count = 0 count = count + 1 elif self.CurrentData == "text": self.text += content # SAX Parser : As the xml file size is large def Parse(xmlFileName): parser = xml.sax.make_parser() parser.setFeature(xml.sax.handler.feature_namespaces, 0) Handler = WikiHandler() parser.setContentHandler( Handler ) parser.parse(xmlFileName) if ( __name__ == "__main__"): if len(sys.argv) < 3: sys.stderr.write("Usage: " + sys.argv[0] + " <test_dump.xml> <test_index>\n") sys.exit(2) xmlFileName = sys.argv[1] Parse(xmlFileName) #print final Indexer.indexing(final,sys.argv[2]) print "time taken for script to run is " ,datetime.now() - startTime #time taken by the script
import xml.sax as sax
from sys import argv
from shutil import copyfileobj
import bz2
import time

import Indexer
from WikiSAXHandler import WikiContentHandler
import TokenStemmer

script, infile, outfile = argv
toPrintProfile = ""
start = int(round(time.time() * 1000))
Indexer.doInit(outfile)

profileStart = int(round(time.time() * 1000))
if infile.endswith(".bz2"):
    with bz2.BZ2File(infile, 'rb', compresslevel=9) as compressed_infile:
        sax.parse(compressed_infile, WikiContentHandler(outfile))
else:
    sax.parse(infile, WikiContentHandler(outfile))
profileEnd = int(round(time.time() * 1000))
toPrintProfile += "sax.parse completed. Time Taken in milliseconds = {0}\n".format(profileEnd - profileStart)
toPrintProfile += "Index.profileTime in milliseconds = {0}\n".format(Indexer.profileTime)
toPrintProfile += "Index.profileTime1 in milliseconds = {0}\n".format(Indexer.profileTime1)
import xml.sax as sax
import re
from StringIO import StringIO
from sys import argv
from shutil import copyfileobj
import bz2

import Indexer
from WikiSAXHandler import WikiContentHandler
import TokenStemmer

script, infile, outfile = argv
Indexer.doInit(outfile)
if infile.endswith(".bz2"):
    with bz2.BZ2File(infile, 'rb', compresslevel=9) as compressed_infile:
        sax.parse(compressed_infile, WikiContentHandler(outfile))
else:
    sax.parse(infile, WikiContentHandler(outfile))
Indexer.linearWriter(outfile)
Indexer.linearMerger(outfile)
Indexer.writeIndexPartFiles(outfile)
Indexer.writeTitlePartFiles(outfile)
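# Both indexer scripts above read their arguments from sys.argv as
# (script, infile, outfile), so a run looks like the following; the script
# and file names here are placeholders:
#
#   python wiki_indexer.py enwiki-dump.xml.bz2 wiki_index
#
# A .bz2 dump is streamed through bz2.BZ2File; any other extension is parsed
# as plain XML.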