def run(self):
    """Collect the tags and requested image count from the entry widgets,
    build one label and one progress bar per tag, then start the Retriever
    worker thread to do the actual downloading."""
    self.tagList.clear()
    self.labelList.clear()
    self.status.config(text='Processing...')

    # Raw user input from the two entry widgets.
    raw_tags = self.tagsEntry.get()
    raw_count = self.nbEntry.get()

    # RETRIEVE INFORMATIONS
    self.tagList = raw_tags.split(',')
    self.nb_images_wished = int(raw_count)

    # DEBUG TO CHECK IF THE PROGRAM GOT THE RIGHT DATA
    print('You asked for :', self.nb_images_wished, 'images')
    print('With the tags : ')

    for tag in self.tagList:
        print('\t-> ' + tag)

        # One colored label per tag, stacked top to bottom.
        tag_label = Label(self.tagsLabelFrame, tag, self.progression_color)
        tag_label.pack(side='top')
        self.labelList.append(tag_label)

        # Matching progress bar for this tag's download progress.
        tag_bar = ttk.Progressbar(self.progressionFrame,
                                  length=300, value=0, max=100)
        tag_bar.pack(side='top')
        self.progressBarList.append(tag_bar)
        # NOTE: a Space(...) spacer between bars was tried here and is
        # currently disabled.

    # Hand everything to the background worker.
    Retriever(self.tagList, self.nb_images_wished,
              self.progressBarList, self.labelList).start()
def task3b(model):
    """Run the stemmed-corpus retrieval task (task 3b) for *model*.

    Builds an inverted index over the stemmed corpus, runs each stemmed
    query from cacm_stem.query.txt through the retriever, and writes the
    top-100 ranked documents per query to
    output/task3b_<model>_stemmed.txt, one line per result:
        <query_id> Q0 <doc> <rank> <score> <model>
    """
    output_directory = os.path.join(os.getcwd(), "output")
    # Other tasks create the output directory; make sure it exists so the
    # result file can always be written (original code assumed it existed).
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    stemmed_corpus = get_stemmed_corpus()

    # `with` guarantees the query file is closed even if reading fails.
    with open('cacm_stem.query.txt', 'r') as f:
        stemmed_queries = f.readlines()

    I = Indexer.InvertedIndexer('')
    I.stemmed_indexer(stemmed_corpus)
    r = Retriever.Retriever('', I, os.getcwd())

    file_name = os.path.join(output_directory,
                             'task3b_' + model + '_stemmed.txt')
    # Query IDs corresponding, in order, to the stemmed queries in the file.
    query_no = [12, 13, 19, 23, 24, 25, 50]

    with open(file_name, 'w') as f:
        # zip pairs each query with its ID, replacing the manual q_iter
        # counter of the original implementation.
        for q_no, each_query in zip(query_no, stemmed_queries):
            r.process_query(each_query)
            docs_and_scores = r.get_scores_for_docs(model, q_no)
            # Save the top 100 results into the output file.
            docs = docs_and_scores[0]
            scores = docs_and_scores[1]
            for rank in range(100):
                f.write(str(q_no)
                        + " Q0 "
                        + str(docs[rank]) + ' '
                        + str(rank + 1) + " "
                        + str(scores[rank]) + " "
                        + model + "\n")
def my_form_post():
    """Handle the search form POST: run the retriever on the submitted
    text and render the results page."""
    # The query the user submitted, UTF-8 encoded for the backend.
    submitted_text = request.form['text']
    query = submitted_text.encode('utf-8')

    # Results come back as a single string; multiple results are expected
    # to be concatenated with <br> so the HTML shows them on new lines.
    results = Retriever.runner(query)

    return render_template("results.html", name=results)
def task3a(model, raw_corpus_directory):
    """Run the stopped-corpus retrieval task (task 3a) for *model*.

    Parses the raw corpus with stopping enabled, builds a unigram inverted
    index, runs the 64 CACM queries (stopped with common_words.txt) and
    writes the top-100 ranked documents per query to
    output/task3a_<model>.txt in the format:
        <query_id> Q0 <doc> <rank> <score> <model>
    """
    project_directory = os.getcwd()
    p = Indexer.Parser()
    corpus_directory = p.build_corpus(raw_corpus_directory, stopped=True)
    output_directory = os.path.join(project_directory, "output")

    I = Indexer.InvertedIndexer(corpus_directory)
    I.ngram_indexer(1)  # builds a unigram index for each word
    # Retriever wraps the different retrieval models.
    r = Retriever.Retriever(corpus_directory, I, project_directory)

    # The query and stopword files live in the parent of the raw corpus dir.
    os.chdir(raw_corpus_directory)
    os.chdir(os.pardir)

    # `with` guarantees the handles are closed even if parsing fails.
    with open('cacm.query.txt', 'r') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    with open('common_words.txt', 'r') as f_stop_words:
        stop_words = [line.strip() for line in f_stop_words.readlines()]

    file_name = os.path.join(output_directory, 'task3a_' + model + '.txt')
    with open(file_name, 'w') as f:  # results file
        for _ in range(64):  # 64 CACM queries in the file
            # Extract the next query number and query text; decompose()
            # removes the consumed tags so find() advances each iteration.
            query_no = (soup.find('docno')).text.encode('utf-8')
            (soup.find('docno')).decompose()
            query = (soup.find('doc')).text.encode('utf-8')
            (soup.find('doc')).decompose()

            r.process_query(query, stopped=True,
                            stopwords=stop_words)  # parse the query
            docs_and_scores = r.get_scores_for_docs(
                model, int(query_no))  # retrieve relevant documents

            # Save results into the output file.
            docs = docs_and_scores[0]
            scores = docs_and_scores[1]
            # FIX: the inner loop previously reused the outer loop variable
            # `i`, shadowing it; use a dedicated rank counter instead.
            for rank in range(100):
                f.write(str(query_no)
                        + " Q0 "
                        + str(docs[rank]) + ' '
                        + str(rank + 1) + " "
                        + str(scores[rank]) + " "
                        + model + "\n")
# store argument into variables query_file = '' result_path = '' for opt, val in optlist: if opt == '--query': query_file = val else: result_path = val # extract feature from query image query_img = cv2.imread(query_file) feature_descriptor = FeatureDescriptor((8, 12, 3)) query_features = feature_descriptor.describe(query_img) # perform retrieval through index file '''retrieves db''' retriever = Retriever() retrieval_results = retriever.search(query_features, limit=10) # display query image cv2.imshow("Query", query_img) # loop over results of retrieved top limit image for (id, distance) in retrieval_results: result = cv2.imread(result_path + '/' + id + '.png') print result print result_path + '/' + id + '.png' cv2.imshow("Result", result) cv2.waitKey(0)
# NOTE(review): this chunk begins mid-function — the `else` below is the
# tail of a term-counting loop inside a query-expansion routine whose head
# is not visible here. Indentation is reconstructed; confirm against the
# full file.
        else:
            word_count[each_term] = k
    # Rank candidate terms by their count, highest first, and keep enough
    # of the top terms to extend the original query by n words.
    top_terms = sorted(word_count.items(), key=operator.itemgetter(1), reverse=True)
    expanded_query_list = [i for i, j in top_terms][:len(query_terms) + n]
    expanded_query = " ".join(expanded_query_list)
    return expanded_query

# --- top-level script: re-rank using results of a previous run ---
current_directory = os.getcwd()
corpus_directory = os.path.join(current_directory, "processed_corpus")

# Build a unigram inverted index over the processed corpus and a retriever
# on top of it.
I = Indexer.InvertedIndexer(corpus_directory)
I.ngram_indexer(1)
r = Retriever.Retriever(corpus_directory, I)

model = 'bm25'

# get the results from the previous runs (bm25 and tfidf)
# Reads task1/task1_bm25.txt produced by an earlier task-1 run.
file_name = 'task1_' + model + '.txt'
results_file_dir = os.path.join(current_directory, "task1")
results_file_dir = os.path.join(results_file_dir, file_name)
f = open(results_file_dir, 'r')
data = f.readlines()
f.close()

list_of_lines = []
for each_line in data:
    list_of_lines.append(
        each_line.split())  # contains parsed lines from the task1 output file
# Ask the user where the raw HTML corpus lives.
raw_corpus_directory = raw_input(
    "Enter the raw corpus directory (html files): ")
project_directory = os.getcwd()
output_directory = os.path.join(project_directory, "output")

# Parser (to process the raw corpus (no stopping))
p = Indexer.Parser()
corpus_directory = p.build_corpus(raw_corpus_directory)

# Indexer - builds the inverted indexes for the processed corpus;
# ngram_indexer(1) produces a unigram index for each word.
I = Indexer.InvertedIndexer(corpus_directory)
I.ngram_indexer(1)

# Retriever - based on the model specified, this object can be used to
# get the results.
r = Retriever.Retriever(corpus_directory, I, project_directory)

# Load the queries from the given file into query_dic
# (key: query ID, value: query text).
query_dic = {}
os.chdir(project_directory)
f = open('cacm.query.txt', 'r')
soup = BeautifulSoup(f.read(), 'html.parser')
f.close()

for _ in range(64):
    # The first find() hit is reused for both text and decompose(); the
    # decompose() removes the tag so the next find() advances.
    docno_tag = soup.find('docno')
    query_no = docno_tag.text.encode('utf-8')
    docno_tag.decompose()

    doc_tag = soup.find('doc')
    query = doc_tag.text.encode('utf-8')
    doc_tag.decompose()

    query_dic[int(query_no)] = query
def run_task(task, model, raw_corpus_directory):
    """Run task 1 (baseline retrieval) or task 2 (query expansion) for the
    given retrieval model over the corpus rooted at raw_corpus_directory.

    task 1 writes output/task1_<model>.txt; task 2 reads that file, expands
    each query via Query_Expander, and writes
    output/expanded_queries_<model>.txt and output/task2_<model>.txt.
    Results are in the format: <qid> Q0 <doc> <rank> <score> <model>.
    NOTE(review): formatting of this collapsed source was reconstructed;
    confirm indentation (especially the final print) against the full file.
    """
    project_directory = os.getcwd()
    output_directory = os.path.join(project_directory, "output")

    # Parser (to process the raw corpus (no stopping))
    p = Indexer.Parser()
    corpus_directory = p.build_corpus(raw_corpus_directory)

    # Indexer - Builds the inverted indexes for the processed corpus
    I = Indexer.InvertedIndexer(corpus_directory)
    I.ngram_indexer(1)  # builds a unigram indexes for each word

    # Retriever - based on the model specified, this object can be
    # used to get the results.
    r = Retriever.Retriever(corpus_directory, I, project_directory)

    # Get the queries from the given file
    query_dic = {}  # stores the queries; key - query ID, token - query
    os.chdir(project_directory)
    f = open('cacm.query.txt', 'r')
    soup = BeautifulSoup(f.read(), 'html.parser')
    f.close()
    # 64 CACM queries; each find() is followed by decompose() so the next
    # find() advances to the following tag.
    for i in range(64):
        query_no = (soup.find('docno')).text.encode(
            'utf-8')  # extract query number and query
        (soup.find('docno')).decompose()
        query = (soup.find('doc')).text.encode('utf-8')
        (soup.find('doc')).decompose()
        query_dic[int(query_no)] = query

    # task 1
    if task == 1:
        os.chdir(project_directory)
        if not os.path.exists(output_directory):
            os.mkdir(output_directory, 0755)
        os.chdir(output_directory)
        f = open('task1_' + model + '.txt', 'w')
        # Query IDs are 1-based, hence the query_no + 1 offsets below.
        for query_no in range(len(query_dic)):
            r.process_query(query_dic[query_no + 1])  # parse the query
            docs_and_scores = r.get_scores_for_docs(
                model, (query_no + 1))  # retrieve relevant documents
            # save results into appropriate file
            docs = docs_and_scores[0]
            scores = docs_and_scores[1]
            for i in range(100):
                f.write(str(query_no + 1) \
                        + " Q0 " \
                        + str(docs[i]) + ' ' \
                        + str((i+1)) + " " \
                        + str(scores[i]) + " " \
                        + model + "\n")
        f.close()

    # task 2
    if task == 2:
        # read output files from task 1
        file_name = 'task1_' + model + '.txt'
        try:
            f = open(os.path.join(output_directory, file_name), 'r')
        # NOTE(review): bare except hides unrelated errors; an IOError
        # catch would be more precise.
        except:
            print "Run Task - 1 before Task - 2"
            exit()
        data = f.readlines()
        f.close()
        list_of_lines = []
        for each_line in data:
            list_of_lines.append(each_line.split(
            ))  # contains parsed lines from the task1 output file
        task1_output = {
        }  # results for each; key = query ID(number), value = list of relevant files
        for each_line in list_of_lines:
            task1_output.setdefault(int(each_line[0]), []).append(each_line[2])

        # get stopwords
        f_stop_words = open('common_words.txt', 'r')
        stop_words_list = f_stop_words.readlines()
        stop_words = [i.strip() for i in stop_words_list]
        f_stop_words.close()

        # get corpus; doc name is the file name minus the ".html" suffix.
        os.chdir(corpus_directory)
        files_list = glob.glob('*.html')
        corpus = {}
        for each_file in files_list:
            doc_name = each_file[:len(each_file) - 5]
            text = open(each_file).read()
            corpus[doc_name] = text.split()

        # Expand each query against its task-1 result set and log the
        # expanded queries to their own file.
        file_name = 'expanded_queries_' + model + '.txt'
        f = open(os.path.join(output_directory, file_name), 'w')
        expanded_query_dic = {}
        for query_no, query in query_dic.viewitems():
            processed_query = r.process_query(query, True)
            expanded_query_dic[query_no] = Query_Expander(
                processed_query, task1_output[query_no], corpus,
                stopwords=stop_words)
            f.write(str(query_no) + " " + expanded_query_dic[query_no] + "\n")
        f.close()

        # Re-run retrieval with the expanded queries.
        file_name = 'task2_' + model + '.txt'
        f = open(os.path.join(output_directory, file_name), 'w')
        for query_no, query in expanded_query_dic.viewitems():
            r.process_query(query)  # parse the query
            # r.clean_content(query)
            docs_and_scores = r.get_scores_for_docs(
                model, query_no)  # retrieve relevant documents
            # save results into appropriate file
            docs = docs_and_scores[0]
            scores = docs_and_scores[1]
            for i in range(100):
                f.write(str(query_no) \
                        + " Q0 " \
                        + str(docs[i]) + ' ' \
                        + str((i + 1)) + " " \
                        + str(scores[i]) + " " \
                        + model + "\n")
        f.close()

    print "Results stored in " + output_directory + " directory"
1]: index = length - i stemmed_corpus[doc_id] = total_stemmed_corpus[doc_id][:index] break return stemmed_corpus stemmed_corpus = get_stemmed_corpus() f = open('cacm_stem.query.txt', 'r') stemmed_queries = f.readlines() f.close() I = Indexer.InvertedIndexer('') I.stemmed_indexer(stemmed_corpus) r = Retriever.Retriever('', I, os.getcwd()) f = open('task3b_' + model + '_stemmed.txt', 'w') query_no = [12, 13, 19, 23, 24, 25, 50] q_iter = 0 for each_query in stemmed_queries: r.process_query(each_query) docs_and_scores = r.get_scores_for_docs(model, query_no[q_iter]) # save results into appropriate file docs = docs_and_scores[0] scores = docs_and_scores[1] for i in range(100): f.write(str(query_no[q_iter]) \ + " Q0 " \ + str(docs[i]) + ' ' \