def test():
    clothes = feature.Clothes(init_pca=1)
    print 'press q to exit'
    args = parse_args()
    index = cPickle.loads(open(args.index).read())
    searcher = Searcher(index)
    while True:
        start = raw_input('please input image address: ')
        if start == 'q':
            break
        start = 'query-images/' + start
        try:
            queryImage = cv2.imread(start)
            queryFeatures = descriptor.get_descriptor(start, multi_box=False)
            results = searcher.search(queryFeatures)
            result = [queryImage]
            for j in xrange(0, 15):
                # grab the result (row-major order) and load the result image
                imageName = results[j]
                result.append(cv2.imread(imageName))
            plot(result)
            plt.show()
        except Exception as e:
            print "error:", e
def testSave():
    args = parse_args()
    searcher = Searcher('data/tree', 'data/index')
    imagesDir = args.query
    ext = ['jpg', 'png', 'jpeg', 'JPG', 'PNG', 'JPEG']
    images = os.listdir(imagesDir)
    flag = 1
    import time
    start = time.time()
    for image in images:
        imType = image.split('.')[-1]
        if imType in ext:
            image_path = os.path.join(imagesDir, image)
            try:
                queryImage = cv2.imread(image_path)
                queryFeatures = descriptor.get_descriptor(image_path, multi_box=False)
                results = searcher.search(queryFeatures)
                result = [queryImage]
                for j in xrange(0, 14):
                    # grab the result (row-major order) and load the result image
                    imageName = results[j]
                    result.append(cv2.imread(os.path.join(args.dataset, imageName)))
                plot(result, flag, args.save)
                flag += 1
            except:
                print 'error with', image_path
    print 'Total time: ', time.time() - start
def search():
    """
    When a user enters a search query, it is obtained here. The top documents
    are computed with the Searcher class.

    :return: A template populated with input_query, the top results, the
             cosine scores of the query words, and words whose document
             frequency is zero.
    """
    query = request.form.get('searchBar')
    query = unicodedata.normalize('NFKD', query).encode('ascii', 'ignore')
    now = time.clock()
    searcher = Searcher(query)
    results = searcher.cosine_score()
    scores = searcher.query_score
    print time.clock() - now
    zero_scores = searcher.top_corrections
    boolean_results = searcher.boolean_results
    boolean_error = len(boolean_results) == 0
    title_results = searcher.title_results
    if len(title_results) > 10:
        title_results = []
    return render_template("displayResults.html", input_query=query,
                           results=results, scores=scores,
                           zero_scores=zero_scores, title_results=title_results,
                           error=boolean_error, boolean_results=boolean_results)
def test():
    print 'press q to exit'
    args = parse_args()
    index = cPickle.loads(open(args.index).read())
    searcher = Searcher(index)
    while True:
        start = raw_input('please input image address: ')
        if start == 'q':
            break
        #start = 'query-images/' + start
        try:
            queryImage = cv2.imread(start)
            queryFeatures = clothes.run(start)
            results = searcher.search(queryFeatures)
            result = [queryImage]
            for j in xrange(0, 15):
                # grab the result (row-major order) and load the result image
                (score, imageName) = results[j]
                result.append(cv2.imread(imageName))
                print "\t%d. %s : %.3f" % (j + 1, imageName, score)
            plot(result)
            plt.show()
        except:
            print "error"
def main():
    # get options from the console.
    options = args()
    # get configuration from file.
    config = get_conf(options['config_file'])
    # create the ES connection to the hosts.
    connections.create_connection(hosts=config['elasticsearch']['hosts'], timeout=30)
    # create the searcher instance to find alarms, given the console options.
    searcher = Searcher(options['from'], options['query'], ttime=options['to'],
                        per_page=500, min_priority=options['min_priority'])
    buckets = [
        PathClassBucket(
            utils.build_url(config['kibana']['host'], config['kibana']['secure']))
    ]
    # manually fetch all alarms from the searcher and pass each one to every bucket.
    for alarm in searcher.pages():
        for bucket in buckets:
            bucket.cherry_pick(alarm)
    # dump all buckets; this prints out every bucket.
    for bucket in buckets:
        bucket.dump()
def search(request):
    """
    The main view that returns the search result.

    :param request: the request object
    """
    result_template = get_template('result.html')
    st = time.time()
    query = request.GET['q']
    query = query.encode('utf-8')
    searcher = Searcher()
    result = searcher.search_result(query)
    paginator = Paginator(result, 10)
    try:
        page = int(request.GET.get('page', '1'))
    except ValueError:
        page = 1
    try:
        page_result = paginator.page(page)
    except (EmptyPage, InvalidPage):
        page_result = paginator.page(paginator.num_pages)
    #file_result = searcher.search_file(query)
    file_result = []
    search_time = "%.3f" % (time.time() - st)
    html = result_template.render(Context({'query': query,
                                           'result': page_result,
                                           'file_result': file_result,
                                           'search_time': search_time}))
    return HttpResponse(html)
def search():
    query = request.form['query']
    field = request.form['field']
    searcher = Searcher()
    result = searcher.search(query, field)
    return render_template("results.html", query=query, videos=result["videos"])
class TestSearch(unittest.TestCase):
    doc1 = {'path': '/foo/doc1',
            'keywords': 'document,one,foo',
            'title': 'Document one title.',
            'text': 'This is the a test document. Wot!'}
    doc2 = {'path': '/foo/doc2',
            'keywords': 'document,two,bar',
            'title': 'Document two title.',
            'text': 'Testing is cool, yo.'}

    def setUp(self):
        self.searcher = Searcher(tempfile.mkdtemp())
        self.searcher.add_documents(json.dumps((self.doc1, self.doc2)))

    def test_search_by_text(self):
        results = self.searcher.search('Wot')
        self.assertEqual(1, len(results))
        for fieldname, fieldvalue in results[0].items():
            self.assertEqual(self.doc1.get(fieldname), fieldvalue)
        # Now stemming. Document 2 only has "testing" in its body, but it is
        # matched anyway because the "text" field of the schema uses the
        # StemmingAnalyzer.
        results = self.searcher.search('test')
        self.assertEqual(2, len(results))
        matched_doc_paths = [hit['path'] for hit in results]
        self.assertIn('/foo/doc1', matched_doc_paths)
        self.assertIn('/foo/doc2', matched_doc_paths)

    def test_search_by_keyword(self):
        # Simple search by a single keyword.
        results = self.searcher.search('keywords:one')
        self.assertEqual(1, len(results))  # doc1
        self.assertEqual(results[0]['path'], '/foo/doc1')
        # By two keywords.
        results = self.searcher.search('keywords:one,foo')
        self.assertEqual(1, len(results))  # doc1
        self.assertEqual(results[0]['path'], '/foo/doc1')
        # Keyword intersection.
        results = self.searcher.search('keywords:one,two')
        self.assertEqual(0, len(results))  # There are none.
        # Search for docs with a "one" keyword and "yo" in the body.
        results = self.searcher.search('keywords:one yo')
        self.assertEqual(0, len(results))  # There are none.
        # Anything with the "document" keyword.
        results = self.searcher.search('keywords:document')
        self.assertEqual(2, len(results))  # Both docs.

    def test_search_by_title(self):
        results = self.searcher.search('title:"Document one"')
        self.assertEqual(1, len(results))  # doc1
        self.assertEqual(results[0]['path'], '/foo/doc1')
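# A minimal runner sketch for the test case above, assuming TestSearch lives
# in the current module; unittest discovers and runs its test_* methods.
import unittest

if __name__ == '__main__':
    unittest.main()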
def colorSearch(self):
    '''
    Searches the query image against the index and returns the matches.
    Results are in the format (chi-squared distance, image name).
    '''
    searcher = Searcher(self.colorIndex)
    queryFeatures = self.createHistogram(self.image)
    results = searcher.search(queryFeatures)
    return results
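# Hypothetical caller of colorSearch() above; `indexer` stands in for an
# instance of the enclosing class, and results are assumed to be ranked
# (chi-squared distance, image name) tuples as the docstring states.
matches = indexer.colorSearch()
for rank, (distance, name) in enumerate(matches[:5], start=1):
    print("%d. %s : %.3f" % (rank, name, distance))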
def aftermath(self, results):
    try:
        keywords = self.analyse_keywords(results)
        sentiment = self.analyse_sentiment(results)
    except Exception as e:
        print(e)
        return
    searcher = Searcher()
    for keyword in keywords:
        imageResults = searcher.searchImages(keyword)
        imageResults = searcher.validateLinks(imageResults)
        self.completionFunction(self.name, keyword, sentiment, imageResults)
class ImageSearcher():
    '''Image searcher API for the clothes retrieval web demo.'''

    def __init__(self):
        root_path = os.path.dirname(__file__)
        tree_path = os.path.abspath(os.path.join(root_path, 'db/tree'))
        inds_path = os.path.abspath(os.path.join(root_path, 'db/index'))
        feature_path = os.path.abspath(os.path.join(root_path, 'db/feature.npy'))
        self.searcher = Searcher(tree_path, inds_path, feature_path)

    def search(self, image_path, do_detection=0, k=20):
        t1 = Timer()
        t1.tic()
        queryFeatures = descriptor.get_descriptor(image_path, do_detection=do_detection)
        t1.toc('Feature Extraction time: ')
        t2 = Timer()
        t2.tic()
        results, dists, ind = self.searcher.search(queryFeatures, k=k)
        #self.queryExpansion(results, dists, ind)
        t2.toc('Knn search time: ')
        result = []
        dist = []
        for j, imageName in enumerate(results):
            if imageName not in result:
                result.append(imageName)
                dist.append(dists[j])
        return result[:k], dist[:k]

    def queryExpansion(self, results, dists, ind, threshold=0.8, k=10, top=4):
        """Do query expansion with at most `top` nearest neighbours."""
        features = self.searcher.features
        feature = []
        for i, dist in enumerate(dists):
            if dist < threshold and i < top:
                feature.append(features[ind[i]])
        if len(feature) == 0:
            return 0
        query = np.mean(np.array(feature), axis=0)
        new_results, new_dists, new_ind = self.searcher.search(query, k=k)
        for i, dist in enumerate(new_dists):
            if dist > dists[-1]:
                break
            for j, d in enumerate(dists):
                if dist < d:
                    results.insert(j, new_results[i])
                    dists.insert(j, dist)
                    break
def mainapp():
    input = request.args.get('searchstr')
    sorttype = request.args.get('sortselect')
    results = []
    if (input is not None and sorttype is not None) and input != "":
        searcher = Searcher(input, sorttype)
        results = searcher.search()
    if input is None:
        input = ""
    widgets = [r.widget for r in results]
    #return render_template('index.html', input=input, result="".join(widgets))
    return render_template('index2.html', input=input, results=results)
def visualize_specific_ranker(self, query):
    viewer = GraphViewer()
    specific_documents = Searcher(query).get_topic_documents()
    m = self.basic_matrix
    specific_doc_ids = list()
    with closing(shelve.open("ids.db")) as db:
        for doc in specific_documents:
            specific_doc_ids.append(db[doc[16:]])
    specific_vector = np.zeros(m.shape[0])
    for doc_id in specific_doc_ids:
        specific_vector[doc_id] = (1 - self.taxation_factor) / len(specific_doc_ids)
    rank_vector = np.full(m.shape[0], 1 / m.shape[0])
    print len(specific_doc_ids)
    viewer.view_graph(node_list=list(specific_doc_ids))
    count = 0
    while True:
        count += 1
        rank_vector1 = m * rank_vector + specific_vector
        diff = rank_vector1 - rank_vector
        diff = sum(diff * diff)
        if diff < 1e-50:
            break
        else:
            rank_vector = rank_vector1
        if count % 25 == 0:
            try:
                viewer.view_graph(node_list=list(specific_doc_ids),
                                  ranks=list(rank_vector),
                                  mult_factor=40000, concat=150)
            except networkx.exception.NetworkXError as e:
                print e
class Index(object):

    def __init__(self, cache_host=None, cache_port=None, db_file_path=None,
                 db_url=None, load_from_db=None):
        cache = create_index_cache(host=cache_host, port=cache_port)
        db = create_index_store(file_path=db_file_path, url=db_url)
        self.reader = IndexReader(db, cache)
        self.writer = IndexWriter(db, cache)
        self.searcher = Searcher(self.reader)
        if load_from_db:
            self.load_from_db()

    def search(self, query):
        return self.searcher.search(query)

    def commit(self):
        self.writer.db.commit()
        self.load_from_db()

    def load_from_db(self):
        # Refresh data in the reader with data from the database.
        self.reader.load_from_db()
        # Push the new data into the cache.
        self.writer.build_cache(self.reader.doc_word_scores)

    # Note: these accessors are shadowed by the instance attributes of the same
    # name assigned in __init__, so instance.writer / instance.reader resolve
    # to the IndexWriter / IndexReader objects directly.
    def writer(self):
        return self.writer

    def reader(self):
        return self.reader
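# A usage sketch for the Index wrapper above; the host, port and file path
# values are placeholders, and the keyword names simply mirror __init__.
index = Index(cache_host='localhost', cache_port=6379,
              db_file_path='index.db', load_from_db=True)
hits = index.search('blue denim jacket')  # delegates to Searcher.search(query)
index.commit()  # persist pending writes, then refresh reader and cache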
def testSave():
    args = parse_args()
    #searcher = ImageSearcher()
    searcher = Searcher(args.index, args.feature)
    imagesDir = args.query
    ext = ['jpg', 'png', 'jpeg', 'JPG', 'PNG', 'JPEG']
    images = os.listdir(imagesDir)
    os.system('rm %s' % (args.save + '/*'))
    f = open(os.path.join(args.save, 'info.txt'), 'w')
    flag = 1
    import time
    start = time.time()
    for image in images:
        imType = image.split('.')[-1]
        if imType in ext:
            image_path = os.path.join(imagesDir, image)
            try:
                queryImage = cv2.imread(image_path)
                queryFeatures = descriptor.get_descriptor(image_path)
                results, dists, ind = searcher.search(queryFeatures, k=15)
                #results, dists = searcher.search(image_path, do_detection=1, k=15)
                result = [queryImage]
                dist = [1]
                print '~~~~~~~~~~~~~~~', flag, '~~~~~~~~~~~~~~~~~~~'
                f.write(str(flag) + '.~~~~~~~~~~~~~~~~' + '\n')
                f.write(image_path + '\n')
                for j in xrange(0, 15):
                    # grab the result (row-major order) and load the result image
                    score, imageName = dists[j], results[j]
                    result.append(cv2.imread(imageName))
                    f.write(imageName + '\n')
                    dist.append(score)
                plot(result, dist, flag, args.save)
                flag += 1
            except:
                print 'error with', image_path
    f.close()
    print 'Total time: ', time.time() - start
def search():
    query = request.args.get("q")
    if query is None:
        return render_template("index.html")
    try:
        page = int(request.args.get("p", 1))
    except (TypeError, ValueError):
        page = 1
    searcher = Searcher()
    results = searcher.search_page(query, page)
    paginator = Paginator(results)
    return render_template("index.html", results=results, paginator=paginator, q=query)
def __init__(self):
    root_path = os.path.dirname(__file__)
    tree_path = os.path.abspath(os.path.join(root_path, 'db/tree5'))
    inds_path = os.path.abspath(os.path.join(root_path, 'db/index'))
    feature_path = os.path.abspath(os.path.join(root_path, 'db/feature.npy'))
    self.searcher = Searcher(tree_path, inds_path, feature_path)
    #self.dataset = os.path.abspath(os.path.join(root_path, '../CBIR/datasets'))
    label_path = os.path.abspath(os.path.join(root_path, 'db/label.pkl'))
    self.label = cPickle.loads(open(label_path).read())
class qrels:
    """
    A class to create the qrels of the questions for the SQuAD dataset.
    """

    def __init__(self, index_dir):
        self.searchObject = Searcher(index_dir)

    def get_id_section(self, pair):
        """
        Returns the id (and content) of the passage most similar to the context.

        :param pair: the (title, context) pair that represents the query.
        :return: the id and content of the matching passage section.
        """
        result = self.searchObject.pairSearch(pair, BM25Similarity())
        id = None
        content = None
        for i in range(len(result)):
            hitDoc = self.searchObject.searcher.doc(result[i].doc)
            id = hitDoc.get("id_section")
            content = hitDoc.get("content_section")
            if id != "":
                break
        return id, content

    def process(self, input_file, output_dir):
        """
        The main function that creates the qrels file.

        :param input_file: the file to process.
        :param output_dir: the folder where to store the qrels file.
        :return:
        """
        num_lines = 0
        with open(input_file, encoding="utf-8") as json_file:
            data = json.load(json_file)
            for p in data['data']:
                for par in p['paragraphs']:
                    num_lines += 1
        output_file = open(output_dir + "/qrels.txt", 'a+', encoding="utf-8")
        with tqdm(total=num_lines) as pbar:
            with open(input_file, encoding="utf-8") as json_file:
                data = json.load(json_file)
                for p in data['data']:
                    for par in p['paragraphs']:
                        pbar.update(1)
                        psg_id, content = self.get_id_section((p['title'], par["context"]))
                        #similarity = round(len(set(par["context"]) & set(content)) / len(set(par["context"])), 2)
                        for q in par["qas"]:
                            qst_id = q["id"]
                            if q["is_impossible"] is False:
                                output_file.write(qst_id + " 0 " + psg_id + " 1 \n")
        print("==> Qrels successfully created.\n")
def colorSearch(self, max_matches=5):
    '''
    Searches the query image against the index and returns the specified
    number of matches. Results are in the format (chi-squared distance, image name).
    '''
    self.index = self.createIndex()
    image = cv2.imread(self.image)
    print("Querying: " + self.image + " ...")
    searcher = Searcher(self.index)
    queryFeatures = self.createHistogram(image)
    results = searcher.search(queryFeatures)[:max_matches]
    print("Matches found:")
    for j in range(len(results)):
        (score, imageName) = results[j]
        print("\t%d. %s : %.3f" % (j + 1, imageName, score))
    return results
def searchByColor(self):
    '''
    Searches the query image against the index and returns the specified
    number of matches.
    '''
    MAX_NUMBER_MATCHES = 5
    image = cv2.imread(self.image)
    print("Querying: " + self.image + " ...")
    searcher = Searcher(self.index)
    queryFeatures = self.createHistogram(image)
    results = searcher.search(queryFeatures)[:MAX_NUMBER_MATCHES]
    print("Matches found:")
    for j in range(len(results)):
        (score, imageName) = results[j]
        print("\t%d. %s : %.3f" % (j + 1, imageName, score))
    return results
def __init__(self, cache_host=None, cache_port=None, db_file_path=None,
             db_url=None, load_from_db=None):
    cache = create_index_cache(host=cache_host, port=cache_port)
    db = create_index_store(file_path=db_file_path, url=db_url)
    self.reader = IndexReader(db, cache)
    self.writer = IndexWriter(db, cache)
    self.searcher = Searcher(self.reader)
    if load_from_db:
        self.load_from_db()
def weighted_search():
    """
    When a user enters weights for the different query words, they are
    obtained by this method.

    :return: A template populated with input_query, the weighted search
             results, the per-word scores given by the user, and words whose
             document frequency is zero.
    """
    weights = {}
    query = request.form.get("query")
    for key in request.form:
        if key == "query":
            query = request.form[key]
            query = unicodedata.normalize('NFKD', query).encode('ascii', 'ignore')
        else:
            weights[key] = request.form[key]
            weights[key] = unicodedata.normalize('NFKD', weights[key]).encode(
                'ascii', 'ignore')
            weights[key] = float(weights[key]) / 100
    searcher = Searcher(query, query_score=weights)
    results = searcher.cosine_score()
    scores = searcher.query_score
    zero_scores = searcher.top_corrections
    boolean_results = searcher.boolean_results
    boolean_error = len(boolean_results) == 0
    title_results = searcher.title_results
    return render_template("displayResults.html", input_query=query,
                           results=results, scores=scores,
                           zero_scores=zero_scores, title_results=title_results,
                           error=boolean_error)
def run(args):
    # TODO: update args according to config.json here
    setup = import_module(args.setup)
    task = args.task
    data_source = setup.DataSource(args)
    spec = setup.LearningSpec()
    if task == 'train':
        trainer = Trainer(args, data_source, spec.training_classifier())
        trainer.run()
    elif task == 'search':
        searcher = Searcher(args, data_source)
        searcher.fit(spec.gridsearch_pipelines())
    elif task == 'benchmark':
        run_benchmarks(args, data_source)
    elif task == 'learning_curves':
        run_learning_curves(args, spec, data_source)
    elif task == 'plot_pca':
        X, y = data_source.train_data()
        plot_pca(X, y)
    elif task == 'misclassified':
        print_misclassified(args, spec.training_classifier(), data_source)
def topic_specific_search(self, query, scheme="topic"):
    if scheme == "trust":
        rank_vector = np.load("trustRank.npy")
    else:
        rank_vector = self.topic_specific_ranker(query)
    results = Searcher(query).cosine_score(ranker=True)
    result_ids = []
    with closing(shelve.open("ids.db")) as db:
        for doc, score in results:
            doc_id = db[doc[16:]]
            doc_rank = rank_vector[doc_id]
            result_ids.append((doc_id, doc_rank))
    sorted_scores = heapq.nlargest(20, result_ids, key=operator.itemgetter(1))
    return sorted_scores
class PopularTopics:

    def __init__(self, index_dir, analyzer):
        self.searcher = Searcher(index_dir, analyzer)

    def dict_append(self, entity, f_dist):
        entity = ' '.join(entity)
        if entity not in f_dist:
            f_dist[entity] = 0
        f_dist[entity] += 1

    def get_popular_topics(self, q_year, top_k):
        titles = self.searcher.search_year(q_year)
        unigram_dist = {}
        bigram_dist = {}
        trigram_dist = {}
        ngram_dist = {}
        tagset = None
        tagger = PerceptronTagger()
        grammar = "NP: {<JJ>*(<NN>|<NNS>)*<NN>(<NN>|<NNS>)*}"
        cp = nltk.RegexpParser(grammar)
        for title in titles:
            title = title.lower()
            text = word_tokenize(title)
            sentence = nltk.tag._pos_tag(text, tagset, tagger)
            result = cp.parse(sentence)
            for node in list(result):
                if isinstance(node, nltk.tree.Tree):
                    entity = zip(*list(node))[0]
                    if len(entity) == 1:
                        self.dict_append(entity, unigram_dist)
                    elif len(entity) == 2:
                        self.dict_append(entity, bigram_dist)
                    elif len(entity) == 3:
                        self.dict_append(entity, trigram_dist)
                    else:
                        self.dict_append(entity, ngram_dist)
        unigram_result = Counter(unigram_dist).most_common(
            int(len(unigram_dist) * 0.01) + top_k)[int(len(unigram_dist) * 0.01):]
        bigram_result = Counter(bigram_dist).most_common(top_k)
        trigram_result = Counter(trigram_dist).most_common(top_k)
        result = unigram_result + bigram_result + trigram_result
        result = sorted(result, key=lambda k: k[1], reverse=True)[:top_k]
        return result
def main(args):
    searcher = Searcher(args["limit"])
    workers = []
    if "domains" in args:
        # load domains from file
        domains = load_domains(args["domains"])
        for domain in domains:
            if domain == "":
                continue
            # look the domain up in the search engine
            result = searcher.google_search(domain)
            # start the worker
            worker = Worker(domain, result.urls, result.page_source)
            workers.append(worker)
        print "\nNow waiting for workers to finish"
    else:
        # look the domain up in the search engine
        result = searcher.google_search(args["domain"])
        # start the worker
        worker = Worker(args["domain"], result.urls, result.page_source)
        workers.append(worker)
    searcher.close()
    # wait for all workers to finish
    for worker in workers:
        worker.wait()
    # write emails to a file
    if "output" in args:
        write_excel_file(args["output"], workers)
    print "\nFinished scraping!\n"
    # output all emails
    for worker in workers:
        for email in worker.emails:
            print "> " + email
def topic_specific_ranker(self, query):
    specific_documents = Searcher(query).get_topic_documents()
    m = self.basic_matrix
    specific_doc_ids = list()
    with closing(shelve.open("ids.db")) as db:
        for doc in specific_documents:
            specific_doc_ids.append(db[doc[16:]])
    specific_vector = np.zeros(m.shape[0])
    for doc_id in specific_doc_ids:
        specific_vector[doc_id] = (1 - self.taxation_factor) / len(specific_doc_ids)
    rank_vector = np.full(m.shape[0], 1 / m.shape[0])
    while True:
        rank_vector1 = m * rank_vector + specific_vector
        diff = rank_vector1 - rank_vector
        diff = sum(diff * diff)
        if diff < 1e-50:
            break
        else:
            rank_vector = rank_vector1
    return rank_vector
#coding:utf-8
from index import Indexer
from search import Searcher

if __name__ == '__main__':
    index = Indexer("docs.txt")
    searcher = Searcher(index)
    i = 0
    while 1:
        i += 1
        input = raw_input(str(i) + ". Please enter a question: ")
        doclist = searcher.search(input.decode('utf-8'))
        if len(doclist) > 0:
            for doc in doclist:
                print doc.id, doc.name, doc.text
        else:
            print "No relevant results"
        print "\n"
def main():
    server_url = 'localhost:9200'
    num_queries = 1000
    with open('hyper_params_set.json', 'r') as fh:
        hyper_params = json.load(fh)
        nums_groups = hyper_params['nums_groups']
        nums_clusters = hyper_params['nums_clusters']
        thresholds = hyper_params['thresholds']
    with open('evaluation_set.json') as f:
        evaluation_set = json.load(f)
    final_results = []
    training_embedding_vectors = np.load("train_embs_VGGFace.npy")
    query_vector_indices = random.sample(range(len(evaluation_set.keys())), num_queries)
    train_labels, image_names = get_image_data('vn_celeb_face_recognition/train.csv')
    for threshold in thresholds:
        for num_groups in nums_groups:
            for num_clusters in nums_clusters:
                print("working on {} groups, {} clusters, {} threshold".format(
                    num_groups, num_clusters, threshold))
                search_times = []
                mean_average_accuracy = 0
                mean_recall = 0
                for query_vector_index in query_vector_indices:
                    query_vector = training_embedding_vectors[
                        evaluation_set[str(query_vector_index)][0]]
                    actual_query_label = train_labels[
                        evaluation_set[str(query_vector_index)][0]]
                    num_actual_results = len(evaluation_set[str(actual_query_label)])
                    es = Elasticsearch(server_url)
                    index_name = ('face_off_' + str(num_groups) + 'groups_' +
                                  str(num_clusters) + 'clusters_vgg')
                    # if the data is not indexed yet, create the index and push
                    # the data to Elasticsearch before querying
                    if not es.indices.exists(index_name):
                        indexer = ESIndexer('encode_results_vgg', num_groups,
                                            num_clusters, server_url, 'vgg')
                        indexer.index()
                    start_time = datetime.now()
                    searcher = Searcher(threshold, num_groups, num_clusters,
                                        query_vector, server_url, index_name,
                                        'cosine', 'vgg')
                    results = searcher.search()
                    if len(results) == 0:
                        continue
                    search_time = datetime.now() - start_time
                    search_time_in_ms = ((search_time.days * 24 * 60 * 60 +
                                          search_time.seconds) * 1000 +
                                         search_time.microseconds / 1000.0)
                    search_times.append(search_time_in_ms)
                    results_labels = [result['id'] for result in results]
                    accuracy_i = 0
                    for i in range(len(results)):
                        step_list = results_labels[:(i + 1)]
                        num_corrects = len([i for i, x in enumerate(step_list)
                                            if x == actual_query_label])
                        accuracy_i += num_corrects / len(step_list)
                    mean_average_accuracy += accuracy_i / len(results)
                    recall_i = num_corrects / num_actual_results
                    mean_recall += recall_i
                average_search_time = round(np.mean(np.asarray(search_times)) / 1000, 3)
                mean_average_accuracy = mean_average_accuracy / num_queries
                mean_recall = mean_recall / num_queries
                final_results.append([num_groups, num_clusters, threshold,
                                      num_queries, 'euclidean',
                                      average_search_time,
                                      round(mean_average_accuracy, 4),
                                      round(mean_recall, 4)])
                print([num_groups, num_clusters, threshold, num_queries,
                       'euclidean', average_search_time,
                       round(mean_average_accuracy, 4), round(mean_recall, 4)])
    print("finish")
    print("-----------------------------------------------")
def __init__(self, index_dir, analyzer):
    """
    Initializes the searcher.
    """
    self.searcher = Searcher(index_dir, analyzer)
def __init__(self, index_dir):
    self.searcher = Searcher(index_dir)
def run_search(rt):
    searcher = Searcher(rt.config, rt.data_source)
    searcher.fit(rt.spec.gridsearch_pipelines())
def setUp(self):
    self.searcher = Searcher(tempfile.mkdtemp())
    self.searcher.add_documents(json.dumps((self.doc1, self.doc2)))
class ImageSearcher():
    '''Image searcher API for the clothes retrieval web demo.'''

    def __init__(self):
        root_path = os.path.dirname(__file__)
        tree_path = os.path.abspath(os.path.join(root_path, 'db/tree5'))
        inds_path = os.path.abspath(os.path.join(root_path, 'db/index'))
        feature_path = os.path.abspath(os.path.join(root_path, 'db/feature.npy'))
        self.searcher = Searcher(tree_path, inds_path, feature_path)
        #self.dataset = os.path.abspath(os.path.join(root_path, '../CBIR/datasets'))
        label_path = os.path.abspath(os.path.join(root_path, 'db/label.pkl'))
        self.label = cPickle.loads(open(label_path).read())

    def search(self, image_path, do_detection=1, k=50):
        t1 = Timer()
        t1.tic()
        queryFeatures, label = descriptor.get_descriptor(image_path,
                                                         multi_box=False,
                                                         get_label=True,
                                                         do_detection=do_detection)
        flag = []  # if left empty, the class label is not used to filter results
        t1.toc('Feature Extraction time: ')
        t2 = Timer()
        t2.tic()
        results, dists = self.searcher.search(queryFeatures)
        print dists
        t2.toc('Knn search time: ')
        result = []
        if len(flag) != 0:
            for j in xrange(0, k):
                imageName = results[j]
                if imageName not in result:
                    # the class may be wrong, but the image similarity is high
                    if dists[j] < 0.05:
                        result.append(imageName)
                        continue
                    # judge whether the image belongs to the class
                    parts = imageName.split('/')
                    image_dir = parts[0] + '/' + parts[1] + '/' + parts[2]
                    if image_dir in flag:
                        result.append(imageName)
        print 'total result', len(result)
        if len(result) < 3:
            # if the class-filtered result has fewer than 3 images,
            # search over the whole dataset instead
            k = 30
            result = []
            for j in xrange(0, k):
                imageName = results[j]
                if imageName not in result:
                    result.append(imageName)
        return result
melbourneBox = [143.7, -38.5, 145.9, -37.05]

# just obfuscate, meaning these are harder to find / steal
firstHidden = base64.b64decode("VWxQNkdMc2s1TTJVVUEwVHVGZHVNREd2Yg==").decode("utf-8")
secondHidden = base64.b64decode("NHNqQ1VHWE81VGZqemM5RTNuUlFXUlVLTG1iY0M2dkFPS2p3d0tUZDYwbHFvVDNyZVM=").decode("utf-8")
thirdHidden = base64.b64decode("MjgwOTIyNTk0LXB4dTBtMnNqR01xeUU3ZTZhdmFOUUk0bmlDdXE2d2RoY202UmFRV04=").decode("utf-8")
fourthHidden = base64.b64decode("aVV4cVk3UjNCVE5lTWN6NmZRakloczJuYTRqbjV6RUx5cmtYdGdTYTFUNGs3").decode("utf-8")

# create the interface to the database
db = DBInterface(dbstring)

# authenticate
auth = tweepy.OAuthHandler(firstHidden, secondHidden)
auth.set_access_token(thirdHidden, fourthHidden)
api = tweepy.API(auth)

# decide what mode to run the application in
mode = sys.argv[1]
if mode == 'search' or mode == 'both':
    # start searching for all tweets going back a week
    sfThread = Searcher(api, db, melbourneRadial)
    sfThread.start()
if mode == 'stream' or mode == 'both':
    # start streaming tweets
    listener = Streamer(db)
    stream = tweepy.Stream(auth=api.auth, listener=listener)
    stream.filter(locations=melbourneBox)
def main_1(var):
    num_groups = int(var[0])
    num_clusters = int(var[1])
    if var[2] >= 50:
        dist_function_name = 'euclidean'
    else:
        dist_function_name = 'cosine'
    threshold = var[3]
    server_url = 'localhost:9200'
    num_queries = 200
    with open('evaluation_set.json') as f:
        evaluation_set = json.load(f)
    training_embedding_vectors = np.load("PCA_2048_to_512_new.npy")
    query_vector_indices = random.sample(range(len(evaluation_set.keys())), num_queries)
    train_labels, image_names = get_image_data('vn_celeb_face_recognition/train.csv')
    search_times = []
    mean_average_accuracy = 0
    mean_recall = 0
    for query_vector_index in query_vector_indices:
        query_vector = training_embedding_vectors[
            evaluation_set[str(query_vector_index)][0]]
        actual_query_label = train_labels[evaluation_set[str(query_vector_index)][0]]
        num_actual_results = len(evaluation_set[str(actual_query_label)])
        es = Elasticsearch(server_url)
        index_name = ('face_off_' + str(num_groups) + 'groups_' +
                      str(num_clusters) + 'clusters_vgg')
        # if the data is not indexed yet, encode it, build the index and push
        # the data to Elasticsearch before querying
        if not es.indices.exists(index_name):
            data_encoder = DataEncoder(num_groups, num_clusters, 1000,
                                       training_embedding_vectors,
                                       'encode_results_vgg')
            data_encoder.run_encode_data()
            json_string_tokens_generator = JsonStringTokenGenerator(
                'encode_results_vgg', 'PCA_2048_to_512_new.npy',
                'vn_celeb_face_recognition/train.csv', num_groups, num_clusters)
            encoded_string_tokens_list = json_string_tokens_generator.get_string_tokens_list()
            train_embs = json_string_tokens_generator.get_image_fetures()
            train_labels, image_names = json_string_tokens_generator.get_image_metadata()
            json_string_tokens_list = json_string_tokens_generator.generate_json_string_tokens_list(
                encoded_string_tokens_list, train_labels, image_names, train_embs)
            json_string_tokens_generator.save_json_string_tokens(json_string_tokens_list)
            print('saving completed....')
            print('******************************')
            indexer = ESIndexer('encode_results_vgg', num_groups, num_clusters,
                                server_url, 'vgg')
            indexer.index()
        start_time = datetime.now()
        searcher = Searcher(threshold, num_groups, num_clusters, query_vector,
                            server_url, index_name, dist_function_name, 'vgg')
        results = searcher.search()
        if len(results) == 0:
            continue
        search_time = datetime.now() - start_time
        search_time_in_ms = ((search_time.days * 24 * 60 * 60 +
                              search_time.seconds) * 1000 +
                             search_time.microseconds / 1000.0)
        search_times.append(search_time_in_ms)
        results_labels = [result['id'] for result in results]
        accuracy_i = 0
        for i in range(len(results)):
            step_list = results_labels[:(i + 1)]
            num_corrects = len([i for i, x in enumerate(step_list)
                                if x == actual_query_label])
            accuracy_i += num_corrects / len(step_list)
        mean_average_accuracy += accuracy_i / len(results)
        recall_i = num_corrects / num_actual_results
        mean_recall += recall_i
    mean_average_accuracy = mean_average_accuracy / num_queries
    mean_recall = mean_recall / num_queries
    print(mean_average_accuracy, mean_recall)
    # return a cost to minimize: 3 - precision - recall - F1
    return 3 - mean_average_accuracy - mean_recall - (
        2 * mean_average_accuracy * mean_recall /
        (mean_average_accuracy + mean_recall))
import logging
import sys

from search import Searcher
from flask import Flask, request, render_template

logging.basicConfig(filename="runtime.log", level=logging.INFO)
root = logging.getLogger()
root.setLevel(logging.DEBUG)
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.DEBUG)
# formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s',
                              datefmt="%Y-%m-%d %H:%M:%S")
handler.setFormatter(formatter)
root.addHandler(handler)

searcher = Searcher()
app = Flask(__name__)


@app.route('/')
def hello():
    return render_template("main.html")


@app.route('/search', methods=['POST'])
def query():
    method = request.form.get("method")
    query = request.form.get("query")
    matches = searcher[method].search(query)
    return render_template("main.html", matches=matches)


if __name__ == '__main__':
    app.run(host="0.0.0.0", port=5000)
def api_search():
    query = request.args['query']
    field = request.args['field']
    searcher = Searcher()
    result = searcher.search(query, field)
    return jsonify(result)
ap = argparse.ArgumentParser()
ap.add_argument("-q", "--query", required=True, help="Path to input image")
arg = vars(ap.parse_args())

f = open('dictionary.txt', 'r')
dataset = cPickle.loads(f.read())

queryImage = cv2.imread(arg["query"])
cv2.imshow("QueryImage", queryImage)
print "Query :: %s" % (arg["query"][arg["query"].rfind('/') + 1:])

rgbHist = RGBHist([8, 8, 8])
queryHist = rgbHist.getHist(queryImage)
searcher = Searcher(dataset)
results = searcher.search(queryHist)

# two montages of five 150x400 results each
set1 = np.zeros((150 * 5, 400, 3), dtype='uint8')
set2 = np.zeros((150 * 5, 400, 3), dtype='uint8')
for i in xrange(0, 10):
    (fileName, dist) = results[i]
    print "Result %d :: %s, Score :: %f" % (i, fileName, dist)
    path = './dataset/' + fileName
    image = cv2.imread(path)
    if i < 5:
        set1[150 * i:150 * (i + 1), :, :] = image
    else:
        set2[150 * (i - 5):150 * (i - 4), :, :] = image
def home():
    searcher = Searcher()
    movies, tvs = searcher.default_display()
    return render_template("index.html", movie_videos=movies["videos"],
                           tv_videos=tvs["videos"])
'''from preprocessing import Preprocessor
preprocessor = Preprocessor(word_tokenize=True, remove_stopword=False,
                            extract_entity=True, num_query=2)
print(preprocessor.transform("Hom qua em den truong tai Ha Noi, me dat tay tung buoc den Sai Gon"))
print(preprocessor.entities)'''
from os import listdir
from search import Searcher

searcher = Searcher()
fields = {'id': False, 'title': True, 'content': True, 'out': False}
docs = []
for fname in listdir('./data/folders/1001 bí ẩn/'):
    item = {
        'id': len(docs) + 1,
        'title': fname[:-4],
        'content': open('./data/folders/1001 bí ẩn/' + fname).read(),
        'out': fname[:-4]
    }
    docs.append(item)
searcher.set_fields(fields)
searcher.fit(docs[:10])
for i in range(10):
    print(docs[i]['title'])
i = 10
while True:
    s = input('Already to test: ')
    if s == 'add':
        print(docs[i]['title'])
        searcher.add_document(docs[i])
import lucene
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.util import Version
from org.apache.lucene.analysis.miscellaneous import PerFieldAnalyzerWrapper
from java.util import HashMap
from lxml import etree
from search import Searcher
from index import Indexer, CustomAnalyzer

INDEX_DIR = 'index'
# DATA_DIR = 'data/dblp.xml'
DATA_DIR = 'data/dblp_small.xml'

if __name__ == "__main__":
    # user inputs
    topN = 10
    lucene.initVM()
    # index documents
    config = {'lowercase': True, 'stemming': True, 'stopwords': True}
    title_analyzer = CustomAnalyzer(config)
    per_field = HashMap()
    per_field.put("title", title_analyzer)
    analyzer = PerFieldAnalyzerWrapper(StandardAnalyzer(Version.LUCENE_CURRENT), per_field)
    Indexer(DATA_DIR, INDEX_DIR, config, analyzer)

    searcher = Searcher(INDEX_DIR, analyzer)
    # q = raw_input("Query: ")
    # searcher.search(q, N=topN)
    searcher.run(topN)
import argparse
import cPickle
import cv2

# Construct the argument parser
ap = argparse.ArgumentParser()
ap.add_argument("-d", "--dataset", required=True,
                help="Path to indexed image dataset")
ap.add_argument("-i", "--index", required=True, help="Path to index file")
args = vars(ap.parse_args())

# Load the index and initialize our searcher
index = cPickle.loads(open(args["index"]).read())
searcher = Searcher(index)

# loop over images in the index -- we will use each one as a query image
for (query, queryfeatures) in index.items():
    # perform the search using the current query
    results = searcher.search(queryfeatures)
    # load the query image and display it
    path = args["dataset"] + "/%s" % (query)
    queryImage = cv2.imread(path)
    cv2.imshow("Query", queryImage)
    print "query: %s" % (query)
    # initialize the 2 montages to display our results --
    # we have a total of 25 images in the index, but let's only
class ResultsGenerator:

    def __init__(self, index_dir):
        self.searcher = Searcher(index_dir)

    def get_id_section(self, request):
        idList = list()
        for i in range(len(request)):
            hitDoc = self.searcher.searcher.doc(request[i].doc)
            idList.append(hitDoc.get("id_section"))
        return idList

    def process(self, input_file, index_dir, output_dir):
        output_file_1 = open(output_dir + "/results_BM25_1.txt", 'a+', encoding="utf-8")
        output_file_2 = open(output_dir + "/results_BM25_2.txt", 'a+', encoding="utf-8")
        output_file_3 = open(output_dir + "/results_BM25_3.txt", 'a+', encoding="utf-8")
        output_file_4 = open(output_dir + "/results_BM25_4.txt", 'a+', encoding="utf-8")
        output_file_5 = open(output_dir + "/results_VSM_1.txt", 'a+', encoding="utf-8")
        output_file_6 = open(output_dir + "/results_VSM_2.txt", 'a+', encoding="utf-8")
        output_file_7 = open(output_dir + "/results_VSM_3.txt", 'a+', encoding="utf-8")
        output_file_8 = open(output_dir + "/results_VSM_4.txt", 'a+', encoding="utf-8")
        num_lines = 0
        with open(input_file, encoding="utf-8") as json_file:
            data = json.load(json_file)
            for p in data['data']:
                for par in p['paragraphs']:
                    for q in par["qas"]:
                        num_lines += 1
        with tqdm(total=num_lines) as pbar:
            with open(input_file, encoding="utf-8") as json_file:
                data = json.load(json_file)
                for p in data['data']:
                    title = p["title"]
                    for par in p['paragraphs']:
                        for q in par["qas"]:
                            pbar.update(1)
                            if q["is_impossible"] is False:
                                # run every query variant: simple / pair /
                                # multi-field / multi-field pair, each with
                                # BM25 and classic TF-IDF similarity
                                question_content_s_BM25 = self.searcher.simpleSearch(
                                    q["question"], BM25Similarity())
                                id_question_content_s_BM25 = self.get_id_section(
                                    question_content_s_BM25)
                                question_title_content_s_BM25 = self.searcher.pairSearch(
                                    [title, q["question"]], BM25Similarity())
                                id_question_title_content_s_BM25 = self.get_id_section(
                                    question_title_content_s_BM25)
                                question_content_m_BM25 = self.searcher.multiFieldsSearch(
                                    q["question"], BM25Similarity())
                                id_question_content_m_BM25 = self.get_id_section(
                                    question_content_m_BM25)
                                question_title_content_m_BM25 = self.searcher.multiFieldsPairSearch(
                                    [title, q["question"]], BM25Similarity())
                                id_question_title_content_m_BM25 = self.get_id_section(
                                    question_title_content_m_BM25)
                                question_content_s_TDF = self.searcher.simpleSearch(
                                    q["question"], ClassicSimilarity())
                                id_question_content_s_TDF = self.get_id_section(
                                    question_content_s_TDF)
                                question_title_content_s_TDF = self.searcher.pairSearch(
                                    [title, q["question"]], ClassicSimilarity())
                                id_question_title_content_s_TDF = self.get_id_section(
                                    question_title_content_s_TDF)
                                question_content_m_TDF = self.searcher.multiFieldsSearch(
                                    q["question"], ClassicSimilarity())
                                id_question_content_m_TDF = self.get_id_section(
                                    question_content_m_TDF)
                                question_title_content_m_TDF = self.searcher.multiFieldsPairSearch(
                                    [title, q["question"]], ClassicSimilarity())
                                id_question_title_content_m_TDF = self.get_id_section(
                                    question_title_content_m_TDF)
                                # write each ranked list in TREC run format
                                for i in range(len(question_content_s_BM25)):
                                    output_file_1.write(
                                        q["id"] + " Q0 " + str(id_question_content_s_BM25[i]) +
                                        " " + str(i + 1) + " " +
                                        str(question_content_s_BM25[i].score) + " STANDARD\n")
                                for i in range(len(question_title_content_s_BM25)):
                                    output_file_2.write(
                                        q["id"] + " Q0 " + str(id_question_title_content_s_BM25[i]) +
                                        " " + str(i + 1) + " " +
                                        str(question_title_content_s_BM25[i].score) + " STANDARD\n")
                                for i in range(len(question_content_m_BM25)):
                                    output_file_3.write(
                                        q["id"] + " Q0 " + str(id_question_content_m_BM25[i]) +
                                        " " + str(i + 1) + " " +
                                        str(question_content_m_BM25[i].score) + " STANDARD\n")
                                for i in range(len(question_title_content_m_BM25)):
                                    output_file_4.write(
                                        q["id"] + " Q0 " + str(id_question_title_content_m_BM25[i]) +
                                        " " + str(i + 1) + " " +
                                        str(question_title_content_m_BM25[i].score) + " STANDARD\n")
                                for i in range(len(question_content_s_TDF)):
                                    output_file_5.write(
                                        q["id"] + " Q0 " + str(id_question_content_s_TDF[i]) +
                                        " " + str(i + 1) + " " +
                                        str(question_content_s_TDF[i].score) + " STANDARD\n")
                                for i in range(len(question_title_content_s_TDF)):
                                    output_file_6.write(
                                        q["id"] + " Q0 " + str(id_question_title_content_s_TDF[i]) +
                                        " " + str(i + 1) + " " +
                                        str(question_title_content_s_TDF[i].score) + " STANDARD\n")
                                for i in range(len(question_content_m_TDF)):
                                    output_file_7.write(
                                        q["id"] + " Q0 " + str(id_question_content_m_TDF[i]) +
                                        " " + str(i + 1) + " " +
                                        str(question_content_m_TDF[i].score) + " STANDARD\n")
                                for i in range(len(question_title_content_m_TDF)):
                                    output_file_8.write(
                                        q["id"] + " Q0 " + str(id_question_title_content_m_TDF[i]) +
                                        " " + str(i + 1) + " " +
                                        str(question_title_content_m_TDF[i].score) + " STANDARD\n")
        print("==> Results successfully created.\n")
class ImageSearcher():
    '''Image searcher API for the clothes retrieval web demo.'''

    def __init__(self):
        root_path = os.path.dirname(__file__)
        inds_path = os.path.abspath(os.path.join(root_path, 'db/index'))
        feature_path = os.path.abspath(os.path.join(root_path, 'db/feature.npy'))
        self.searcher = Searcher(inds_path, feature_path)
        self.local_features = np.load('db/local_features.npy')

    def search(self, image_path, do_detection=1, k=10):
        t1 = Timer()
        t1.tic()
        queryFeatures = descriptor.get_descriptor(image_path)
        t1.toc('Feature Extraction time: ')
        t2 = Timer()
        t2.tic()
        results, dists, ind = self.searcher.search(queryFeatures, k=5 * k)
        #self.reranking(queryFeatures, results, dists, ind, 0.6)
        #self.queryExpansion2(results, dists, ind)
        #self.queryExpansion(queryFeatures, results, dists, ind, top=3)
        t2.toc('Knn search time: ')
        result = []
        dist = []
        for j, imageName in enumerate(results):
            if imageName not in result:
                result.append(imageName)
                dist.append(dists[j])
        return result[:k], dist[:k]

    def reranking(self, queryFeatures, results, dists, ind, rerank_thresh=0.7):
        features = self.local_features
        feature = []
        flag = 0
        dist = 0
        res = []
        for i, index in enumerate(ind):
            if dists[i] < rerank_thresh:
                flag += 1
            else:
                if dist == 0:
                    dist = dists[i - 1]
                feature.append(features[index])
                res.append(results[i])
        if len(feature) < 3:
            return
        feature = np.array(feature).copy()
        result, new_ind = self.searcher.research(res, queryFeatures, feature, 3)
        for j, r in enumerate(result):
            results.insert(flag + j, r)
            dists.insert(flag + j, dist)

    def queryExpansion2(self, results, dists, ind, threshold=0.3, k=10, top=3):
        features = self.searcher.features
        for i in xrange(top):
            query = features[ind[i]]
            if dists[i] > threshold:
                break
            new_result, new_dist, new_ind = self.searcher.search(query, k=k)
            for j, dist in enumerate(new_dist):
                if dist > threshold:
                    break
                for m, d in enumerate(dists[i:]):
                    if dist < d:
                        results.insert(i + m, new_result[j])
                        dists.insert(i + m, dist)
                        break

    def queryExpansion(self, queryFeatures, results, dists, ind,
                       threshold=0.8, k=10, top=5):
        """Do query expansion with at most `top` nearest neighbours."""
        features = self.searcher.features
        feature = []
        #feature.append(queryFeatures)
        for i, dist in enumerate(dists):
            #if dist < threshold and i < top:
            if i < top:
                feature.append(features[ind[i]])
        if len(feature) == 0:
            return 0
        query = np.mean(np.array(feature), axis=0)
        new_results, new_dists, new_ind = self.searcher.search(query, k=k)
        for i, dist in enumerate(new_dists):
            if dist > dists[-1]:
                break
            for j, d in enumerate(dists):
                if dist < d:
                    results.insert(j, new_results[i])
                    dists.insert(j, dist)
                    break
import urllib
import json

app = Flask(__name__, static_url_path='')


@app.route("/")
def main():
    return send_from_directory('static', 'search.html')


@app.route("/q/<input>")
@crossdomain(origin='*')
def search(input):
    print urllib.unquote_plus(input).encode('utf-8')
    doclist = searcher.search(input)
    result = []
    for doc in doclist:
        result.append(str(doc.id) + ", " + str(doc.name) + "<br>" + doc.text)
    return json.dumps(result)


index = Indexer("docs.txt")
searcher = Searcher(index)

if __name__ == "__main__":
    app.run(host='127.0.0.1', port=8282, debug=True)
def __init__(self):
    root_path = os.path.dirname(__file__)
    inds_path = os.path.abspath(os.path.join(root_path, 'db/index'))
    feature_path = os.path.abspath(os.path.join(root_path, 'db/feature.npy'))
    self.searcher = Searcher(inds_path, feature_path)
    self.local_features = np.load('db/local_features.npy')
class ResultsGenerator:

    def __init__(self, index_dir):
        self.searcher = Searcher(index_dir)

    def get_id_section(self, request):
        idList = list()
        for i in range(len(request)):
            hitDoc = self.searcher.searcher.doc(request[i].doc)
            idList.append(hitDoc.get("id_section"))
        return idList

    def process(self, input_file, index_dir, output_dir):
        output_file_1 = open(output_dir + "/results.txt", 'a+', encoding="utf-8")
        num_lines = 0
        with open(input_file, encoding="utf-8") as json_file:
            data = json.load(json_file)
            for p in data['data']:
                for par in p['paragraphs']:
                    for q in par["qas"]:
                        num_lines += 1
        model = create_model()
        model.load_weights("5e-5 0.1.h5")
        with tqdm(total=num_lines) as pbar:
            with open(input_file, encoding="utf-8") as json_file:
                data = json.load(json_file)
                for p in data['data']:
                    for par in p['paragraphs']:
                        for q in par["qas"]:
                            pbar.update(1)
                            if q["is_impossible"] is False:
                                result = self.searcher.simpleSearch(q["question"],
                                                                    BM25Similarity())
                                ids = []
                                if result == []:
                                    output_file_1.write('"' + str(q['id']) + '": "",\n')
                                    continue
                                tab = [q["question"]]
                                for i in range(len(result)):
                                    hitDoc = self.searcher.searcher.doc(result[i].doc)
                                    content = hitDoc.get("content_section")
                                    tab.append(str(content))
                                    ids.append(hitDoc.get("id_section"))
                                inputs = []
                                for i in range(1, len(tab)):
                                    inputs.append([tab[0], tab[i]])
                                # tokenization
                                squad_examples = []
                                for i in inputs:
                                    context = i[1]
                                    squad_eg = Example(q["question"], context)
                                    squad_eg.preprocess()
                                    squad_examples.append(squad_eg)
                                dataset_dict = {"input_ids": [],
                                                "token_type_ids": [],
                                                "attention_mask": []}
                                for item in squad_examples:
                                    if item.skip == False:
                                        for key in dataset_dict:
                                            dataset_dict[key].append(getattr(item, key))
                                for key in dataset_dict:
                                    dataset_dict[key] = np.array(dataset_dict[key])
                                x = [dataset_dict["input_ids"],
                                     dataset_dict["token_type_ids"],
                                     dataset_dict["attention_mask"]]
                                y_pred = model.predict(x)
                                sorted_indexes = sorted(range(len(y_pred)),
                                                        key=lambda k: y_pred[k],
                                                        reverse=True)
                                r = 1
                                for i in sorted_indexes:
                                    output_file_1.write(
                                        q["id"] + " Q0 " + str(ids[i]) + " " +
                                        str(r) + " " + str(y_pred[i][0]) +
                                        " STANDARD\n")
                                    r += 1
        print("==> Results successfully created.\n")
                                              output_attentions=False,
                                              output_hidden_states=False)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
modelClassifier.to(device)
modelClassifier.load_state_dict(
    torch.load('/Users/younesagabi/Desktop/YouTaQA/DeepLearning/Classifier/Models/BERT_ft_epoch10.model',
               map_location=torch.device(device)),
    strict=False)
modelExtractor = BertForQuestionAnswering.from_pretrained(
    'bert-large-uncased-whole-word-masking-finetuned-squad')

txt_file = open(r"txt_file.txt", "w+")
tsv_file = open("test.tsv")
read_tsv = csv.reader(tsv_file, delimiter="\t")
searchObject = Searcher("/Users/younesagabi/Desktop/YouTaQA/IR/index_wiki_v7.0")
Similarity = Similarity()
for row in read_tsv:
    inputQuery = row[0]
    result = searchObject.multiFieldsSearch(inputQuery, BM25Similarity())
    content = ""
    list = ['']
    list.append(inputQuery)
    list.pop(0)
    j = 0
    for i in range(len(result)):
        hitDoc = searchObject.searcher.doc(result[i].doc)
        score = result[i].score
from generator import Generator
from search import Searcher
from connectionist import Connectionist
from vertex import Vertex

n_vertices = 10  # number of elements/nodes
g = Generator(n_vertices)
searcher = Searcher()
connector = Connectionist()

n = 20  # number of runs
for i in range(n):
    g.generate()
    belief_network = g.get_belief_network()
    neural_network = g.get_neural_network()

    coherence, (true, false) = searcher.run(belief_network)
    print 'coherence search:', coherence
    print 'accepted propositions:', sorted(true, key=lambda v: v.n)
    print 'rejected propositions:', sorted(false, key=lambda v: v.n)
    print '-----------------------------------------------'

    activations, harmony = connector.run(neural_network)
    print 'harmony', harmony
    true = []
    false = []
    for i, a in enumerate(activations):
        if a == 1:
            true.append(Vertex(i))
        else:
            false.append(Vertex(i))
def main(_):
    # Import config
    import yaml

    # Configure logger
    logger = logging.getLogger(__name__)
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)

    # Load NLP libraries
    logger.info('Loading NLP library')
    import spacy
    from nltk.corpus import stopwords
    nlp = spacy.load('en')
    STOP_WORDS = set(stopwords.words('english'))

    # Parse the search phrase
    search_input = FLAGS.search_phrase
    search_phrase = nlp(' '.join([word for word in search_input.split(' ')
                                  if word not in STOP_WORDS]))
    logger.info('Search phrase: "%s"' % search_phrase.text)
    results = []

    # Required for the model
    g = tf.Graph()
    with g.as_default():
        model = inference_wrapper.InferenceWrapper()
        restore_fn = model.build_graph_from_config(configuration.ModelConfig(),
                                                   FLAGS.model_file)
    g.finalize()
    vocab = vocabulary.Vocabulary(FLAGS.vocab_file)

    # Find files to search
    search_dir = (FLAGS.base_dir if FLAGS.base_dir is not None
                  else os.path.dirname(os.path.abspath(__file__)))
    files = Searcher.search_from_dir(search_dir)
    num_files = len(files)
    logger.info('%d file(s) found' % num_files)

    with tf.Session(graph=g) as sess:
        # Load the model from the checkpoint and instantiate the caption generator.
        restore_fn(sess)
        generator = caption_generator.CaptionGenerator(model, vocab)
        # Caption the files
        count = 0
        for file_path in files:
            count += 1.
            try:
                with tf.gfile.GFile(file_path, "r") as f:
                    image = f.read()
                captions = generator.beam_search(sess, image)
                logger.info("Captioning image %f: %s" % (count / num_files, file_path))
                best_caption = captions[0]  # just take the most probable caption
                sentence = nlp(" ".join([vocab.id_to_word(word)
                                         for word in best_caption.sentence[1:-1]
                                         if word not in STOP_WORDS]))
                results.append((file_path, sentence.text,
                                search_phrase.similarity(sentence)))
            except Exception as e:
                logger.warning('Failed to caption image: %s' % file_path)

    render_results(search_phrase.text,
                   sorted(results, key=lambda x: x[2], reverse=True))
    webbrowser.open('output.html', new=2)
from java.util import HashMap

from search import Searcher
from index import CustomAnalyzer
from utils import check_config

CONFIG_DIR = 'config.json'
INDEX_DIR = 'index'
DATA_DIR = 'data/dblp.xml'

# run search on the command line
# see ui_search.py to use the search via the web UI
if __name__ == "__main__":
    with open(CONFIG_DIR) as f:
        config = json.load(f)
    config = check_config(config)

    lucene.initVM()  # start the JVM for Lucene

    # index documents
    # use a different analyzer for the title field
    title_analyzer = CustomAnalyzer(config['titleAnalyzer'])
    per_field = HashMap()
    per_field.put("title", title_analyzer)
    analyzer = PerFieldAnalyzerWrapper(StandardAnalyzer(Version.LUCENE_CURRENT), per_field)

    searcher = Searcher(INDEX_DIR, analyzer)
    # q = raw_input("Query: ")
    # searcher.search(q, N=config['topN'])
    searcher.run(config['topN'])
from cspProblemDefine import CSP, Constraint, ne_, is_
from operator import lt, ne, eq, gt
from search import Search_from_CSP, Searcher


def meet_at(p1, p2):
    """returns a function that is true when the words meet at the positions p1, p2"""
    def meets(w1, w2):
        return w1[p1] == w2[p2]
    meets.__name__ = "meet_at(" + str(p1) + ',' + str(p2) + ')'
    return meets


crossword1 = CSP({'one_across': {'ant', 'bus', 'car', 'has'},
                  'one_down': {'buys', 'hold', 'lane', 'year'},
                  'three_across': {'buys', 'hold', 'lane', 'year'},
                  'two_down': {'search', 'syntax'},
                  'four_across': {'ant', 'bus', 'car', 'has'}},
                 [Constraint(('one_across', 'one_down'), meet_at(0, 0)),
                  Constraint(('one_down', 'three_across'), meet_at(2, 0)),
                  Constraint(('one_across', 'two_down'), meet_at(2, 0)),
                  Constraint(('three_across', 'two_down'), meet_at(2, 2)),
                  Constraint(('four_across', 'two_down'), meet_at(0, 4))])

searcher3 = Searcher(Search_from_CSP(crossword1))
print('The first solution searched is:')
print(searcher3.search())
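# A small illustration of the meet_at constraint factory defined above:
# meet_at(0, 2) is satisfied exactly when the first word's letter 0 equals
# the second word's letter 2.
crosses = meet_at(0, 2)
print(crosses('ant', 'una'))     # True:  'a' == 'a'
print(crosses('bus', 'syntax'))  # False: 'b' != 'n'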
def __init__(self):
    root_path = os.path.dirname(__file__)
    tree_path = os.path.abspath(os.path.join(root_path, 'db/tree'))
    inds_path = os.path.abspath(os.path.join(root_path, 'db/index'))
    feature_path = os.path.abspath(os.path.join(root_path, 'db/feature.npy'))
    self.searcher = Searcher(tree_path, inds_path, feature_path)
def __init__(self, index_dir, analyzer):
    self.searcher = Searcher(index_dir, analyzer)