def GET(self):
    """Search endpoint: a long keyword ending in an image extension is
    treated as an image URL and routed through LSH image search; anything
    else goes through plain text search.

    Fixes an operator-precedence bug in the original suffix test —
    ``a and b or c`` parses as ``(a and b) or c``, so the 'jpg' check
    bypassed the length guard — and ensures ``lis`` is always defined.
    """
    user_data = web.input()
    message = user_data.keyword
    # Shared Lucene setup for both branches (was duplicated).
    vm_env.attachCurrentThread()
    STORE_DIR = 'index'
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    if len(message) > 10 and message.endswith(('png', 'jpg')):
        # Image search: download the image, hash it, look up neighbours.
        urlretrieve(message, 'target.jpg')
        lis1 = shit.LSH('target.jpg')
        lis = [run(searcher, analyzer, key)[0] for key in lis1]
    else:
        # Text search.
        a = func(user_data.keyword)
        lis = run(searcher, analyzer, a)
    f = login
    return render.movies(f, lis)
def GET(self, name):
    """Render the comments page for shop *name*: query the good/bad review
    indexes and collect comment strings of length >= 50."""
    STORE_DIR_GOOD = "index_good"
    STORE_DIR_BAD = "index_bad"
    vm_env.attachCurrentThread()  # attach this web-worker thread to the JVM
    directory_good = SimpleFSDirectory(File(STORE_DIR_GOOD))
    searcher_good = IndexSearcher(DirectoryReader.open(directory_good))
    directory_bad = SimpleFSDirectory(File(STORE_DIR_BAD))
    searcher_bad = IndexSearcher(DirectoryReader.open(directory_bad))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    user_data = web.input(name=None)
    command = yourInput(user_data)  # NOTE(review): result unused here — side effects only? verify
    if user_data.brand == '':
        user_data.brand = '1'  # default brand bucket when none selected
    res = Run_Score(searcher_good, searcher_bad, analyzer, name, user_data.brand)
    comments = []
    for i in range(len(res)):
        # A 9-field row carries its comment payload in slot 8; shorter rows have none.
        if len(res[i]) == 9:
            t = res[i][8]
        else:
            t = ''
        for j in range(len(t)):
            s = t[j]
            s.encode("utf8")  # NOTE(review): return value discarded — no-op unless it raises; confirm intent
            if len(s) >= 50:
                comments.append(s)
    return render.comments(comments)
def __init__(self, index_path, field, similarity="boolean", use_relevance_feedback=False, feedback_index_path=None):
    """Open the index at *index_path* and build a query parser over *field*,
    configuring the searcher with the requested scoring model.

    Unknown similarity names fall back to BM25(1.2, 0.2).
    """
    self.reader = DirectoryReader.open(FSDirectory.open(Paths.get(index_path)))
    self.searcher = IndexSearcher(self.reader)
    if use_relevance_feedback and feedback_index_path is not None:
        # Secondary index used for relevance-feedback lookups.
        self.feedback_reader = DirectoryReader.open(FSDirectory.open(Paths.get(feedback_index_path)))
        self.feedback_searcher = IndexSearcher(self.feedback_reader)
    self.similarity = similarity
    self.stopwords = stop_words()
    # Pick the scoring model once, then install it.
    if similarity == "boolean":
        chosen = BooleanSimilarity()
    elif similarity == "tf":
        chosen = TFSimilarity()
    elif similarity == "tfidf":
        chosen = ClassicSimilarity()
    elif similarity == "BM25":
        chosen = BM25Similarity(1.2, 0.2)
    else:
        print("Unknown similarity, so we use BM25(1.2, 0.2) as default")
        chosen = BM25Similarity(1.2, 0.2)
    self.searcher.setSimilarity(chosen)
    analyzer = StandardAnalyzer()
    print(self.searcher.getSimilarity())
    self.parser = QueryParser(field, analyzer)
def open_reader(self): """Open IndexReader.""" if self.reader is None: if self.use_ram: print "reading from ram directory ..." self.reader = DirectoryReader.open(self.ram_dir) else: self.reader = DirectoryReader.open(self.dir)
def __init__(self):
    # Boot the JVM with a large heap; headless because no GUI is needed.
    self.env = lucene.initVM(initialheap='28g', maxheap='28g', vmargs=['-Djava.awt.headless=true'])
    self.vocab = None
    BooleanQuery.setMaxClauseCount(2048)  # allow large OR queries
    # --- main document index: build it if it does not exist yet ---------
    if not os.path.exists(prm.index_folder):
        print 'Creating index at', prm.index_folder
        if prm.docs_path == prm.docs_path_term:
            add_terms = True
        else:
            add_terms = False
        self.create_index(prm.index_folder, prm.docs_path, add_terms)
    # Optionally copy the index to fast local storage before opening it.
    if prm.local_index_folder:
        print 'copying index from', prm.index_folder, 'to', prm.local_index_folder
        if os.path.exists(prm.local_index_folder):
            print 'Folder', prm.local_index_folder, 'already exists! Doing nothing.'
        else:
            shutil.copytree(prm.index_folder, prm.local_index_folder)
        self.index_folder = prm.local_index_folder
    else:
        self.index_folder = prm.index_folder
    # NOTE(review): this opens prm.index_folder, not self.index_folder — the
    # local copy above is never the one opened; confirm whether intended.
    fsDir = MMapDirectory(Paths.get(prm.index_folder))
    self.searcher = IndexSearcher(DirectoryReader.open(fsDir))
    # --- term index: only when it differs from the main document set ----
    if prm.docs_path != prm.docs_path_term:
        if not os.path.exists(prm.index_folder_term):
            print 'Creating index at', prm.index_folder_term
            self.create_index(prm.index_folder_term, prm.docs_path_term, add_terms=True)
        if prm.local_index_folder_term:
            print 'copying index from', prm.index_folder_term, 'to', prm.local_index_folder_term
            if os.path.exists(prm.local_index_folder_term):
                print 'Folder', prm.local_index_folder_term, 'already exists! Doing nothing.'
            else:
                shutil.copytree(prm.index_folder_term, prm.local_index_folder_term)
            self.index_folder_term = prm.local_index_folder_term
        else:
            self.index_folder_term = prm.index_folder_term
        fsDir_term = MMapDirectory(Paths.get(prm.index_folder_term))
        self.searcher_term = IndexSearcher(DirectoryReader.open(fsDir_term))
    self.analyzer = StandardAnalyzer()
    self.pool = ThreadPool(processes=prm.n_threads)
    self.cache = {}  # per-instance query cache
    print 'Loading Title-ID mapping...'
    self.title_id_map, self.id_title_map = self.get_title_id_map()
def __init__(self, fs_directory):
    """Open the index under *fs_directory* and prepare searcher, analyzer,
    spell-check dictionary and highlight formatter.

    Fixes two redundancies in the original: ``self.analyzer`` was assigned
    twice, and a second DirectoryReader was opened just for the searcher.
    """
    directory = SimpleFSDirectory(Paths.get(fs_directory))
    self.index_reader = DirectoryReader.open(directory)
    # Reuse the single reader for the searcher instead of opening another.
    self.searcher = IndexSearcher(self.index_reader)
    self.analyzer = StandardAnalyzer()
    self.query = None
    self.lucene_dictionary = LuceneDictionary(self.index_reader, 'contents')
    self.formatter = SimpleHTMLFormatter()
    self.hits = None
def GET(self, name):
    """Query the good/bad review indexes for *name* and render the
    search-result page for the requested shop/brand."""
    vm_env.attachCurrentThread()
    # One searcher per polarity index.
    searcher_good = IndexSearcher(DirectoryReader.open(SimpleFSDirectory(File("index_good"))))
    searcher_bad = IndexSearcher(DirectoryReader.open(SimpleFSDirectory(File("index_bad"))))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    user_data = web.input(name=None)
    command = yourInput(user_data.shop)
    res = Run_GoodRate(searcher_good, searcher_bad, analyzer, command, user_data.brand)
    res.append(command)  # the template expects the query appended last
    return render.SearchResult(res)
def query(self, data):
    """Search the index for ``data['query']``.

    Returns a dict ``{'totalHits': int, 'hits': {id: record}}`` where each
    record maps stored field name -> value plus the hit ``'score'``.
    Returns None when the index does not exist yet.

    Fix: the reader is now released in a ``finally`` block, so it is no
    longer leaked when parsing or searching raises.
    """
    if not self.fil.exists():
        return None
    searcher = IndexSearcher(DirectoryReader.open(self.d))
    try:
        query = QueryParser(Version.LUCENE_30, "id", self.analyzer).parse(data['query'])
        hits = searcher.search(query, 100000)
        results = {'totalHits': hits.totalHits, 'hits': {}}
        for hit in hits.scoreDocs:
            record = {'score': hit.score}
            doc = searcher.doc(hit.doc)
            for field in doc.getFields():
                # 'id' is the result key, not part of the record payload.
                if field.name() != "id":
                    record[field.name()] = field.stringValue()
            results['hits'][doc.get('id')] = record
        return results
    finally:
        searcher.getIndexReader().close()
def search(self, field, text):
    """
    search text within indexed data

    input:  field   fieldname of the value that will be indexed
            text    text to search

    output: list of (score, url, title) tuples for the top hits
    """
    results = []
    idx_reader = DirectoryReader.open(self.directory)
    idx_searcher = IndexSearcher(idx_reader)
    # parse query against the requested field
    parser = AnalyzingQueryParser(Version.LUCENE_CURRENT, field, self.analyser)
    query = parser.parse(text)
    # search top 1000; tolist() converts the ScoreDoc JArray to a Python list
    hits = idx_searcher.search(query, 1000).scoreDocs.tolist()
    for hit in hits:
        doc = idx_searcher.doc(hit.doc)
        score = hit.score
        title = doc.get(field)
        url = doc.get("url")
        results.append((score, url, title))
    return results
def search_img(output):
    """Look up each image number in *output* in the web-page index and
    collect its metadata row.

    The first element of the returned list is an empty placeholder list,
    matching the original contract.
    """
    directory = SimpleFSDirectory(File("WebPageIndex"))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    field_names = ("album", "subalbum", "singer", "url", "reviews", "imgurl", "imgnum")
    results = [[]]
    for num in output:
        query = QueryParser(Version.LUCENE_CURRENT, "imgnum", analyzer).parse(str(num))
        for scoreDoc in searcher.search(query, 1).scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            results.append([doc.get(name) for name in field_names])
    del searcher
    return results
def __init__(self, indexDir):
    """Open *indexDir* and build AND-default query parsers for the
    'name' and 'id' fields."""
    self.directory = SimpleFSDirectory(Paths.get(indexDir))
    self.searcher = IndexSearcher(DirectoryReader.open(self.directory))
    # Both parsers are configured identically, just over different fields.
    for attr, field in (('nameQueryParser', 'name'), ('idQueryParser', 'id')):
        parser = QueryParser(field, StandardAnalyzer())
        parser.setDefaultOperator(QueryParser.Operator.AND)
        setattr(self, attr, parser)
def retrieve_sents(self):
    """Run ``self.query`` against the sentence index and return the Lucene
    document ids of the top 50 matches."""
    fsDir = SimpleFSDirectory(Paths.get(self.indexDir))
    searcher = IndexSearcher(DirectoryReader.open(fsDir))
    parser = QueryParser("contents", StandardAnalyzer())
    parser.setDefaultOperator(QueryParser.Operator.OR)
    query = parser.parse(self.query)
    start = datetime.now()
    scoreDocs = searcher.search(query, 50).scoreDocs
    duration = datetime.now() - start
    if self.stats:
        print >> sys.stderr, "Found %d sentences (in %s) that matched query '%s':" % (
            len(scoreDocs), duration, query)
    return [scoreDoc.doc for scoreDoc in scoreDocs]
def SearchImgCommand(command):
    """Segment *command* with jieba, search the JD index, and return up to
    50 hits de-duplicated by item URL.

    Each result is a dict with keys imgurl/title/itemurl/score.
    Improvements: the unused ``enumerate`` index is gone, and duplicate
    detection uses a set (O(1)) instead of scanning a list (O(n)).
    """
    initvm.vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File("jdindex"))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    contentCommand = ' '.join(jieba.cut(command))
    query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(contentCommand)
    scoreDocs = searcher.search(query, 50).scoreDocs
    seen_urls = set()
    finalDocs = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        itemurl = doc.get("itemurl")
        if itemurl in seen_urls:
            continue
        seen_urls.add(itemurl)
        finalDocs.append({
            'imgurl': doc.get("imgurl"),
            'title': doc.get("title").strip('\n'),
            'itemurl': itemurl,
            'score': scoreDoc.score,
        })
    return finalDocs
def search_index(indexfile, querytext, top=10, qe=False, default_field="text", display_fields=("subreddit", "author", "text")):
    """Search *indexfile* for *querytext* and print the top hits.

    With ``qe=True`` the user is asked which results were relevant and one
    round of Rocchio query expansion is run over that feedback.

    Fixes: the mutable-list default for ``display_fields`` is now a tuple,
    and the reader/directory are closed in a ``finally`` block so they are
    released even when parsing or user input raises.
    """
    lucene.initVM()
    lindex = SimpleFSDirectory(Paths.get(indexfile))
    ireader = DirectoryReader.open(lindex)
    try:
        isearcher = IndexSearcher(ireader)
        parser = QueryParser(default_field, StandardAnalyzer())
        query = parser.parse(querytext)
        hits = isearcher.search(query, top).scoreDocs
        docIDs = [hit.doc for hit in hits]
        print_results(isearcher, hits, display_fields)
        if len(hits) == 0:
            print("No hits!")
        elif qe:
            print("\n")
            print("Which documents were relevant to your search need? (Enter spaced list of result numbers [1-{}], e.g. 2 4 5)".format(top))
            relevantids = [docIDs[i - 1] for i in [int(x) for x in input().split()]]
            nonrelevantids = [id for id in docIDs if id not in relevantids]
            print("\n\n")
            qequerytext = queryexpansion.rocchio(ireader, querytext, relevantids, nonrelevantids)
            print("Expanded search query: '{}'\n".format(qequerytext))
            qehits = isearcher.search(parser.parse(qequerytext), top).scoreDocs
            print_results(isearcher, qehits, display_fields)
    finally:
        ireader.close()
        lindex.close()
def GET(self):
    """JSONP suggestion endpoint: return up to 20 distinct titles matching
    the 'command' request parameter."""
    command = web.input().command.encode('utf-8')
    initvm.vm_env.attachCurrentThread()
    searcher = IndexSearcher(DirectoryReader.open(SimpleFSDirectory(File("jdindex"))))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(command)
    titles = []
    for scoreDoc in searcher.search(query, 20).scoreDocs:
        title = searcher.doc(scoreDoc.doc).get("title").strip('\n')
        if title not in titles:  # keep first occurrence only
            titles.append(title)
    web.header('content-type', 'text/json')
    data = {'q': command, 'p': 'false', 's': titles}
    return 'fn(' + json.dumps(data) + ');'
def get_most_similar(self, sentence, do_log=False):
    """Return ``(best_hit, mate_id)`` for the BM25-most-similar indexed
    sentence, or ``(None, -1)`` when nothing matches."""
    # OR together one TermQuery per non-stopword token.
    builder = BooleanQuery.Builder()
    for token in sentence.split(' '):
        if token in sw:
            continue
        builder.add(BooleanClause(TermQuery(Term("pa", token)), BooleanClause.Occur.SHOULD))
    q = builder.build()
    reader = DirectoryReader.open(self.w)
    self.searcher = IndexSearcher(reader)
    self.searcher.setSimilarity(BM25Similarity(Config.k1, Config.b))
    hits = self.searcher.search(q, 2).scoreDocs
    if len(hits) == 0:
        return None, -1
    best = hits[0]
    mate = self.searcher.doc(best.doc).get("id")
    if do_log:
        print("found something. mate: ", mate, "- score : ", best.score)
    return best, int(mate)
def running(command):
    """Normalise *command* to unicode and run it against the local index."""
    text = unicode(command)
    store = SimpleFSDirectory(File("index"))
    searcher = IndexSearcher(DirectoryReader.open(store))
    analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
    return run(searcher, analyzer, text)
def __init__(self, db_path):
    """Open the Lucene index at *db_path* and keep a searcher + analyzer."""
    reader = DirectoryReader.open(SimpleFSDirectory(File(db_path)))
    self.searcher = IndexSearcher(reader)
    self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    logger.info("Loaded DB from %s with %d documents: ", db_path, reader.numDocs())
def run(command):
    """Search the company index for *command* and return ``(command, rows)``
    where each row is [[name1, name2], homepage, intro, logo, goods...]."""
    global vm_env
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    searcher = IndexSearcher(DirectoryReader.open(SimpleFSDirectory(File("index"))))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(command)
    res = []
    for scoreDoc in searcher.search(query, 10).scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        row = [
            [doc.get('name1'), doc.get('name2')],
            doc.get("homepage"),
            doc.get("intro"),
            doc.get('logo'),
        ]
        # Goods are stored newline-separated; flatten them onto the row.
        row.extend(doc.get('goods').split('\n'))
        res.append(row)
    return command, res
def buscar(indexDir, args,options = None): #lucene.initVM(vmargs=['-Djava.awt.headless=true']) fsDir = SimpleFSDirectory(File(indexDir)) #print fsDir #Criando buscador baseado no diretorio dos indices passados pelo usuario searcher = IndexSearcher(DirectoryReader.open(fsDir)) #Analizador para filtro dos tokens analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) #print analyzer #Criando um QueryParser usando por padrao contents #Variavel com as restricoes da busca parser = QueryParser(Version.LUCENE_CURRENT, "keywords", analyzer) #print parser parser.setDefaultOperator(QueryParser.Operator.AND) #print args #Juntando parametros passados com o valor do mesmo command = ' +'.join(args) #print command query = parser.parse(command) print query #Criando um JArray com resultado da consulta return searcher.search(query, 200).scoreDocs
def __init__(self, searchDir):
    """Prepare an IndexSearcher over the index stored at *searchDir*,
    analysed with the project's English analyzer."""
    stops = Indexer.ENGLISH_STOP_WORDS_SET
    self.analyzer = MyPythonEnglishAnalyzer(stopwords=stops)
    self.directory = FSDirectory.open(Paths.get(searchDir))
    self.reader = DirectoryReader.open(self.directory)
    self.searcher = IndexSearcher(self.reader)
def __init__(self, root, storedir, isindexing=False, isBM25=True):
    """(Re)build the index under *storedir* from *root* when *isindexing*,
    then open a searcher over it; BM25 scoring is used on both the write
    and read paths when *isBM25* is set."""
    if not os.path.exists(storedir):
        os.mkdir(storedir)
    # Cap tokens per document at ~1M so one huge file cannot blow up the index.
    self.analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)
    if isindexing:
        store = SimpleFSDirectory(Paths.get(storedir))
        config = IndexWriterConfig(self.analyzer)
        # TODO BM25 parameter tuning
        if isBM25:
            config.setSimilarity(BM25Similarity())
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)  # rebuild from scratch
        writer = IndexWriter(store, config)
        self.indexer(root, writer)
        ticker = Ticker()  # progress ticker printed while commit runs
        print('commit index')
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False  # signal the ticker thread to stop
        print('done')
    search_dir = SimpleFSDirectory(Paths.get(storedir))
    self.searcher = IndexSearcher(DirectoryReader.open(search_dir))
    # Query-time similarity must match the index-time choice.
    if isBM25:
        self.searcher.setSimilarity(BM25Similarity())
def search(self):
    '''
    Searches the given query in the index

    Interactive REPL: prompts on stdin until an empty line is entered,
    printing the top-50 documents matching each query on the 'title' field.
    '''
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    base_dir = '.'
    directory = SimpleFSDirectory(File(os.path.join(base_dir, self.index_dir)))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        if command == '':
            return
        print
        print "Searching for:", command
        query = QueryParser(Version.LUCENE_CURRENT, "title", analyzer).parse(command)
        scoreDocs = searcher.search(query, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print doc
def __init__(self, LUCENE_INDEX_DIR, similarity='BM25', lucene_vm_flag=False, is_bigram_cache_used=False, mongoObj=None):
    """Open a memory-mapped index and optionally attach MongoDB-backed
    bigram tf/cf and mapping-probability caches."""
    # Start the JVM unless the caller says it is already running.
    if lucene_vm_flag == False:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.lucene_vm_init = True
    self.index_dir = LUCENE_INDEX_DIR
    self.index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
    self.analyzer = SimpleAnalyzer()
    self.config = IndexWriterConfig(self.analyzer)
    self.reader = DirectoryReader.open(self.index_mm)
    self.searchers = []
    self.searchers.append(IndexSearcher(self.reader))
    if similarity == 'BM25':
        (self.searchers[0]).setSimilarity(BM25Similarity())
    # load bigram cache
    self.is_bigram_cache_used = is_bigram_cache_used
    if is_bigram_cache_used == True:
        # Mongo collection names derive from the index directory's last
        # path component; handle both unix and windows separators.
        seperate_char = '/' if self.index_dir.find('/') > -1 else '\\'
        index_name = self.index_dir.split(seperate_char)[-1]
        self.index_name = index_name
        self.conn_bigram_tf_cache = mongoObj.db[index_name + '_tf_cache']
        self.conn_bigram_cf_cache = mongoObj.db[index_name + '_cf_cache']
        # NOTE(review): LIST_F is a module-level constant not visible here — verify.
        if 'stemmed_wikipedia' in LIST_F or 'wikipedia' in LIST_F:
            self.conn_mapping_prob_cache = mongoObj.db[index_name + '_mapping_prob_cache_with_wikipedia']
        else:
            self.conn_mapping_prob_cache = mongoObj.db[index_name + '_mapping_prob_cache']
def search(self, input_query=None, max_answers=10): ''' Searches the given query in the index ''' if input_query is None: return None base_dir = '.' directory = SimpleFSDirectory(File(os.path.join(base_dir, self.index_dir))) searcher = IndexSearcher(DirectoryReader.open(directory)) analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) # query = QueryParser(Version.LUCENE_CURRENT, "title", analyzer).parse(input_query) parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, (self._posts_fields + self._answer_fields), analyzer) query = MultiFieldQueryParser.parse(parser, input_query) scoreDocs = searcher.search(query, max_answers).scoreDocs print "%s total matching documents." % len(scoreDocs) docs = [] for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) doc_dict = dict((field.name(), field.stringValue()) for field in doc.getFields()) docs.append(doc_dict) # print doc return docs
def __init__(self, index_path, method, logger=None, use_default_similarity=False):
    """Open *index_path* and configure parser/similarity: either Lucene's
    defaults or the project's field-agnostic variants."""
    self.index_path=index_path
    directory = SimpleFSDirectory(File(self.index_path))
    self.analyzer = StandardAnalyzer(LuceneVersion.LUCENE_CURRENT)
    self.reader=DirectoryReader.open(directory)
    self.searcher = IndexSearcher(self.reader)
    # uncomment one of these lines to change the type of parser, query and weight used
    if use_default_similarity:
        self.query_parser=QueryParser
    else:
        self.query_parser=FieldAgnosticQueryParser
    if use_default_similarity:
        similarity=DefaultSimilarity()
        self.useExplainQuery=False
    else:
        similarity=FieldAgnosticSimilarity()
        # explain-based scoring is only needed for the custom similarity
        self.useExplainQuery=True
        # by default, FieldAgnosticSimilarity uses coord factor, can be disabled
        ## similarity.useCoord=False
    self.searcher.setSimilarity(similarity)
    self.method=method # never used?
    self.logger=logger
def searcher(directory, analyzer, queries_file):
    """Parse TREC-style topics from *queries_file*, run each <desc> query
    against *directory*, and append the top-50 results per query to
    log.txt in TREC run format."""
    lines = queries_file.readlines()
    length = len(lines)
    a_query = ''
    query_counter = 0
    log = open("log.txt", "a")
    for line_number in range(length):
        if lines[line_number].startswith("<num>"):
            # topic id follows the "<num> Number: " prefix (14 chars)
            query_id = lines[line_number][14:].strip()
        elif lines[line_number].startswith("<desc>"):
            # the description text is on the line after the <desc> tag
            a_query = lines[line_number + 1].strip()
            a_query = stop_words(a_query)
            if a_query != '':
                # searching the index
                reader = DirectoryReader.open(directory)
                searcher = IndexSearcher(reader)
                # parse the query
                parser = QueryParser("DocParagraph", analyzer)
                query = parser.parse(a_query)
                # 50 results per query are required by the assignment
                hits = searcher.search(query, 50).scoreDocs
                # rank counter 1 through 50
                rank_counter = 1
                for hit in hits:
                    result = searcher.doc(hit.doc)
                    # one TREC-format line per hit
                    to_log = str(query_id) + " " + "Q" + str(query_counter) + " " + str(result.get("DocID")) + " " + str(rank_counter) + " " + str(hit.score) + " " + "Alex's" + "\n"
                    log.write(to_log)
                    rank_counter += 1
                query_counter += 1
                a_query = ''  # reset so a topic without <desc> is not re-run
    log.close()
def search():
    """Flask view: assemble +field:value query terms from the POST form and
    render busca.html with the matching documents."""
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    args = []
    if request.method == 'POST':
        if request.form['ies']:
            args.append('+ies:'+request.form['ies'])
        if request.form['area']:
            args.append('+area:'+request.form['area'])
        if request.form['professor']:
            args.append('+professor:'+request.form['professor'])
        if request.form['conceito']:
            #args.append('m:'+request.form['conceito']+'d:'+request.form['conceito']+'f:'+request.form['conceito'])
            # 'conceito' is matched against all three grade fields (m/d/f)
            args.append('m:'+request.form['conceito'])
            args.append('d:'+request.form['conceito'])
            args.append('f:'+request.form['conceito'])
    table = []
    if(len(args) > 0):
        scoreDocs = mansearch.buscar('indexer/',args)
        # NOTE(review): indexDir is not defined in this function — presumably a
        # module-level constant; verify it matches the 'indexer/' path above.
        fsDir = SimpleFSDirectory(File(indexDir))
        searcher = IndexSearcher(DirectoryReader.open(fsDir))
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            table.append(dict((field.name(), field.stringValue()) for field in doc.getFields()))
    return render_template('busca.html',table = table)
    pass
def run_music(ID):
    """Fetch one song document by *ID*.

    Returns a list of display fields (title, url, singer, album, pic,
    genre, lyrics) plus a parsed 'similar songs' list, or None when the
    id matches nothing.
    """
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    searcher = IndexSearcher(DirectoryReader.open(SimpleFSDirectory(File("index"))))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    query = QueryParser(Version.LUCENE_CURRENT, "id", analyzer).parse(ID)
    scoreDocs = searcher.search(query, 1).scoreDocs
    if len(scoreDocs) == 0:
        return None
    doc = searcher.doc(scoreDocs[0].doc)
    item = [
        doc.get("song_title").encode('utf-8'),
        doc.get('song_url'),
        doc.get("singer").encode('utf-8'),
        doc.get("album").encode('utf-8'),
        doc.get("album_pic"),
        doc.get("album_genre").encode('utf-8'),
        doc.get("lyrics").encode('utf-8'),
    ]
    # 'similar' is stored as "a*b+c*d+e*f": '+' separates songs, '*' fields.
    sim_list = doc.get("similar").encode('utf-8').split('+')
    for i in range(3):
        sim_list[i] = sim_list[i].split('*')
    item.append(sim_list)
    del searcher
    return item
def __init__ (self):
    """Set up index paths, open an existing index if present, and (in debug
    mode) start the indexing/searching log threads."""
    # Hard-coded deployment paths for the CLI finder corpus and its index.
    self.mDocumentDirectory = "/home/hnguyen/Projects/CLIFinder/operations.sub"
    self.mIndexDirectory = "/home/hnguyen/Projects/CLIFinder/cli.index"
    self.mIndexReader = None
    # Open an existing index eagerly; mIndexReader stays None otherwise.
    if os.path.isdir(self.mIndexDirectory) and self.mIndexReader == None:
        directory = SimpleFSDirectory(File(self.mIndexDirectory))
        self.mIndexReader = DirectoryReader.open(directory)
    ############################### IndexingEngine Settings ######################################
    self.mSimilarity = DecreaseLengthNormSimilarity()
    self.mOpenMode = IndexWriterConfig.OpenMode.CREATE
    ##############################################################################################
    self.mIsDebug = False
    if self.mIsDebug:
        ############################### Setting up loggers ###########################################
        self.mIndexingLogPath = "/home/hnguyen/Projects/CLIFinder/logs/indexing.log"
        self.mSearchingLogPath = "/home/hnguyen/Projects/CLIFinder/logs/searching.log"
        self.mIndexingLogger = LoggingEngine(self.mIndexingLogPath, "IndexingLogger", Queue.Queue())
        self.mSearchingLogger = LoggingEngine(self.mSearchingLogPath, "SearchingLogger", Queue.Queue())
        self.mIndexingLogger.start()
        self.mSearchingLogger.start()
        # NOTE(review): registered only in debug mode — confirm clear() is not
        # needed for normal runs as well.
        atexit.register(self.clear)
def main():
    """Flask view: run the 'consulta' query parameter against the local
    index (when one exists) and render main.html with the hits."""
    resultados = []
    indice_vacio = len(os.listdir("./lucene/index")) == 0
    if not indice_vacio:
        consulta = request.args.get("consulta", None)
        if consulta is not None:
            store = SimpleFSDirectory(Paths.get("./lucene/index"))
            searcher = IndexSearcher(DirectoryReader.open(store))
            query = QueryParser("texto", SpanishAnalyzer()).parse(consulta)
            for sd in searcher.search(query, 10).scoreDocs:
                doc = searcher.doc(sd.doc)
                resultados.append({
                    "url": direccion_base + doc.get("pdf"),
                    "titulo": doc.get("titulo")
                })
    return render_template("main.html", lucene=lucene.VERSION, indice_vacio=indice_vacio, resultados=resultados)
def retriever(file_dir):
    """For every test AST query, retrieve the single most similar training
    example from the Lucene index and emit its source and summary lines as
    the retrieval baseline output."""
    analyzer = WhitespaceAnalyzer()
    reader = DirectoryReader.open(SimpleFSDirectory(Paths.get(file_dir + "/lucene_index/")))
    searcher = IndexSearcher(reader)
    queryParser = QueryParser("code", analyzer)
    # Long token queries would overflow the default clause limit.
    BooleanQuery.setMaxClauseCount(Integer.MAX_VALUE)
    with open(file_dir + "/train/train.spl.src", 'r') as fso, open(file_dir + "/train/train.txt.tgt", 'r') as fsu:
        sources = [line.strip() for line in fso.readlines()]
        summaries = [line.strip() for line in fsu.readlines()]
    with open(file_dir+"/test/test.ast.src") as ft, open(file_dir+"/test/test.ref.src.0", 'w') as fwo, \
            open(file_dir+"/output/ast.out", 'w') as fws:
        # Strip query-parser metacharacters and boolean keywords from each line.
        queries = [
            re.sub("[\W\s]+|AND|NOT|OR", ' ', line.strip())
            for line in ft.readlines()
        ]
        for i, line in enumerate(queries):
            print("query %d" % i)
            query = queryParser.parse(QueryParser.escape(line))
            hits = searcher.search(query, 1).scoreDocs
            flag = False
            for hit in hits:
                doc = searcher.doc(hit.doc)
                _id = eval(doc.get("id"))  # NOTE(review): eval on a stored field — int(...) would be safer
                flag = True
                fwo.write(sources[_id] + '\n')
                fws.write(summaries[_id] + '\n')
            if not flag:
                # No hit for this query: dump diagnostics and abort.
                print(query)
                print(hits)
                exit(-1)
def __init__(self, path, analyzer, topn=DEF_TOPN):
    """Bind a searcher to the index stored at *path*."""
    self.path = path
    self._analyzer = analyzer
    self.topn = topn
    abs_path = os.path.abspath(self.path)
    self._store = SimpleFSDirectory(Paths.get(abs_path))
    self._searcher = IndexSearcher(DirectoryReader.open(self._store))
def func(command):
    """Vector-search pages for *command* and return a space-separated string
    of page records, ordered by ``0.6*likes + 0.4*views`` descending.

    Each record is: page_num, thumbnail path, link, views, likes, and the
    underscore-joined image alt text.
    """
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File("index"))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    searcher = IndexSearcher(DirectoryReader.open(directory))
    vp = get_vp(get_d_dimensional_vector(command))
    query = QueryParser(Version.LUCENE_CURRENT, "Vector", analyzer).parse(vp)
    ranked = {}
    for scoreDoc in searcher.search(query, 200).scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        rank = 0.6 * float(doc.get("Likes")) + 0.4 * float(doc.get("Views"))
        page = doc.get('Page_num')
        record = ' '.join([
            page,
            'data/' + page + '.jpg',
            doc.get('Page_link'),
            doc.get('Views'),
            doc.get('Likes'),
            '_'.join(doc.get('Img_alt').split()),
        ])
        ranked[record] = rank
    result = ""
    for record, _ in sorted(ranked.items(), key=lambda item: item[1], reverse=True):
        result += record
        result += ' '
    del searcher
    del analyzer
    return result
def __init__(self, path=INDEX_DIR):
    """Start the JVM and open the Chinese-analyzed index at *path*,
    preparing reader, searcher and analyzer."""
    lucene.initVM()
    self.indir = SimpleFSDirectory(Paths.get(path))
    self.reader = DirectoryReader.open(self.indir)
    self.searcher = IndexSearcher(self.reader)
    self.analyzer = SmartChineseAnalyzer()
def retrieve(command): try: lucene.initVM(vmargs=['-Djava.awt.headless=true']) except ValueError: print "JVM running." print 'lucene', lucene.VERSION base_dir = os.path.dirname(os.path.abspath(sys.argv[0])) directory = SimpleFSDirectory(Paths.get(os.path.join(base_dir, INDEX_DIR))) searcher = IndexSearcher(DirectoryReader.open(directory)) analyzer = StandardAnalyzer() # to convert to AND query command = re.sub(r' ', r' +', command) command = "+" + command print "Searching for:", command query = QueryParser("contents", analyzer).parse(command) print query scoreDocs = searcher.search(query, 500).scoreDocs print "%s total matching documents." % len(scoreDocs) retrieved_docs = [] for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) retrieved_docs.append(os.path.join(doc.get("path"), doc.get("name"))) del searcher return retrieved_docs
def build_corpus(n=0):
    """Build the wiki-derived corpus for the n-th test subcorpus."""
    # One index per domain, so per-domain queries can run in parallel.
    sbc = texeval_corpus.test_subcorpora[n]
    directory = FSDirectory.open(File(wiki_index + '-' + sbc))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    build_corpus_from_terms_with_wiki(sbc, searcher, analyzer)
def _maybeReopen(self):
    """Refresh the reader/searcher once more than 10000 modifications
    have accumulated.

    ``DirectoryReader.openIfChanged`` returns None when the index has not
    changed; only on an actual reopen do we close the old reader, swap in
    the new one, and reset the modification log.

    Idiom fix: ``not newReader is None`` → ``newReader is not None``;
    nested conditionals flattened with an early return.
    """
    if len(self._latestModifications) <= 10000:
        return
    newReader = DirectoryReader.openIfChanged(self._reader, self._writer, True)
    if newReader is not None:
        self._reader.close()
        self._reader = newReader
        self._searcher = IndexSearcher(self._reader)
        self._latestModifications.clear()
def __init__(self, base_dir, index_dir, index_file, queryDict):
    """Open the index at index_dir/index_file and immediately run BM25
    retrieval for *queryDict*."""
    self.baseDir = base_dir
    self.indexFile = os.path.join(index_dir, index_file)
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])  # uncomment when run Retrieve separately
    store = SimpleFSDirectory(File(self.indexFile))
    searcher = IndexSearcher(DirectoryReader.open(store))
    self.BM25(searcher, queryDict)
    del searcher
def config():
    """Return (searcher, analyzer) for the local index, scoring with
    BM25(k1=2.0, b=0.75)."""
    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    store = SimpleFSDirectory(File(os.path.join(base_dir, INDEX_DIR)))
    searcher = IndexSearcher(DirectoryReader.open(store))
    # BM25 defaults would be k1=1.2, b=0.75; k1 is raised here.
    searcher.setSimilarity(BM25Similarity(2.0, 0.75))
    analyzer = SmartChineseAnalyzer(Version.LUCENE_CURRENT)
    return searcher, analyzer
def search_docs(self, value, field="general_info"):
    """Return the stored documents (up to 1000) matching *value* in *field*."""
    limit = 1000
    searcher = IndexSearcher(DirectoryReader.open(self.store))
    parsed = QueryParser(Version.LUCENE_CURRENT, field, self.analyzer).parse(value)
    top_docs = searcher.search(parsed, limit)
    return [searcher.doc(hit.doc) for hit in top_docs.scoreDocs]
def perform_search(self, searchterm, results_per_page, page):
    """Search title/description/content for *searchterm* (OR-combined) and
    return ``(results, duration, total_hit_count)`` for the requested page.

    *results* is a list of ``{field_name: stored_value}`` dicts.

    Improvements: the large commented-out alternative implementation (a
    no-op string expression) is removed, and the manual skip counter is
    replaced with ``enumerate`` against a computed offset.
    """
    # One parser per searched field; their clauses are OR'd (SHOULD) together.
    parser_title = QueryParser(Version.LUCENE_CURRENT, "title", self.analyzer)
    parser_description = QueryParser(Version.LUCENE_CURRENT, "description", self.analyzer)
    parser_content = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer)
    query = BooleanQuery()
    query.add(parser_title.parse(searchterm), BooleanClause.Occur.SHOULD)
    query.add(parser_description.parse(searchterm), BooleanClause.Occur.SHOULD)
    query.add(parser_content.parse(searchterm), BooleanClause.Occur.SHOULD)
    # conducting search
    searcher = IndexSearcher(DirectoryReader.open(self.store))
    start = datetime.now()
    # Fetch every hit up to the end of the requested page; earlier pages
    # are skipped below.
    hits = searcher.search(query, results_per_page + (results_per_page * page))
    score_docs = hits.scoreDocs
    count_results = hits.totalHits
    duration = datetime.now() - start
    offset = results_per_page * page
    results = []
    for index, scoreDoc in enumerate(score_docs):
        if index < offset:
            continue  # belongs to an earlier page
        doc = searcher.doc(scoreDoc.doc)
        results.append(dict((field.name(), field.stringValue()) for field in doc.getFields()))
    return results, duration, count_results
def deleteRec(self, pid):
    """Delete the record whose 'uid' equals *pid* and refresh the searcher."""
    cfg = IndexWriterConfig(self.analyzer)
    cfg.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
    writer = IndexWriter(self.indexDir, cfg)
    writer.deleteDocuments(Term('uid', pid))
    writer.commit()
    writer.close()
    # Reopen so subsequent searches no longer see the deleted document.
    self.searcher = IndexSearcher(DirectoryReader.open(self.indexDir))
def __init__(self, store_dir): initVM() directory = SimpleFSDirectory(File(store_dir)) self.searcher = IndexSearcher(DirectoryReader.open(directory)) print 'loaded index: %s' % store_dir self.analyzer = {} self.analyzer['StandardAnalyzer'] = StandardAnalyzer(Version.LUCENE_CURRENT) self.analyzer['SimpleAnalyzer'] = SimpleAnalyzer(Version.LUCENE_CURRENT) self.analyzer['ChineseAnalyzer'] = ChineseAnalyzer(Version.LUCENE_CURRENT)
def __init__(self, index_dir):
    """Open *index_dir*, preparing both a writer config and a searcher."""
    #lucene.initVM(vmargs=['-Djava.awt.headless=true', '-Xmx4g'])
    self.index_dir = index_dir
    self.directory = SimpleFSDirectory(File(self.index_dir))
    self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
    # NOTE(review): OpenMode.CREATE truncates any existing index the moment
    # a writer is opened with this config — confirm that is intended, since
    # a searcher over the same directory is opened right below.
    self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    self.searcher = IndexSearcher(DirectoryReader.open(self.directory))
def init_lucene_search(): lucene.initVM(vmargs=['-Djava.awt.headless=true']) print 'lucene', lucene.VERSION print 'Index ', INDEX_DIR base_dir = os.path.dirname(os.path.abspath(sys.argv[0])) # current dir directory = SimpleFSDirectory(File(INDEX_DIR)) searcher = IndexSearcher(DirectoryReader.open(directory)) analyzer = SmartChineseAnalyzer(Version.LUCENE_CURRENT, SmartChineseAnalyzer.getDefaultStopSet()) return searcher, analyzer
def is_article_indexed(art_id, index='index'):
    """Return True when a document whose ``art_id`` field matches exists in *index*."""
    directory = SimpleFSDirectory(File(index))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    parser = QueryParser(Version.LUCENE_CURRENT, 'art_id', analyzer)
    query = parser.parse(str(art_id))
    # One hit is enough to answer the existence question.
    hits = searcher.search(query, 1).scoreDocs
    return len(hits) > 0
def __init__(self):
    """Attach to the running JVM and open a searcher over the app's index."""
    env = lucene.getVMEnv()
    env.attachCurrentThread()
    store = SimpleFSDirectory(File(app.config['INDEX_PATH']))
    self.searcher = IndexSearcher(DirectoryReader.open(store))
    self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    # Queries target the "contents" field by default.
    self.parser = QueryParser(Version.LUCENE_CURRENT, "contents",
                              self.analyzer)
def __init__(self, indexPath): lucene.initVM(vmargs=['-Djava.awt.headless=true']) print 'lucene', lucene.VERSION #initialize the index self.INDEX_DIR = indexPath #"Clue_Index" self.results = None self.searcher = IndexSearcher(DirectoryReader.open( SimpleFSDirectory(File(self.INDEX_DIR)))) self.searcher.setSimilarity(BM25Similarity())
def query(self, txt, ant=10):
    """Searches for a person or family by id, name, place, or date"""
    # '/' is Lucene query syntax, so escape it; queries are lowercased.
    escaped = txt.replace('/', '\/').lower()
    q = QueryParser("text", self.analyzer).parse(escaped)
    if not self.searcher:
        # Lazily open the searcher on first use.
        self.searcher = IndexSearcher(DirectoryReader.open(self.indexDir))
    score_docs = self.searcher.search(q, ant).scoreDocs
    return [[self.searcher.doc(sd.doc).get("uid"), sd.score]
            for sd in score_docs]
def __init__(self, **kwargs):
    """
    Initialize a new instance of the Searcher

    :param count: The number of counts to return from a query (default 100)
    :param root: The root directory of the underlying index (default "index")
    """
    self.count = kwargs.get("count", 100)
    # NOTE: the directory comes from the "root" keyword, even though the
    # attribute is named ``output`` — callers must pass root=..., not output=...
    self.output = kwargs.get("root", "index")
    self.store = SimpleFSDirectory(File(self.output))
    self.analyzer = StandardAnalyzer(Version.LUCENE_30)
    self.searcher = IndexSearcher(DirectoryReader.open(self.store))
def __init__(self, settings, indexDirectory=None, taxoDirectory=None):
    """Open index and taxonomy readers and prepare lazy searcher state.

    :param settings: object exposing ``similarity`` and
        ``numberOfConcurrentTasks`` (project type — see caller)
    :param indexDirectory: Lucene Directory holding the main index
    :param taxoDirectory: Directory holding the facet taxonomy
    """
    self._settings = settings
    self._similarity = settings.similarity
    self._numberOfConcurrentTasks = settings.numberOfConcurrentTasks
    self._reader = DirectoryReader.open(indexDirectory)
    self.taxoReader = DirectoryTaxonomyReader(taxoDirectory)
    self._readerSettingsWrapper = ReaderSettingsWrapper()
    # Expose current settings through a getter/setter pair; the lambda reads
    # self.searcher lazily (presumably a property defined elsewhere — verify)
    # so it reflects the searcher in use after any reopen.
    self._readerSettingsWrapper.get = lambda: {"similarity": self.searcher.getSimilarity().toString(), "numberOfConcurrentTasks": self._numberOfConcurrentTasks}
    self._readerSettingsWrapper.set = self._setReadSettings
    # Searcher and executor are created on demand; the flag forces a
    # (re)build on first access.
    self._searcher = None
    self._executor = None
    self._reopenSearcher = True
def search(term, n_docs=10, index='index'):
    """Return up to *n_docs* ``(score, art_body)`` tuples matching *term*."""
    store = SimpleFSDirectory(File(index))
    searcher = IndexSearcher(DirectoryReader.open(store))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    parser = QueryParser(Version.LUCENE_CURRENT, 'art_body', analyzer)
    query = parser.parse(term)
    # str(query.getClass().toString()) == "class org.apache.lucene.search.TermQuery"
    results = []
    for hit in searcher.search(query, n_docs).scoreDocs:
        body = unicode(searcher.doc(hit.doc).get('art_body'))
        results.append((hit.score, body))
    return results
def reopen(self):
    """Refresh the index and taxonomy readers if the index changed on disk."""
    new_reader = DirectoryReader.openIfChanged(self._reader)
    if new_reader is None:
        # Nothing changed; keep the current readers as-is.
        return
    self._reader.close()
    self._reader = new_reader
    # Force the next searcher request to rebuild against the new reader.
    self._reopenSearcher = True

    new_taxo = DirectoryTaxonomyReader.openIfChanged(self.taxoReader)
    if new_taxo is None:
        return
    self.taxoReader.close()
    self.taxoReader = new_taxo
def _getSearcher(self, identifier=None):
    """Return an IndexSearcher, refreshing it only when pending modifications require it.

    :param identifier: optional document identifier; when the identifier is
        not among the pending modifications and the backlog is small, the
        cached (possibly stale) searcher is returned without a refresh.
    """
    modifications = len(self._latestModifications)
    if modifications == 0:
        # No pending changes since the last refresh; reuse the cached searcher.
        return self._searcher
    if identifier and str(identifier) not in self._latestModifications and modifications < _MAX_MODIFICATIONS:
        # The requested document is unaffected by pending changes and the
        # backlog is below the threshold, so a stale view is acceptable.
        return self._searcher
    # Near-real-time reopen against the writer (True = apply all deletes).
    newreader = DirectoryReader.openIfChanged(self._reader, self._writer, True)
    if newreader:
        self._reader.close()
        self._reader = newreader
        self._searcher = IndexSearcher(newreader)
    self._latestModifications.clear()
    return self._searcher
def __init__(self):
    """Open the texts index with PyLucene and build the searcher."""
    # Record whether the lucene bindings were importable.
    self.lucene = True if luceneImport else False
    # Lucene connection
    lucene.initVM()
    index_path = "texts/index"
    store = MMapDirectory(File(index_path))
    reader = DirectoryReader.open(store)
    self.analyzer = StandardAnalyzer(Version.LUCENE_30)
    self.searcher = IndexSearcher(reader)