def open_reader(self):
    """Open IndexReader."""
    if self.reader is None:
        if self.use_ram:
            print "reading from ram directory ..."
            self.reader = DirectoryReader.open(self.ram_dir)
        else:
            self.reader = DirectoryReader.open(self.dir)

def __init__(self):
    self.mDocumentDirectory = "/home/hnguyen/Projects/CLIFinder/operations.sub"
    self.mIndexDirectory = "/home/hnguyen/Projects/CLIFinder/cli.index"
    self.mIndexReader = None
    if os.path.isdir(self.mIndexDirectory) and self.mIndexReader is None:
        directory = SimpleFSDirectory(File(self.mIndexDirectory))
        self.mIndexReader = DirectoryReader.open(directory)

    ############################### IndexingEngine Settings ######################################
    self.mSimilarity = DecreaseLengthNormSimilarity()
    self.mOpenMode = IndexWriterConfig.OpenMode.CREATE
    ##############################################################################################

    self.mIsDebug = False
    if self.mIsDebug:
        ############################### Setting up loggers #######################################
        self.mIndexingLogPath = "/home/hnguyen/Projects/CLIFinder/logs/indexing.log"
        self.mSearchingLogPath = "/home/hnguyen/Projects/CLIFinder/logs/searching.log"
        self.mIndexingLogger = LoggingEngine(self.mIndexingLogPath, "IndexingLogger", Queue.Queue())
        self.mSearchingLogger = LoggingEngine(self.mSearchingLogPath, "SearchingLogger", Queue.Queue())
        self.mIndexingLogger.start()
        self.mSearchingLogger.start()

    atexit.register(self.clear)

def search():
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    args = []
    if request.method == 'POST':
        if request.form['ies']:
            args.append('+ies:' + request.form['ies'])
        if request.form['area']:
            args.append('+area:' + request.form['area'])
        if request.form['professor']:
            args.append('+professor:' + request.form['professor'])
        if request.form['conceito']:
            args.append('m:' + request.form['conceito'])
            args.append('d:' + request.form['conceito'])
            args.append('f:' + request.form['conceito'])

    table = []
    if len(args) > 0:
        scoreDocs = mansearch.buscar('indexer/', args)
        fsDir = SimpleFSDirectory(File(indexDir))
        searcher = IndexSearcher(DirectoryReader.open(fsDir))
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            table.append(dict((field.name(), field.stringValue())
                              for field in doc.getFields()))
    return render_template('busca.html', table=table)

def buscar(indexDir, args, options=None):
    # lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    fsDir = SimpleFSDirectory(File(indexDir))

    # Build a searcher over the index directory supplied by the user
    searcher = IndexSearcher(DirectoryReader.open(fsDir))

    # Analyzer used to tokenize and filter the query terms
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # QueryParser that searches the "keywords" field by default
    parser = QueryParser(Version.LUCENE_CURRENT, "keywords", analyzer)
    parser.setDefaultOperator(QueryParser.Operator.AND)

    # Join the field:value restrictions passed by the caller into one query string
    command = ' +'.join(args)
    query = parser.parse(command)
    print query

    # Return the score docs (a JArray) for the top 200 hits
    return searcher.search(query, 200).scoreDocs

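# Usage sketch: a minimal, hypothetical way to call buscar(). It assumes a
# PyLucene VM is running and that 'indexer/' holds an index with a "keywords"
# field; the path and the field restrictions below are illustrative only.
import lucene
lucene.initVM(vmargs=['-Djava.awt.headless=true'])

hits = buscar('indexer/', ['+ies:UFMG', '+area:computacao'])
for hit in hits:
    print hit.doc, hit.score
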
def search(self, input_query=None, max_answers=10):
    '''Searches the given query in the index'''
    if input_query is None:
        return None

    base_dir = '.'
    directory = SimpleFSDirectory(File(os.path.join(base_dir, self.index_dir)))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # query = QueryParser(Version.LUCENE_CURRENT, "title", analyzer).parse(input_query)
    parser = MultiFieldQueryParser(Version.LUCENE_CURRENT,
                                   self._posts_fields + self._answer_fields,
                                   analyzer)
    query = MultiFieldQueryParser.parse(parser, input_query)

    scoreDocs = searcher.search(query, max_answers).scoreDocs
    print "%s total matching documents." % len(scoreDocs)

    docs = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        doc_dict = dict((field.name(), field.stringValue()) for field in doc.getFields())
        docs.append(doc_dict)
    return docs

def search(self, field, text):
    """
    Search text within the indexed data.

    input:  field  field name of the value that was indexed
            text   text to search
    output: list of (score, url, title) tuples for the hits
    """
    results = []
    idx_reader = DirectoryReader.open(self.directory)
    idx_searcher = IndexSearcher(idx_reader)

    # parse query
    parser = AnalyzingQueryParser(Version.LUCENE_CURRENT, field, self.analyser)
    query = parser.parse(text)

    # search; scoreDocs is directly iterable
    hits = idx_searcher.search(query, 1000).scoreDocs
    for hit in hits:
        doc = idx_searcher.doc(hit.doc)
        score = hit.score
        title = doc.get(field)
        url = doc.get("url")
        results.append((score, url, title))
    return results

def search(self):
    '''Searches the given query in the index'''
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    # print 'lucene', lucene.VERSION
    # base_dir = os.path.dirname(os.path.abspath('.'))
    base_dir = '.'
    directory = SimpleFSDirectory(File(os.path.join(base_dir, self.index_dir)))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        if command == '':
            return

        print
        print "Searching for:", command
        query = QueryParser(Version.LUCENE_CURRENT, "title", analyzer).parse(command)
        scoreDocs = searcher.search(query, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            # print 'path:', doc.get("path"), 'name:', doc.get("name")
            print doc

def __init__(self, db_path):
    directory = SimpleFSDirectory(File(db_path))
    reader = DirectoryReader.open(directory)
    self.searcher = IndexSearcher(reader)
    self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    logger.info("Loaded DB from %s with %d documents", db_path, reader.numDocs())

def query(self, data):
    if self.fil.exists():
        searcher = IndexSearcher(DirectoryReader.open(self.d))
        query = QueryParser(Version.LUCENE_30, "id", self.analyzer).parse(data['query'])
        hits = searcher.search(query, 100000)

        results = {}
        results['totalHits'] = hits.totalHits
        results['hits'] = {}

        for hit in hits.scoreDocs:
            record = {}
            doc = searcher.doc(hit.doc)
            fields = doc.getFields()
            record['score'] = hit.score
            for field in fields:
                if field.name() != "id":
                    record[field.name()] = field.stringValue()
            results['hits'][doc.get('id')] = record

        searcher.getIndexReader().close()
        return results

def __init__(self, index_path, method, logger=None, use_default_similarity=False):
    self.index_path = index_path
    directory = SimpleFSDirectory(File(self.index_path))
    self.analyzer = StandardAnalyzer(LuceneVersion.LUCENE_CURRENT)
    self.reader = DirectoryReader.open(directory)
    self.searcher = IndexSearcher(self.reader)

    # choose the type of parser, query and weight used
    if use_default_similarity:
        self.query_parser = QueryParser
        similarity = DefaultSimilarity()
        self.useExplainQuery = False
    else:
        self.query_parser = FieldAgnosticQueryParser
        similarity = FieldAgnosticSimilarity()
        self.useExplainQuery = True
        # by default, FieldAgnosticSimilarity uses the coord factor; it can be disabled:
        # similarity.useCoord = False
    self.searcher.setSimilarity(similarity)

    self.method = method  # never used?
    self.logger = logger

def build_corpus(n=0):
    sbcs = texeval_corpus.test_subcorpora
    sbc = sbcs[n]
    # Hack for parallelizing queries; uses one index per domain.
    directory = FSDirectory.open(File(wiki_index + '-' + sbc))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    build_corpus_from_terms_with_wiki(sbc, searcher, analyzer)

def __init__(self, base_dir, index_dir, index_file, queryDict):
    self.baseDir = base_dir
    self.indexFile = os.path.join(index_dir, index_file)
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])  # needed when running Retrieve separately
    directory = SimpleFSDirectory(File(self.indexFile))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    self.BM25(searcher, queryDict)
    del searcher

def search_docs(self, value, field="general_info"):
    MAX_RESULTS = 1000
    searcher = IndexSearcher(DirectoryReader.open(self.store))
    query = QueryParser(Version.LUCENE_CURRENT, field, self.analyzer).parse(value)
    topDocs = searcher.search(query, MAX_RESULTS)
    return [searcher.doc(hit.doc) for hit in topDocs.scoreDocs]

def config():
    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(File(os.path.join(base_dir, INDEX_DIR)))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    bm25Sim = BM25Similarity(2.0, 0.75)  # k1 = 2.0 here; Lucene's defaults are k1 = 1.2, b = 0.75
    searcher.setSimilarity(bm25Sim)
    analyzer = SmartChineseAnalyzer(Version.LUCENE_CURRENT)
    return searcher, analyzer

def perform_search(self, searchterm, results_per_page, page):
    # if there is a field in the searchterm
    """if ":" in searchterm:
        # processing a query
        parser = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer)
        parser.setDefaultOperator(QueryParser.Operator.AND)
        query = parser.parse(searchterm)
    else:
        query = BooleanQuery()
        query_title = TermQuery(Term("title", searchterm))
        query_description = TermQuery(Term("description", searchterm))
        query_content = TermQuery(Term("content", searchterm))
        # BooleanClause.Occur.MUST for AND queries
        query.add(query_title, BooleanClause.Occur.SHOULD)
        query.add(query_description, BooleanClause.Occur.SHOULD)
        query.add(query_content, BooleanClause.Occur.SHOULD)"""

    # create a QueryParser for each field to be searched
    parser_title = QueryParser(Version.LUCENE_CURRENT, "title", self.analyzer)
    parser_description = QueryParser(Version.LUCENE_CURRENT, "description", self.analyzer)
    parser_content = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer)

    # put the fields together
    query = BooleanQuery()
    query.add(parser_title.parse(searchterm), BooleanClause.Occur.SHOULD)
    query.add(parser_description.parse(searchterm), BooleanClause.Occur.SHOULD)
    query.add(parser_content.parse(searchterm), BooleanClause.Occur.SHOULD)

    # conduct the search
    searcher = IndexSearcher(DirectoryReader.open(self.store))
    start = datetime.now()
    hits = searcher.search(query, results_per_page + (results_per_page * page))
    score_docs = hits.scoreDocs
    count_results = hits.totalHits
    duration = datetime.now() - start

    # results to return
    results = []
    count = 0
    for scoreDoc in score_docs:
        # skip the offset of earlier pages
        if count < results_per_page * page:
            count += 1
            continue
        count += 1
        doc = searcher.doc(scoreDoc.doc)
        table = dict((field.name(), field.stringValue()) for field in doc.getFields())
        results.append(table)
    return results, duration, count_results

def __init__(self, store_dir):
    initVM()
    directory = SimpleFSDirectory(File(store_dir))
    self.searcher = IndexSearcher(DirectoryReader.open(directory))
    print 'loaded index: %s' % store_dir

    self.analyzer = {}
    self.analyzer['StandardAnalyzer'] = StandardAnalyzer(Version.LUCENE_CURRENT)
    self.analyzer['SimpleAnalyzer'] = SimpleAnalyzer(Version.LUCENE_CURRENT)
    self.analyzer['ChineseAnalyzer'] = ChineseAnalyzer(Version.LUCENE_CURRENT)

def deleteRec(self, pid):
    config = IndexWriterConfig(self.analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
    writer = IndexWriter(self.indexDir, config)
    writer.deleteDocuments(Term('uid', pid))
    writer.commit()
    writer.close()
    # reopen the searcher so the deletion becomes visible
    self.searcher = IndexSearcher(DirectoryReader.open(self.indexDir))

def __init__(self):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    indexDir = SimpleFSDirectory(File(app.config['INDEX_PATH']))
    self.searcher = IndexSearcher(DirectoryReader.open(indexDir))
    self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    self.parser = QueryParser(Version.LUCENE_CURRENT, "contents", self.analyzer)

def __init__(self, index_dir):
    # lucene.initVM(vmargs=['-Djava.awt.headless=true', '-Xmx4g'])
    self.index_dir = index_dir
    self.directory = SimpleFSDirectory(File(self.index_dir))
    self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
    self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    self.searcher = IndexSearcher(DirectoryReader.open(self.directory))

def init_lucene_search():
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    print 'Index ', INDEX_DIR
    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))  # current dir
    directory = SimpleFSDirectory(File(INDEX_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = SmartChineseAnalyzer(Version.LUCENE_CURRENT,
                                    SmartChineseAnalyzer.getDefaultStopSet())
    return searcher, analyzer

def is_article_indexed(art_id, index='index'):
    store = SimpleFSDirectory(File(index))
    searcher = IndexSearcher(DirectoryReader.open(store))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    query = QueryParser(Version.LUCENE_CURRENT, 'art_id', analyzer).parse(str(art_id))
    docs = searcher.search(query, 1).scoreDocs
    return len(docs) > 0

def __init__(self, indexPath):
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    # initialize the index
    self.INDEX_DIR = indexPath  # e.g. "Clue_Index"
    self.results = None
    self.searcher = IndexSearcher(DirectoryReader.open(
        SimpleFSDirectory(File(self.INDEX_DIR))))
    self.searcher.setSimilarity(BM25Similarity())

def query(self, txt, ant=10):
    """Searches for a person or family by id, name, place, or date"""
    q = QueryParser("text", self.analyzer).parse(txt.replace('/', '\\/').lower())
    if not self.searcher:
        self.searcher = IndexSearcher(DirectoryReader.open(self.indexDir))
    scoreDocs = self.searcher.search(q, ant).scoreDocs
    hits = []
    for scoreDoc in scoreDocs:
        doc = self.searcher.doc(scoreDoc.doc)
        hits.append([doc.get("uid"), scoreDoc.score])
    return hits

def __init__(self, **kwargs):
    """
    Initialize a new instance of the Searcher.

    :param count: The number of results to return from a query
    :param root: The output directory of the underlying index
    """
    self.count = kwargs.get("count", 100)
    self.output = kwargs.get("root", "index")
    self.store = SimpleFSDirectory(File(self.output))
    self.analyzer = StandardAnalyzer(Version.LUCENE_30)
    self.searcher = IndexSearcher(DirectoryReader.open(self.store))

def search(term, n_docs=10, index='index'):
    store = SimpleFSDirectory(File(index))
    searcher = IndexSearcher(DirectoryReader.open(store))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    query = QueryParser(Version.LUCENE_CURRENT, 'art_body', analyzer).parse(term)
    # str(query.getClass().toString()) == "class org.apache.lucene.search.TermQuery"
    score_docs = searcher.search(query, n_docs).scoreDocs
    return [(score_doc.score, unicode(searcher.doc(score_doc.doc).get('art_body')))
            for score_doc in score_docs]

def __init__(self, settings, indexDirectory=None, taxoDirectory=None):
    self._settings = settings
    self._similarity = settings.similarity
    self._numberOfConcurrentTasks = settings.numberOfConcurrentTasks
    self._reader = DirectoryReader.open(indexDirectory)
    self.taxoReader = DirectoryTaxonomyReader(taxoDirectory)
    self._readerSettingsWrapper = ReaderSettingsWrapper()
    self._readerSettingsWrapper.get = lambda: {
        "similarity": self.searcher.getSimilarity().toString(),
        "numberOfConcurrentTasks": self._numberOfConcurrentTasks,
    }
    self._readerSettingsWrapper.set = self._setReadSettings
    self._searcher = None
    self._executor = None
    self._reopenSearcher = True

def setupDir(dbName):
    global indexDir, searcher
    (user, db) = dbName.split('_', 1)
    # directory = "./searchDB/" + dbName
    directory = "./files/" + user + '/' + db + '/LuceneIndex'
    if not os.path.exists(directory):
        os.mkdir(directory)
    # indexDir = SimpleFSDirectory(File(directory))
    indexDir = SimpleFSDirectory(Paths.get(directory))
    try:
        searcher = IndexSearcher(DirectoryReader.open(indexDir))
    except Exception:
        pass

def retrieve_wiki(text_query, index_directory_name):
    lucene.initVM()
    directory = FSDirectory.open(File(index_directory_name))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(text_query)
    scoreDocs = searcher.search(query, 1000).scoreDocs
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        yield doc.get('contents')

def search(self, q, sex, ant=5, config=None):
    """Searches for a match"""
    query = QueryParser("match", self.analyzer).parse(q.replace('/', '\\/'))
    # TODO: how should `sex` be added to the query itself?
    if not self.searcher:
        self.searcher = IndexSearcher(DirectoryReader.open(self.indexDir))
    scoreDocs = self.searcher.search(query, ant).scoreDocs
    hits = []
    for scoreDoc in scoreDocs:
        doc = self.searcher.doc(scoreDoc.doc)
        if sex == doc.get("sex"):
            hits.append([doc.get("uid"), scoreDoc.score])
    return hits

def text_search(command, cpage, meth):
    global vm_env, searcher, analyzer
    vm_env.attachCurrentThread()
    print 'lucene', lucene.VERSION
    directory = SimpleFSDirectory(File(STORE_TEXT_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    text, maxnum = runstext(command, cpage, meth)
    del searcher
    return text, maxnum

def __init__(self):
    # self.segmentor.load('./cws.model')
    INDEXDIR = './Myindex'
    # lucene.initVM(vmargs='-Xcheck:jni,-verbose:jni,-verbose:gc')
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    # vm_env = lucene.getVMEnv()
    # vm_env.attachCurrentThread()
    # print 'lucene', lucene.VERSION
    self.directory = SimpleFSDirectory(File(INDEXDIR))
    self.searcher = IndexSearcher(DirectoryReader.open(self.directory))
    self.analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    self.reader = IndexReader.open(self.directory)

def __init__(self):
    # record whether the Lucene bindings were importable
    self.lucene = bool(luceneImport)

    # Lucene connection
    lucene.initVM()
    indexDir = "texts/index"
    directory = MMapDirectory(File(indexDir))
    reader = DirectoryReader.open(directory)
    self.analyzer = StandardAnalyzer(Version.LUCENE_30)
    self.searcher = IndexSearcher(reader)

def __init__(self, store_dir, analyzer, preprocess=lambda x: x):
    '''
    Input:
        `store_dir`: directory storing the Lucene index
        `analyzer`: analyzer required to split the query
        `preprocess`: user-defined preprocess function
    '''
    # Initialize `IndexSearcher`
    self.dir = SimpleFSDirectory(File(store_dir).toPath())
    self.searcher = IndexSearcher(DirectoryReader.open(self.dir))
    self.preprocess = preprocess

    # Initialize `QueryParser`
    self.parser = QueryParser("description", analyzer)

def getHitCount(self, fieldName, searchString):
    reader = DirectoryReader.open(self.dir)  # readOnly = True
    print '%s total docs in index' % reader.numDocs()

    searcher = IndexSearcher(reader)  # readOnly = True
    t = Term(fieldName, searchString)
    query = TermQuery(t)
    hitCount = len(searcher.search(query, 50).scoreDocs)
    reader.close()

    print "%s total matching documents for %s\n---------------" \
        % (hitCount, searchString)
    return hitCount

def retrieveDocs(q):
    STORE_DIR = "IndexFiles.index"
    lucene.initVM()
    analyzer = StandardAnalyzer()
    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    dir = SimpleFSDirectory(File(STORE_DIR).toPath())
    reader = DirectoryReader.open(dir)
    searcher = IndexSearcher(reader)
    query = QueryParser("contents", analyzer).parse(q)
    MAX = 1000
    hits = searcher.search(query, MAX)

    nonDiverse = []
    docsToScores = {}
    # create a list of html files with relevant websites
    rQ = []
    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString()
        doc = searcher.doc(hit.doc)
        if len(nonDiverse) < 10:
            nonDiverse.append(new_urls[str(hit.doc)])
        # find the document that corresponds to the html website and
        # append it to a list for min distance
        website = new_urls[str(hit.doc)]
        # html_files numbers of the hit websites added to rQ
        rQ.append(inv_map[website])
        docsToScores[int(inv_map[website])] = hit.score
        print(inv_map[website])
    return docsToScores, rQ, nonDiverse

def search(music_tags, dir_path):
    lucene.initVM()
    query_str = "content:" + " ".join(music_tags)
    index_dir = SimpleFSDirectory(Paths.get(dir_path))
    lucene_analyzer = StandardAnalyzer()
    lucene_searcher = IndexSearcher(DirectoryReader.open(index_dir))
    my_query = QueryParser("content", lucene_analyzer).parse(query_str)
    total_hits = lucene_searcher.search(my_query, 50)
    for hit in total_hits.scoreDocs:
        doc = lucene_searcher.doc(hit.doc)
        print doc

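# Usage sketch: a minimal, hypothetical call to search(). The directory
# 'music_index/' and the tag list are illustrative; the index is assumed to
# have a "content" field.
search(['jazz', 'piano', 'relaxing'], 'music_index/')
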
def init_search(search_content, vm_env):
    # attach this thread to the already-running JVM; calling lucene.initVM()
    # again here would fail once the VM exists
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index"
    print 'lucene', lucene.VERSION
    # base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    result_s = run(searcher, analyzer, search_content)
    del searcher
    print(result_s)
    return result_s

def runDrillDown(self):
    # open readers
    taxo = DirectoryTaxonomyReader(self.taxoDir)
    indexReader = DirectoryReader.open(self.indexDir)

    for drilldown in drilldownCategories:
        print "search with drilldown: %s" % '/'.join(drilldown)
        facetRes = SimpleSearcher.searchWithDrillDown(drilldown, indexReader,
                                                      taxo, self.facets_config)

    # close readers
    taxo.close()
    indexReader.close()

    # return the result of the last drill-down search
    return facetRes

def func_nr(command):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_tb_new"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    query = QueryParser(Version.LUCENE_CURRENT, "name", analyzer).parse(command)
    scoreDocs = searcher.search(
        query, 50,
        Sort([SortField("rate", SortField.Type.DOUBLE, True)])).scoreDocs
    results = process(scoreDocs, searcher)
    return results

def similarityOfSynopsis(self):
    directory = SimpleFSDirectory(File(settings.SYNOPSIS_INDEX))
    ireader = DirectoryReader.open(directory)
    searcher = IndexSearcher(ireader)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)

    for root, dirnames, filenames in os.walk(settings.SYNOPSIS):
        # sort the synopsis files numerically
        filenames = [int(item) for item in filenames]
        filenames.sort()
        filenames = [str(item) for item in filenames]

        for filename in filenames:
            path = os.path.join(root, filename)
            major_movie = models.Movie.objects.get(pk=filename)
            with open(path, 'r') as moviedoc:
                content = moviedoc.read().replace('\n', ' ')
            content = re.sub('[^A-Za-z0-9 ]+', '', content)

            # retry with a doubled clause limit until the query fits
            while True:
                try:
                    query = queryParser.parse(QueryParser.escape(content))
                except Exception as e:
                    self.boolean_query.setMaxClauseCount(
                        self.boolean_query.maxClauseCount * 2)
                    print self.boolean_query.maxClauseCount
                    continue
                break

            topDocs = searcher.search(query, len(filenames))
            scoreDocs = topDocs.scoreDocs
            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                movie_id = int(doc.get(FIELD_PATH))
                if movie_id <= major_movie.id:
                    continue
                minor_movie = models.Movie.objects.get(pk=movie_id)
                try:
                    similarity = models.Similarities.objects.filter(
                        first_movie=major_movie, second_movie=minor_movie).first()
                    if not similarity:
                        similarity = models.Similarities.objects.filter(
                            first_movie=minor_movie, second_movie=major_movie).first()
                    similarity.synopsis = scoreDoc.score
                    similarity.save()
                except Exception as e:
                    print major_movie.id, minor_movie.id
                    raise e
            print u"{0} completed.".format(major_movie.id)

def define_search_params(STORE_DIR, FIELD_CONTENTS, TERM):
    store = SimpleFSDirectory(Paths.get(STORE_DIR))
    reader = DirectoryReader.open(store)
    searcher = IndexSearcher(reader)

    # Get the analyzer
    analyzer = WhitespaceAnalyzer()

    # Construct a query parser; we specify which field to search in
    queryParser = QueryParser(FIELD_CONTENTS, analyzer)

    # Create the query
    query = queryParser.parse(TERM)
    return searcher, reader, query

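# Usage sketch: one hypothetical way to consume the triple returned by
# define_search_params(). 'index/', 'contents', and the term are placeholder
# values; a PyLucene VM is assumed to be running already.
searcher, reader, query = define_search_params('index/', 'contents', 'lucene')
for hit in searcher.search(query, 10).scoreDocs:
    print(searcher.doc(hit.doc))
reader.close()
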
def search(command):
    STORE_DIR = "index"
    # base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = MMapDirectory(Paths.get(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = SmartChineseAnalyzer()
    ans = run(searcher, analyzer, command)
    del searcher
    return ans

# vm_env = lucene.initVM(vmargs=['-Djava.awt.headless=true'])
# for y in search('二三四五'):
#     print(y)

def main():
    store_dir = "lucene_index"
    if not os.path.isdir(store_dir):
        raise RuntimeError("Cannot find Lucene index at: {}".format(store_dir))
    store = SimpleFSDirectory(Paths.get(store_dir))
    searcher = IndexSearcher(DirectoryReader.open(store))
    analyzer = EnglishAnalyzer()

    # query_string = "House is a simple fact about science reaction"
    # query_string = get_random_question()
    # search(query_string, analyzer, searcher)
    # by_random_question(analyzer, searcher)
    annotate_all_questions(analyzer, searcher)
    del searcher

def func_perfumer(command):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_tb_new"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    query = QueryParser(Version.LUCENE_CURRENT, "perfumer", analyzer).parse(command)
    scoreDocs = searcher.search(query, 233).scoreDocs
    results = process(scoreDocs, searcher)
    return results

def populate_data(path, args):
    name = path.split('/')[-1]
    print(f"Processing {name}")

    all_senses = {args.lang1: {}, args.lang2: {}}
    if args.pivot_lang is not None:
        all_senses[args.pivot_lang] = {}

    all_translation_mappings = []
    if args.pivot_lang is not None:
        all_translation_pivot1_mappings = []
        all_translation_pivot2_mappings = []

    store = SimpleFSDirectory(Paths.get(path))
    dr = DirectoryReader.open(store)
    searcher = IndexSearcher(dr)
    analyzer = StandardAnalyzer()
    query = QueryParser("title", analyzer).parse("*:*")
    topDocs = searcher.search(query, 1000000000)

    for scoreDoc in topDocs.scoreDocs:
        doc = scoreDoc.doc
        language_lemmas = searcher.doc(doc).getValues("LANGUAGE_LEMMA")
        sense_ids = searcher.doc(doc).getValues("ID_SENSE")
        for language_lemma, sense_id in zip(language_lemmas, sense_ids):
            lang = language_lemma[:2]
            lemma = language_lemma[3:]
            # accumulate senses per language rather than overwriting the dict
            if lang in LANGUAGES_OF_INTEREST:
                all_senses[lang][sense_id] = lemma
            if args.pivot_lang is not None and lang == args.pivot_lang:
                all_senses[args.pivot_lang][sense_id] = lemma

        translation_mappings = searcher.doc(doc).getValues("TRANSLATION_MAPPING")
        create_translation_mapping(translation_mappings, all_senses,
                                   all_translation_mappings, LANGUAGES_OF_INTEREST)
        if args.pivot_lang is not None:
            create_translation_mapping(translation_mappings, all_senses,
                                       all_translation_pivot1_mappings,
                                       [args.lang1, args.pivot_lang])
            create_translation_mapping(translation_mappings, all_senses,
                                       all_translation_pivot2_mappings,
                                       [args.lang2, args.pivot_lang])

    with open(f'{args.internal_data_path}/{name}.pkl', 'wb') as output:
        pickle.dump(all_translation_mappings, output)
    if args.pivot_lang is not None:
        with open(f'{args.internal_data_path}/{name}_{args.lang1}-{args.pivot_lang}.pkl', 'wb') as output:
            pickle.dump(all_translation_pivot1_mappings, output)
        with open(f'{args.internal_data_path}/{name}_{args.lang2}-{args.pivot_lang}.pkl', 'wb') as output:
            pickle.dump(all_translation_pivot2_mappings, output)

def __init__(self):
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    self.base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    self.directory = SimpleFSDirectory(File(os.path.join(self.base_dir, INDEX_DIR)))
    self.reader = DirectoryReader.open(self.directory)
    self.searcher = IndexSearcher(self.reader)
    self.numDocs = self.reader.maxDoc()
    self.mlt = MoreLikeThis(self.reader)
    self.mlt.setMinTermFreq(1)
    self.mlt.setMinDocFreq(1)

def GET(self):
    aud = web.input()
    f = login_aud()
    audpath = str(aud['uploadfile'])
    STORE_DIR = "index2"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    geshou, geming, zhuanji, liupai, shijian, jianjie, geci, imgurl = run_aud(
        searcher, analyzer, audpath)
    del searcher
    return render.result_aud(f, geshou, geming, zhuanji, imgurl, liupai,
                             shijian, jianjie, geci)

def main():
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    queries = makeQueryList(args["queryFile"])
    print 'lucene', lucene.VERSION
    print "\n"
    directory = SimpleFSDirectory(Paths.get(os.getcwd(), INDEX_DIR))
    print directory.getDirectory()
    searcher = IndexSearcher(DirectoryReader.open(directory))
    searcher.setSimilarity(ClassicSimilarity())
    analyzer = StandardAnalyzer()
    run(searcher, analyzer, queries)
    del searcher

def search(self, query):
    lucene.initVM()
    luceneDirectory = "/index/"
    path = str(os.path.abspath(os.getcwd()) + luceneDirectory)
    directory = FSDirectory.open(Paths.get(path))
    reader = DirectoryReader.open(directory)
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer()

    print("Searching for '" + query + "'")

    fields_to_search = ["text", "page title", "date"]
    filter_date = 'date:"May 25"'
    filtered_query = filter_date + " AND " + query

    parser = MultiFieldQueryParser(fields_to_search, analyzer)
    updated_query = MultiFieldQueryParser.parse(parser, filtered_query)
    scored_documents = searcher.search(updated_query, 10).scoreDocs  # array of docs

    print("Found " + str(len(scored_documents)) + " matches in the collection.")

    results = []
    for doc in scored_documents:
        scoredTweet = dict()
        scoredTweet['score'] = doc.score
        result = searcher.doc(doc.doc)
        scoredTweet['username'] = result.get("username")
        scoredTweet['tweet_body'] = result.get("text")
        scoredTweet['date'] = result.get("date")
        results.append(scoredTweet)
        print(scoredTweet)
    return results

def define_search_params(STORE_DIR, FIELD_CONTENTS):
    # indexPath = File(STORE_DIR).toPath()
    # indexDir = FSDirectory.open(indexPath)
    store = SimpleFSDirectory(Paths.get(STORE_DIR))
    reader = DirectoryReader.open(store)
    searcher = IndexSearcher(reader)

    # The analyzer and query parser could be constructed here as well:
    # analyzer = WhitespaceAnalyzer()
    # queryParser = QueryParser(FIELD_CONTENTS, analyzer)
    return searcher, reader

def __init__(self, store_dir, analyzer, preprocess=lambda x: x):
    '''
    Input:
        `store_dir`: directory storing the Lucene index
        `analyzer`: analyzer required to split the query
        `preprocess`: user-defined preprocess function
    '''
    # Initialize `IndexSearcher`
    self.dir = SimpleFSDirectory(File(store_dir).toPath())
    self.searcher = IndexSearcher(DirectoryReader.open(self.dir))
    self.preprocess = preprocess

    # Store the analyzer
    self.analyzer = analyzer

    # Initialize `Formatter`
    self.formatter = SimpleHTMLFormatter('<em>', '</em>')

def __init__(self, lang):
    lucene.initVM()
    if lang == 'zh':
        indexDir = SimpleFSDirectory(Paths.get(str(config.IDX_SSQA)))
        analyzer = SmartChineseAnalyzer()
    else:
        raise ValueError('lang should be "zh" or "en", {} is invalid!'.format(lang))
    self.reader = DirectoryReader.open(indexDir)
    self.searcher = IndexSearcher(self.reader)
    self.searcher.setSimilarity(mySimilarity())
    self.analyzer = analyzer
    logger.debug('search similarity func: {}'.format(self.searcher.getSimilarity()))

def search_trip(command):
    '''command must be encoded in unicode'''
    STORE_DIR = "index_trip"
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File(STORE_DIR))
    folders = {
        'parsed_ctrip': ['source', 'location', 'introduction', 'score', 'img_list'],
        'parsed_qunar': ['location', 'rank', 'score', 'time', 'introduction', 'img_list'],
        'eic_mfw': ['location', 'introduction', 'img_list'],
    }
    readers = constructReaders(folders)

    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

    command_dict = parseCommand(command)
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(querys, 50).scoreDocs
    print 'total: %s' % len(scoreDocs)

    # keep the file(s) with the highest rank
    maxf = []
    maxrank = -1000.0
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        filename = doc.get('filename')
        rank = ranker(command_dict, getInfo(folders, readers, filename))
        if rank > maxrank:
            maxf = [filename]
            maxrank = rank
        elif rank == maxrank:
            maxf.append(filename)
    del searcher

    if len(maxf) == 0:
        print "error in searchtrip.py: no result while searching", command_dict.get('location', '')
        return "Interior Error"
    elif len(maxf) != 1:
        print "warning in searchtrip.py: multiple results when searching", command_dict.get('location', '')
    return getInfo(folders, readers, maxf[0])

def main(args):
    global verbose
    verbose = args.verbose

    if verbose:
        logger.info(f'Read {args.dir_index}')
    directory = SimpleFSDirectory.open(Paths.get(args.dir_index))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    reader = searcher.getIndexReader()

    if verbose:
        logger.info(f'Write to {args.path_output}')
    with open(args.path_output, 'w') as f:
        for idx in trange(reader.maxDoc()):
            doc = reader.document(idx)
            babelnet_id = doc.get('ID')
            synset_id = doc.get('SYNSET_ID')
            pos = doc.get('POS')
            synset_type = doc.get('TYPE')
            main_sense = doc.get('MAIN_SENSE')
            categories = list(doc.getValues('CATEGORY'))
            translation_mappings = list(doc.getValues('TRANSLATION_MAPPING'))
            images = list(doc.getValues('IMAGE'))

            lemmas = doc.getValues('LEMMA')
            forms = []
            for i in range(len(lemmas)):
                forms.append({
                    'lemma': lemmas[i],
                    'source': doc.getValues('LEMMA_SOURCE')[i],
                    'lang': doc.getValues('LEMMA_LANGUAGE')[i],
                    'weight': doc.getValues('LEMMA_WEIGHT')[i],
                    'sense_key': doc.getValues('LEMMA_SENSEKEY')[i],
                })

            entry = {
                'id': babelnet_id,
                'synset': synset_id,
                'pos': pos,
                'type': synset_type,
                'main_sense': main_sense,
                'categories': categories,
                'translation_mappings': translation_mappings,
                'images': images,
                'forms': forms,
            }
            f.write(json.dumps(entry, ensure_ascii=False) + '\n')
    return 0

def __init__(self, LUCENE_INDEX_DIR, similarity='BM25', lucene_vm_flag=False):
    if not lucene_vm_flag:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.lucene_vm_init = True
    self.index_dir = LUCENE_INDEX_DIR
    self.index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
    self.analyzer = StandardAnalyzer()
    self.config = IndexWriterConfig(self.analyzer)
    self.reader = DirectoryReader.open(self.index_mm)
    self.searcher = IndexSearcher(self.reader)
    self.dict_term_freq = {}
    self.dict_doc_field_title = {}
    if similarity == 'BM25':
        self.searcher.setSimilarity(BM25Similarity())

def main(storeDir):
    reader = DirectoryReader.open(storeDir)
    numDocs = reader.numDocs()
    print("n_docs:", numDocs)

    for i in range(numDocs):
        tvec = reader.getTermVector(i, 'body')
        if tvec is not None:
            termsEnum = tvec.iterator()
            vec = {}
            for term in BytesRefIterator.cast_(termsEnum):
                dpEnum = termsEnum.postings(None)
                dpEnum.nextDoc()
                vec[term.utf8ToString()] = dpEnum.freq()
            print(vec)
    reader.close()

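# Usage sketch: a hypothetical driver for main() above. 'index/' is an
# illustrative path; the index is assumed to store term vectors for 'body'.
import lucene
from java.nio.file import Paths
from org.apache.lucene.store import SimpleFSDirectory

lucene.initVM(vmargs=['-Djava.awt.headless=true'])
main(SimpleFSDirectory(Paths.get('index/')))
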
def runSimple(self):
    # open readers
    taxo = DirectoryTaxonomyReader(self.taxoDir)
    indexReader = DirectoryReader.open(self.indexDir)

    for term in searchValues:
        print "\nsearch by term '%s' ..." % term
        facetRes = SimpleSearcher.searchWithTerm(term, indexReader, taxo,
                                                 self.facets_config)

    print "\nsearch all documents ..."
    facetRes = SimpleSearcher.searchWithFacets(indexReader, taxo, self.facets_config)

    # close readers
    taxo.close()
    indexReader.close()

    # return the result of the final search
    return facetRes

def GetDocField(self, docIdx, field=CONTENT):
    """
    Get the document's field.

    :Parameters:
    - `docIdx`: Document's index ID (Int).
    - `field`: Field to retrieve (Str).

    :Returns:
    - Document's field (Str).
    """
    reader = DirectoryReader.open(self.__indexDir)
    doc = reader.document(docIdx)
    content = doc.get(field)
    reader.close()
    return content

def __init__(self, path):
    print('Searcher initialized...')
    self.path = path
    self.analyzer = SmartChineseAnalyzer()
    # self.analyzer = WhitespaceAnalyzer(Version.LATEST)
    self.reader = DirectoryReader.open(
        SimpleFSDirectory(Paths.get(self.path)))
    self.searcher = IndexSearcher(self.reader)
    self.thu = thulac.thulac(deli='/')

    # load a pre-trained word2vec model, or train one if it does not exist
    file = Path('w2v.model')
    if file.is_file():
        print('Model was already trained... loading model')
        self.w2v_model = Word2Vec.load('w2v.model')
    else:
        self.model_train()
        print('Model trained...')