def runSearch(self, runCount, mainThread=False):
    """ search for runCount number of times """

    # problem: if there are any assertion errors in the child
    # thread, the calling thread is not notified and may still
    # consider the test case passed. We are using self.totalQueries
    # to double check that work has actually been done.

    if not mainThread:
        getVMEnv().attachCurrentThread()
    time.sleep(0.5)

    searcher = self.getSearcher()
    try:
        self.query = PhraseQuery()
        for word, count in self.testData[0:runCount]:
            query = TermQuery(Term("field", word))
            topDocs = searcher.search(query, 50)
            self.assertEqual(topDocs.totalHits, count)

            self.lock.acquire()
            self.totalQueries += 1
            self.lock.release()
    finally:
        del searcher
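A minimal sketch (not part of the original test) of how the calling thread can use self.totalQueries to detect a worker thread that died on an assertion; it assumes the same attributes (lock, totalQueries, runSearch, assertEqual) as the snippet above, and the method name and parameters are hypothetical.

def testSearchFromThreads(self, perThread=10, threadCount=4):
    import threading

    self.totalQueries = 0
    threads = [threading.Thread(target=self.runSearch, args=(perThread,))
               for _ in range(threadCount)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

    # run once more on the calling thread, which is already attached to the JVM
    self.runSearch(perThread, mainThread=True)

    # if a child thread failed silently, its queries are missing from the total
    self.assertEqual(self.totalQueries, perThread * (threadCount + 1))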
def run(self):
    owf = "%sresult%s.csv" % (WRITE_DIR, self.i)
    print owf
    t = open(owf, "w")
    getVMEnv().attachCurrentThread()
    searcher = lucene.IndexSearcher(directory, True)
    a = 0
    for line in self.content:
        query = lucene.QueryParser(lucene.Version.LUCENE_CURRENT,
                                   'content', analyzer).parse(line)
        results = searcher.search(query, None, 1)
        score_docs = results.scoreDocs
        b = 0
        for score_doc in score_docs:
            doc = searcher.doc(score_doc.doc)
            b += 1
            result = doc['tag']
        t.write("%s,\"%s\"\n" % (self.label[a], result.strip()))
        a += 1
        if a % 10 == 0:
            print "Thread %s: finished %s lines (%s of the work done)" % (
                self.i, a, 1.0 * a / len(self.content))
def matchE(request):
    lucene.getVMEnv().attachCurrentThread()
    try:
        student = {}
        student['name'] = request.POST['student_name']
        student['interest'] = request.POST['student_interest']
        student['affiliation'] = request.POST['student_affiliation']
    except KeyError:
        return render_to_response('index.html',
                                  {'error_msg': 'missing field'},
                                  context_instance=RequestContext(request))
    else:
        prof_matcher = matcher()
        prof_list = prof_matcher.getProfMatch(student)
        request.session['prof_list'] = prof_list
        request.session['student'] = student
        info_list = []
        for i, prof in enumerate(prof_list):
            score, explainList = prof_matcher.explainPos(i + 1)
            info_list.append((prof, score, explainList))
        for prof in prof_list:
            print prof['name']
            aff_count = prof['affiliation'].count(student['affiliation'])
            prof['co_count'] = aff_count
        student = request.session.get('student')
        print 'in match', student, prof_list[0].get('name')
        return render_to_response('explain.html',
                                  {'info_list': info_list, 'student': student})
def __init__(self, indexDir, doClear=True, computeLengthNorm=False):
    # if not jpype.isJVMStarted():
    #     lucene.initVM()
    lucene.getVMEnv().attachCurrentThread()

    self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    # self.analyzer = LimitTokenCountAnalyzer(self.analyzer, 100678)  # is here?
    self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
    self.config.setRAMBufferSizeMB(256.0)  # flush automatically once the RAM buffer reaches 256 MB
    self.config.setMaxBufferedDocs(10000)  # flush automatically after at most 10000 buffered docs
    if not computeLengthNorm:
        sim = CustomSimilarity()
        self.config.setSimilarity(sim)

    self.path = os.path.join(INDEX_PATH, indexDir)
    # print self.path
    # path.mkdir(self.path)
    # if doClear:
    #     self.clearExistingIndex()
    self.store = SimpleFSDirectory(File(self.path))
    self.writer = IndexWriter(self.store, self.config)

    self.t1 = FieldType()  # field type t1: stored, not tokenized
    self.t1.setIndexed(True)
    self.t1.setStored(True)
    self.t1.setTokenized(False)
    self.t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    self.t2 = FieldType()  # field type t2: tokenized, not stored
    self.t2.setIndexed(True)
    self.t2.setStored(False)
    self.t2.setTokenized(True)
    self.t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
def matchE(request):
    lucene.getVMEnv().attachCurrentThread()
    try:
        student = {}
        student['name'] = request.POST['student_name']
        student['interest'] = request.POST['student_interest']
        student['affiliation'] = request.POST['student_affiliation']
    except KeyError:
        return render_to_response('index.html',
                                  {'error_msg': 'missing field'},
                                  context_instance=RequestContext(request))
    else:
        prof_matcher = matcher()
        prof_list = prof_matcher.getProfMatch(student)
        request.session['prof_list'] = prof_list
        request.session['student'] = student
        info_list = []
        for i, prof in enumerate(prof_list):
            score, explainList = prof_matcher.explainPos(i + 1)
            info_list.append((prof, score, explainList))
        for prof in prof_list:
            aff_count = prof['affiliation'].count(student['affiliation'])
            prof['co_count'] = aff_count
        student = request.session.get('student')
        return render_to_response('explain.html', {
            'info_list': info_list,
            'student': student
        })
def initIndex(tbl):
    lucene.getVMEnv().attachCurrentThread()
    writer = getWriter(getStore(), getAnalyzer(), True)

    STORE = lucene.Field.Store.YES
    COMPRESS = lucene.Field.Store.COMPRESS
    TOKENIZED = lucene.Field.Index.TOKENIZED
    UN_TOKENIZED = lucene.Field.Index.UN_TOKENIZED

    rowset = tbl.select()
    acc = 0
    for row in rowset:
        acc += 1
        if acc == 100:
            acc = 0
            sys.stdout.write(".")
            sys.stdout.flush()
        # Begin Lucene copy section
        doc = lucene.Document()
        doc.add(lucene.Field("id", unicode(row.id), STORE, UN_TOKENIZED))
        doc.add(lucene.Field('data', unicode(row.data), COMPRESS, TOKENIZED))
        doc.add(lucene.Field('source', unicode(row.source), COMPRESS, TOKENIZED))
        # End Lucene copy section
        writer.addDocument(doc)
    print "|"
    writer.optimize(True)
    writer.close()
def process_q_test(q, out_q):
    lucene.initVM()
    lucene.getVMEnv().attachCurrentThread()
    index = DirectoryReader.open(SimpleFSDirectory(Paths.get(robust_index_dir)))
    searcher = IndexSearcher(index)
    searcher.setSimilarity(BM25Similarity())
    analyzer = EnglishAnalyzer()
    qparser = QueryParser("contents", analyzer)
    preprocessor = Preprocess()

    while not exitFlag:
        qid, query = q.get()
        tname = multiprocessing.current_process().name
        # print(tname, qid, query)
        if query == "DONE":
            break
        try:
            # dids, scores = get_lm_matched_docs(query, searcher, qparser, 2000)
            # if len(dids) >= 10:
            #     out_q.put((qid, dids, scores))
            dids_text = get_lm_doc_snippets(query, searcher, qparser,
                                            analyzer, preprocessor)
            out_q.put((qid, dids_text))
        except:
            print('%s exception %s, %s' % (tname, qid, query))
def reindex(row):
    if "id" not in row.__dict__:
        return
    lucene.getVMEnv().attachCurrentThread()
    try:
        writer = getWriter(getStore(), getAnalyzer())
        # print "got Writer"
        STORE = lucene.Field.Store.YES
        COMPRESS = lucene.Field.Store.COMPRESS
        TOKENIZED = lucene.Field.Index.TOKENIZED
        UN_TOKENIZED = lucene.Field.Index.UN_TOKENIZED

        # Begin Lucene copy section
        doc = lucene.Document()
        # print "created new document"
        doc.add(lucene.Field("id", unicode(row.id), STORE, UN_TOKENIZED))
        # print "added id field"
        doc.add(lucene.Field('data', unicode(row.data), COMPRESS, TOKENIZED))
        doc.add(lucene.Field('source', unicode(row.source), COMPRESS, TOKENIZED))
        # End Lucene copy section

        writer.deleteDocuments(lucene.Term("id", unicode(row.id)))
        # print "deleted existing document"
        writer.addDocument(doc)
        # print "added document"
        writer.optimize(True)
        # print "optimized index"
        writer.close()
        # print "closed writer"
    except:
        print "Failed in reindex of " + unicode(row.id) + "!"
    delLock()
def query_network():
    """Handle API request '/network'.

    API Request Parameters
    ----------------------
        ids : list of int
        nodes_limit : int
        edges_limit : int
        include_user_mentions : bool

    API Response Keys
    -----------------
        status : string
        num_of_entries : int
        edges : dict
            canonical_url : string
            date_published : string formatted datetime
            domain : string
            from_user_id : string
            from_user_screen_name : string
            id : int
            is_mention : bool
            site_type : {'claim', 'fact_checking'}
            title : string
            to_user_id : string
            to_user_screen_name : string
            tweet_created_at : string formatted datetime
            tweet_id : string
            tweet_type : {'origin', 'retweet', 'quote', 'reply'}
    """
    lucene.getVMEnv().attachCurrentThread()
    q_network_schema = Schema({
        'ids': Use(flask.json.loads),
        Optional('nodes_limit', default=1000): And(Use(int), lambda i: i > 0),
        Optional('edges_limit', default=12500): And(Use(int), lambda i: i > 0),
        Optional('include_user_mentions', default=True): And(
            unicode, Use(lambda s: s.lower()),
            lambda s: s in ('true', 'false'),
            Use(lambda s: True if s == 'true' else False)),
    })
    q_kwargs = copy_req_args(request.args)
    try:
        q_kwargs = q_network_schema.validate(q_kwargs)
        df = db_query_network(engine, **q_kwargs)
        if len(df) == 0:
            raise APINoResultError('No edge could be built!')
        response = dict(
            status='OK',
            num_of_entries=len(df),
            edges=flask.json.loads(df.to_json(**TO_JSON_KWARGS)))
    except SchemaError as e:
        response = dict(status='ERROR', error=str(e))
    except APINoResultError as e:
        response = dict(status='No result error', error=str(e))
    except Exception as e:
        logger.exception(e)
        response = dict(status='ERROR', error='Server error, query failed')
    return flask.jsonify(response)
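A hedged client-side sketch of calling the '/network' endpoint documented in the docstring above; the host, port, and route registration are assumptions, while the parameter names come from the docstring and schema.

import requests

resp = requests.get(
    "http://localhost:5000/network",           # assumed host/port for the flask app
    params={"ids": "[1, 2, 3]",                # JSON list, parsed by Use(flask.json.loads)
            "nodes_limit": 100,
            "edges_limit": 500},
)
data = resp.json()
print("%s: %s entries" % (data["status"], data.get("num_of_entries")))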
def get_instance():
    """ Static access method. """
    if QueryLuceneManager.__instance is None:
        lucene.initVM()
        QueryLuceneManager.__instance = QueryLucene()
    lucene.getVMEnv().attachCurrentThread()
    return QueryLuceneManager.__instance
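A hypothetical usage sketch: because get_instance() re-attaches the calling thread every time, worker threads can simply request the shared singleton. The worker function and its use of the returned QueryLucene object are assumptions, not part of the original class.

import threading

def worker(text):
    ql = QueryLuceneManager.get_instance()   # attaches this thread to the JVM
    # ... use ql here; the concrete QueryLucene query API is assumed, not shown
    print("%s got %r" % (threading.current_thread().name, ql))

threads = [threading.Thread(target=worker, args=("some query",)) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()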
def __getitem__(self, key):
    try:
        indexer = super(GetIndexers, self).__getitem__(key)
        self[key] = indexer
    except KeyError:
        indexer = db_indexers.get(key, None)
        if not indexer:
            raise KeyError, "Database not found"
    lucene.getVMEnv().attachCurrentThread()
    return indexer
def SearchFiles(command):
    STORE_DIR = "lucene/index"
    getVMEnv().attachCurrentThread()
    # print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    rankedfiles = run(searcher, analyzer, command)
    searcher.close()
    return rankedfiles
def query_top_spreaders():
    """Handle API request '/top-user'.

    API Request Parameters
    ----------------------
        upper_day : string formatted datetime
        most_recent : bool

    API Response Keys
    -----------------
        status : string
        num_of_entries : int
        spreaders : dict
            bot_score : float
            number_of_tweets : int
            site_type : {'claim', 'fact_checking'}
            spreading_type : {'active', 'influencial'}
            upper_day : string formatted datetime
            user_id : int
            user_raw_id : string
            user_screen_name : string
    """
    lucene.getVMEnv().attachCurrentThread()
    yesterday = datetime.utcnow().date() - timedelta(days=1)
    yesterday = yesterday.strftime('%Y-%m-%d')
    q_top_spreaders_schema = Schema({
        Optional('upper_day', default=yesterday):
            And(Regex('^\d{4}-\d{2}-\d{2}$'),
                Use(dateutil.parser.parse),
                error='Invalid date, should be yyyy-mm-dd format'),
        Optional('most_recent', default=True):
            And(unicode, Use(lambda s: s.lower()),
                lambda s: s in ('true', 'false'),
                Use(lambda s: True if s == 'true' else False)),
    })
    q_kwargs = copy_req_args(request.args)
    try:
        q_kwargs = q_top_spreaders_schema.validate(q_kwargs)
        df = db_query_top_spreaders(engine, **q_kwargs)
        if len(df) == 0:
            raise APINoResultError('No top spreader found!')
        response = dict(
            status='OK',
            num_of_entries=len(df),
            spreaders=flask.json.loads(df.to_json(**TO_JSON_KWARGS)))
    except SchemaError as e:
        response = dict(status='ERROR', error=str(e))
    except APINoResultError as e:
        response = dict(status='No result error', error=str(e))
    except Exception as e:
        logger.exception(e)
        response = dict(status='ERROR', error='Server error, query failed')
    return flask.jsonify(response)
def rowDeleted(*args, **kw):
    print "id: " + str(args[0].id) + " is scheduled for termination"
    lucene.getVMEnv().attachCurrentThread()
    try:
        writer = getWriter(getStore(), getAnalyzer())
        writer.deleteDocuments(lucene.Term("id", unicode(args[0].id)))
        writer.optimize(True)
        writer.close()
    except:
        print "Failed in deletion of " + unicode(args[0].id) + " from lucene"
    delLock()
def run(self): print("Starting " + self.name) lucene.getVMEnv().attachCurrentThread() index = DirectoryReader.open( SimpleFSDirectory(Paths.get(robust_index_dir))) searcher = IndexSearcher(index) searcher.setSimilarity(BM25Similarity()) analyzer = EnglishAnalyzer() qparser = QueryParser("contents", analyzer) # process_query(self.name, self.q, self.out_q, searcher, qparser) print("Exiting " + self.name)
def query_top_articles():
    """Handle API request '/top-articles'.

    API Request Parameters
    ----------------------
        upper_day : string formatted datetime
        most_recent : bool

    API Response Keys
    -----------------
        status : string
        num_of_entries : int
        articles : dict
            canonical_url : string
            date_captured : string formatted datetime
            number_of_tweets : int
            site_type : {'claim', 'fact_checking'}
            title : string
            upper_day : string formatted datetime
    """
    lucene.getVMEnv().attachCurrentThread()
    yesterday = datetime.utcnow().date() - timedelta(days=1)
    yesterday = yesterday.strftime('%Y-%m-%d')
    q_top_article_schema = Schema({
        Optional('upper_day', default=yesterday):
            And(Regex('^\d{4}-\d{2}-\d{2}$'),
                Use(dateutil.parser.parse),
                error='Invalid date, should be yyyy-mm-dd format'),
        Optional('most_recent', default=True):
            And(unicode, Use(lambda s: s.lower()),
                lambda s: s in ('true', 'false'),
                Use(lambda s: True if s == 'true' else False)),
        Optional('exclude_tags', default=[]):
            And(Use(eval), error='Invalid exclude_tags input format'),
    })
    q_kwargs = copy_req_args(request.args)
    try:
        q_kwargs = q_top_article_schema.validate(q_kwargs)
        df = db_query_top_articles(engine, **q_kwargs)
        if len(df) == 0:
            raise APINoResultError('No top article found!')
        response = dict(
            status='OK',
            num_of_entries=len(df),
            articles=flask.json.loads(df.to_json(**TO_JSON_KWARGS)))
    except SchemaError as e:
        response = dict(status='ERROR', error=str(e))
    except APINoResultError as e:
        response = dict(status='No result error', error=str(e))
    except Exception as e:
        logger.exception(e)
        response = dict(status='ERROR', error='Server error, query failed')
    return flask.jsonify(response)
def multiFieldsSearch(self, query, sim):
    lucene.getVMEnv().attachCurrentThread()
    parser = MultiFieldQueryParser(
        ["content_section", "title_section", 'title_article'], self.analyzer)
    parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
    query = MultiFieldQueryParser.parse(parser, QueryParser.escape(query))
    self.searcher.setSimilarity(sim)
    hits = self.searcher.search(query, 6).scoreDocs
    return hits
def hello(query=None):
    if query:
        lucene.getVMEnv().attachCurrentThread()
        parsed_query = transform(query)
        results = find_results(query, reader)
        return render_template('page.pug',
                               parsed_query=parsed_query,
                               results=results,
                               shown_fragments=3)
    return render_template('page.pug')
def startOaiPmh(self, portNumber, oaiJazz, storageComponent, register):
    getVMEnv().attachCurrentThread()
    with Reactor() as reactor:
        server = be(
            (Observable(),
                (ObservableHttpServer(reactor, portNumber),
                    (OaiPmh(repositoryName='repositoryName',
                            adminEmail='adminEmail',
                            batchSize=2,
                            supportXWait=True),
                        (register,),
                        (oaiJazz,
                            (register,),
                        ),
                        (storageComponent,)
                    )
                )
            )
        )
        list(compose(server.once.observer_init()))
        self._loopReactor(reactor)
def __call__(self, request):
    # Code to be executed for each request before
    # the view (and later middleware) are called.
    try:
        # get the vm context and use it for this thread
        lucene.getVMEnv().attachCurrentThread()
    except:
        lucene.initVM()

    response = self.get_response(request)

    # Code to be executed for each request/response after
    # the view is called.
    return response
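A hypothetical settings.py wiring for the middleware above; the dotted module path, the class name, and the place where the JVM is first started are assumptions. The vmargs list form mirrors the initVM call used elsewhere in this collection.

# settings.py (sketch)
import lucene

lucene.initVM(vmargs=['-Djava.awt.headless=true'])   # start the JVM once per process

MIDDLEWARE = [
    'django.middleware.common.CommonMiddleware',
    'myproject.middleware.LuceneAttachMiddleware',   # hypothetical path to the class above
]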
def run(cls, args):
    try:
        # print(args)
        args = cls.args_schema.validate(args)
    except SchemaError as e:
        sys.exit(e)
    session = Session()
    # make sure lucene is initialized
    lucene.initVM()
    lucene.getVMEnv().attachCurrentThread()
    if args['--index'] is True:
        configure_logging(
            'lucene.index', console_level=args['--console-log-level'])
        mgid = get_or_create_m(
            session,
            MetaInfo,
            data=dict(
                name='article_group_id_lucene_index',
                value='0',
                value_type='int',
                description='article.group_id used for lucene index'),
            fb_uk='name')
        if args['--mode'] == 'create':
            mgid.set_value(0)
            session.commit()
        logger.debug('Indexing started.. Getting articles..')
        q = """
            SELECT DISTINCT ON (a.group_id) a.id, a.group_id,
                a.canonical_url, a.title, a.meta, a.content,
                coalesce(a.date_published, a.date_captured) AS pd,
                s.domain, s.site_type
            FROM article AS a
                JOIN site AS s ON s.id=a.site_id
            WHERE a.site_id IS NOT NULL AND s.is_enabled IS TRUE
                AND a.group_id>:gid
            ORDER BY group_id, pd ASC
            """
        articles_iter = session.execute(
            sqlalchemy.text(q).bindparams(gid=mgid.get_value()))
        cls.index(session, args['--mode'], articles_iter, mgid)
    elif args['--search'] is True:
        configure_logging(
            'lucene.search', console_level=args['--console-log-level'])
        cls.search(args['--query'], args['--top'])
    else:
        print("Unrecognized command!")
        sys.exit(2)
def run(self):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()

    # yeah, this should be refactored
    if "search" in self.action.keys():
        self.run_searcher(self.action['search'])
    if "delete" in self.action.keys():
        self.delete_index(self.action['delete'])
    if "export_tdm" in self.action.keys():
        self.export_TDM(self.action['export_tdm'])
    if "export_tdm_csv" in self.action.keys():
        self.export_TDM_csv(self.action['export_tdm_csv'])
    if "export_tdm_stm" in self.action.keys():
        self.export_TDM_stm(self.action['export_tdm_stm'])
    if "export_contents" in self.action.keys():
        self.export_contents(self.action['export_contents'])
    if "import_directory" in self.action.keys():
        self.import_directory(self.action['import_directory'])
    if "import_csv" in self.action.keys():
        self.import_csv(self.action['import_csv'])
    if "import_csv_with_content" in self.action.keys():
        self.import_csv_with_content(*self.action['import_csv_with_content'])
    if "rebuild_metadata_cache" in self.action.keys():
        self.rebuild_metadata_cache(*self.action['rebuild_metadata_cache'])
    if "reindex" in self.action.keys():
        self.reindex()
def func(command):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    # ------------ #
    STORE_DIR = "index"
    directory = SimpleFSDirectory(File(STORE_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    searcher = IndexSearcher(DirectoryReader.open(directory))
    # ------------ #
    p = get_d_dimensional_vector(command)
    vp = get_vp(p)
    query = QueryParser(Version.LUCENE_CURRENT, "Vector", analyzer).parse(vp)
    scoreDocs = searcher.search(query, 200).scoreDocs

    dict1 = {}
    result = ""
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        rank = 0.6 * float(doc.get("Likes")) + 0.4 * float(doc.get("Views"))
        ch = doc.get('Page_num') + ' '
        ch += 'data/' + doc.get('Page_num') + '.jpg' + ' '
        ch += doc.get('Page_link') + ' '
        ch += doc.get('Views') + ' '
        ch += doc.get('Likes') + ' '
        tmp_alt = doc.get('Img_alt')
        tmp_alt = '_'.join(tmp_alt.split())
        ch += tmp_alt
        dict1[ch] = rank

    res_list = sorted(dict1.items(), key=lambda item: item[1], reverse=True)
    for i in res_list:
        result += i[0]
        result += ' '
    del searcher
    del analyzer
    return result
def search():
    args = []
    if request.method == 'POST':
        vm_env = lucene.getVMEnv()
        if vm_env == None:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        if request.form['ies']:
            args.append('ies:' + request.form['ies'])
        if request.form['area']:
            args.append('area:' + request.form['area'])
        if request.form['professor']:
            args.append('professor:' + request.form['professor'])
        if request.form['uf']:
            args.append('uf:' + request.form['uf'])
        if request.form['conceito']:
            # args.append('m:'+request.form['conceito']+'d:'+request.form['conceito']+'f:'+request.form['conceito'])
            args.append('m:' + request.form['conceito'])
            args.append('d:' + request.form['conceito'])

    table = []
    if len(args) > 0:
        scoreDocs = mansearch.buscar('indexer/', args)
        fsDir = SimpleFSDirectory(File(indexDir))
        searcher = IndexSearcher(DirectoryReader.open(fsDir))
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            table.append(dict((field.name(), field.stringValue())
                              for field in doc.getFields()))
    return render_template('busca.html', table=table)
def func2(command):
    STORE_DIR = "index1"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

    res = []
    if command == '':
        return
    query = QueryParser(Version.LUCENE_CURRENT, "zhuliao", analyzer).parse(command)
    scoreDocs = searcher.search(query, 9).scoreDocs
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        try:
            res.append([
                doc.get("name"),
                doc.get("collect_num"),
                doc.get("zhuliao").split(' '),
                doc.get("zuofa").split('\n'),
                doc.get("img_url"),
                doc.get("url")
            ])
        except:
            pass

    res1 = []
    for i in res:
        i[1] = int(i[1])
        res1.append(tuple(i))
    res2 = sorted(res1, cmp=None, key=lambda x: x[1], reverse=True)
    return res2
def getTermVectors(route):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    stopWords = []
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    directory = SimpleFSDirectory(File(route))
    ireader = IndexReader.open(directory)

    ls = []
    for doc in range(ireader.numDocs()):
        vector = FreqVector()
        vector.vector = []
        vector.freqs = []
        norm = 0.0
        terms = ireader.getTermVector(doc, "content")
        if terms is not None:
            termsEnum = terms.iterator(None)
            for term in BytesRefIterator.cast_(termsEnum):
                text = term.utf8ToString()
                tf = 1 + math.log(termsEnum.totalTermFreq(), 2)
                t = Term("content", term)
                idf = math.log(ireader.numDocs() / ireader.docFreq(t))
                vector.vector.append(text)
                vector.freqs.append(tf * idf)
                norm += (tf * idf) * (tf * idf)
            ls.append((vector, math.sqrt(norm)))
        else:
            ls.append((vector, 0))
    return ls
def run(command):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index1"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(analysis(command))
    HighlightFormatter = SimpleHTMLFormatter()
    highlighter = Highlighter(HighlightFormatter, QueryScorer(query))
    scoreDocs = searcher.search(query, 500).scoreDocs
    print "%s total matching documents." % len(scoreDocs)

    result = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        print 'path:', doc.get("path"), 'name:', doc.get("name"), \
            'url:', doc.get("url"), 'title:', doc.get("title")
        text = doc.get('contents')
        highLightText = highlighter.getBestFragment(analyzer, "contents", text)
        if highLightText != None:
            highLightText = ''.join(highLightText.split(' '))
        data = {}
        data['url'] = doc.get("url")
        data['title'] = doc.get('title')
        data['highlight'] = highLightText
        result.append(data)
    return result
def run_img(command):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index2"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

    querys = BooleanQuery()
    query_content = QueryParser(Version.LUCENE_CURRENT, "urlcontent",
                                analyzer).parse(command)
    query_title = QueryParser(Version.LUCENE_CURRENT, "title",
                              analyzer).parse(command)
    querys.add(query_content, BooleanClause.Occur.SHOULD)
    querys.add(query_title, BooleanClause.Occur.SHOULD)
    scoreDocs = searcher.search(querys, 50).scoreDocs
    if len(scoreDocs) == 0:
        print "WARNING: No result"

    result = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        print doc.get("title")
        data = {}
        data['title'] = doc.get('title')
        data['url'] = doc.get('url')
        data['imgurl'] = doc.get('imgurl')
        result.append(data)
    return result
def getMostFrequentTermNoStopwords(route, query):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    stopWords = []
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    directory = SimpleFSDirectory(File(route))
    ireader = IndexReader.open(directory)

    currentTerm = ""
    currentTermFreq = 0
    for doc in range(ireader.numDocs()):
        terms = ireader.getTermVector(doc, "content")
        if terms is not None:
            termsEnum = terms.iterator(None)
            for term in BytesRefIterator.cast_(termsEnum):
                text = term.utf8ToString()
                t = Term("content", term)
                freq = ireader.totalTermFreq(t)
                if freq > currentTermFreq and text not in query:
                    currentTerm = text
                    currentTermFreq = freq
    return currentTerm
def Qsearch(self, query):
    words = seg.segment(query.strip())
    # words = self.segmentor.segment(query.strip())
    # print ' '.join(words)
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    result = QueryParser(Version.LUCENE_CURRENT, "contents", self.analyzer)
    result.setPhraseSlop(0)
    # "\"" + ' '.join(words) + "\"~0" means the words should be continuous
    query = result.parse("\"" + ' '.join(words) + "\"~0")
    totalHits = self.searcher.search(query, 50)
    # print "%s total matching documents." % totalHits.totalHits
    # return totalHits.totalHits
    for hit in totalHits.scoreDocs:
        # print "Hit Score:", hit.score, "Hit Doc:", hit.doc, "HitString:", hit.toString()
        doc = self.searcher.doc(hit.doc)
        # print doc.get("name").encode("utf-8")
        # print "----------------------------------------"
    t = Term('contents', ' '.join(words))
    # termDocs = ireader.termDocs(t)
    # for tt in termDocs:
    #     print ireader.document(termDocs.docs).getFeildable('neme'), termDocs.freq()
    # print self.reader.totalTermFreq(t)
    return self.reader.totalTermFreq(t)
def func1(command):
    global vm_env
    STORE_DIR = "index"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    # lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    # base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

    if command == '':
        return []
    command_list = jieba.cut(command)
    command = " ".join(command_list)
    query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(command)
    scoreDocs = searcher.search(query, 50).scoreDocs

    result = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        doct = {
            'title': doc.get("title"),
            'url': doc.get("url"),
            "sentence": doc.get("sentence")
        }
        result.append(doct)
    del searcher
    return result
def search(command):
    reordering = 'no'
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    command = escape_lucene_special_chars(command)
    print("Searching for:", command)
    query = QueryParser("body", analyzer).parse(command)
    scoreDocs = searcher.search(query, 100).scoreDocs

    if reordering == 'ups':
        scoreDocs, scores = reorder_ups(scoreDocs, searcher)
    elif reordering == 'long':
        scoreDocs, scores = reorder_long(scoreDocs, searcher)
    elif reordering == 'normups':
        scoreDocs, scores = reorder_normups(scoreDocs, searcher)
    else:
        n_docs = len(scoreDocs)
        scores = {sd.doc: (n_docs - i,) for i, sd in enumerate(scoreDocs)}

    scoreDocs = scoreDocs[:5]
    for sd in scoreDocs:
        print(sd.doc, '\t', scores[sd.doc])
    docs = [searcher.doc(sd.doc) for sd in scoreDocs]
    return [(d.get('name'), d.get('body')) for d in docs]
def __enter__(self):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    self.reader = reader
    self.searcher = searcher
    self.analyzer = analyzer
    return self
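A hypothetical usage sketch of the context manager above; the class name LuceneSession and the module-level reader/searcher/analyzer it copies from are assumptions carried over from the snippet.

with LuceneSession() as session:
    # the current thread is now attached to the JVM and can use the shared objects
    print("%d documents in the shared index" % session.reader.numDocs())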
def run_music(ID):
    STORE_DIR = "index"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    query = QueryParser(Version.LUCENE_CURRENT, "id", analyzer).parse(ID)
    scoreDocs = searcher.search(query, 1).scoreDocs
    try:
        scoreDoc = scoreDocs[0]
    except:
        return None
    doc = searcher.doc(scoreDoc.doc)

    item = []
    item.append(doc.get("song_title").encode('utf-8'))
    item.append(doc.get('song_url'))
    item.append(doc.get("singer").encode('utf-8'))
    item.append(doc.get("album").encode('utf-8'))
    item.append(doc.get("album_pic"))
    item.append(doc.get("album_genre").encode('utf-8'))
    item.append(doc.get("lyrics").encode('utf-8'))

    sim_str = doc.get("similar").encode('utf-8')
    sim_list = sim_str.split('+')
    for i in range(3):
        sim_list[i] = sim_list[i].split('*')
    item.append(sim_list)
    del searcher
    return item
def getMostFrequentTermStopwords(route, query):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    stopWords = [
        "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if",
        "no", "not", "more", "http", "html", "of", "on", "or", "such", "that",
        "the", "their", "then", "there", "these", "they", "this", "to", "was",
        "will", "with", "el", "la", "lo", "los", "las", "ante", "con", "sin",
        "que", "es", "de", "en", "por", "y", "los"
    ]
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    directory = SimpleFSDirectory(File(route))
    ireader = IndexReader.open(directory)

    currentTerm = ""
    currentTermFreq = 0
    for doc in range(ireader.numDocs()):
        terms = ireader.getTermVector(doc, "content")
        if terms is not None:
            termsEnum = terms.iterator(None)
            for term in BytesRefIterator.cast_(termsEnum):
                text = term.utf8ToString().encode('UTF-8')
                t = Term("content", term)
                freq = ireader.totalTermFreq(t)
                if freq > currentTermFreq and text not in query:
                    currentTerm = text
                    currentTermFreq = freq
    return currentTerm
def createIndexNoStopwords(texts, route, rebuild):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    stopWords = []
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    directory = SimpleFSDirectory(File(route))
    conf = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    if rebuild:
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    else:
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    iwriter = IndexWriter(directory, conf)
    for key in texts:
        doc = Document()
        doc.add(Field("docName", "doc", Field.Store.YES,
                      Field.Index.NOT_ANALYZED))
        doc.add(Field("content", texts[key], Field.Store.YES,
                      Field.Index.ANALYZED, Field.TermVector.YES))
        iwriter.addDocument(doc)
    iwriter.close()
def multiFieldsSearch(self, query, sim):
    """
    Method that searches through documents using the content_section and
    title_article fields.

    searchDir : the path to the folder that contains the index.
    """
    # Now search the index:
    lucene.getVMEnv().attachCurrentThread()
    parser = MultiFieldQueryParser(["content_section", "title_article"],
                                   self.analyzer)
    parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
    query = MultiFieldQueryParser.parse(parser, QueryParser.escape(query))
    self.searcher.setSimilarity(sim)
    hits = self.searcher.search(query, 6).scoreDocs
    return hits
def createIndexStopwords(texts, route, rebuild):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    stopWords = [
        "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if",
        "no", "not", "of", "on", "or", "such", "that", "the", "their", "then",
        "there", "these", "they", "this", "to", "was", "will", "with", "el",
        "la", "lo", "los", "las", "ante", "con", "sin", "que", "es", "de",
        "en", "por", "y", "los"
    ]
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    directory = SimpleFSDirectory(File(route))
    conf = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    if rebuild:
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    else:
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    iwriter = IndexWriter(directory, conf)
    for key in texts:
        doc = Document()
        doc.add(Field("docName", key.__str__(), Field.Store.YES,
                      Field.Index.NOT_ANALYZED))
        doc.add(Field("content", texts[key], Field.Store.YES,
                      Field.Index.ANALYZED, Field.TermVector.YES))
        iwriter.addDocument(doc)
    iwriter.close()
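A hedged usage sketch for createIndexStopwords (and its no-stopword variant earlier in this collection); the index path is an assumed scratch directory and the texts dict only illustrates the expected {key: raw text} shape.

texts = {
    1: "Lucene is a full-text search library.",
    2: "El indexado se hace con StopAnalyzer.",   # mixed-language input is fine
}
createIndexStopwords(texts, "/tmp/stopword-index", rebuild=True)
createIndexNoStopwords(texts, "/tmp/nostopword-index", rebuild=True)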
def run(command):
    global vm_env
    STORE_DIR = "index"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(command)
    scoreDocs = searcher.search(query, 10).scoreDocs
    # print "%s total matching documents." % len(scoreDocs)

    res = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        tmp = []
        tmp.append([doc.get('name1'), doc.get('name2')])
        tmp.append(doc.get("homepage"))
        tmp.append(doc.get("intro"))
        tmp.append(doc.get('logo'))
        a = doc.get('goods')
        a = a.split('\n')
        for i in a:
            tmp.append(i)
        res.append(tmp)
    return command, res
def init(vmargs='-Xrs,-Djava.awt.headless=true', **kwargs):
    """Callback to initialize VM and app roots after daemonizing."""
    assert lucene.getVMEnv() or lucene.initVM(vmargs=vmargs, **kwargs)
    for app in cherrypy.tree.apps.values():
        if isinstance(app.root, WebSearcher):
            app.root.__init__(*app.root.__dict__.pop('args'),
                              **app.root.__dict__.pop('kwargs'))
def get_image_pmcid(pmcid, classes=""):
    fields = ["pmcid", "class"]
    docs = []
    location = web.__path__[0] + "/static/web/files/index/index.figures"
    # lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)

    # multi field query: http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
    # query = MultiFieldQueryParser(Version.LUCENE_4_10_1, fields, analyzer)
    # query.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    # query = query.parse(query, ('4175339', '1'))
    # query.parse(queryString)  # "Shigella sonnei"
    # query = QueryParser(Version.LUCENE_4_10_1, "abstract", analyzer).parse(queryString)  # "Shigella sonnei"

    MAX = 10000
    # hits = searcher.search(query, MAX)
    if classes == "all":
        queryStr = "pmcid:(" + ' '.join(pmcid) + ")"
    else:
        queryStr = "pmcid:(" + ' '.join(pmcid) + ")" + " AND class:" + classes

    query = QueryParser(Version.LUCENE_4_10_1, "pmcid", analyzer)  # needed to build a custom query
    q = query.parse(queryStr)
    hits = searcher.search(q, MAX)

    for hit in hits.scoreDocs:  # should only be one
        # print hit.score, hit.doc, hit.toString()
        docs.append(searcher.doc(hit.doc))
    return docs  # returns the image documents that belong to a pmcid (article)
def getRandomDoc2():
    location = web.__path__[0] + "/static/web/files/index/index.articles"
    # lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    # query = QueryParser(Version.LUCENE_4_10_1, "keywords", analyzer).parse(queryString)  # "Shigella sonnei"
    MAX = 1000
    docNum = randrange(0, reader.maxDoc())
    doc = reader.document(docNum)
    # print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)

    files = []
    fileRoots = []
    paths = []
    paths.append(doc.get("articlepath"))
    pth = paths[0].replace("/home/kevin/Downloads/",
                           "/home/kevin/git/YIF/imageFinder/web/static/web/")
    # os.path.join(tools.__path__, "static/web/images")
    for root, directories, filenames in os.walk(pth):  # probably something wrong with the location
        for filename in filenames:
            # original test was `(".jpg" or ".gif" or ".png") in filename`,
            # which only ever checked for ".jpg"
            if any(ext in filename for ext in (".jpg", ".gif", ".png")):
                files.append(root.replace("/home/kevin/git/YIF/imageFinder/web/static/web/", "")
                             + "/" + filename)  # temp, will need to change
                fileRoots.append(root)
                print(root.replace("/home/kevin/git/YIF/imageFinder/web/static/web/", "") + "/" + filename)
    try:
        rng = randrange(0, len(files))
    except:
        return -1
    else:
        return files[randrange(0, len(files))]
def getTermVectors(route):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    stopWords = []
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    directory = SimpleFSDirectory(File(route))
    ireader = IndexReader.open(directory)

    ls = []
    for doc in range(ireader.numDocs()):
        vector = FreqVector()
        vector.vector = []
        vector.freqs = []
        norm = 0.0
        terms = ireader.getTermVector(doc, "content")
        if terms is not None:
            termsEnum = terms.iterator(None)
            for term in BytesRefIterator.cast_(termsEnum):
                text = term.utf8ToString()
                tf = 1 + math.log(termsEnum.totalTermFreq(), 2)
                t = Term("content", term)
                idf = math.log(ireader.numDocs() / ireader.docFreq(t))
                vector.vector.append(text)
                vector.freqs.append(tf * idf)
                norm += (tf * idf) * (tf * idf)
            ls.append((vector, math.sqrt(norm)))
        else:
            ls.append((vector, 0))
    return ls
def run(self):
    from lucene import getVMEnv
    self._vmEnv = env = getVMEnv()
    if env is not None:
        env.attachCurrentThread()
    super(RepositoryThread, self).run()
def SearchQuery(queryString, fields, classification):
    # if __name__ == "__main__":
    # if __name__ == "retriever":
    location = web.__path__[0] + "/static/web/files/index/index.articles"
    # lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)

    # multi field query: http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
    query = MultiFieldQueryParser(Version.LUCENE_4_10_1, fields, analyzer)
    # query.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    query = MultiFieldQueryParser.parse(query, queryString)
    # query.parse(queryString)  # "Shigella sonnei"
    # query = QueryParser(Version.LUCENE_4_10_1, "abstract", analyzer).parse(queryString)  # "Shigella sonnei"

    MAX = 10000
    hits = searcher.search(query, MAX)
    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)

    paths = []
    pmcids = []
    documentDict = {}
    for hit in hits.scoreDocs:
        doc = searcher.doc(hit.doc)
        pmcids.append(doc.get("pmcid"))
        docDict = {"title": doc.get("title")}  # we can add any other field we want...
        documentDict[doc.get("pmcid")] = docDict

    # Where we get the images for all the pmcids
    images = get_image_pmcid(pmcids, classification)  # should take in pmcids and class

    # create dictionary of images with pmcid being their key
    imagesDict = {}
    for img in images:
        img_pmcid = img.get("pmcid")
        if img_pmcid in imagesDict.keys():
            imagesDict[img_pmcid].append(img.get("filepath") + "/" + img.get("figureid"))
        else:
            imagesDict[img_pmcid] = [(img.get("filepath") + "/" + img.get("figureid"))]

    # for each pmcid, we will assign an image to it for the search results
    for pmcid in pmcids:
        if imagesDict:
            docDict = documentDict[pmcid]
            docDict["imgURL"] = imagesDict[pmcid][0]
            documentDict[pmcid] = docDict
        else:
            docDict = documentDict[pmcid]
            docDict["imgURL"] = "images/NoImageAvailable.jpg"
            documentDict[pmcid] = docDict
    # END - Where we get the images for all the pmcids

    return documentDict
def startOaiPmh(self, portNumber, oaiJazz, storageComponent, register):
    getVMEnv().attachCurrentThread()
    reactor = Reactor()
    server = be(
        (Observable(),
            (ObservableHttpServer(reactor, portNumber),
                (OaiPmh(repositoryName='repositoryName',
                        adminEmail='adminEmail',
                        batchSize=2,
                        supportXWait=True),
                    (register,),
                    (oaiJazz,
                        (register,),
                    ),
                    (storageComponent,)
                )
            )
        )
    )
    list(compose(server.once.observer_init()))
    self._loopReactor(reactor)
def __init__(self):
    """
    Inits a Reader by attaching the current luceneVM to the thread
    and creating a store and an IndexReader instance.
    """
    vm_env = lucene.getVMEnv()  # get lucene.vm
    vm_env.attachCurrentThread()
    self.analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    self.store = lucene.SimpleFSDirectory(lucene.File(DIR))
    self.reader = lucene.IndexReader.open(self.store, True)
def __init__(self):
    """
    Inits a Writer by attaching the current luceneVM to the thread
    and creating an analyzer, a store and an IndexWriter instance.
    """
    vm_env = lucene.getVMEnv()  # get lucene.vm
    vm_env.attachCurrentThread()
    self.analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    self.store = lucene.SimpleFSDirectory(lucene.File(DIR))
    self.writer = lucene.IndexWriter(self.store, self.analyzer, True,
                                     lucene.IndexWriter.MaxFieldLength(512))
def __init__(self):
    """
    Inits a Searcher by attaching the current luceneVM to the thread
    and creating an analyzer, a store, a parser and an IndexSearcher instance.
    """
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    self.analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    self.store = lucene.SimpleFSDirectory(lucene.File(DIR))
    self.parser = lucene.MultiFieldQueryParser(lucene.Version.LUCENE_CURRENT,
                                               ["content"], self.analyzer)
    self.searcher = lucene.IndexSearcher(self.store, readOnly=True)
def _start_indexer(config, db_name):
    jcc_env = lucene.getVMEnv()
    jcc_env.attachCurrentThread()
    while True:
        restart = _run_indexer(config, db_name)
        if not restart:
            print "Exit db indexer %s" % db_name
            break
        print "Restarted db indexer %s" % db_name
def doSearch(queryS, field="id", defaultField="data"):
    lucene.getVMEnv().attachCurrentThread()
    store = getStore()
    searcher = getSearcher(store)
    analyzer = getAnalyzer()
    parser = lucene.QueryParser(defaultField, analyzer)
    query = parser.parse(queryS)
    query = query.rewrite(getReader(store))
    hits = searcher.search(query)
    results = []
    for i in range(0, hits.length()):
        results.append(hits.doc(i).get(field))
    searcher.close()
    store.close()
    return results
def search(self, q=None):
    lucene.getVMEnv().attachCurrentThread()
    if q is None or not q.strip():
        search = False
        query = ''
        query_raw = ''
        hits = 0
        places = []
    else:
        search = True
        query_raw = q.replace('"', '')
        query = utils.escape_html(q)
        hits, places = self.storage.search(q, ontology=self.ontology)
    return tmpl_lookup.get_template('search.mako').render_unicode(
        search=search,
        query=query,
        query_raw=query_raw,
        hits=hits,
        places=places)
def __init__(self):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    indexDir = SimpleFSDirectory(File(app.config['INDEX_PATH']))
    self.searcher = IndexSearcher(DirectoryReader.open(indexDir))
    self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    self.parser = QueryParser(Version.LUCENE_CURRENT, "contents", self.analyzer)
def doSearch(queryS, field="id"):
    lucene.getVMEnv().attachCurrentThread()
    lucene.BooleanQuery.setMaxClauseCount(sys.maxint)
    store = getStore()
    searcher = getSearcher(store)
    analyzer = getAnalyzer()
    parser = lucene.QueryParser("ssid", analyzer)
    query = parser.parse(queryS)
    query = query.rewrite(getReader(store))
    hits = searcher.search(query)
    results = []
    for i in range(0, hits.length()):
        results.append(hits.doc(i).get(field))
    searcher.close()
    store.close()
    return results
def __init__(self, rows=None):
    # lucene.initVM() is called once in Django's settings.py and reused from there
    vm_env = lucene.getVMEnv()
    if vm_env == None:
        lucene.initVM()
    else:
        vm_env.attachCurrentThread()
    self.analyzer = lucene.StandardAnalyzer(Version.LUCENE_30)
    self.indexDir = SimpleFSDirectory(File(INDEX_DIRECTORY))
    self.rows = rows
def query(s):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    print(s)
    a = WebIndexer()
    result = []
    try:
        res = a.query(u'name:"' + ' '.join(jieba.cut(s, cut_all=False)) + '" ', 'name')
    except Exception, e:
        print(e)
        print unicode(e.getJavaException())
def _importVM():
    maxheap = getenv('PYLUCENE_MAXHEAP')
    if not maxheap:
        maxheap = '4g'
        warn("Using '4g' as maxheap for lucene.initVM(). "
             "To override use PYLUCENE_MAXHEAP environment variable.")
    from lucene import initVM, getVMEnv
    try:
        VM = initVM(maxheap=maxheap)
        # VM = initVM(maxheap=maxheap, vmargs='-agentlib:hprof=heap=sites')
    except ValueError:
        VM = getVMEnv()
    return VM
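A hedged usage sketch for _importVM(): set PYLUCENE_MAXHEAP before the first call to override the 4g default. The chosen heap size is arbitrary and the call site is an assumption.

import os

os.environ.setdefault('PYLUCENE_MAXHEAP', '8g')   # must be set before _importVM() runs
VM = _importVM()   # initVM() on first use, getVMEnv() if the JVM is already running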