def luceneRetriver(query):
    lucene.initVM()
    indir = SimpleFSDirectory(File(INDEXDIR))
    lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
    lucene_searcher = IndexSearcher(indir)
    my_query = QueryParser(Version.LUCENE_30, "text",
                           lucene_analyzer).parse(query)
    MAX = 1000
    total_hits = lucene_searcher.search(my_query, MAX)

    print "Hits: ", total_hits.totalHits
    for hit in total_hits.scoreDocs:
        print "Hit Score: ", hit.score, "Hit Doc:", hit.doc, "Hit String:", hit.toString()
        doc = lucene_searcher.doc(hit.doc)
        print doc.get("text").encode("utf-8")
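# Hypothetical driver for the snippet above (not from the original): it
# assumes INDEXDIR points at an existing index whose stored "text" field
# was built with StandardAnalyzer, matching what luceneRetriver() expects.
import lucene
from lucene import (SimpleFSDirectory, File, StandardAnalyzer,
                    IndexSearcher, QueryParser, Version)

INDEXDIR = "index"  # assumed index location; adjust as needed

if __name__ == '__main__':
    luceneRetriver("quick brown fox")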
def post(self):
    q = self.get_argument("query")
    lucene.initVM()
    indexDir = "index"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(dir)

    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
    MAX = 10
    hits = searcher.search(query, MAX)
    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)

    items = []
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString()
        print(len(doc_urls))
        items.append(doc_urls[str(hit.doc)])
        doc = searcher.doc(hit.doc)
        print(hit.doc)
    self.render("index.html", title="Results", items=items, query=q)
def setUp(self):
    self.directory = RAMDirectory()
    self.analyzer = WhitespaceAnalyzer()

    writer = IndexWriter(self.directory, self.analyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)

    doc = Document()
    doc.add(Field("f", "the quick brown fox jumps over the lazy dog",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)

    doc = Document()
    doc.add(Field("f", "the quick red fox jumps over the sleepy cat",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()

    self.searcher = IndexSearcher(self.directory, True)
    self.reader = IndexReader.open(self.directory, True)

    self.quick = SpanTermQuery(Term("f", "quick"))
    self.brown = SpanTermQuery(Term("f", "brown"))
    self.red = SpanTermQuery(Term("f", "red"))
    self.fox = SpanTermQuery(Term("f", "fox"))
    self.lazy = SpanTermQuery(Term("f", "lazy"))
    self.sleepy = SpanTermQuery(Term("f", "sleepy"))
    self.dog = SpanTermQuery(Term("f", "dog"))
    self.cat = SpanTermQuery(Term("f", "cat"))
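# Illustrative companion test (an assumption, not part of the original
# fixture): with the two documents indexed in setUp, a SpanNearQuery over
# quick/brown/fox with slop 0 and in-order matching should hit only the
# first document, where the three terms are adjacent and ordered.
def testQuickBrownFoxNear(self):
    query = SpanNearQuery([self.quick, self.brown, self.fox], 0, True)
    topDocs = self.searcher.search(query, 10)
    self.assertEqual(1, topDocs.totalHits)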
def retrieve(self, query, max_res=10):
    lucene.initVM()
    inDir = SimpleFSDirectory(File(self.INDEX_DIR))
    lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
    lucene_searcher = IndexSearcher(inDir)
    my_query = QueryParser(Version.LUCENE_30, 'content',
                           lucene_analyzer).parse(query)
    MAX = max_res
    total_hits = lucene_searcher.search(my_query, MAX)

    res_head = '{"query":"' + query + '","results":['
    res_tail = ']}'
    result = res_head

    hits = total_hits.totalHits
    if hits > 0:
        res_body = ''
        it = 0
        for hit in total_hits.scoreDocs:
            it += 1
            doc = lucene_searcher.doc(hit.doc)
            res_body += '{"rank":' + str(it) + \
                        ',"score":"' + str(hit.score) + \
                        '","title":"' + doc.get('title').encode('utf-8') + \
                        '","id":"' + doc.get('id').encode('utf-8') + '"}'
            # compare against the number of returned docs, not totalHits:
            # totalHits can exceed max_res, which left a trailing comma
            # (invalid JSON) in the original
            if it < len(total_hits.scoreDocs):
                res_body += ','
        result += res_body
    result += res_tail
    return result
def document(self, docId, max_res=1):
    lucene.initVM()
    inDir = SimpleFSDirectory(File(self.INDEX_DIR))
    lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
    lucene_searcher = IndexSearcher(inDir)
    my_query = QueryParser(Version.LUCENE_30, 'id',
                           lucene_analyzer).parse(docId)
    MAX = max_res
    total_hits = lucene_searcher.search(my_query, MAX)

    result = '{'
    hits = total_hits.totalHits
    if hits == 1:
        for hit in total_hits.scoreDocs:
            doc = lucene_searcher.doc(hit.doc)
            result += '"id":"' + doc.get('id') + \
                      '","title":"' + doc.get('title') + \
                      '","abstract":"' + doc.get('abstract') + \
                      '","keyword":"' + doc.get('keyword') + \
                      '","content":"' + doc.get('content') + \
                      '","authors":"' + doc.get('authors') + '"'
    result += '}'
    return result
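# Both retrieve() and document() above build JSON by string concatenation,
# which produces invalid output as soon as a title or abstract contains a
# quote or backslash. A safer sketch (same search flow and field names
# assumed; everything else is illustrative) lets the json module escape:
import json

def retrieve_as_json(self, query, max_res=10):
    lucene.initVM()
    searcher = IndexSearcher(SimpleFSDirectory(File(self.INDEX_DIR)))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    parsed = QueryParser(Version.LUCENE_30, 'content', analyzer).parse(query)
    top = searcher.search(parsed, max_res)
    results = []
    for rank, hit in enumerate(top.scoreDocs, start=1):
        doc = searcher.doc(hit.doc)
        results.append({'rank': rank,
                        'score': str(hit.score),
                        'title': doc.get('title'),
                        'id': doc.get('id')})
    return json.dumps({'query': query, 'results': results})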
class OccuredCandidates:
    indexDir = 'data/index'
    max_candidates = 30

    def __init__(self):
        lucene.initVM()
        self._lversion = Version.LUCENE_30
        self._analyzer = EnglishAnalyzer(self._lversion)
        self._searcher = IndexSearcher(SimpleFSDirectory(File(self.indexDir)))
        self._translation = loadTranslation()
        self._links = loadLinks()

    def find(self, phrase):
        phrase = phrase.lower().encode('utf8')
        query = ' '.join(['+' + word for word in phrase.split(' ')])
        query = QueryParser(self._lversion, 'contents',
                            self._analyzer).parse(query)
        hits = self._searcher.search(query, self.max_candidates)
        # if not hits.totalHits: print "%d documents for '%s'" % (hits.totalHits, str(query))  # potential bug

        # todo: put article_id in the lucene index instead of translating document titles
        links = {}
        for hit in hits.scoreDocs:
            title = quote(self._searcher.doc(hit.doc).get("title")
                          .encode('utf-8')
                          .replace(' ', '_')).replace('%28', '(').replace('%29', ')')
            if title in self._translation:
                links[self._translation[title]] = hit.score
            # else: print title  # potential bug
        return self._links[phrase].get(-1, 0), links

    def clear_links(self, annotations):
        return filter(lambda annotation: annotation['links'] and
                      max(annotation['links'].values()) > 1, annotations)
def lucene_search(index_dir, limit, query_text):
    '''
    lucene_search: Search a built index and return up to `limit` responses.
    Arguments: input index folder, limit on the number of results returned,
               query (as a string)
    Returns: paths of responsive files as a list
    '''
    # basicConfig takes `filename`, not `file`, as its keyword argument
    logging.basicConfig(filename=os.path.join(index_dir, "lucene_search.log"))
    logger = logging.getLogger(__name__)  # module-level logger assumed by the original
    logger.info("Initializing search....")
    lucene.initVM()
    logger.info("Reading index from " + index_dir)
    index = SimpleFSDirectory(File(index_dir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)  # Lucene version used to generate the index
    searcher = IndexSearcher(index)

    logger.info("Parsing query: " + query_text)
    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(query_text)
    hits = searcher.search(query, limit)
    logger.info("Found %d document(s) that matched query '%s'" % (hits.totalHits, query))

    hit_paths = []
    for hit in hits.scoreDocs:
        # hit.score, hit.doc and hit.toString() also expose the score and
        # index id of each responsive document
        doc = searcher.doc(hit.doc)
        hit_paths.append(doc.get("path"))
    return hit_paths
def searchWithRequestAndQuery(cls, query, indexReader, taxoReader,
                              indexingParams, facetRequest):
    """
    Search an index with facets for a given query and facet request.
    Returns a List<FacetResult>.
    """
    # prepare searcher to search against
    searcher = IndexSearcher(indexReader)

    # collect matching documents into a collector
    topDocsCollector = TopScoreDocCollector.create(10, True)

    if not indexingParams:
        indexingParams = DefaultFacetIndexingParams()

    # Faceted search parameters indicate which facets we are interested in
    facetSearchParams = FacetSearchParams(indexingParams)
    # Add the facet request of interest to the search params
    facetSearchParams.addFacetRequest(facetRequest)

    facetsCollector = FacetsCollector(facetSearchParams, indexReader, taxoReader)

    # perform document search and facet accumulation
    searcher.search(query,
                    MultiCollector.wrap([topDocsCollector, facetsCollector]))

    # obtain facet results and print them
    res = facetsCollector.getFacetResults()
    i = 0
    for facetResult in res:
        print "Result #%d has %d descendants" % (
            i, facetResult.getNumValidDescendants())
        print "Result #%d : %s" % (i, facetResult)
        i += 1
    return res
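# Hypothetical call site for the classmethod above (SearchUtils, the
# "author" field name and the reader variables are all illustrative):
# count the ten most frequent values of an "author" facet across every
# document, using the old contrib facet API this snippet targets.
facetRequest = CountFacetRequest(CategoryPath("author"), 10)
results = SearchUtils.searchWithRequestAndQuery(MatchAllDocsQuery(),
                                                indexReader, taxoReader,
                                                None, facetRequest)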
def get_doc_details(doc_id, lucene_index_dir):
    '''
    This function gets a file's details from the lucene index.

    Arguments:
        doc_id - file id
        lucene_index_dir - lucene index directory

    Returns:
        file details as a list
    '''
    store = SimpleFSDirectory(File(lucene_index_dir))
    searcher = IndexSearcher(store, True)
    doc = searcher.doc(doc_id)
    table = dict((field.name(), field.stringValue())
                 for field in doc.getFields())
    row = []
    metadata = MetadataType._types
    for field in metadata:
        if table.get(field, 'empty') != 'empty':
            row.append(table.get(field, 'empty'))
        else:
            row.append('')
    row.append(str(table.get(MetadataType.FILE_ID, 'empty')))
    return row
def retrieveDocs(q):
    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    reader = IndexReader.open(SimpleFSDirectory(File("index/")))
    searcher = IndexSearcher(reader)
    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
    MAX = 1000
    hits = searcher.search(query, MAX)

    nonDiverse = []
    docsToScores = {}
    # create a list of html files with relevant websites
    rQ = []
    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString()
        doc = searcher.doc(hit.doc)
        print doc.get("text").encode("utf-8")
        if len(nonDiverse) < 10:
            nonDiverse.append(new_urls[str(hit.doc)])
        # find the document that corresponds to the html website and
        # append it to a list for min distance
        website = new_urls[str(hit.doc)]
        # html_files numbers of the hit websites added to rQ
        rQ.append(inv_map[website])
        docsToScores[int(inv_map[website])] = hit.score
        print(inv_map[website])
    return docsToScores, rQ, nonDiverse
def run(writer, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        if command == '':
            return

        print "Searching for:", command
        reader = writer.getReader()
        searcher = IndexSearcher(reader)

        wildquery = command + "*"
        term = Term("hashtag", wildquery)
        query = WildcardQuery(term)
        scoreDocs = searcher.search(query, 5).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            # weight the lucene score by how much of the hashtag the query
            # covers; float() avoids Python 2 integer division
            score = (float(len(command)) / len(doc.get("hashtag"))) * scoreDoc.score
            print 'score:', score
            print 'tweet:', doc.get("contents")
            print 'user_name:', doc.get("user_name")
            print 'when:', doc.get("creation_date")
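# Since the wildcard above is always a trailing "*", a PrefixQuery is an
# equivalent and cheaper alternative; a minimal sketch under the same
# field name:
query = PrefixQuery(Term("hashtag", command))
scoreDocs = searcher.search(query, 5).scoreDocs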
def get_indexed_file_details(ts_results, lucene_index_dir):
    '''
    This function gets each file's details from the lucene index.

    Arguments:
        ts_results - topic search results; each item contains
                     [file id, root, file name, similarity score]
        lucene_index_dir - lucene index directory

    Returns:
        file details in a list
    '''
    store = SimpleFSDirectory(File(lucene_index_dir))
    searcher = IndexSearcher(store, True)
    rows = []
    for rs in ts_results:
        doc = searcher.doc(rs[0])
        table = dict((field.name(), field.stringValue())
                     for field in doc.getFields())
        row = []
        metadata = MetadataType._types
        for field in metadata:
            if table.get(field, 'empty') != 'empty':
                row.append(table.get(field, 'empty'))
            else:
                row.append('')
        row.append(str(table.get(MetadataType.FILE_ID, 'empty')))
        row.append(str(rs[3]))  # similarity score
        rows.append(row)
    return rows
def __init__(self, location):
    lucene.initVM()
    directory = SimpleFSDirectory(File(location))
    self.reader = IndexReader.open(directory, True)
    self.searcher = IndexSearcher(self.reader)
    self.query_parser = QueryParser(Version.LUCENE_CURRENT, "text",
                                    WhitespaceAnalyzer())
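# Illustrative companion method for this constructor (an assumption, not
# from the original): run a string through the parser built in __init__
# and return the stored "text" field of the top hits.
def search(self, query_string, limit=10):
    query = self.query_parser.parse(query_string)
    top_docs = self.searcher.search(query, limit)
    return [self.searcher.doc(sd.doc).get("text")
            for sd in top_docs.scoreDocs]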
def search(r, keyword=""): import logging logger = logging.getLogger("search") bench = Benchmark(logger) from lucene import IndexSearcher, StandardAnalyzer, FSDirectory, QueryParser, File, Hit import lucene, os os.environ["JAVA_HOME"] = "/usr/local/jdk1.6.0_17" lucene.initVM(lucene.CLASSPATH) directory = FSDirectory.open(File(CONFIG.INDEX_PATH)) ROBOT_INDEX = IndexSearcher(directory, True) ROBOT_ANALYZER = StandardAnalyzer() keyword = keyword or r.GET["keyword"] query = QueryParser("context", ROBOT_ANALYZER) query = query.parse('"%s"' % keyword) bench.start_mark("search") hits = ROBOT_INDEX.search(query) count = len(hits) result = [] i = 0 for hit in hits: i += 1 if i > 100: break doc = Hit.cast_(hit).getDocument() result.append(SearchResult(doc, i, keyword)) ROBOT_INDEX.close() et = bench.stop_mark() return render_to_response("robot_search_result.html", {"result": result, "count": count, "elaspe": et})
def testChinese(self):
    searcher = IndexSearcher(self.directory, True)
    query = TermQuery(Term("contents", "道"))
    scoreDocs = searcher.search(query, 50).scoreDocs
    self.assertEqual(1, len(scoreDocs), "tao")
def query(indexName, queryString):
    indSearcher = IndexSearcher(SimpleFSDirectory(File(indexName)))
    qp = QueryParser(Version.LUCENE_CURRENT, "content",
                     StandardAnalyzer(Version.LUCENE_CURRENT))
    qp.setDefaultOperator(qp.Operator.AND)
    query = qp.parse(queryString.replace("-", "_"))

    aux = indSearcher.search(query, 100)
    results = aux.scoreDocs
    hits = aux.totalHits
    ir = indSearcher.getIndexReader()

    res = []
    for r in results:
        # look up each hit by its own document number; the original used a
        # running counter here, which fetched the wrong documents
        doc = ir.document(r.doc)
        res.append(doc.get('id'))
    return res
def testKeyword(self):
    searcher = IndexSearcher(self.directory, True)
    t = Term("isbn", "1930110995")
    query = TermQuery(t)
    scoreDocs = searcher.search(query, 50).scoreDocs
    self.assertEqual(1, len(scoreDocs), "JUnit in Action")
def testTermRangeQuery(self):
    searcher = IndexSearcher(self.directory, True)
    query = TermRangeQuery("title2", "d", "j", True, True)
    topDocs = searcher.search(query, 100)
    self.assertEqual(3, topDocs.totalHits)
    searcher.close()
def testExactPhrase(self):
    searcher = IndexSearcher(self.directory, True)
    query = QueryParser(Version.LUCENE_24, "contents",
                        self.porterAnalyzer).parse('"over the lazy"')
    topDocs = searcher.search(query, 50)
    self.assertEqual(0, topDocs.totalHits, "exact match not found!")
def __init__(self):
    lucene.initVM()
    self._lversion = Version.LUCENE_30
    self._analyzer = EnglishAnalyzer(self._lversion)
    self._searcher = IndexSearcher(SimpleFSDirectory(File(self.indexDir)))
    self._translation = loadTranslation()
    self._links = loadLinks()
def main(cls, argv):
    if len(argv) != 2:
        print "Usage: BerkeleyDbSearcher <index dir>"
        return

    dbHome = argv[1]

    env = DBEnv()
    env.set_flags(DB_LOG_INMEMORY, 1)
    if os.name == 'nt':
        env.set_cachesize(0, 0x4000000, 1)
    elif os.name == 'posix':
        from commands import getstatusoutput
        if getstatusoutput('uname') == (0, 'Linux'):
            env.set_cachesize(0, 0x4000000, 1)
    env.open(dbHome, (DB_THREAD | DB_INIT_MPOOL | DB_INIT_LOCK | DB_INIT_TXN), 0)

    index = DB(env)
    blocks = DB(env)
    txn = None

    try:
        txn = env.txn_begin(None)
        index.open(filename='__index__', dbtype=DB_BTREE,
                   flags=DB_THREAD, txn=txn)
        blocks.open(filename='__blocks__', dbtype=DB_BTREE,
                    flags=DB_THREAD, txn=txn)
    except:
        if txn is not None:
            txn.abort()
            txn = None
        raise
    else:
        txn.commit()
        txn = None

    try:
        txn = env.txn_begin(None)
        directory = DbDirectory(txn, index, blocks, 0)
        searcher = IndexSearcher(directory, True)

        topDocs = searcher.search(TermQuery(Term("contents", "fox")), 50)
        print topDocs.totalHits, "document(s) found"
        searcher.close()
    except:
        if txn is not None:
            txn.abort()
            txn = None
        raise
    else:
        txn.abort()

    index.close()
    blocks.close()
    env.close()
def __init__(self, user_loc_string, debug=False):
    analyzer = StopAnalyzer()
    fields = ['name', 'alternate_names', 'state']
    directory = FSDirectory.getDirectory("index")
    self.DEBUG = debug
    self.searcher = IndexSearcher(directory)
    self.parser = MultiFieldQueryParser(fields, analyzer)
    self.user_location = self.doSearch(user_loc_string)
def query(self, title):
    self._th.attachCurrentThread()
    searcher = IndexSearcher(self._dir)
    query = QueryParser(Version.LUCENE_30, "title",
                        self._analyzer).parse(title)
    total_hits = searcher.search(query, 10)
    for hit in total_hits.scoreDocs:
        doc = searcher.doc(hit.doc)
        return doc.get("title") + "\n" + doc.get("content") + "--------------------------------"
    return "None"
def getHitCount(self, fieldName, searchString):
    searcher = IndexSearcher(self.dir, True)
    t = Term(fieldName, searchString)
    query = TermQuery(t)
    hitCount = len(searcher.search(query, 50).scoreDocs)
    searcher.close()
    return hitCount
class MultiPhraseQueryTest(TestCase):

    def setUp(self):
        directory = RAMDirectory()
        writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        doc1 = Document()
        doc1.add(Field("field",
                       "the quick brown fox jumped over the lazy dog",
                       Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc1)

        doc2 = Document()
        doc2.add(Field("field",
                       "the fast fox hopped over the hound",
                       Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc2)
        writer.close()

        self.searcher = IndexSearcher(directory, True)

    def testBasic(self):
        query = MultiPhraseQuery()
        query.add([Term("field", "quick"), Term("field", "fast")])
        query.add(Term("field", "fox"))
        print query

        topDocs = self.searcher.search(query, 10)
        self.assertEqual(1, topDocs.totalHits, "fast fox match")

        query.setSlop(1)
        topDocs = self.searcher.search(query, 10)
        self.assertEqual(2, topDocs.totalHits, "both match")

    def testAgainstOR(self):
        quickFox = PhraseQuery()
        quickFox.setSlop(1)
        quickFox.add(Term("field", "quick"))
        quickFox.add(Term("field", "fox"))

        fastFox = PhraseQuery()
        fastFox.add(Term("field", "fast"))
        fastFox.add(Term("field", "fox"))

        query = BooleanQuery()
        query.add(quickFox, BooleanClause.Occur.SHOULD)
        query.add(fastFox, BooleanClause.Occur.SHOULD)
        topDocs = self.searcher.search(query, 10)
        self.assertEqual(2, topDocs.totalHits)

    def debug(self, hits):
        for i, doc in hits:
            print "%s: %s" % (hits.score(i), doc['field'])
def search(command=command1):
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(command)
    res = searcher.search(query, 1000000)
    print 'Total hits:', res.totalHits
    # return searcher, res
    return [searcher.doc(doc.doc) for doc in res.scoreDocs[:20]]
def getCrowds(self, query, field=CrowdFields.text):
    searcher = IndexSearcher(self.index, True)
    q = QueryParser(Version.LUCENE_CURRENT, field,
                    self.analyzer).parse(query)
    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(q, collector)
    hits = collector.topDocs().scoreDocs
    return [searcher.doc(scoreDoc.doc).get(CrowdFields.id)
            for scoreDoc in hits]
def SearchFiles(command):
    STORE_DIR = "lucene/index"
    getVMEnv().attachCurrentThread()
    # print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    rankedfiles = run(searcher, analyzer, command)
    searcher.close()
    return rankedfiles
def TotalSearch(self, keyWord):
    try:
        searcher = IndexSearcher(self.indexDir)
        keyWord = keyWord.encode('utf8')
        query = QueryParser(Version.LUCENE_30, "title",
                            self.analyzer).parse(keyWord)
        hits = searcher.search(query, 1000)
        return self.__MakeResultFormat(hits, searcher)
    except Exception, err:
        sys.stderr.write("ERROR: %s\n" % str(err))
def search(command):
    STORE_DIR = "index"
    vm_env = initVM()
    print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    result = run(searcher, analyzer, command)
    searcher.close()
    return result
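# The run() helper that this wrapper (and several of the others here) calls
# is not shown in the corpus; a minimal sketch of what such a helper
# plausibly does, with all names assumed:
def run(searcher, analyzer, command):
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(command)
    scoreDocs = searcher.search(query, 50).scoreDocs
    return [searcher.doc(sd.doc) for sd in scoreDocs]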
def Searchfile(command, prior, page, RPP):
    STORE_DIR = "index_ans"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    store = run(searcher, analyzer, command, prior)
    searcher.close()

    start = (page - 1) * RPP
    end = start + RPP
    return store[start:end], len(store)
class LuceneSearch(object):

    def __init__(self):
        STORE_DIR = "index"
        initVM()
        print 'lucene', VERSION
        self.directory = SimpleFSDirectory(File(STORE_DIR))
        print self.directory
        self.searcher = IndexSearcher(self.directory, True)
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    def close(self):
        self.searcher.close()

    def raw_search(self, query_string):
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            self.analyzer).parse(query_string)
        scoreDocs = self.searcher.search(query, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)
        matches = []
        for scoreDoc in scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            contents = LuceneDoc.load(doc.get('name'))
            matches.append({'contents': contents, 'doc': doc})
        return matches

    def search(self, query):
        matches = self.raw_search(query)
        results = ''
        if len(matches) > 0:
            results += str(len(matches)) + " results <br/>"
            for match in matches:
                results += '<a href=' + str(match['contents']['dealUrl']) + '>' + \
                           str(match['contents']['merchant']) + '</a><br />'
                results += '<p>' + str(match['contents']['shortAnnouncementTitle']) + ',' + \
                           str(match['contents']['redemptionLocation']) + '</p><br/>'
        else:
            results = "0 results <br/>"
        return results

    def cli_search(self):
        while True:
            print
            print "Hit enter with no input to quit."
            command = raw_input("Query:")
            if command == '':
                return
            matches = self.raw_search(command)
            print
            print "Searching for:", command
            for match in matches:
                print match['contents']['dealUrl']
                print match['contents']['merchant'], ',', \
                    match['contents']['redemptionLocation'], ', ', \
                    match['contents']['div']
                print match['contents']['shortAnnouncementTitle']
                print '-' * 80
def testExclusive(self):
    searcher = IndexSearcher(self.directory, True)
    # pub date of TTC was October 1988
    query = NumericRangeQuery.newIntRange("pubmonth",
                                          Integer(198805), Integer(198810),
                                          False, False)
    topDocs = searcher.search(query, 100)
    self.assertEqual(0, topDocs.totalHits)
    searcher.close()
def GET(self):
    form1 = login()
    user_data = web.input()
    vm_env.attachCurrentThread()
    STORE_DIR = "F:\\imgindex"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
    a, b, c, d, e = img_func(user_data.keyword, searcher, analyzer)
    searcher.close()
    return render.img_result(form1, a, b, c, d, e)
def testExclusive(self):
    searcher = IndexSearcher(self.directory, True)
    # pub date of TTC was October 1988
    query = NumericRangeQuery.newIntRange("pubmonth",
                                          198805, 198810,
                                          False, False)
    topDocs = searcher.search(query, 100)
    self.assertEqual(0, topDocs.totalHits)
    searcher.close()
def pesquisar_com_lucene():
    initVM()

    # Get a handle to the index directory
    directory = SimpleFSDirectory(File(STORE_DIR))
    # Implements search over a single IndexReader; use a single instance
    # across queries to improve performance.
    ireader = IndexReader.open(directory, True)
    searcher = IndexSearcher(ireader)
    # Get the analyzer
    analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT)

    for query in querys:
        query_number = query.query_number
        # Construct a query parser; we specify which field to search in.
        query.query_text = query.query_text.replace('?', '')
        query.query_text = query.query_text.replace('*', '')
        queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)
        # Create the query
        query = queryParser.parse(query.query_text)
        # Run the query and keep the top 50,000 results
        topDocs = searcher.search(query, 50000)
        # Get the top hits
        scoreDocs = topDocs.scoreDocs
        r = resultado_query(query_number, scoreDocs)
        resultados.append(r)

    with open('resultados_da_busca/resultados.csv', 'w') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=';', quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        for row in resultados:
            resultados_da_row = []
            i = 1
            for resultado_da_query in row.query_results:
                doc = searcher.doc(resultado_da_query.doc)
                resultados_da_row.append((i, int(doc.get(FIELD_PATH))))
                i = i + 1
            spamwriter.writerow([row.query_number, resultados_da_row])
def begining(command):
    STORE_DIR = "index"
    global vm_env
    vm_env = initVM()
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = lucene.WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    a = run(command, searcher, analyzer)
    searcher.close()
    return a
def LatestSearch(self):
    try:
        searcher = IndexSearcher(self.indexDir)
        today = time.strftime('%Y%m%d')
        keyWord = today.encode('utf8')
        print keyWord
        query = QueryParser(Version.LUCENE_30, "regDate",
                            self.analyzer).parse(keyWord)
        hits = searcher.search(query, 1000)
        return self.__MakeResultFormat(hits, searcher)
    except:
        # the original message said 'TotalSearch'; corrected to match the method
        print 'BookSearcher LatestSearch Exception'
def testWithSlop(self):
    searcher = IndexSearcher(self.directory, True)

    parser = QueryParser(Version.LUCENE_CURRENT, "contents",
                         self.porterAnalyzer)
    parser.setPhraseSlop(1)

    query = parser.parse('"over the lazy"')
    topDocs = searcher.search(query, 50)
    self.assertEqual(1, topDocs.totalHits, "hole accounted for")
def testTerm(self):
    searcher = IndexSearcher(self.directory, True)

    t = Term("subject", "ant")
    query = TermQuery(t)
    scoreDocs = searcher.search(query, 50).scoreDocs
    self.assertEqual(1, len(scoreDocs), "JDwA")

    t = Term("subject", "junit")
    scoreDocs = searcher.search(TermQuery(t), 50).scoreDocs
    self.assertEqual(2, len(scoreDocs))

    searcher.close()
def setUp(self):
    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, self.synonymAnalyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    doc = Document()
    doc.add(Field("content",
                  "The quick brown fox jumps over the lazy dogs",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()
    self.searcher = IndexSearcher(self.directory, True)
def testAnd(self):
    searchingBooks = TermQuery(Term("subject", "search"))
    books2004 = NumericRangeQuery.newIntRange("pubmonth",
                                              Integer(200401), Integer(200412),
                                              True, True)

    searchingBooks2004 = BooleanQuery()
    searchingBooks2004.add(searchingBooks, BooleanClause.Occur.MUST)
    searchingBooks2004.add(books2004, BooleanClause.Occur.MUST)

    searcher = IndexSearcher(self.directory, True)
    scoreDocs = searcher.search(searchingBooks2004, 50).scoreDocs
    self.assertHitsIncludeTitle(searcher, scoreDocs, "Lucene in Action")
def testSpecifiedOperator(self):
    MUST = BooleanClause.Occur.MUST
    query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT,
                                        "development",
                                        ["title", "subject"],
                                        [MUST, MUST],
                                        SimpleAnalyzer())
    searcher = IndexSearcher(self.directory, True)
    scoreDocs = searcher.search(query, 50).scoreDocs

    self.assertHitsIncludeTitle(searcher, scoreDocs,
                                "Java Development with Ant")
    self.assertEqual(1, len(scoreDocs), "one and only one")
def testIdRangeQuery(self):
    parser = CustomQueryParser("field", self.analyzer)

    query = parser.parse("id:[37 TO 346]")
    self.assertEqual("id:[0000000037 TO 0000000346]",
                     query.toString("field"), "padded")

    searcher = IndexSearcher(self.directory, True)
    scoreDocs = searcher.search(query, 1000).scoreDocs
    self.assertEqual(310, len(scoreDocs))

    print parser.parse("special:[term TO *]")
    print parser.parse("special:[* TO term]")
def setUp(self):
    # set up a sample document
    directory = RAMDirectory()
    writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    doc = Document()
    doc.add(Field("field",
                  "the quick brown fox jumped over the lazy dog",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()

    self.searcher = IndexSearcher(directory)
def testStems(self):
    searcher = IndexSearcher(self.directory)

    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        self.porterAnalyzer).parse("laziness")
    topDocs = searcher.search(query, 50)
    self.assertEqual(1, topDocs.totalHits, "lazi")

    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        self.porterAnalyzer).parse('"fox jumped"')
    topDocs = searcher.search(query, 50)
    self.assertEqual(1, topDocs.totalHits, "jump jumps jumped jumping")