def testPrefixQuery(self): parser = QueryParser(Version.LUCENE_CURRENT, "category", StandardAnalyzer(Version.LUCENE_CURRENT)) parser.setLowercaseExpandedTerms(False) print parser.parse("/Computers/technology*").toString("category")
def does_line_exist(self, line, x, y):
    """
    Old, more complex check whether a sentence already exists in the
    index.  Superseded by does_line_existNew(); kept only as a
    backward-compatible wrapper that delegates to the new implementation.

    The original legacy body (an X/Y pair lookup followed by a sentence
    query) sat unreachable behind this early return and has been removed.
    """
    return self.does_line_existNew(line, x, y)
def search(self, string ,special = None): query = "" try: MAX = 100000 #for dates such as 1931.08.06 string = string.replace("."," ") array = re.findall(r'[\w\s]+',string) string = "" for item in array: string+=item qp = QueryParser(Version.LUCENE_35, "title", analyzer) qp.setDefaultOperator(qp.Operator.AND) query = qp.parse(string) # print ("query",query) hits = searcher.search(query, MAX) sentence_list = [] for hit in hits.scoreDocs: doc = searcher.doc(hit.doc) sentence_list.append(doc.get("title").encode("utf-8")) return sentence_list except: print("Fail in receiving sentence with term "+string) print ("query",query) print "Unexpected error:", sys.exc_info()[0] # raw_input("wait") print return []
def main(cls, argv):
    """Run the sorting demo: match all books plus 'java OR action', then
    display the hits under several different sort orders."""
    qp = QueryParser(Version.LUCENE_CURRENT, "contents",
                     StandardAnalyzer(Version.LUCENE_CURRENT))
    booksQuery = BooleanQuery()
    booksQuery.add(MatchAllDocsQuery(), BooleanClause.Occur.SHOULD)
    booksQuery.add(qp.parse("java OR action"), BooleanClause.Occur.SHOULD)

    directory = SimpleFSDirectory(File(System.getProperty("index.dir")))
    demo = SortingExample(directory)

    # one entry per sort order to demonstrate, in display order
    sortOrders = [
        Sort.RELEVANCE,
        Sort.INDEXORDER,
        Sort(SortField("category", SortField.STRING)),
        Sort(SortField("pubmonth", SortField.INT, True)),
        Sort([SortField("category", SortField.STRING),
              SortField.FIELD_SCORE,
              SortField("pubmonth", SortField.INT, True)]),
        Sort([SortField.FIELD_SCORE,
              SortField("category", SortField.STRING)]),
    ]
    for sortOrder in sortOrders:
        demo.displayResults(booksQuery, sortOrder)
    directory.close()
def searchForDbpediaURI(self, uri):
    """
    Returns all anchor texts, which are related to the given DBpedia URI.
    Also returns for each anchor text the corresponding URI and the number
    of how often the anchor appears on the english Wikipedia.
    Returns [] on failure.
    """
    uri_old = uri
    uri = uri.replace("http://dbpedia.org/resource/", "")
    # strip query-parser special characters
    uri = "".join(re.findall(r'[\w\s]+', uri))
    try:
        qp = QueryParser(Version.LUCENE_35, "dbpedia_uri", analyzer)
        qp.setDefaultOperator(qp.Operator.AND)
        query = qp.parse(uri)
        MAX = 10000
        result = []
        hits = searcher.search(query, MAX)
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            dbpedia_uri = doc["dbpedia_uri"].encode("utf-8")
            # only keep exact matches against the full, unstripped URI
            if dbpedia_uri == uri_old:
                result.append([doc["anchor"].encode("utf-8"),
                               doc["anchor_uri"].encode("utf-8"),
                               dbpedia_uri,
                               doc["number"].encode("utf-8")])
        return result
    except Exception:
        # was a bare except:
        print("searchForDbpediaURI - Fail in uri: "+uri)
        return []
def searchForDbpediaURI(self, uri): """ Returns all sentences, which are tagged with the given DBpedia URI """ print "in searchForDbpediaURI" uri_old = uri uri = uri.replace("http://dbpedia.org/ontology/","") uri = uri.replace("http://dbpedia.org/property/","") uri = uri.replace("http://dbpedia.org/resource/","") array = re.findall(r'[\w\s]+',uri) uri = "" for item in array: uri+=item try: qp = QueryParser(Version.LUCENE_35, "URI", analyzer) qp.setDefaultOperator(qp.Operator.AND) query = qp.parse(uri) print "query: "+str(query) MAX = 500000 result = [] hits = searcher.search(query, MAX) for hit in hits.scoreDocs: doc = searcher.doc(hit.doc) dbpedia_uri = doc["URI"] if dbpedia_uri == uri_old: result.append([IndexUtils.sentence_wrapper(doc["Sentence"]), doc["X"], doc["Y"],dbpedia_uri]) return result except: print("Fail in uri: "+uri) print "Unexpected error:", sys.exc_info()[0] return result
def does_line_existNew(self,line,x,y): """ Checks, if parsed sentence already exists in index """ query = "" try: array = re.findall(r'[\w]+',line) string = "" for item in array: string+=item+" " qp = QueryParser(Version.LUCENE_35, "Sentence", analyzer) qp.setDefaultOperator(qp.Operator.AND) query = qp.parse(string) MAX = 10 hits = searcher.search(query, MAX) if len(hits.scoreDocs)>0: return True else: return False except Exception: s_tmp = str(sys.exc_info()) if "too many boolean clauses" in s_tmp: print "too many boolean clauses" """ Returns true, so that the sentence is not added each time, to avoid further error messages. Only occours with very large sentences. """ return True else: print "Unexpected error:", sys.exc_info()[0] print "in does line exist" print s_tmp return False
def searchString(self, string): 'searches for a string and returns an array of POS-tagged sentences' query = "" #print("Input String: ",string) try: MAX = 100000 #for dates such as 1931.08.06 string = string.replace("."," ") array = re.findall(r'[\w\s]+',string) string = "" for item in array: string+=item #print("Input String2: ",string) qp = QueryParser(Version.LUCENE_35, "sentence", analyzer) qp.setDefaultOperator(qp.Operator.AND) query = qp.parse(string) #print ("query",query) hits = searcher.search(query, MAX) #print len(hits) sentence_list = [] for hit in hits.scoreDocs: doc = searcher.doc(hit.doc) #print doc.get("sentence") sentence_list.append(eval(doc.get("sentence").encode("utf-8"))) return sentence_list except: print("Fail in receiving sentence with term "+string+" in search term") print ("query",query) print "Unexpected error:", sys.exc_info()[0] # raw_input("wait") print return []
def search(r, keyword=""):
    """Search the robot index for `keyword` (falling back to the request's
    GET parameter) and render at most 100 hits."""
    import logging
    logger = logging.getLogger("search")
    bench = Benchmark(logger)
    from lucene import IndexSearcher, StandardAnalyzer, FSDirectory, QueryParser, File, Hit
    import lucene, os
    os.environ["JAVA_HOME"] = "/usr/local/jdk1.6.0_17"
    lucene.initVM(lucene.CLASSPATH)

    index = IndexSearcher(FSDirectory.open(File(CONFIG.INDEX_PATH)), True)
    analyzer = StandardAnalyzer()
    if not keyword:
        keyword = r.GET["keyword"]

    # quote the keyword so it is parsed as a phrase
    parsed = QueryParser("context", analyzer).parse('"%s"' % keyword)
    bench.start_mark("search")
    hits = index.search(parsed)
    count = len(hits)
    result = []
    for pos, hit in enumerate(hits, 1):
        if pos > 100:
            break
        doc = Hit.cast_(hit).getDocument()
        result.append(SearchResult(doc, pos, keyword))
    index.close()
    et = bench.stop_mark()
    return render_to_response("robot_search_result.html",
                              {"result": result, "count": count, "elaspe": et})
def searchXYPair(self,x,y): """ Returns all sentences, which are tagged with the given two entities (x,y) """ tmp_hm = {} if x == "" or y == "": return [] try: array = re.findall(r'[\w\s]+',x) x = "" for item in array: x+=item qp = QueryParser(Version.LUCENE_35, "X", analyzer) qp.setDefaultOperator(qp.Operator.AND) query = qp.parse(x) MAX = 100000 result_list = [] hits = searcher.search(query, MAX) for hit in hits.scoreDocs: doc = searcher.doc(hit.doc) y_entry = doc["Y"] if y_entry == y: tmp_hm[doc["Sentence"]]="" for key in tmp_hm: result_list.append(IndexUtils.sentence_wrapper(key)) tmp_hm = {} return result_list except: print("Fail (search XYPair) in x:"+x+" y:"+y) print "Unexpected error:", sys.exc_info()[0] print return []
def query(indexName, queryString):
    """
    Run `queryString` (with '-' mapped to '_') against the "content"
    field of the index at `indexName` and return the stored 'id' field
    of up to 100 hits.
    """
    indSearcher = IndexSearcher(SimpleFSDirectory(File(indexName)))
    qp = QueryParser(Version.LUCENE_CURRENT, "content",
                     StandardAnalyzer(Version.LUCENE_CURRENT))
    qp.setDefaultOperator(qp.Operator.AND)
    query = qp.parse(queryString.replace("-", "_"))
    results = indSearcher.search(query, 100).scoreDocs
    ir = indSearcher.getIndexReader()
    res = []
    for r in results:
        # BUG FIX: fetch the document belonging to the hit (r.doc); the
        # original used the loop counter and returned the first N docs
        # of the whole index instead of the search results
        doc = ir.document(r.doc)
        res.append(doc.get('id'))
    return res
def searchKey(self, key , rank = None): query = "" try: MAX = 100000 qp = QueryParser(Version.LUCENE_35, "key", analyzer) qp.setDefaultOperator(qp.Operator.AND) query = qp.parse(key) # print ("query",query) hits = searcher.search(query, MAX) sentence_list = [] for hit in hits.scoreDocs: doc = searcher.doc(hit.doc) try: sentence_list.append(eval(doc.get("sentence").encode("utf-8"))) except: print doc.get("sentence") return sentence_list except: print("Fail in receiving sentence with term "+key) print ("query",query) print "Unexpected error:", sys.exc_info()[0] # raw_input("wait") print return []
def testSlop(self):
    """Exact phrases parse with zero slop; setPhraseSlop applies ~N."""
    exact = QueryParser(Version.LUCENE_CURRENT, "field",
                        self.analyzer).parse('"exact phrase"')
    self.assertEqual('"exact phrase"', exact.toString("field"), "zero slop")

    sloppyParser = QueryParser(Version.LUCENE_CURRENT, "field", self.analyzer)
    sloppyParser.setPhraseSlop(5)
    sloppy = sloppyParser.parse('"sloppy phrase"')
    self.assertEqual('"sloppy phrase"~5', sloppy.toString("field"),
                     "sloppy, implicitly")
def testLowercasing(self):
    """Wildcard terms are lowercased by default; the flag disables it."""
    lowered = QueryParser(Version.LUCENE_CURRENT, "field",
                          self.analyzer).parse("PrefixQuery*")
    self.assertEqual("prefixquery*", lowered.toString("field"), "lowercased")

    qp = QueryParser(Version.LUCENE_CURRENT, "field", self.analyzer)
    qp.setLowercaseExpandedTerms(False)
    preserved = qp.parse("PrefixQuery*")
    self.assertEqual("PrefixQuery*", preserved.toString("field"),
                     "not lowercased")
def extractFeatureQueryWords(query): import string from lucene import Document, TermQuery, Term # create analyzer aux_analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) try: file = open('../features.txt', 'r') featurelist = [] for line in file.readlines(): words_in_line = line.split() featurelist += words_in_line querywordlist = query.split() featureQueryList = [] productQueryList = [] for word in querywordlist: if word in featurelist: featureQueryList.append(word) else: # create parser for word aux_parser = QueryParser(Version.LUCENE_CURRENT, "title", aux_analyzer) aux_query = aux_parser.parse(word) scoreDocs = searcher.search(aux_query, 50).scoreDocs if scoreDocs: productQueryList.append(word) featureQuery = "" if featureQueryList: featureQuery = "(" for i in range(len(featureQueryList)): if i == len(featureQueryList) - 1: featureQuery += featureQueryList[i] + ")" else: featureQuery += featureQueryList[i] + " AND " print featureQuery productQuery = "" if productQueryList: productQuery = "(" for i in range(len(productQueryList)): if i == len(productQueryList) - 1: productQuery += productQueryList[i] + ")" else: productQuery += productQueryList[i] + " AND " return (featureQuery, productQuery, featureQueryList, productQueryList) except Exception, ex: print "Could not separate feature query words. Reason: ", ex return ("", "(" + query + ")", [], querywordlist)
def getResultScoreDocs(query):
    """Parse the user-submitted query against the "title" field with AND
    semantics and return the top-50 score docs."""
    parser = QueryParser(Version.LUCENE_CURRENT, "title",
                         StandardAnalyzer(Version.LUCENE_CURRENT))
    parser.setDefaultOperator(QueryParser.Operator.AND)
    parsed_query = parser.parse(query)
    return searcher.search(parsed_query, 50).scoreDocs
def testSlop(self):
    """Phrase queries: default slop is zero; setPhraseSlop adds ~N."""
    defaultParser = QueryParser(Version.LUCENE_CURRENT, "field", self.analyzer)
    self.assertEqual("\"exact phrase\"",
                     defaultParser.parse('"exact phrase"').toString("field"),
                     "zero slop")

    slopParser = QueryParser(Version.LUCENE_CURRENT, "field", self.analyzer)
    slopParser.setPhraseSlop(5)
    self.assertEqual("\"sloppy phrase\"~5",
                     slopParser.parse('"sloppy phrase"').toString("field"),
                     "sloppy, implicitly")
class IndexSearcherWrapper(object):
    """Thin convenience wrapper: opens an index at `location` and searches
    its whitespace-analyzed "text" field."""

    def __init__(self, location):
        lucene.initVM()
        store = SimpleFSDirectory(File(location))
        self.reader = IndexReader.open(store, True)
        self.searcher = IndexSearcher(self.reader)
        self.query_parser = QueryParser(Version.LUCENE_CURRENT, "text",
                                        WhitespaceAnalyzer())

    def search(self, topic, max=5000):
        """Search with the topic's title; return up to `max` hits."""
        parsed = self.query_parser.parse(topic.title)
        return self.searcher.search(parsed, max)
def build_advanced_search_query(params, operator, analyzer):
    """
    Takes a dictionary containing key=value pairs where keys are fields
    in our lucene document and values are search terms provided by the
    user. A BooleanQuery is built from these key=value pairs
    """
    parser = QueryParser(Version.LUCENE_CURRENT, "name", analyzer)
    clauses = []
    for (field, val) in get_adv_query_packet(params):
        clauses.append("%s:\"%s\"" % (field, process_query_param(val)))
    joiner = " " + operator + " "
    return parser.parse("%s" % joiner.join(clauses))
def pesquisar_com_lucene(): initVM() #print 'lucene', VERSION # Get handle to index directory directory = SimpleFSDirectory(File(STORE_DIR)) # Creates a searcher searching the provided index. ireader = IndexReader.open(directory, True) # Implements search over a single IndexReader. # Use a single instance and use it across queries # to improve performance. searcher = IndexSearcher(ireader) # Get the analyzer analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT) for query in querys: query_number = query.query_number # Constructs a query parser. We specify what field to search into. query.query_text = query.query_text.replace('?','') query.query_text = query.query_text.replace('*','') queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer) # Create the query query = queryParser.parse(query.query_text) # Run the query and get top 50 results topDocs = searcher.search(query,50000) # Get top hits scoreDocs = topDocs.scoreDocs r = resultado_query(query_number,scoreDocs) resultados.append(r) #print "%s total matching documents." % len(scoreDocs) #for scoreDoc in scoreDocs: # doc = searcher.doc(scoreDoc.doc) # print doc.get(FIELD_PATH) with open('resultados_da_busca/resultados.csv', 'w') as csvfile: spamwriter = csv.writer(csvfile, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL) for row in resultados: resultados_da_row = [] i = 1 for resultado_da_query in row.query_results: doc = searcher.doc(resultado_da_query.doc) resultados_da_row.append((i,int(doc.get(FIELD_PATH)))) i = i + 1 spamwriter.writerow([row.query_number,resultados_da_row])
def testWithSlop(self):
    """With one position of slop, the stop-word hole in "over the lazy"
    is accounted for."""
    searcher = IndexSearcher(self.directory, True)
    qp = QueryParser(Version.LUCENE_CURRENT, "contents", self.porterAnalyzer)
    qp.setPhraseSlop(1)
    phrase = qp.parse('"over the lazy"')
    hits = searcher.search(phrase, 50)
    self.assertEqual(1, hits.totalHits, "hole accounted for")
def testDateRangeQuery(self): # locale diff between jre and gcj 1/1/04 -> 01/01/04 # expression = "modified:[1/1/04 TO 12/31/04]" expression = "modified:[01/01/04 TO 12/31/04]" parser = QueryParser(Version.LUCENE_CURRENT, "subject", self.analyzer) parser.setLocale(Locale.US) query = parser.parse(expression) print expression, "parsed to", query topDocs = self.searcher.search(query, 50) self.assert_(topDocs.totalHits > 0)
def build_advanced_search_query(params, operator, analyzer):
    """
    Takes a dictionary containing key=value pairs where keys are fields
    in our lucene document and values are search terms provided by the
    user. A BooleanQuery is built from these key=value pairs
    """
    parser = QueryParser(Version.LUCENE_CURRENT, "name", analyzer)
    separator = " " + operator + " "
    pieces = ["%s:\"%s\"" % (field, process_query_param(val))
              for (field, val) in get_adv_query_packet(params)]
    return parser.parse("%s" % separator.join(pieces))
def search(self, query, category_id=None):
    """Search `query` over both the summary and title fields (OR-ed
    together, AND semantics within each field) and optionally filter the
    hits by category."""
    combined = BooleanQuery()
    for field in ('summary', 'title'):
        field_parser = QueryParser(field, self.analyzer)
        field_parser.setDefaultOperator(QueryParser.AND_OPERATOR)
        combined.add(field_parser.parse(query), BooleanClause.Occur.SHOULD)
    if not category_id:
        return self.searcher.search(combined)
    # the shared category filter is configured in place before searching
    self.catfilter.query = query
    self.catfilter.category_id = category_id
    return self.searcher.search(combined, self.catfilter)
def search(self, command, field_id="contents", sort_on=None, sort_order=False, analyzer_id=None): """Do the lucene search.""" analyzer = self.getAnalyzer(analyzer_id) try: if VERSION.startswith('1.9'): query = QueryParser.parse(command, field_id, analyzer) else: query = QueryParser(field_id, analyzer).parse(command) except JavaError: print "Error: Lucene cannot parse this query." return None if sort_on: return self.searcher.search(query, Sort(sort_on, sort_order)) return self.searcher.search(query)
def query(indexName, queryFile, runName): indReader = IndexReader.open(SimpleFSDirectory(File(indexName))) indSearcher = IndexSearcher(indReader) ir = indSearcher.getIndexReader() qp = QueryParser(Version.LUCENE_CURRENT, "content", StandardAnalyzer(Version.LUCENE_CURRENT)) f = open('results-'+runName, 'w') while(True): id = queryFile.readline() if id == "": break id = id.replace("C","") id = id.replace("\n","") queryString = queryFile.readline() queryString = queryString.replace("?","") queryString = queryString.replace("*","") queryString = queryString.replace("-","_") queryString = queryString.replace("\n","") query = qp.parse(queryString) queryFile.readline() returnedDocs = 1000 collector = TopScoreDocCollector.create(returnedDocs, True) indSearcher.search(query, collector) hits = collector.topDocs().scoreDocs size = len(hits) print "Total hits for query " +id+ ": "+str(size) i = 0 for hit in hits: docId = hits[i].doc score = hits[i].score doc = ir.document(docId) j = i + 1 f.write(id + " 0 " + doc.get('id') + " " + str(j) + " " + str(score) +" " + runName +"\n") i+=1 f.close()
def run(command):
    """Search the local index for `command` with AND semantics and fuzzy
    matching; returns the transformed hits, or None for an empty command."""
    if command == '':
        return None
    STORE_DIR = "index"
    initVM(CLASSPATH)
    directory = FSDirectory.getDirectory(STORE_DIR, False)
    searcher = IndexSearcher(directory)
    parser = QueryParser("contents", StandardAnalyzer())
    parser.setDefaultOperator(QueryParser.Operator.AND)
    # tolerate heavily misspelled terms
    parser.setFuzzyMinSim(0.2)
    parsed = parser.parse(command)
    results = map(transform, searcher.search(parsed))
    searcher.close()
    return results
def boolean_search_lucene_index(index_dir, query_text, limit):
    '''
    This function searches a boolean query in the learned lucene index
    Arguments:
        index_dir - the lucene index directory
        query_text - the query text which follows
            http://lucene.apache.org/core/3_6_0/queryparsersyntax.html
        limit - the number of records to be retrieved
    Return:
        rows - the returned document details
    '''
    DEFAULT_QUERY_FIELD = 'all'
    searcher = IndexSearcher(SimpleFSDirectory(File(index_dir)), True)
    parsed_query = QueryParser(Version.LUCENE_CURRENT, DEFAULT_QUERY_FIELD,
                               STD_ANALYZER).parse(query_text)

    start = datetime.datetime.now()
    scoreDocs = searcher.search(parsed_query, limit).scoreDocs
    duration = datetime.datetime.now() - start
    # print "Lucene Search: Found %d document(s) (in %s) that matched query '%s':" %(len(scoreDocs), duration, query)

    rows = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        table = dict((field.name(), field.stringValue())
                     for field in doc.getFields())
        # one column per known metadata field, blank when missing
        row = []
        for field in MetadataType._types:
            value = table.get(field, 'empty')
            row.append(value if value != 'empty' else '')
        # the unique file id of a file
        row.append(str(table.get(MetadataType.FILE_ID, 'empty')))
        row.append(scoreDoc.score)
        rows.append(row)
    return rows
def someMethod(self):
    """Demo of indexing and query-parsing API variants (appears to be
    example/snippet code rather than production logic)."""
    directory = RAMDirectory()
    analyzer = StandardAnalyzer()
    writer = IndexWriter(directory, analyzer, True)

    doc = Document()
    doc.add(Field.Text("title", "This is the title"))
    doc.add(Field.UnStored("contents", "...document contents..."))
    writer.addDocument(doc)
    # NOTE(review): adds the same document a second time, with an explicit
    # analyzer -- presumably only to illustrate the overload; confirm
    writer.addDocument(doc, analyzer)

    expression = "some query"
    # old-style static parse() call
    query = QueryParser.parse(expression, "contents", analyzer)
    # instance-based alternative
    parser = QueryParser("contents", analyzer)
    # NOTE(review): parseQuery is not a standard QueryParser method in
    # most Lucene versions -- verify against the bindings in use
    query = parser.parseQuery(expression)
def testAnalyzer(self):
    """A PerFieldAnalyzerWrapper keeps the category path intact where the
    standard analyzer would split it into a phrase."""
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    queryString = "category:/philosophy/eastern"

    qp = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer)
    qp.setAutoGeneratePhraseQueries(True)
    split = qp.parse(queryString)
    self.assertEqual("category:\"philosophy eastern\"",
                     split.toString("contents"), "path got split, yikes!")

    perFieldAnalyzer = PerFieldAnalyzerWrapper(analyzer)
    perFieldAnalyzer.addAnalyzer("category", WhitespaceAnalyzer())
    intact = QueryParser(Version.LUCENE_CURRENT, "contents",
                         perFieldAnalyzer).parse(queryString)
    self.assertEqual("category:/philosophy/eastern",
                     intact.toString("contents"),
                     "leave category field alone")
def testDateRangeQuery(self):
    """Parse a date range query; asserts only that parsing rewrote the
    expression (locale handling is unresolved -- see TODO below)."""
    # locale diff between jre and gcj 1/1/04 -> 01/01/04
    # expression = "modified:[1/1/04 TO 12/31/04]"
    #expression = "modified:[01/01/04 TO 31/12/04]"
    #parser = QueryParser(Version.LUCENE_CURRENT, "subject", self.analyzer)
    #parser.setLocale(Locale("en_US.utf8"))
    #query = parser.parse(expression)
    #print expression, "parsed to", query
    #TODO: locale problem currently...
    expression = "modified:[04/01/01 TO 04/12/31]"
    parser = QueryParser(Version.LUCENE_CURRENT, "subject", self.analyzer)
    parser.setLocale(Locale("en_US.utf8"))
    query = parser.parse(expression)
    print expression, "parsed to", query
    topDocs = self.searcher.search(query, 50)
    # weak assertion: only checks that parsing changed the expression,
    # not that any documents actually matched
    #self.assert_(topDocs.totalHits > 0) no docs?
    self.assert_(str(expression) != str(query))
def searchForDbpediaURImax(self, uri, number): """ Returns maximal the number of anchor texts, which are related to the given DBpedia URI. Also returns for each anchor text the corresponding URI and the number of how often the anchor appears on the English Wikipedia """ uri_old = uri uri = uri.replace("http://dbpedia.org/resource/","") array = re.findall(r'[\w\s]+',uri) uri = "" for item in array: uri+=item try: qp = QueryParser(Version.LUCENE_35, "dbpedia_uri", analyzer) qp.setDefaultOperator(qp.Operator.AND) query = qp.parse(uri) MAX = 10000 result = [] hits = searcher.search(query, MAX) for hit in hits.scoreDocs: doc = searcher.doc(hit.doc) dbpedia_uri = doc["dbpedia_uri"].encode("utf-8") if dbpedia_uri == uri_old: result.append([doc["anchor"].encode("utf-8"), doc["anchor_uri"].encode("utf-8"), dbpedia_uri, int(doc["number"].encode("utf-8"))]) result = sorted(result, key = itemgetter(3), reverse=True) if len(result) > number: return result[0:number] else: return result return result except: print("searchForDbpediaURImax - Fail in uri: "+uri) print "Unexpected error:", sys.exc_info()[0] # raise print return []
def run(searcher, analyzer, input, filepath):
    """Search the "sentence" field for the input string and write every
    matching sentence to `filepath`.

    NOTE(review): assumes `input` is gbk-encoded bytes and that the
    console expects gbk output -- confirm against the caller.
    """
    #input = raw_input("Query:").decode('gbk').encode('utf8')
    #print "Search for: " + input
    # re-encode the gbk input as utf8 before handing it to the parser
    command = convert(input.decode('gbk').encode('utf8'))
    print "Search for:" + command.decode('utf8').encode('gbk')
    qp = QueryParser(Version.LUCENE_CURRENT, "sentence", analyzer)
    #qp.setPhraseSlop(0)
    query = qp.parse(command)
    scoreDocs = searcher.search(query, 1000000).scoreDocs
    print "%s total matching documents." % len(scoreDocs)
    print
    try:
        #filepath = "D:\\TotalCode\\PyluceneSample\\Output_pylucene.txt"
        filew = open(filepath, 'w')
        result_num = 0
        for scoreDoc in scoreDocs:
            try:
                result_num += 1
                # progress heartbeat every 1000 sentences
                if result_num % 1000 == 0:
                    # time.sleep(5)
                    print "Search added " + str(result_num) + " sentences..."
                #print 'scoreDoc.doc:', scoreDoc.doc
                doc = searcher.doc(scoreDoc.doc)
                path = doc.get("path")
                #print "path:" + path
                #print 'name:', doc.get("name")
                #print 'sentence_num:', str(doc.get("sentence_num"))
                #print 'sentence:', doc.get("sentence")
                #sentence = GetSentence(doc.get("sentence_num"), path)
                sentence = doc.get("sentence")
                #print 'sentence:', sentence
                OutputSentence(filew, doc.get("name"), sentence)
            except:
                # deliberately best-effort: skip any sentence that fails
                continue
        filew.close()
    except: #Exception, e:
        print "Failed in Outputsentence:"#, e
# Creates a searcher searching the provided index. ireader = IndexReader.open(directory, True) # Implements search over a single IndexReader. # Use a single instance and use it across queries # to improve performance. searcher = IndexSearcher(ireader) # Get the analyzer analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) # Constructs a query parser. We specify what field to search into. queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer) appearance_dict = {} for TERM in term_list: print 'Searching for: "' + TERM + '"' # Create the query query = queryParser.parse(TERM) # Run the query and get documents that contain the term docs_containing_term = searcher.search(query, ireader.numDocs()) docs = [] print 'Found ' + str(len(docs_containing_term.scoreDocs) ) + ' documents with the term "' + TERM + '".' #hits = searcher.search(query, 1) for hit in (docs_containing_term.scoreDocs): #print(hit.score, hit.doc, hit.toString()) doc = searcher.doc(hit.doc) docs.append(doc.get(DOC_NAME)) appearance_dict[TERM] = set(docs) """
# Script fragment: the option-parsing loop (for o, a in opts: / if ...)
# begins outside this excerpt; these are its trailing branches.
        format = a
    elif o == "--index":
        indexDir = a
    elif o == "--stats":
        stats = True


class CustomTemplate(Template):
    # use '#' instead of '$' as the placeholder delimiter
    delimiter = '#'

template = CustomTemplate(format)

fsDir = SimpleFSDirectory(File(indexDir))
searcher = IndexSearcher(fsDir, True)

analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
parser = QueryParser(Version.LUCENE_CURRENT, "keywords", analyzer)
parser.setDefaultOperator(QueryParser.Operator.AND)
# remaining command-line args form the query terms
query = parser.parse(' '.join(args))

start = datetime.now()
scoreDocs = searcher.search(query, 50).scoreDocs
duration = datetime.now() - start
if stats:
    print >>sys.stderr, "Found %d document(s) (in %s) that matched query '%s':" %(len(scoreDocs), duration, query)

# render each hit's stored fields through the '#'-delimited template
for scoreDoc in scoreDocs:
    doc = searcher.doc(scoreDoc.doc)
    table = dict((field.name(), field.stringValue()) for field in doc.getFields())
    print template.substitute(table)
# Creates a searcher searching the provided index. ireader = IndexReader.open(directory, True) # Implements search over a single IndexReader. # Use a single instance and use it across queries # to improve performance. searcher = IndexSearcher(ireader) # Get the analyzer analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) # Constructs a query parser. queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer) # Create a query query = queryParser.parse(QUERY_STRING) topDocs = searcher.search(query, 50) # Get top hits scoreDocs = topDocs.scoreDocs print "%s total matching documents." % len(scoreDocs) HighlightFormatter = SimpleHTMLFormatter() query_score = QueryScorer (query) highlighter = Highlighter(HighlightFormatter, query_score) # Set the fragment size. We break text in to fragment of 64 characters fragmenter = SimpleSpanFragmenter(query_score, 64) highlighter.setTextFragmenter(fragmenter)