def run(command, searcher, analyzer):
    if command == '':
        return
    # Tokenize the (Chinese) query with jieba before handing it to Lucene.
    command = " ".join(jieba.cut(command))

    # Search the "charac" field first.
    query = lucene.QueryParser(lucene.Version.LUCENE_CURRENT, "charac",
                               analyzer).parse(command)
    scoreDocs = searcher.search(query, 50).scoreDocs
    title = []
    url = []
    imgurl = []
    score = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        title.append(doc.get("title0"))
        url.append(doc.get("pUrl"))
        imgurl.append(doc.get("imgUrl"))
        score.append(doc.get("score"))

    # Then search the "title" field and append those hits as well.
    query = lucene.QueryParser(lucene.Version.LUCENE_CURRENT, "title",
                               analyzer).parse(command)
    scoreDocs = searcher.search(query, 50).scoreDocs
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        title.append(doc.get("title0"))
        url.append(doc.get("pUrl"))
        imgurl.append(doc.get("imgUrl"))
        score.append(doc.get("score"))

    resultInfo = "%s total matching images." % len(title)
    return resultInfo, title, url, imgurl, score
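# A minimal driver sketch for run() above. The index path ("index/"), the
# analyzer choice, and the sample query are assumptions for illustration;
# the JVM must be started once per process before any lucene.* call, and
# the analyzer should match the one used at index time.
if __name__ == '__main__':
    import lucene
    lucene.initVM()
    directory = lucene.SimpleFSDirectory(lucene.File("index/"))
    searcher = lucene.IndexSearcher(directory, True)
    analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    resultInfo, titles, urls, imgurls, scores = run(u"白猫", searcher, analyzer)
    print resultInfo
    searcher.close()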
def search(self, restrictions, destination):
    """
    @see: L{NullPrincipalSearcher<datafinder.persistence.search.searcher.NullSearcher>}

    E1101: Pylint cannot detect the internals of the modules solr and lucene.
    """
    # pylint: disable=E1101
    results = list()
    queryString = search_restriction_mapping.mapSearchRestriction(restrictions)
    if self._configuration.luceneIndexUri.startswith("file:///"):
        try:
            self._configuration.env.attachCurrentThread()
            indexDir = lucene.SimpleFSDirectory(lucene.File(
                self._configuration.luceneIndexUri.replace("file:///", "")))
            analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
            searcher = lucene.IndexSearcher(indexDir)
            query = lucene.QueryParser(lucene.Version.LUCENE_CURRENT,
                                       "content", analyzer).parse(queryString)
            hits = searcher.search(query, constants.MAX_RESULTS)
            for hit in hits.scoreDocs:
                doc = searcher.doc(hit.doc)
                results.append("/%s" % urllib.unquote(
                    doc.get(constants.FILEPATH_FIELD).encode("utf-8")))
            searcher.close()
        except Exception, error:
            errorMessage = "Cannot search items. Reason: '%s'" % error
            raise PersistenceError(errorMessage)
    return results
def getRecentConversations(self, username):
    # Determine index and data paths
    index_dir = self.indexdir + username
    data_dir = self.datadir + username

    # Load the index
    if os.path.isdir(index_dir):
        luc_index = lucene.FSDirectory.getDirectory(index_dir)

        # Get the current time in UTC seconds
        curtime = int(time.time())

        # Convert to a search range
        searchstart = self.__padTimestamp(curtime - SECONDS_IN_20_MINUTES)
        searchend = self.__padTimestamp(MAX_TIMESTAMP)

        # Build and perform the query
        qtext = "timestamp:[" + searchstart + " TO " + searchend + "]"
        searcher = lucene.IndexSearcher(luc_index)
        qparser = lucene.QueryParser("text", lucene.StandardAnalyzer())
        query = qparser.parse(qtext)
        sortmethod = lucene.Sort(["protocol", "friend_chat", "timestamp"])
        qresults = searcher.search(query, sortmethod)

        # Fetch the results
        conversationlist = []
        for i in range(qresults.length()):
            mprotocol = qresults.doc(i).get("protocol")
            mfriend_chat = qresults.doc(i).get("friend_chat")
            mtimestamp = int(qresults.doc(i).get("timestamp"))
            mwho_sent = qresults.doc(i).get("who_sent")
            mfileoffset = int(qresults.doc(i).get("file_offset"))
            mrank = qresults.score(i)

            # This is a slow method that should be optimized at a later
            # date: search through all previously retrieved conversations
            # and check for a match.  If a match is found, add the message
            # to it; otherwise create a new conversation.
            messagetext = self.__getMessageFromFile(
                username, mfriend_chat, mprotocol, mfileoffset)
            message = LogMessage(messagetext, mtimestamp, mwho_sent)
            message.setRank(mrank)
            found = False
            for conversation in conversationlist:
                if conversation.getProtocol() == mprotocol and \
                   conversation.getFriendChat() == mfriend_chat:
                    found = True
                    conversation.addMessage(message)
                    break
            if not found:
                conversation = LogConversation(mprotocol, mfriend_chat)
                conversation.addMessage(message)
                conversationlist.append(conversation)
        return conversationlist
    else:
        # Index does not exist
        return False
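# __padTimestamp() is referenced above but not shown.  Because timestamps
# are stored and range-queried as text, it presumably zero-pads the value
# so that lexicographic order matches numeric order; a minimal sketch
# under that assumption (10 digits covers epoch seconds up to MAX_TIMESTAMP):
def __padTimestamp(self, timestamp):
    # e.g. 946684800 -> "0946684800"
    return str(int(timestamp)).zfill(10)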
def search(searcher, analyzer):
    print
    query = lucene.QueryParser(lucene.Version.LUCENE_CURRENT, "name",
                               analyzer).parse("nitin")
    hits = searcher.search(query, MAX_RESULTS).scoreDocs
    for hit in hits:
        doc = searcher.doc(hit.doc)
        print "name is:nitin and data is:", doc
def search2(searcher, analyzer):
    print
    query = lucene.QueryParser(lucene.Version.LUCENE_CURRENT, "name",
                               analyzer).parse("nitin2")
    hits = searcher.search(query, MAX_RESULTS).scoreDocs
    for hit in hits:
        doc = searcher.doc(hit.doc)
        print "name is:nitin2 and data is:", doc
    print
    print "Successfully retrieved documents....."
def __init__(self, session, config, parent):
    IndexStore.__init__(self, session, config, parent)
    path = self.get_path(session, 'defaultPath')
    self.analyzer = NullC3Analyzer()
    self.dir = lucene.FSDirectory.getDirectory(path, False)
    self.parser = lucene.QueryParser("", lucene.StandardAnalyzer())
    self.searcher = lucene.IndexSearcher(self.dir)
    self.writer = None
    self.currDoc = None
    self.currRec = None
def SearchExactContents(self, keyword):
    "Perform exact matching against blog contents"
    searcher = lucene.IndexSearcher(self.store)
    print("Searching for %s" % keyword)
    # Re-encode the keyword from CP949 (Korean Windows) to UTF-8.
    k = keyword.decode('cp949').encode('utf-8')
    query = lucene.QueryParser('contents', self.analyzer).parse(k)
    hits = searcher.search(query)
    print("%s matching documents" % hits.length())
    return self.__MakeResultFormat(hits, searcher)
def SearchKeyword(indexDir, keyword):
    directory = lucene.FSDirectory.getDirectory(indexDir)
    searcher = lucene.IndexSearcher(directory)  # index searcher object
    analyzer = lucene.StandardAnalyzer()
    print("Searching for %s" % keyword)
    # Re-encode the keyword from CP949 (Korean Windows) to UTF-8.
    keyword = keyword.decode('cp949').encode('utf-8')
    queryParser = lucene.QueryParser('content', analyzer)  # build the query
    query = queryParser.parse(keyword)
    hits = searcher.search(query)  # run the search
    print("%s matching documents" % hits.length())  # number of hits
    for h in hits:  # print the results
        doc = lucene.Hit.cast_(h).getDocument()
        print("Path: %s, name: %s" % (doc.get("path"), doc.get("name")))
    searcher.close()
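# Example invocation of SearchKeyword() above.  The index path and the
# keyword are illustrative assumptions; lucene.initVM() must have been
# called once before any other lucene.* call.
if __name__ == '__main__':
    import lucene
    lucene.initVM()
    SearchKeyword("./blog_index", u"검색".encode('cp949'))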
def search(input_q, web_data):
    numberOfHits = 5
    collector = lucene.TopScoreDocCollector.create(numberOfHits, True)
    searcher = lucene.IndexSearcher(directory, True)
    qp = lucene.QueryParser(lucene.Version.LUCENE_CURRENT, 'word', analyzer)
    qp.setDefaultOperator(lucene.QueryParser.Operator.OR)
    query = qp.parse(input_q)
    searcher.search(query, collector)
    score_docs = collector.topDocs().scoreDocs
    url_list = []
    for my_doc in score_docs:
        doc = searcher.doc(my_doc.doc)
        # Map the hit's page number back to its URL.
        url_list.append('http://' + web_data[doc['page_num']])
    return url_list
def search(self, query, field="content", limit=None):
    '''Searches the index based on the query supplied.'''
    directory = lucene.SimpleFSDirectory(lucene.File(self.index_dir))
    searcher = lucene.IndexSearcher(directory, True)
    query = lucene.QueryParser(lucene.Version.LUCENE_CURRENT, field,
                               self.analyser).parse(query)
    results = []
    try:
        # If there's no limit, use a collector to retrieve them all.
        if limit is None:
            collector = DocumentHitCollector(searcher)
            searcher.search(query, collector)
            results = collector.get_collected_documents()
        else:
            scoreDocs = searcher.search(query, limit).scoreDocs
            for scoreDoc in scoreDocs:
                results.append(searcher.doc(scoreDoc.doc))
    except lucene.JavaError, e:
        print e
    return results
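# DocumentHitCollector used above is a custom collector defined elsewhere.
# A minimal sketch of what it might look like, assuming PyLucene 3.x's
# lucene.PythonCollector extension point (an assumption, not the original
# implementation):
class DocumentHitCollector(lucene.PythonCollector):

    def __init__(self, searcher):
        super(DocumentHitCollector, self).__init__()
        self.searcher = searcher
        self.docBase = 0
        self.docs = []

    def collect(self, doc, score):
        # Doc ids are relative to the current reader segment.
        self.docs.append(self.searcher.doc(self.docBase + doc))

    def setNextReader(self, reader, docBase):
        self.docBase = docBase

    def acceptsDocsOutOfOrder(self):
        return True

    def get_collected_documents(self):
        return self.docs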
def search_location(word):
    print("searching ")
    # Attach this (possibly worker) thread to the running JVM.
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    searcher = lucene.IndexSearcher(directory1, True)
    query = lucene.QueryParser(lucene.Version.LUCENE_CURRENT, 'eng',
                               analyzer1).parse(word)
    results = searcher.search(query, None, 20)
    score_docs = results.scoreDocs
    matches = []
    for score_doc in score_docs:
        doc = searcher.doc(score_doc.doc)
        matches.append(doc['eng'])
    searcher.close()
    return matches
index_file = siteconfig.get("search_index_file")

if lucene_is_2x:
    store = lucene.FSDirectory.getDirectory(index_file, False)
elif lucene_is_3x:
    store = lucene.FSDirectory.open(lucene.File(index_file))
else:
    assert False

try:
    searcher = lucene.IndexSearcher(store)
except lucene.JavaError, e:
    # FIXME: show a useful error
    raise e

if lucene_is_2x:
    parser = lucene.QueryParser('text', lucene.StandardAnalyzer())
    result_ids = [int(lucene.Hit.cast_(hit).getDocument().get('id'))
                  for hit in searcher.search(parser.parse(query))]
elif lucene_is_3x:
    parser = lucene.QueryParser(
        lucene.Version.LUCENE_CURRENT, 'text',
        lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT))
    result_ids = [searcher.doc(hit.doc).get('id')
                  for hit in searcher.search(parser.parse(query),
                                             100).scoreDocs]

searcher.close()

results = ReviewRequest.objects.filter(id__in=result_ids,
                                       local_site__name=local_site_name)

return object_list(request=request,
def __getSurroundingMessages(self, when, searcher, username, protocol,
                             friend_chat, timestamp, docid):
    # Determine the query text
    if when == "before":
        searchstart = self.__padTimestamp(timestamp - SECONDS_IN_5_MINUTES)
        searchend = self.__padTimestamp(timestamp)
    else:
        searchstart = self.__padTimestamp(timestamp)
        searchend = self.__padTimestamp(timestamp + SECONDS_IN_5_MINUTES)
    querytext = "timestamp:[" + searchstart + " TO " + searchend + "]"
    querytext += ' AND protocol:"' + protocol + '"'
    querytext += ' AND friend_chat:"' + friend_chat + '"'

    # Build and perform the query
    qparser = lucene.QueryParser("text", lucene.StandardAnalyzer())
    query = qparser.parse(querytext)
    sortmethod = lucene.Sort("timestamp")
    qresults = searcher.search(query, sortmethod)

    # Determine which results to include
    if when == "before":
        rangestart = 0
    else:
        if qresults.length() > 5:
            rangestart = qresults.length() - 5
        else:
            rangestart = 0
    # We can't assume the results will exclude messages outside the range
    # we are looking for, in the case that many messages in the
    # conversation have an identical timestamp.  We just deal with that
    # in the for loop.
    rangeend = qresults.length()

    # Fetch the results
    messagelist = []
    ignore = False
    for i in range(rangestart, rangeend):
        mid = int(qresults.id(i))
        if mid == docid:
            if when == "before":
                # We ran into the reference point message; we don't need
                # to capture any more, so we can return.
                break
            else:
                # We ran into the reference point message.  The easiest
                # thing to do is discard all messages found so far and
                # reset the list to be returned.  Also stop ignoring, in
                # case we had already reached 5 messages before now.
                messagelist = []
                ignore = False
        elif not ignore:
            mtimestamp = int(qresults.doc(i).get("timestamp"))
            mwho_sent = qresults.doc(i).get("who_sent")
            mfileoffset = int(qresults.doc(i).get("file_offset"))
            messagetext = self.__getMessageFromFile(
                username, friend_chat, protocol, mfileoffset)
            message = LogMessage(messagetext, mtimestamp, mwho_sent)
            message.setID(mid)
            messagelist.append(message)
            # Only allow up to 5 messages
            if len(messagelist) == 5:
                # Setting an ignore flag lets us deal with cases like the
                # reference point message being the 7th message in a run
                # that all share the same timestamp while we are trying
                # to get the 5 messages after it.
                ignore = True
    return messagelist
def searchMessages(self, username, querytext):
    # Determine index and data paths
    index_dir = self.indexdir + username
    data_dir = self.datadir + username

    # Load the index
    if os.path.isdir(index_dir):
        luc_index = lucene.FSDirectory.getDirectory(index_dir)

        # Build and perform the query
        searcher = lucene.IndexSearcher(luc_index)
        qparser = lucene.QueryParser("text", lucene.StandardAnalyzer())
        query = qparser.parse(querytext)
        qresults = searcher.search(query)

        # Fetch the results
        conversationlist = []
        for i in range(qresults.length()):
            mid = int(qresults.id(i))
            mprotocol = qresults.doc(i).get("protocol")
            mfriend_chat = qresults.doc(i).get("friend_chat")
            mtimestamp = int(qresults.doc(i).get("timestamp"))
            mwho_sent = qresults.doc(i).get("who_sent")
            mfileoffset = int(qresults.doc(i).get("file_offset"))
            mrank = qresults.score(i)

            # First check whether the hit already exists in one of the
            # previously matched conversations.
            found = False
            for conversation in conversationlist:
                for msg in conversation.messages:
                    if msg.getID() == mid:
                        # Match found, so just update the message's rank
                        msg.setRank(mrank)
                        found = True

            # If no match was found, create a new conversation
            if not found:
                conversation = LogConversation(mprotocol, mfriend_chat)
                messagetext = self.__getMessageFromFile(
                    username, mfriend_chat, mprotocol, mfileoffset)
                before_msgs = self.__getSurroundingMessages(
                    "before", searcher, username, mprotocol,
                    mfriend_chat, mtimestamp, mid)
                for msg in before_msgs:
                    conversation.addMessage(msg)
                message = LogMessage(messagetext, mtimestamp, mwho_sent)
                message.setRank(mrank)
                message.setID(mid)
                conversation.addMessage(message)
                after_msgs = self.__getSurroundingMessages(
                    "after", searcher, username, mprotocol,
                    mfriend_chat, mtimestamp, mid)
                for msg in after_msgs:
                    conversation.addMessage(msg)
                conversationlist.append(conversation)
        return conversationlist
    else:
        # Index not found
        return False
def searchWeibos(self, q, created_at_start_secs, created_at_end_secs,
                 user_ids=list()):
    startexec = datetime.datetime.now()
    query = lucene.BooleanQuery()
    query.setMaxClauseCount(2097152)
    sorter = lucene.Sort(
        lucene.SortField("created_at", lucene.SortField.INT, True))

    # With a text query, apply the date range as a filter; otherwise make
    # the date range the query itself.
    dateFilter = None
    if len(q) > 0:
        query.add(
            lucene.QueryParser(lucene.Version.LUCENE_33, "text",
                               self.analyzers["smartcn"]).parse(q),
            lucene.BooleanClause.Occur.MUST)
        if created_at_start_secs is not None and \
                created_at_end_secs is not None:
            dateFilter = lucene.NumericRangeFilter.newIntRange(
                "created_at", created_at_start_secs, created_at_end_secs,
                True, True)
    else:
        if created_at_start_secs is not None and \
                created_at_end_secs is not None:
            query.add(
                lucene.NumericRangeQuery.newIntRange(
                    "created_at", created_at_start_secs,
                    created_at_end_secs, True, True),
                lucene.BooleanClause.Occur.MUST)

    if len(user_ids) > 0:
        # Build one NumericRangeFilter per user id and OR them together.
        user_ids_str = list()
        numfilters = list()
        for x in user_ids:
            user_ids_str.append(str(x))
            numfilters.append(lucene.NumericRangeFilter.newIntRange(
                "user_id", x, x, True, True))
        chainedNumFilters = lucene.ChainedFilter(numfilters,
                                                 lucene.ChainedFilter.OR)
        cachingChainedNumFilters = lucene.CachingWrapperFilter(
            chainedNumFilters)
        if len(q) > 0:
            chain = lucene.ChainedFilter(
                [cachingChainedNumFilters, dateFilter],
                lucene.ChainedFilter.AND)
        else:
            chain = cachingChainedNumFilters
        topDocs = self.searcher.search(query, chain, sorter)
    else:
        if len(q) > 0 and created_at_start_secs is not None and \
                created_at_end_secs is not None:
            topDocs = self.searcher.search(query, dateFilter,
                                           self.MAX_ITEMS, sorter)
        else:
            topDocs = self.searcher.search(query, self.MAX_ITEMS, sorter)

    # Collect the matching documents.
    ids = list()
    hits = list()
    for scoreDoc in topDocs.scoreDocs:
        doc = self.searcher.doc(scoreDoc.doc)
        hits.append({"id": doc.get("id"), "user_id": doc.get("user_id")})

    out = {
        "totalhits": topDocs.totalHits,
        "nb_users": len(user_ids),
        "ids": ids,
        "q": q,
        "hits": hits,
    }
    out["lucene_query_finished"] = long(
        time.mktime(datetime.datetime.now().timetuple())) * 1000
    if len(user_ids) > 0:
        out["user_ids"] = user_ids_str

    # Logging
    f = open("/var/data/sinaweibo/searchlog/searchweibos.log", "a")
    f.write(datetime.datetime.strftime(datetime.datetime.now(),
                                       "%Y-%m-%d %H:%M:%S") + "\t" + q + "\n")
    f.close()

    endexec = datetime.datetime.now()
    td = endexec - startexec
    microtime = td.microseconds + (td.seconds + td.days * 86400) * 1000000
    secondstime = microtime / 1000000.0
    out["secs"] = secondstime
    print out
    return out
def query_content(self, query_string, max_hits=5000):
    query_parser = lucene.QueryParser(lucene.Version.LUCENE_30, "content",
                                      self.analyzer)
    query = query_parser.parse(query_string)
    return self.search.search(query, max_hits)
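# query_content() above returns the TopDocs produced by the underlying
# IndexSearcher (self.search).  A caller would unpack the hits roughly
# like this (the names are assumptions for illustration):
#
#     top_docs = indexer.query_content("lucene")
#     for score_doc in top_docs.scoreDocs:
#         print indexer.search.doc(score_doc.doc).get("content")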
def searchForums(self, q, time_start_secs, time_end_secs, uids=list(),
                 offset=None, floor=None):
    if offset is not None:
        try:
            offset = int(offset)
            if offset > self.MAX_ITEMS:
                self.MAX_ITEMS = offset + 100
        except (TypeError, ValueError):
            pass

    # Parse a floor specification of the form "N" or "N-M".
    page_start = page_end = None
    if floor is not None and len(floor) > 0:
        m = re.match(r"(\d+)-?(\d*)", floor)
        if m is not None:
            page_start = int(m.group(1))
            try:
                page_end = int(m.group(2))
            except ValueError:
                page_end = page_start

    startexec = datetime.datetime.now()
    query = lucene.BooleanQuery()
    query.setMaxClauseCount(2097152)
    sorter = lucene.Sort(
        lucene.SortField("time", lucene.SortField.INT, True))

    # With a text query, apply the time range as a filter; otherwise make
    # the time range the query itself.
    dateFilter = None
    pageFilter = None
    if len(q) > 0:
        query.add(
            lucene.QueryParser(lucene.Version.LUCENE_33, "content",
                               self.analyzers["smartcn"]).parse(q),
            lucene.BooleanClause.Occur.MUST)
        dateFilter = lucene.NumericRangeFilter.newIntRange(
            "time", time_start_secs, time_end_secs, True, True)
    else:
        query.add(
            lucene.NumericRangeQuery.newIntRange(
                "time", time_start_secs, time_end_secs, True, True),
            lucene.BooleanClause.Occur.MUST)
    if page_start is not None and page_end is not None:
        pageFilter = lucene.NumericRangeFilter.newIntRange(
            "floor", page_start, page_end, True, True)

    if len(uids) > 0:
        # Build one NumericRangeFilter per uid and OR them together.
        uids_str = list()
        numfilters = list()
        for x in uids:
            uids_str.append(str(x))
            numfilters.append(lucene.NumericRangeFilter.newIntRange(
                "uid", x, x, True, True))
        chainedNumFilters = lucene.ChainedFilter(numfilters,
                                                 lucene.ChainedFilter.OR)
        cachingChainedNumFilters = lucene.CachingWrapperFilter(
            chainedNumFilters)
        if len(q) > 0:
            # Drop any filters that were never built before chaining.
            filters = [f for f in [cachingChainedNumFilters, dateFilter,
                                   pageFilter] if f is not None]
            chain = lucene.ChainedFilter(filters, lucene.ChainedFilter.AND)
        else:
            chain = cachingChainedNumFilters
        topDocs = self.searcher.search(query, chain, sorter)
    else:
        if len(q) > 0 and time_start_secs is not None and \
                time_end_secs is not None:
            if pageFilter is not None:
                chainedFilters = lucene.ChainedFilter(
                    [dateFilter, pageFilter], lucene.ChainedFilter.AND)
                topDocs = self.searcher.search(query, chainedFilters,
                                               self.MAX_ITEMS, sorter)
            else:
                topDocs = self.searcher.search(query, dateFilter,
                                               self.MAX_ITEMS, sorter)
        else:
            if pageFilter is not None:
                topDocs = self.searcher.search(query, pageFilter,
                                               self.MAX_ITEMS, sorter)
            else:
                topDocs = self.searcher.search(query, self.MAX_ITEMS,
                                               sorter)

    # Collect the matching posts.
    ids = list()
    hits = list()
    for scoreDoc in topDocs.scoreDocs:
        doc = self.searcher.doc(scoreDoc.doc)
        hits.append({"pid": doc.get("pid"), "uid": doc.get("uid"),
                     "tid": doc.get("tid")})

    out = {
        "totalhits": topDocs.totalHits,
        "nb_users": len(uids),
        "ids": ids,
        "q": q,
        "hits": hits,
    }
    out["lucene_query_finished"] = long(
        time.mktime(datetime.datetime.now().timetuple())) * 1000
    if len(uids) > 0:
        out["user_ids"] = uids_str

    # Logging
    f = open("/var/data/hkforums/searchlog/%(forum)s.log"
             % {"forum": self.forum}, "a")
    f.write(datetime.datetime.strftime(datetime.datetime.now(),
                                       "%Y-%m-%d %H:%M:%S") + "\t" + q + "\n")
    f.close()

    endexec = datetime.datetime.now()
    td = endexec - startexec
    microtime = td.microseconds + (td.seconds + td.days * 86400) * 1000000
    secondstime = microtime / 1000000.0
    out["secs"] = secondstime
    print out
    return out