def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'UTF-8')
        if command == '':
            return
        print
        print "Searching for:", command
        querys = BooleanQuery()
        command_dict = parseCommand(command)
        for k, v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print '------------------------------------------------'
            print 'title:', doc.get('title')
            print 'url:', doc.get('url')
            print 'src:', doc.get('src')
def get_or_query(self, queries):
    """Creates an OR Boolean query from multiple Lucene queries."""
    # empty boolean query with Similarity.coord() disabled
    bq = BooleanQuery(False)
    for q in queries:
        bq.add(q, BooleanClause.Occur.SHOULD)
    return bq
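# --- Usage sketch (added illustration, not from the original source) ---
# OR-combining two TermQuerys via get_or_query. Assumes PyLucene 4.x-style
# imports, an initialized JVM (lucene.initVM()), and a hypothetical `helper`
# instance exposing the method above.
from org.apache.lucene.index import Term
from org.apache.lucene.search import TermQuery

q1 = TermQuery(Term('title', 'lucene'))
q2 = TermQuery(Term('title', 'search'))
or_query = helper.get_or_query([q1, q2])  # matches docs containing either term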
def more_like_this(self, film, count=4):
    """
    Use query-by-document techniques to find related documents
    :param film: film
    :param count: number of results
    :return: a list of related films
    """
    # Retrieve doc id of the given film
    film_query = TermQuery(Term('id', str(film.film_id)))
    results = self.searcher.search(film_query, 1)
    if results.totalHits != 1:
        return []
    # Use a MoreLikeThis query built from the matched document. The original
    # referenced an undefined `reader`; the searcher's own IndexReader is the
    # likely intent.
    mlt = MoreLikeThis(self.searcher.getIndexReader())
    mlt.setFieldNames(["title", "director", "writer", "genre", "cast", "fullplot"])
    mlt.setMinTermFreq(0)
    mlt.setMinDocFreq(0)
    mlt.setAnalyzer(self.analyzer)
    mlt_query = mlt.like(results.scoreDocs[0].doc)
    # Filter out the original film
    filtered_query = BooleanQuery()
    filtered_query.add(mlt_query, BooleanClause.Occur.MUST)
    filtered_query.add(film_query, BooleanClause.Occur.MUST_NOT)
    score_docs = self.searcher.search(filtered_query, count).scoreDocs
    return self._retrieve_in_order(score_docs)
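# --- Usage sketch (added illustration, not from the original source) ---
# Finding films related to a given one. `engine` and `film` are hypothetical:
# `engine` is an instance exposing more_like_this, and `film` any object whose
# film_id matches the indexed 'id' field.
related_films = engine.more_like_this(film, count=4)
for related in related_films:
    print related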
def run_pic(valueFromOut, searcher, analyzer):
    command = valueFromOut
    seg_list = jieba.cut(command)
    command = " ".join(seg_list)
    if command == '':
        return
    result = []
    command_dict = parseCommand(command)
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(querys, 10).scoreDocs
    print "%s total matching documents." % len(scoreDocs)
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        partResult = {}
        partResult['title'] = doc.get('title')
        partResult['url'] = doc.get('url')
        partResult['imgurl'] = doc.get('imgurl')
        result.append(partResult)
    return result
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'UTF-8')
        if command == '':
            return
        print
        print "Searching for:", command  # e.g. "Julie & Julia"
        # final = jieba.cut(command)
        # query = QueryParser(Version.LUCENE_CURRENT, "contents",
        #                     analyzer).parse(' '.join(final))
        querys = BooleanQuery()
        command_dict = parseCommand(command)
        for k, v in command_dict.iteritems():
            if k == 'site':
                t = Term('url', '*' + v.strip() + '*')
                query = WildcardQuery(t)
            else:
                query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print '------------------------------------------'
            # print 'path:', doc.get("path"), 'name:', doc.get("name"), 'site:', doc.get('site')
            print 'title:', doc.get('title'),
            print 'url:', doc.get('url')
def search_kw(kw, mode):
    vm_env.attachCurrentThread()
    lists = []
    l = jieba.cut(kw)
    query = BooleanQuery()
    for i in l:
        ii = QueryParser(Version.LUCENE_CURRENT, "introduction", analyzer).parse(i)
        query.add(ii, BooleanClause.Occur.MUST)
    if mode:
        sf = SortField("score", SortField.Type.STRING, True)
        s = Sort(sf)
    else:
        sf = SortField("comments", SortField.Type.FLOAT, True)
        s = Sort(sf)
    scoreDocs = searcher1.search(query, 20, s).scoreDocs
    for scoreDoc in scoreDocs:
        movie = []
        doc = searcher1.doc(scoreDoc.doc)
        movie.append(doc.get("url"))
        movie.append(doc.get("picture"))
        movie.append(doc.get("title"))
        movie.append(doc.get("score"))
        movie.append(doc.get("genre"))
        movie.append(doc.get("stars"))
        movie.append(doc.get("comments"))
        lists.append(movie)
    return lists
def func1(genre, year):
    vm_env.attachCurrentThread()
    lists = []
    query = BooleanQuery()
    if genre != "111":
        item = QueryParser(Version.LUCENE_CURRENT, "genre", analyzer).parse(genre)
        query.add(item, BooleanClause.Occur.MUST)
    if year != "111":
        item = QueryParser(Version.LUCENE_CURRENT, "year", analyzer).parse(year)
        query.add(item, BooleanClause.Occur.MUST)
    sf = SortField("score", SortField.Type.STRING, True)
    s = Sort(sf)
    scoreDocs = searcher1.search(query, 20, s).scoreDocs
    for scoreDoc in scoreDocs:
        movie = []
        doc = searcher1.doc(scoreDoc.doc)
        movie.append(doc.get("url"))
        movie.append(doc.get("picture"))
        movie.append(doc.get("title"))
        movie.append(doc.get("score"))
        movie.append(doc.get("genre"))
        movie.append(doc.get("stars"))
        movie.append(doc.get("comments"))
        lists.append(movie)
    return lists
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'utf-8')
        if command == '':
            return
        print "Searching for:", command
        command_dict = parseCommand(command)
        querys = BooleanQuery()
        for k, v in command_dict.iteritems():
            print k, v
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, 10).scoreDocs
        print "%s total matching documents." % len(scoreDocs)
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            # explanation = searcher.explain(query, scoreDoc.doc)
            print "------------------------"
            print 'path:', doc.get("path")
            print 'name:', doc.get("name")
            print 'title:', doc.get('title')
            print 'url:', doc.get("url")
def main():
    _vm = lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    query = BooleanQuery()
    query.add(MatchAllDocsQuery(), BooleanClause.Occur.MUST)
    query.add(TermQuery(Term('type', 'user')), BooleanClause.Occur.MUST)
    i = 0
    with zh_iatd.create_searcher() as searcher:
        with open('pagerank_data.txt', 'w') as fout:
            reslst = searcher.searcher.search(query, 100)
            initval = 1.0 / reslst.totalHits
            while len(reslst.scoreDocs) > 0:
                for x in reslst.scoreDocs:
                    realdoc = searcher.searcher.doc(x.doc)
                    obj = document_to_obj(realdoc)
                    if obj.data.followed_users is not None:
                        print '{0:8}'.format(i), ' user', obj.index, len(obj.data.followed_users)
                        fout.write('{0}\t{1}\t{2}\n'.format(
                            obj.index, initval,
                            ' '.join(x.encode('utf8') for x in obj.data.followed_users)))
                    else:
                        print '{0:8}'.format(i), 'I user', obj.index
                    i += 1
                reslst = searcher.searcher.searchAfter(reslst.scoreDocs[-1], query, 100)
def search(**kwargs):
    vm_env.attachCurrentThread()
    query = BooleanQuery()
    print("Searched keywords:")
    for field_name, keywords in kwargs.items():
        # assert field_name in SearchConfig.searchable_fields
        # keywords = list(filter(None, jieba.cut(keywords, cut_all=True)))
        keywords = list(filter(None, (k.strip() for k in jieba.cut_for_search(keywords))))
        for kw in keywords:
            print(kw)
        # construct query
        for kw in keywords:
            q = QueryParser(Version.LUCENE_CURRENT, field_name, analyzer).parse(kw)
            query.add(q, BooleanClause.Occur.SHOULD)
        if field_name == 'keywords':
            for kw in keywords:
                q = QueryParser(Version.LUCENE_CURRENT, 'ent_name', analyzer).parse(kw)
                query.add(q, BooleanClause.Occur.SHOULD)
    # search
    scoreDocs = searcher.search(query, 50).scoreDocs
    return [retrieve(searcher.doc(scoreDoc.doc)) for scoreDoc in scoreDocs]
def ch_seach(self, command_dict, target_range=None,
             targets=('title', 'author', 'text', 'likes', 'imgurl', 'label')):
    res = []
    querys = BooleanQuery()
    for key, value in command_dict.items():
        if key not in ['author', 'title', 'label', 'content']:
            continue
        query = QueryParser(Version.LUCENE_CURRENT, key,
                            self.Analyzer).parse(utils.jieba_seg(value[0]))
        if value[1]:
            querys.add(query, BooleanClause.Occur.MUST)
        else:
            querys.add(query, BooleanClause.Occur.SHOULD)
    totalDocs = self.chSearcher.search(querys, utils.MAX_RESULTS).scoreDocs
    total_match = len(totalDocs)
    if target_range is None:
        scoreDocs = totalDocs[:]
    else:
        scoreDocs = totalDocs[max(0, int(target_range[0])):
                              min(total_match, int(target_range[1]))]
    del totalDocs
    for i, scoreDoc in enumerate(scoreDocs):
        doc = self.chSearcher.doc(scoreDoc.doc)
        res.append({key: doc.get(key) for key in targets})
    return total_match, res
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command)
        if command == '':
            return
        command_dict = parseCommand(command)
        seg_list = jieba.cut(command_dict['contents'])
        command_dict['contents'] = " ".join(seg_list)
        querys = BooleanQuery()
        for k, v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        print
        print "Searching for:", command
        scoreDocs = searcher.search(querys, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)
        for i, scoreDoc in enumerate(scoreDocs):
            doc = searcher.doc(scoreDoc.doc)
            print 'path:', doc.get("path"), \
                '\nname:', doc.get("name"), \
                '\ntitle:', doc.get("title"), \
                "url:", doc.get("url"), \
                "\nsite:", doc.get("site"), "\n"
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        if command == '':
            return
        print
        print "Searching for:", command
        command_dict = parseCommand(command)
        querys = BooleanQuery()
        for k, v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, 50).scoreDocs
        finalDocTitles = []
        for i, scoreDoc in enumerate(scoreDocs):
            doc = searcher.doc(scoreDoc.doc)
            if doc.get("title") not in finalDocTitles:
                print 'title:', doc.get("title"), 'url:', doc.get("url"), \
                    'score:', scoreDoc.score, 'contents:', doc.get('contents')
                finalDocTitles.append(doc.get("title"))
                # print 'explain:', searcher.explain(query, scoreDoc.doc)
        print "%s total matching documents." % len(finalDocTitles)
def run_img(command):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index2"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    querys = BooleanQuery()
    query_content = QueryParser(Version.LUCENE_CURRENT, "urlcontent", analyzer).parse(command)
    query_title = QueryParser(Version.LUCENE_CURRENT, "title", analyzer).parse(command)
    querys.add(query_content, BooleanClause.Occur.SHOULD)
    querys.add(query_title, BooleanClause.Occur.SHOULD)
    scoreDocs = searcher.search(querys, 50).scoreDocs
    if len(scoreDocs) == 0:
        print "WARNING: No result"
    result = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        print doc.get("title")
        data = {}
        data['title'] = doc.get('title')
        data['url'] = doc.get('url')
        data['imgurl'] = doc.get('imgurl')
        result.append(data)
    return result
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        if command == '':
            print "=== [ QUIT ] ==="
            return
        print
        print "Searching for:", command
        command_dict = parseCommand(command)
        querys = BooleanQuery()
        for k, v in command_dict.iteritems():
            if 'contents' == k:
                v = " ".join(jieba.cut(v))
            if DEBUG_MODE:
                print k, v
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, MAX_ITEMS_PER_PAGE).scoreDocs
        print "%s total matching documents." % len(scoreDocs)
        for idx, scoreDoc in enumerate(scoreDocs):
            doc = searcher.doc(scoreDoc.doc)
            # explanation = searcher.explain(query, scoreDoc.doc)
            print "-- #", str(idx + 1), "--"
            print '\ttitle:\t', doc.get("title")
            print '\turl:\t', doc.get("url")
            print '\tpath:\t', doc.get("path")
            print '\tname:\t', doc.get("name")
            print
def do_mapping(line):
    regex = re.match(r"(?P<netflix_id>[0-9]+),(?P<year>([0-9]+)|NULL),(?P<title>.+)", line)
    if not regex:
        raise ValueError(line)
    netflix_id = int(regex.group("netflix_id"))
    title = QueryParser.escape(regex.group("title"))
    query1 = QueryParser(Version.LUCENE_CURRENT, "title", analyzer).parse(title)
    year = regex.group("year")
    if year == "NULL":
        scoreDocs = searcher.search(query1, 1).scoreDocs
    else:
        year = int(year)
        query2 = NumericRangeQuery.newIntRange("year", year, year, True, True)
        booleanQuery = BooleanQuery()
        booleanQuery.add(query1, BooleanClause.Occur.MUST)
        booleanQuery.add(query2, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(booleanQuery, 1).scoreDocs
    if scoreDocs:
        if scoreDocs[0].score > 1.5:
            doc = searcher.doc(scoreDocs[0].doc)
            doc_id = doc.getField("id").stringValue()
            doc.add(StringField("netflix_id", str(netflix_id), Field.Store.YES))
            writer.updateDocument(Term("id", doc_id), doc)
def createDrilldownQuery(self, luceneQuery, drilldownQueries):
    q = BooleanQuery(True)
    if luceneQuery:
        q.add(luceneQuery, BooleanClause.Occur.MUST)
    for field, path in drilldownQueries:
        q.add(TermQuery(self._fieldRegistry.makeDrilldownTerm(field, path)),
              BooleanClause.Occur.MUST)
    return q
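# --- Usage sketch (added illustration, not from the original source) ---
# Narrowing an existing query by a drilldown facet. The ('genre', ['drama'])
# pair is hypothetical; the exact shape of `path` depends on what
# self._fieldRegistry.makeDrilldownTerm expects in this codebase.
drilled = self.createDrilldownQuery(baseQuery, [('genre', ['drama'])])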
def text_search(command):
    envir.vm_env.attachCurrentThread()
    command_dict = parseCommand(command, "contents")
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k, envir.analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = envir.text_searcher.search(querys, 30).scoreDocs
    res = []
    # The original parsed the highlight query against the leftover loop
    # variable `k`; the "contents" field is clearly what is meant here.
    query_highlight = QueryParser(Version.LUCENE_CURRENT, "contents",
                                  envir.analyzer).parse(command_dict["contents"])
    myhighlighter = Highlighter(SimpleHTMLFormatter(), QueryScorer(query_highlight))
    myhighlighter.setTextFragmenter(SimpleFragmenter(50))
    for scoreDoc in scoreDocs:
        # find text fragments around the keyword
        doc = envir.text_searcher.doc(scoreDoc.doc)
        text = doc.get("contents")
        key_text = "".join(myhighlighter.getBestFragments(
            envir.analyzer, "contents", text, 3))
        key_text = re.sub('\s', '', key_text)
        res.append([doc.get("title"), doc.get('url'), key_text])
    return res
def rewrite(data_string):
    data = json.loads(data_string)
    toupdate = json.loads(update)
    # primary_key_modified = False
    # delete the appropriate document
    query = BooleanQuery()
    for key in primary_keys_map:
        temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(data[key])
        query.add(BooleanClause(temp, BooleanClause.Occur.MUST))
    # modify the values
    for key, value in toupdate.items():
        # if such a key is not present then we either add and update that key
        # in data, or just ignore it (add_field_if_not_exists defaults to True)
        if add_field_if_not_exists == False:
            if key in data.keys():
                data[key] = value
        else:
            data[key] = value
    # the deletion statement has been intentionally placed here:
    # the update proceeds only if the modified data's primary keys do not
    # already exist in the index
    primary_key_update = False
    for key in toupdate.keys():
        if key in primary_keys_map:
            primary_key_update = True
            break
    if primary_key_update == True:
        query_search = BooleanQuery()
        for key in primary_keys_map:
            temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(data[key])
            query_search.add(BooleanClause(temp, BooleanClause.Occur.MUST))
        hits = searcher.search(query_search, MAX_RESULTS).scoreDocs
        if len(hits) > 0:
            return 106
    writer.deleteDocuments(query)
    # add the newly modified document
    doc = Document()
    # index fields for each primary key
    for primary_key in primary_keys_map:
        try:
            field = Field(primary_key, data[primary_key],
                          Field.Store.NO, Field.Index.ANALYZED)
            doc.add(field)
        except:
            # primary_keys_map.pop(collection_name)
            return 101
    # compress data using snappy if compression is on
    if to_be_compressed_input == True:
        temp = json.dumps(data)
        data_string = base64.b64encode(snappy.compress(temp))
    else:
        temp = json.dumps(data)
        data_string = base64.b64encode(temp)
    field = Field("$DATA$", data_string, Field.Store.YES, Field.Index.ANALYZED)
    doc.add(field)
    writer.addDocument(doc)
def testFlat(self):
    q = BooleanQuery()
    q.add(BooleanClause(self.t1, BooleanClause.Occur.SHOULD))
    q.add(BooleanClause(self.t2, BooleanClause.Occur.SHOULD))
    q.add(BooleanClause(self.c1, BooleanClause.Occur.SHOULD))
    q.add(BooleanClause(self.c2, BooleanClause.Occur.SHOULD))
    self.assertEqual(1, self.search(q))
def visitSCOPED_CLAUSE(self, node):
    clause = CqlVisitor.visitSCOPED_CLAUSE(self, node)
    if len(clause) == 1:
        return clause[0]
    lhs, operator, rhs = clause
    query = BooleanQuery()
    query.add(lhs, LHS_OCCUR[operator])
    query.add(rhs, RHS_OCCUR[operator])
    return query
def lucene_retrieval_multifield(q_string, q_class, feature_type, use_BM25=False):
    """
    multifield: a different query string for each field, not the same words
    across fields
    :param q_string:
    :param q_class:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score
        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list), feature_type)  # feature_type is a list of functions

    text_query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))
    subject_query = QueryParser(version, 'corpus_name', analyzer).parse(QueryParser.escape(q_class))
    query = BooleanQuery()
    # BooleanClause.Occur:
    # MUST implies that the keyword must occur;
    # SHOULD implies that the keyword should occur.
    query.add(text_query, BooleanClause.Occur.SHOULD)
    query.add(subject_query, BooleanClause.Occur.SHOULD)
    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)
    if use_BM25:
        # positional arguments (k1, b): JCC-wrapped Java constructors do not
        # accept Python keyword arguments. todo: BM25 parameters
        searcher.setSimilarity(BM25Similarity(1.5, 0.75))
    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hits
    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
def firstsearch(searcher, analyzer, command):
    if len(command.split()) > 1:
        return []
    querys = BooleanQuery()
    query = QueryParser(Version.LUCENE_CURRENT, "name_not_cut", analyzer).parse(command)
    querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(querys, 1000).scoreDocs
    return scoreDocs
def search_dianping(province, kind, query):
    STORE_DIR = "index"
    vm_env.attachCurrentThread()
    # base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    allowed_opt = ['food', 'foodshop']
    if kind not in allowed_opt:
        return None
    if query == '':
        return None
    command = '%s:%s province:%s' % (kind, query, province)
    command = unicode(command, 'utf8', 'ignore')
    command_dict = parseCommand(command)
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(querys, 50).scoreDocs
    # compare ratings to find the best shop
    max_rank = 0
    best_shop = ''
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        cur_shop = doc.get("foodshop").split()[-1]
        cur_rank = float(doc.get('rank'))
        if cur_rank > max_rank:
            max_rank = cur_rank
            best_shop = cur_shop
    result = {}
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        cur_shop = doc.get("foodshop").split()[-1]
        cur_rank = float(doc.get('rank'))
        if cur_rank == max_rank:
            result['name'] = cur_shop.encode('utf8', 'ignore')
            result['rank'] = doc.get('rank').encode('utf8', 'ignore')
            result['food'] = doc.get('food').encode('utf8', 'ignore')
            result['location'] = doc.get('location').encode('utf8', 'ignore')
            result['tel'] = doc.get('tel').encode('utf8', 'ignore')
            result['environment_score'] = doc.get('environment_score').encode('utf8', 'ignore')
            result['flavour_score'] = doc.get('flavour_score').encode('utf8', 'ignore')
            result['service_score'] = doc.get('service_score').encode('utf8', 'ignore')
            result['price_level'] = doc.get('price_level').encode('utf8', 'ignore')
    del searcher
    return result
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        # command = unicode(command, 'GBK')
        command = unicode(command, 'utf8')
        if command == '':
            return
        print
        print 'searching for : ' + command
        command_dict = parseCommand(command)
        querys = BooleanQuery()
        for k, v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(querys, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)
        # compare ratings to find the best shop
        max_rank = 0
        best_shop = ''
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            cur_shop = doc.get("foodshop").split()[-1]
            cur_rank = float(doc.get('rank'))
            if cur_rank > max_rank:
                max_rank = cur_rank
                best_shop = cur_shop
        result = {}
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            cur_shop = doc.get("foodshop").split()[-1]
            cur_rank = float(doc.get('rank'))
            if cur_rank == max_rank:
                result['name'] = cur_shop.encode('utf8', 'ignore')
                result['rank'] = doc.get('rank').encode('utf8', 'ignore')
                result['food'] = doc.get('food').encode('utf8', 'ignore')
                result['location'] = doc.get('location').encode('utf8', 'ignore')
                result['tel'] = doc.get('tel').encode('utf8', 'ignore')
                result['environment_score'] = doc.get('environment_score').encode('utf8', 'ignore')
                result['flavour_score'] = doc.get('flavour_score').encode('utf8', 'ignore')
                result['service_score'] = doc.get('service_score').encode('utf8', 'ignore')
                result['price_level'] = doc.get('price_level').encode('utf8', 'ignore')
        print result
def testCollectScoresWithNoResultAndBooleanQueryDoesntFailOnFakeScorerInAggregateScoreCollector(self):
    q = BooleanQuery()
    q.add(luceneQueryFromCql('M=true'), BooleanClause.Occur.SHOULD)
    q.add(luceneQueryFromCql('M=true'), BooleanClause.Occur.SHOULD)
    q = ComposedQuery('coreA', query=q)
    q.start = 0
    q.stop = 0
    q.setRankQuery(core='coreC', query=luceneQueryFromCql('S=true'))
    q.addMatch(dict(core='coreA', uniqueKey=KEY_PREFIX + 'A'),
               dict(core='coreC', key=KEY_PREFIX + 'C'))
    result = returnValueFromGenerator(self.dna.any.executeComposedQuery(q))
    self.assertEquals(4, result.total)
    self.assertEquals([], result.hits)
def do_query(property, qstring, limit=10):
    query = BooleanQuery()
    stream = analyzer.tokenStream(property, StringReader(qstring))
    stream.reset()
    attr = stream.getAttribute(CharTermAttribute)
    while stream.incrementToken():
        term = attr.toString()
        termQuery = TermQuery(Term(property, term))
        query.add(termQuery, Occur.SHOULD)
    hits = searcher.search(query, None, limit).scoreDocs
    return [Document(searcher.doc(hit.doc)) for hit in hits]
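# --- Usage sketch (added illustration, not from the original source) ---
# Tokenizing a free-text string against the 'title' field and printing up to
# five hits; relies on the module-level `analyzer` and `searcher` that
# do_query itself assumes are in scope.
for hit_doc in do_query('title', 'distributed search engines', limit=5):
    print hit_doc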
def testOutOfOrderDocsScoringSort(self):
    """
    Two Sort criteria to instantiate the multi/single comparators.
    """
    sorts = [Sort(SortField.FIELD_DOC), Sort()]
    tfcOptions = [[False, False, False], [False, False, True],
                  [False, True, False], [False, True, True],
                  [True, False, False], [True, False, True],
                  [True, True, False], [True, True, True]]
    actualTFCClasses = [
        "OutOfOrderOneComparatorNonScoringCollector",
        "OutOfOrderOneComparatorScoringMaxScoreCollector",
        "OutOfOrderOneComparatorScoringNoMaxScoreCollector",
        "OutOfOrderOneComparatorScoringMaxScoreCollector",
        "OutOfOrderOneComparatorNonScoringCollector",
        "OutOfOrderOneComparatorScoringMaxScoreCollector",
        "OutOfOrderOneComparatorScoringNoMaxScoreCollector",
        "OutOfOrderOneComparatorScoringMaxScoreCollector"
    ]
    bq = BooleanQuery()
    # Add a Query with SHOULD, since bw.scorer() returns BooleanScorer2
    # which delegates to BS if there are no mandatory clauses.
    bq.add(MatchAllDocsQuery(), BooleanClause.Occur.SHOULD)
    # Set minNrShouldMatch to 1 so that BQ will not optimize rewrite to
    # return the clause instead of BQ.
    bq.setMinimumNumberShouldMatch(1)
    for sort in sorts:
        for tfcOption, actualTFCClass in izip(tfcOptions, actualTFCClasses):
            tdc = TopFieldCollector.create(sort, 10, tfcOption[0],
                                           tfcOption[1], tfcOption[2], False)
            self.assert_(tdc.getClass().getName().endswith("$" + actualTFCClass))
            self.full.search(bq, tdc)
            tds = tdc.topDocs()
            sds = tds.scoreDocs
            self.assertEqual(10, len(sds))
def testUnqualifiedTermFields(self):
    composer = LuceneQueryComposer(
        unqualifiedTermFields=[("field0", 0.2), ("field1", 2.0)],
        luceneSettings=LuceneSettings())
    ast = parseCql("value")
    result = composer.compose(ast)
    query = BooleanQuery()
    left = TermQuery(Term("field0", "value"))
    left.setBoost(0.2)
    query.add(left, BooleanClause.Occur.SHOULD)
    right = TermQuery(Term("field1", "value"))
    right.setBoost(2.0)
    query.add(right, BooleanClause.Occur.SHOULD)
    self.assertEquals(type(query), type(result))
    self.assertEquals(repr(query), repr(result))
def func2(name):
    vm_env.attachCurrentThread()
    lists = []
    query = BooleanQuery()
    item = QueryParser(Version.LUCENE_CURRENT, "name", analyzer).parse(name)
    query.add(item, BooleanClause.Occur.MUST)
    scoreDocs = searcher2.search(query, 20).scoreDocs
    for scoreDoc in scoreDocs:
        row = []  # renamed from `list` to avoid shadowing the builtin
        doc = searcher2.doc(scoreDoc.doc)
        row.append(doc.get("picture"))
        row.append(doc.get("url"))
        row.append(doc.get("name"))
        lists.append(row)
    return lists
def delete(primary_keys_map, collection_name, todelete, commit=False):
    INDEX_DIR_DEFAULT = "IndexFiles.index"
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT
    try:
        tofind_keyvalue_pairs = json.loads(todelete)
    except:
        return 100
    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    # setting writer configuration
    try:
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        writer = IndexWriter(direc, config)
        ireader = IndexReader.open(direc)
    except:
        return 105
    # as of now, deletion of documents is supported only on indexed keys
    tofind_primary_keyvalue_pairs = {}
    tofind_nonprimary_keyvalue_pairs = {}
    # separating out primary and non-primary keys
    for key in tofind_keyvalue_pairs.keys():
        if key in primary_keys_map:
            tofind_primary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]
        else:
            tofind_nonprimary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]
    # filtering documents according to primary keys
    query = BooleanQuery()
    for key in tofind_primary_keyvalue_pairs.keys():
        temp = QueryParser(Version.LUCENE_CURRENT, key,
                           analyzer).parse(tofind_primary_keyvalue_pairs[key])
        query.add(BooleanClause(temp, BooleanClause.Occur.MUST))
    a = writer.deleteDocuments(query)
    if commit == True:
        writer.commit()
    writer.close()
    return 0
def search_trip(command):
    '''command must be encoded in unicode'''
    STORE_DIR = "index_trip"
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File(STORE_DIR))
    folders = {
        'parsed_ctrip': ['source', 'location', 'introduction', 'score', 'img_list'],
        'parsed_qunar': ['location', 'rank', 'score', 'time', 'introduction', 'img_list'],
        'eic_mfw': ['location', 'introduction', 'img_list']
    }
    readers = constructReaders(folders)
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    command_dict = parseCommand(command)
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(querys, 50).scoreDocs
    print 'total: %s' % (len(scoreDocs))
    maxf = []
    maxrank = -1000.0
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        filename = doc.get('filename')
        rank = ranker(command_dict, getInfo(folders, readers, filename))
        if rank > maxrank:
            maxf = [filename]
            maxrank = rank
        elif rank == maxrank:
            maxf.append(filename)
    del searcher
    if len(maxf) == 0:
        print "error in searchtrip.py: no result while searching", command_dict.get('location', '')
        return "Interior Error"
    elif len(maxf) != 1:
        print "warning in searchtrip.py: multiple results when searching", command_dict.get('location', '')
    return getInfo(folders, readers, maxf[0])
def visitSEARCH_CLAUSE(self, node):
    # possible children:
    # CQL_QUERY
    # SEARCH_TERM
    # INDEX, RELATION, SEARCH_TERM
    firstChild = node.children[0].name
    results = CqlVisitor.visitSEARCH_CLAUSE(self, node)
    if firstChild == 'SEARCH_TERM':
        (unqualifiedRhs,) = results
        if unqualifiedRhs == '*':
            return MatchAllDocsQuery()
        subQueries = []
        for fieldname, boost in self._unqualifiedTermFields:
            subQuery = self._termOrPhraseQuery(fieldname, unqualifiedRhs)
            if isinstance(subQuery, PhraseQuery) and \
                    not self._fieldRegistry.phraseQueryPossible(fieldname):
                continue
            subQuery.setBoost(boost)
            subQueries.append(subQuery)
        if len(subQueries) == 1:
            query = subQueries[0]
        else:
            query = BooleanQuery()
            for subQuery in subQueries:
                query.add(subQuery, BooleanClause.Occur.SHOULD)
        return query
    elif firstChild == 'INDEX':
        (left, (relation, boost), right) = results
        if relation in ['==', 'exact'] or \
                (relation == '=' and self._fieldRegistry.isUntokenized(left)):
            query = TermQuery(self._createTerm(left, right))
        elif relation == '=':
            query = self._termOrPhraseQuery(left, right)
        elif relation in ['<', '<=', '>=', '>']:
            query = self._termRangeQuery(left, relation, right)
        else:
            raise UnsupportedCQL("'%s' not supported for the field '%s'" % (relation, left))
        query.setBoost(boost)
        return query
    else:
        ((query,),) = results
        return query
def testParenthesisMust2(self):
    q3 = BooleanQuery()
    q3.add(BooleanClause(self.t1, BooleanClause.Occur.SHOULD))
    q3.add(BooleanClause(self.t2, BooleanClause.Occur.SHOULD))
    q4 = BooleanQuery()
    q4.add(BooleanClause(self.c1, BooleanClause.Occur.SHOULD))
    q4.add(BooleanClause(self.c2, BooleanClause.Occur.SHOULD))
    q2 = BooleanQuery()
    q2.add(q3, BooleanClause.Occur.SHOULD)
    q2.add(q4, BooleanClause.Occur.MUST)
    self.assertEqual(1, self.search(q2))
def run(searcher, analyzer, command):
    commandsplit = command.split()
    maxlen = len(commandsplit[0])
    maxindex = 0
    for i in range(len(commandsplit)):
        if maxlen < len(commandsplit[i]):
            maxlen = len(commandsplit[i])
            maxindex = i
    commands = " ".join(jieba.cut(command.split()[maxindex])).split()
    querys = BooleanQuery()
    for i in commands:
        try:
            query = QueryParser(Version.LUCENE_CURRENT, "name", analyzer).parse(i)
            querys.add(query, BooleanClause.Occur.MUST)
        except:
            continue
    scoreDocs = searcher.search(querys, 50).scoreDocs
    if len(scoreDocs) == 0:
        querys = BooleanQuery()
        for i in commands:
            for j in i:
                try:
                    query = QueryParser(Version.LUCENE_CURRENT, "not_seg", analyzer).parse(j)
                    querys.add(query, BooleanClause.Occur.MUST)
                except:
                    continue
        scoreDocs = searcher.search(querys, 50).scoreDocs
    temp = []
    if len(scoreDocs) > 0:
        doc = searcher.doc(scoreDocs[0].doc)
        temp = [doc.get("org"), doc.get("path"), doc.get("price"), doc.get("imgsrc")]
    else:
        temp = ['unknown'] * 4
    return temp
def lucene_sample_query_parse(sampleq, ftypes):
    fields = []
    queries = []
    booleans = []
    bq = BooleanQuery()
    for query_tuple in sampleq:
        (field, op_, value) = re.split(snapconf.RANGE_QUERY_OPS, query_tuple)
        m = snapconf.RANGE_QUERY_FIELD_PATTERN.search(query_tuple)
        if m is None or field is None:
            continue
        op = m.group(1)
        if op not in snapconf.operators:
            sys.stderr.write("bad operator %s in range query, exiting\n" % (str(op)))
            sys.exit(-1)
        field_w_type = snapconf.SAMPLE_HEADER_FIELDS_TYPE_MAP[field]
        (fieldtypechar, ftype_method) = ftypes[field_w_type]
        # range query
        if fieldtypechar == 'i' or fieldtypechar == 'f':
            bq.add(lucene_range_query_parse(field_w_type, op, value,
                                            fieldtypechar, ftype_method),
                   BOOLEAN_OCCUR)
        # phrase query
        elif ' ' in value or '\t' in value:
            pquery = PhraseQuery()
            for v in re.split(r'\s+', value):
                pquery.add(Term(field_w_type, v.lower()))
            # force exact phrase matching only
            pquery.setSlop(0)
            bq.add(pquery, BOOLEAN_OCCUR)
        # term query
        else:
            bq.add(TermQuery(Term(field_w_type, value.lower())), BOOLEAN_OCCUR)
        sys.stderr.write("value + fields: %s %s\n" % (value.lower(), field_w_type))
    return bq
def run(searcher, analyzer, command):
    command_dict = parseCommand(command)
    seg_list = jieba.cut(command_dict['contents'])
    command_dict['contents'] = " ".join(seg_list)
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(querys, 50).scoreDocs
    print "%s total matching documents." % len(scoreDocs)
    # Score highlights against the combined boolean query; the original passed
    # the leftover loop variable `query`, which only covers the last field.
    scorer = QueryScorer(querys)
    fragmenter = SimpleSpanFragmenter(scorer, 250)
    simpleHTMLFormatter = SimpleHTMLFormatter("<b>", "</b>")
    highlighter = Highlighter(simpleHTMLFormatter, scorer)
    highlighter.setTextFragmenter(fragmenter)
    results = []
    for i, scoreDoc in enumerate(scoreDocs):
        doc = searcher.doc(scoreDoc.doc)
        contents = doc.get("contents")
        if contents:
            tkStream = analyzer.tokenStream("contents", contents)
            highlight = highlighter.getBestFragment(tkStream, contents)
            highlight = ''.join(highlight.split())
            results.append((doc.get("title").strip(), doc.get("url"), highlight))
        # print 'path:', doc.get("path"), \
        #     '\nname:', doc.get("name"), \
        #     '\ntitle:', doc.get("title"), \
        #     'url:', doc.get("url"), \
        #     '\nsite:', doc.get("site"), \
        #     '\ncontent:', highlight, "\n"
        # print 'explain:', searcher.explain(query, scoreDoc.doc)
    return results
def _create_query(self, fields):
    """
    Build query with Term, Phrase and Fuzzy clauses.
    :param fields: dictionary of (field, text) tuples
    :return: query
    """
    query = BooleanQuery()
    for (field, text) in fields:
        if field.startswith("year"):
            start, end = text.split(",")
            numeric_query = NumericRangeQuery.newIntRange(
                'year', int(start), int(end), True, True)
            query.add(BooleanClause(numeric_query, BooleanClause.Occur.MUST))
        if field == 'title':
            spans = []
            for word in text.lower().split():
                spans.append(SpanTermQuery(Term(field, word)))
            query.add(BooleanClause(SpanNearQuery(spans, 2, True),
                                    BooleanClause.Occur.SHOULD))
    field_names, field_texts = zip(*fields)
    flags = [BooleanClause.Occur.MUST] * len(field_names)
    query_parser_query = MultiFieldQueryParser.parse(
        Version.LUCENE_CURRENT, field_texts, field_names, flags,
        StandardAnalyzer(Version.LUCENE_CURRENT))
    query.add(BooleanClause(query_parser_query, BooleanClause.Occur.MUST))
    fuzzify = lambda s: (s + " ").replace(" ", "~1 ")
    fuzzy_field_texts = map(fuzzify, field_texts)
    fuzzy_query_parser_query = MultiFieldQueryParser.parse(
        Version.LUCENE_CURRENT, fuzzy_field_texts, field_names, flags,
        StandardAnalyzer(Version.LUCENE_CURRENT))
    query.add(BooleanClause(fuzzy_query_parser_query, BooleanClause.Occur.MUST))
    boostQuery = FunctionQuery(
        LinearFloatFunction(
            PowFloatFunction(
                DoubleConstValueSource(0.0001),
                ScaleFloatFunction(IntFieldSource("imdb_votes_boost"), 0.0, 1.0)),
            -1.0, 1.0))
    query = CustomScoreQuery(query, boostQuery)
    return query
def perform_search(self, searchterm, results_per_page, page):
    # if there is a field in the searchterm
    """if ":" in searchterm:
        # processing a query
        parser = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer)
        parser.setDefaultOperator(QueryParser.Operator.AND)
        query = parser.parse(searchterm)
    else:
        query = BooleanQuery()
        query_title = TermQuery(Term("title", searchterm))
        query_description = TermQuery(Term("description", searchterm))
        query_content = TermQuery(Term("content", searchterm))
        # BooleanClause.Occur.MUST for AND queries
        query.add(query_title, BooleanClause.Occur.SHOULD)
        query.add(query_description, BooleanClause.Occur.SHOULD)
        query.add(query_content, BooleanClause.Occur.SHOULD)"""
    # create a QueryParser for each field to be searched
    parser_title = QueryParser(Version.LUCENE_CURRENT, "title", self.analyzer)
    parser_description = QueryParser(Version.LUCENE_CURRENT, "description", self.analyzer)
    parser_content = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer)
    # put the fields together
    query = BooleanQuery()
    query.add(parser_title.parse(searchterm), BooleanClause.Occur.SHOULD)
    query.add(parser_description.parse(searchterm), BooleanClause.Occur.SHOULD)
    query.add(parser_content.parse(searchterm), BooleanClause.Occur.SHOULD)
    # conducting search
    searcher = IndexSearcher(DirectoryReader.open(self.store))
    start = datetime.now()
    hits = searcher.search(query, results_per_page + (results_per_page * page))
    score_docs = hits.scoreDocs
    count_results = hits.totalHits
    duration = datetime.now() - start
    # results to return
    results = []
    count = 0
    for scoreDoc in score_docs:
        # skip offset
        if count < results_per_page * page:
            count += 1
            continue
        count += 1
        doc = searcher.doc(scoreDoc.doc)
        table = dict((field.name(), field.stringValue()) for field in doc.getFields())
        results.append(table)
    return results, duration, count_results
def lucene_range_query_parse(query_string):
    '''parse the user's range query string into something PyLucene can understand'''
    query = BooleanQuery()
    queries_ = query_string.split(snapconf.RANGE_QUERY_DELIMITER)
    start = None
    end = None
    start_inclusive = True
    end_inclusive = True
    for query_tuple in queries_:
        m = snapconf.RANGE_QUERY_FIELD_PATTERN.search(query_tuple)
        (col, op_, val) = re.split(snapconf.RANGE_QUERY_OPS, query_tuple)
        if not m or not col or col not in snapconf.TABIX_DBS or col not in snapconf.LUCENE_TYPES:
            continue
        op = m.group(1)
        if op not in snapconf.operators:
            sys.stderr.write("bad operator %s in range query, exiting\n" % (str(op)))
            sys.exit(-1)
        (ltype, ptype, qtype) = snapconf.LUCENE_TYPES[col]
        rquery = None
        if ptype == str:
            rquery = TermQuery(qtype(col, str(val)))
        else:
            # assume operator == '='
            (start, end) = (ptype(val), ptype(val))
            if op == '>=':
                end = None
            if op == '<=':
                start = None
            if op == '<':
                start = None
                end_inclusive = False
            if op == '>':
                end = None
                start_inclusive = False
            rquery = qtype(col, start, end, start_inclusive, end_inclusive)
        query.add(rquery, BooleanClause.Occur.MUST)
        # sys.stderr.write("query + fields: %s %s\n" % (query, field))
    return query
def extract_phrase_query(self, q, field, slop=0, boost=5):
    phrases = re.findall(r'"([^"]*)"', q)
    if len(phrases) == 0:
        return None, q
    q = re.sub(r'"([^"]*)"', "", q).strip()  # query without phrases
    if self.verbose:
        print "Detected phrases: ", phrases
    bq = BooleanQuery()
    for phrase in phrases:
        # pq = PhraseQuery()
        # for term in filter(None, phrase.split(' ')):
        #     pq.add(Term(field, term))
        qparser = QueryParser(Version.LUCENE_CURRENT, field, self.analyzer)
        # parse phrase - this may or may not be desired
        # pq = qparser.parse(field + ':"' + phrase + '"')
        pq = qparser.parse('%s "%s"~%d^%.1f' % (phrase, phrase, slop, boost))
        # phrase queries have high priority
        bq.add(pq, BooleanClause.Occur.MUST)
        # bq.add(pq, BooleanClause.Occur.SHOULD)
    return bq, q
def testBraces(self):
    self.assertConversion(TermQuery(Term('unqualified', 'cats')), '(cats)')
    innerQuery = BooleanQuery()
    innerQuery.add(TermQuery(Term('unqualified', 'cats')), BooleanClause.Occur.MUST)
    innerQuery.add(TermQuery(Term('unqualified', 'dogs')), BooleanClause.Occur.MUST)
    outerQuery = BooleanQuery()
    outerQuery.add(innerQuery, BooleanClause.Occur.SHOULD)
    outerQuery.add(TermQuery(Term('unqualified', 'mice')), BooleanClause.Occur.SHOULD)
    self.assertConversion(outerQuery, '(cats AND dogs) OR mice')
def _luceneQuery(self, prefix, sets=None, setsMask=None, partition=None):
    query = BooleanQuery()
    if prefix:
        query.add(TermQuery(Term(PREFIX_FIELD, prefix)), BooleanClause.Occur.MUST)
    if sets:
        setQuery = BooleanQuery()
        for setSpec in sets:
            setQuery.add(TermQuery(Term(SETS_FIELD, setSpec)), BooleanClause.Occur.SHOULD)
        query.add(setQuery, BooleanClause.Occur.MUST)
    for set_ in setsMask or []:
        query.add(TermQuery(Term(SETS_FIELD, set_)), BooleanClause.Occur.MUST)
    if partition:
        partitionQueries = []
        for start, stop in partition.ranges():
            partitionQueries.append(
                NumericRangeQuery.newIntRange(HASH_FIELD, start, stop, True, False))
        if len(partitionQueries) == 1:
            pQuery = partitionQueries[0]
        else:
            pQuery = BooleanQuery()
            for q in partitionQueries:
                pQuery.add(q, BooleanClause.Occur.SHOULD)
        query.add(pQuery, BooleanClause.Occur.MUST)
    if query.clauses().size() == 0:
        query.add(MatchAllDocsQuery(), BooleanClause.Occur.MUST)
    return query
def testBooleanOrTermOutput(self):
    query = BooleanQuery()
    query.add(TermQuery(Term('unqualified', 'cats')), BooleanClause.Occur.SHOULD)
    query.add(TermQuery(Term('unqualified', 'dogs')), BooleanClause.Occur.SHOULD)
    self.assertConversion(query, 'cats OR dogs')
def testEquality(self):
    bq1 = BooleanQuery()
    bq1.add(TermQuery(Term("field", "value1")), BooleanClause.Occur.SHOULD)
    bq1.add(TermQuery(Term("field", "value2")), BooleanClause.Occur.SHOULD)
    nested1 = BooleanQuery()
    nested1.add(TermQuery(Term("field", "nestedvalue1")), BooleanClause.Occur.SHOULD)
    nested1.add(TermQuery(Term("field", "nestedvalue2")), BooleanClause.Occur.SHOULD)
    bq1.add(nested1, BooleanClause.Occur.SHOULD)
    bq2 = BooleanQuery()
    bq2.add(TermQuery(Term("field", "value1")), BooleanClause.Occur.SHOULD)
    bq2.add(TermQuery(Term("field", "value2")), BooleanClause.Occur.SHOULD)
    nested2 = BooleanQuery()
    nested2.add(TermQuery(Term("field", "nestedvalue1")), BooleanClause.Occur.SHOULD)
    nested2.add(TermQuery(Term("field", "nestedvalue2")), BooleanClause.Occur.SHOULD)
    bq2.add(nested2, BooleanClause.Occur.SHOULD)
    self.assert_(bq1.equals(bq2))
def testBooleanNotTermOutput(self):
    query = BooleanQuery()
    query.add(TermQuery(Term('unqualified', 'cats')), BooleanClause.Occur.MUST)
    query.add(TermQuery(Term('unqualified', 'dogs')), BooleanClause.Occur.MUST_NOT)
    self.assertConversion(query, 'cats NOT dogs')
def addDuplicatesQuery(self, query):
    not_duplicate = TermQuery(Term('duplicate', 'false'))
    booleanQuery = BooleanQuery()
    booleanQuery.add(not_duplicate, BooleanClause.Occur.MUST)
    booleanQuery.add(query, BooleanClause.Occur.MUST)
    return booleanQuery
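# --- Usage sketch (added illustration, not from the original source) ---
# Wrapping an arbitrary query so documents flagged duplicate='true' are
# excluded. `indexer` (an instance exposing addDuplicatesQuery) and `searcher`
# are hypothetical names.
base_query = TermQuery(Term('title', 'lucene'))
deduped_query = indexer.addDuplicatesQuery(base_query)
hits = searcher.search(deduped_query, 10).scoreDocs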
def search(primary_keys_map, to_be_compressed_input, collection_name, tofind, MAX_RESULTS=1000):
    INDEX_DIR_DEFAULT = "IndexFiles.index"
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT
    try:
        print "********" + tofind
        tofind_keyvalue_pairs = json.loads(tofind)
    except:
        return 100
    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    try:
        ireader = IndexReader.open(direc)
        searcher = IndexSearcher(ireader)
    except:
        return 105
    # initializing return list
    return_list = []
    # check_list = []
    tofind_primary_keyvalue_pairs = {}
    tofind_nonprimary_keyvalue_pairs = {}
    # separating out primary and non-primary keys
    for key in tofind_keyvalue_pairs.keys():
        if key in primary_keys_map:
            tofind_primary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]
        else:
            tofind_nonprimary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]
    # filtering documents
    if len(tofind_primary_keyvalue_pairs) > 0:
        query = BooleanQuery()
        for key in tofind_primary_keyvalue_pairs.keys():
            temp = QueryParser(Version.LUCENE_CURRENT, key,
                               analyzer).parse(tofind_primary_keyvalue_pairs[key])
            query.add(BooleanClause(temp, BooleanClause.Occur.MUST))
        hits = searcher.search(query, MAX_RESULTS).scoreDocs
        for hit in hits:
            doc = searcher.doc(hit.doc)
            if to_be_compressed_input == True:
                data = snappy.uncompress(doc.get("$DATA$"))
            else:
                data = doc.get("$DATA$")
            # non-primary-key filtering (without having to load all the
            # primary-key-filtered values into main memory)
            if len(tofind_nonprimary_keyvalue_pairs) > 0:
                entry = json.loads(data)
                satisfied = True
                for key in tofind_nonprimary_keyvalue_pairs.keys():
                    if entry.get(key) != tofind_nonprimary_keyvalue_pairs[key]:
                        satisfied = False
                        break
                if satisfied == True:
                    return_list.append(data)
            else:
                return_list.append(data)
    else:
        for i in range(0, ireader.numDocs()):
            doc = searcher.doc(i)
            if to_be_compressed_input == True:
                data = snappy.uncompress(str(doc.get("$DATA$")))
            else:
                data = doc.get("$DATA$")
            # non-primary-key filtering (without having to load all the
            # primary-key-filtered values into main memory)
            if len(tofind_nonprimary_keyvalue_pairs) > 0:
                entry = json.loads(data)
                satisfied = True
                for key in tofind_nonprimary_keyvalue_pairs.keys():
                    if entry.get(key) != tofind_nonprimary_keyvalue_pairs[key]:
                        satisfied = False
                        break
                if satisfied == True:
                    return_list.append(data)
            else:
                return_list.append(data)
    ireader.close()
    if len(return_list) == 0:
        return None
    else:
        return return_list
def get_query_results(reader, query, n, field):
    searcher = IndexSearcher(reader)
    hits = searcher.search(query, n).scoreDocs
    print("Found %d hits:" % len(hits))
    for i, hit in enumerate(hits):
        doc = searcher.doc(hit.doc)
        print("%d. %s" % (i + 1, doc.get(field)))

#### part (a)
query1a = TermQuery(Term("capital_html", "greek"))
query2a = TermQuery(Term("capital_html", "roman"))
query3a = TermQuery(Term("capital_html", "persian"))
boolean_query_a = BooleanQuery()
boolean_query_a.add(query1a, BooleanClause.Occur.MUST)
boolean_query_a.add(query2a, BooleanClause.Occur.MUST)
boolean_query_a.add(query3a, BooleanClause.Occur.MUST_NOT)
get_query_results(reader, boolean_query_a, n_docs, "capital")
# Found 32 hits:
# 1. https://en.wikipedia.org/wiki/Sukhumi
# 2. https://en.wikipedia.org/wiki/Nicosia
# 3. https://en.wikipedia.org/wiki/Nicosia
# 4. https://en.wikipedia.org/wiki/Tiraspol
# 5. https://en.wikipedia.org/wiki/Tripoli
# 6. https://en.wikipedia.org/wiki/Tunis
# 7. https://en.wikipedia.org/wiki/Lisbon
# 8. https://en.wikipedia.org/wiki/Podgorica
# 9. https://en.wikipedia.org/wiki/Cetinji
def update(primary_keys_map, to_be_compressed_input, collection_name, tofind, update,
           commit=False, add_field_if_not_exists=True):
    # As of now the update is implemented as: search, modify the data in the
    # JSON document, delete, and re-write.
    INDEX_DIR_DEFAULT = "IndexFiles.index"
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT
    try:
        tofind_keyvalue_pairs = json.loads(tofind)
    except:
        return 100
    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    try:
        ireader = IndexReader.open(direc)
        searcher = IndexSearcher(ireader)
        # setting writer configuration
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        writer = IndexWriter(direc, config)
    except:
        return 105
    no_of_documents_modified = 0

    # finding the document to update; scope for making this more efficient
    def rewrite(data_string):
        data = json.loads(data_string)
        toupdate = json.loads(update)
        # delete the appropriate document
        query = BooleanQuery()
        for key in primary_keys_map:
            temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(data[key])
            query.add(BooleanClause(temp, BooleanClause.Occur.MUST))
        # modify the values
        for key, value in toupdate.items():
            # if such a key is not present then we either add and update that
            # key in data, or just ignore it (add_field_if_not_exists defaults to True)
            if add_field_if_not_exists == False:
                if key in data.keys():
                    data[key] = value
            else:
                data[key] = value
        # the deletion statement has been intentionally placed here:
        # the update proceeds only if the modified data's primary keys do not
        # already exist in the index
        query_search = BooleanQuery()
        for key in primary_keys_map:
            temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(data[key])
            query_search.add(BooleanClause(temp, BooleanClause.Occur.MUST))
        hits = searcher.search(query_search, MAX_RESULTS).scoreDocs
        if len(hits) > 0:
            return 106
        writer.deleteDocuments(query)
        # add the newly modified document
        doc = Document()
        # index fields for each primary key
        for primary_key in primary_keys_map:
            try:
                field = Field(primary_key, data[primary_key],
                              Field.Store.NO, Field.Index.ANALYZED)
                doc.add(field)
            except:
                return 101
        # compress data using snappy if compression is on
        if to_be_compressed_input == True:
            data_string = snappy.compress(str(json.dumps(data)))
        else:
            data_string = json.dumps(data)
        field = Field("$DATA$", data_string, Field.Store.YES, Field.Index.ANALYZED)
        doc.add(field)
        writer.addDocument(doc)

    tofind_primary_keyvalue_pairs = {}
    tofind_nonprimary_keyvalue_pairs = {}
    # separating out primary and non-primary keys
    for key in tofind_keyvalue_pairs.keys():
        if key in primary_keys_map:
            tofind_primary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]
        else:
            tofind_nonprimary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]
    # filtering documents
    if len(tofind_primary_keyvalue_pairs) > 0:
        query = BooleanQuery()
        for key in tofind_primary_keyvalue_pairs.keys():
            temp = QueryParser(Version.LUCENE_CURRENT, key,
                               analyzer).parse(tofind_primary_keyvalue_pairs[key])
            query.add(BooleanClause(temp, BooleanClause.Occur.MUST))
        hits = searcher.search(query, MAX_RESULTS).scoreDocs
        for hit in hits:
            doc = searcher.doc(hit.doc)
            if to_be_compressed_input == True:
                data = snappy.uncompress(doc.get("$DATA$"))
            else:
                data = doc.get("$DATA$")
            # non-primary-key filtering (without having to load all the
            # primary-key-filtered values into main memory)
            if len(tofind_nonprimary_keyvalue_pairs) > 0:
                entry = json.loads(data)
                satisfied = True
                for key in tofind_nonprimary_keyvalue_pairs.keys():
                    if entry.get(key) != tofind_nonprimary_keyvalue_pairs[key]:
                        satisfied = False
                        break
                if satisfied == True:
                    if rewrite(data) != 106:
                        no_of_documents_modified += 1
                    else:
                        writer.rollback()
                        return 106
            else:
                if rewrite(data) != 106:
                    no_of_documents_modified += 1
                else:
                    writer.rollback()
                    return 106
    else:
        for i in range(0, ireader.numDocs()):
            doc = searcher.doc(i)
            if to_be_compressed_input == True:
                data = snappy.uncompress(doc.get("$DATA$"))
            else:
                data = doc.get("$DATA$")
            # non-primary-key filtering (without having to load all the
            # primary-key-filtered values into main memory)
            if len(tofind_nonprimary_keyvalue_pairs) > 0:
                entry = json.loads(data)
                satisfied = True
                for key in tofind_nonprimary_keyvalue_pairs.keys():
                    if entry.get(key) != tofind_nonprimary_keyvalue_pairs[key]:
                        satisfied = False
                        break
                if satisfied == True:
                    if rewrite(data) != 106:
                        no_of_documents_modified += 1
                    else:
                        writer.rollback()
                        return 106
            else:
                if rewrite(data) != 106:
                    no_of_documents_modified += 1
                else:
                    writer.rollback()
                    return 106
    ireader.close()
    if commit == True:
        writer.commit()
    writer.close()
    return str(no_of_documents_modified) + " have been modified"
def store(primary_keys_map, to_be_compressed_input, collection_name, data, commit=False):
    INDEX_DIR_DEFAULT = "IndexFiles.index"
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT
    print "started indexing input data......"
    # extracting values
    try:
        contents = json.loads(data)
    except:
        return 100
    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    # checking for existence of a record with the same primary_key set
    try:
        ireader = IndexReader.open(direc)
        searcher = IndexSearcher(ireader)
        query = BooleanQuery()
        for key in primary_keys_map:
            temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(contents[key])
            query.add(BooleanClause(temp, BooleanClause.Occur.MUST))
        hits = searcher.search(query, MAX_RESULTS).scoreDocs
        if len(hits) > 0:
            return 106
    except:
        pass
    # setting writer configuration
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = IndexWriter(direc, config)
    # fix this later..... FieldType not defined
    # field_type = FieldType()
    # field_type.setIndexed(True)
    # field_type.setStored(False)
    # field_type.setTokenized(False)
    try:
        doc = Document()
        # index fields for each primary key
        for primary_key in primary_keys_map:
            try:
                field = Field(primary_key, contents[primary_key],
                              Field.Store.NO, Field.Index.ANALYZED)
                doc.add(field)
            except:
                # primary_keys_map.pop(collection_name)
                return 101
        # compress data using snappy if compression is on
        if to_be_compressed_input == True:
            data = snappy.compress(data)
        field = Field("$DATA$", data, Field.Store.YES, Field.Index.ANALYZED)
        doc.add(field)
        writer.addDocument(doc)
        if commit == True:
            writer.commit()
        writer.close()
        return 0
    except:
        return 102