def testEquality(self):
    """Two BooleanQueries built from identical clause sequences compare equal."""
    b1 = BooleanQuery.Builder()
    b1.add(TermQuery(Term("field", "value1")), BooleanClause.Occur.SHOULD)
    b1.add(TermQuery(Term("field", "value2")), BooleanClause.Occur.SHOULD)
    # FIX: removed a dead `bq1 = b1.build()` here -- its result was discarded
    # and rebuilt below once the nested clause had been added.
    n1 = BooleanQuery.Builder()
    n1.add(TermQuery(Term("field", "nestedvalue1")), BooleanClause.Occur.SHOULD)
    n1.add(TermQuery(Term("field", "nestedvalue2")), BooleanClause.Occur.SHOULD)
    nested1 = n1.build()
    b1.add(nested1, BooleanClause.Occur.SHOULD)
    bq1 = b1.build()

    b2 = BooleanQuery.Builder()
    b2.add(TermQuery(Term("field", "value1")), BooleanClause.Occur.SHOULD)
    b2.add(TermQuery(Term("field", "value2")), BooleanClause.Occur.SHOULD)
    n2 = BooleanQuery.Builder()
    n2.add(TermQuery(Term("field", "nestedvalue1")), BooleanClause.Occur.SHOULD)
    n2.add(TermQuery(Term("field", "nestedvalue2")), BooleanClause.Occur.SHOULD)
    nested2 = n2.build()
    b2.add(nested2, BooleanClause.Occur.SHOULD)
    bq2 = b2.build()

    # FIX: TestCase.assert_ was removed in Python 3.12; assertTrue is the
    # supported spelling.
    self.assertTrue(bq1.equals(bq2))
def search_html(searcher, analyzer):
    """Interactive search loop: read a query, run it against the index and
    print up to 10 hits. An empty input line exits.

    Args:
        searcher: Lucene IndexSearcher over the html index.
        analyzer: analyzer used to parse each field's query text.
    """
    while True:
        print("Hit enter with no input to quit.")
        command = input("Query:")
        os.system("clear")
        if command == "":
            return
        print("Searching for:", command)
        command_dict = parse_command(command)
        builder = BooleanQuery.Builder()
        for k, v in command_dict.items():
            if k == "content":
                # Chinese text must be segmented before QueryParser sees it.
                cutted = [x for x in jieba.cut_for_search(v) if x.strip()]
                v = " ".join(cutted)
                print("After segmentation:", v)
            query = QueryParser(k, analyzer).parse(v)
            builder.add(query, BooleanClause.Occur.MUST)
        querys = builder.build()
        # BUG FIX: the old code rebuilt `querys` from only the last sub-query
        # (`BooleanQuery.Builder().add(query, ...).build()`), silently
        # discarding every other field constraint.
        scoreDocs = searcher.search(querys, 10).scoreDocs
        print("{} total matching documents.".format(len(scoreDocs)))
        for num, scoreDoc in enumerate(scoreDocs):
            doc = searcher.doc(scoreDoc.doc)
            print(
                "\n#{num}:\nTitle:{title}\nURL:{url}\nSite:{site}\nPath:{path}\nFile Name:{name}\n"
                .format(num=num + 1, title=doc.get("title"),
                        url=doc.get("url"), path=doc.get("path"),
                        name=doc.get("name"), site=doc.get("site")))
def search(self, field: str):
    """Run the parsed user command against `field`, collecting matching
    sentences into self._doc / self._resultSentencesList.

    Returns self (fluent interface).
    """
    sear = self._search
    keys = self._commandInfo.getKey()
    words = self._commandInfo.getWordList()
    if len(keys) == 0 or keys[0] in ('-', '~'):
        # Plain (or '-'/'~'-prefixed) search: single word on the field.
        query = QueryParser(field, StandardAnalyzer()).parse(make_parser(words[0]))
    elif keys[0] == '#':
        # '#': both words are required.
        query1 = QueryParser(field, StandardAnalyzer()).parse(make_parser(words[0]))
        query2 = QueryParser(field, StandardAnalyzer()).parse(make_parser(words[1]))
        bc1 = BooleanClause(query1, BooleanClause.Occur.MUST)
        bc2 = BooleanClause(query2, BooleanClause.Occur.MUST)
        query = BooleanQuery.Builder().add(bc1).add(bc2).build()
    elif keys[0] in ('$', '+'):
        # '$'/'+': every listed word is required.
        bq = BooleanQuery.Builder()
        for w in words:
            sub = QueryParser(field, StandardAnalyzer()).parse(make_parser(w))
            bq.add(BooleanClause(sub, BooleanClause.Occur.MUST))
        query = bq.build()
    else:
        # NOTE(review): unknown key types fall through with query = '' and
        # sear.search('') will fail; kept as-is to preserve behavior -- confirm
        # whether this branch is reachable.
        query = ''
    hits = sear.search(query, 999999)
    for hit in hits.scoreDocs:
        doc = sear.doc(hit.doc)
        res = doc.get(field)
        doc_id = doc.get(field + '_id')
        if doc_hit(res, self._commandInfo):
            # BUG FIX: the old code tried to drop empty sentences with
            # map(lambda x: sentences.pop(x) if x == '' else 0, range(...)).
            # map() is lazy in Python 3 (never consumed) and the lambda
            # compared an int index to '' -- it never removed anything.
            sentences = [s for s in re.split('[!?!?。]', res) if s != '']
            for sentence in sentences:
                if key_filter(self._commandInfo, sentence):
                    self._doc[doc_id] = res
                    self._resultSentencesList.append((doc_id, sentence))
    return self
def testParenthesisMust2(self):
    """A SHOULD-grouped (t1 t2) next to a MUST-grouped (c1 c2) yields one hit."""
    left = BooleanQuery.Builder()
    left.add(BooleanClause(self.t1, BooleanClause.Occur.SHOULD))
    left.add(BooleanClause(self.t2, BooleanClause.Occur.SHOULD))

    right = BooleanQuery.Builder()
    right.add(BooleanClause(self.c1, BooleanClause.Occur.SHOULD))
    right.add(BooleanClause(self.c2, BooleanClause.Occur.SHOULD))

    outer = BooleanQuery.Builder()
    outer.add(left.build(), BooleanClause.Occur.SHOULD)
    outer.add(right.build(), BooleanClause.Occur.MUST)

    self.assertEqual(1, self.search(outer.build()))
def search_html(query_string, limit=10):
    """Search the html index for `query_string` and return up to `limit`
    results as dicts with title, url and a highlighted content snippet.

    Args:
        query_string: raw user query, parsed by parse_command().
        limit: maximum number of hits to return.

    Returns:
        list of dicts with keys "title", "url", "content".
    """
    command_dict = parse_command(query_string)
    vm_env.attachCurrentThread()
    builder = BooleanQuery.Builder()
    cutted_query = None
    for k, v in command_dict.items():
        if k == "content":
            # Segment Chinese content before parsing; tokens are kept for
            # snippet highlighting below.
            cutted_query = [x for x in jieba.cut_for_search(v) if x.strip()]
            v = " ".join(cutted_query)
        query = QueryParser(k, analyzer).parse(v)
        builder.add(query, BooleanClause.Occur.MUST)
    querys = builder.build()
    # BUG FIX: the old code then replaced `querys` with
    # BooleanQuery.Builder().add(query, ...).build(), i.e. only the LAST
    # field's sub-query, discarding every other constraint.
    scoreDocs = searcher["html"].search(querys, limit).scoreDocs
    result = list()
    for num, scoreDoc in enumerate(scoreDocs):
        doc = searcher["html"].doc(scoreDoc.doc)
        single_result = {
            "title": doc.get("title"),
            "url": doc.get("url"),
        }
        with open(doc.get("path"), mode="r", encoding="utf8") as file:
            content = file.read()
        # Strip markup so the snippet is plain text.
        html2text = HTML2Text()
        html2text.ignore_links = True
        html2text.ignore_images = True
        content = html2text.handle(content)
        cutted_content = jieba.cut(content)
        flag = False
        if cutted_query:
            # Build a ~20-token window starting at the first query-term hit,
            # wrapping query terms in a highlight span.
            word_num, cnt = 20, 0
            for x in cutted_content:
                if not x.strip():
                    continue
                if not flag and x in cutted_query:
                    flag = True
                    content = ""
                if flag and cnt < word_num:
                    cnt += 1
                    content += (x if x not in cutted_query else
                                "<span class='highlight'>{0}</span>".format(x))
                elif cnt >= word_num:
                    break
        single_result["content"] = content if flag else content[:100]
        result.append(single_result)
    return result
def get_most_similar(self, sentence, do_log=False):
    """Find the indexed document most similar to `sentence` under BM25.

    Returns (best ScoreDoc, mate id as int), or (None, -1) when nothing
    matches.
    """
    # OR together every non-stopword token of the sentence on field "pa".
    builder = BooleanQuery.Builder()
    for word in sentence.split(' '):
        if word in sw:
            continue
        term_query = TermQuery(Term("pa", word))
        builder.add(BooleanClause(term_query, BooleanClause.Occur.SHOULD))
    query = builder.build()

    # NOTE(review): a new reader is opened on every call and never closed --
    # looks like a resource leak; confirm intended lifecycle before changing.
    reader = DirectoryReader.open(self.w)
    self.searcher = IndexSearcher(reader)
    self.searcher.setSimilarity(BM25Similarity(Config.k1, Config.b))

    matches = self.searcher.search(query, 2).scoreDocs
    if not matches:
        return None, -1
    mate = self.searcher.doc(matches[0].doc).get("id")
    if do_log:
        print("found something. mate: ", mate, "- score : ", matches[0].score)
    return matches[0], int(mate)
def run(searcher, analyzer):
    """Run one hard-coded query against the index and print the top 50 hits."""
    print()
    print("Hit enter with no input to quit.")
    command = '环保节能社会 site:guancha.cn'
    if command == '':
        return
    print()
    print("Searching for:", command)

    command_dict = parseCommand(command)
    print(command_dict)

    # AND together one parsed sub-query per command field.
    builder = BooleanQuery.Builder()
    for field_name, text in command_dict.items():
        sub_query = QueryParser(field_name, analyzer).parse(text)
        builder.add(sub_query, BooleanClause.Occur.MUST)

    scoreDocs = searcher.search(builder.build(), 50).scoreDocs
    print("%s total matching documents." % len(scoreDocs))
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        print("------------------------")
        print('path:', doc.get("path"))
        print('name:', doc.get("name"))
        print('title:', doc.get('title'))
        print('url:', doc.get('url'))
def combine_queries(q1, q2):
    '''Combine the two given queries into a BooleanQuery with the AND operator.'''
    conjunction = BooleanQuery.Builder()
    for sub_query in (q1, q2):
        # Both sub-queries are required to match.
        conjunction.add(sub_query, BooleanClause.Occur.MUST)
    return conjunction.build()
def get_or_query(self, queries):
    """Build a disjunction (OR) BooleanQuery over the given Lucene queries."""
    builder = BooleanQuery.Builder()
    for sub_query in queries:
        builder.add(sub_query, BooleanClause.Occur.SHOULD)
    return builder.build()
def testFlat(self):
    """A flat disjunction of all four fixture clauses matches one document."""
    builder = BooleanQuery.Builder()
    for sub_query in (self.t1, self.t2, self.c1, self.c2):
        builder.add(BooleanClause(sub_query, BooleanClause.Occur.SHOULD))
    self.assertEqual(1, self.search(builder.build()))
def get_and_query(self, queries):
    """Build a conjunction (AND) BooleanQuery over the given Lucene queries.

    Args:
        queries: iterable of Lucene Query objects.

    Returns:
        A BooleanQuery requiring every input query to match (Occur.MUST).
    """
    bq_builder = BooleanQuery.Builder()
    # FIX: dropped `bq_builder.setDisableCoord(False)`. It was a no-op
    # (False is the default), it contradicted the old comment claiming coord
    # was disabled, and the method was removed from BooleanQuery.Builder in
    # Lucene 7. This also makes the method consistent with get_or_query.
    for q in queries:
        bq_builder.add(q, BooleanClause.Occur.MUST)
    return bq_builder.build()
def __init__(self, index_path=os.path.join(ROOT_DIR, 'corpus/indexRI')):
    """
    Initialize the Lucene components backing this searcher.

    :param index_path: path of the index directory
    """
    directory = SimpleFSDirectory(File(index_path).toPath())
    index_reader = DirectoryReader.open(directory)
    self.analyzer = StandardAnalyzer()
    self.index = directory
    self.reader = index_reader
    self.searcher = IndexSearcher(index_reader)
    # Accumulates clauses until the query is executed and the builder reset.
    self.constrained_query = BooleanQuery.Builder()
    self.parser = Parser()
def expandQuery(ixreader, result, nrRelevant):
    """Expand a query with MoreLikeThis queries built from the top hits.

    Args:
        ixreader: IndexReader over the collection.
        result: ScoreDocs from an initial search, best first.
        nrRelevant: how many of the top hits to treat as relevant feedback.

    Returns:
        A BooleanQuery OR-ing one MoreLikeThis query per relevant hit.
    """
    mlt = MoreLikeThis(ixreader)
    relevant = []
    # BUG FIX: the old slice result[0:nrRelevant - 1] dropped the last
    # relevant hit (slice ends are exclusive); use the first nrRelevant hits.
    for hit in result[:nrRelevant]:
        # NOTE(review): mlt.like() is handed the stored Document here;
        # MoreLikeThis.like usually expects a doc number or field+Reader --
        # confirm this overload exists in the PyLucene version in use.
        relevant.append(mlt.like(ixreader.doc(hit.doc)))
    querybuilder = BooleanQuery.Builder()
    for q in relevant:
        querybuilder.add(q, BooleanClause.Occur.SHOULD)
    return querybuilder.build()
def search(self, command_dict):
    '''
    Search the Lucene index with a conjunction built from the query dict.

    Input: `command_dict`: dict mapping field name -> preprocessed query text
    Output: score_docs of up to 50 documents satisfying every field query
    '''
    builder = BooleanQuery.Builder()
    for field_name, text in command_dict.items():
        parsed = QueryParser(field_name, self.analyzer).parse(text)
        builder.add(parsed, BooleanClause.Occur.MUST)
    return self.searcher.search(builder.build(), 50).scoreDocs
def _luceneQueryBuilder(self, prefix, sets=None, setsMask=None, partition=None):
    """Assemble a BooleanQuery.Builder from the given selection criteria.

    Falls back to MatchAllDocsQuery when no criterion contributed a clause.
    Returns the builder (not the built query).
    """
    builder = BooleanQuery.Builder()
    clauses = 0
    if prefix:
        builder.add(TermQuery(Term(PREFIX_FIELD, prefix)),
                    BooleanClause.Occur.MUST)
        clauses += 1
    if sets:
        # At least one of the requested sets must match.
        anySet = BooleanQuery.Builder()
        for setSpec in sets:
            anySet.add(TermQuery(Term(SETS_FIELD, setSpec)),
                       BooleanClause.Occur.SHOULD)
        builder.add(anySet.build(), BooleanClause.Occur.MUST)
        clauses += 1
    # Each masked set is individually required.
    for maskedSet in (setsMask or []):
        builder.add(TermQuery(Term(SETS_FIELD, maskedSet)),
                    BooleanClause.Occur.MUST)
        clauses += 1
    if partition:
        rangeQueries = [IntPoint.newRangeQuery(HASH_FIELD, start, stop - 1)
                        for start, stop in partition.ranges()]
        if len(rangeQueries) == 1:
            partitionQuery = rangeQueries[0]
        else:
            anyRange = BooleanQuery.Builder()
            for rangeQuery in rangeQueries:
                anyRange.add(rangeQuery, BooleanClause.Occur.SHOULD)
            partitionQuery = anyRange.build()
        builder.add(partitionQuery, BooleanClause.Occur.MUST)
        clauses += 1
    if clauses == 0:
        builder.add(MatchAllDocsQuery(), BooleanClause.Occur.MUST)
    return builder
def getQueryBuiler():
    """Build a BooleanQuery.Builder from the `args.search` CLI clauses.

    Each clause is "OCCUR TYPE FIELD ARGS..." where OCCUR is must/should/
    filter/must_not and TYPE is query/intrange/termrange. With no clauses,
    a MatchAllDocsQuery builder is returned.

    Raises:
        ValueError: on an unrecognized clause type or occur keyword.
    """
    boolean_query = BooleanQuery.Builder()
    if len(args.search) == 0:
        boolean_query.add(MatchAllDocsQuery(), BooleanClause.Occur.MUST)
        return boolean_query
    for clause in args.search:
        curSearch = clause.split(' ')
        if curSearch[1] == 'query':
            parser = QueryParser(curSearch[2], analyzer)
            query = parser.parse(curSearch[3])
        elif curSearch[1] == 'intrange':
            # BUG FIX: IntPoint.newRangeQuery expects int bounds; the CLI
            # arguments arrive as strings.
            query = IntPoint.newRangeQuery(curSearch[2],
                                           int(curSearch[3]),
                                           int(curSearch[4]))
        elif curSearch[1] == 'termrange':
            lowerDate = handleDate(curSearch[3], '%d/%b/%Y:%H:%M:%S')
            upperDate = handleDate(curSearch[4], '%d/%b/%Y:%H:%M:%S')
            query = TermRangeQuery.newStringRange(curSearch[2], lowerDate,
                                                  upperDate, True, True)
        else:
            # BUG FIX: an unknown type used to fall through with `query`
            # unbound (NameError) or stale from the previous iteration.
            raise ValueError('unknown search type: ' + curSearch[1])

        if curSearch[0] == 'must':
            boolean_query.add(query, BooleanClause.Occur.MUST)
        elif curSearch[0] == 'should':
            boolean_query.add(query, BooleanClause.Occur.SHOULD)
        elif curSearch[0] == 'filter':
            boolean_query.add(query, BooleanClause.Occur.FILTER)
        elif curSearch[0] == 'must_not':
            boolean_query.add(query, BooleanClause.Occur.MUST_NOT)
        else:
            # BUG FIX: previously only printed 'raise exception' and
            # silently dropped the clause.
            raise ValueError('unknown occur keyword: ' + curSearch[0])
    return boolean_query
def query_section(self, section):
    """Fetch every indexed sub-entry of this document in `section`.

    Stores the results in self._resDict keyed by entry id and returns self
    (fluent interface).
    """
    searcher = self._searcher
    # Sub-entries of this document have ids of the form "<docid>.<n>".
    doc_clause = BooleanClause(RegexpQuery(Term('id', self._id + '\\..+')),
                               BooleanClause.Occur.MUST)
    section_clause = BooleanClause(TermQuery(Term('section', section)),
                                   BooleanClause.Occur.MUST)
    combined = BooleanQuery.Builder().add(doc_clause).add(section_clause).build()
    top_docs = searcher.search(combined, 1000000)

    res_dict = {}
    stored_fields = ('document', 'section', 'author', 'dynasty', 'type',
                     'text', 'color', 'area', 'zhujie', 'detail')
    for hit in top_docs.scoreDocs:
        doc = searcher.doc(hit.doc)
        entry_id = doc.get('id')
        entry = {}
        # Copy only the fields actually present on this hit.
        for field_name in stored_fields:
            value = doc.get(field_name)
            if value:
                entry[field_name] = value
        res_dict[entry_id] = entry
        # Mirrors the original's rebinding of `section`: the summary entry
        # below uses the last hit's stored section value.
        section = doc.get('section')
    res_dict[self._id] = {'document': section}
    self._resDict = res_dict
    return self
def get_results(self, nb_results=1000):
    """
    Execute the accumulated constrained query and return matching documents.

    The internal query builder is reset afterwards so a new query can be
    composed from scratch.

    :param nb_results: maximum number of hits to fetch
    :return: list of dicts (field name -> stored value), de-duplicated
    """
    score_docs = self.searcher.search(self.constrained_query.build(),
                                      nb_results).scoreDocs
    self.constrained_query = BooleanQuery.Builder()

    hits = []
    for score_doc in score_docs:
        stored = {}
        for field in self.reader.document(score_doc.doc).getFields():
            stored[field.name()] = field.stringValue()
        hits.append(stored)
    return self.remove_duplicates(hits)
def _parse_query(self, field_name, query):
    """Analyze `query` and build an OR of per-token PhraseQueries on
    `field_name`."""
    # Run the analyzer over the raw query text to obtain its tokens.
    stream = self.analyzer.tokenStream("dummy", StringReader(query))
    term_attr = stream.getAttribute(CharTermAttribute.class_)
    stream.reset()
    tokens = []
    while stream.incrementToken():
        tokens.append(term_attr.toString())
    stream.end()
    stream.close()

    bool_builder = BooleanQuery.Builder()
    for token in tokens:
        # A token containing spaces becomes a positional phrase query.
        phrase = PhraseQuery.Builder()
        for position, word in enumerate(token.split(' ')):
            phrase.add(Term(field_name, word), position)
        bool_builder.add(phrase.build(), BooleanClause.Occur.SHOULD)
    return bool_builder.build()
def pairSearch(self, pair, sim):
    """Search the content_section field for a (title, content) pair.

    The title acts as a non-scoring FILTER clause; the content contributes
    the score. Returns the top 6 scoreDocs under similarity `sim`.
    """
    section_title = pair[0].replace('_', ' ')
    section_content = pair[1]

    parser = QueryParser("content_section", self.analyzer)
    title_query = parser.parse(QueryParser.escape(section_title))
    content_query = parser.parse(QueryParser.escape(section_content))

    combined = BooleanQuery.Builder()
    combined.add(title_query, BooleanClause.Occur.FILTER)
    combined.add(content_query, BooleanClause.Occur.SHOULD)

    self.searcher.setSimilarity(sim)
    return self.searcher.search(combined.build(), 6).scoreDocs
def searchAncient(self, field):
    """Search `field` for self._searchWord, constrained by every extra field
    in self._fields (except 'section'/'document', which are applied later as
    upper-title filters). Matching sentences are appended to
    self._resultSentencesList; returns self (fluent interface).
    """
    indexDir = SimpleFSDirectory(Paths.get(self._dir))
    sear = IndexSearcher(DirectoryReader.open(indexDir))
    bq = BooleanQuery.Builder()
    # Main clause: the search word on the requested field.
    q = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._searchWord))
    bc = BooleanClause(q, BooleanClause.Occur.MUST)
    bq.add(bc)
    search_fields = self._fields
    for i in search_fields:
        # section/document are checked via search_upper_title_filter below,
        # not as index clauses.
        if i == 'section' or i == 'document':
            continue
        queryx = QueryParser(i, KeywordAnalyzer()).parse(make_ancient_parser(search_fields[i]))
        bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
        bq.add(bc)
    query = bq.build()
    hits = sear.search(query, 9999)
    for hit in hits.scoreDocs:
        doc = sear.doc(hit.doc)
        res = doc.get(field)
        id = doc.get('id')
        detail = get_detail(doc)
        zhujie = detail['zhujie']
        # Flatten a nested 'detail' dict one level before serializing.
        if detail['detail'] and 'detail' in detail['detail'].keys():
            detail['detail'] = detail['detail']['detail']
        detail.pop('zhujie')
        detail.pop('text')
        detail.pop('type')
        detail = json.dumps(detail)
        self._doc[id] = res
        if doc_hit(res, self._words):
            f = key_filter(self._words, self._re, res)
            if f:
                # Apply the deferred section/document hierarchy filters.
                if 'section' in search_fields.keys():
                    if not search_upper_title_filter(id, sear, search_fields['section'], 0):
                        continue
                if 'document' in search_fields.keys():
                    if not search_upper_title_filter(id, sear, search_fields['document'], 1):
                        continue
                self._match.append(f)
                self._resultSentencesList.append((id, res, detail, zhujie))
    # Debug output (res is the last processed hit's text).
    print(res)
    print(self._match)
    return self
def get_user_query(positive_rated_movies):
    """Build an OR query over the user's profile: tags, genres and
    descriptions extracted from the movies the user rated positively.
    Empty profile parts are skipped."""
    tags, genres, descriptions = get_user_profile(positive_rated_movies)
    builder = BooleanQuery.Builder()
    profile_parts = (
        (tags, tags_parser),
        (genres, genres_parser),
        (descriptions, descr_parser),
    )
    for text, parser in profile_parts:
        if text != '':
            parsed = parser.parse(parser.escape(text))
            builder.add(parsed, BooleanClause.Occur.SHOULD)
    return builder.build()
def multiFieldsPairSearch(self, pair, sim):
    """Search content_section and title_article for a (title, content) pair.

    The title acts as a non-scoring FILTER clause; the content contributes
    the score. Returns the top 6 scoreDocs under similarity `sim`.
    """
    section_title = pair[0].replace('_', ' ')
    section_content = pair[1]

    parser = MultiFieldQueryParser(["content_section", "title_article"],
                                   self.analyzer)
    parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
    title_query = MultiFieldQueryParser.parse(parser,
                                              QueryParser.escape(section_title))
    content_query = MultiFieldQueryParser.parse(parser,
                                                QueryParser.escape(section_content))

    combined = BooleanQuery.Builder()
    combined.add(title_query, BooleanClause.Occur.FILTER)
    combined.add(content_query, BooleanClause.Occur.SHOULD)

    self.searcher.setSimilarity(sim)
    return self.searcher.search(combined.build(), 6).scoreDocs
def find_results(query, reader):
    """
    For the given `query`, search both the 'abstract' and 'content' fields
    (abstract and content boosted separately; content is required) and
    return Result objects with highlighted, non-empty fragments.
    """
    searcher = IndexSearcher(reader)
    content_query = QueryParser('content', Analyzer()).parse(query)
    highlighter = build_highlighter(content_query)
    abstract_query = QueryParser('abstract', Analyzer()).parse(query)
    abstract_query = BoostQuery(abstract_query, ABSTRACT_BOOST)  # boost the abstract
    content_query = BoostQuery(content_query, CONTENT_BOOST)

    # Query on both the abstract and the content field.
    query_builder = BooleanQuery.Builder()
    query_builder.add(abstract_query, BooleanClause.Occur.SHOULD)
    query_builder.add(content_query, BooleanClause.Occur.MUST)
    query = query_builder.build()

    hits = searcher.search(query, MAX_N_DOCS).scoreDocs
    results = []
    for hit in hits:
        doc = searcher.doc(hit.doc)
        content = doc.get('content')
        stream = TokenSources.getTokenStream('content', content, Analyzer())
        fragments = highlighter.getBestTextFragments(
            stream, content, MERGE_CONTIGUOUS_FRAGMENTS, MAX_N_FRAGMENTS)
        # BUG FIX: `unicode` does not exist in Python 3 (this module is
        # Python 3 code) and raised NameError here; str() is equivalent.
        fragments = [str(f).strip() for f in fragments]
        fragments = [f for f in fragments if f != '']  # no empty fragments
        if not ''.join(fragments) == '':
            results.append(
                Result(doc.get('name'), doc.get('path'), fragments, hit.doc,
                       reader))
    return results
def __recs_query(self, positive_rated_document_list, scores, recs_number,
                 items_directory, candidate_list: List) -> pd.DataFrame:
    """
    Builds a query using the contents that the user liked. The terms relative
    to the contents that the user liked are boosted by the rating he/she gave.
    A filter clause is added to the query to consider only candidate items.

    Args:
        positive_rated_document_list: doc numbers of contents the user liked
        scores: Ratings given by the user
        recs_number: How many items must be recommended. You can only specify
            the number, not a specific item for which compute the prediction
        items_directory: Directory where the items are stored
        candidate_list: candidate content_ids, or None for no filtering

    Returns:
        score_frame (pd.DataFrame): DataFrame containing the recommendations
        for the user (columns 'to_id', 'rating')
    """
    BooleanQuery.setMaxClauseCount(2000000)
    searcher = IndexSearcher(
        DirectoryReader.open(SimpleFSDirectory(Paths.get(items_directory))))
    if self.__classic_similarity:
        searcher.setSimilarity(ClassicSimilarity())

    analyzer = SimpleAnalyzer()
    user_fields = {}
    field_parsers = {}
    # BUG FIX: the old code read the field VALUES of the first rated document
    # and re-added them once per remaining rated document (`for _ in ...` over
    # the same `field_list`), instead of concatenating each document's own
    # field values. It also mutated the caller's list via remove().
    for doc_pointer in positive_rated_document_list:
        for field in searcher.doc(doc_pointer).getFields():
            field_name = field.name()
            if field_name == 'content_id':
                continue
            if field_name not in user_fields:
                user_fields[field_name] = field.stringValue()
                field_parsers[field_name] = QueryParser(field_name, analyzer)
            else:
                user_fields[field_name] += field.stringValue()

    logger.info("Building query")
    query_builder = BooleanQuery.Builder()
    for score in scores:
        for field_name, field_text in user_fields.items():
            parser = field_parsers[field_name]
            parser.setDefaultOperator(QueryParser.Operator.OR)
            field_query = parser.parse(parser.escape(field_text))
            # Boost the clause by the user's rating.
            field_query = BoostQuery(field_query, score)
            query_builder.add(field_query, BooleanClause.Occur.SHOULD)

    if candidate_list is not None:
        id_query_string = ' OR '.join("content_id:\"" + content_id + "\""
                                      for content_id in candidate_list)
        id_query = QueryParser("testo_libero",
                               KeywordAnalyzer()).parse(id_query_string)
        query_builder.add(id_query, BooleanClause.Occur.MUST)

    query = query_builder.build()
    # Fetch enough hits to skip the already-rated documents.
    docs_to_search = len(positive_rated_document_list) + recs_number
    scoreDocs = searcher.search(query, docs_to_search).scoreDocs

    logger.info("Building score frame to return")
    rated_docs = set(positive_rated_document_list)
    rows = []
    for scoreDoc in scoreDocs:
        if len(rows) >= recs_number:
            break
        # Never recommend an item the user already rated (the old code could
        # re-recommend the first rated item because it had been remove()d).
        if scoreDoc.doc not in rated_docs:
            doc = searcher.doc(scoreDoc.doc)
            item_id = doc.getField("content_id").stringValue()
            rows.append((item_id, scoreDoc.score))
    # Build the frame once instead of pd.concat per row (quadratic).
    return pd.DataFrame.from_records(rows, columns=['to_id', 'rating'])
def testPhraseQueryInConjunctionScorer(self):
    """Phrase queries combined with term queries in a conjunction (MUST+MUST)
    must return the expected hit counts, in either clause order."""
    # Index two docs; both carry source="marketing info", one adds
    # contents="foobar".
    writer = self.getWriter()
    doc = Document()
    doc.add(Field("source", "marketing info", TextField.TYPE_STORED))
    writer.addDocument(doc)
    doc = Document()
    doc.add(Field("contents", "foobar", TextField.TYPE_STORED))
    doc.add(Field("source", "marketing info", TextField.TYPE_STORED))
    writer.addDocument(doc)
    writer.close()
    searcher = self.getSearcher()
    # Phrase "marketing info" matches both documents.
    builder = PhraseQuery.Builder()
    builder.add(Term("source", "marketing"))
    builder.add(Term("source", "info"))
    phraseQuery = builder.build()
    topDocs = searcher.search(phraseQuery, 50)
    self.assertEqual(2, topDocs.totalHits)
    # Conjunction with contents:foobar narrows it to one document.
    termQuery = TermQuery(Term("contents", "foobar"))
    builder = BooleanQuery.Builder()
    builder.add(termQuery, BooleanClause.Occur.MUST)
    builder.add(phraseQuery, BooleanClause.Occur.MUST)
    booleanQuery = builder.build()
    topDocs = searcher.search(booleanQuery, 50)
    self.assertEqual(1, topDocs.totalHits)
    # Second round: three docs containing "map", "entry" and "woo" in
    # different arrangements.
    writer = self.getWriter()
    doc = Document()
    doc.add(Field("contents", "map entry woo", TextField.TYPE_STORED))
    writer.addDocument(doc)
    doc = Document()
    doc.add(Field("contents", "woo map entry", TextField.TYPE_STORED))
    writer.addDocument(doc)
    doc = Document()
    doc.add(
        Field("contents", "map foobarword entry woo", TextField.TYPE_STORED))
    writer.addDocument(doc)
    writer.close()
    searcher = self.getSearcher()
    termQuery = TermQuery(Term("contents", "woo"))
    builder = PhraseQuery.Builder()
    builder.add(Term("contents", "map"))
    builder.add(Term("contents", "entry"))
    # All three docs contain "woo".
    topDocs = searcher.search(termQuery, 50)
    self.assertEqual(3, topDocs.totalHits)
    # Exact phrase "map entry" appears in two of them.
    phraseQuery = builder.build()
    topDocs = searcher.search(phraseQuery, 50)
    self.assertEqual(2, topDocs.totalHits)
    # term MUST + phrase MUST: still two hits.
    builder = BooleanQuery.Builder()
    builder.add(termQuery, BooleanClause.Occur.MUST)
    builder.add(phraseQuery, BooleanClause.Occur.MUST)
    booleanQuery = builder.build()
    topDocs = searcher.search(booleanQuery, 50)
    self.assertEqual(2, topDocs.totalHits)
    # Same conjunction with the clause order reversed must be equivalent.
    builder = BooleanQuery.Builder()
    builder.add(phraseQuery, BooleanClause.Occur.MUST)
    builder.add(termQuery, BooleanClause.Occur.MUST)
    booleanQuery = builder.build()
    topDocs = searcher.search(booleanQuery, 50)
    self.assertEqual(2, topDocs.totalHits)
def search(self, field):
    """Build a span query from the user's parsed flags, AND it with regexp
    filter clauses, run it against `field`, and keep only hits whose stored
    text also matches the derived verification regex. Returns self.
    """
    s = self._search
    u = self._userQuery
    zh_to_hant_dict = self._zh_to_hant_dict
    info = u.getFlagsInfo()
    flags_list = u.getFlagsList()
    sq_list = []          # per-flag SpanQuery, or a dict describing a gap op
    word_index_list = []  # indices in sq_list that hold actual word queries
    index_count = 0
    for flag in flags_list:
        if flag["type"] == "word":
            word_index_list.append(index_count)
            if len(flag["content"]) == 1:
                # Single character: OR the simplified form with its
                # traditional (hant) variants when any exist.
                if flag["content"][0] in zh_to_hant_dict:
                    stq_list = [
                        SpanTermQuery(Term(field, flag["content"][0]))
                    ]
                    for hant in zh_to_hant_dict[flag["content"][0]]:
                        stq_list.append(SpanTermQuery(Term(field, hant)))
                    sq_list.append(SpanOrQuery(stq_list))
                else:
                    sq_list.append(
                        SpanTermQuery(Term(field, flag["content"][0])))
            else:
                # Multi-character word: adjacent (slop 0, ordered) span of
                # per-character queries, each OR-ed with hant variants.
                snq_list = []
                for w in flag["content"]:
                    if w in zh_to_hant_dict:
                        stq_list = [SpanTermQuery(Term(field, w))]
                        for hant in zh_to_hant_dict[w]:
                            stq_list.append(
                                SpanTermQuery(Term(field, hant)))
                        snq_list.append(SpanOrQuery(stq_list))
                    else:
                        snq_list.append(SpanTermQuery(Term(field, w)))
                sq_list.append(SpanNearQuery(snq_list, 0, True))
        else:
            # Non-word flag: remember its operator and distance bounds; it
            # controls the slop between the neighbouring word queries.
            sq_list.append({
                "op": info[flag["content"]]["op"],
                "num": info[flag["content"]]["num"]
            })
        index_count += 1
    # Chain the word queries left-to-right; a dict between two words supplies
    # the slop (its last "num" value), otherwise adjacency (slop 0) is used.
    q = None
    count = 0
    for index in word_index_list:
        if count == 0:
            q = sq_list[index]
            count += 1
        else:
            if not isinstance(sq_list[index - 1], dict):
                q = SpanNearQuery([q, sq_list[index]], 0, True)
            else:
                q = SpanNearQuery([q, sq_list[index]],
                                  sq_list[index - 1]["num"][-1], True)
    query = q
    # Filter clauses: each extra field becomes a regexp alternation that MUST
    # match alongside the span query.
    filters = u.getFields()
    bq = BooleanQuery.Builder()
    bq.add(BooleanClause(query, BooleanClause.Occur.MUST))
    for key in filters.keys():
        cur_reg = '('
        for ft in filters[key]:
            cur_reg += ft + '|'
        cur_reg = cur_reg[0:-1] + ')'
        rq = RegexpQuery(Term(key, cur_reg))
        bq.add(BooleanClause(rq, BooleanClause.Occur.MUST))
    query = bq.build()
    top_docs = s.search(query, 9999)
    self._cur_field = field
    # Post-filter: the stored "text" must also match the verification regex
    # derived from the same flags.
    reg = get_test_reg(flags_list, info, zh_to_hant_dict)
    doc_id_list = []
    hits = top_docs.scoreDocs
    for hit in hits:
        doc = s.doc(hit.doc)
        text = doc.get("text")
        match_res = re.search(reg, text)
        if match_res:
            doc_id_list.append(hit.doc)
    self._res = doc_id_list
    self._reg = reg
    return self
def testSimilarity(self):
    """SimpleSimilarity must yield the expected scores for term, boolean and
    (sloppy) phrase queries, verified through custom collectors."""
    writer = self.getWriter(analyzer=SimpleAnalyzer(
        Version.LUCENE_CURRENT), similarity=SimpleSimilarity())

    # Two documents: "a c" and "a c b".
    d1 = Document()
    d1.add(Field("field", "a c", TextField.TYPE_STORED))
    d2 = Document()
    d2.add(Field("field", "a c b", TextField.TYPE_STORED))
    writer.addDocument(d1)
    writer.addDocument(d2)
    writer.commit()
    writer.close()

    searcher = self.getSearcher()
    searcher.setSimilarity(SimpleSimilarity())
    a = Term("field", "a")
    b = Term("field", "b")
    c = Term("field", "c")

    # Term query on "b": expected score is exactly 1.0.
    class collector1(PythonSimpleCollector):

        def collect(_self, doc, score):
            self.assertEqual(1.0, score)

        def doSetNextReader(_self, context):
            pass

        def scoreMode(_self):
            return ScoreMode.COMPLETE

    searcher.search(TermQuery(b), collector1())

    builder = BooleanQuery.Builder()
    builder.add(TermQuery(a), BooleanClause.Occur.SHOULD)
    builder.add(TermQuery(b), BooleanClause.Occur.SHOULD)
    bq = builder.build()

    # Boolean (a OR b): expected score equals the global doc id + 1.
    class collector2(PythonSimpleCollector):

        def collect(_self, doc, score):
            self.assertEqual(doc + _self.base + 1, score)

        def doSetNextReader(_self, context):
            # Remember the segment base so doc ids can be made global.
            _self.base = context.docBase

        def scoreMode(_self):
            return ScoreMode.COMPLETE

    searcher.search(bq, collector2())

    # Exact phrase "a c": present in both docs, expected score 1.0.
    pq = PhraseQuery(a.field(), [a.bytes(), c.bytes()])

    class collector3(PythonSimpleCollector):

        def collect(_self, doc, score):
            self.assertEqual(1.0, score)

        def doSetNextReader(_self, context):
            pass

        def scoreMode(_self):
            return ScoreMode.COMPLETE

    searcher.search(pq, collector3())

    # Sloppy phrase "a b" with slop 2: expected score 0.5.
    pq = PhraseQuery(2, a.field(), [a.bytes(), b.bytes()])

    class collector4(PythonSimpleCollector):

        def collect(_self, doc, score):
            self.assertEqual(0.5, score)

        def doSetNextReader(_self, context):
            pass

        def scoreMode(_self):
            return ScoreMode.COMPLETE

    searcher.search(pq, collector4())
def searchQ(self,query):
    """Execute a structured search.

    `query.mainq` is a whitespace-separated string in which a leading '!'
    marks an excluded (MUST_NOT) term, '$' a required (MUST) term, and any
    other token an optional (SHOULD) term expanded with up to 5 WordNet
    synonyms. Each bucket is searched on both the "abstract" and "entities"
    fields; `query.journal` and `query.author` add optional clauses.

    Returns a dict with the result documents ("res"), the 10 most frequent
    co-occurring entities as suggestions ("sug"), and the query itself.
    """
    ps=PorterStemmer()
    # qboosto=BoostQuery.BoostQuery()
    # NOTE(review): hard-coded absolute Windows path, and the handle is never
    # closed -- consider a `with` block and a configurable path.
    f = open("C:/Users/Tigmanshu/Documents/IRWeb/ssProject/Lucene/wn_s.pl")
    t = Thesaurus.from_file(f)
    # print(t.synonyms("regression"))
    result = []
    qList=query.mainq.lower().split()
    # Stemmed query terms; used below to keep them out of the suggestions.
    checkList=[]
    notQ=""
    mustQ=""
    shouldQ=""
    for qt in qList:
        if qt[0]=='!':
            # '!term' -> excluded term bucket
            notQ=notQ+" "+qt[1:len(qt)]
            # checkList.append(qt[1:len(qt)])
        elif qt[0]=='$':
            # '$term' -> required term bucket
            mustQ = mustQ + " " + qt[1:len(qt)]
            checkList.append(ps.stem(qt[1:len(qt)]))
        else:
            # plain term -> optional bucket, expanded with up to 5 synonyms
            shouldQ=shouldQ + " " + qt
            thes=t.synonyms(qt)
            i=0
            for term in thes:
                if i==5:
                    break
                shouldQ = shouldQ + " " + term
                i+=1
            checkList.append(ps.stem(qt))
    del qList
    self.stopList="a an the of is zero".split()
    searcher = IndexSearcher(DirectoryReader.open(self.openStore()))
    analyzer = StandardAnalyzer()
    # q1 = QueryParser("abstract", analyzer).parse(shouldQ)
    # q2 = QueryParser("entities", analyzer).parse(shouldQ)
    # q3 = QueryParser("abstract", analyzer).parse(mustQ)
    # q4 = QueryParser("entities", analyzer).parse(mustQ)
    # q5 = QueryParser("abstract", analyzer).parse(notQ)
    # q6 = QueryParser("entities", analyzer).parse(notQ)
    b1 = BooleanQuery.Builder()
    if len(shouldQ)>0:
        q1 = QueryParser("abstract", analyzer).parse(shouldQ)
        q2 = QueryParser("entities", analyzer).parse(shouldQ)
        # q1.setBoost(2)
        b1.add(q1, BooleanClause.Occur.SHOULD)
        b1.add(q2, BooleanClause.Occur.SHOULD)
    if len(mustQ)>0:
        q3 = QueryParser("abstract", analyzer).parse(mustQ)
        q4 = QueryParser("entities", analyzer).parse(mustQ)
        b1.add(q3, BooleanClause.Occur.MUST)
        b1.add(q4, BooleanClause.Occur.MUST)
    if len(notQ)>0:
        q5 = QueryParser("abstract", analyzer).parse(notQ)
        q6 = QueryParser("entities", analyzer).parse(notQ)
        b1.add(q5, BooleanClause.Occur.MUST_NOT)
        b1.add(q6, BooleanClause.Occur.MUST_NOT)
    if len(query.journal)>0:
        print("$$$$$")
        print(query.journal)
        q7 = QueryParser("journalName", analyzer).parse(query.journal)
        b1.add(q7, BooleanClause.Occur.SHOULD)
    if len(query.author)>0:
        q8 = QueryParser("authorName",
                         analyzer).parse(query.author)
        b1.add(q8, BooleanClause.Occur.SHOULD)
    bq1 = b1.build()
    #print(bq1)
    topDocs = searcher.search(bq1, 100)
    scoreDocs = topDocs.scoreDocs
    #print(len(scoreDocs))
    # entity (lowercased) -> number of result docs mentioning it
    entitiesHitList = {}
    for i in scoreDocs:
        myDoc= d.Document()
        doc = searcher.doc(i.doc)
        # print(doc.get("id"), i)
        # print(doc.get("abstract"))
        # print(doc.get("authorName"))
        for entity in doc.get("entities").split():
            entity=entity.lower()
            # Skip entities that are query terms or stopwords.
            if ps.stem(entity) in checkList:
                continue
            if entity in self.stopList:
                continue
            if entity in entitiesHitList:
                entitiesHitList[entity] += 1
            else:
                entitiesHitList[entity] = 1
        myDoc.setDocTitle(doc.get("title"))
        myDoc.setAbstract(doc.get("abstract"))
        myDoc.setJournal(doc.get("journalName"))
        myDoc.setAuthor(doc.get("authorName"))
        myDoc.setURL(doc.get("url"))
        result.append(myDoc)
    # Top-10 most frequent co-occurring entities become the suggestions.
    sorted_d = sorted(entitiesHitList.items(), key=operator.itemgetter(1), reverse=True)
    del entitiesHitList
    n_items = dict(islice(sorted_d , 10))
    del sorted_d
    return {"res":result,"sug":n_items,"query":query}
import lucene
from java.nio.file import Paths
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher, BooleanClause, BooleanQuery, TermQuery
from org.apache.lucene.index import DirectoryReader, Term

# Normalize the Farsi Yeh to the Arabic form used by the index.
input_q = input().replace('ی', 'ي')

lucene.initVM()

index_path = Paths.get('./lucene.index')
question_field = 'question'
answer_field = 'answer'

directory = SimpleFSDirectory(index_path)
searcher = IndexSearcher(DirectoryReader.open(directory))

# OR together one term query per word of the normalized question.
builder = BooleanQuery.Builder()
for q_word in input_q.split(' '):
    clause = BooleanClause(TermQuery(Term(question_field, q_word)),
                           BooleanClause.Occur.SHOULD)
    builder.add(clause)
query = builder.build()

top_n = 5
scoreDocs = searcher.search(query, top_n).scoreDocs
print('found nums: ', len(scoreDocs))
for scoreDoc in scoreDocs:
    doc = searcher.doc(scoreDoc.doc)
    print('Best Math: ', doc.get(question_field), '\n')
    print('Answer: ', doc.get(answer_field))
    print('---------------------\n')