def search(self, field: str):
    sear = self._search
    if len(self._commandInfo.getKey()) == 0 or self._commandInfo.getKey()[0] in ['-', '~']:
        query = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[0]))
    elif self._commandInfo.getKey()[0] == '#':
        query1 = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[0]))
        query2 = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[1]))
        bc1 = BooleanClause(query1, BooleanClause.Occur.MUST)
        bc2 = BooleanClause(query2, BooleanClause.Occur.MUST)
        query = BooleanQuery.Builder().add(bc1).add(bc2).build()
    elif self._commandInfo.getKey()[0] in ['$', '+']:
        bq = BooleanQuery.Builder()
        for w in self._commandInfo.getWordList():
            queryx = QueryParser(field, StandardAnalyzer()).parse(make_parser(w))
            bc = BooleanClause(queryx, BooleanClause.Occur.MUST)
            bq.add(bc)
        query = bq.build()
    else:
        query = ''
    hits = sear.search(query, 999999)
    for hit in hits.scoreDocs:
        doc = sear.doc(hit.doc)
        res = doc.get(field)
        id = doc.get(field + '_id')
        if doc_hit(res, self._commandInfo):
            sentences = re.split('[!?!?。]', res)
            # drop empty fragments left over from splitting
            sentences = [s for s in sentences if s != '']
            for sentence in sentences:
                if key_filter(self._commandInfo, sentence):
                    self._doc[id] = res
                    self._resultSentencesList.append((id, sentence))
    return self
def search_pair_score_multithread_part(self, q_doc_int):
    # print(q_doc_int)
    spl = q_doc_int.split('<|endoftext|>')
    q = spl[0]
    print(q)
    doc_int = int(spl[1])
    print(doc_int)
    if not self.env.isCurrentThreadAttached():
        self.env.attachCurrentThread()
    try:
        q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
        query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
    except:
        print('Unexpected error when processing query:', str(q))
        print('Using query "dummy".')
        q = 'dummy'
        query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
    c = OrderedDict()
    exp = self.curr_searcher.explain(query, doc_int)
    c[1] = exp
    return c
def _search_singlethread(
        self, queries: List[str],
        doc_max: int) -> List[List[Dict[str, Union[float, str]]]]:
    queries_result = []
    for query in queries:
        try:
            query = QueryParser('text', self.analyzer).parse(
                QueryParser.escape(query))
        except Exception as exception:  # pylint: disable=broad-except
            logger.warning(
                colored(f'{exception}: {query}, use query dummy.', 'yellow'))
            query = QueryParser('text', self.analyzer).parse('dummy')
        query_results = []
        hits = self.searcher.search(query, doc_max)
        for hit in hits.scoreDocs:
            doc = self.searcher.doc(hit.doc)
            query_results.append({
                'score': hit.score,
                'title': doc['title'],
                'text': doc['text']
            })
        if not query_results:
            logger.warning(
                colored(
                    f'WARN: search engine returns no results for query: {query}.',
                    'yellow'))
        queries_result.append(query_results)
    return queries_result
def more_like_this2(self, item_doc, result_num):
    similar_questions = []
    if not item_doc:
        item_doc.append(ResultItem(None, 1.0, "No Title", 0))
    query = ""
    if item_doc.doc:
        query += self.document_to_query(item_doc.doc)
    query = remove_unified_stop_lists(query)
    queryparser = QueryParser(Version.LUCENE_CURRENT, "term", self.analyzer)
    if query:
        try:
            like_query = queryparser.parse(query)
            hits = self.searcher.search(like_query, result_num).scoreDocs
            for i, hit in enumerate(hits):
                doc = self.searcher.doc(hit.doc)
                similar_questions.append(doc.get("question_id"))
        except Exception as e:
            print "Question Searcher: Error: %s" % e
            # write_search_log("Question Searcher: Error: %s" % e + "\n")
            print(traceback.format_exc())
    # self.searchermgr.decRef(self.searcher)
    # self.searchermgr.release(self.searcher)
    # self.searcher = None
    # self.directory.close()
    # self.directory = None
    return similar_questions

# def release(self, searcher):
def get_evidence(searcher, analyzer, claim):
    escaped_string = QueryParser.escape(claim)
    query = QueryParser("text", analyzer).parse(escaped_string)
    start = datetime.now()
    scoreDocs = searcher.search(query, 50).scoreDocs
    duration = datetime.now() - start
    claim = nlp(claim)
    claim_evid = []
    line_no = []
    sim_score = []
    final_evidence = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        norm_doc = doc.get("text")
        norm_doc = nlp(norm_doc)
        val = claim.similarity(norm_doc)
        try:
            int(doc.get("Sno"))
            claim_evid.append(doc.get("keyterm"))
            line_no.append(int(doc.get("Sno")))
            sim_score.append(val)
        except ValueError:
            pass
    if len(sim_score) > 5:
        for val in range(0, 5):
            index = sim_score.index(max(sim_score))
            claim = claim_evid.pop(index)
            line = line_no.pop(index)
            final_evidence.append([claim, line])
            del sim_score[index]
    else:
        for i in range(0, len(sim_score) - 1):
            final_evidence.append([claim_evid[i], int(line_no[i])])
    return final_evidence
def run_img(command):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index2"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    querys = BooleanQuery()
    query_content = QueryParser(Version.LUCENE_CURRENT, "urlcontent", analyzer).parse(command)
    query_title = QueryParser(Version.LUCENE_CURRENT, "title", analyzer).parse(command)
    querys.add(query_content, BooleanClause.Occur.SHOULD)
    querys.add(query_title, BooleanClause.Occur.SHOULD)
    scoreDocs = searcher.search(querys, 50).scoreDocs
    if len(scoreDocs) == 0:
        print "WARNING: No result"
    result = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        print doc.get("title")
        data = {}
        data['title'] = doc.get('title')
        data['url'] = doc.get('url')
        data['imgurl'] = doc.get('imgurl')
        result.append(data)
    return result
def findLiteral(self, instanceUri, propertyURI):
    labels = list()
    try:
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        fields = [FreyaConstants.INST_FEATURE_LKB, FreyaConstants.PROPERTY_FEATURE_LKB]
        flags = [BooleanClause.Occur.MUST, BooleanClause.Occur.MUST]
        labelOrTitleUris = "\"" + propertyURI + "\""
        queries = ["\"" + QueryParser.escape(instanceUri) + "\"", QueryParser.escape(labelOrTitleUris)]
        query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, queries, fields, flags, analyzer)
        result = self._searcher.search(query, 1)
        logging.debug("For " + str(query) + " : " + str(result.totalHits))
        freq = result.totalHits
        if freq > 0:
            result = self._searcher.search(query, freq)
            hits = pyJava.JArray2List(result.scoreDocs)
            # for (ScoreDoc hit : hits) {
            indexus = 0
            while indexus < len(hits):
                hit = hits[indexus]
                doc = self._searcher.doc(hit.doc)
                labels.append(doc.get(FreyaConstants.FIELD_EXACT_CONTENT))
                indexus += 1
    except Exception as e:  # CorruptIndexException(e):
        print e.message
        logging.error("Error")
    return labels
def searcher(directory, analyzer, queries_file):
    lines = queries_file.readlines()
    length = len(lines)
    a_query = ''
    query_counter = 0
    log = open("log.txt", "a")
    for line_number in range(length):
        if lines[line_number].startswith("<num>"):
            query_id = lines[line_number][14:].strip()
        elif lines[line_number].startswith("<desc>"):
            a_query = lines[line_number + 1].strip()
            a_query = stop_words(a_query)
        if a_query != '':
            # searching the index
            reader = DirectoryReader.open(directory)
            searcher = IndexSearcher(reader)
            # parse the query
            parser = QueryParser("DocParagraph", analyzer)
            query = parser.parse(a_query)
            # return 50 results, as required by the assignment
            hits = searcher.search(query, 50).scoreDocs
            # rank counter 1 through 50
            rank_counter = 1
            for hit in hits:
                result = searcher.doc(hit.doc)
                # write search result to log text file
                to_log = str(query_id) + " " + "Q" + str(query_counter) + " " + str(result.get("DocID")) + " " + str(rank_counter) + " " + str(hit.score) + " " + "Alex's" + "\n"
                log.write(to_log)
                rank_counter += 1
            query_counter += 1
            a_query = ''
    log.close()
def more_like_this(self, result_num, query):
    result = []
    queryparser = QueryParser(Version.LUCENE_CURRENT, "methods_called", self.porter_analyzer)
    if query:
        try:
            query = arranging_query_regex(query=query)
            # print '4. Right after the regex handling : ', query
            like_query = queryparser.parse(query)
            # print '5. Right after the Lucene parser : ', like_query
            hits = self.searcher.search(like_query, result_num).scoreDocs
            # filterScoreDosArray = hits.topDocs().scoreDocs;
            for i, hit in enumerate(hits):
                doc = self.searcher.doc(hit.doc)
                # matched_terms = self.get_matched_keywords(like_query, hit.doc)
                result.append(doc.get("answer_id"))
        except Exception as e:
            print "AnswerSearcher: Error: %s" % e
            print(traceback.format_exc())
    # self.searchermgr.decRef(self.searcher)
    self.searchermgr.release(self.searcher)
    self.searcher = None
    self.directory.close()
    self.directory = None
    return result
def findDirectTypes(self, instanceUri, max):
    dTypes = list()
    try:
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        parser = QueryParser(Version.LUCENE_CURRENT, "inst", analyzer)
        query = parser.parse("\"" + QueryParser.escape(instanceUri) + "\"")
        result = self._searcher.search(query, 1)
        logging.debug("For " + str(query) + " : " + str(result.totalHits))
        freq = 0
        if max != None:
            freq = max
        else:
            freq = result.totalHits
        if freq > 0:
            result = self._searcher.search(query, freq)
            hits = pyJava.JArray2List(result.scoreDocs)
            # for (ScoreDoc hit : hits) {
            indexus = 0
            while indexus < len(hits):
                hit = hits[indexus]
                doc = self._searcher.doc(hit.doc)
                self._searcher.explain(query, hit.doc)
                dTypes.append(doc.get(FreyaConstants.CLASS_FEATURE_LKB))
                indexus += 1
    except Exception as e:  # CorruptIndexException(e):
        print e.message
        logging.error("Error")
    logging.debug("there are " + str(len(dTypes)) + " unique direct types")
    return dTypes
def get_image_pmcid(pmcid, classes = ""): fields = ["pmcid", "class"] docs = [] location = web.__path__[0] + "/static/web/files/index/index.figures" #lucene.initVM() vm_env = lucene.getVMEnv() vm_env.attachCurrentThread() analyzer = StandardAnalyzer(Version.LUCENE_4_10_1) reader = IndexReader.open(SimpleFSDirectory(File(location))) searcher = IndexSearcher(reader) # multi field query: http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser #query = MultiFieldQueryParser(Version.LUCENE_4_10_1, fields, analyzer) # query.setDefaultOperator(QueryParserBase.AND_OPERATOR) #query = query.parse(query, ('4175339','1')) # query.parse(queryString)#"Shigella sonnei" # query = QueryParser(Version.LUCENE_4_10_1, "abstract", analyzer).parse(queryString)#"Shigella sonnei" MAX = 10000 #hits = searcher.search(query, MAX) if classes == "all": queryStr = "pmcid:(" + ' '.join(pmcid) +")" else: queryStr = "pmcid:(" + ' '.join(pmcid) +")" + " AND class:" + classes query = QueryParser(Version.LUCENE_4_10_1, "pmcid",analyzer)#needed to build a custom query q = query.parse(queryStr) hits = searcher.search(q, MAX) for hit in hits.scoreDocs:#should only be one #print hit.score, hit.doc, hit.toString() docs.append(searcher.doc(hit.doc)) return docs #This will return the image documents that belong to a pmcid(article)
def retriever(file_dir):
    analyzer = WhitespaceAnalyzer()
    reader = DirectoryReader.open(
        SimpleFSDirectory(Paths.get(file_dir + "/lucene_index/")))
    searcher = IndexSearcher(reader)
    queryParser = QueryParser("code", analyzer)
    BooleanQuery.setMaxClauseCount(Integer.MAX_VALUE)

    with open(file_dir + "/train/train.spl.src", 'r') as fso, open(file_dir + "/train/train.txt.tgt", 'r') as fsu:
        sources = [line.strip() for line in fso.readlines()]
        summaries = [line.strip() for line in fsu.readlines()]
    with open(file_dir + "/test/test.ast.src") as ft, open(file_dir + "/test/test.ref.src.0", 'w') as fwo, \
            open(file_dir + "/output/ast.out", 'w') as fws:
        queries = [
            re.sub("[\W\s]+|AND|NOT|OR", ' ', line.strip())
            for line in ft.readlines()
        ]
        for i, line in enumerate(queries):
            print("query %d" % i)
            query = queryParser.parse(QueryParser.escape(line))
            hits = searcher.search(query, 1).scoreDocs
            flag = False
            for hit in hits:
                doc = searcher.doc(hit.doc)
                _id = eval(doc.get("id"))
                flag = True
                fwo.write(sources[_id] + '\n')
                fws.write(summaries[_id] + '\n')
            if not flag:
                print(query)
                print(hits)
                exit(-1)
def search_index(indexfile, querytext, top=10, qe=False, default_field="text",
                 display_fields=["subreddit", "author", "text"]):
    lucene.initVM()
    lindex = SimpleFSDirectory(Paths.get(indexfile))
    ireader = DirectoryReader.open(lindex)
    isearcher = IndexSearcher(ireader)
    analyser = StandardAnalyzer()
    parser = QueryParser(default_field, analyser)
    query = parser.parse(querytext)
    hits = isearcher.search(query, top).scoreDocs
    docIDs = [hit.doc for hit in hits]
    print_results(isearcher, hits, display_fields)
    if len(hits) == 0:
        print("No hits!")
    elif qe:
        print("\n")
        print("Which documents were relevant to your search need? (Enter spaced list of result numbers [1-{}], e.g. 2 4 5)".format(top))
        relevantids = [docIDs[i - 1] for i in [int(x) for x in input().split()]]
        nonrelevantids = [id for id in docIDs if id not in relevantids]
        print("\n\n")
        qequerytext = queryexpansion.rocchio(ireader, querytext, relevantids, nonrelevantids)
        print("Expanded search query: '{}'\n".format(qequerytext))
        qequery = parser.parse(qequerytext)
        qehits = isearcher.search(qequery, top).scoreDocs
        print_results(isearcher, qehits, display_fields)
    ireader.close()
    lindex.close()
def __init__(self, index_path, field, similarity="boolean",
             use_relevance_feedback=False, feedback_index_path=None):
    self.reader = DirectoryReader.open(
        FSDirectory.open(Paths.get(index_path)))
    self.searcher = IndexSearcher(self.reader)
    if use_relevance_feedback and feedback_index_path is not None:
        self.feedback_reader = DirectoryReader.open(
            FSDirectory.open(Paths.get(feedback_index_path)))
        self.feedback_searcher = IndexSearcher(self.feedback_reader)
    self.similarity = similarity
    self.stopwords = stop_words()
    if similarity == "boolean":
        self.searcher.setSimilarity(BooleanSimilarity())
    elif similarity == "tf":
        self.searcher.setSimilarity(TFSimilarity())
    elif similarity == "tfidf":
        self.searcher.setSimilarity(ClassicSimilarity())
    elif similarity == "BM25":
        self.searcher.setSimilarity(BM25Similarity(1.2, 0.2))
    else:
        print("Unknown similarity, so we use BM25(1.2, 0.2) as default")
        self.searcher.setSimilarity(BM25Similarity(1.2, 0.2))
    analyzer = StandardAnalyzer()
    print(self.searcher.getSimilarity())
    self.parser = QueryParser(field, analyzer)
def search_synonym(self, query):
    self.hits_dict = {}
    self.hits = []
    similar_terms = self.w2v_model.most_similar(query)
    parser = QueryParser('text', self.analyzer)
    query = parser.parse(query)
    for s_term in similar_terms[:20]:
        s_term_query = parser.parse(s_term[0])
        hits = self.searcher.search(s_term_query, 1000).scoreDocs
        hit_count = 0
        for hit in hits:
            doc = self.searcher.doc(hit.doc)
            text = doc.get('text')
            terms = text.split()
            sentence = ''
            for term in terms:
                sentence += term
            simpleHTMLFormatter = SimpleHTMLFormatter(prefixHTML, suffixHTML)
            highlighter = Highlighter(simpleHTMLFormatter, QueryScorer(query))
            highLightText = highlighter.getBestFragment(self.analyzer, 'text', sentence)
            if highLightText is not None:
                self.hits.append(highLightText)
                hit_count += 1
            if hit_count >= 3:
                break
        if len(self.hits) > 0:
            self.hits_dict[s_term] = self.hits
            self.hits = []
    return self.hits_dict
def search(self, index_dir):
    # Get handle to index directory
    directory = SimpleFSDirectory(File(index_dir))
    # Creates a searcher searching the provided index.
    ireader = DirectoryReader.open(directory)
    # Implements search over a single IndexReader.
    # Use a single instance and use it across queries
    # to improve performance.
    searcher = IndexSearcher(ireader)
    # Get the analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    # Constructs a query parser. We specify what field to search into.
    queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)
    # Create the query
    query = queryParser.parse(self.query)
    # Run the query and get top 50 results
    topDocs = searcher.search(query, self.retrieve_count)
    # Get top hits
    scoreDocs = topDocs.scoreDocs
    doc_ids = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        doc_ids.append(doc.get(FIELD_PATH))
    return [int(item) for item in doc_ids]
def get_pair_scores(self, q, doc_int, save_cache=False, extra_terms=True):
    # if prm.n_threads > 1:
    #     out = self.search_pair_score_multithread(qs_trailing_doc, self.searcher)
    #     if (prm.docs_path != prm.docs_path_term) and extra_terms:
    #         terms = self.search_pair_score_multithread(qs_trailing_doc, self.searcher_term)
    # else:
    #     out = self.search_pair_score_singlethread(qs_trailing_doc, self.searcher)
    #     if (prm.docs_path != prm.docs_path_term) and extra_terms:
    #         terms = self.search_pair_score_singlethread(qs_trailing_doc, self.searcher_term)
    out = []
    try:
        q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
        query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
        # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q))
    except:
        print('Unexpected error when processing query:', str(q))
        print('Using query "dummy".')
        q = 'dummy'
        query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
        # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q))
    c = OrderedDict()
    exp = self.searcher.explain(query, doc_int)
    c[1] = exp
    out.append(c)
    return out
def Qsearch(self, query):
    words = seg.segment(query.strip())
    # words = self.segmentor.segment(query.strip())
    # print ' '.join(words)
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    result = QueryParser(Version.LUCENE_CURRENT, "contents", self.analyzer)
    result.setPhraseSlop(0)
    # "\"" + ' '.join(words) + "\"~0" means the words should be continuous
    query = result.parse("\"" + ' '.join(words) + "\"~0")
    totalHits = self.searcher.search(query, 50)
    # print "%s total matching documents." % totalHits.totalHits
    # return totalHits.totalHits
    for hit in totalHits.scoreDocs:
        # print "Hit Score: ", hit.score, "Hit Doc:", hit.doc, "HitString:", hit.toString()
        doc = self.searcher.doc(hit.doc)
        # print doc.get("name").encode("utf-8")
        # print "----------------------------------------"
    t = Term('contents', ' '.join(words))
    # termDocs = ireader.termDocs(t)
    # for tt in termDocs:
    #     print ireader.document(termDocs.docs).getFeildable('neme'), termDocs.freq()
    # print self.reader.totalTermFreq(t)
    return self.reader.totalTermFreq(t)
def findPropertyURIs(self, propertyType, max):
    uris = list()
    try:
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        parser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.CLASS_FEATURE_LKB, analyzer)
        query = parser.parse("\"" + QueryParser.escape(propertyType) + "\"")
        result = self._searcher.search(query, 1)
        freq = result.totalHits
        if max != None:
            freq = max.intValue()
        if freq > 0:
            result = self._searcher.search(query, freq)
            hits = pyJava.JArray2List(result.scoreDocs)
            logging.debug("For " + str(query) + " : " + str(result.totalHits) + " max:" + str(max))
            print "For " + str(query) + " : " + str(result.totalHits) + " max:" + str(max)
            # for (ScoreDoc hit : hits) {
            indexus = 0
            while indexus < len(hits):
                hit = hits[indexus]
                doc = self._searcher.doc(hit.doc)
                self._searcher.explain(query, hit.doc)
                uris.append(doc.get(FreyaConstants.INST_FEATURE_LKB))
                indexus += 1
    except Exception as e:  # CorruptIndexException(e):
        print e.message
        logging.error("Error")
    return uris
def findSubClasses(self, classUri): #RESOLVE multifieldqueryparser DOCUMENTATION PROBLEM!!!! propertyURI = "http://www.w3.org/2000/01/rdf-schema#subClassOf" subClasses = list() try: analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) fields = [FreyaConstants.CLASS_FEATURE_LKB, FreyaConstants.PROPERTY_FEATURE_LKB] flags = [BooleanClause.Occur.MUST, BooleanClause.Occur.MUST] subClassUri = "\"" + QueryParser.escape(propertyURI) + "\"" queries = ["\"" + QueryParser.escape(classUri) + "\"", subClassUri] query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT,queries, fields,flags,analyzer) result = self._searcher.search(query, 1) logging.debug("For " + str(query) + " : " + str(result.totalHits)) freq = result.totalHits if freq > 0: result = self._searcher.search(query, freq) hits = pyJava.JArray2List(result.scoreDocs) # for (ScoreDoc hit : hits) { indexus = 0 while indexus < len(hits): hit = hits[indexus] doc = self._searcher.doc(hit.doc) subClasses.append(doc.get(FreyaConstants.INST_FEATURE_LKB)) indexus += 1 except Exception as e:#CorruptIndexException(e): print e.message logging.error("Error") return subClasses
def text_search(command):
    envir.vm_env.attachCurrentThread()
    command_dict = parseCommand(command, "contents")
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k, envir.analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = envir.text_searcher.search(querys, 30).scoreDocs
    res = []
    query_highlight = QueryParser(Version.LUCENE_CURRENT, k, envir.analyzer).parse(command_dict["contents"])
    myhighlighter = Highlighter(SimpleHTMLFormatter(), QueryScorer(query_highlight))
    myhighlighter.setTextFragmenter(SimpleFragmenter(50))
    for scoreDoc in scoreDocs:
        # find texts which are around the keyword
        doc = envir.text_searcher.doc(scoreDoc.doc)
        text = doc.get("contents")
        key_text = "".join((myhighlighter.getBestFragments(envir.analyzer, "contents", text, 3)))
        key_text = re.sub('\s', '', key_text)
        temp = [doc.get("title"), doc.get('url'), key_text]
        res.append(temp)
    return res
def retrieve_sents(self):
    indexDir = self.indexDir
    query = self.query
    sent_ind_list = []
    # template = CustomTemplate(format)
    fsDir = SimpleFSDirectory(Paths.get(indexDir))
    # print indexDir
    searcher = IndexSearcher(DirectoryReader.open(fsDir))
    analyzer = StandardAnalyzer()
    parser = QueryParser("contents", analyzer)
    parser.setDefaultOperator(QueryParser.Operator.OR)
    query = parser.parse(query)
    # print query
    start = datetime.now()
    scoreDocs = searcher.search(query, 50).scoreDocs
    duration = datetime.now() - start
    # print query
    if self.stats:
        print >> sys.stderr, "Found %d sentences (in %s) that matched query '%s':" % (len(scoreDocs), duration, query)
    for scoreDoc in scoreDocs:
        # print scoreDoc.doc
        # doc = searcher.doc(scoreDoc.doc)
        sent_ind_list.append(scoreDoc.doc)
    return sent_ind_list
def buscar(indexDir, args, options=None):
    # lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    fsDir = SimpleFSDirectory(File(indexDir))
    # print fsDir

    # Create a searcher based on the index directory given by the user
    searcher = IndexSearcher(DirectoryReader.open(fsDir))

    # Analyzer used to filter the tokens
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    # print analyzer

    # Create a QueryParser that searches the "keywords" field by default,
    # holding the constraints of the search
    parser = QueryParser(Version.LUCENE_CURRENT, "keywords", analyzer)
    # print parser
    parser.setDefaultOperator(QueryParser.Operator.AND)
    # print args

    # Join the given arguments into a single query string
    command = ' +'.join(args)
    # print command
    query = parser.parse(command)
    print query

    # Return a JArray with the results of the query
    return searcher.search(query, 200).scoreDocs
def search(**kwargs):
    vm_env.attachCurrentThread()
    query = BooleanQuery()
    print("Searched keywords:")
    for field_name, keywords in kwargs.items():
        # assert field_name in SearchConfig.searchable_fields
        # keywords = list(filter(None, jieba.cut(keywords, cut_all=True)))
        keywords = list(filter(None, (k.strip() for k in jieba.cut_for_search(keywords))))
        for kw in keywords:
            print(kw)
        # construct query
        for kw in keywords:
            q = QueryParser(Version.LUCENE_CURRENT, field_name, analyzer).parse(kw)
            query.add(q, BooleanClause.Occur.SHOULD)
        if field_name == 'keywords':
            for kw in keywords:
                q = QueryParser(Version.LUCENE_CURRENT, 'ent_name', analyzer).parse(kw)
                query.add(q, BooleanClause.Occur.SHOULD)
    # search
    scoreDocs = searcher.search(query, 50).scoreDocs
    return [retrieve(searcher.doc(scoreDoc.doc)) for scoreDoc in scoreDocs]
def find_all_text_occurrences(objects: list) -> (dict, DirectoryReader):
    docs_lookup = dict()
    # noinspection PyUnresolvedReferences
    lucene.initVM(initialheap='32m', maxheap='4G')
    file = Paths.get("D:\GitHubD\BREDS\wiki_text_index\WIKI_TEXT")
    dir = FSDirectory.open(file)
    reader = DirectoryReader.open(dir)
    searcher = IndexSearcher(reader)
    parser = QueryParser('contents', StandardAnalyzer())

    logging.warning(
        'FOR MULTI-WORD OBJECTS, ALL DOCUMENTS WITH BOTH TERMS SEPARATELY WILL BE RETRIEVED'
    )
    for object in objects:
        tokens = object.split(' ')
        doc_sets = []
        for token in tokens:
            q = parser.parse(f'"{token}"')
            # TODO maybe use minimum score
            topdocs = searcher.search(q, 99999999)
            results = set([topdoc.doc for topdoc in topdocs.scoreDocs])
            doc_sets.append(results)
        docs_lookup[object] = set.intersection(*doc_sets)
    return docs_lookup, reader
def search(self, query_str, restriction=2):
    self.attachCurrentThread()
    # Parse the query
    result_contexts = []
    # Use '/' to decide whether the query carries part-of-speech tags
    if '/' in query_str:
        # with POS tags, go to search_phrases
        result_contexts = self.search_phrases(query_str)
    else:
        # without POS tags, go to search_terms
        result_contexts = self.search_terms(
            QueryParser("context", self.analyzer).parse(query_str))
    # Recover the search results into full articles
    self.recover_to_article(query_str, result_contexts, restriction)
    final_result = []
    # Highlight the passages in the results that are related to the query
    simpleHTMLFormatter = SimpleHTMLFormatter(u"<b><font color='red'>", u"</font></b>")
    for index, recovered_query in enumerate(self.recovered_queries):
        # Highlight with our own processed query, which already carries the
        # position constraints, rather than with the raw user query
        recovered_query = recovered_query.replace("/", ",")
        highlighter = Highlighter(
            simpleHTMLFormatter,
            QueryScorer(
                QueryParser("context", self.analyzer).parse(recovered_query)))
        highLightText = highlighter.getBestFragment(
            self.analyzer, 'context', self.recovered_contexts[index])
        if highLightText is not None:
            final_result.append(highLightText)
    return final_result
def searchForClass(self, inst, pred):
    classUris = list()
    fields = [FreyaConstants.INST_FEATURE_LKB, FreyaConstants.PROPERTY_FEATURE_LKB]
    flags = [BooleanClause.Occur.MUST, BooleanClause.Occur.MUST]
    queries = ["\"" + QueryParser.escape(inst) + "\"", "\"" + QueryParser.escape(pred) + "\""]
    try:
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, queries, fields, flags, analyzer)
        result = self._searcher.search(query, 1)
        logging.info("For " + str(query) + " : " + str(result.totalHits))
        freq = result.totalHits
        if freq > 0:
            result = self._searcher.search(query, freq)
            hits = pyJava.JArray2List(result.scoreDocs)
            # for (ScoreDoc hit : hits) {
            indexus = 0
            while indexus < len(hits):
                hit = hits[indexus]
                doc = self._searcher.doc(hit.doc)
                classUris.append(doc.get(FreyaConstants.CLASS_FEATURE_LKB))
                indexus += 1
    except Exception as e:  # ParseException(e):
        print e.message
        logging.error("Error")
    return classUris
def search_multithread_part(self, q):
    if not self.env.isCurrentThreadAttached():
        self.env.attachCurrentThread()
    if q in self.cache:
        return self.cache[q]
    else:
        try:
            q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
        except:
            print('Unexpected error when processing query:', str(q))
            print('Using query "dummy".')
            q = 'dummy'
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
        c = OrderedDict()
        hits = self.curr_searcher.search(query, self.max_cand)
        for i, hit in enumerate(hits.scoreDocs):
            doc = self.curr_searcher.doc(hit.doc)
            if i < self.max_full_cand:
                word_idx = list(map(int, doc['word_idx'].split(' ')))
                word = doc['word'].split('<&>')
            else:
                word_idx = []
                word = []
            # c[int(doc['id'])] = [word_idx, word]
            c[int(doc['id'])] = [word_idx, word, hit.score]
        # print(c)
        return c
def run(searcher, analyzer, keyword, way):
    while True:
        try:
            command = keyword.encode('utf8')
        except UnicodeDecodeError:
            command = keyword
        if command == '':
            return []
        if way == 'contents':
            query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(command)
        elif way == 'tags':
            query = QueryParser(Version.LUCENE_CURRENT, "tag", analyzer).parse(command)
        scoreDocs = searcher.search(query, 50).scoreDocs
        result = []
        result.append(command)
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            item = []
            item.append(doc.get('title'))
            item.append(doc.get('url'))
            item.append(doc.get('price'))
            item.append(doc.get('imgurl'))
            item.append(doc.get('wellrate'))
            item.append(doc.get('comment'))
            item.append(doc.get('tag'))
            # print doc.get('comment').encode('utf8')
            result.append(item)
        # print result
        return result
def get_candidates(qatp):
    if prm.create_index:
        create_index()
    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(prm.index_folder)))
    searcher = IndexSearcher(reader)
    candidates = []
    n = 0
    for q, a, t, p in qatp:
        if n % 100 == 0:
            print 'finding candidates sample', n
        n += 1
        q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
        query = QueryParser(Version.LUCENE_4_10_1, "text", analyzer).parse(QueryParser.escape(q))
        hits = searcher.search(query, prm.max_candidates)
        c = []
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            c.append(doc.get("id"))
        candidates.append(c)
    return candidates
def __init__(self, indexDir):
    self.directory = SimpleFSDirectory(Paths.get(indexDir))
    self.searcher = IndexSearcher(DirectoryReader.open(self.directory))
    self.nameQueryParser = QueryParser('name', StandardAnalyzer())
    self.nameQueryParser.setDefaultOperator(QueryParser.Operator.AND)
    self.idQueryParser = QueryParser('id', StandardAnalyzer())
    self.idQueryParser.setDefaultOperator(QueryParser.Operator.AND)
def func1(genre, year):
    vm_env.attachCurrentThread()
    lists = []
    query = BooleanQuery()
    if genre != "111":
        item = QueryParser(Version.LUCENE_CURRENT, "genre", analyzer).parse(genre)
        query.add(item, BooleanClause.Occur.MUST)
    if year != "111":
        item = QueryParser(Version.LUCENE_CURRENT, "year", analyzer).parse(year)
        query.add(item, BooleanClause.Occur.MUST)
    sf = SortField("score", SortField.Type.STRING, True)
    s = Sort(sf)
    scoreDocs = searcher1.search(query, 20, s).scoreDocs
    for scoreDoc in scoreDocs:
        movie = []
        doc = searcher1.doc(scoreDoc.doc)
        movie.append(doc.get("url"))
        movie.append(doc.get("picture"))
        movie.append(doc.get("title"))
        movie.append(doc.get("score"))
        movie.append(doc.get("genre"))
        movie.append(doc.get("stars"))
        movie.append(doc.get("comments"))
        lists.append(movie)
    return lists
def search_singlethread(self, qs, max_cand, max_full_cand, curr_searcher):
    out = []
    for q in qs:
        if q in self.cache:
            out.append(self.cache[q])
        else:
            try:
                q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
                query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            except:
                print 'Unexpected error when processing query:', str(q)
                print 'Using query "dummy".'
                query = QueryParser("text", self.analyzer).parse(QueryParser.escape('dummy'))
            c = OrderedDict()
            hits = curr_searcher.search(query, max_cand)
            for i, hit in enumerate(hits.scoreDocs):
                doc = curr_searcher.doc(hit.doc)
                if i < max_full_cand:
                    word_idx = map(int, doc['word_idx'].split(' '))
                    word = doc['word'].split('<&>')
                else:
                    word_idx = []
                    word = []
                c[int(doc['id'])] = [word_idx, word]
            out.append(c)
    return out
def more_like_this2(self, so_items): github_result = [] if not so_items: so_items.append(SOResultItem(None, 1.0, "No Title", 0, "")) query = "" for so_item in so_items: queryparser = QueryParser(Version.LUCENE_CURRENT, "typed_method_call", self.analyzer) if so_item.doc: query += self.document_to_query(so_item.doc) query += self.code_as_text() if query: print "-" * 30 print "UNified Query: %s" % query print "-" * 30 try: like_query = queryparser.parse(query) hits = self.searcher.search(like_query, 10).scoreDocs for i, hit in enumerate(hits): doc = self.searcher.doc(hit.doc) matched_terms = self.get_matched_keywords2( like_query, hit.doc) # apis = [d.stringValue() for d in doc.getFields("typed_method_call")] print("file__", doc.get("file"), "file_content", doc.get("file_content"), "line_numbers", doc.get("line_numbers")) file_path = "/extdsk/FaCoY/Git_data/G" + doc.get( "file")[24:] print(file_path) content = None try: with open(file_path) as f: content = f.read() except: pass if content: item = GithubResultItem(doc.get("file"), content, matched_terms, hit.score, so_item, doc.get("line_numbers"), hit.doc) # code github_result.append(item) #print("%d. File: %s, Matched: %s, Score: %s" % (i + 1, doc.get("file"), matched_terms, hit.score)) except Exception as e: print "GitSearcher: Error: %s" % e print(traceback.format_exc()) # print Counter(files).most_common(5) return github_result
def __init__(self, tfidf_path, strict=True):
    lucene.initVM()
    analyzer = StandardAnalyzer()
    reader = DirectoryReader.open(SimpleFSDirectory(Paths.get(tfidf_path)))
    self.searcher = IndexSearcher(reader)
    self.parser = QueryParser("text", analyzer)
    self.parser.setDefaultOperator(QueryParser.Operator.OR)
def getDoc(self, doc, sentenseid):
    query = QueryParser.escape(doc + ' ' + str(sentenseid))
    query = QueryParser('docname', self.analyzer).parse(query)
    score = self.searcher.search(query, 1).scoreDocs
    doc = self.searcher.doc(score[0].doc)
    return doc.get('docname'), doc.get('contents')
def retrieve(self, term, sid):
    query = term + ' ' + str(sid)
    query = self.repalcer(query)
    query = QueryParser.escape(query)
    query = QueryParser('name-sid', self.analyzer).parse(query)
    score = self.searcher.search(query, 1).scoreDocs
    doc = self.searcher.doc(score[0].doc)
    return doc.get('name-sid'), doc.get('contents')
def lucene_retrieval_multifield(q_string, q_class, feature_type, use_BM25=False): """ multifield: different query string for different field not same word on different field :param q_string: :param feature_type: :param use_BM25: :return: retrieval_scores for each question-answer pair """ index = set_lucene_index['ind'] # nonlocal variable index def retrieval_scores(hists): """ return sorted document+score by score :param hists: """ def doc_score(hists): """ return doc_name & score :param hists: """ for h in hists: # docID = h.doc # doc = searcher.doc(docID) # file_name = doc.get("corpus_name") # doc_name = doc.get("doc_name") # text = doc.get("text") score = h.score # yield (file_name, doc_name, score, text) yield score doc_score_list = list(doc_score(hists)) return map(lambda f: f(doc_score_list), feature_type) # feature_type is a list of function text_query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string)) subject_query = QueryParser(version, 'corpus_name', analyzer).parse(QueryParser.escape(q_class)) query = BooleanQuery() # BooleanClause.Occur # MUST implies that the keyword must occur # SHOULD implies that the keyword SHOULD occur query.add(text_query, BooleanClause.Occur.SHOULD) query.add(subject_query, BooleanClause.Occur.SHOULD) # search reader = IndexReader.open(index) searcher = IndexSearcher(reader) if use_BM25: searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75)) # todo: BM25 parameters collector = TopScoreDocCollector.create(hitsPerPage, True) searcher.search(query, collector) hs = collector.topDocs().scoreDocs # hists results = retrieval_scores(hs) # reader.close() return results # retrieval_scores for each question-answer pair
def document_to_query(self, doc): """ Given a document it transforms the source code related fields to a lucene query string""" query = "" for field in [ "description" ]: #여기의 필드가 description 으로 설정 했고... 맨 끝에서 field, term이런식으로 넣으니.. 중복이 많음.. for val in doc.getFields(field): if val.stringValue().strip(): term = QueryParser.escape(val.stringValue()) #tokenize term = self.tokenize_string(StandardAnalyzer(), term) #CamelCase temp = [] for t in term: temp += self.camel_case_split(t) #stopwords temp_2 = [] for t in temp: if t not in english_stop_words: temp_2.append(t) #stemming temp_3 = [] for t in temp_2: temp_3.append(stem(t)) #stopwords temp_4 = [] for t in temp_3: if t not in english_stop_words: temp_4.append(t) #query generation for term in temp_4: query += "%s:%s " % (field, term) for field in [ "typed_method_call", "methods", "used_classes", "class_instance_creation", "methods_called" ]: # "extends", "annotations", "literals" for val in doc.getFields(field): if val.stringValue().strip(): term = QueryParser.escape(val.stringValue( )) #이 자리에서 Unified Query 정제 되나 한번 보자...... stoplist = ["java.lang.Object"] if term not in stoplist: query += "%s:%s " % (field, term) if len(doc.getFields("code_hints")) > 0: hints = [ hint.stringValue() for hint in doc.getFields("code_hints") ] hints_str = " ".join(hints) for term in hints: if term: term = QueryParser.escape(term) if term not in english_stop_words: # print "Including 'code_hints' from Doc_To_Query TERMs... //", term query += "code_hints:%s " % term return query
def retrieve_wiki(text_query, searcher, analyzer):
    txt = text_query
    try:
        query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(txt)
    except:
        qp = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer)
        txt = qp.escape(txt)
        query = qp.parse(txt)
    scoreDocs = searcher.search(query, 1000).scoreDocs
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        yield doc.get('title'), doc.get('contents')
def author_search(qry, limit): helper.initPyLucene() RNLP_ctxt = _get_rnlp_ctxt() entry_map = RNLP_ctxt.get_entry_map() rootdir = OUT_RAW_DIR from org.apache.lucene.index import DirectoryReader from org.apache.lucene.search import IndexSearcher from org.apache.lucene.queryparser.classic import QueryParser from org.apache.lucene.analysis.standard import StandardAnalyzer from org.apache.lucene.store import FSDirectory from org.apache.lucene.util import Version from java.io import File reader = DirectoryReader.open(FSDirectory.open(File(IDX_DIR))) searcher = IndexSearcher(reader) analyzer = StandardAnalyzer(Version.LUCENE_40) field = 'contents' parser = QueryParser(Version.LUCENE_40, field, analyzer); query = parser.parse(qry); print 'Searching for:', query.toString(field) raw_results = searcher.search(query, limit) hits = raw_results.scoreDocs numTotalHits = raw_results.totalHits print numTotalHits, 'total matching documents' results = {} for hit in hits: doc = searcher.doc(hit.doc) entry_id = doc.get('entry_id') entry = entry_map.get(entry_id) short_title = entry['short_title'] print(entry['prim_author']) if qry in entry['prim_author'].lower(): fname = short_title + CONTENT_EXT results[entry_id] = {'title': short_title, 'file': fname } f = open ('/Users/Nelle/Documents/coding/text_analysis/newsvn/RenaissanceNLP/data/dataResults/authorPaths/' + qry + '.json', 'w') f.write(json.dumps(results)) f.close() return json.dumps(results)
def perform_search(self, searchterm):
    # processing a query
    parser = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer)
    parser.setDefaultOperator(QueryParser.Operator.AND)
    query = parser.parse(searchterm)

    # conducting search
    searcher = IndexSearcher(DirectoryReader.open(self.store))
    start = datetime.now()
    scoreDocs = searcher.search(query, 50).scoreDocs
    duration = datetime.now() - start

    print scoreDocs
    print duration
def do_mapping(line):
    regex = re.match(r"(?P<netflix_id>[0-9]+),(?P<year>([0-9]+)|NULL),(?P<title>.+)", line)
    if not regex:
        raise ValueError(line)
    netflix_id = int(regex.group("netflix_id"))
    title = QueryParser.escape(regex.group("title"))
    query1 = QueryParser(Version.LUCENE_CURRENT, "title", analyzer).parse(title)
    year = regex.group("year")
    if year == "NULL":
        scoreDocs = searcher.search(query1, 1).scoreDocs
    else:
        year = int(year)
        query2 = NumericRangeQuery.newIntRange("year", year, year, True, True)
        booleanQuery = BooleanQuery()
        booleanQuery.add(query1, BooleanClause.Occur.MUST)
        booleanQuery.add(query2, BooleanClause.Occur.MUST)
        scoreDocs = searcher.search(booleanQuery, 1).scoreDocs
    if scoreDocs:
        if scoreDocs[0].score > 1.5:
            doc = searcher.doc(scoreDocs[0].doc)
            doc_id = doc.getField("id").stringValue()
            doc.add(StringField("netflix_id", str(netflix_id), Field.Store.YES))
            writer.updateDocument(Term("id", doc_id), doc)
def run(self, searcher, analyzer, rawQuery):
    # escape special characters
    query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(QueryParser.escape(rawQuery))
    scoreDocs = searcher.search(query, 50).scoreDocs
    print "%s total matching documents." % len(scoreDocs)
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        print 'path:', doc.get("path"), 'name:', doc.get("name")
def form_new_query_from_rf(self, relevant_doc_ids):
    firstSet = True
    new_query = set()
    for id in relevant_doc_ids:
        doc = self.searcher.doc(id)
        contents = re.sub('[/\*&^%$#@?\'`":()<>]', " ", doc.get("title")).strip()
        query = QueryParser(Version.LUCENE_CURRENT, "contents", self.analyzer).parse(contents)
        keywords = query.toString().split("contents:")
        keywords_set = set()
        for k in keywords:
            if k.strip() != "":
                keywords_set.add(k)
        if firstSet:
            new_query = set(keywords_set)
        else:
            new_query = new_query & set(keywords_set)
        firstSet = False
    return " ".join(new_query)
def similarityOfSynopsis(self): directory = SimpleFSDirectory(File(settings.SYNOPSIS_INDEX)) ireader = DirectoryReader.open(directory) searcher = IndexSearcher(ireader) analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer) for root, dirnames, filenames in os.walk(settings.SYNOPSIS): filenames = [int(item) for item in filenames] filenames.sort() filenames = [str(item) for item in filenames] for filename in filenames: path = os.path.join(root, filename) major_movie = models.Movie.objects.get(pk=filename) with open(path, 'r') as moviedoc: content = moviedoc.read().replace('\n', ' ') content = re.sub('[^A-Za-z0-9 ]+', '', content) while True: try: query = queryParser.parse(QueryParser.escape(content)) except Exception as e: self.boolean_query.setMaxClauseCount(self.boolean_query.maxClauseCount * 2) print self.boolean_query.maxClauseCount continue break topDocs = searcher.search(query, len(filenames)) scoreDocs = topDocs.scoreDocs for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) movie_id = int(doc.get(FIELD_PATH)) if movie_id <= major_movie.id: continue minor_movie = models.Movie.objects.get(pk=movie_id) try: similarity = models.Similarities.objects.filter(first_movie=major_movie, second_movie=minor_movie).first() if not similarity: similarity = models.Similarities.objects.filter(first_movie=minor_movie, second_movie=major_movie).first() similarity.synopsis = scoreDoc.score similarity.save() except Exception as e: print major_movie.id, minor_movie.id raise e print u"{0} completed.".format(major_movie.id)
def __init__(self):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    indexDir = SimpleFSDirectory(File(app.config['INDEX_PATH']))
    self.searcher = IndexSearcher(DirectoryReader.open(indexDir))
    self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    self.parser = QueryParser(Version.LUCENE_CURRENT, "contents", self.analyzer)
def custom_search(qry, limit): helper.initPyLucene() RNLP_ctxt = _get_rnlp_ctxt() entry_map = RNLP_ctxt.get_entry_map() rootdir = OUT_RAW_DIR from org.apache.lucene.index import DirectoryReader from org.apache.lucene.search import IndexSearcher from org.apache.lucene.queryparser.classic import QueryParser from org.apache.lucene.analysis.standard import StandardAnalyzer from org.apache.lucene.store import FSDirectory from org.apache.lucene.util import Version from java.io import File reader = DirectoryReader.open(FSDirectory.open(File(IDX_DIR))) searcher = IndexSearcher(reader) analyzer = StandardAnalyzer(Version.LUCENE_40) field = 'contents' parser = QueryParser(Version.LUCENE_40, field, analyzer); query = parser.parse(qry); print 'Searching for:', query.toString(field) raw_results = searcher.search(query, limit) hits = raw_results.scoreDocs numTotalHits = raw_results.totalHits print numTotalHits, 'total matching documents' print rootdir results = {} for hit in hits: doc = searcher.doc(hit.doc) entry_id = doc.get('entry_id') entry = entry_map.get(entry_id) short_title = entry['short_title'] year = entry['publ_year'] fname = short_title + CONTENT_EXT results[fname] = year;
class SearchIndex(object): def __init__(self): vm_env = lucene.getVMEnv() vm_env.attachCurrentThread() indexDir = SimpleFSDirectory(File(app.config['INDEX_PATH'])) self.searcher = IndexSearcher(DirectoryReader.open(indexDir)) self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) self.parser = QueryParser(Version.LUCENE_CURRENT, "contents", self.analyzer) def search(self, q, page = 1, duplicates = False): query = self.parser.parse(q) if not duplicates: query = self.addDuplicatesQuery(query) perPage = 10 start = (page - 1) * perPage results = TopScoreDocCollector.create(1000, True) self.searcher.search(query, results) highlighter = Highlighter(QueryScorer(query)) highlighter.setTextFragmenter(SimpleFragmenter(40)) docs = [] for scoreDoc in results.topDocs(start, perPage).scoreDocs: doc = self.searcher.doc(scoreDoc.doc) tokenStream = self.analyzer.tokenStream("contents", StringReader(doc['contents'])) highlight = highlighter.getBestFragments(tokenStream, doc['contents'], 3, "...") docs.append({ 'title': doc['title'], 'url': doc['url'], 'duplicate': doc['duplicate'], 'highlight': highlight} ) del self.searcher totalPages = int(math.ceil(results.getTotalHits()/float(perPage))) return totalPages, docs def addDuplicatesQuery(self, query): not_duplicate = TermQuery(Term('duplicate', 'false')) booleanQuery = BooleanQuery() booleanQuery.add(not_duplicate, BooleanClause.Occur.MUST) booleanQuery.add(query, BooleanClause.Occur.MUST) return booleanQuery
def findTopClasses(self): propertyURI = RDFS.SUBCLASSOF allClasses = list() topClasses = list() try: analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) parser = QueryParser(Version.LUCENE_CURRENT, FreyaConstants.PROPERTY_FEATURE_LKB, analyzer) query = parser.parse("\"" + QueryParser.escape(propertyURI) + "\"") result = self._searcher.search(query, 1) logging.debug("For " + str(query) + " : " + str(result.totalHits)) freq = result.totalHits if freq > 0: result = self._searcher.search(query, freq) hits = pyJava.JArray2List(result.scoreDocs) # for (ScoreDoc hit : hits) { indexus = 0 while indexus < len(hits): hit = hits[indexus] doc = self._searcher.doc(hit.doc) allClasses.append(doc.get(FreyaConstants.CLASS_FEATURE_LKB)) indexus += 1 # for (String classUri : allClasses) { indexus = 0 while indexus < len(allClasses): classUri = allClasses[indexus] logging.info("Checking whether " + classUri + " is a top class.") # search inst and pred retrieve class # if class exists that means it is not top class otherwise add to # topClasses classes = self.searchForClass(classUri, propertyURI) logging.info("top classes:" + str(len(classes))) if classes != None or len(classes) > 0: logging.info("This is not a top class...") else: topClasses.append(classUri) logging.info("Adding " + classUri + " to top classes.") indexus += 1 except Exception as e:#CorruptIndexException(e): print e.message logging.error("Error") return topClasses
def do_search(qry, limit): helper.initPyLucene() RNLP_ctxt = _get_rnlp_ctxt() entry_map = RNLP_ctxt.get_entry_map() from org.apache.lucene.index import DirectoryReader from org.apache.lucene.search import IndexSearcher from org.apache.lucene.queryparser.classic import QueryParser from org.apache.lucene.analysis.standard import StandardAnalyzer from org.apache.lucene.store import FSDirectory from org.apache.lucene.util import Version from java.io import File print os.path.abspath(os.path.pardir) reader = DirectoryReader.open(FSDirectory.open(File(IDX_DIR))) searcher = IndexSearcher(reader) analyzer = StandardAnalyzer(Version.LUCENE_40) field = 'contents' parser = QueryParser(Version.LUCENE_40, field, analyzer); query = parser.parse(qry); print 'Searching for:', query.toString(field) raw_results = searcher.search(query, limit) hits = raw_results.scoreDocs numTotalHits = raw_results.totalHits print numTotalHits, 'total matching documents' results = [] for hit in hits: doc = searcher.doc(hit.doc); entry_id = doc.get('entry_id') entry = entry_map.get(entry_id) #print 'entry:', entry score = hit.score #print 'Hit:', entry['short_title'], score results.append((score, doc, entry)) return results
def search(termo, **args):
    indexDir = os.environ.get('MANDEX') or '3iteracao'
    fsDir = SimpleFSDirectory(File(indexDir))
    searcher = IndexSearcher(DirectoryReader.open(fsDir))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    parser = QueryParser(Version.LUCENE_CURRENT, field, analyzer)
    parser.setDefaultOperator(QueryParser.Operator.OR)
    query = parser.parse(termo + ' '.join(args.values()))
    start = datetime.now()
    scoreDocs = searcher.search(query, 50).scoreDocs
    duration = datetime.now() - start
    politicos = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        table = dict((field.name(), field.stringValue()) for field in doc.getFields())
        politicos.append(table)
    return politicos
def perform_search(self, searchterm, results_per_page, page): # if there is a field in the searchterm """if ":" in searchterm: # processing a query parser = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer) parser.setDefaultOperator(QueryParser.Operator.AND) query = parser.parse(searchterm) else: query = BooleanQuery() query_title = TermQuery(Term("title", searchterm)) query_description = TermQuery(Term("description", searchterm)) query_content = TermQuery(Term("content", searchterm)) # BooleanClause.Occur.MUST for AND queries query.add(query_title, BooleanClause.Occur.SHOULD) query.add(query_description, BooleanClause.Occur.SHOULD) query.add(query_content, BooleanClause.Occur.SHOULD)""" # create QueryParser for each field to be searched parser_title = QueryParser(Version.LUCENE_CURRENT, "title", self.analyzer) parser_description = QueryParser(Version.LUCENE_CURRENT, "description", self.analyzer) parser_content = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer) # put fields together query = BooleanQuery() query.add(parser_title.parse(searchterm), BooleanClause.Occur.SHOULD) query.add(parser_description.parse(searchterm), BooleanClause.Occur.SHOULD) query.add(parser_content.parse(searchterm), BooleanClause.Occur.SHOULD) # conducting search searcher = IndexSearcher(DirectoryReader.open(self.store)) start = datetime.now() hits = searcher.search(query, results_per_page + (results_per_page * page)) score_docs = hits.scoreDocs count_results = hits.totalHits duration = datetime.now() - start # results to return results = [] count = 0 for scoreDoc in score_docs: # skip offset if count < results_per_page * page: count += 1 continue count += 1 doc = searcher.doc(scoreDoc.doc) table = dict((field.name(), field.stringValue()) for field in doc.getFields()) results.append(table) return results, duration, count_results
def getQueryParser(self):
    analyzers = self.getSearchingAnalyzers()
    map = HashMap()
    map.put('name', analyzers['name'])
    map.put('parent', analyzers['parent'])
    map.put('content', analyzers['default'])
    map.put('id', analyzers['id'])
    analyzerWrapper = PerFieldAnalyzerWrapper(analyzers['default'], map)
    queryParser = QueryParser(Version.LUCENE_CURRENT, 'content', analyzerWrapper)
    queryParser.setAutoGeneratePhraseQueries(PHRASE_QUERY_BY_DEFAULT)
    queryParser.setPhraseSlop(PHRASE_SLOP)
    queryParser.setFuzzyMinSim(FUZZY_MIN_SIM)
    queryParser.setDefaultOperator(DEFAULT_OPERATOR)
    return queryParser
def __BM25(self, searcher, rawQuery):
    '''retrieve documents with a single query'''
    if 'Standard' in self.indexFile:
        # build a standard analyzer with default stop words
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    if 'Porter' in self.indexFile:
        analyzer = PorterStemmerAnalyzer()
    # escape special characters
    query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(QueryParser.escape(rawQuery))
    scoreDocs = searcher.search(query, 100).scoreDocs
    docList = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        docList.append(doc.get("name"))
    return docList
def lucene_retrieval(q_string, feature_type, use_BM25=False): """ :param q_string: :param feature_type: :param use_BM25: :return: retrieval_scores for each question-answer pair """ index = set_lucene_index['ind'] # nonlocal variable index def retrieval_scores(hists): """ return sorted document+score by score :param hists: """ def doc_score(hists): """ return doc_name & score :param hists: """ for h in hists: # docID = h.doc # doc = searcher.doc(docID) # file_name = doc.get("corpus_name") # doc_name = doc.get("doc_name") # text = doc.get("text") score = h.score # yield (file_name, doc_name, score, text) yield score doc_score_list = list(doc_score(hists)) return map(lambda f: f(doc_score_list), feature_type) if len(doc_score_list) != 0 else [0]*len(feature_type) # feature_type is a list of function # escape special characters via escape function query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string)) # search reader = IndexReader.open(index) searcher = IndexSearcher(reader) if use_BM25: searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75)) # todo: BM25 parameters collector = TopScoreDocCollector.create(hitsPerPage, True) searcher.search(query, collector) hs = collector.topDocs().scoreDocs # hists results = retrieval_scores(hs) # reader.close() return results # retrieval_scores for each question-answer pair
def extract_phrase_query(self, q, field, slop=0, boost=5):
    phrases = re.findall(r'"([^"]*)"', q)
    if len(phrases) == 0:
        return None, q
    q = re.sub(r'"([^"]*)"', "", q).strip()  # query without phrases
    if self.verbose:
        print "Detected phrases: ", phrases
    bq = BooleanQuery()
    for phrase in phrases:
        # pq = PhraseQuery()
        # for term in filter(None, phrase.split(' ')):
        #     pq.add(Term(field, term))
        qparser = QueryParser(Version.LUCENE_CURRENT, field, self.analyzer)
        # parse phrase - this may or may not be desired
        # pq = qparser.parse(field + ':"' + phrase + '"')
        pq = qparser.parse('%s "%s"~%d^%.1f' % (phrase, phrase, slop, boost))
        # phrase queries have high priority
        bq.add(pq, BooleanClause.Occur.MUST)
        # bq.add(pq, BooleanClause.Occur.SHOULD)
    return bq, q
def lucene_retrieval(q_string, use_BM25=False): """ :param q_string: :param use_BM25: :return: retrieval_scores for each question-answer pair """ index = set_lucene_index['ind'] # nonlocal variable index def doc_text(hists): """ return doc_name & score :param hists: """ text = '_NONE_' for h in hists: docID = h.doc doc = searcher.doc(docID) # file_name = doc.get("corpus_name") # doc_name = doc.get("doc_name") text = doc.get("text") #score = h.score # yield (file_name, doc_name, score, text) return text result = '_NONE_' # escape special characters via escape function if q_string and q_string.strip(): # when pre-process answers, `none of the above` -> '' cause error here #print(q_string) query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string)) # search reader = IndexReader.open(index) searcher = IndexSearcher(reader) if use_BM25: searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75)) # todo: BM25 parameters collector = TopScoreDocCollector.create(hitsPerPage, True) searcher.search(query, collector) hs = collector.topDocs().scoreDocs # hists result = doc_text(hs) # reader.close() return result # text: also nodes
def __init__(self, queries=[], criteria=[], conjunctions=[], orderby=["ta"],
             ascending=True, limit=10000):
    vm.attachCurrentThread()
    self.queries = [query for query in queries if len(query.strip()) > 0]
    self.criteria = criteria
    self.conjunctions = conjunctions
    self.orderby = orderby
    self.ascending = ascending
    self.queryString = ""
    self.limit = limit
    self.fields = fields
    self.analyzer = PorterStemmerAnalyzer()
    self.queryParser = QueryParser(Version.LUCENE_30, "freetext", self.analyzer)
    self.queryParser.setAllowLeadingWildcard(True)
    self.queryParser.setDefaultOperator(QueryParser.Operator.AND)
    indexDir = settings.LUCENE_INDEX_DIRECTORY
    self.index = MMapDirectory(File(indexDir))