def testScore(self):
    reactor = CallTrace('reactor')
    settings = LuceneSettings(commitCount=1, similarity=TermFrequencySimilarity(), verbose=False)
    lucene = Lucene(join(self.tempdir, 'lucene'), reactor=reactor, settings=settings)
    document = Document()
    document.add(TextField('field', 'x ' * 100, Field.Store.NO))
    returnValueFromGenerator(lucene.addDocument(identifier="identifier", document=document))

    q = TermQuery(Term("field", 'x'))
    result = returnValueFromGenerator(lucene.executeQuery(q))
    self.assertAlmostEqual(0.1, result.hits[0].score)

    q.setBoost(10.0)
    result = returnValueFromGenerator(lucene.executeQuery(q))
    self.assertAlmostEqual(1, result.hits[0].score)
def testUnqualifiedTermFields(self):
    composer = LuceneQueryComposer(unqualifiedTermFields=[("field0", 0.2), ("field1", 2.0)], luceneSettings=LuceneSettings())
    ast = parseCql("value")
    result = composer.compose(ast)

    query = BooleanQuery()
    left = TermQuery(Term("field0", "value"))
    left.setBoost(0.2)
    query.add(left, BooleanClause.Occur.SHOULD)
    right = TermQuery(Term("field1", "value"))
    right.setBoost(2.0)
    query.add(right, BooleanClause.Occur.SHOULD)

    self.assertEquals(type(query), type(result))
    self.assertEquals(repr(query), repr(result))
def runSearch(self, runCount, mainThread=False):
    """ search for runCount number of times """

    # problem: if there are any assertion errors in the child
    # thread, the calling thread is not notified and may still
    # consider the test case passed. We are using self.totalQueries
    # to double check that work has actually been done.

    if not mainThread:
        getVMEnv().attachCurrentThread()
    time.sleep(0.5)

    searcher = self.getSearcher()
    try:
        for word, count in self.testData[0:runCount]:
            query = TermQuery(Term("field", word))
            topDocs = searcher.search(query, 50)
            self.assertEqual(topDocs.totalHits, count)
            self.lock.acquire()
            self.totalQueries += 1
            self.lock.release()
    finally:
        del searcher
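# A minimal usage sketch, not part of the original test: it illustrates the point of
# the comment above, i.e. running runSearch() on worker threads and then using
# self.totalQueries to confirm the expected amount of work really happened, since
# assertion errors raised in child threads would otherwise go unnoticed. The method
# name, thread count, and runCount are illustrative assumptions, and `threading` is
# assumed to be imported.
def testSearchFromThreads(self):
    threads = [threading.Thread(target=self.runSearch, args=(10,)) for _ in range(4)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    # double-check that every thread actually executed its queries
    self.assertEqual(4 * 10, self.totalQueries)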
def main():
    global lucene_vm_init
    if not lucene_vm_init:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True

    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)

    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)

    # load index into search engine
    reader = DirectoryReader.open(index_mm)
    searcher1 = IndexSearcher(reader)
    searcher1.setSimilarity(BM25Similarity())
    searcher2 = IndexSearcher(reader)
    w = IndexWriter(index_mm, config)

    # read query
    read_query()

    # initialize mongodb client
    mongoObj = Mongo_Object('localhost', 27017)

    # search
    docDup = set()
    finalDup = {}

    for i in xrange(len(queries)):
        print 'process query %d' % (i)
        query = queries[i]
        querystr = stemSentence(query[3])

        # build searcher
        q_lucene = QueryParser("all_text", analyzer).parse(querystr)
        collector = TopScoreDocCollector.create(hitsPerPage)
        searcher1.search(q_lucene, collector)
        hits = collector.topDocs().scoreDocs

        # find candidate results after 1st round filter
        docDup.clear()
        for j in xrange(len(hits)):
            docID = hits[j].doc
            d = searcher1.doc(docID)
            if d['title'] in docDup:
                finalDup[d['title']] = d
                continue
            docDup.add(d['title'])

        docDup.clear()
        for j in xrange(len(hits)):
            docID = hits[j].doc
            d = searcher1.doc(docID)
            title = d['title']
            if d['title'] in docDup:
                continue
            docDup.add(title)

            item = (mongoObj.conn_me).find_one({'title': title})
            if item is None:
                continue

            entitylist = item['entitylist'].split('|')
            for en_title in entitylist:
                if title == en_title:
                    continue
                t = Term('title', en_title)
                q = TermQuery(t)
                docs = searcher2.search(q, 2)
                if docs.totalHits <= 1:
                    continue
                docID2 = (docs.scoreDocs)[0].doc
                doc = searcher2.doc(docID2)
                finalDup[doc['title']] = doc

    print 'begin to clean index, there are %d dup records' % (len(finalDup))
    for title in finalDup:
        doc = finalDup[title]
        # title,name,value,category,skos_category,all_text,raw_name,raw_value,abstract
        name = doc['name']
        value = doc['value']
        category = doc['category']
        skos_category = doc['skos_category']
        all_text = doc['all_text']
        raw_name = doc['raw_name']
        raw_value = doc['raw_value']
        abstract = doc['abstract']

        print 'process ' + title
        t = Term('title', title)
        q = TermQuery(t)
        w.deleteDocuments(q)
        addDoc(w, title, name, value, category, skos_category, all_text, raw_name, raw_value, abstract)

    # process remaining records
    #global batch,cnt_batch
    #if cnt_batch>0:
    #    w.addDocuments(batch)
    #    cnt_batch=0
    #    del batch[:]
    w.close()
def main():
    # constants
    FIELD_CONTENTS = "vectext"
    DOC_NAME = "identifier"
    STORE_DIR = "../full_index1"

    lucene.initVM()
    store = SimpleFSDirectory(Paths.get(STORE_DIR))
    ireader = DirectoryReader.open(store)  # , True)
    #print(ireader.readerIndex(0))
    searcher = IndexSearcher(ireader)  # self.getSearcher()

    pickle_file = glob.glob('full_word_list.pkl')
    print(pickle_file)

    date_range = (1785, 1805)
    bigrams = False
    remake_word_list = True
    if remake_word_list:  # not pickle_file:
        full_df = get_full_df()
        full_term_data = []
        for year in range(date_range[0], date_range[1]):
            docs_in_year = get_docs_in_year(full_df, year)
            #print(docs_in_year)
            year_dict = Counter({})
            terms = []
            freqs = []
            print(year)
            for cd, doc_id in enumerate(docs_in_year):
                #if not cd % 100:
                #    print(cd, '--', len(docs_in_year))

                # get document (query by id)
                q = TermQuery(Term("identifier", doc_id + '_djvu.txt'))
                topDocs = searcher.search(q, 50000)

                #termvec = reader.getTermVector(topDocs.scoreDocs[0].doc, "all")
                one_doc = topDocs.scoreDocs[0].doc
                doc_name = searcher.doc(one_doc)
                #print(doc_name, doc_id)

                if bigrams == False:
                    termvec = ireader.getTermVector(topDocs.scoreDocs[0].doc, FIELD_CONTENTS)
                    if termvec != None:
                        #termvec = reader.getTermVector(topDocs.scoreDocs[0].doc, "all")
                        termsEnum = termvec.iterator()
                        for term in BytesRefIterator.cast_(termsEnum):
                            terms.append(term.utf8ToString())
                            freqs.append(termsEnum.totalTermFreq())
                else:
                    #print(doc_name, doc_id)
                    text = doc_name.get("text")
                    text = text.split()
                    text = strip_stopwords_punc(text)
                    for word1, word2 in zip(text[:-1], text[1:]):
                        if len(word1) + len(word2) > 6:
                            try:
                                year_dict[word1 + ' ' + word2] += 1
                            except:
                                year_dict[word1 + ' ' + word2] = 1

            if bigrams == False:
                for term, freq in zip(terms, freqs):
                    try:
                        year_dict[term] += freq
                    except:
                        year_dict[term] = freq

            print(len(year_dict))
            #print(year_dict)
            for term in list(year_dict):
                if year_dict[term] < 2:  # 5 and term not in stopwords:
                    year_dict.pop(term)
            full_term_data.append(year_dict)
            print(len(year_dict))
            #year_dict = year_dict + doc_dict
            #print(year_dict.most_common(1000))
            print('\n\n')

        if bigrams:
            pickle.dump(full_term_data, open('full_bigram_list.pkl', 'wb'))
        else:
            pickle.dump(full_term_data, open('full_word_list.pkl', 'wb'))
    else:
        if bigrams:
            full_term_data = pickle.load(open('full_bigram_list.pkl', 'rb'))
        else:
            full_term_data = pickle.load(open('full_word_list.pkl', 'rb'))

    # get complete list of unique words
    # top_words_year = zscore_method(full_term_data, date_range)
    top_words_year = tfidf_method(full_term_data, date_range)
    print(top_words_year)

    pickle.dump(top_words_year, open('trending_ratio.pkl', 'wb'))
def testOneTerm(self):
    self.assertConversion(['term'], 'term')
    self.assertEquals(1, len(self.observer.calledMethods))
    self.assertEquals("executeQuery", self.observer.calledMethods[0].name)
    self.assertEquals(repr(TermQuery(Term('field', 'term'))), repr(self.observer.calledMethods[0].kwargs['luceneQuery']))
import lucene

from java.nio.file import Paths
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher, BooleanClause, BooleanQuery, TermQuery
from org.apache.lucene.index import DirectoryReader, Term

input_q = input().replace('ی', 'ي')

lucene.initVM()
index_path = Paths.get('./lucene.index')
question_field = 'question'
answer_field = 'answer'

directory = SimpleFSDirectory(index_path)
searcher = IndexSearcher(DirectoryReader.open(directory))

query_builder = BooleanQuery.Builder()
for q_word in input_q.split(' '):
    qtq = TermQuery(Term(question_field, q_word))
    query_builder.add(BooleanClause(qtq, BooleanClause.Occur.SHOULD))
query = query_builder.build()

top_n = 5
scoreDocs = searcher.search(query, top_n).scoreDocs

print('found nums: ', len(scoreDocs))
for scoreDoc in scoreDocs:
    doc = searcher.doc(scoreDoc.doc)
    print('Best Match: ', doc.get(question_field), '\n')
    print('Answer: ', doc.get(answer_field))
    print('---------------------\n')
def testOneTermOutputWithANumber(self):
    self.assertConversion(TermQuery(Term("unqualified", "2005")), "2005")
def testOneTermOutput(self):
    self.assertConversion(TermQuery(Term("unqualified", "cat")), "cat")
t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

with codecs.open(json_file, encoding='utf8') as f:
    for line in tqdm(f):
        line = line.strip()
        try:
            json_doc = json.loads(line)
        except:
            glog.warning('Error json parsing: {}'.format(line))
            continue

        # Delete existing abstracts. This is useful when adding
        # update files from Medline.
        try:
            assert 'pmid' in json_doc
            pmid_query = TermQuery(Term('pmid', json_doc['pmid']))
            pmcid_query = TermQuery(Term('pmcid', json_doc['pmcid']))
            id_query = IntPoint.newRangeQuery("id", json_doc['id'], json_doc['id'])
            bq = BooleanQuery.Builder()
            bq.add(pmid_query, BooleanClause.Occur.MUST)
            bq.add(pmcid_query, BooleanClause.Occur.MUST)
            bq.add(id_query, BooleanClause.Occur.MUST)
            q = bq.build()
            writer.deleteDocuments(q)

            # Add whole abstract.
            doc = Document()
            # Store field.
            doc.add(IntPoint('id', json_doc['id']))     # index
            doc.add(StoredField('id', json_doc['id']))  # store
def search(self, query, topn=None):
    general_query = query['general_query']
    purpose_is = query['purpose_is']
    purpose_is_not = query['purpose_is_not']
    mechanics_is = query['mechanics_is']
    mechanics_is_not = query['mechanics_is_not']

    self.purpose_is_w2v = Word2Vec(purpose_is).words
    self.purpose_is_not_w2v = Word2Vec(purpose_is_not).words
    self.mechanics_is_w2v = Word2Vec(mechanics_is).words
    self.mechanics_is_not_w2v = Word2Vec(mechanics_is_not).words

    topn = self.topn if topn is None else topn

    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()

    bool_query = BooleanQuery.Builder()
    if general_query:
        query_description = QueryParser('description', self._analyzer).parse(general_query)
        query_abstract = QueryParser('abstract', self._analyzer).parse(general_query)
        query_title = QueryParser('title', self._analyzer).parse(general_query)
        query_id = TermQuery(Term('uid', general_query))
        bool_query.add(query_description, BooleanClause.Occur.SHOULD)
        bool_query.add(query_abstract, BooleanClause.Occur.SHOULD)
        bool_query.add(query_title, BooleanClause.Occur.SHOULD)
        bool_query.add(query_id, BooleanClause.Occur.SHOULD)
    if purpose_is:
        query_purpose_is_w2v = QueryParser('purpose', self._analyzer).parse(purpose_is + self.purpose_is_w2v)
        bool_query.add(query_purpose_is_w2v, BooleanClause.Occur.SHOULD)
    if purpose_is_not:
        query_purpose_is_not_w2v = QueryParser('purpose', self._analyzer).parse(purpose_is_not + self.purpose_is_not_w2v)
        bool_query.add(query_purpose_is_not_w2v, BooleanClause.Occur.MUST_NOT)  # MAYBE NEEDS TO BE BooleanClause.Occur.SHOULD
    if mechanics_is:
        query_mechanics_is_w2v = QueryParser('mechanics', self._analyzer).parse(mechanics_is + self.mechanics_is_w2v)
        bool_query.add(query_mechanics_is_w2v, BooleanClause.Occur.SHOULD)
    if mechanics_is_not:
        query_mechanics_is_not_w2v = QueryParser('mechanics', self._analyzer).parse(mechanics_is_not + self.mechanics_is_not_w2v)
        bool_query.add(query_mechanics_is_not_w2v, BooleanClause.Occur.MUST_NOT)

    docs = self._searcher.search(bool_query.build(), topn).scoreDocs

    result = []
    for doc in docs:
        doc = self._searcher.doc(doc.doc)
        result.append({
            'id': doc.get('id'),
            'date': doc.get('date'),
            'title': doc.get('title'),
            'author': doc.get('author'),
            'icn': doc.get('icn'),
            'organization': doc.get('organization'),
            'acn': doc.get('acn'),
            'abstract': doc.get('abstract'),
            'description': doc.get('description'),
            'purpose': doc.get('patent purpose'),
            'mechanics': doc.get('patent mechanics'),
            'uid': doc.get('uid')
        })
    return result
def testSimple(self):
    writer = self.getWriter(analyzer=SimpleAnalyzer())

    doc = Document()
    field = Field("foo", "", TextField.TYPE_NOT_STORED)
    doc.add(field)
    dvField = FloatDocValuesField("foo_boost", 0.0)
    doc.add(dvField)
    field2 = Field("bar", "", TextField.TYPE_NOT_STORED)
    doc.add(field2)

    field.setStringValue("quick brown fox")
    field2.setStringValue("quick brown fox")
    dvField.setFloatValue(2.0)  # boost x2
    writer.addDocument(doc)

    field.setStringValue("jumps over lazy brown dog")
    field2.setStringValue("jumps over lazy brown dog")
    dvField.setFloatValue(4.0)  # boost x4
    writer.addDocument(doc)

    reader = writer.getReader()
    writer.close()

    # no boosting
    searcher1 = self.getSearcher(reader=reader)
    base = searcher1.getSimilarity(True)

    # boosting
    searcher2 = self.getSearcher(reader=reader)

    class _similarity(PythonPerFieldSimilarityWrapper):

        def __init__(_self, base):
            super(_similarity, _self).__init__()
            _self.base = base
            _self.fooSim = BoostingSimilarity(base, "foo_boost")

        def get(_self, field):
            return _self.fooSim if "foo" == field else _self.base

    searcher2.setSimilarity(_similarity(base))

    # in this case, we searched on field "foo". first document should have
    # 2x the score.
    tq = TermQuery(Term("foo", "quick"))
    noboost = searcher1.search(tq, 10)
    boost = searcher2.search(tq, 10)
    self.assertEqual(1, noboost.totalHits)
    self.assertEqual(1, boost.totalHits)
    self.assertEqual(boost.scoreDocs[0].score, noboost.scoreDocs[0].score * 2.0, SCORE_EPSILON)

    # this query matches only the second document, which should have 4x
    # the score.
    tq = TermQuery(Term("foo", "jumps"))
    noboost = searcher1.search(tq, 10)
    boost = searcher2.search(tq, 10)
    self.assertEqual(1, noboost.totalHits)
    self.assertEqual(1, boost.totalHits)
    self.assertEqual(boost.scoreDocs[0].score, noboost.scoreDocs[0].score * 4.0, SCORE_EPSILON)

    # search on field bar just for kicks, nothing should happen, since
    # we set up our sim provider to only use foo_boost for field foo.
    tq = TermQuery(Term("bar", "quick"))
    noboost = searcher1.search(tq, 10)
    boost = searcher2.search(tq, 10)
    self.assertEqual(1, noboost.totalHits)
    self.assertEqual(1, boost.totalHits)
    self.assertEqual(boost.scoreDocs[0].score, noboost.scoreDocs[0].score, SCORE_EPSILON)

    reader.close()
def testSimilarity(self):
    writer = self.getWriter(analyzer=SimpleAnalyzer(Version.LUCENE_CURRENT),
                            similarity=SimpleSimilarity())

    d1 = Document()
    d1.add(Field("field", "a c", TextField.TYPE_STORED))

    d2 = Document()
    d2.add(Field("field", "a b c", TextField.TYPE_STORED))

    writer.addDocument(d1)
    writer.addDocument(d2)
    writer.commit()
    writer.close()

    searcher = self.getSearcher()
    searcher.setSimilarity(SimpleSimilarity())

    a = Term("field", "a")
    b = Term("field", "b")
    c = Term("field", "c")

    class collector1(PythonCollector):
        def collect(_self, doc, score):
            self.assertEqual(1.0, score)
        def setNextReader(_self, context):
            pass
        def acceptsDocsOutOfOrder(_self):
            return True

    searcher.search(TermQuery(b), collector1())

    bq = BooleanQuery()
    bq.add(TermQuery(a), BooleanClause.Occur.SHOULD)
    bq.add(TermQuery(b), BooleanClause.Occur.SHOULD)

    class collector2(PythonCollector):
        def collect(_self, doc, score):
            self.assertEqual(doc + _self.base + 1, score)
        def setNextReader(_self, context):
            _self.base = context.docBase
        def acceptsDocsOutOfOrder(_self):
            return True

    searcher.search(bq, collector2())

    pq = PhraseQuery()
    pq.add(a)
    pq.add(c)

    class collector3(PythonCollector):
        def collect(_self, doc, score):
            self.assertEqual(1.0, score)
        def setNextReader(_self, context):
            pass
        def acceptsDocsOutOfOrder(_self):
            return True

    searcher.search(pq, collector3())

    pq.setSlop(2)

    class collector4(PythonCollector):
        def collect(_self, doc, score):
            self.assertEqual(2.0, score)
        def setNextReader(_self, context):
            pass
        def acceptsDocsOutOfOrder(_self):
            return True

    searcher.search(pq, collector4())
def strict_search(self, value):
    query = TermQuery(Term("wiki_name", value))
    scoreDocs = self.searcher.search(query, self.num_docs_to_return).scoreDocs
    return scoreDocs
def GET(self, query):
    data_input = web.input()
    page = 0
    if "page" in data_input:
        page = int(data_input["page"])
    render = web.template.render('templates/')
    anses = []
    num_pages = 0

    if use_elasticsearch:
        # importing libraries for Elasticsearch
        from elasticsearch import Elasticsearch
        from elasticsearch_dsl import Search, document, field, connections, Q
        from elasticsearch_dsl.connections import connections
        from booktype import Book

        es = Elasticsearch()
        es.indices.create(index='book-index', ignore=[400, 404])
        connections.create_connection(hosts=['localhost'], timeout=20)
        connections.add_connection('book', es)
        # print(connections.get_connection().cluster.health())
        s = Search(es).index('book-index').doc_type('book').query(
            Q('match', title=query.strip()) |
            Q('match', description=query.strip()) |
            Q("match", userreviews_userReview=query.strip()))
        # note: the pagination slice must be applied before s.execute()
        s = s[page * 10:page * 10 + 10]
        response = s.execute()
        # print 'total number of hits: ', response.hits.total
        num_pages = (response.hits.total / 10) + 1
        for res in response:
            authors = zip(res.authors_name, res.authors_url)
            anses.append({
                'title': res.title,
                'description': res.description.encode('utf-8'),
                'url': res.url,
                'cover': res.cover,
                'authors': authors
            })
    else:
        # importing libraries for Lucene
        import lucene
        from java.io import File
        from org.apache.lucene.index import DirectoryReader, Term
        from org.apache.lucene.queryparser.classic import QueryParser
        from org.apache.lucene.store import SimpleFSDirectory
        from org.apache.lucene.search import IndexSearcher, BooleanClause, BooleanQuery, TermQuery
        from org.apache.lucene.util import Version
        from org.apache.lucene.analysis.standard import StandardAnalyzer
        import os

        # fields
        title_field = 'title'
        description_field = 'description'
        cover_field = 'cover'
        authors_name_field = 'authors_name'
        authors_url_field = 'authors_url'
        url_field = 'url'

        index_folder = '.'
        index_name = 'lucene.index'
        index_path = os.path.join(index_folder, index_name)

        lucene.initVM()
        version = Version.LUCENE_CURRENT
        directory = SimpleFSDirectory(File(index_path))
        searcher = IndexSearcher(DirectoryReader.open(directory))
        analyzer = StandardAnalyzer(version)

        title_tq = TermQuery(Term(title_field, query))
        desc_tq = TermQuery(Term(description_field, query))
        query = BooleanQuery()
        query.add(BooleanClause(title_tq, BooleanClause.Occur.SHOULD))
        query.add(BooleanClause(desc_tq, BooleanClause.Occur.SHOULD))
        scoreDocs = searcher.search(query, 1000).scoreDocs
        num_pages = (len(scoreDocs) / 10) + 1

        for scoreDoc in scoreDocs[page * 10:page * 10 + 10]:
            doc = searcher.doc(scoreDoc.doc)
            authors = zip([doc.get(authors_name_field)], [doc.get(authors_url_field)])
            anses.append({
                'title': doc.get(title_field),
                'description': doc.get(description_field).encode('utf-8'),
                'url': doc.get(url_field),
                'cover': doc.get(cover_field),
                'authors': authors
            })

    return render.index(anses, query, num_pages)
def exists(self, key):
    boolean_query = BooleanQuery.Builder()
    boolean_query.add(TermQuery(Term('key', key)), BooleanClause.Occur.MUST)
    results = self.searcher.search(boolean_query.build(), 1)
    return results.totalHits > 0
def testRangeFilterRand(self):
    index = self.signedIndex
    reader = self.getReader(directory=index.index)
    search = self.getSearcher(reader=reader)

    minRP = self.pad(index.minR)
    maxRP = self.pad(index.maxR)

    numDocs = reader.numDocs()

    self.assertEqual(numDocs, 1 + self.maxId - self.minId, "num of docs")

    q = TermQuery(Term("body", "body"))

    # test extremes, bounded on both ends
    result = search.search(q, _trf("rand", minRP, maxRP, True, True), 50)
    self.assertEqual(numDocs, result.totalHits, "find all")

    result = search.search(q, _trf("rand", minRP, maxRP, True, False), 50)
    self.assertEqual(numDocs - 1, result.totalHits, "all but biggest")

    result = search.search(q, _trf("rand", minRP, maxRP, False, True), 50)
    self.assertEqual(numDocs - 1, result.totalHits, "all but smallest")

    result = search.search(q, _trf("rand", minRP, maxRP, False, False), 50)
    self.assertEqual(numDocs - 2, result.totalHits, "all but extremes")

    # unbounded
    result = search.search(q, _trf("rand", minRP, None, True, False), 50)
    self.assertEqual(numDocs, result.totalHits, "smallest and up")

    result = search.search(q, _trf("rand", None, maxRP, False, True), 50)
    self.assertEqual(numDocs, result.totalHits, "biggest and down")

    result = search.search(q, _trf("rand", minRP, None, False, False), 50)
    self.assertEqual(numDocs - 1, result.totalHits, "not smallest, but up")

    result = search.search(q, _trf("rand", None, maxRP, False, False), 50)
    self.assertEqual(numDocs - 1, result.totalHits, "not biggest, but down")

    # very small sets
    result = search.search(q, _trf("rand", minRP, minRP, False, False), 50)
    self.assertEqual(0, result.totalHits, "min, min, False, False")

    result = search.search(q, _trf("rand", maxRP, maxRP, False, False), 50)
    self.assertEqual(0, result.totalHits, "max, max, False, False")

    result = search.search(q, _trf("rand", minRP, minRP, True, True), 50)
    self.assertEqual(1, result.totalHits, "min, min, True, True")

    result = search.search(q, _trf("rand", None, minRP, False, True), 50)
    self.assertEqual(1, result.totalHits, "nul, min, False, True")

    result = search.search(q, _trf("rand", maxRP, maxRP, True, True), 50)
    self.assertEqual(1, result.totalHits, "max, max, True, True")

    result = search.search(q, _trf("rand", maxRP, None, True, False), 50)
    self.assertEqual(1, result.totalHits, "max, nul, True, False")
def testDrilldownFieldQuery(self):
    fieldRegistry = FieldRegistry([DrilldownField('field')])
    self.composer = LuceneQueryComposer(unqualifiedTermFields=[("unqualified", 1.0)], luceneSettings=LuceneSettings(fieldRegistry=fieldRegistry))
    self.assertConversion(TermQuery(DrillDownQuery.term("$facets", "field", "value")), "field = value")
def testRangeFilterId(self):
    index = self.signedIndex
    reader = self.getReader(directory=index.index)
    search = self.getSearcher(reader=reader)

    medId = ((self.maxId - self.minId) / 2)

    minIP = self.pad(self.minId)
    maxIP = self.pad(self.maxId)
    medIP = self.pad(medId)

    numDocs = reader.numDocs()

    self.assertEqual(numDocs, 1 + self.maxId - self.minId, "num of docs")

    q = TermQuery(Term("body", "body"))

    # test id, bounded on both ends
    result = search.search(q, _trf("id", minIP, maxIP, True, True), 50)
    self.assertEqual(numDocs, result.totalHits, "find all")

    result = search.search(q, _trf("id", minIP, maxIP, True, False), 50)
    self.assertEqual(numDocs - 1, result.totalHits, "all but last")

    result = search.search(q, _trf("id", minIP, maxIP, False, True), 50)
    self.assertEqual(numDocs - 1, result.totalHits, "all but first")

    result = search.search(q, _trf("id", minIP, maxIP, False, False), 50)
    self.assertEqual(numDocs - 2, result.totalHits, "all but ends")

    result = search.search(q, _trf("id", medIP, maxIP, True, True), 50)
    self.assertEqual(1 + self.maxId - medId, result.totalHits, "med and up")

    result = search.search(q, _trf("id", minIP, medIP, True, True), 50)
    self.assertEqual(1 + medId - self.minId, result.totalHits, "up to med")

    # unbounded id
    result = search.search(q, _trf("id", minIP, None, True, False), 50)
    self.assertEqual(numDocs, result.totalHits, "min and up")

    result = search.search(q, _trf("id", None, maxIP, False, True), 50)
    self.assertEqual(numDocs, result.totalHits, "max and down")

    result = search.search(q, _trf("id", minIP, None, False, False), 50)
    self.assertEqual(numDocs - 1, result.totalHits, "not min, but up")

    result = search.search(q, _trf("id", None, maxIP, False, False), 50)
    self.assertEqual(numDocs - 1, result.totalHits, "not max, but down")

    result = search.search(q, _trf("id", medIP, maxIP, True, False), 50)
    self.assertEqual(self.maxId - medId, result.totalHits, "med and up, not max")

    result = search.search(q, _trf("id", minIP, medIP, False, True), 50)
    self.assertEqual(medId - self.minId, result.totalHits, "not min, up to med")

    # very small sets
    result = search.search(q, _trf("id", minIP, minIP, False, False), 50)
    self.assertEqual(0, result.totalHits, "min, min, False, False")

    result = search.search(q, _trf("id", medIP, medIP, False, False), 50)
    self.assertEqual(0, result.totalHits, "med, med, False, False")

    result = search.search(q, _trf("id", maxIP, maxIP, False, False), 50)
    self.assertEqual(0, result.totalHits, "max, max, False, False")

    result = search.search(q, _trf("id", minIP, minIP, True, True), 50)
    self.assertEqual(1, result.totalHits, "min, min, True, True")

    result = search.search(q, _trf("id", None, minIP, False, True), 50)
    self.assertEqual(1, result.totalHits, "nul, min, False, True")

    result = search.search(q, _trf("id", maxIP, maxIP, True, True), 50)
    self.assertEqual(1, result.totalHits, "max, max, True, True")

    result = search.search(q, _trf("id", maxIP, None, True, False), 50)
    self.assertEqual(1, result.totalHits, "max, nul, True, False")

    result = search.search(q, _trf("id", medIP, medIP, True, True), 50)
    self.assertEqual(1, result.totalHits, "med, med, True, True")
def testRightHandSideIsLowercase(self):
    self.assertConversion(TermQuery(Term("unqualified", "cat")), "CaT")
def build_anyterm_query(field, strv):
    res = BooleanQuery()
    for i in strv.split():
        res.add(TermQuery(Term(field, i)), BooleanClause.Occur.SHOULD)
    return res
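# Illustrative sketch, not part of the original module: build_anyterm_query() returns a
# BooleanQuery that matches documents containing any of the whitespace-separated terms,
# so it can be handed directly to IndexSearcher.search(). The helper name, the field
# name 'contents', and the `searcher` argument are assumptions for the example.
def print_anyterm_matches(searcher, field, words, n=10):
    top_docs = searcher.search(build_anyterm_query(field, words), n)
    for score_doc in top_docs.scoreDocs:
        # fetch and print the stored document for each hit
        print(searcher.doc(score_doc.doc))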
def testDiacritics(self):
    self.assertConversion(TermQuery(Term('title', 'moree')), 'title=Moree')
    self.assertConversion(TermQuery(Term('title', 'moree')), 'title=Morée')
    self.assertConversion(TermQuery(Term('title', 'moree')), 'title=Morèe')
def get_query_result(sarc, dct):
    PAGE_SIZE = 10
    PAGE_JUMP = 10

    query = BooleanQuery()
    query.add(TermQuery(Term(zh_pganlz.LTPF_TYPE, '1')), BooleanClause.Occur.MUST)
    page = 0
    sort_lists = []
    summ_set = set()
    exclus_set = None
    words = []
    for k, v in dct.items():
        if k in ('index', 'type', 'tag_indices', 'author_index'):
            query.add(build_anyterm_query(k, dct[k]), BooleanClause.Occur.MUST)
        elif k in ('text', 'contents', 'title', 'description', 'alias'):
            words += jieba.lcut(v)
            query.add(build_text_query(k + zh_pganlz.LTPF_FOR_QUERY, dct[k]), BooleanClause.Occur.MUST)
        elif k == 'raw':
            query.add(QueryParser('index', WhitespaceAnalyzer()).parse(dct[k]), BooleanClause.Occur.MUST)
        elif k == 'enhraw':
            x = 0
            reslst = []
            for entry in v:
                if x == 2:
                    reslst += [lastdoc + x.encode('utf8') for x in jieba.cut(entry)]
                    x = 0
                else:
                    if x == 0:
                        reslst.append(entry.encode('utf8'))
                    else:
                        lastdoc = entry.encode('utf8')
                    x += 1
            query.add(QueryParser('index', WhitespaceAnalyzer()).parse(' '.join(reslst)), BooleanClause.Occur.MUST)
        elif k == 'page':
            page = int(dct[k])
        elif k == 'sort':
            for x in dct['sort']:
                sort_type = SortField.Type.STRING
                if 'type' in x.keys():
                    if x['type'] == 'int':
                        sort_type = SortField.Type.INT
                    elif x['type'] == 'float':
                        sort_type = SortField.Type.FLOAT
                reverse = False
                if 'reverse' in x.keys():
                    reverse = x['reverse']
                sort_lists.append(SortField(x['key'], sort_type, reverse))
        elif k == 'summarize':
            summ_set = set(v)
        elif k == 'exclusive':
            exclus_set = set(v)
    ressrt = Sort(*sort_lists)

    resdocs = sarc.searcher.search(query, PAGE_SIZE, ressrt)
    if page > 0:
        if resdocs.totalHits > page * PAGE_SIZE:
            page -= 1
            while page > PAGE_JUMP:
                resdocs = sarc.searcher.searchAfter(resdocs.scoreDocs[-1], query, PAGE_SIZE * PAGE_JUMP, ressrt)
                page -= PAGE_JUMP
            if page > 0:
                resdocs = sarc.searcher.searchAfter(resdocs.scoreDocs[-1], query, PAGE_SIZE * page, ressrt)
            resdocs = sarc.searcher.searchAfter(resdocs.scoreDocs[-1], query, PAGE_SIZE, ressrt)
        else:
            resdocs.scoreDocs = []

    reslst = []
    for x in resdocs.scoreDocs:
        dictobj = zh_pganlz.obj_to_json(zh_pganlz.document_to_obj(sarc.searcher.doc(x.doc)))
        if 'additional' in dct.keys():
            adres = []
            for x in dct['additional']:
                if isinstance(dictobj[x['sourcefield']], list):
                    qlist = dictobj[x['sourcefield']]
                else:
                    qlist = [dictobj[x['sourcefield']]]
                cres = []
                for qword in qlist:
                    if not isinstance(qword, (unicode, str)):
                        qword = str(qword)
                    searchres = sarc.searcher.search(zh_iatd.create_query({
                        'type': x['type'],
                        x['targetfield']: qword
                    }), 1)
                    if searchres.totalHits > 1:
                        print x, 'FOUND', searchres
                    elif searchres.totalHits == 0:
                        cres.append(None)
                    else:
                        cres.append(zh_pganlz.obj_to_json(zh_pganlz.document_to_obj(
                            sarc.searcher.doc(searchres.scoreDocs[0].doc))))
                adres.append(cres)
        for k, v in dictobj.items():
            if k in summ_set:
                dictobj[k + '_summary'] = summarize(hyper_text(v).text, list(set(words)))
        if not exclus_set is None:
            for k in dictobj.keys():
                if not k in exclus_set:
                    del dictobj[k]
        if 'additional' in dct.keys():
            dictobj['additional'] = adres
        reslst.append(dictobj)
    return {'total': resdocs.totalHits, 'data': reslst}
def testQuery(self):
    base_query = TermQuery(self.term)
    new_query = base_query.rewrite(self.reader)
    self.assertEqual(base_query, new_query)
def testIndexRelationExactTermOutput(self):
    self.assertConversion(TermQuery(Term("animal", "hairy cats")), 'animal exact "hairy cats"')
    self.assertConversion(TermQuery(Term("animal", "Capital Cats")), 'animal exact "Capital Cats"')
def testSimilarity(self): writer = self.getWriter(analyzer=SimpleAnalyzer( Version.LUCENE_CURRENT), similarity=SimpleSimilarity()) d1 = Document() d1.add(Field("field", "a c", TextField.TYPE_STORED)) d2 = Document() d2.add(Field("field", "a c b", TextField.TYPE_STORED)) writer.addDocument(d1) writer.addDocument(d2) writer.commit() writer.close() searcher = self.getSearcher() searcher.setSimilarity(SimpleSimilarity()) a = Term("field", "a") b = Term("field", "b") c = Term("field", "c") class collector1(PythonSimpleCollector): def collect(_self, doc, score): self.assertEqual(1.0, score) def doSetNextReader(_self, context): pass def scoreMode(_self): return ScoreMode.COMPLETE searcher.search(TermQuery(b), collector1()) builder = BooleanQuery.Builder() builder.add(TermQuery(a), BooleanClause.Occur.SHOULD) builder.add(TermQuery(b), BooleanClause.Occur.SHOULD) bq = builder.build() class collector2(PythonSimpleCollector): def collect(_self, doc, score): self.assertEqual(doc + _self.base + 1, score) def doSetNextReader(_self, context): _self.base = context.docBase def scoreMode(_self): return ScoreMode.COMPLETE searcher.search(bq, collector2()) pq = PhraseQuery(a.field(), [a.bytes(), c.bytes()]) class collector3(PythonSimpleCollector): def collect(_self, doc, score): self.assertEqual(1.0, score) def doSetNextReader(_self, context): pass def scoreMode(_self): return ScoreMode.COMPLETE searcher.search(pq, collector3()) pq = PhraseQuery(2, a.field(), [a.bytes(), b.bytes()]) class collector4(PythonSimpleCollector): def collect(_self, doc, score): self.assertEqual(0.5, score) def doSetNextReader(_self, context): pass def scoreMode(_self): return ScoreMode.COMPLETE searcher.search(pq, collector4())
def testBooleanOrTermOutput(self):
    query = BooleanQuery()
    query.add(TermQuery(Term('unqualified', 'cats')), BooleanClause.Occur.SHOULD)
    query.add(TermQuery(Term('unqualified', 'dogs')), BooleanClause.Occur.SHOULD)
    self.assertConversion(query, 'cats OR dogs')
def testPhraseQueryInConjunctionScorer(self):
    writer = self.getWriter()

    doc = Document()
    doc.add(Field("source", "marketing info", TextField.TYPE_STORED))
    writer.addDocument(doc)

    doc = Document()
    doc.add(Field("contents", "foobar", TextField.TYPE_STORED))
    doc.add(Field("source", "marketing info", TextField.TYPE_STORED))
    writer.addDocument(doc)

    writer.close()

    searcher = self.getSearcher()

    builder = PhraseQuery.Builder()
    builder.add(Term("source", "marketing"))
    builder.add(Term("source", "info"))
    phraseQuery = builder.build()
    topDocs = searcher.search(phraseQuery, 50)
    self.assertEqual(2, topDocs.totalHits)

    termQuery = TermQuery(Term("contents", "foobar"))
    builder = BooleanQuery.Builder()
    builder.add(termQuery, BooleanClause.Occur.MUST)
    builder.add(phraseQuery, BooleanClause.Occur.MUST)
    booleanQuery = builder.build()
    topDocs = searcher.search(booleanQuery, 50)
    self.assertEqual(1, topDocs.totalHits)

    writer = self.getWriter()

    doc = Document()
    doc.add(Field("contents", "map entry woo", TextField.TYPE_STORED))
    writer.addDocument(doc)

    doc = Document()
    doc.add(Field("contents", "woo map entry", TextField.TYPE_STORED))
    writer.addDocument(doc)

    doc = Document()
    doc.add(Field("contents", "map foobarword entry woo", TextField.TYPE_STORED))
    writer.addDocument(doc)

    writer.close()

    searcher = self.getSearcher()

    termQuery = TermQuery(Term("contents", "woo"))
    builder = PhraseQuery.Builder()
    builder.add(Term("contents", "map"))
    builder.add(Term("contents", "entry"))

    topDocs = searcher.search(termQuery, 50)
    self.assertEqual(3, topDocs.totalHits)

    phraseQuery = builder.build()
    topDocs = searcher.search(phraseQuery, 50)
    self.assertEqual(2, topDocs.totalHits)

    builder = BooleanQuery.Builder()
    builder.add(termQuery, BooleanClause.Occur.MUST)
    builder.add(phraseQuery, BooleanClause.Occur.MUST)
    booleanQuery = builder.build()
    topDocs = searcher.search(booleanQuery, 50)
    self.assertEqual(2, topDocs.totalHits)

    builder = BooleanQuery.Builder()
    builder.add(phraseQuery, BooleanClause.Occur.MUST)
    builder.add(termQuery, BooleanClause.Occur.MUST)
    booleanQuery = builder.build()
    topDocs = searcher.search(booleanQuery, 50)
    self.assertEqual(2, topDocs.totalHits)
def testBooleanNotTermOutput(self):
    query = BooleanQuery()
    query.add(TermQuery(Term('unqualified', 'cats')), BooleanClause.Occur.MUST)
    query.add(TermQuery(Term('unqualified', 'dogs')), BooleanClause.Occur.MUST_NOT)
    self.assertConversion(query, 'cats NOT dogs')
def get_id_lookup_query(self, id, field=None):
    """Creates Lucene query for searching by (external) document id."""
    if field is None:
        field = self.FIELDNAME_ID
    return TermQuery(Term(field, id))
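# Hypothetical companion method, not part of the original class: it shows how the id
# lookup query built above might be used to fetch a single document. It assumes
# `self.searcher` is an IndexSearcher over the same index; the method name is an
# assumption for illustration.
def get_document_by_id(self, id, field=None):
    hits = self.searcher.search(self.get_id_lookup_query(id, field), 1)
    if hits.totalHits == 0:
        return None
    return self.searcher.doc(hits.scoreDocs[0].doc)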
def testBoost(self):
    query = TermQuery(Term("title", "cats"))
    query.setBoost(2.0)
    self.assertConversion(query, "title =/boost=2.0 cats")
def new_get_content(dir, id, show_length=300):
    index_dir = SimpleFSDirectory(Paths.get(dir))
    searcher = IndexSearcher(DirectoryReader.open(index_dir))

    cur_id_list = [int(x) for x in id.split(".")]
    query = TermQuery(Term("id", ".".join([str(x) for x in cur_id_list])))
    hit = searcher.search(query, 1)
    doc = searcher.doc(hit.scoreDocs[0].doc)
    info_dict = {}
    for field_info in doc.getFields():
        info_dict[field_info.name()] = field_info.stringValue()
    text_cur = [[info_dict]]

    text_prev = []
    cur_id_list[2] -= 1
    while list_text_len(text_prev) < show_length and cur_id_list[1] >= 1:
        para_info = []
        while cur_id_list[2] >= 1:
            query = TermQuery(Term("id", ".".join([str(x) for x in cur_id_list])))
            hit = searcher.search(query, 1)
            doc = searcher.doc(hit.scoreDocs[0].doc)
            info_dict = {}
            for field_info in doc.getFields():
                info_dict[field_info.name()] = field_info.stringValue()
            para_info.insert(0, info_dict)
            cur_id_list[2] -= 1
        text_prev.insert(0, para_info)
        cur_id_list[1] -= 1
        query = RegexpQuery(
            Term("id", str(cur_id_list[0]) + "\\." + str(cur_id_list[1]) + "\\..+"))
        hits = searcher.search(query, 99999)
        s_id_max = 1
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            s_id = int(doc.get("id").split(".")[2])
            if s_id > s_id_max:
                s_id_max = s_id
        cur_id_list[2] = s_id_max

    if len(text_prev) > 0 and list_text_len(text_prev) >= show_length:
        over_count = list_text_len(text_prev) - show_length
        len_count = 0
        para_count = 0
        new_para = []
        while True:
            text = text_prev[0][para_count]["text"]
            prev_len = len_count
            len_count += len(text)
            if len_count > over_count:
                text = text[over_count - prev_len:]
                # adjust the annotation (zhujie) offsets
                if "zhujie" in text_prev[0][para_count].keys():
                    zj = json.loads(text_prev[0][para_count]["zhujie"])
                    new_zj_offset = zj["offset"][:]
                    new_zj_content = zj["content"][:]
                    count = 0
                    for i in range(len(zj["offset"])):
                        new_offset = zj["offset"][i] - (over_count - prev_len)
                        if new_offset < 0:
                            new_zj_offset.pop(count)
                            new_zj_content.pop(count)
                            count -= 1
                        else:
                            new_zj_offset[count] = new_offset
                        count += 1
                    text_prev[0][para_count]["zhujie"] = json.dumps({
                        "offset": new_zj_offset,
                        "content": new_zj_content
                    })
                text_prev[0][para_count]["text"] = text
                break
            para_count += 1
        while para_count < len(text_prev[0]):
            new_para.insert(0, text_prev[0][para_count])
            para_count += 1
        text_prev[0] = new_para

    cur_id_list = [int(x) for x in id.split(".")]
    text_next = []
    cur_id_list[2] += 1
    while list_text_len(text_next) < show_length:
        query = RegexpQuery(
            Term("id", str(cur_id_list[0]) + "\\." + str(cur_id_list[1]) + "\\..+"))
        hits = searcher.search(query, 100)
        s_id_max = 1
        if hits.totalHits < 1:
            break
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            s_id = int(doc.get("id").split(".")[2])
            if s_id > s_id_max:
                s_id_max = s_id
        para_info = []
        while cur_id_list[2] <= s_id_max:
            query = TermQuery(Term("id", ".".join([str(x) for x in cur_id_list])))
            hit = searcher.search(query, 1)
            doc = searcher.doc(hit.scoreDocs[0].doc)
            info_dict = {}
            for field_info in doc.getFields():
                info_dict[field_info.name()] = field_info.stringValue()
            para_info.append(info_dict)
            # para_text += doc.get("text")
            cur_id_list[2] += 1
        text_next.append(para_info)
        cur_id_list[2] = 1
        cur_id_list[1] += 1

    if len(text_next) > 0 and list_text_len(text_next) >= show_length:
        over_count = list_text_len(text_next) - show_length
        len_count = 0
        para_count = len(text_next[-1]) - 1
        new_para = []
        while True:
            text = text_next[-1][para_count]["text"]
            len_count += len(text)
            if len_count > over_count:
                text = text[:len_count - over_count]
                # adjust the annotation (zhujie) offsets
                if "zhujie" in text_next[-1][para_count].keys():
                    zj = json.loads(text_next[-1][para_count]["zhujie"])
                    new_zj_offset = zj["offset"][:]
                    new_zj_content = zj["content"][:]
                    count = 0
                    for i in range(len(zj["offset"])):
                        if zj["offset"][i] >= len(text):
                            new_zj_offset.pop(count)
                            new_zj_content.pop(count)
                            count -= 1
                        count += 1
                    text_next[-1][para_count]["zhujie"] = json.dumps({
                        "offset": new_zj_offset,
                        "content": new_zj_content
                    })
                text_next[-1][para_count]["text"] = text
                break
            para_count -= 1
        para_count_new = 0
        while para_count_new <= para_count:
            new_para.append(text_next[-1][para_count_new])
            para_count_new += 1
        text_next[-1] = new_para

    if len(text_prev) == 0:
        text_prev = []
    if len(text_cur) == 0:
        text_cur = []
    if len(text_next) == 0:
        text_next = []
    return {"prev": text_prev, "cur": text_cur, "next": text_next}
def crawl_until_stop(session):
    global _vm, _stop, _stopped
    _vm.attachCurrentThread()
    db_writer = zh_iatd.create_index_writer()
    info_logger = external_console_logger('/tmp/zh_c_info')
    error_logger = external_console_logger('/tmp/zh_c_err')
    strategy = crawl_strategy()
    errcount = 0
    while not _stop:
        info_logger.write(' acquiring new tasks... ')
        task_reader = zh_iatd.create_searcher(TASK_FOLDER)
        default_query = BooleanQuery()
        default_query.add(TermQuery(Term('finish_time', '0')), BooleanClause.Occur.MUST)
        strategy.process_query(default_query)
        idstart = task_reader.reader.numDocs()
        searchres = task_reader.searcher.search(default_query, 100)
        resdocs = [task_reader.searcher.doc(x.doc) for x in searchres.scoreDocs]
        info_logger.write('got:{0} total:{1}\n'.format(searchres.totalHits, idstart))
        task_reader.close()

        task_writer = zh_iatd.create_index_writer(TASK_FOLDER)
        for doct in resdocs:
            curt = task()
            curt.from_document(doct)
            crlt = curt.to_crawler_task()
            try:
                crlt.func(session, crlt)
            except Exception as e:
                info_logger.write('FAIL')
                error_logger.write('## ERROR ################################\n')
                zh_pganlz.print_object(crlt, out=error_logger)
                error_logger.write('-- stacktrace ---------------------------\n')
                error_logger.write(traceback.format_exc())
                errcount += 1
                error_logger.write('[Error count: {0}]\n'.format(errcount))

                task_writer.deleteDocuments(Term('docid', str(doct['docid'])))
                curt.fails += 1
                task_writer.addDocument(curt.to_document())
            else:
                if not crlt.result_rep_obj is None:
                    db_writer.deleteDocuments(crlt.result_query)
                    db_writer.addDocument(zh_pganlz.obj_to_document(crlt.result_rep_obj))
                for x in crlt.result_new:
                    db_writer.addDocument(zh_pganlz.obj_to_document(x))
                db_writer.commit()

                task_writer.deleteDocuments(Term('docid', str(doct['docid'])))
                curt.finish_time = int(time.time())
                task_writer.addDocument(curt.to_document())
                for x in crlt.result_tasks:
                    newt = task()
                    newt.from_crawler_task(x)
                    newt.docid = idstart
                    idstart += 1
                    task_writer.addDocument(newt.to_document())

                if isinstance(crlt.prm_id, unicode):
                    prids = crlt.prm_id.encode('utf8')
                else:
                    prids = str(crlt.prm_id)
                info_logger.write(' ~{0}(+{1}) -{2} {3}({4}, {5}, {6}, {7})\n'.format(
                    task_writer.numDocs(), len(crlt.result_tasks), curt.fails,
                    crlt.func.func_name[14:], prids, crlt.prm_start,
                    crlt.prm_pagesize, crlt.prm_extra))
            if _stop:
                break
            time.sleep(1)
        task_writer.close()
    info_logger.write('stopped\n')
    _stopped = True