def _termRangeQuery(self, index, relation, termString):
    """Build a one-sided TermRangeQuery for a comparison relation.

    relation is one of '<', '<=', '>', '>=': a '<'-style relation bounds
    the range from above by termString, otherwise from below. The bound
    is included only for the '>=' / '<=' forms.
    """
    field = index
    if '<' in relation:
        # Upper-bounded range: (-inf, termString)
        lower, upper = None, termString
    else:
        # Lower-bounded range: (termString, +inf)
        lower, upper = termString, None
    return TermRangeQuery.newStringRange(
        field, lower, upper, relation == '>=', relation == '<=')
def query_between_dates(self, dt1, dt2, original_query=None):
    '''Update the given query to only allow records between dt1 and dt2.'''
    # Format both datetimes with the index's date format and wrap them
    # as BytesRef terms; both endpoints are inclusive.
    lower_bound = BytesRef(dt1.strftime(self.date_format))
    upper_bound = BytesRef(dt2.strftime(self.date_format))
    return TermRangeQuery('date_published', lower_bound, upper_bound, True, True)
def testRangeQuery(self):
    """ This tests FilteredQuery's rewrite correctness """
    # Range [b, d] over the 'sorter' field, wrapped by the fixture filter.
    range_query = TermRangeQuery.newStringRange("sorter", "b", "d", True, True)
    wrapped_query = FilteredQuery(range_query, self.filter)
    hits = self.searcher.search(wrapped_query, None, 1000).scoreDocs
    self.assertEqual(2, len(hits))
def getQueryBuiler():
    """Build a BooleanQuery.Builder from the global ``args.search`` specs.

    Each spec in ``args.search`` is a space-separated string:
        '<occur> query <field> <text>'
        '<occur> intrange <field> <low> <high>'
        '<occur> termrange <field> <lowDate> <highDate>'
    where <occur> is one of must / should / filter / must_not.

    Returns the builder; with no specs it matches all documents.

    Raises ValueError for an unknown search type (previously this left
    ``query`` unbound, causing a NameError or silently reusing the query
    from the previous iteration).
    """
    boolean_query = BooleanQuery.Builder()
    # No search criteria at all: match every document.
    if len(args.search) == 0:
        boolean_query.add(MatchAllDocsQuery(), BooleanClause.Occur.MUST)
        return boolean_query

    occur_map = {
        'must': BooleanClause.Occur.MUST,
        'should': BooleanClause.Occur.SHOULD,
        'filter': BooleanClause.Occur.FILTER,
        'must_not': BooleanClause.Occur.MUST_NOT,
    }
    for spec in args.search:
        curSearch = spec.split(' ')
        kind = curSearch[1]
        if kind == 'query':
            parser = QueryParser(curSearch[2], analyzer)
            query = parser.parse(curSearch[3])
        elif kind == 'intrange':
            # BUG FIX: IntPoint.newRangeQuery requires int bounds;
            # the raw split() tokens are strings.
            query = IntPoint.newRangeQuery(
                curSearch[2], int(curSearch[3]), int(curSearch[4]))
        elif kind == 'termrange':
            lowerDate = handleDate(curSearch[3], '%d/%b/%Y:%H:%M:%S')
            upperDate = handleDate(curSearch[4], '%d/%b/%Y:%H:%M:%S')
            query = TermRangeQuery.newStringRange(
                curSearch[2], lowerDate, upperDate, True, True)
        else:
            raise ValueError('unknown search type: {0}'.format(kind))
        if curSearch[0] in occur_map:
            boolean_query.add(query, occur_map[curSearch[0]])
        else:
            # Preserve original best-effort behavior for an unknown occur
            # token: report it and skip adding the clause.
            print('raise exception')
    return boolean_query
def testInclusive(self):
    # Range ["A", "C"] with both endpoints included.
    query = TermRangeQuery.newStringRange("content", "A", "C", True, True)

    # All four docs indexed: A, B and C fall inside the range.
    self._initializeIndex(["A", "B", "C", "D"])
    searcher = self.getSearcher()
    result = searcher.search(query, 50)
    self.assertEqual(3, result.totalHits, "A,B,C,D - A,B,C in range")
    del searcher

    # Without C only A and B match.
    self._initializeIndex(["A", "B", "D"])
    searcher = self.getSearcher()
    result = searcher.search(query, 50)
    self.assertEqual(2, result.totalHits, "A,B,D - A and B in range")
    del searcher

    # Adding C back restores the third hit.
    self._addDoc("C")
    searcher = self.getSearcher()
    result = searcher.search(query, 50)
    self.assertEqual(3, result.totalHits, "C added - A, B, C in range")
    del searcher
def testExclusive(self):
    # Range ("A", "C") with both endpoints excluded: only B can match.
    query = TermRangeQuery.newStringRange("content", "A", "C", False, False)

    self._initializeIndex(["A", "B", "C", "D"])
    searcher = self.getSearcher()
    result = searcher.search(query, 50)
    self.assertEqual(1, result.totalHits, "A,B,C,D, only B in range")
    del searcher

    self._initializeIndex(["A", "B", "D"])
    searcher = self.getSearcher()
    result = searcher.search(query, 50)
    self.assertEqual(1, result.totalHits, "A,B,D, only B in range")
    del searcher

    # C is an excluded endpoint, so adding it changes nothing.
    self._addDoc("C")
    searcher = self.getSearcher()
    result = searcher.search(query, 50)
    self.assertEqual(1, result.totalHits, "C added, still only B in range")
    del searcher
def testTextRangeQuery(self):
    # Each comparison operator maps to a one-sided string range:
    # (lowerTerm, upperTerm, includeLower, includeUpper)
    cases = [
        ('field > value', 'value', None, False, False),
        ('field >= value', 'value', None, True, False),
        ('field < value', None, 'value', False, False),
        ('field <= value', None, 'value', False, True),
    ]
    for expression, lower, upper, incl_lower, incl_upper in cases:
        expected = TermRangeQuery.newStringRange(
            'field', lower, upper, incl_lower, incl_upper)
        self.assertConversion(expected, expression)
def index_images_until_stop(session, handler, lbound):
    # Poll the Lucene task index for finished crawl records and feed every
    # discovered image URL to `handler(url, image_type, object_id)` until the
    # module-level _stop flag is set; sets _stopped on exit.
    #
    # session: crawler HTTP session (provides .opener and API helpers)
    # handler: callback invoked once per image URL found
    # lbound:  optional lower 'finish_time' cursor (exclusive); None = no bound
    global _stop, _stopped, _vm
    _vm.attachCurrentThread()  # JVM must be attached before Lucene calls
    searcher = IndexSearcher(
        DirectoryReader.open(SimpleFSDirectory(File(TASK_FOLDER))))
    # Only finished tasks (finish_time != '0'), optionally after lbound.
    query = BooleanQuery()
    query.add(TermQuery(Term('finish_time', '0')), BooleanClause.Occur.MUST_NOT)
    query.add(MatchAllDocsQuery(), BooleanClause.Occur.MUST)
    if not lbound is None:
        # Range (lbound, '9999999999'] — exclusive lower, inclusive upper.
        query.add(
            TermRangeQuery.newStringRange('finish_time', lbound,
                                          '9999999999', False, True),
            BooleanClause.Occur.MUST)
    sort = Sort(SortField('finish_time', SortField.Type.INT))
    tmpbk = None  # last ScoreDoc of the previous page (searchAfter cursor)
    res = searcher.search(query, 100, sort)
    answer_content_searcher = zh_iatd.create_searcher()
    logger = external_console_logger('/tmp/zh_imgc_info')
    while not _stop:
        print 'got', len(res.scoreDocs), 'docs'
        for x in res.scoreDocs:
            try:
                imgsgot = 0  # images handled for this record
                realdoc = searcher.doc(x.doc)
                doctype = realdoc['func_name']
                objid = realdoc['id']
                logger.write(' ft:{0}'.format(realdoc['finish_time']))
                if doctype == 'user_data':
                    # Profile page: optional cover image + avatar.
                    soup = bs4.BeautifulSoup(
                        session.opener.open(
                            urllib2.Request(
                                url='https://www.zhihu.com/people/{0}'.format(
                                    objid))), HTML_PARSER)
                    cover = soup.select(
                        '#ProfileHeader .ProfileHeader-userCover img')
                    if len(cover) > 0:
                        cover_img = cover[0]['src']
                        imgsgot += 1
                        handler(cover_img, ZH_IMGTYPE_USERINFO_COVER, objid)
                    avatar_img = soup.select(
                        '#ProfileHeader .ProfileHeader-main .UserAvatar img'
                        )[0]['src']
                    imgsgot += 1
                    handler(avatar_img, ZH_IMGTYPE_USER_AVATAR, objid)
                elif doctype == 'article_data':
                    # Article JSON: optional title image + inline <img> tags.
                    jsondata = session.get_article_content_raw(objid)
                    if 'titleImage' in jsondata.keys():
                        cover_img = jsondata['titleImage']
                        if len(cover_img) > 0:
                            imgsgot += 1
                            handler(cover_img, ZH_IMGTYPE_ARTICLE_COVER, objid)
                    soup = bs4.BeautifulSoup(jsondata['content'], HTML_PARSER)
                    # NOTE(review): `x` is reused here, shadowing the ScoreDoc
                    # above; harmless because x.doc was already read.
                    for x in soup.select('img'):
                        imgsgot += 1
                        handler(x['src'], ZH_IMGTYPE_IN_ARTICLE, objid)
                elif doctype == 'topic_data':
                    # Topic hot page: topic avatar icon.
                    soup = bs4.BeautifulSoup(
                        session.opener.open(
                            urllib2.Request(
                                url='https://www.zhihu.com/topic/{0}/hot'.
                                format(objid))), HTML_PARSER)
                    topic_img = soup.select(
                        '.zu-main-content .topic-avatar .zm-entry-head-avatar-link img'
                        )[0]['src']
                    imgsgot += 1
                    handler(topic_img, ZH_IMGTYPE_TOPIC_ICON, objid)
                elif doctype == 'answer_comments' and realdoc['start'] == '0':
                    # First comment page only: scan the answer body itself.
                    obj, q = zh_iatd.query_object(answer_content_searcher,
                                                  objid, zh_pganlz.answer)
                    for x in obj.data.text.as_soup().select('img'):
                        imgsgot += 1
                        handler(x['src'], ZH_IMGTYPE_IN_ANSWER, objid)
                elif doctype == 'question_data':
                    # Question page: images in the question detail block.
                    soup = bs4.BeautifulSoup(
                        session.opener.open(
                            urllib2.Request(
                                url='https://www.zhihu.com/question/{0}'.
                                format(objid))), HTML_PARSER)
                    for x in soup.select('#zh-question-detail img'):
                        imgsgot += 1
                        handler(x['src'], ZH_IMGTYPE_IN_QUESTION, objid)
                else:
                    # Unknown record type: terminate the log line and skip.
                    logger.write('\n')
                    continue
                logger.write(' ({0}, +{1})\n'.format(doctype, imgsgot))
                if _stop:
                    break
                time.sleep(3)  # throttle between records
            except Exception as e:
                # Best-effort: log the traceback and continue with next doc.
                logger.write('\n## ERROR ################################\n')
                logger.write(traceback.format_exc())
        # Page forward with searchAfter; keep the previous cursor when a page
        # comes back empty so the next poll resumes from the same spot.
        if len(res.scoreDocs) > 0:
            tmpbk = res.scoreDocs[-1]
        res = searcher.searchAfter(tmpbk, query, 100, sort)
    print 'stopped'
    _stopped = True