def _termRangeQuery(self, index, relation, termString):
    """Translate a single comparison into a one-sided ``TermRangeQuery``.

    ``index`` is used as the field name. When ``relation`` contains ``'<'``
    the term becomes the upper bound and the lower bound is left open;
    for any other relation the term becomes the lower bound and the upper
    bound is left open. The lower bound is inclusive only for ``'>='`` and
    the upper bound only for ``'<='``.
    """
    boundsAbove = '<' in relation
    lower = None if boundsAbove else termString
    upper = termString if boundsAbove else None
    return TermRangeQuery.newStringRange(
        index, lower, upper, relation == '>=', relation == '<=')
Exemple #2
0
 def query_between_dates(self, dt1, dt2, original_query=None):
     '''Return a range query on 'date_published' covering dt1 through dt2.

     Both dates are rendered with ``self.date_format`` and wrapped in a
     ``BytesRef``; both ends of the range are inclusive.

     ``original_query`` is accepted but not used by this method: a new
     query is returned rather than an updated one.
     '''
     lower_bound = BytesRef(dt1.strftime(self.date_format))
     upper_bound = BytesRef(dt2.strftime(self.date_format))
     include_lower = include_upper = True
     return TermRangeQuery(
         'date_published', lower_bound, upper_bound,
         include_lower, include_upper)
Exemple #3
0
    def testRangeQuery(self):
        """
        Check FilteredQuery's rewrite correctness when it wraps a range query.

        An inclusive "b".."d" range on the "sorter" field, combined with
        self.filter, is expected to match exactly two documents.
        """
        rangeQuery = TermRangeQuery.newStringRange("sorter", "b", "d",
                                                   True, True)
        filtered = FilteredQuery(rangeQuery, self.filter)
        hits = self.searcher.search(filtered, None, 1000).scoreDocs
        self.assertEqual(2, len(hits))
Exemple #4
0
 def _termRangeQuery(self, index, relation, termString):
     """Build an open-ended ``TermRangeQuery`` on field ``index``.

     A relation containing ``'<'`` bounds the range from above by
     ``termString``; every other relation bounds it from below. The bound
     is inclusive only when the relation is exactly ``'<='`` (upper) or
     ``'>='`` (lower).
     """
     if '<' in relation:
         bounds = (None, termString)
     else:
         bounds = (termString, None)
     inclusive = (relation == '>=', relation == '<=')
     return TermRangeQuery.newStringRange(index, bounds[0], bounds[1],
                                          inclusive[0], inclusive[1])
Exemple #5
0
def getQueryBuiler():
    """Build a ``BooleanQuery.Builder`` from the clause specs in ``args.search``.

    Reads the module-level ``args`` and ``analyzer``. Each entry of
    ``args.search`` is a space-separated string::

        <occur> <kind> <field> <value> [<upper>]

    * ``occur`` -- ``must``, ``should``, ``filter`` or ``must_not``.
    * ``kind`` --
      ``query``: ``value`` is parsed by a ``QueryParser`` on ``field``;
      ``intrange``: integer range from ``value`` to ``upper``;
      ``termrange``: string range between the two dates, each converted
      by ``handleDate`` with the ``%d/%b/%Y:%H:%M:%S`` format, both ends
      inclusive.

    Returns:
        The builder itself (not a built query). When ``args.search`` is
        empty or ``None`` it holds a single ``MatchAllDocsQuery`` MUST
        clause.

    Raises:
        ValueError: if a clause has an unknown ``kind``, or if an
            ``intrange`` bound is not an integer literal.
    """
    boolean_query = BooleanQuery.Builder()

    if not args.search:
        boolean_query.add(MatchAllDocsQuery(), BooleanClause.Occur.MUST)
        return boolean_query

    occur_by_name = {
        'must': BooleanClause.Occur.MUST,
        'should': BooleanClause.Occur.SHOULD,
        'filter': BooleanClause.Occur.FILTER,
        'must_not': BooleanClause.Occur.MUST_NOT,
    }
    date_format = '%d/%b/%Y:%H:%M:%S'

    for spec in args.search:
        parts = spec.split(' ')
        occur_name, kind = parts[0], parts[1]

        if kind == 'query':
            query = QueryParser(parts[2], analyzer).parse(parts[3])
        elif kind == 'intrange':
            # The spec tokens are strings; IntPoint.newRangeQuery takes
            # integer bounds, so convert them explicitly.
            query = IntPoint.newRangeQuery(
                parts[2], int(parts[3]), int(parts[4]))
        elif kind == 'termrange':
            lower_date = handleDate(parts[3], date_format)
            upper_date = handleDate(parts[4], date_format)
            query = TermRangeQuery.newStringRange(
                parts[2], lower_date, upper_date, True, True)
        else:
            # Never fall through with an unbound name or with the query
            # left over from the previous clause.
            query = None

        occur = occur_by_name.get(occur_name)
        if occur is None:
            # Best-effort: an unknown occurrence is reported and skipped.
            print('raise exception')
            continue
        if query is None:
            raise ValueError(
                'unknown search kind {0!r} in clause {1!r}'.format(kind, spec))
        boolean_query.add(query, occur)

    return boolean_query
Exemple #6
0
    def testInclusive(self):
        """Inclusive "A".."C" range counts both end terms when present."""
        query = TermRangeQuery.newStringRange("content", "A", "C", True, True)

        def hitCount():
            # The searcher is a local of this helper, so it is dropped as
            # soon as the count has been read.
            searcher = self.getSearcher()
            return searcher.search(query, 50).totalHits

        self._initializeIndex(["A", "B", "C", "D"])
        self.assertEqual(3, hitCount(), "A,B,C,D - A,B,C in range")

        self._initializeIndex(["A", "B", "D"])
        self.assertEqual(2, hitCount(), "A,B,D - A and B in range")

        self._addDoc("C")
        self.assertEqual(3, hitCount(), "C added - A, B, C in range")
Exemple #7
0
    def testExclusive(self):
        """Exclusive "A".."C" range never counts the end terms themselves."""
        query = TermRangeQuery.newStringRange("content", "A", "C",
                                              False, False)

        def hitCount():
            # The searcher is a local of this helper, so it is dropped as
            # soon as the count has been read.
            searcher = self.getSearcher()
            return searcher.search(query, 50).totalHits

        self._initializeIndex(["A", "B", "C", "D"])
        self.assertEqual(1, hitCount(), "A,B,C,D, only B in range")

        self._initializeIndex(["A", "B", "D"])
        self.assertEqual(1, hitCount(), "A,B,D, only B in range")

        self._addDoc("C")
        self.assertEqual(1, hitCount(), "C added, still only B in range")
Exemple #8
0
 def testTextRangeQuery(self):
     """Each comparison operator converts to the matching one-sided range."""
     # (lowerTerm, upperTerm, includeLower, includeUpper, expression)
     cases = [
         ('value', None, False, False, 'field > value'),
         ('value', None, True, False, 'field >= value'),
         (None, 'value', False, False, 'field < value'),
         (None, 'value', False, True, 'field <= value'),
     ]
     for lowerTerm, upperTerm, includeLower, includeUpper, expression in cases:
         expected = TermRangeQuery.newStringRange(
             'field', lowerTerm, upperTerm, includeLower, includeUpper)
         self.assertConversion(expected, expression)
Exemple #9
0
def index_images_until_stop(session, handler, lbound):
    """Walk finished task documents and report every image found for them.

    Searches the index under TASK_FOLDER for documents whose 'finish_time'
    is not '0' (and, when ``lbound`` is given, is above ``lbound``), sorted
    by 'finish_time', 100 documents per page. For each document the
    matching page or record is fetched according to its 'func_name' and
    ``handler(image_url, image_type, object_id)`` is called once per image.
    The loop runs until the module-level ``_stop`` flag becomes true, then
    sets ``_stopped`` to True.

    Parameters:
        session: provides ``opener.open(...)`` for page fetches and
            ``get_article_content_raw(objid)`` for articles.
        handler: callable invoked as ``handler(src, imgtype, objid)``.
        lbound: exclusive lower bound for 'finish_time', or None for no
            lower bound.

    Any exception raised while processing one document is written to the
    logger with its traceback and processing moves on to the next document.
    """
    global _stop, _stopped, _vm

    # Register this thread with the VM before touching any index objects.
    _vm.attachCurrentThread()
    searcher = IndexSearcher(
        DirectoryReader.open(SimpleFSDirectory(File(TASK_FOLDER))))
    # Match everything except documents whose finish_time is '0'.
    query = BooleanQuery()
    query.add(TermQuery(Term('finish_time', '0')),
              BooleanClause.Occur.MUST_NOT)
    query.add(MatchAllDocsQuery(), BooleanClause.Occur.MUST)
    if not lbound is None:
        # Range is (lbound, '9999999999']: lower end excluded, upper included.
        query.add(
            TermRangeQuery.newStringRange('finish_time', lbound, '9999999999',
                                          False, True),
            BooleanClause.Occur.MUST)
    sort = Sort(SortField('finish_time', SortField.Type.INT))
    # Last document of the most recent non-empty page; paging cursor.
    tmpbk = None
    res = searcher.search(query, 100, sort)
    answer_content_searcher = zh_iatd.create_searcher()
    logger = external_console_logger('/tmp/zh_imgc_info')
    while not _stop:
        print 'got', len(res.scoreDocs), 'docs'
        for x in res.scoreDocs:
            try:
                # Number of images handed to the handler for this document.
                imgsgot = 0
                realdoc = searcher.doc(x.doc)
                doctype = realdoc['func_name']
                objid = realdoc['id']
                logger.write(' ft:{0}'.format(realdoc['finish_time']))
                if doctype == 'user_data':
                    soup = bs4.BeautifulSoup(
                        session.opener.open(
                            urllib2.Request(
                                url='https://www.zhihu.com/people/{0}'.format(
                                    objid))), HTML_PARSER)
                    # The cover image is optional; the avatar lookup below
                    # indexes [0] unconditionally.
                    cover = soup.select(
                        '#ProfileHeader .ProfileHeader-userCover img')
                    if len(cover) > 0:
                        cover_img = cover[0]['src']
                        imgsgot += 1
                        handler(cover_img, ZH_IMGTYPE_USERINFO_COVER, objid)
                    avatar_img = soup.select(
                        '#ProfileHeader .ProfileHeader-main .UserAvatar img'
                    )[0]['src']
                    imgsgot += 1
                    handler(avatar_img, ZH_IMGTYPE_USER_AVATAR, objid)
                elif doctype == 'article_data':
                    jsondata = session.get_article_content_raw(objid)
                    # Title image is reported only when present and non-empty.
                    if 'titleImage' in jsondata.keys():
                        cover_img = jsondata['titleImage']
                        if len(cover_img) > 0:
                            imgsgot += 1
                            handler(cover_img, ZH_IMGTYPE_ARTICLE_COVER, objid)
                    soup = bs4.BeautifulSoup(jsondata['content'], HTML_PARSER)
                    # NOTE: this inner loop rebinds x, shadowing the outer
                    # scoreDoc; the outer x is only read at the top of the
                    # iteration, before this point.
                    for x in soup.select('img'):
                        imgsgot += 1
                        handler(x['src'], ZH_IMGTYPE_IN_ARTICLE, objid)
                elif doctype == 'topic_data':
                    soup = bs4.BeautifulSoup(
                        session.opener.open(
                            urllib2.Request(
                                url='https://www.zhihu.com/topic/{0}/hot'.
                                format(objid))), HTML_PARSER)
                    topic_img = soup.select(
                        '.zu-main-content .topic-avatar .zm-entry-head-avatar-link img'
                    )[0]['src']
                    imgsgot += 1
                    handler(topic_img, ZH_IMGTYPE_TOPIC_ICON, objid)
                # Only the comments document with start == '0' triggers a
                # scan of the answer's images.
                elif doctype == 'answer_comments' and realdoc['start'] == '0':
                    obj, q = zh_iatd.query_object(answer_content_searcher,
                                                  objid, zh_pganlz.answer)
                    for x in obj.data.text.as_soup().select('img'):
                        imgsgot += 1
                        handler(x['src'], ZH_IMGTYPE_IN_ANSWER, objid)
                elif doctype == 'question_data':
                    soup = bs4.BeautifulSoup(
                        session.opener.open(
                            urllib2.Request(
                                url='https://www.zhihu.com/question/{0}'.
                                format(objid))), HTML_PARSER)
                    for x in soup.select('#zh-question-detail img'):
                        imgsgot += 1
                        handler(x['src'], ZH_IMGTYPE_IN_QUESTION, objid)
                else:
                    # Unhandled document type: finish the log line and skip
                    # the summary and the delay below.
                    logger.write('\n')
                    continue
                logger.write(' ({0}, +{1})\n'.format(doctype, imgsgot))
                if _stop:
                    break
                # Pause between documents that were actually processed.
                time.sleep(3)
            except Exception as e:
                # Log the failure and carry on with the next document; no
                # delay is applied on this path.
                logger.write('\n## ERROR ################################\n')
                logger.write(traceback.format_exc())
        # Advance the cursor only when the page had results.
        if len(res.scoreDocs) > 0:
            tmpbk = res.scoreDocs[-1]
        # NOTE(review): when a page is empty this re-queries immediately
        # with the previous cursor and without sleeping, so the loop spins
        # until _stop is set. tmpbk is still None here if the first page
        # was empty -- confirm searchAfter accepts that.
        res = searcher.searchAfter(tmpbk, query, 100, sort)
    print 'stopped'
    _stopped = True