def POST(self):
        global _vm, _session

        _vm.attachCurrentThread()
        user_data = json.loads(web.input()['data'])
        print user_data
        tgtype = user_data['type']
        if tgtype == 'user':
            tfield = 'author_index'
        elif tgtype == 'question':
            tfield = 'question_index'
        else:
            return ''
        with zh_iatd.create_searcher() as searcher:
            # Top 200 answers by like count (descending) for the histogram.
            res1 = searcher.searcher.search(
                zh_iatd.create_query({
                    'type': 'answer',
                    tfield: user_data['index']
                }), 200, Sort(SortField('likes', SortField.Type.INT, True)))
            # The same answers, newest first, for the time-series graph.
            res2 = searcher.searcher.search(
                zh_iatd.create_query({
                    'type': 'answer',
                    tfield: user_data['index']
                }), 200, Sort(SortField('date', SortField.Type.INT, True)))
            res1 = [
                zh_pganlz.document_to_obj(searcher.searcher.doc(
                    x.doc)).data.likes for x in res1.scoreDocs
            ]
            res2 = [
                zh_pganlz.document_to_obj(searcher.searcher.doc(x.doc))
                for x in res2.scoreDocs
            ]
            res2 = [{'x': x.data.date, 'y': x.data.likes} for x in res2]
        return json.dumps({'histogram': res1, 'graph': res2})
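For context, a minimal sketch of the JVM-handling pattern this handler depends on; the function name below is hypothetical, but lucene.initVM() and attachCurrentThread() are the actual PyLucene calls:

import lucene

_vm = lucene.initVM()  # start the embedded JVM once per process

def handle_request():
    # Web frameworks dispatch handlers on worker threads, and every thread
    # that touches Lucene objects must attach itself to the JVM first.
    _vm.attachCurrentThread()
    # ... build queries and search as in the POST handler above ...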
Example #2
def search_kw(kw, mode):
    vm_env.attachCurrentThread()
    lists = []
    # Segment the keyword string and AND together one clause per term.
    terms = jieba.cut(kw)
    query = BooleanQuery()
    for term in terms:
        clause = QueryParser(Version.LUCENE_CURRENT, "introduction",
                             analyzer).parse(term)
        query.add(clause, BooleanClause.Occur.MUST)
    # mode=True ranks by the stored "score" field, otherwise by comment
    # count; the final True makes the sort descending.
    if mode:
        s = Sort(SortField("score", SortField.Type.STRING, True))
    else:
        s = Sort(SortField("comments", SortField.Type.FLOAT, True))
    scoreDocs = searcher1.search(query, 20, s).scoreDocs
    for scoreDoc in scoreDocs:
        movie = []
        doc = searcher1.doc(scoreDoc.doc)
        # Collect the stored fields for each hit, in display order.
        movie.append(doc.get("url"))
        movie.append(doc.get("picture"))
        movie.append(doc.get("title"))
        movie.append(doc.get("score"))
        movie.append(doc.get("genre"))
        movie.append(doc.get("stars"))
        movie.append(doc.get("comments"))
        lists.append(movie)

    return lists
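A hedged usage sketch for search_kw; it assumes the module-level searcher1, analyzer, and JVM globals used above are initialized elsewhere in the original script:

# Hypothetical driver; mode=True ranks by the stored "score" field,
# mode=False by comment count. The movie lists are positional.
for movie in search_kw(u'science fiction', True):
    url, picture, title, score, genre, stars, comments = movie
    print title, score, url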
Example #3
 def sales_sort(query):
     # Rank by relevance first, break ties by sales (descending), then by
     # price (ascending).
     scoreDocs = searcher.search(
         query, 8,
         Sort([
             SortField.FIELD_SCORE,
             SortField("sales", SortField.Type.INT, True),
             SortField("price", SortField.Type.DOUBLE, False)
         ])).scoreDocs
     return scoreDocs
Example #4
 def name_price_search(name, low, high):
     query = QueryParser(Version.LUCENE_CURRENT, "name",
                         analyzer).parse(name)
     scoreDocs = searcher.search(
         query, 5000,
         Sort([
             SortField.FIELD_SCORE,
             SortField("price", SortField.Type.DOUBLE, False)
         ])).scoreDocs
     for scoreDoc in scoreDocs:
         doc = searcher.doc(scoreDoc.doc)
         if float(doc.get('price')) >= low and float(
                 doc.get('price')) < high:
             print '-' * 100
             print 'name:', doc.get('name')
             print doc.get('rate') + ' pts (' + doc.get('comments') + ' ratings)'
             # Stored fields come back as strings, so compare against '0'.
             if doc.get('post') == '0':
                 print '¥' + doc.get('price') + '\t' + 'free shipping'
             else:
                 print '¥' + doc.get('price')
             print doc.get('sales'), 'purchases'
             if doc.get('perfumer') is not None:
                 print 'perfumer:', doc.get('perfumer')
             if doc.get('tune') is not None:
                 print 'tune:', doc.get('tune')
             if doc.get('scents') == '': continue
             if doc.get('former') is not None:
                 print 'former:', doc.get('former')
                 print 'mid:', doc.get('mid')
                 print 'last:', doc.get('last')
             else:
                 print 'scents:', doc.get('scents')
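The price window above is applied in Python after pulling 5000 hits. If the price field were indexed as a numeric field, a hedged alternative on the same Lucene 4.x API (Run_Price further down does this with newFloatRange) pushes the range into the query itself:

from org.apache.lucene.search import (BooleanClause, BooleanQuery,
                                      NumericRangeQuery)

combined = BooleanQuery()
combined.add(query, BooleanClause.Occur.MUST)
# Inclusive lower bound, exclusive upper bound, mirroring the Python check.
combined.add(NumericRangeQuery.newDoubleRange('price', low, high, True, False),
             BooleanClause.Occur.MUST)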
Example #5
    def testFilteredQuery(self):

        filteredquery = FilteredQuery(self.query, self.filter)
        topDocs = self.searcher.search(filteredquery, 50)
        self.assertEqual(1, topDocs.totalHits)
        self.assertEqual(1, topDocs.scoreDocs[0].doc)

        topDocs = self.searcher.search(filteredquery, None, 50,
                                       Sort(SortField("sorter",
                                                      SortField.Type.STRING)))
        self.assertEqual(1, topDocs.totalHits)
        self.assertEqual(1, topDocs.scoreDocs[0].doc)

        filteredquery = FilteredQuery(TermQuery(Term("field", "one")),
                                      self.filter)
        topDocs = self.searcher.search(filteredquery, 50)
        self.assertEqual(2, topDocs.totalHits)

        filteredquery = FilteredQuery(TermQuery(Term("field", "x")),
                                      self.filter)
        topDocs = self.searcher.search(filteredquery, 50)
        self.assertEqual(1, topDocs.totalHits)
        self.assertEqual(3, topDocs.scoreDocs[0].doc)

        filteredquery = FilteredQuery(TermQuery(Term("field", "y")),
                                      self.filter)
        topDocs = self.searcher.search(filteredquery, 50)
        self.assertEqual(0, topDocs.totalHits)
Example #6
def func1(genre, year):
    vm_env.attachCurrentThread()
    lists = []
    query = BooleanQuery()
    if genre != "111":
        item = QueryParser(Version.LUCENE_CURRENT, "genre",
                           analyzer).parse(genre)
        query.add(item, BooleanClause.Occur.MUST)
    if year != "111":
        item = QueryParser(Version.LUCENE_CURRENT, "year",
                           analyzer).parse(year)
        query.add(item, BooleanClause.Occur.MUST)
    sf = SortField("score", SortField.Type.STRING, True)
    s = Sort(sf)
    scoreDocs = searcher1.search(query, 20, s).scoreDocs
    for scoreDoc in scoreDocs:
        movie = []
        doc = searcher1.doc(scoreDoc.doc)
        movie.append(doc.get("url"))
        movie.append(doc.get("picture"))
        movie.append(doc.get("title"))
        movie.append(doc.get("score"))
        movie.append(doc.get("genre"))
        movie.append(doc.get("stars"))
        movie.append(doc.get("comments"))
        lists.append(movie)
    return lists
Example #7
 def rate_sort(query):
     # Rank by relevance first, then by rate (descending) as a tiebreaker.
     scoreDocs = searcher.search(
         query, 8,
         Sort([
             SortField.FIELD_SCORE,
             SortField("rate", SortField.Type.DOUBLE, True)
         ])).scoreDocs
     return scoreDocs
Example #8
def func_ns(command):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_tb_new"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    query = QueryParser(Version.LUCENE_CURRENT, "name",
                        analyzer).parse(command)
    # Rank by sales (descending) first, then relevance, then price (ascending).
    scoreDocs = searcher.search(
        query, 50,
        Sort([
            SortField("sales", SortField.Type.INT, True),
            SortField.FIELD_SCORE,
            SortField("price", SortField.Type.DOUBLE, False)
        ])).scoreDocs
    results = process(scoreDocs, searcher)
    return results
Example #9
def getLucene(path):
    directory = FSDirectory.open(Paths.get(path))
    analyzer = WhitespaceAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setIndexSort(
        Sort(SortField(NUMERIC_STAMP_FIELD, SortField.Type.LONG)))
    writer = IndexWriter(directory, config)
    reader = writer.getReader()
    searcher = IndexSearcher(reader)
    return writer, reader, searcher
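A hedged usage sketch for getLucene; it assumes NUMERIC_STAMP_FIELD is defined as above and that every document carries a matching NumericDocValuesField, which is what makes the index-time sort legal:

from org.apache.lucene.document import Document, NumericDocValuesField
from org.apache.lucene.index import DirectoryReader

writer, reader, searcher = getLucene('/tmp/stamp-index')  # hypothetical path
doc = Document()
doc.add(NumericDocValuesField(NUMERIC_STAMP_FIELD, 12345))
writer.addDocument(doc)
writer.commit()
reader = DirectoryReader.open(writer)  # reopen to see the new document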
Example #10
def search():
    boolean_query = getQueryBuiler()  # query-builder helper defined elsewhere in this script

    final_query = boolean_query.build()
    if args.sort:
        sort = Sort(SortField(args.sort, SortField.Type.STRING, args.sortDirection == 'desc'))
        results = searcher.search(final_query, args.ndocs, sort)
    else:
        results = searcher.search(final_query, args.ndocs)

    return results
Example #11
 def getLastStampId(self, prefix='oai_dc', setSpec=None):
     searcher = self._getSearcher()
     # Fetch only the newest document: sort by stamp descending, take one hit.
     sort = Sort(SortField(NUMERIC_STAMP_FIELD, SortField.Type.LONG, True))
     if prefix is None and setSpec is None:
         query = MatchAllDocsQuery()
     else:
         if prefix is None:
             query = TermQuery(Term(SETS_FIELD, setSpec))
         else:
             query = TermQuery(Term(PREFIX_FIELD, prefix))
     results = searcher.search(query, 1, sort)
     if results.totalHits.value < 1:
         return None
     return _stampFromDocument(searcher.doc(results.scoreDocs[0].doc))
Example #12
def printAggregationSearch(field_name):
    print('Values for field "{}".'.format(field_name))
    
    groupingSearch = GroupingSearch(field_name)
    groupingSearch.setAllGroups(True)

    if args.sort != 'agg':
        sort = Sort(SortField(field_name, SortField.Type.STRING))
        groupingSearch.setGroupSort(sort)

    query = getQueryBuiler().build()
    result = groupingSearch.search(searcher, query, args.offset, 2500)  # group limit; args.ndocs is applied when printing

    totalGroupCount = result.totalGroupCount
    print('Total groups count: {} Total docs count {}'.format(totalGroupCount, result.totalHitCount))

    aggGroups = []
    groups = result.groups
    for i in range(len(groups)):
        # groupValue is a BytesRef whose toString() renders as "[68 65 ...]";
        # decode the hex byte codes back into a UTF-8 string.
        charCodes = groups[i].groupValue.toString()[1:-1].split(' ')
        for j in range(len(charCodes)):
            if charCodes[j] == '':
                charCodes[j] = 45  # empty code: substitute ASCII '-'
            else:
                charCodes[j] = int(charCodes[j], 16)
        groupName = str(bytearray(charCodes), 'utf-8')
        aggGroups.append({'name': groupName, 'value': int(groups[i].totalHits.value)})
    
    if args.sort == 'agg':
        aggGroups = sorted(aggGroups, key=lambda k: k['value'], reverse=True) 

    table_line = '-' * 27
    print(table_line)
    print('| {:10} | {:10} |'.format('Value', 'Count'))
    print(table_line)
    for i in range(min(args.ndocs, len(aggGroups))):
        print('| {:10} | {:10,d} |'.format(aggGroups[i]['name'], aggGroups[i]['value']))
    print(table_line)
    print()
Example #13
    def __init__(self,
                 index_dir,
                 search_fields=['canonical_url', 'title', 'meta', 'content'],
                 unique_field='uq_id_str',
                 boost=dict(canonical_url=4.0,
                            title=8.0,
                            meta=2.0,
                            content=1.0),
                 date_format='%Y-%m-%dT%H:%M:%S'):
        """Constructor of Searcher.

        Parameters
        ----------
        index_dir : string
            The location of lucene index.
        search_fields : list
            A list of field names indicating fields to search on.
        unique_field : string
            The field name, on which the duplication should avoid.
        boost : dict
            This dict control the weight when computing score.
        date_format : string
            Convert the string into datetime. Should consistent with the
            index part.
        """
        self.index_dir = index_dir
        self.search_fields = search_fields
        self.sort_by_recent = Sort(
            SortField('date_published', SortField.Type.STRING, True))
        self.store = FSDirectory.open(File(index_dir))
        self.reader = DirectoryReader.open(self.store)
        self.isearcher = IndexSearcher(self.reader)
        self.analyzer = StandardAnalyzer()
        self.dup_filter = DuplicateFilter(unique_field)
        self.boost_map = HashMap()
        for k, v in boost.iteritems():
            self.boost_map.put(k, Float(v))
        self.mul_parser = MultiFieldQueryParser(search_fields, self.analyzer,
                                                self.boost_map)
        self.date_format = date_format
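A hedged instantiation sketch; the index path is hypothetical and the keyword values shown are just the documented defaults:

searcher = Searcher(
    index_dir='/data/lucene/news_index',  # hypothetical location
    search_fields=['canonical_url', 'title', 'meta', 'content'],
    boost=dict(canonical_url=4.0, title=8.0, meta=2.0, content=1.0))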
Example #14
 def testSearchTopField(self):
     I = Index(path=self.tempdir, settings=LuceneSettings())
     I._indexWriter.addDocument(
         document(__id__='1', name="one", price="aap noot mies"))
     I.commit()
     I._indexWriter.addDocument(
         document(__id__='2', name="two", price="aap vuur boom"))
     I.commit()
     I._indexWriter.addDocument(
         document(__id__='3', name="three", price="noot boom mies"))
     I.commit()
     I.close()
     I = Index(path=self.tempdir, settings=LuceneSettings())
     sort = Sort(SortField("name", SortField.Type.STRING, True))
     C = TopFieldSuperCollector(sort, 2, True, False, True)
     Q = MatchAllDocsQuery()
     I.search(Q, None, C)
     td = C.topDocs(0)
     self.assertEquals(3, C.getTotalHits())
     self.assertEquals(3, td.totalHits)
     self.assertEquals(2, len(td.scoreDocs))
     self.assertEquals(
         ['2', '3'],
         [I.getDocument(s.doc).get("__id__") for s in td.scoreDocs])
Example #15
def run(searcher_good, searcher_bad, analyzer):
    while True:
        command = raw_input('Query: ')  # assumed input source; empty line exits
        if command == '':
            return
        command_dict = parseCommand(command)
        total_num = 20

        # Sort keys, one per line: price (ascending), popularity (total
        # comment count), positive-review rate, overall score. "socre" (sic)
        # is the field name as stored in the index.
        #s = SortField("price", SortField.Type.FLOAT, False)
        #s = SortField("total_comment", SortField.Type.FLOAT, True)
        s = SortField("good_rate", SortField.Type.FLOAT, True)
        #s = SortField("socre", SortField.Type.FLOAT, True)
        so = Sort(s)

        querys = BooleanQuery()
        for k, v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)

        # These two (commented) lines would restrict the price range:
        #q = NumericRangeQuery.newFloatRange("price", 100.0, 200.0, True, True)
        #querys.add(q, BooleanClause.Occur.MUST)

        scoreDocs_good = searcher_good.search(querys, total_num, so).scoreDocs
        total = len(scoreDocs_good)
        flag = True
        if len(scoreDocs_good) < total_num:
            scoreDocs_bad = searcher_bad.search(querys, total_num,
                                                so).scoreDocs
            total = total + len(scoreDocs_bad)
            flag = False
        if total > total_num:
            total = total_num
        print "%s total matching documents." % total

        #"url"是网址,“img_url”是图片网址,“brand”是品牌
        for scoreDoc_good in scoreDocs_good:
            doc = searcher_good.doc(scoreDoc_good.doc)
            # explanation = searcher_good.explain(querys, scoreDoc_good.doc)
            print "------------------------"
            print 'title:', doc.get('title')
            print 'total_comment', doc.get("total_comment")
            print 'price', doc.get("price")
            print 'socre', doc.get("socre")
            print 'brand', doc.get("brand")
            print 'good_rate', doc.get("good_rate")
            print
        if not flag:
            t = 0
            for scoreDoc_bad in scoreDocs_bad:
                t = t + 1
                doc = searcher_bad.doc(scoreDoc_bad.doc)
                # explanation = searcher_bad.explain(querys, scoreDoc_bad.doc)
                print "------------------------"
                print 'title:', doc.get('title')
                print 'total_comment', doc.get("total_comment")
                print 'price', doc.get("price")
                print 'score', doc.get("score")
                print 'brand', doc.get("brand")
                print 'good_rate', doc.get("good_rate")
                print
                if t > total_num - 1 - len(scoreDocs_good):
                    break
Example #16
        def get_query_result(sarc, dct):
            PAGE_SIZE = 10
            PAGE_JUMP = 10

            query = BooleanQuery()
            query.add(TermQuery(Term(zh_pganlz.LTPF_TYPE, '1')),
                      BooleanClause.Occur.MUST)
            page = 0
            sort_lists = []
            summ_set = set()
            exclus_set = None
            words = []
            for k, v in dct.items():
                if k in ('index', 'type', 'tag_indices', 'author_index'):
                    query.add(build_anyterm_query(k, dct[k]),
                              BooleanClause.Occur.MUST)
                elif k in ('text', 'contents', 'title', 'description',
                           'alias'):
                    words += jieba.lcut(v)
                    query.add(
                        build_text_query(k + zh_pganlz.LTPF_FOR_QUERY, dct[k]),
                        BooleanClause.Occur.MUST)

                elif k == 'raw':
                    query.add(
                        QueryParser('index',
                                    WhitespaceAnalyzer()).parse(dct[k]),
                        BooleanClause.Occur.MUST)
                elif k == 'enhraw':
                    # Entries arrive in groups of three: a verbatim term, a
                    # prefix, then text whose jieba tokens each receive that
                    # prefix.
                    x = 0
                    reslst = []
                    for entry in v:
                        if x == 2:
                            reslst += [
                                lastdoc + tok.encode('utf8')
                                for tok in jieba.cut(entry)
                            ]
                            x = 0
                        else:
                            if x == 0:
                                reslst.append(entry.encode('utf8'))
                            else:
                                lastdoc = entry.encode('utf8')
                            x += 1
                    query.add(
                        QueryParser('index', WhitespaceAnalyzer()).parse(
                            ' '.join(reslst)), BooleanClause.Occur.MUST)

                elif k == 'page':
                    page = int(dct[k])
                elif k == 'sort':
                    for x in dct['sort']:
                        sort_type = SortField.Type.STRING
                        if 'type' in x.keys():
                            if x['type'] == 'int':
                                sort_type = SortField.Type.INT
                            elif x['type'] == 'float':
                                sort_type = SortField.Type.FLOAT
                        reverse = False
                        if 'reverse' in x.keys():
                            reverse = x['reverse']
                        sort_lists.append(
                            SortField(x['key'], sort_type, reverse))

                elif k == 'summarize':
                    summ_set = set(v)
                elif k == 'exclusive':
                    exclus_set = set(v)

            ressrt = Sort(*sort_lists)
            resdocs = sarc.searcher.search(query, PAGE_SIZE, ressrt)
            if page > 0:
                # Advance with searchAfter cursors, jumping PAGE_JUMP pages
                # at a time until the requested page is reached.
                if resdocs.totalHits > page * PAGE_SIZE:
                    page -= 1
                    while page > PAGE_JUMP:
                        resdocs = sarc.searcher.searchAfter(
                            resdocs.scoreDocs[-1], query,
                            PAGE_SIZE * PAGE_JUMP, ressrt)
                        page -= PAGE_JUMP
                    if page > 0:
                        resdocs = sarc.searcher.searchAfter(
                            resdocs.scoreDocs[-1], query, PAGE_SIZE * page,
                            ressrt)
                    resdocs = sarc.searcher.searchAfter(
                        resdocs.scoreDocs[-1], query, PAGE_SIZE, ressrt)
                else:
                    resdocs.scoreDocs = []
            reslst = []
            for x in resdocs.scoreDocs:
                dictobj = zh_pganlz.obj_to_json(
                    zh_pganlz.document_to_obj(sarc.searcher.doc(x.doc)))
                if 'additional' in dct.keys():
                    adres = []
                    for spec in dct['additional']:
                        if isinstance(dictobj[spec['sourcefield']], list):
                            qlist = dictobj[spec['sourcefield']]
                        else:
                            qlist = [dictobj[spec['sourcefield']]]
                        cres = []
                        for qword in qlist:
                            if not isinstance(qword, (unicode, str)):
                                qword = str(qword)
                            searchres = sarc.searcher.search(
                                zh_iatd.create_query({
                                    'type': spec['type'],
                                    spec['targetfield']: qword
                                }), 1)
                            if searchres.totalHits > 1:
                                print spec, 'FOUND', searchres
                            elif searchres.totalHits == 0:
                                cres.append(None)
                            else:
                                cres.append(
                                    zh_pganlz.obj_to_json(
                                        zh_pganlz.document_to_obj(
                                            sarc.searcher.doc(
                                                searchres.scoreDocs[0].doc))))
                        adres.append(cres)
                for k, v in dictobj.items():
                    if k in summ_set:
                        dictobj[k + '_summary'] = summarize(
                            hyper_text(v).text, list(set(words)))
                if exclus_set is not None:
                    for k in dictobj.keys():
                        if k not in exclus_set:
                            del dictobj[k]
                if 'additional' in dct.keys():
                    dictobj['additional'] = adres
                reslst.append(dictobj)
            return {'total': resdocs.totalHits, 'data': reslst}
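A hedged example of the request dict this helper accepts; the keys map one-to-one onto the branches above, and all values are hypothetical:

req = {
    'type': 'answer',                  # becomes a MUST term clause
    'text': u'machine learning',       # segmented with jieba, then queried
    'sort': [{'key': 'likes', 'type': 'int', 'reverse': True}],
    'page': 2,                         # pages of PAGE_SIZE hits to skip
    'exclusive': ['index', 'title', 'text'],
}
# result = get_query_result(sarc, req)  # sarc: the searcher wrapper, assumed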
Example #17
def Run_Price(searcher_good, searcher_bad, analyzer, command, brand):
    while True:
        command_dict, low, high = parseCommand(command, brand)
        total_num = 20

        # Sort by price, ascending; the commented lines are alternative keys.
        s = SortField("price", SortField.Type.FLOAT, False)
        #s = SortField("total_comment", SortField.Type.FLOAT, True)
        #s = SortField("good_rate", SortField.Type.FLOAT, True)
        #s = SortField("socre", SortField.Type.FLOAT, True)
        so = Sort(s)
        querys = BooleanQuery()
        for k, v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)

        # Restrict results to the requested price range.
        q = NumericRangeQuery.newFloatRange("price", low, high, True, True)
        querys.add(q, BooleanClause.Occur.MUST)

        scoreDocs_good = searcher_good.search(querys, total_num, so).scoreDocs
        total = len(scoreDocs_good)
        flag = True
        if len(scoreDocs_good) < total_num:
            scoreDocs_bad = searcher_bad.search(querys, total_num,
                                                so).scoreDocs
            total = total + len(scoreDocs_bad)
            flag = False
        if total > total_num:
            total = total_num
        # total is the number of matched listings (capped at total_num).
        res = []
        for scoreDoc_good in scoreDocs_good:
            unit = []
            doc = searcher_good.doc(scoreDoc_good.doc)
            title = doc.get('title')
            title = title.replace(' ', '')  # replace() returns a new string
            title = title[:18]
            total_comment = doc.get("total_comment")
            price = doc.get("price")
            socre = doc.get("socre")
            brand = doc.get("brand")
            good_rate = doc.get("good_rate")
            url = doc.get("url")
            img_url = doc.get("img_url")
            comment = doc.get("comment").split()
            unit.append(title)  #0
            unit.append(total_comment)  #1
            unit.append(price)  #2
            unit.append(socre)  #3
            unit.append(brand)  #4
            unit.append(good_rate)  #5
            unit.append(url)  #6
            unit.append(img_url)  #7
            unit.append(comment)  #8
            res.append(unit)
        if not flag:
            t = 0
            for scoreDoc_bad in scoreDocs_bad:
                t = t + 1
                unit = []  # start a fresh record for each hit
                doc = searcher_bad.doc(scoreDoc_bad.doc)
                # explanation = searcher_bad.explain(querys, scoreDoc_bad.doc)
                title = doc.get('title')
                title = title.replace(' ', '')
                title = title[:18]
                total_comment = doc.get("total_comment")
                price = doc.get("price")
                socre = doc.get("socre")
                brand = doc.get("brand")
                good_rate = doc.get("good_rate")
                url = doc.get("url")
                img_url = doc.get("img_url")
                comment = doc.get("comment").split()
                unit.append(title)
                unit.append(total_comment)
                unit.append(price)
                unit.append(socre)
                unit.append(brand)
                unit.append(good_rate)
                unit.append(url)
                unit.append(img_url)
                unit.append(comment)
                res.append(unit)
                if t > total_num - 1 - len(scoreDocs_good):
                    break
        res.append(brand)
        return res
Example #18
 def _sortField(self, fieldname, sortDescending):
     result = SortField(fieldname, SortField.Type.STRING, sortDescending)
     # Documents missing the field always sort last, whichever direction
     # the caller asked for.
     result.setMissingValue(SortField.STRING_FIRST if sortDescending else SortField.STRING_LAST)
     return result
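A hedged sketch of how such a field slots into a Sort inside the same (assumed) class; 'title' is a hypothetical field name:

# Missing-value handling keeps records without the field at the bottom,
# whichever direction the caller asked for; relevance breaks ties.
sort = Sort([self._sortField('title', sortDescending=False),
             SortField.FIELD_SCORE])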
Example #19
def func_pr(name, low, high):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_tb_new"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    query = QueryParser(Version.LUCENE_CURRENT, "name", analyzer).parse(name)

    scoreDocs = searcher.search(
        query, 1000,
        Sort([
            SortField.FIELD_SCORE,
            SortField("price", SortField.Type.DOUBLE, False)
        ])).scoreDocs

    results = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        if float(doc.get('price')) >= float(low) and float(
                doc.get('price')) < float(high):
            shop = doc.get('shop')
            name = doc.get('name')
            img = doc.get('img')
            url = doc.get('url')
            post = doc.get('post')
            sales = doc.get('sales')
            comments = doc.get('comments')
            place = doc.get('place')
            price = doc.get('price')

            data = {}
            data.setdefault('name', name)
            data.setdefault('url', url)
            data.setdefault('price', price)
            data.setdefault('post', post)
            data.setdefault('sales', sales)
            data.setdefault('comments', comments)
            data.setdefault('place', place)
            data.setdefault('shop', shop)
            data.setdefault('img', img)

            data.setdefault('xssd_name', doc.get('xssd_name'))
            data.setdefault('perfumer', doc.get('perfumer'))
            data.setdefault('tune', doc.get('tune'))
            data.setdefault('xssd_url', doc.get('xssd_url'))
            data.setdefault('brand', doc.get('brand'))
            data.setdefault('rate', float(doc.get('rate')))
            data.setdefault('xssd_comments', doc.get('comment'))
            if doc.get('former') is not None:
                former = doc.get('former')
                mid = doc.get('mid')
                last = doc.get('last')
                data.setdefault('former', former)
                data.setdefault('mid', mid)
                data.setdefault('last', last)
                scents = former + ' ' + mid + ' ' + last
                data.setdefault('scents', scents)
            else:
                data.setdefault('scents', doc.get('scents'))
            dump(data)
            results.append(data)
            print data
    return results
Example #20
def index_images_until_stop(session, handler, lbound):
    global _stop, _stopped, _vm

    _vm.attachCurrentThread()
    searcher = IndexSearcher(
        DirectoryReader.open(SimpleFSDirectory(File(TASK_FOLDER))))
    query = BooleanQuery()
    # A purely negative boolean query matches nothing in Lucene, so the
    # MUST_NOT clause is paired with MatchAllDocsQuery to mean "all docs
    # whose finish_time is not 0".
    query.add(TermQuery(Term('finish_time', '0')),
              BooleanClause.Occur.MUST_NOT)
    query.add(MatchAllDocsQuery(), BooleanClause.Occur.MUST)
    if lbound is not None:
        query.add(
            TermRangeQuery.newStringRange('finish_time', lbound, '9999999999',
                                          False, True),
            BooleanClause.Occur.MUST)
    sort = Sort(SortField('finish_time', SortField.Type.INT))
    tmpbk = None
    res = searcher.search(query, 100, sort)
    answer_content_searcher = zh_iatd.create_searcher()
    logger = external_console_logger('/tmp/zh_imgc_info')
    while not _stop:
        print 'got', len(res.scoreDocs), 'docs'
        for x in res.scoreDocs:
            try:
                imgsgot = 0
                realdoc = searcher.doc(x.doc)
                doctype = realdoc['func_name']
                objid = realdoc['id']
                logger.write(' ft:{0}'.format(realdoc['finish_time']))
                if doctype == 'user_data':
                    soup = bs4.BeautifulSoup(
                        session.opener.open(
                            urllib2.Request(
                                url='https://www.zhihu.com/people/{0}'.format(
                                    objid))), HTML_PARSER)
                    cover = soup.select(
                        '#ProfileHeader .ProfileHeader-userCover img')
                    if len(cover) > 0:
                        cover_img = cover[0]['src']
                        imgsgot += 1
                        handler(cover_img, ZH_IMGTYPE_USERINFO_COVER, objid)
                    avatar_img = soup.select(
                        '#ProfileHeader .ProfileHeader-main .UserAvatar img'
                    )[0]['src']
                    imgsgot += 1
                    handler(avatar_img, ZH_IMGTYPE_USER_AVATAR, objid)
                elif doctype == 'article_data':
                    jsondata = session.get_article_content_raw(objid)
                    if 'titleImage' in jsondata.keys():
                        cover_img = jsondata['titleImage']
                        if len(cover_img) > 0:
                            imgsgot += 1
                            handler(cover_img, ZH_IMGTYPE_ARTICLE_COVER, objid)
                    soup = bs4.BeautifulSoup(jsondata['content'], HTML_PARSER)
                    for img in soup.select('img'):
                        imgsgot += 1
                        handler(img['src'], ZH_IMGTYPE_IN_ARTICLE, objid)
                elif doctype == 'topic_data':
                    soup = bs4.BeautifulSoup(
                        session.opener.open(
                            urllib2.Request(
                                url='https://www.zhihu.com/topic/{0}/hot'.
                                format(objid))), HTML_PARSER)
                    topic_img = soup.select(
                        '.zu-main-content .topic-avatar .zm-entry-head-avatar-link img'
                    )[0]['src']
                    imgsgot += 1
                    handler(topic_img, ZH_IMGTYPE_TOPIC_ICON, objid)
                elif doctype == 'answer_comments' and realdoc['start'] == '0':
                    obj, q = zh_iatd.query_object(answer_content_searcher,
                                                  objid, zh_pganlz.answer)
                    for img in obj.data.text.as_soup().select('img'):
                        imgsgot += 1
                        handler(img['src'], ZH_IMGTYPE_IN_ANSWER, objid)
                elif doctype == 'question_data':
                    soup = bs4.BeautifulSoup(
                        session.opener.open(
                            urllib2.Request(
                                url='https://www.zhihu.com/question/{0}'.
                                format(objid))), HTML_PARSER)
                    for img in soup.select('#zh-question-detail img'):
                        imgsgot += 1
                        handler(img['src'], ZH_IMGTYPE_IN_QUESTION, objid)
                else:
                    logger.write('\n')
                    continue
                logger.write(' ({0}, +{1})\n'.format(doctype, imgsgot))
                if _stop:
                    break
                time.sleep(3)
            except Exception as e:
                logger.write('\n## ERROR ################################\n')
                logger.write(traceback.format_exc())
        if len(res.scoreDocs) > 0:
            tmpbk = res.scoreDocs[-1]
        # Resume after the last document seen; when nothing new has arrived
        # this re-issues the same cursor.
        res = searcher.searchAfter(tmpbk, query, 100, sort)
    print 'stopped'
    _stopped = True
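The loop above is a cursor-based polling pattern worth isolating. A minimal sketch under the same Lucene 4.x API, with a hypothetical process() callback:

import time

def poll_index(searcher, query, sort, process, batch=100):
    # Walk all current matches with searchAfter, then keep re-issuing the
    # last cursor so newly indexed documents are picked up.
    last = None
    while True:
        if last is None:
            res = searcher.search(query, batch, sort)
        else:
            res = searcher.searchAfter(last, query, batch, sort)
        for sd in res.scoreDocs:
            process(searcher.doc(sd.doc))
        if res.scoreDocs:
            last = res.scoreDocs[-1]
        time.sleep(3)  # back off between polls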