def POST(self):
    global _vm, _session
    _vm.attachCurrentThread()
    user_data = json.loads(web.input()['data'])
    print user_data
    tgtype = user_data['type']
    if tgtype == 'user':
        tfield = 'author_index'
    elif tgtype == 'question':
        tfield = 'question_index'
    else:
        return ''
    with zh_iatd.create_searcher() as searcher:
        # top 200 answers by likes (for the histogram) ...
        res1 = searcher.searcher.search(
            zh_iatd.create_query({'type': 'answer', tfield: user_data['index']}),
            200, Sort(SortField('likes', SortField.Type.INT, True)))
        # ... and the 200 most recent answers (for the likes-over-time graph)
        res2 = searcher.searcher.search(
            zh_iatd.create_query({'type': 'answer', tfield: user_data['index']}),
            200, Sort(SortField('date', SortField.Type.INT, True)))
        res1 = [zh_pganlz.document_to_obj(searcher.searcher.doc(x.doc)).data.likes
                for x in res1.scoreDocs]
        res2 = [zh_pganlz.document_to_obj(searcher.searcher.doc(x.doc))
                for x in res2.scoreDocs]
        res2 = [{'x': x.data.date, 'y': x.data.likes} for x in res2]
    return json.dumps({'histogram': res1, 'graph': res2})
def search_kw(kw, mode):
    vm_env.attachCurrentThread()
    lists = []
    l = jieba.cut(kw)
    query = BooleanQuery()
    for i in l:
        ii = QueryParser(Version.LUCENE_CURRENT, "introduction", analyzer).parse(i)
        query.add(ii, BooleanClause.Occur.MUST)
    if mode:
        # sort by the stored score field, descending
        sf = SortField("score", SortField.Type.STRING, True)
    else:
        # sort by comment count, descending
        sf = SortField("comments", SortField.Type.FLOAT, True)
    s = Sort(sf)
    scoreDocs = searcher1.search(query, 20, s).scoreDocs
    for scoreDoc in scoreDocs:
        movie = []
        doc = searcher1.doc(scoreDoc.doc)
        movie.append(doc.get("url"))
        movie.append(doc.get("picture"))
        movie.append(doc.get("title"))
        movie.append(doc.get("score"))
        movie.append(doc.get("genre"))
        movie.append(doc.get("stars"))
        movie.append(doc.get("comments"))
        lists.append(movie)
    return lists
def sales_sort(query):
    scoreDocs = searcher.search(
        query, 8,
        Sort([SortField.FIELD_SCORE,
              SortField("sales", SortField.Type.INT, True),
              SortField("price", SortField.Type.DOUBLE, False)])).scoreDocs
    return scoreDocs
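# Hedged usage sketch for sales_sort (not from the original source): assumes the
# module-level `searcher` and `analyzer` used above and a hypothetical keyword.
# Hits come back ranked by relevance first, then sales descending, then price
# ascending.
def demo_sales_sort(keyword):
    query = QueryParser(Version.LUCENE_CURRENT, "name", analyzer).parse(keyword)
    for sd in sales_sort(query):
        doc = searcher.doc(sd.doc)
        print doc.get('name'), doc.get('sales'), doc.get('price')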
def name_price_search(name, low, high):
    # The original parsed an undefined `command`; `name` is the intended argument.
    query = QueryParser(Version.LUCENE_CURRENT, "name", analyzer).parse(name)
    scoreDocs = searcher.search(
        query, 5000,
        Sort([SortField.FIELD_SCORE,
              SortField("price", SortField.Type.DOUBLE, False)])).scoreDocs
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        if low <= float(doc.get('price')) < high:
            print '-' * 100
            print 'name:', doc.get('name')
            print doc.get('rate') + ' pts (' + doc.get('comments') + ' ratings)'
            if doc.get('post') == '0':  # stored fields are strings; '0' means free shipping
                print '¥' + doc.get('price') + '\t' + 'free shipping'
            else:
                print '¥' + doc.get('price')
            print doc.get('sales'), 'buyers'
            if doc.get('perfumer') is not None:
                print 'perfumer:', doc.get('perfumer')
            if doc.get('tune') is not None:
                print 'tune:', doc.get('tune')
            if doc.get('scents') == '':
                continue
            if doc.get('former') is not None:
                print 'former:', doc.get('former')
                print 'mid:', doc.get('mid')
                print 'last:', doc.get('last')
            else:
                print 'scents:', doc.get('scents')
def testFilteredQuery(self):
    filteredquery = FilteredQuery(self.query, self.filter)
    topDocs = self.searcher.search(filteredquery, 50)
    self.assertEqual(1, topDocs.totalHits)
    self.assertEqual(1, topDocs.scoreDocs[0].doc)

    topDocs = self.searcher.search(filteredquery, None, 50,
                                   Sort(SortField("sorter", SortField.Type.STRING)))
    self.assertEqual(1, topDocs.totalHits)
    self.assertEqual(1, topDocs.scoreDocs[0].doc)

    filteredquery = FilteredQuery(TermQuery(Term("field", "one")), self.filter)
    topDocs = self.searcher.search(filteredquery, 50)
    self.assertEqual(2, topDocs.totalHits)

    filteredquery = FilteredQuery(TermQuery(Term("field", "x")), self.filter)
    topDocs = self.searcher.search(filteredquery, 50)
    self.assertEqual(1, topDocs.totalHits)
    self.assertEqual(3, topDocs.scoreDocs[0].doc)

    filteredquery = FilteredQuery(TermQuery(Term("field", "y")), self.filter)
    topDocs = self.searcher.search(filteredquery, 50)
    self.assertEqual(0, topDocs.totalHits)
def func1(genre, year):
    vm_env.attachCurrentThread()
    lists = []
    query = BooleanQuery()
    if genre != "111":  # "111" is the sentinel for "any genre"
        item = QueryParser(Version.LUCENE_CURRENT, "genre", analyzer).parse(genre)
        query.add(item, BooleanClause.Occur.MUST)
    if year != "111":  # likewise for "any year"
        item = QueryParser(Version.LUCENE_CURRENT, "year", analyzer).parse(year)
        query.add(item, BooleanClause.Occur.MUST)
    sf = SortField("score", SortField.Type.STRING, True)
    s = Sort(sf)
    scoreDocs = searcher1.search(query, 20, s).scoreDocs
    for scoreDoc in scoreDocs:
        movie = []
        doc = searcher1.doc(scoreDoc.doc)
        movie.append(doc.get("url"))
        movie.append(doc.get("picture"))
        movie.append(doc.get("title"))
        movie.append(doc.get("score"))
        movie.append(doc.get("genre"))
        movie.append(doc.get("stars"))
        movie.append(doc.get("comments"))
        lists.append(movie)
    return lists
def rate_sort(query):
    scoreDocs = searcher.search(
        query, 8,
        Sort([SortField.FIELD_SCORE,
              SortField("rate", SortField.Type.DOUBLE, True)])).scoreDocs
    return scoreDocs
def func_ns(command):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_tb_new"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    query = QueryParser(Version.LUCENE_CURRENT, "name", analyzer).parse(command)
    scoreDocs = searcher.search(
        query, 50,
        Sort([SortField("sales", SortField.Type.INT, True),
              SortField.FIELD_SCORE,
              SortField("price", SortField.Type.DOUBLE, False)])).scoreDocs
    results = process(scoreDocs, searcher)
    return results
def getLucene(path):
    directory = FSDirectory.open(Paths.get(path))
    analyzer = WhitespaceAnalyzer()
    config = IndexWriterConfig(analyzer)
    # keep segments sorted by the stamp field at index time
    config.setIndexSort(
        Sort(SortField(NUMERIC_STAMP_FIELD, SortField.Type.LONG)))
    writer = IndexWriter(directory, config)
    reader = writer.getReader()
    searcher = IndexSearcher(reader)
    return writer, reader, searcher
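# Hedged sketch (assumed field names, not from the original source): an index
# configured with setIndexSort can only sort on doc-values fields, so documents
# written through getLucene's writer need NUMERIC_STAMP_FIELD indexed as doc
# values; a parallel StoredField keeps the stamp retrievable.
def addStampedDocument(writer, identifier, stamp):
    doc = Document()
    doc.add(StringField("identifier", identifier, Field.Store.YES))
    doc.add(NumericDocValuesField(NUMERIC_STAMP_FIELD, long(stamp)))  # sortable
    doc.add(StoredField(NUMERIC_STAMP_FIELD, long(stamp)))            # retrievable
    writer.addDocument(doc)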
def search():
    boolean_query = getQueryBuiler()
    final_query = boolean_query.build()
    if args.sort:
        sort = Sort(SortField(args.sort, SortField.Type.STRING,
                              args.sortDirection == 'desc'))
        results = searcher.search(final_query, args.ndocs, sort)
    else:
        results = searcher.search(final_query, args.ndocs)
    return results
def getLastStampId(self, prefix='oai_dc', setSpec=None):
    searcher = self._getSearcher()
    # newest stamp first, so the top hit carries the maximum stamp
    sort = Sort(SortField(NUMERIC_STAMP_FIELD, SortField.Type.LONG, True))
    if prefix is None and setSpec is None:
        query = MatchAllDocsQuery()
    elif prefix is None:
        query = TermQuery(Term(SETS_FIELD, setSpec))
    else:
        query = TermQuery(Term(PREFIX_FIELD, prefix))
    results = searcher.search(query, 1, sort)
    if results.totalHits.value < 1:
        return None
    return _stampFromDocument(searcher.doc(results.scoreDocs[0].doc))
def printAggregationSearch(field_name):
    print('Values for field "{}".'.format(field_name))
    groupingSearch = GroupingSearch(field_name)
    groupingSearch.setAllGroups(True)
    if args.sort != 'agg':
        sort = Sort(SortField(field_name, SortField.Type.STRING))
        groupingSearch.setGroupSort(sort)
    query = getQueryBuiler().build()
    result = groupingSearch.search(searcher, query, args.offset, 2500)  # args.ndocs
    totalGroupCount = result.totalGroupCount
    print('Total groups count: {} Total docs count {}'.format(
        totalGroupCount, result.totalHitCount))
    aggGroups = []
    groups = result.groups
    for i in range(len(groups)):
        # group values come back as a BytesRef string like "[68 6f 67]";
        # decode the hex char codes back into a UTF-8 string ('' maps to '-')
        charCodes = groups[i].groupValue.toString()[1:-1].split(' ')
        for j in range(len(charCodes)):
            if charCodes[j] == '':
                charCodes[j] = 45  # ASCII '-'
            else:
                charCodes[j] = int(charCodes[j], 16)
        groupName = str(bytearray(charCodes), 'utf-8')
        aggGroups.append({'name': groupName,
                          'value': int(groups[i].totalHits.value)})
    if args.sort == 'agg':
        aggGroups = sorted(aggGroups, key=lambda k: k['value'], reverse=True)
    table_line = '-' * 27
    print(table_line)
    print('| {:10} | {:10} |'.format('Value', 'Count'))
    print(table_line)
    for i in range(min(args.ndocs, len(aggGroups))):
        print('| {:10} | {:10,d} |'.format(aggGroups[i]['name'],
                                           aggGroups[i]['value']))
    print(table_line)
    print()
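# Hedged sketch of the CLI arguments these helpers appear to assume; the flag
# names are inferred from the `args.*` attributes used above and are not
# confirmed by the original source.
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--sort', default=None)           # field name, or 'agg' for count ordering
parser.add_argument('--sortDirection', choices=('asc', 'desc'), default='asc')
parser.add_argument('--ndocs', type=int, default=10)  # hits to fetch / rows to print
parser.add_argument('--offset', type=int, default=0)  # group offset for GroupingSearch
args = parser.parse_args()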
def __init__(self, index_dir,
             search_fields=['canonical_url', 'title', 'meta', 'content'],
             unique_field='uq_id_str',
             boost=dict(canonical_url=4.0, title=8.0, meta=2.0, content=1.0),
             date_format='%Y-%m-%dT%H:%M:%S'):
    """Constructor of Searcher.

    Parameters
    ----------
    index_dir : string
        The location of the lucene index.
    search_fields : list
        A list of field names indicating which fields to search on.
    unique_field : string
        The field name on which duplicates should be avoided.
    boost : dict
        Per-field weights used when computing the score.
    date_format : string
        Format used to convert strings into datetimes. Should be
        consistent with the indexing side.
    """
    self.index_dir = index_dir
    self.search_fields = search_fields
    self.sort_by_recent = Sort(
        SortField('date_published', SortField.Type.STRING, True))
    self.store = FSDirectory.open(File(index_dir))
    self.reader = DirectoryReader.open(self.store)
    self.isearcher = IndexSearcher(self.reader)
    self.analyzer = StandardAnalyzer()
    self.dup_filter = DuplicateFilter(unique_field)
    self.boost_map = HashMap()
    for k, v in boost.iteritems():
        self.boost_map.put(k, Float(v))
    self.mul_parser = MultiFieldQueryParser(search_fields, self.analyzer,
                                            self.boost_map)
    self.date_format = date_format
def testSearchTopField(self):
    I = Index(path=self.tempdir, settings=LuceneSettings())
    I._indexWriter.addDocument(
        document(__id__='1', name="one", price="aap noot mies"))
    I.commit()
    I._indexWriter.addDocument(
        document(__id__='2', name="two", price="aap vuur boom"))
    I.commit()
    I._indexWriter.addDocument(
        document(__id__='3', name="three", price="noot boom mies"))
    I.commit()
    I.close()
    I = Index(path=self.tempdir, settings=LuceneSettings())
    sort = Sort(SortField("name", SortField.Type.STRING, True))
    C = TopFieldSuperCollector(sort, 2, True, False, True)
    Q = MatchAllDocsQuery()
    I.search(Q, None, C)
    td = C.topDocs(0)
    self.assertEquals(3, C.getTotalHits())
    self.assertEquals(3, td.totalHits)
    self.assertEquals(2, len(td.scoreDocs))
    self.assertEquals(
        ['2', '3'],
        [I.getDocument(s.doc).get("__id__") for s in td.scoreDocs])
def run(searcher_good, searcher_bad, analyzer):
    while True:
        # The original read an undefined `command`; an interactive prompt in the
        # style of the standard PyLucene samples is assumed here.
        command = raw_input('Query: ')
        if command == '':
            return
        command_dict = parseCommand(command)
        total_num = 20
        # These alternative SortFields choose the ordering: price (low to high),
        # popularity (total comment count), positive-review rate, or overall score.
        #s = SortField("price", SortField.Type.FLOAT, False)
        #s = SortField("total_comment", SortField.Type.FLOAT, True)
        s = SortField("good_rate", SortField.Type.FLOAT, True)
        #s = SortField("socre", SortField.Type.FLOAT, True)
        so = Sort(s)
        querys = BooleanQuery()
        for k, v in command_dict.iteritems():
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)
        # These two lines would restrict the price range:
        #q = NumericRangeQuery.newFloatRange("price", 100.0, 200.0, True, True)
        #querys.add(q, BooleanClause.Occur.MUST)
        scoreDocs_good = searcher_good.search(querys, total_num, so).scoreDocs
        total = len(scoreDocs_good)
        flag = True
        if len(scoreDocs_good) < total_num:
            scoreDocs_bad = searcher_bad.search(querys, total_num, so).scoreDocs
            total = total + len(scoreDocs_bad)
            flag = False
        if total > total_num:
            total = total_num
        print "%s total matching documents." % total
        # 'url' is the page URL, 'img_url' the image URL, 'brand' the brand.
        for scoreDoc_good in scoreDocs_good:
            doc = searcher_good.doc(scoreDoc_good.doc)
            print "------------------------"
            print 'title:', doc.get('title')
            print 'total_comment', doc.get("total_comment")
            print 'price', doc.get("price")
            print 'socre', doc.get("socre")  # the index field is spelled "socre"
            print 'brand', doc.get("brand")
            print 'good_rate', doc.get("good_rate")
            print
        if not flag:
            t = 0
            for scoreDoc_bad in scoreDocs_bad:
                t = t + 1
                doc = searcher_bad.doc(scoreDoc_bad.doc)
                print "------------------------"
                print 'title:', doc.get('title')
                print 'total_comment', doc.get("total_comment")
                print 'price', doc.get("price")
                print 'socre', doc.get("socre")  # same field-name spelling as above
                print 'brand', doc.get("brand")
                print 'good_rate', doc.get("good_rate")
                print
                if t > total_num - 1 - len(scoreDocs_good):
                    break
def get_query_result(sarc, dct):
    PAGE_SIZE = 10
    PAGE_JUMP = 10
    query = BooleanQuery()
    query.add(TermQuery(Term(zh_pganlz.LTPF_TYPE, '1')), BooleanClause.Occur.MUST)
    page = 0
    sort_lists = []
    summ_set = set()
    exclus_set = None
    words = []
    for k, v in dct.items():
        if k in ('index', 'type', 'tag_indices', 'author_index'):
            query.add(build_anyterm_query(k, dct[k]), BooleanClause.Occur.MUST)
        elif k in ('text', 'contents', 'title', 'description', 'alias'):
            words += jieba.lcut(v)
            query.add(build_text_query(k + zh_pganlz.LTPF_FOR_QUERY, dct[k]),
                      BooleanClause.Occur.MUST)
        elif k == 'raw':
            query.add(QueryParser('index', WhitespaceAnalyzer()).parse(dct[k]),
                      BooleanClause.Occur.MUST)
        elif k == 'enhraw':
            # entries come in triples: a plain token, a prefix, then text that is
            # re-segmented with jieba and glued onto the prefix
            x = 0
            reslst = []
            for entry in v:
                if x == 2:
                    reslst += [lastdoc + w.encode('utf8') for w in jieba.cut(entry)]
                    x = 0
                else:
                    if x == 0:
                        reslst.append(entry.encode('utf8'))
                    else:
                        lastdoc = entry.encode('utf8')
                    x += 1
            query.add(QueryParser('index', WhitespaceAnalyzer()).parse(' '.join(reslst)),
                      BooleanClause.Occur.MUST)
        elif k == 'page':
            page = int(dct[k])
        elif k == 'sort':
            for sspec in dct['sort']:
                sort_type = SortField.Type.STRING
                if 'type' in sspec.keys():
                    if sspec['type'] == 'int':
                        sort_type = SortField.Type.INT
                    elif sspec['type'] == 'float':
                        sort_type = SortField.Type.FLOAT
                reverse = False
                if 'reverse' in sspec.keys():
                    reverse = sspec['reverse']
                sort_lists.append(SortField(sspec['key'], sort_type, reverse))
        elif k == 'summarize':
            summ_set = set(v)
        elif k == 'exclusive':
            exclus_set = set(v)
    ressrt = Sort(*sort_lists)
    resdocs = sarc.searcher.search(query, PAGE_SIZE, ressrt)
    if page > 0:
        if resdocs.totalHits > page * PAGE_SIZE:
            # page forward with searchAfter in PAGE_JUMP-sized strides
            page -= 1
            while page > PAGE_JUMP:
                resdocs = sarc.searcher.searchAfter(resdocs.scoreDocs[-1], query,
                                                    PAGE_SIZE * PAGE_JUMP, ressrt)
                page -= PAGE_JUMP
            if page > 0:
                resdocs = sarc.searcher.searchAfter(resdocs.scoreDocs[-1], query,
                                                    PAGE_SIZE * page, ressrt)
            resdocs = sarc.searcher.searchAfter(resdocs.scoreDocs[-1], query,
                                                PAGE_SIZE, ressrt)
        else:
            resdocs.scoreDocs = []
    reslst = []
    for x in resdocs.scoreDocs:
        dictobj = zh_pganlz.obj_to_json(
            zh_pganlz.document_to_obj(sarc.searcher.doc(x.doc)))
        if 'additional' in dct.keys():
            adres = []
            # renamed from `x` in the original, which shadowed the hit variable
            for a in dct['additional']:
                if isinstance(dictobj[a['sourcefield']], list):
                    qlist = dictobj[a['sourcefield']]
                else:
                    qlist = [dictobj[a['sourcefield']]]
                cres = []
                for qword in qlist:
                    if not isinstance(qword, (unicode, str)):
                        qword = str(qword)
                    searchres = sarc.searcher.search(
                        zh_iatd.create_query({'type': a['type'],
                                              a['targetfield']: qword}), 1)
                    if searchres.totalHits > 1:
                        print a, 'FOUND', searchres
                    elif searchres.totalHits == 0:
                        cres.append(None)
                    else:
                        cres.append(zh_pganlz.obj_to_json(
                            zh_pganlz.document_to_obj(
                                sarc.searcher.doc(searchres.scoreDocs[0].doc))))
                adres.append(cres)
        for k, v in dictobj.items():
            if k in summ_set:
                dictobj[k + '_summary'] = summarize(hyper_text(v).text,
                                                    list(set(words)))
        if exclus_set is not None:
            for k in dictobj.keys():
                if k not in exclus_set:
                    del dictobj[k]
        if 'additional' in dct.keys():
            dictobj['additional'] = adres
        reslst.append(dictobj)
    return {'total': resdocs.totalHits, 'data': reslst}
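# Minimal sketch of the searchAfter pagination pattern used in get_query_result
# (assumed names, not from the original source): instead of fetching
# page * PAGE_SIZE hits in one call, walk forward from the last ScoreDoc of each
# batch.
def fetch_page(searcher, query, sort, page, page_size=10):
    docs = searcher.search(query, page_size, sort)
    while page > 0 and len(docs.scoreDocs) > 0:
        docs = searcher.searchAfter(docs.scoreDocs[-1], query, page_size, sort)
        page -= 1
    return docs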
def Run_Price(searcher_good, searcher_bad, analyzer, command, brand):
    # The original's enclosing `while True:` always returned on the first pass
    # and is dropped here.
    command_dict, low, high = parseCommand(command, brand)
    total_num = 20
    s = SortField("price", SortField.Type.FLOAT, False)  # price, low to high
    #s = SortField("total_comment", SortField.Type.FLOAT, True)
    #s = SortField("good_rate", SortField.Type.FLOAT, True)
    #s = SortField("socre", SortField.Type.FLOAT, True)
    so = Sort(s)
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    # restrict the price range
    q = NumericRangeQuery.newFloatRange("price", low, high, True, True)
    querys.add(q, BooleanClause.Occur.MUST)
    scoreDocs_good = searcher_good.search(querys, total_num, so).scoreDocs
    total = len(scoreDocs_good)
    flag = True
    if len(scoreDocs_good) < total_num:
        scoreDocs_bad = searcher_bad.search(querys, total_num, so).scoreDocs
        total = total + len(scoreDocs_bad)
        flag = False
    if total > total_num:
        total = total_num  # total is the number of matched listings
    res = []
    for scoreDoc_good in scoreDocs_good:
        unit = []
        doc = searcher_good.doc(scoreDoc_good.doc)
        # the original discarded the result of title.replace(...)
        title = doc.get('title').replace(' ', '')[:18]
        unit.append(title)                       # 0
        unit.append(doc.get("total_comment"))    # 1
        unit.append(doc.get("price"))            # 2
        unit.append(doc.get("socre"))            # 3 (index field is spelled "socre")
        unit.append(doc.get("brand"))            # 4
        unit.append(doc.get("good_rate"))        # 5
        unit.append(doc.get("url"))              # 6
        unit.append(doc.get("img_url"))          # 7
        unit.append(doc.get("comment").split())  # 8
        res.append(unit)
    if not flag:
        t = 0
        for scoreDoc_bad in scoreDocs_bad:
            t = t + 1
            unit = []  # the original reused the previous `unit`, duplicating rows
            doc = searcher_bad.doc(scoreDoc_bad.doc)
            title = doc.get('title').replace(' ', '')[:18]
            unit.append(title)
            unit.append(doc.get("total_comment"))
            unit.append(doc.get("price"))
            unit.append(doc.get("socre"))
            unit.append(doc.get("brand"))
            unit.append(doc.get("good_rate"))
            unit.append(doc.get("url"))
            unit.append(doc.get("img_url"))
            unit.append(doc.get("comment").split())
            res.append(unit)
            if t > total_num - 1 - len(scoreDocs_good):
                break
    # the original rebound `brand` to the last doc's brand before this append;
    # the function argument is appended here
    res.append(brand)
    return res
def _sortField(self, fieldname, sortDescending):
    result = SortField(fieldname, SortField.Type.STRING, sortDescending)
    # make documents that lack the field sort last, regardless of direction
    result.setMissingValue(
        SortField.STRING_FIRST if sortDescending else SortField.STRING_LAST)
    return result
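# Hedged usage sketch for _sortField (assumed field name and searcher, not from
# the original source): without setMissingValue, documents lacking the field
# would float to one end depending on direction; STRING_FIRST/STRING_LAST pins
# them to the back of the results either way.
def demoMissingLast(helper, searcher, query):
    # `helper` is any object exposing _sortField as defined above
    sort = Sort(helper._sortField('title', True))
    return searcher.search(query, 10, sort)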
def func_pr(name, low, high):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_tb_new"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    query = QueryParser(Version.LUCENE_CURRENT, "name", analyzer).parse(name)
    scoreDocs = searcher.search(
        query, 1000,
        Sort([SortField.FIELD_SCORE,
              SortField("price", SortField.Type.DOUBLE, False)])).scoreDocs
    results = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        if float(low) <= float(doc.get('price')) < float(high):
            data = {}
            for field in ('name', 'url', 'price', 'post', 'sales', 'comments',
                          'place', 'shop', 'img', 'xssd_name', 'perfumer',
                          'tune', 'xssd_url', 'brand'):
                data.setdefault(field, doc.get(field))
            data.setdefault('rate', float(doc.get('rate')))
            data.setdefault('xssd_comments', doc.get('comment'))
            if doc.get('former') is not None:
                former = doc.get('former')
                mid = doc.get('mid')
                last = doc.get('last')
                data.setdefault('former', former)
                data.setdefault('mid', mid)
                data.setdefault('last', last)
                data.setdefault('scents', former + ' ' + mid + ' ' + last)
            else:
                data.setdefault('scents', doc.get('scents'))
            dump(data)
            results.append(data)
            print data
    return results
def index_images_until_stop(session, handler, lbound):
    global _stop, _stopped, _vm
    _vm.attachCurrentThread()
    searcher = IndexSearcher(
        DirectoryReader.open(SimpleFSDirectory(File(TASK_FOLDER))))
    query = BooleanQuery()
    query.add(TermQuery(Term('finish_time', '0')), BooleanClause.Occur.MUST_NOT)
    query.add(MatchAllDocsQuery(), BooleanClause.Occur.MUST)
    if lbound is not None:
        query.add(TermRangeQuery.newStringRange('finish_time', lbound,
                                                '9999999999', False, True),
                  BooleanClause.Occur.MUST)
    sort = Sort(SortField('finish_time', SortField.Type.INT))
    tmpbk = None
    res = searcher.search(query, 100, sort)
    answer_content_searcher = zh_iatd.create_searcher()
    logger = external_console_logger('/tmp/zh_imgc_info')
    while not _stop:
        print 'got', len(res.scoreDocs), 'docs'
        for x in res.scoreDocs:
            try:
                imgsgot = 0
                realdoc = searcher.doc(x.doc)
                doctype = realdoc['func_name']
                objid = realdoc['id']
                logger.write(' ft:{0}'.format(realdoc['finish_time']))
                if doctype == 'user_data':
                    soup = bs4.BeautifulSoup(
                        session.opener.open(urllib2.Request(
                            url='https://www.zhihu.com/people/{0}'.format(objid))),
                        HTML_PARSER)
                    cover = soup.select('#ProfileHeader .ProfileHeader-userCover img')
                    if len(cover) > 0:
                        cover_img = cover[0]['src']
                        imgsgot += 1
                        handler(cover_img, ZH_IMGTYPE_USERINFO_COVER, objid)
                    avatar_img = soup.select(
                        '#ProfileHeader .ProfileHeader-main .UserAvatar img')[0]['src']
                    imgsgot += 1
                    handler(avatar_img, ZH_IMGTYPE_USER_AVATAR, objid)
                elif doctype == 'article_data':
                    jsondata = session.get_article_content_raw(objid)
                    if 'titleImage' in jsondata.keys():
                        cover_img = jsondata['titleImage']
                        if len(cover_img) > 0:
                            imgsgot += 1
                            handler(cover_img, ZH_IMGTYPE_ARTICLE_COVER, objid)
                    soup = bs4.BeautifulSoup(jsondata['content'], HTML_PARSER)
                    # inner loop variable renamed from `x` in the original, which
                    # shadowed the ScoreDoc loop variable
                    for img in soup.select('img'):
                        imgsgot += 1
                        handler(img['src'], ZH_IMGTYPE_IN_ARTICLE, objid)
                elif doctype == 'topic_data':
                    soup = bs4.BeautifulSoup(
                        session.opener.open(urllib2.Request(
                            url='https://www.zhihu.com/topic/{0}/hot'.format(objid))),
                        HTML_PARSER)
                    topic_img = soup.select(
                        '.zu-main-content .topic-avatar .zm-entry-head-avatar-link img'
                    )[0]['src']
                    imgsgot += 1
                    handler(topic_img, ZH_IMGTYPE_TOPIC_ICON, objid)
                elif doctype == 'answer_comments' and realdoc['start'] == '0':
                    obj, q = zh_iatd.query_object(answer_content_searcher, objid,
                                                  zh_pganlz.answer)
                    for img in obj.data.text.as_soup().select('img'):
                        imgsgot += 1
                        handler(img['src'], ZH_IMGTYPE_IN_ANSWER, objid)
                elif doctype == 'question_data':
                    soup = bs4.BeautifulSoup(
                        session.opener.open(urllib2.Request(
                            url='https://www.zhihu.com/question/{0}'.format(objid))),
                        HTML_PARSER)
                    for img in soup.select('#zh-question-detail img'):
                        imgsgot += 1
                        handler(img['src'], ZH_IMGTYPE_IN_QUESTION, objid)
                else:
                    logger.write('\n')
                    continue
                logger.write(' ({0}, +{1})\n'.format(doctype, imgsgot))
                if _stop:
                    break
                time.sleep(3)
            except Exception:
                logger.write('\n## ERROR ################################\n')
                logger.write(traceback.format_exc())
        if len(res.scoreDocs) > 0:
            tmpbk = res.scoreDocs[-1]
        # continue paging from the last hit of the previous batch
        res = searcher.searchAfter(tmpbk, query, 100, sort)
    print 'stopped'
    _stopped = True