def process(url, parameter, manager, *args, **kwargs):
    method, gap, js, data = parameter.split(':')
    gap = int(gap)
    batch_id = BATCH_ID['question']
    # Retry the download once; the for/else falls through to False when both
    # attempts come back empty.
    for _ in range(2):
        content = get_zhidao_content(url, method, gap, HEADER, batch_id)
        if content != u'':
            break
        time.sleep(gap)
    else:
        return False
    answer_ids = []
    question_content = generate_question_json(content, answer_ids)
    if question_content is None:
        return False
    m = Cache(BATCH_ID['json'])
    flag = m.post(url, question_content)
    if not flag:
        flag = m.post(url, question_content)
        if not flag:
            return flag
    answer_urls = []
    qid = re.search(r'http://zhidao.baidu.com/question/(\d+)\.html',
                    url).group(1)
    for answer_id in answer_ids[:3]:
        answer_urls.append(get_answer_url(qid, answer_id))
    manager.put_urls_enqueue(BATCH_ID['answer'], answer_urls)
    return flag
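# A minimal, hypothetical sketch of invoking the question-page process() above.
# The "method:gap:js:data" layout of the parameter string is inferred from the
# split(':') call, and FakeManager is a stand-in for whatever queue manager the
# project actually passes in; neither is confirmed by this snippet.
class FakeManager(object):
    def put_urls_enqueue(self, batch_id, urls):
        # Just show what would be scheduled for the answer batch.
        print batch_id, urls

if __name__ == '__main__':
    demo_url = 'http://zhidao.baidu.com/question/123456.html'  # placeholder qid
    process(demo_url, 'get:3::', FakeManager())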
def __init__(self, config):
    print config, "-----"
    self.config = config
    self.counter = collections.Counter()
    self.cache = Cache(self.config["batch_ids"]["json"],
                       self.config["cache_server"])
    self.downloader = DownloadWrapper(self.config["cache_server"],
                                      self.config["http_headers"])
def process(url, batch_id, parameter, *args, **kwargs):
    if not hasattr(process, '_downloader'):
        domain_name = Downloader.url2domain(url)  # e.g. {'Host': 'zhidao.baidu.com'}
        headers = {"Host": domain_name}
        setattr(process, '_downloader', DownloadWrapper(CACHE_SERVER, headers))
    if not hasattr(process, '_cache'):
        setattr(process, '_cache',
                Cache(batch_id.split('-', 1)[0] + '-json', CACHE_SERVER))
    method, gap, js, timeout, data = parameter.split(':')
    gap = int(gap)
    timeout = int(timeout)
    content = process._downloader.downloader_wrapper(url, batch_id, gap,
                                                     timeout=timeout)
    if content is False:
        return False
    if kwargs and kwargs.get("debug"):
        print(len(content), "\n", content[:1000])
    return True
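# A minimal sketch of calling this process() variant. Its parameter string is
# unpacked as "method:gap:js:timeout:data" (five colon-separated fields), and
# the cache batch is derived from the prefix of batch_id plus "-json". The
# batch_id value below is a placeholder, and CACHE_SERVER must already be
# defined in the module for this to run.
if __name__ == '__main__':
    done = process('http://zhidao.baidu.com/question/123456.html',  # placeholder qid
                   'zhidao-question-test',   # hypothetical batch id -> "zhidao-json" cache
                   'get:3:0:10:',            # method:gap:js:timeout:data
                   debug=True)
    print done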
def process(url, parameter, *args, **kwargs):
    method, gap, js, data = parameter.split(':')
    gap = int(gap)
    content = get_zhidao_content(url, method, gap, HEADER, BATCH_ID['answer'])
    if content == u'':
        time.sleep(gap)
        content = get_zhidao_content(url, method, gap, HEADER,
                                     BATCH_ID['answer'])
        if content == u'':
            return False
    ans_content = generate_answer_json(content)
    if ans_content is None:
        return False
    m = Cache(BATCH_ID['json'])
    flag = m.post(url, ans_content)
    if not flag:
        flag = m.post(url, ans_content)
    return flag
class ZhidaoPrefetch(object):
    def __init__(self, config):
        print config, "-----"
        self.config = config
        self.counter = collections.Counter()
        self.cache = Cache(self.config["batch_ids"]["json"],
                           self.config["cache_server"])
        self.downloader = DownloadWrapper(self.config["cache_server"],
                                          self.config["http_headers"])

    def is_debug(self):
        return self.config.get("debug", False)

    def zhidao_results(self, qids):
        q_jsons = []
        for qid in qids:
            q_json = self.zhidao_question(qid)
            if q_json is False:
                continue
            q_json["list_answers"] = []
            for rid in q_json["answer_ids"][:3]:
                a_json = self.zhidao_answer(qid, rid)
                if a_json is False:
                    continue
                q_json["list_answers"].append(a_json)
            q_jsons.append(q_json)
        return q_jsons

    def zhidao_question(self, qid):
        question_url = "http://zhidao.baidu.com/question/{}.html".format(qid)
        if self.is_debug():
            print question_url
        ret = self.downloader.downloader_wrapper(
            question_url,
            self.config["batch_ids"]["question"],
            self.config["crawler"]["gap"],
            timeout=self.config["crawler"]["timeout"],
            encoding=self.config["crawler"]["encoding"])
        if ret is False:
            return False
        q_json = generate_question_json(qid, ret)
        if q_json is None or q_json == {}:
            return False
        success = self.cache.post(question_url, q_json)
        return q_json

    def zhidao_answer(self, qid, rid):
        answer_url = ("http://zhidao.baidu.com/question/api/mini?qid={}"
                      "&rid={}&tag=timeliness".format(qid, rid))
        # print self.config["crawler"]
        if self.is_debug():
            print answer_url
        ret = self.downloader.downloader_wrapper(
            answer_url,
            self.config["batch_ids"]["answer"],
            self.config["crawler"]["gap"],
            timeout=self.config["crawler"]["timeout"],
            encoding=self.config["crawler"]["encoding"])
        if ret is False:
            return False
        try:
            a_json = generate_answer_json(ret)
        except:
            return False
        success = self.cache.post(answer_url, a_json)
        return a_json

    def zhidao_search(self, query, page_number=None, start_result_index=0):
        if isinstance(query, unicode):
            query = query.encode("utf-8")
        if page_number is None or page_number == 0:
            query_url = "http://zhidao.baidu.com/search/?word={}".format(
                urllib.quote(query))
        else:
            query_url = "http://zhidao.baidu.com/search/?pn={}&word={}".format(
                page_number * 10, urllib.quote(query))
        if self.is_debug():
            print query_url
        # query_url = "http://zhidao.baidu.com/search?word={}".format(quote_word)
        # print query
        # print query_url
        ret = self.downloader.downloader_wrapper(
            query_url,
            self.config["batch_ids"]["search"],
            self.config["crawler"]["gap"],
            timeout=self.config["crawler"]["timeout"],
            encoding=self.config["crawler"]["encoding"],
            refresh=False)
        # resp.headers: "content-type": "text/html;charset=UTF-8",
        # resp.content: <meta content="application/xhtml+xml; charset=utf-8" http-equiv="content-type"/>
        if ret is False:
            return False
        else:
            return parse_search_json_v0615(
                ret, start_result_index=start_result_index)

    def run_query(self, query, max_page):
        self.counter["query"] += 1
        qids_select = set()
        result_all = []
        for page_number in range(max_page):
            print "==== page ", page_number, query
            self.counter["page"] += 1
            result_local = self.zhidao_search(query, page_number,
                                              len(result_all))
            # print json.dumps(result_local, ensure_ascii=False, indent=4, sort_keys=True)
            result_all.extend(result_local)
            self.counter["q_total"] += len(result_local)
            for item in result_local:
                item["query"] = query
                if type(query) != unicode:
                    item["query"] = query.decode("utf-8")
                # print item
                if item["source"] == "recommend" or (item["cnt_like"] >= 3):
                    self.counter["q_good"] += 1
                    qids_select.add(item["question_id"])
                    print item["source"], item["cnt_like"], item["cnt_answer"], item['question'], "<----", item['answers']
                else:
                    print item["source"], item["cnt_like"], item["cnt_answer"], item['question']
        print datetime.datetime.now().isoformat(), self.counter
        return result_all
        # qajson = self.zhidao_results(qids_select)
        # print json.dumps(qajson, ensure_ascii=False, indent=4)

    def run_query_entity(self):
        filename = getTheFile("seed_entity.human.txt")
        with codecs.open(filename) as f:
            for line in f:
                if line.startswith("#"):
                    continue
                line = line.strip()
                if not line:
                    continue
                self.run_query(line, 10)

    def run_query_batch(self, filename, limit):
        with codecs.open(filename) as f:
            for line in f:
                if line.startswith("#"):
                    continue
                line = line.strip()
                if not line:
                    continue
                self.run_query(line, limit)

    def run_gen_url_search_realtime(self, filename):
        lines = libfile.file2list(filename)
        visited = set()
        for line in sorted(lines):
            for query_parser in [0]:
                query_url, qword = zhidao_fetch.get_search_url_qword(
                    line, query_parser=query_parser)
                if query_url in visited:
                    continue
                visited.add(query_url)
                print qword, query_url
        print len(visited)
        filename_output = getLocalFile(
            os.path.basename(filename.replace("human.txt", "_urls.txt")))
        libfile.lines2file(sorted(list(visited)), filename_output)

    def run_test_search_realtime(self, filename, limit):
        results = []
        counter = collections.Counter()
        with codecs.open(filename) as f:
            for line in f:
                if line.startswith("#"):
                    continue
                line = line.strip()
                if not line:
                    continue
                ret = self.run_query(line, limit)
                counter["query"] += 1
                for item in ret:
                    # print json.dumps(item, ensure_ascii=False, indent=4, sort_keys=True)
                    results.append(item)
                    for p in ["source", "result_index"]:
                        counter["{}_{}".format(p, item[p])] += 1
                    for p in ["question", "answers"]:
                        if p in item:
                            if not isinstance(item[p], unicode):
                                item[p] = item[p].decode("gb18030")
        filename_output = getLocalFile(
            os.path.basename(filename.replace("human.txt", "xls")))
        libfile.writeExcel(results, [
            "id", "source", "result_index", "cnt_like", "cnt_answer",
            "query", "question_id", "question", "answers"
        ], filename_output)
        # libfile.writeExcel(results, ["query", "source", "cnt_like", "cnt_answer", "question", "answers"], filename_output)
        print counter

    def run_get_best_search_realtime(self, filename):
        results = []
        counter = collections.Counter()
        lines = libfile.file2list(filename)
        for query_parser in [0]:
            for line in sorted(lines):
                cnt_label = "query_{}".format(query_parser)
                if counter[cnt_label] % 10 == 0:
                    print datetime.datetime.now().isoformat(), counter[cnt_label], line
                counter[cnt_label] += 1
                ret_one = search_zhidao_best(line, query_filter=0,
                                             query_parser=query_parser)
                if ret_one:
                    item = ret_one["best_qapair"]
                    print "=====>", line
                    print "------", item["match_score"], item["question"]
                    print item["answers"], "*******", item["answers_raw"][len(item["answers"]):]
                    for p in ["query"]:
                        item[p] = ret_one[p]
                    # print json.dumps(item, ensure_ascii=False, indent=4, sort_keys=True)
                    results.append(item)
                    for p in ["source", "result_index"]:
                        counter["{}_{}".format(p, item[p])] += 1
                    for p in ["question", "answers"]:
                        if p in item:
                            if not isinstance(item[p], unicode):
                                item[p] = item[p].decode("gb18030")
        filename_output = getLocalFile(
            os.path.basename(filename.replace("human.txt", "xls")))
        libfile.writeExcel(results, [
            "id", "source", "result_index", "cnt_like", "cnt_answer",
            "query", "question_id", "question", "answers"
        ], filename_output)
        # libfile.writeExcel(results, ["query", "source", "cnt_like", "cnt_answer", "question", "answers"], filename_output)
        print counter
def __init__(self, cacheserver):
    self.cache = Cache(BATCH_ID['json'], cacheserver)
    self.downloader = DownloadWrapper(cacheserver,
                                      {'Host': 'zhidao.baidu.com'})
class Scheduler(object):
    def __init__(self, cacheserver):
        self.cache = Cache(BATCH_ID['json'], cacheserver)
        self.downloader = DownloadWrapper(cacheserver,
                                          {'Host': 'zhidao.baidu.com'})

    @classmethod
    def instance(cls, *args):
        if not hasattr(cls, '_instance'):
            setattr(cls, '_instance', cls(*args))
        return cls._instance

    def zhidao_results(self, qids, gap, timeout=10):
        q_jsons = []
        for qid in qids:
            q_json = self.zhidao_question(qid, gap, timeout)
            if q_json is False:
                continue
            q_json['list_answers'] = []
            for rid in q_json['answer_ids'][:3]:
                a_json = self.zhidao_answer(qid, rid, gap, timeout)
                if a_json is False:
                    continue
                q_json['list_answers'].append(a_json)
            q_jsons.append(q_json)
        return q_jsons

    def zhidao_question(self, qid, gap, timeout):
        question_url = 'http://zhidao.baidu.com/question/{}.html'.format(qid)
        ret = self.downloader.downloader_wrapper(question_url,
                                                 BATCH_ID['question'],
                                                 gap,
                                                 timeout=timeout,
                                                 encoding='gb18030',
                                                 error_check=True)
        if ret is False:
            return False
        q_json = generate_question_json(qid, ret)
        if q_json is None:
            return False
        success = self.cache.post(question_url, q_json)
        return q_json

    def zhidao_answer(self, qid, rid, gap, timeout):
        answer_url = ('http://zhidao.baidu.com/question/api/mini?qid={}'
                      '&rid={}&tag=timeliness'.format(qid, rid))
        ret = self.downloader.downloader_wrapper(answer_url,
                                                 BATCH_ID['answer'],
                                                 gap,
                                                 timeout=timeout,
                                                 encoding='gb18030')
        if ret is False:
            return False
        try:
            a_json = generate_answer_json(ret)
        except:
            return False
        success = self.cache.post(answer_url, a_json)
        return a_json

    def zhidao_search(self, qword, batch_id, gap=3, timeout=10, refresh=True):
        quote_word = urllib.quote(qword.encode('utf-8')) if isinstance(
            qword, unicode) else urllib.quote(qword)
        # query_url = 'http://zhidao.baidu.com/index/?word={}'.format(quote_word)  # utf-8
        query_url = 'http://zhidao.baidu.com/search?word={}'.format(quote_word)
        ret = self.downloader.downloader_wrapper(query_url,
                                                 batch_id,
                                                 gap,
                                                 timeout=timeout,
                                                 encoding='gb18030',
                                                 refresh=refresh)
        # resp.headers: 'content-type': 'text/html;charset=UTF-8',
        # resp.content: <meta content="application/xhtml+xml; charset=utf-8" http-equiv="content-type"/>
        if ret is False:
            return False
        return zhidao_search_questions(ret)

    def zhidao_search_list_json(self, qword, batch_id, gap=3, timeout=10,
                                refresh=False):
        quote_word = urllib.quote(qword.encode('utf-8')) if isinstance(
            qword, unicode) else urllib.quote(qword)
        # query_url = 'http://zhidao.baidu.com/index/?word={}'.format(quote_word)  # utf-8
        query_url = 'http://zhidao.baidu.com/search?word={}'.format(quote_word)
        ret = self.downloader.downloader_wrapper(query_url,
                                                 batch_id,
                                                 gap,
                                                 timeout=timeout,
                                                 encoding='gb18030',
                                                 refresh=refresh)
        # resp.headers: 'content-type': 'text/html;charset=UTF-8',
        # resp.content: <meta content="application/xhtml+xml; charset=utf-8" http-equiv="content-type"/>
        if ret is False:
            return False
        search_result_json = parse_search_json_v0615(ret)
        for item in search_result_json:
            item["query"] = qword
            if type(qword) != unicode:
                item["query"] = qword.decode("utf-8")
        return search_result_json

    def zhidao_search_select_best(self, qword, gap=3, timeout=2):
        search_result_json = self.zhidao_search_list_json(
            qword, BATCH_ID['search'], gap, timeout)
        if search_result_json is False:
            return False
        # get the best answer
        for item in search_result_json:
            if item["is_recommend"] == 1:
                return item
        return False

    def zhidao_search_select_best_qids(self, qword, gap=3, timeout=2):
        ret = self.zhidao_search_select_best(qword, gap, timeout)
        if ret:
            return [ret["question_id"]]
        return []

    def run(self, qword, gap=3, timeout=10):
        # qids = self.zhidao_search(qword, BATCH_ID['search'], gap, timeout)
        qids = self.zhidao_search_select_best_qids(qword, gap, timeout)
        return self.zhidao_results(qids, gap, timeout)
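# A minimal sketch of driving the Scheduler singleton above. The cache server
# URL is a placeholder; run() looks up the recommended question for the query
# and returns the question/answer JSON list assembled by zhidao_results().
if __name__ == '__main__':
    import json

    scheduler = Scheduler.instance('http://127.0.0.1:8000/')
    q_jsons = scheduler.run(u'北京有什么好玩的地方', gap=3, timeout=10)
    print json.dumps(q_jsons, ensure_ascii=False, indent=4)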
# -*- coding: utf-8 -*-
from downloader.cache import Cache
import sys
import re
import requests

reload(sys)
sys.setdefaultencoding('utf-8')

BATCH_ID = 'dongfang-201606test'
m = Cache(BATCH_ID, 'http://192.168.1.179:8000/')
url = 'http://data.eastmoney.com/Notice/Noticelist.aspx?type=0&market=all&date=&page=33333'
content = m.get(url)
# print requests.get(url).text
print '更多公告' in content  # check that the "more notices" marker is present

OK = 0
NO = 0
for index in range(1, 2000):
    url = 'http://data.eastmoney.com/Notice/Noticelist.aspx?type=0&market=all&date=&page={}'.format(
        index)
    content = m.get(url)
    if content:
        if '更多公告' in content:
            OK += 1
        else:
            NO += 1
print NO, OK
# Compare, over the given page range, how many cached pages render normally
# versus incompletely. An "incomplete" page means the crawl itself succeeded,
# but the list box is missing the 50 notice entries and their links.