def download_url(url):
    """Fetch *url* through the caching download service and return the body.

    The page is retrieved via DownloadWrapper, which proxies through the
    LAN cache server so repeated fetches of the same URL can be served
    from cache.  Returns the page content decoded as gb2312, or whatever
    falsy value downloader_wrapper yields on failure.
    """
    cacheserver = 'http://192.168.1.179:8000/'
    batch_id = 'dongfangcaifu-201606'
    # The original contained a no-op ``url = url`` self-assignment and a
    # commented-out hard-coded sample URL; both removed.
    m = DownloadWrapper(cacheserver)
    content = m.downloader_wrapper(url, batch_id, 0.5, encoding='gb2312')
    return content
def process_request(self, request, spider):
    """Scrapy downloader-middleware hook.

    Fetches the requested URL through the shared DownloadWrapper cache
    service instead of letting Scrapy download it directly.  When the
    wrapper yields content, a ready-made HtmlResponse is returned so the
    normal download is short-circuited; otherwise None is returned and
    Scrapy falls back to its default handling.
    """
    target = request.url
    fetcher = DownloadWrapper(SERVER)
    body = fetcher.downloader_wrapper(target, BATCH_ID, 2)
    if not body:
        return None
    return scrapy.http.response.html.HtmlResponse(
        target, encoding='utf-8', body=body)
class ZhidaoPrefetch(object):
    """Batch crawler for Baidu Zhidao (zhidao.baidu.com).

    Searches Zhidao for query words, parses the search-result pages, and
    fetches individual question/answer JSON through a caching download
    service.  All knobs come from a config dict supplied at construction.
    """

    def __init__(self, config):
        # Expected config keys (from visible usage): "batch_ids"
        # (json/question/answer/search), "cache_server", "http_headers",
        # "crawler" (gap, timeout, encoding) and optionally "debug".
        print config, "-----"
        self.config = config
        self.counter = collections.Counter()  # crawl statistics
        self.cache = Cache(self.config["batch_ids"]["json"],
                           self.config["cache_server"])
        self.downloader = DownloadWrapper(self.config["cache_server"],
                                          self.config["http_headers"])

    def is_debug(self):
        # Debug flag defaults to False when absent from the config.
        return self.config.get("debug", False)

    def zhidao_results(self, qids):
        """Fetch question JSON plus up to 3 answers for each qid.

        Questions or answers that fail to download/parse are skipped.
        """
        q_jsons = []
        for qid in qids:
            q_json = self.zhidao_question(qid)
            if q_json is False:
                continue
            q_json["list_answers"] = []
            # Only the first three answers of each question are fetched.
            for rid in q_json["answer_ids"][:3]:
                a_json = self.zhidao_answer(qid, rid)
                if a_json is False:
                    continue
                q_json["list_answers"].append(a_json)
            q_jsons.append(q_json)
        return q_jsons

    def zhidao_question(self, qid):
        """Download and parse one question page; False on any failure.

        The parsed JSON is also posted (best-effort) to the cache server.
        """
        question_url = "http://zhidao.baidu.com/question/{}.html".format(qid)
        if self.is_debug():
            print question_url
        ret = self.downloader.downloader_wrapper(
            question_url,
            self.config["batch_ids"]["question"],
            self.config["crawler"]["gap"],
            timeout=self.config["crawler"]["timeout"],
            encoding=self.config["crawler"]["encoding"])
        if ret is False:
            return False
        q_json = generate_question_json(qid, ret)
        if q_json is None or q_json == {}:
            return False
        # NOTE(review): the post result is ignored; a cache failure is
        # silently tolerated.
        success = self.cache.post(question_url, q_json)
        return q_json

    def zhidao_answer(self, qid, rid):
        """Download one answer via the mini API; False on any failure."""
        answer_url = ("http://zhidao.baidu.com/question/api/mini?qid={}"
                      "&rid={}&tag=timeliness".format(qid, rid))
        #print self.config["crawler"]
        if self.is_debug():
            print answer_url
        ret = self.downloader.downloader_wrapper(
            answer_url,
            self.config["batch_ids"]["answer"],
            self.config["crawler"]["gap"],
            timeout=self.config["crawler"]["timeout"],
            encoding=self.config["crawler"]["encoding"])
        if ret is False:
            return False
        try:
            a_json = generate_answer_json(ret)
        except:
            # NOTE(review): bare except left as-is; any parse error is
            # treated as "no answer".
            return False
        success = self.cache.post(answer_url, a_json)
        return a_json

    def zhidao_search(self, query, page_number=None, start_result_index=0):
        """Run one Zhidao search page and parse its result list.

        Returns the parsed result items, or False when the download fails.
        """
        if isinstance(query, unicode):
            query = query.encode("utf-8")
        if page_number is None or page_number == 0:
            query_url = "http://zhidao.baidu.com/search/?word={}".format(
                urllib.quote(query))
        else:
            # Zhidao paginates with pn = page * 10.
            query_url = "http://zhidao.baidu.com/search/?pn={}&word={}".format(
                page_number * 10, urllib.quote(query))
        if self.is_debug():
            print query_url
        # query_url = "http://zhidao.baidu.com/search?word={}".format(quote_word)
        #print query
        #print query_url
        ret = self.downloader.downloader_wrapper(
            query_url,
            self.config["batch_ids"]["search"],
            self.config["crawler"]["gap"],
            timeout=self.config["crawler"]["timeout"],
            encoding=self.config["crawler"]["encoding"],
            refresh=False)
        # resp.headers: "content-type": "text/html;charset=UTF-8",
        # resp.content: <meta content="application/xhtml+xml; charset=utf-8" http-equiv="content-type"/>
        if ret is False:
            return False
        else:
            return parse_search_json_v0615(
                ret, start_result_index=start_result_index)

    def run_query(self, query, max_page):
        """Search up to *max_page* pages for *query*; return all items.

        Also tallies counters and collects "good" question ids
        (recommended results, or results with >= 3 likes).
        """
        self.counter["query"] += 1
        qids_select = set()
        result_all = []
        for page_number in range(max_page):
            print "==== page ", page_number, query
            self.counter["page"] += 1
            result_local = self.zhidao_search(query, page_number,
                                              len(result_all))
            #print json.dumps( result_local, ensure_ascii=False, indent=4, sort_keys=True)
            # NOTE(review): zhidao_search may return False on failure,
            # which would make the extend() below raise TypeError.
            result_all.extend(result_local)
            self.counter["q_total"] += len(result_local)
            for item in result_local:
                # Attach the query as unicode to each result item.
                item["query"] = query
                if type(query) != unicode:
                    item["query"] = query.decode("utf-8")
                #print item
                if item["source"] == "recommend" or (item["cnt_like"] >= 3):
                    self.counter["q_good"] += 1
                    qids_select.add(item["question_id"])
                    print item["source"], item["cnt_like"], item[
                        "cnt_answer"], item['question'], "<----", item[
                            'answers']
                else:
                    print item["source"], item["cnt_like"], item[
                        "cnt_answer"], item['question']
        print datetime.datetime.now().isoformat(), self.counter
        return result_all
        #qajson = self.zhidao_results(qids_select)
        #print json.dumps(qajson, ensure_ascii=False, indent=4)

    def run_query_entity(self):
        """Run 10-page queries for every seed entity in the bundled file."""
        filename = getTheFile("seed_entity.human.txt")
        with codecs.open(filename) as f:
            for line in f:
                if line.startswith("#"):
                    continue
                line = line.strip()
                if not line:
                    continue
                self.run_query(line, 10)

    def run_query_batch(self, filename, limit):
        """Run *limit*-page queries for every non-comment line of *filename*."""
        with codecs.open(filename) as f:
            for line in f:
                if line.startswith("#"):
                    continue
                line = line.strip()
                if not line:
                    continue
                self.run_query(line, limit)

    def run_gen_url_search_realtime(self, filename):
        """Generate the deduplicated search-URL list for a seed file."""
        lines = libfile.file2list(filename)
        visited = set()
        for line in sorted(lines):
            for query_parser in [0]:
                query_url, qword = zhidao_fetch.get_search_url_qword(
                    line, query_parser=query_parser)
                if query_url in visited:
                    continue
                visited.add(query_url)
                print qword, query_url
        print len(visited)
        filename_output = getLocalFile(
            os.path.basename(filename.replace("human.txt", "_urls.txt")))
        libfile.lines2file(sorted(list(visited)), filename_output)

    def run_test_search_realtime(self, filename, limit):
        """Query every line of *filename* and dump all results to Excel."""
        results = []
        counter = collections.Counter()
        with codecs.open(filename) as f:
            for line in f:
                if line.startswith("#"):
                    continue
                line = line.strip()
                if not line:
                    continue
                ret = self.run_query(line, limit)
                counter["query"] += 1
                for item in ret:
                    #print json.dumps(item, ensure_ascii=False, indent=4, sort_keys=True)
                    results.append(item)
                    for p in ["source", "result_index"]:
                        counter["{}_{}".format(p, item[p])] += 1
                    # Ensure text fields are unicode before Excel export.
                    for p in ["question", "answers"]:
                        if p in item:
                            if not isinstance(item[p], unicode):
                                item[p] = item[p].decode("gb18030")
        filename_output = getLocalFile(
            os.path.basename(filename.replace("human.txt", "xls")))
        libfile.writeExcel(results, [
            "id", "source", "result_index", "cnt_like", "cnt_answer",
            "query", "question_id", "question", "answers"
        ], filename_output)
        #libfile.writeExcel(results, ["query", "source", "cnt_like", "cnt_answer", "question", "answers"], filename_output)
        print counter

    def run_get_best_search_realtime(self, filename):
        """Pick the single best Q/A per query line and dump them to Excel."""
        results = []
        counter = collections.Counter()
        lines = libfile.file2list(filename)
        for query_parser in [0]:
            for line in sorted(lines):
                cnt_label = "query_{}".format(query_parser)
                # Progress line every 10 queries.
                if counter[cnt_label] % 10 == 0:
                    print datetime.datetime.now().isoformat(
                    ), counter[cnt_label], line
                counter[cnt_label] += 1
                ret_one = search_zhidao_best(line, query_filter=0,
                                             query_parser=query_parser)
                if ret_one:
                    item = ret_one["best_qapair"]
                    print "=====>", line
                    print "------", item["match_score"], item["question"]
                    print item["answers"], "*******", item["answers_raw"][
                        len(item["answers"]):]
                    for p in ["query"]:
                        item[p] = ret_one[p]
                    #print json.dumps(item, ensure_ascii=False, indent=4, sort_keys=True)
                    results.append(item)
                    for p in ["source", "result_index"]:
                        counter["{}_{}".format(p, item[p])] += 1
                    # Ensure text fields are unicode before Excel export.
                    for p in ["question", "answers"]:
                        if p in item:
                            if not isinstance(item[p], unicode):
                                item[p] = item[p].decode("gb18030")
        filename_output = getLocalFile(
            os.path.basename(filename.replace("human.txt", "xls")))
        libfile.writeExcel(results, [
            "id", "source", "result_index", "cnt_like", "cnt_answer",
            "query", "question_id", "question", "answers"
        ], filename_output)
        #libfile.writeExcel(results, ["query", "source", "cnt_like", "cnt_answer", "question", "answers"], filename_output)
        print counter
class Scheduler(object):
    """Single-shot crawl pipeline for Baidu Zhidao.

    Searches Zhidao for a query word, picks the best (recommended) hit,
    then fetches its question page and up to three answers through the
    caching downloader, posting parsed JSON back to the cache server.
    """

    def __init__(self, cacheserver):
        self.cache = Cache(BATCH_ID['json'], cacheserver)
        self.downloader = DownloadWrapper(cacheserver,
                                          {'Host': 'zhidao.baidu.com'})

    @classmethod
    def instance(cls, *args):
        # Lazy per-class singleton; *args are only used on first creation.
        # NOTE(review): not thread-safe (check-then-set race).
        if not hasattr(cls, '_instance'):
            setattr(cls, '_instance', cls(*args))
        return cls._instance

    def zhidao_results(self, qids, gap, timeout=10):
        """Fetch question JSON plus up to 3 answers for each qid.

        Questions or answers that fail to download/parse are skipped.
        """
        q_jsons = []
        for qid in qids:
            q_json = self.zhidao_question(qid, gap, timeout)
            if q_json is False:
                continue
            q_json['list_answers'] = []
            # Only the first three answers of each question are fetched.
            for rid in q_json['answer_ids'][:3]:
                a_json = self.zhidao_answer(qid, rid, gap, timeout)
                if a_json is False:
                    continue
                q_json['list_answers'].append(a_json)
            q_jsons.append(q_json)
        return q_jsons

    def zhidao_question(self, qid, gap, timeout):
        """Download and parse one question page; False on any failure."""
        question_url = 'http://zhidao.baidu.com/question/{}.html'.format(qid)
        ret = self.downloader.downloader_wrapper(question_url,
                                                 BATCH_ID['question'],
                                                 gap,
                                                 timeout=timeout,
                                                 encoding='gb18030',
                                                 error_check=True)
        if ret is False:
            return False
        q_json = generate_question_json(qid, ret)
        if q_json is None:
            return False
        # NOTE(review): the post result is ignored; a cache failure is
        # silently tolerated.
        success = self.cache.post(question_url, q_json)
        return q_json

    def zhidao_answer(self, qid, rid, gap, timeout):
        """Download one answer via the mini API; False on any failure."""
        answer_url = ('http://zhidao.baidu.com/question/api/mini?qid={}'
                      '&rid={}&tag=timeliness'.format(qid, rid))
        ret = self.downloader.downloader_wrapper(answer_url,
                                                 BATCH_ID['answer'],
                                                 gap,
                                                 timeout=timeout,
                                                 encoding='gb18030')
        if ret is False:
            return False
        try:
            a_json = generate_answer_json(ret)
        except:
            # NOTE(review): bare except left as-is; any parse error is
            # treated as "no answer".
            return False
        success = self.cache.post(answer_url, a_json)
        return a_json

    def zhidao_search(self, qword, batch_id, gap=3, timeout=10, refresh=True):
        """Search Zhidao for *qword*; return parsed question links or False."""
        quote_word = urllib.quote(qword.encode('utf-8')) if isinstance(
            qword, unicode) else urllib.quote(qword)
        # query_url = 'http://zhidao.baidu.com/index/?word={}'.format(quote_word) # utf-8
        query_url = 'http://zhidao.baidu.com/search?word={}'.format(quote_word)
        ret = self.downloader.downloader_wrapper(query_url,
                                                 batch_id,
                                                 gap,
                                                 timeout=timeout,
                                                 encoding='gb18030',
                                                 refresh=refresh)
        # resp.headers: 'content-type': 'text/html;charset=UTF-8',
        # resp.content: <meta content="application/xhtml+xml; charset=utf-8" http-equiv="content-type"/>
        if ret is False:
            return False
        return zhidao_search_questions(ret)

    def zhidao_search_list_json(self, qword, batch_id, gap=3, timeout=10,
                                refresh=False):
        """Search Zhidao and return the parsed result-item list, or False."""
        quote_word = urllib.quote(qword.encode('utf-8')) if isinstance(
            qword, unicode) else urllib.quote(qword)
        # query_url = 'http://zhidao.baidu.com/index/?word={}'.format(quote_word) # utf-8
        query_url = 'http://zhidao.baidu.com/search?word={}'.format(quote_word)
        ret = self.downloader.downloader_wrapper(query_url,
                                                 batch_id,
                                                 gap,
                                                 timeout=timeout,
                                                 encoding='gb18030',
                                                 refresh=refresh)
        # resp.headers: 'content-type': 'text/html;charset=UTF-8',
        # resp.content: <meta content="application/xhtml+xml; charset=utf-8" http-equiv="content-type"/>
        if ret is False:
            return False
        search_result_json = parse_search_json_v0615(ret)
        # Attach the (unicode) query word to every result item.
        for item in search_result_json:
            item["query"] = qword
            if type(qword) != unicode:
                item["query"] = qword.decode("utf-8")
        return search_result_json

    def zhidao_search_select_best(self, qword, gap=3, timeout=2):
        """Return the first recommended result item, or False when none.

        NOTE(review): zhidao_search_list_json may return False on download
        failure; iterating it here would raise TypeError.
        """
        search_result_json = self.zhidao_search_list_json(
            qword, BATCH_ID['search'], gap, timeout)
        # get the best answer
        for item in search_result_json:
            if item["is_recommend"] == 1:
                return item
        return False

    def zhidao_search_select_best_qids(self, qword, gap=3, timeout=2):
        """Return [question_id] of the best search hit, or [] when none."""
        ret = self.zhidao_search_select_best(qword, gap, timeout)
        if ret:
            return [ret["question_id"]]
        return []

    def run(self, qword, gap=3, timeout=10):
        """End-to-end: best search hit -> question + answers JSON."""
        # qids = self.zhidao_search(qword, BATCH_ID['search'], gap, timeout)
        qids = self.zhidao_search_select_best_qids(qword, gap, timeout)
        return self.zhidao_results(qids, gap, timeout)
class FudanAttr(object):
    """Client for the Fudan CN-DBpedia HTTP API.

    Looks up candidate entities for a mention and attribute/value pairs
    for an entity, routing all HTTP traffic through the LAN caching
    download service.  Also provides small helpers to tally attributes,
    export triples to Excel, and sample seed words from a file.
    """

    # Cache batch ids used by the download wrapper for the two endpoints.
    batch_id = {
        'entity': 'fudankg-entity-20160623',
        'avp': 'fudankg-avp-20160623',
    }

    def __init__(self):
        self.downloader = DownloadWrapper('http://192.168.1.179:8000')
        self.attr_counter = Counter()  # attribute -> occurrence count

    def fudan_entities(self, word):
        """Return the list of candidate entities for mention *word*."""
        if isinstance(word, unicode):
            word = word.encode('utf-8')
        entities_api = 'http://kw.fudan.edu.cn/cndbpedia/api/entity?mention={}'
        content = self.downloader.downloader_wrapper(
            entities_api.format(urllib.quote(word)),
            self.batch_id['entity'],
            gap=0,
            encoding='utf-8',
        )
        return json.loads(content)[u'entity']

    def fudan_attrvalue(self, entity):
        """Return the attribute/value pairs for *entity*.

        NOTE(review): ``.values()[0]`` relies on Python 2 dict.values()
        returning a list; under Python 3 this would need ``list(...)``.
        """
        if isinstance(entity, unicode):
            entity = entity.encode('utf-8')
        avpair_api = 'http://kw.fudan.edu.cn/cndbpedia/api/entityAVP?entity={}'
        content = self.downloader.downloader_wrapper(
            avpair_api.format(urllib.quote(entity)),
            self.batch_id['avp'],
            gap=0,
            encoding='utf-8',
        )
        return json.loads(content).values()[0]

    def fudan_attr_count(self, result):
        """[ (word, entity, [(attr, value), (attr, value), ...]) ]

        Tally every attribute occurrence into self.attr_counter and print
        the counts sorted descending.
        """
        for word, entity, avps in result:
            for a, v in avps:
                self.attr_counter[a] += 1
        count = sorted(self.attr_counter.items(), key=itemgetter(1),
                       reverse=True)
        print( json.dumps(count, ensure_ascii=False, indent=4) )

    def fudan_gen_excel(self, result):
        """Flatten (word, entity, avps) triples into fudan_eav.xlsx rows."""
        items = []
        keys = ['word', 'entity', 'attribute', 'value']
        filename = 'fudan_eav.xlsx'
        for word, entity, avps in result:
            for a, v in avps:
                items.append({'word': word.decode('utf-8'),
                              'entity': entity,
                              'attribute': a,
                              'value': v})
        writeExcel(items, keys, filename)

    def prepare_entities(self, entities_fname='entities_0623.txt'):
        """Read seed entities from a file, sample 1000, persist the sample."""
        words = []
        with open(entities_fname) as fd:
            for line in fd:
                line = line.strip()
                if line == '':
                    continue
                words.append(line)
        picked_words = self.pick_some_words(words, 1000)
        self.save_picked_words(picked_words)

    def pick_some_words(self, words, num=1000):
        """Sample *num* words uniformly (with replacement) from *words*.

        Bug fix: the original indexed ``words[random.randint(1, len(words))]``,
        which can raise IndexError (randint's upper bound is inclusive, so
        the invalid index len(words) could be drawn) and could never pick
        index 0.  random.choice draws a valid index every time.
        """
        return [random.choice(words) for _ in range(num)]

    def save_picked_words(self, words):
        """Write de-duplicated *words* plus three fixed probe words to disk.

        Bug fix: no longer mutates the caller's list (the original
        ``words.extend(...)`` modified the argument in place).
        """
        all_words = list(words) + ['长江', '熊二', '杨绛先生']
        with open('picked_thousand_words.txt', 'w') as fd:
            fd.write('\n'.join( list(set(all_words)) ))