def do_task(self, body):
    download_rsp = DownLoadRsp()
    try:
        tMemory_o = TMemoryBuffer(body)
        tBinaryProtocol_o = TBinaryProtocol(tMemory_o)
        # Deserialize the DownLoadRsp thrift struct from the raw task body.
        download_rsp.read(tBinaryProtocol_o)
        parserpage = self.extractor.extract(download_rsp)
        self.task_collector.task_stats(parserpage)
        self.log.debug(parserpage)
        return self.to_string(parserpage), parserpage.extract_info.topic_id
    except EOFError:
        self.log.warning("can't read DownLoadRsp from string")
        return None

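# self.to_string above is not shown in this section; what follows is a
# minimal sketch of the write path, assuming it simply mirrors the thrift
# read in do_task (the helper name below is illustrative, not the
# project's actual method):
from thrift.transport.TTransport import TMemoryBuffer
from thrift.protocol.TBinaryProtocol import TBinaryProtocol

def to_string_sketch(thrift_obj):
    # Serialize any thrift struct (e.g. a DownLoadRsp or an extract
    # response) into the binary string form that do_task reads back.
    tMemory_b = TMemoryBuffer()
    tBinaryProtocol_b = TBinaryProtocol(tMemory_b)
    thrift_obj.write(tBinaryProtocol_b)
    return tMemory_b.getvalue()
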
def process_response(self, request, response):
    resp = {}
    resp['url'] = str_obj(request.url)
    if hasattr(request, 'src_type') and getattr(request, 'src_type') is not None:
        resp['src_type'] = str_obj(request.src_type)
    resp['download_time'] = int(time.time())
    resp['pages'] = []
    resp['content'] = ''
    resp['info'] = request.info
    resp['scheduler'] = request.scheduler
    resp['parse_extends'] = request.parse_extends
    resp['data_extends'] = request.data_extends
    resp['http_code'] = 0
    resp['elapsed'] = -1
    if response is not None:
        resp['redirect_url'] = str_obj(response.url)
        resp['http_code'] = response.status_code
        # total_seconds() covers responses slower than one second;
        # elapsed.microseconds alone would drop the whole-second part.
        resp['elapsed'] = int(response.elapsed.total_seconds() * 1000)
        resp['content_type'] = response.headers.get('content-type')
        resp['content'] = str_obj(response.content)
        resp['page_size'] = len(resp['content'])
    if hasattr(request, 'contenta'):
        resp['content'] = str_obj(request.contenta)
    if hasattr(request, 'identify_status'):
        resp['status'] = request.identify_status
    else:
        if resp.get('http_code') == 200:
            resp['status'] = CrawlStatus.CRAWL_SUCCESS
        else:
            resp['status'] = CrawlStatus.CRAWL_FAILT
    return DownLoadRsp(**resp)

def prepare_download_rsp(self, url, content_type="text/html"):
    obj = DownLoadRsp()
    obj.url = url
    obj.elapsed = 50
    obj.content_type = content_type
    obj.status = 0
    obj.http_code = 200
    obj.download_time = int(time.time())
    return obj

def get_download_rsp(self, result):
    url = result['url']
    content = result['content'].encode('utf-8')
    content_type = result.get('content_type', 'text/html')
    page_size = len(content)
    return DownLoadRsp(url=url,
                       download_time=int(time.time()),
                       status=0,
                       content_type=content_type,
                       page_size=page_size,
                       elapsed=100,
                       content=content,
                       redirect_url=url,
                       src_type='webpage',
                       http_code=200)

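# A minimal usage sketch for get_download_rsp; the record shape is assumed
# from the field accesses above (every value is illustrative only):
#
#   result = {'url': 'http://example.com/',
#             'content': u'<html></html>',
#             'content_type': 'text/html'}
#   download_rsp = handler.get_download_rsp(result)
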
def select_webpage_by_url(self, url):
    self.log.info("select_webpage_by_url start\turl:{}".format(url))
    url = url_encode(url)
    # Default to a failed (status=1) response; overwritten on a cache hit.
    download_result = DownLoadRsp(url=url,
                                  download_time=int(time.time()),
                                  status=1,
                                  content_type='text/html',
                                  page_size=0,
                                  elapsed=100,
                                  content=None,
                                  redirect_url=url,
                                  src_type='webpage',
                                  http_code=0)
    try:
        query_item = {'url': url}
        domain = get_url_info(url).get('domain')
        result = self.mongo_client_web.find_first(domain, query_item)
        if result and result.get('content'):
            download_result = self.get_download_rsp(result)
    except Exception:
        self.log.error("select_webpage_by_url\turl\t{0}\terror:{1}".format(
            url, traceback.format_exc()))
    self.log.info("select_webpage_by_url finish\turl:{}".format(url))
    return download_result

def download(self, request):
    self.log.info("start_crawl\turl:%s\tmethod:%s\tdownload_type:%s" %
                  (request.url, request.method, request.download_type))
    start = time.time()
    response = DownLoadRsp(status=CrawlStatus.CRAWL_FAILT)
    try:
        if request.retry_times is None:
            retry_times = self.conf.get('default_request_kwargs')['retry_times']
        else:
            retry_times = request.retry_times
        for t in xrange(retry_times):
            # Work on a copy so middleware mutations don't leak across retries.
            request = copy.deepcopy(request)
            self.middleware_manager.process_request(request)
            start_time = time.time()
            res = self.downloader.download(request)
            response = self.middleware_manager.process_response(request, res)
            request.proxytime = (time.time() - start_time) * 1000.0 - response.elapsed
            self.write_log(request, response)
            if response.status == CrawlStatus.CRAWL_SUCCESS:
                break
            time.sleep(3)
    except Exception as e:
        self.log.error('url:' + request.url + '\terror_msg:' +
                       str(traceback.format_exc()))
    finally:
        content_len = -1
        if response.content:
            content_len = len(response.content)
        self.log.info('finish_crawl\tuse_time:' + str(time.time() - start) +
                      '\tlens:' + str(content_len) +
                      '\tstatus:' + str(response.status) +
                      '\turl:' + str(request.url))
    return response

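# The retry count falls back to the service configuration when the request
# does not pin one; a minimal conf sketch, with key names taken from the
# lookup above and the count itself an assumed example value:
#
#   conf = {'default_request_kwargs': {'retry_times': 3}}
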
#_ElementUnicodeResult
import sys
import re
import json

# Scratch url-encoding samples left over from earlier debugging; the early
# exit(1) toggles are commented out so the script runs end to end.
#ss = "L2UnbnJUQ3QjHxZcbDQBTlZFIwlqMXYeTDwoZxplOCMIelU9BxASGF4dBgoyZS5tLSYyPxUDNFB%2BNnBhTwUjEh47KFUQKFJcKwNnOi9BE1MSRkADNi5JckwGIjosagcsYBISIiBrKFZyLSJ6ZAcVZmhnIGRxCydraw4EZxdmFyt7FVIVZFUMADdVBQlzBTgJVgBjWyp4C0swaxcQPzUnaSA7FWhRKxBAfh1aCmUbFgkoJ3EGLX84DTovJmMmEBAAJ2EmQTZUAxkjH01NWAUiUAURElFifjkcKz8ZdVZlBT0WB2Q%3D"
#ss = "%2B"
#exit(1)

html = ""
with open('douban_detail.html') as fp:
    html = fp.read()
print re.findall("语言:</span> (.*?)<br/>", html)
#exit(1)

CHARSET_PATTERN = re.compile(
    '<meta.*?(?:charset|CHARSET)=["\']?([a-zA-Z0-9\\-]+)["\']?.*?>')
#html = CHARSET_PATTERN.sub("", html)

drsp = DownLoadRsp()
drsp.content = html
drsp.status = 0
drsp.content_type = "text/html"
drsp.url = "https://movie.douban.com/tag/"
#drsp.url = "http://www.baidu.com"
drsp.redirect_url = "https://movie.douban.com/tag/"
drsp.parse_extends = json.dumps({"parser_id": -1, "debug": True})

#from i_extractor.extractor import Extractor
from i_extractor import conf
from i_util.i_crawler_services import ThriftExtractor
#extractor = Extractor(conf)
extractor = ThriftExtractor(host="127.0.0.1", port=12300)
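# The scratch script stops before actually calling the extractor; the
# intended next step is presumably along these lines (assumed, not in the
# original source):
#extract_rsp = extractor.extract(drsp)
#print extract_rsp
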
log.warning("cann't write DownLoadRsp to string") return str_page_info def put_beanstalked(beanstalk_conf, log, rsp): beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port']) tube = beanstalk_conf['input_tube'] str_page_info = to_string(log, rsp) try: beanstalk.put(tube, str_page_info) log.info('beanstalk\turl:%s\ttube:%s' % (rsp.url, tube)) except Exception as e: log.info('beanstalk put error url:' + rsp.url + '\ttube:' + tube) if __name__ == "__main__": url = 'dsfsdf' redirect_url = 'sdfdfsfsfs' status = 1 http_code = 200 rsp = DownLoadRsp(url=url, redirect_url=redirect_url, status=status, http_code=http_code) beanstalk_conf = {} beanstalk_conf['host'] = '101.201.102.37' beanstalk_conf['port'] = 11300 beanstalk_conf['input_tube'] = 'download_rsp_test' log = LogHandler('download_test') put_beanstalked(beanstalk_conf, log, rsp=rsp)
def test_parser_config():
    if not request.json:
        return jsonify({'status': 'failed', 'data': 'request error'})
    req_datas = request.json
    download_req = DownLoadReq()
    download_req.method = req_datas.get('request_method', 'get')
    download_req.url = req_datas.get('request_url')
    download_req.download_type = req_datas.get('download_type')
    download_req.post_data = {}
    download_req.http_header = {}
    try:
        download_req.http_header = json.loads(req_datas.get('headers'))
    except Exception:
        download_req.http_header = None
    post_data = None
    try:
        post_data = json.loads(req_datas.get('request_params'))
    except Exception:
        pass
    parser_id = req_datas.get('parser_id', "-1")
    page_source = req_datas.get('page_source').strip()
    if page_source not in ['cache', 'downloader', 'pagedb', 'input']:
        page_source = 'cache'
    hz_url = download_req.url
    if post_data and download_req.method == "post":
        hz_url = build_hzpost_url(download_req.url, post_data)
        download_req.url = hz_url
    spend_time = {}
    try:
        page_id = get_md5_i64(hz_url)
        download_rsp = None
        stime = time.time()
        if page_source == 'pagedb':
            download_rsp = current_app.config['crawler_merge'].select_one(hz_url)
            if download_rsp.status == 1:
                download_rsp = None
        elif page_source == 'cache':
            download_rsp = get_page_cache(page_id)
        elif page_source == 'input':
            download_rsp = DownLoadRsp()
            download_rsp.url = hz_url
            download_rsp.status = 0
            download_rsp.content_type = "text"
            download_rsp.http_code = 200
            download_rsp.download_time = 0
            download_rsp.content = req_datas.get('input_page', "").encode('utf8')
            download_rsp.src_type = "input"
            download_rsp.elapsed = 50
        if not download_rsp:
            downloader = current_app.config['downloader']
            download_rsp = downloader.download(hz_url, download_req)
            download_rsp.url = hz_url
        spend_time['download_spend'] = (time.time() - stime) * 1000
        set_page_cache(page_id, download_rsp)
        is_save = req_datas.get('is_save', 'false')
        if is_save == "true":
            download_rsp.parse_extends = json.dumps({'parser_id': parser_id})
            download_rsp_tube = current_app.config[
                'put_beanstald_server'].get_tube_by_name('download_rsp_tube')
            if download_rsp_tube:
                current_app.config['put_beanstald_server'].save_record({
                    'tube_name': download_rsp_tube,
                    'obj': download_rsp
                })
        # Copy download_rsp so concurrent threads don't see the debug flags.
        download_rsp = deepcopy(download_rsp)
        download_rsp.parse_extends = json.dumps({
            "parser_id": parser_id,
            "debug": True
        })
        extractor = current_app.config['extractor']
        stime = time.time()
        extract_rsp = extractor.extract(download_rsp)
        spend_time['extract_spend'] = (time.time() - stime) * 1000
        # Entity extraction result list.
        entity_datas = None
        # Schema check result.
        schema_check_result = None
        entity_rsps = None
        cur_datetime = str(datetime.datetime.now())
        try:
            stime = time.time()
            extract_data = extract_rsp.extract_info.extract_data
            if extract_data:
                extract_data_dict = json.loads(extract_data)
                _src = {
                    "url": extract_rsp.base_info.url,
                    "site_id": extract_rsp.base_info.site_id,
                    "site": extract_rsp.base_info.site
                }
                if "datas" in extract_data_dict:
                    datas = extract_data_dict['datas']
                    tmp_datas = []
                    for d in datas:
                        d['_src'] = [_src]
                        tmp_datas.append(d)
                    extract_data_dict['datas'] = tmp_datas
                else:
                    extract_data_dict['_src'] = [_src]
                extract_rsp.extract_info.extract_data = json.dumps(extract_data_dict)
            entity_rsps = current_app.config['entity_extractor'].entity_extract(extract_rsp)
            spend_time['entity_spend'] = (time.time() - stime) * 1000
            entity_datas = []
            for data in entity_rsps.entity_data_list:
                if data:
                    entity_datas.append(json.loads(data.entity_data))
                else:
                    entity_datas.append(None)
        except Exception as e:
            if entity_rsps:
                entity_datas = {
                    'sys_error': e.message,
                    'error_message': entity_rsps.msg
                }
            else:
                entity_datas = {'sys_error': e.message}
        final_data = {}
        try:
            if entity_rsps.entity_data_list:
                entity_json = {
                    "topic_id": entity_rsps.entity_data_list[0].topic_id,
                    "data": json.loads(entity_rsps.entity_data_list[0].entity_data)
                }
                datasaver_resp = current_app.config['data_saver'].check_data(
                    json.dumps(entity_json))
                final_data = json.loads(datasaver_resp.data)
        except Exception as e:
            final_data = {'sys_error': e.message}
        return jsonify({
            'status': True,
            'data': build_test_parser_config_rsp(extract_rsp, entity_datas,
                                                 final_data, spend_time)
        })
    except Exception as e:
        current_app.config['logger'].error(hz_url)
        current_app.config['logger'].info(traceback.format_exc())
        return jsonify({'status': False, 'data': e.message})

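# A sample JSON payload for this endpoint, inferred from the fields read
# above (every value below is illustrative):
#
#   {
#       "request_method": "get",
#       "request_url": "http://example.com/list",
#       "download_type": "simple",
#       "headers": "{\"User-Agent\": \"Mozilla/5.0\"}",
#       "request_params": "{}",
#       "parser_id": "-1",
#       "page_source": "downloader",
#       "input_page": "",
#       "is_save": "false"
#   }
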
def do_task(self, body):
    try:
        download_req = json.loads(body)
        self.log.info("request_msg\t%s" % download_req)
        target_extractor_id = self.type_extractor_map[download_req['_type']]
        name = download_req['name'].encode('utf-8')
        target_url = 'http://%(site)s/gongshang_search?%(query)s' % {
            'site': prov_site_map[download_req['province']],
            'query': urllib.urlencode({
                'name': name,
                'original_query': json.dumps(download_req)
            })
        }
        self.log.info('requesting proxy for company name: name = {name}'.format(name=name))
        response = requests.get(target_url, proxies={'http': self.smart_proxy_url})
        if response.status_code != 200:
            download_rsp = DownLoadRsp(status=CrawlStatus.CRAWL_FAILT)
            return self.to_string(download_rsp)
        self.log.debug(response.text)
        resp_json = response.json()
        url = resp_json['url']
        # Assemble the DownloadRsp.
        resp = dict()
        resp['url'] = str_obj(url)
        resp['download_time'] = resp_json.get('entitySrcDownloadTime', 0)
        resp['pages'] = []
        resp['content'] = str_obj(resp_json['html'])
        if resp['content'] is None:
            resp['content'] = '<html></html>'
        resp['data_extends'] = str_obj(json.dumps(resp_json['entity']))
        resp['parse_extends'] = str_obj(json.dumps({"parser_id": target_extractor_id}))
        resp['page_size'] = len(resp['content'])
        resp['content_type'] = 'text/html'
        resp['src_type'] = 'webpage'
        # resp['info'] = request.info
        # resp['scheduler'] = request.scheduler
        # resp['parse_extends'] = request.parse_extends
        resp['http_code'] = response.status_code
        # total_seconds() keeps responses slower than one second accurate.
        resp['elapsed'] = int(response.elapsed.total_seconds() * 1000)
        resp['status'] = CrawlStatus.CRAWL_SUCCESS
        download_rsp = DownLoadRsp(**resp)
        self.log.info('sending to parser: name = {name} url = {url}'.format(
            name=name, url=resp['url']))
        # self.log.info(download_rsp)
        # Hand the result back to the gongshang scheduler.
        company_name = resp_json['entity'].get('company')
        self.out_beanstalk.put(self.output_tube_scheduler, json.dumps({
            'company': company_name,
            'crawl_online': resp_json['crawlStatus'].get('crawl_online'),
            'crawl_online_time': resp_json['crawlStatus'].get('crawl_online_time'),
            'query': resp_json['crawlSeed'],
        }))
        self.log.info('sent company name to gongshang scheduler queue: company = {company}'.format(
            company=company_name.encode('utf-8')))
        return self.to_string(download_rsp)
    except Exception as err:
        self.log.error("process failed, err[%s]" % (repr(err)))
        self.log.exception(err)
        download_rsp = DownLoadRsp(status=CrawlStatus.CRAWL_FAILT)
        return self.to_string(download_rsp)