import json
import time

# DownLoadReq / SessionCommit are the project's Thrift-generated request
# types; index_queue is the module-level priority queue the crawler consumes.
def construct_downloader_req(urls):
    for i, url in enumerate(urls, 1):
        print i
        # print url
        download_req = DownLoadReq()
        download_req.method = 'get'
        download_req.url = url['url']
        download_req.http_header = {}
        download_req.session_commit = SessionCommit()
        download_req.session_commit.refer_url = ""
        download_req.session_commit.identifying_code_url = ""
        download_req.session_commit.identifying_code_check_url = ""
        download_req.session_commit.check_body = ""
        download_req.session_commit.check_body_not = ""
        download_req.session_commit.session_msg = {}
        download_req.session_commit.need_identifying = False
        scheduler_info = {}
        download_req.scheduler = json.dumps(scheduler_info)
        download_req.use_proxy = False
        download_req.src_type = "seed"
        if download_req.url is not None:
            download_req.download_type = 'simple'
            priority_key = str(time.time())
            index_queue.put((priority_key, download_req))
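# Usage sketch for construct_downloader_req. Assumptions: index_queue is a
# module-level Queue.PriorityQueue (the (priority_key, req) tuples then come
# out roughly in submission order, since priority_key is str(time.time())),
# and each seed dict only needs the 'url' key the function reads.
if __name__ == '__main__':
    seeds = [
        {'url': 'http://wsgs.fjaic.gov.cn/creditpub/home'},
        {'url': 'http://data.eastmoney.com/Notice_n/Noticelist.aspx?market=hk&code=01224'},
    ]
    construct_downloader_req(seeds)
    print index_queue.qsize()  # expect 2: one queued request per seed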
def download(self, url, req=None):
    rsp = None
    self.transport.open()
    try:
        if req is None:
            req = DownLoadReq()
        if url is not None:
            req.url = url
        rsp = self.client.download(req)
    finally:
        self.transport.close()
    return rsp
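# Wiring sketch for the wrapper above, following the standard Thrift client
# pattern. The generated service module name (DownloaderService) is an
# assumption; substitute whatever the project's IDL actually generates.
from thrift.transport import TSocket, TTransport
from thrift.protocol import TBinaryProtocol
# from downloader_service import DownloaderService  # assumed generated module

def make_downloader(host, port):
    socket = TSocket.TSocket(host, port)                 # raw TCP endpoint
    transport = TTransport.TBufferedTransport(socket)    # buffered framing
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = DownloaderService.Client(protocol)          # generated stub (assumed name)
    return client, transport

# Because the wrapper opens and closes self.transport around every call, each
# download() invocation is self-contained once the client is constructed.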
def create_download_req(url, method='simple', parser_id="-1", http_method="get"):
    download_req = DownLoadReq()
    download_req.url = url
    download_req.post_data = {}
    download_req.src_type = 'linkbase'
    download_req.download_type = method
    download_req.parse_extends = json.dumps({"parser_id": parser_id})
    download_req.method = http_method
    scheduler_info = {}
    scheduler_info["schedule_time"] = time.time()
    download_req.scheduler = json.dumps(scheduler_info)
    return download_req
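# Example: build a linkbase request and inspect the scheduler payload the
# function serializes. The parser_id value '20' is a placeholder.
req = create_download_req('http://data.eastmoney.com/Notice_n/Noticelist.aspx',
                          method='simple', parser_id='20')
print req.download_type          # 'simple'
print json.loads(req.scheduler)  # {'schedule_time': <epoch seconds>}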
# host, http_header, client and transport are defined earlier in the script
# (not shown in this fragment).
port = 8088
user = '******'
password = '******'
proxy = Proxy(host=host, port=port, user=user, password=password)
kw = {
    'refer_url': 'http://wsgs.fjaic.gov.cn/creditpub/home',
    'session_msg': {'session.token': 'session.token'},
}
url = 'http://wsgs.fjaic.gov.cn/creditpub/search/ent_info_list'
post_data = {'searchType': '1', 'captcha': ''}
# session_commit = SessionCommit(**kw)
req = DownLoadReq(url=url, method='get', download_type='simple')
# req.proxy = proxy
req.priority = 0
req.time_out = 30
req.http_header = http_header
req.retry_times = 1
req.post_data = post_data
# req.session_commit = session_commit
try:
    # Page through the Hong Kong notice list on eastmoney.com.
    for i in range(1, 199):
        req.url = ('http://data.eastmoney.com/Notice_n/Noticelist.aspx'
                   '?type=&market=hk&code=01224&date=&page=%s' % str(i))
        res = client.download(req)
        time.sleep(2)
        print res
    transport.close()
    # iutl: changed the log; config: changed the MySQL password; phantomjs: changed the log.
except Thrift.TException, tx:
    print '%s' % (tx.message)
import datetime
import json
import time
import traceback
from copy import deepcopy

from flask import current_app, jsonify, request

# Flask debug view (the route decorator is not shown in this fragment).
# DownLoadReq / DownLoadRsp, build_hzpost_url, get_md5_i64, get_page_cache /
# set_page_cache and build_test_parser_config_rsp are project helpers.
def test_parser_config():
    if not request.json:
        return jsonify({'status': 'failed', 'data': 'request error'})
    req_datas = request.json
    download_req = DownLoadReq()
    download_req.method = req_datas.get('request_method', 'get')
    download_req.url = req_datas.get('request_url')
    download_req.download_type = req_datas.get('download_type')
    download_req.post_data = {}
    download_req.http_header = {}
    try:
        download_req.http_header = json.loads(req_datas.get('headers'))
    except Exception as e:
        download_req.http_header = None
    post_data = None
    try:
        post_data = json.loads(req_datas.get('request_params'))
    except Exception as e:
        pass
    parser_id = req_datas.get('parser_id', "-1")
    page_source = req_datas.get('page_source').strip()
    if page_source not in ['cache', 'downloader', 'pagedb', 'input']:
        page_source = 'cache'
    hz_url = download_req.url
    if post_data and download_req.method == "post":
        hz_url = build_hzpost_url(download_req.url, post_data)
        download_req.url = hz_url
    spend_time = {}
    try:
        page_id = get_md5_i64(hz_url)
        download_rsp = None
        stime = time.time()
        if page_source == 'pagedb':
            download_rsp = current_app.config['crawler_merge'].select_one(hz_url)
            if download_rsp.status == 1:
                download_rsp = None
        elif page_source == 'cache':
            download_rsp = get_page_cache(page_id)
        elif page_source == 'input':
            download_rsp = DownLoadRsp()
            download_rsp.url = hz_url
            download_rsp.status = 0
            download_rsp.content_type = "text"
            download_rsp.http_code = 200
            download_rsp.download_time = 0
            download_rsp.content = req_datas.get('input_page', "").encode('utf8')
            download_rsp.src_type = "input"
            download_rsp.elapsed = 50
        if not download_rsp:
            downloader = current_app.config['downloader']
            download_rsp = downloader.download(hz_url, download_req)
            download_rsp.url = hz_url
            spend_time['download_spend'] = (time.time() - stime) * 1000
            set_page_cache(page_id, download_rsp)
        is_save = req_datas.get('is_save', 'false')
        if is_save == "true":
            download_rsp.parse_extends = json.dumps({'parser_id': parser_id})
            download_rsp_tube = current_app.config[
                'put_beanstald_server'].get_tube_by_name('download_rsp_tube')
            if download_rsp_tube:
                current_app.config['put_beanstald_server'].save_record({
                    'tube_name': download_rsp_tube,
                    'obj': download_rsp
                })
        # Copy download_rsp to guard against concurrent modification by other threads.
        download_rsp = deepcopy(download_rsp)
        download_rsp.parse_extends = json.dumps({
            "parser_id": parser_id,
            "debug": True
        })
        extractor = current_app.config['extractor']
        stime = time.time()
        extract_rsp = extractor.extract(download_rsp)
        spend_time['extract_spend'] = (time.time() - stime) * 1000
        # Entity extraction data list.
        entity_datas = None
        # Schema check result.
        schema_check_result = None
        entity_rsps = None
        cur_datetime = str(datetime.datetime.now())
        try:
            stime = time.time()
            extract_data = extract_rsp.extract_info.extract_data
            if extract_data:
                extract_data_dict = json.loads(extract_data)
                _src = {
                    "url": extract_rsp.base_info.url,
                    "site_id": extract_rsp.base_info.site_id,
                    "site": extract_rsp.base_info.site
                }
                if "datas" in extract_data_dict:
                    datas = extract_data_dict['datas']
                    tmp_datas = []
                    for d in datas:
                        d['_src'] = [_src]
                        tmp_datas.append(d)
                    extract_data_dict['datas'] = tmp_datas
                else:
                    extract_data_dict['_src'] = [_src]
                extract_rsp.extract_info.extract_data = json.dumps(extract_data_dict)
            entity_rsps = current_app.config['entity_extractor'].entity_extract(extract_rsp)
            spend_time['entity_spend'] = (time.time() - stime) * 1000
            entity_datas = []
            for data in entity_rsps.entity_data_list:
                if data:
                    entity_datas.append(json.loads(data.entity_data))
                else:
                    entity_datas.append(None)
        except Exception as e:
            if entity_rsps:
                entity_datas = {
                    'sys_error': e.message,
                    'error_message': entity_rsps.msg
                }
            else:
                entity_datas = {'sys_error': e.message}
        final_data = {}
        try:
            if entity_rsps.entity_data_list:
                entity_json = {
                    "topic_id": entity_rsps.entity_data_list[0].topic_id,
                    "data": json.loads(entity_rsps.entity_data_list[0].entity_data)
                }
                datasaver_resp = current_app.config['data_saver'].check_data(
                    json.dumps(entity_json))
                final_data = json.loads(datasaver_resp.data)
        except Exception as e:
            final_data = {'sys_error': e.message}
        return jsonify({
            'status': True,
            'data': build_test_parser_config_rsp(extract_rsp, entity_datas,
                                                 final_data, spend_time)
        })
    except Exception as e:
        current_app.config['logger'].error(hz_url)
        current_app.config['logger'].info(traceback.format_exc())
        return jsonify({'status': False, 'data': e.message})
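# Sample client call for the handler above. The route path and port are
# assumptions (the decorator is not shown); the JSON fields are exactly the
# keys the handler reads from request.json. 'headers' and 'request_params'
# are JSON-encoded strings because the handler json.loads() them, and
# 'page_source' must be present since the handler calls .strip() on it.
import json
import requests

payload = {
    'request_method': 'get',
    'request_url': 'http://wsgs.fjaic.gov.cn/creditpub/home',
    'download_type': 'simple',
    'headers': json.dumps({'User-Agent': 'Mozilla/5.0'}),
    'request_params': json.dumps({}),
    'parser_id': '-1',
    'page_source': 'downloader',  # one of: cache / downloader / pagedb / input
    'is_save': 'false',           # 'true' also queues the page into beanstalkd
}
rsp = requests.post('http://127.0.0.1:5000/test_parser_config', json=payload)
print rsp.json()['status']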