Example 1
0
 def prepare_download_rsp(self, url, content_type="html/text"):
     """Build a stub DownLoadRsp that represents a successful download of *url*.

     The response is synthetic: fixed elapsed time (50), HTTP 200, status 0
     (OK), and a download timestamp of "now".

     :param url: URL to record on the response object.
     :param content_type: MIME-ish type tag stored on the response.
         NOTE(review): the default "html/text" looks like a typo for the
         standard "text/html" MIME type — confirm with callers before
         changing it.
     :return: a populated DownLoadRsp instance.
     """
     rsp = DownLoadRsp()
     rsp.url = url
     rsp.elapsed = 50
     rsp.content_type = content_type
     rsp.status = 0
     rsp.http_code = 200
     # Timestamp the synthetic download with the current epoch second.
     rsp.download_time = int(time.time())
     return rsp
Example 2
0
def test_parser_config():
    """Flask debug endpoint: download a page, run extraction / entity
    parsing / schema checking on it, and return all intermediate results.

    Expects a JSON body with at least ``request_url``; optional keys include
    ``request_method``, ``download_type``, ``headers`` (JSON string),
    ``request_params`` (JSON string of POST data), ``parser_id``,
    ``page_source`` (one of cache/downloader/pagedb/input), ``input_page``
    and ``is_save``.

    :return: a jsonify'd dict — ``{'status': True, 'data': ...}`` on
        success, ``{'status': False, 'data': <error>}`` on failure.
    """
    if not request.json:
        return jsonify({'status': 'failed', 'data': 'request error'})
    req_datas = request.json
    download_req = DownLoadReq()
    download_req.method = req_datas.get('request_method', 'get')
    download_req.url = req_datas.get('request_url')
    download_req.download_type = req_datas.get('download_type')
    download_req.post_data = {}
    download_req.http_header = {}
    try:
        download_req.http_header = json.loads(req_datas.get('headers'))
    except Exception:
        # Missing or malformed headers: fall back to "no explicit headers".
        download_req.http_header = None
    post_data = None
    try:
        post_data = json.loads(req_datas.get('request_params'))
    except Exception:
        # POST params are optional; ignore bad/absent JSON.
        pass
    parser_id = req_datas.get('parser_id', "-1")
    # BUG FIX: use a default so a request without 'page_source' does not
    # raise AttributeError (None.strip()) outside the try block below.
    page_source = req_datas.get('page_source', 'cache').strip()

    if page_source not in ['cache', 'downloader', 'pagedb', 'input']:
        page_source = 'cache'
    hz_url = download_req.url
    if post_data and download_req.method == "post":
        # POST requests are keyed by a composite URL that embeds the body.
        hz_url = build_hzpost_url(download_req.url, post_data)
        download_req.url = hz_url
    spend_time = {}
    try:
        page_id = get_md5_i64(hz_url)
        download_rsp = None
        stime = time.time()

        if page_source == 'pagedb':
            download_rsp = current_app.config['crawler_merge'].select_one(
                hz_url)
            if download_rsp.status == 1:
                # status == 1 means the stored page is unusable; re-download.
                download_rsp = None
        elif page_source == 'cache':
            download_rsp = get_page_cache(page_id)
        elif page_source == 'input':
            # Caller supplied the page body directly; fabricate a response.
            download_rsp = DownLoadRsp()
            download_rsp.url = hz_url
            download_rsp.status = 0
            download_rsp.content_type = "text"
            download_rsp.http_code = 200
            download_rsp.download_time = 0
            download_rsp.content = req_datas.get('input_page',
                                                 "").encode('utf8')
            download_rsp.src_type = "input"
            download_rsp.elapsed = 50
        if not download_rsp:
            # Fallback (and the 'downloader' source): fetch the page live.
            downloader = current_app.config['downloader']
            download_rsp = downloader.download(hz_url, download_req)
            download_rsp.url = hz_url
        spend_time['download_spend'] = (time.time() - stime) * 1000
        set_page_cache(page_id, download_rsp)
        is_save = req_datas.get('is_save', 'false')
        if is_save == "true":
            # Persist the raw download via beanstalk for later replay.
            download_rsp.parse_extends = json.dumps({'parser_id': parser_id})
            download_rsp_tube = current_app.config[
                'put_beanstald_server'].get_tube_by_name('download_rsp_tube')
            if download_rsp_tube:
                current_app.config['put_beanstald_server'].save_record({
                    'tube_name':
                    download_rsp_tube,
                    'obj':
                    download_rsp
                })
        # Copy download_rsp to guard against concurrent modification by
        # other threads.
        download_rsp = deepcopy(download_rsp)
        download_rsp.parse_extends = json.dumps({
            "parser_id": parser_id,
            "debug": True
        })
        extractor = current_app.config['extractor']
        stime = time.time()
        extract_rsp = extractor.extract(download_rsp)
        spend_time['extract_spend'] = (time.time() - stime) * 1000
        # List of entity-parse results.
        entity_datas = None
        # Schema-check result.
        schema_check_result = None
        entity_rsps = None
        cur_datetime = str(datetime.datetime.now())
        try:
            stime = time.time()
            extract_data = extract_rsp.extract_info.extract_data
            if extract_data:
                extract_data_dict = json.loads(extract_data)
                # Provenance record attached to every extracted item.
                _src = {
                    "url": extract_rsp.base_info.url,
                    "site_id": extract_rsp.base_info.site_id,
                    "site": extract_rsp.base_info.site
                }
                if "datas" in extract_data_dict:
                    datas = extract_data_dict['datas']
                    tmp_datas = []
                    for d in datas:
                        d['_src'] = [_src]
                        tmp_datas.append(d)
                    extract_data_dict['datas'] = tmp_datas
                else:
                    extract_data_dict['_src'] = [_src]
                extract_rsp.extract_info.extract_data = json.dumps(
                    extract_data_dict)
                entity_rsps = current_app.config[
                    'entity_extractor'].entity_extract(extract_rsp)
                spend_time['entity_spend'] = (time.time() - stime) * 1000
                entity_datas = []
                for data in entity_rsps.entity_data_list:
                    if data:
                        entity_datas.append(json.loads(data.entity_data))
                    else:
                        entity_datas.append(None)
        except Exception as e:
            # BUG FIX: Python 3 exceptions have no .message attribute —
            # using it here raised AttributeError and masked the real error.
            if entity_rsps:
                entity_datas = {
                    'sys_error': str(e),
                    'error_message': entity_rsps.msg
                }
            else:
                entity_datas = {'sys_error': str(e)}
        final_data = {}
        try:
            # Schema-check only the first entity (debug endpoint behavior).
            if entity_rsps.entity_data_list:
                entity_json = {
                    "topic_id": entity_rsps.entity_data_list[0].topic_id,
                    "data":
                    json.loads(entity_rsps.entity_data_list[0].entity_data)
                }
                datasaver_resp = current_app.config['data_saver'].check_data(
                    json.dumps(entity_json))
                final_data = json.loads(datasaver_resp.data)
        except Exception as e:
            # BUG FIX: str(e) instead of the Py2-only e.message.
            final_data = {'sys_error': str(e)}
        return jsonify({
            'status':
            True,
            'data':
            build_test_parser_config_rsp(extract_rsp, entity_datas, final_data,
                                         spend_time)
        })
    except Exception as e:
        current_app.config['logger'].error(hz_url)
        current_app.config['logger'].info(traceback.format_exc())
        # BUG FIX: str(e) instead of the Py2-only e.message.
        return jsonify({'status': False, 'data': str(e)})