Example 1
 def do_task(self, body):
     download_rsp = DownLoadRsp()
     try:
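         # Deserialize the Thrift-encoded DownLoadRsp from the raw message body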
         tMemory_o = TMemoryBuffer(body)
         tBinaryProtocol_o = TBinaryProtocol(tMemory_o)
         download_rsp.read(tBinaryProtocol_o)
         parserpage = self.extractor.extract(download_rsp)
         self.task_collector.task_stats(parserpage)
         self.log.debug(parserpage)
         return self.to_string(parserpage), parserpage.extract_info.topic_id
     except EOFError as e:
         self.log.warning("can't read DownLoadRsp from string")
         return None
Example 2
 def process_response(self, request, response):
     resp = {}
     resp['url'] = str_obj(request.url)
     if getattr(request, 'src_type', None) is not None:
         resp['src_type'] = str_obj(request.src_type)
     resp['download_time'] = int(time.time())
     resp['pages'] = []
     resp['content'] = ''
     resp['info'] = request.info
     resp['scheduler'] = request.scheduler
     resp['parse_extends'] = request.parse_extends
     resp['data_extends'] = request.data_extends
     resp['http_code'] = 0
     resp['elapsed'] = -1
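     # Fill in the HTTP-derived fields only when a response object was returned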
     if response is not None:
         resp['redirect_url'] = str_obj(response.url)
         resp['http_code'] = response.status_code
         resp['elapsed'] = int(response.elapsed.microseconds / 1000.0)
         resp['content_type'] = response.headers.get('content-type')
         resp['content'] = str_obj(response.content)
         resp['page_size'] = len(resp['content'])
     if hasattr(request, 'contenta'):
         resp['content'] = str_obj(request.contenta)
     if hasattr(request, 'identify_status'):
         resp['status'] = request.identify_status
     elif resp.get('http_code') == 200:
         resp['status'] = CrawlStatus.CRAWL_SUCCESS
     else:
         resp['status'] = CrawlStatus.CRAWL_FAILT
     return DownLoadRsp(**resp)
Example 3
 def prepare_download_rsp(self, url, content_type="text/html"):
     obj = DownLoadRsp()
     obj.url = url
     obj.elapsed = 50
     obj.content_type = content_type
     obj.status = 0
     obj.http_code = 200
     obj.download_time = int(time.time())
     return obj
Example 4
 def get_download_rsp(self, result):
     url = result['url']
     content = result['content'].encode('utf-8')
     content_type = result.get('content_type', 'text/html')
     page_size = len(content)
     return DownLoadRsp(url=url,
                        download_time=int(time.time()),
                        status=0,
                        content_type=content_type,
                        page_size=page_size,
                        elapsed=100,
                        content=content,
                        redirect_url=url,
                        src_type='webpage',
                        http_code=200)
Example 5
 def select_webpage_by_url(self, url):
     self.log.info("select_webpage_by_url start\turl:{}".format(url))
     url = url_encode(url)
     download_result = DownLoadRsp(url=url,
                                   download_time=int(time.time()),
                                   status=1,
                                   content_type='text/html',
                                   page_size=0,
                                   elapsed=100,
                                   content=None,
                                   redirect_url=url,
                                   src_type='webpage',
                                   http_code=0)
     try:
         query_item = {'url': url}
         domain = get_url_info(url).get('domain')
         result = self.mongo_client_web.find_first(domain, query_item)
         if result and (result.get('content')):
             download_result = self.get_download_rsp(result)
     except Exception:
         self.log.error("select_webpage_by_url\turl\t{0}\terror:{1}".format(
             url, traceback.format_exc()))
     self.log.info("select_webpage_by_url finish\turl:{}".format(url))
     return download_result
Example 6
 def download(self, request):
     self.log.info("start_crawl\turl::%s\tmethod:%s\tdownload_type:%s" %
                   (request.url, request.method, request.download_type))
     start = time.time()
     response = DownLoadRsp(status=CrawlStatus.CRAWL_FAILT)
     try:
         if request.retry_times is None:
             retry_times = self.conf.get(
                 'default_request_kwargs')['retry_times']
         else:
             retry_times = request.retry_times
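         # Retry the download up to retry_times times, stopping on the first successful crawl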
         for t in xrange(retry_times):
             request = copy.deepcopy(request)
             self.middleware_manager.process_request(request)
             start_time = time.time()
             res = self.downloader.download(request)
             response = self.middleware_manager.process_response(
                 request, res)
             request.proxytime = (time.time() -
                                  start_time) * 1000.0 - response.elapsed
             self.write_log(request, response)
             if response.status == CrawlStatus.CRAWL_SUCCESS:
                 break
             time.sleep(3)
     except Exception as e:
         self.log.error('url:' + request.url + '\terror_msg:' +
                        str(traceback.format_exc()))
     finally:
         content_len = -1
         if response.content:
             content_len = len(response.content)
         self.log.info('finish_crawl\tuse_time:' +
                       str(time.time() - start) + '\tlens:' +
                       str(content_len) + '\tstatus:' +
                       str(response.status) + '\turl:' + str(request.url))
     return response
Example 7
#_ElementUnicodeResult
import re
import json
ss = "L2UnbnJUQ3QjHxZcbDQBTlZFIwlqMXYeTDwoZxplOCMIelU9BxASGF4dBgoyZS5tLSYyPxUDNFB%2BNnBhTwUjEh47KFUQKFJcKwNnOi9BE1MSRkADNi5JckwGIjosagcsYBISIiBrKFZyLSJ6ZAcVZmhnIGRxCydraw4EZxdmFyt7FVIVZFUMADdVBQlzBTgJVgBjWyp4C0swaxcQPzUnaSA7FWhRKxBAfh1aCmUbFgkoJ3EGLX84DTovJmMmEBAAJ2EmQTZUAxkjH01NWAUiUAURElFifjkcKz8ZdVZlBT0WB2Q%3D"
ss = "%2B"

exit(1)
html = ""
with open('douban_detail.html') as fp:
    html = fp.read()
print re.findall("语言:</span> (.*?)<br/>", html)
exit(1)
#html = ''
CHARSET_PATTERN = re.compile(
    '<meta.*?(?:charset|CHARSET)=["\']?([a-zA-Z0-9\\-]+)["\']?.*?>')
#html = CHARSET_PATTERN.sub("", html)
drsp = DownLoadRsp()
drsp.content = html
#print txt
drsp.status = 0
#drsp.parse_extends = json.dumps({'debug':True})
drsp.content_type = "text/html"
drsp.url = "https://movie.douban.com/tag/"
#drsp.url = "http://www.baidu.com"
drsp.redirect_url = "https://movie.douban.com/tag/"
drsp.parse_extends = json.dumps({"parser_id": -1, "debug": True})
#print html
#from i_extractor.extractor import Extractor
from i_extractor import conf
from i_util.i_crawler_services import ThriftExtractor
#extractor = Extractor(conf)
extractor = ThriftExtractor(host="127.0.0.1", port=12300)
Example 8
        log.warning("cann't write DownLoadRsp to string")
    return str_page_info


def put_beanstalked(beanstalk_conf, log, rsp):
    beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
    tube = beanstalk_conf['input_tube']
    str_page_info = to_string(log, rsp)
    try:
        beanstalk.put(tube, str_page_info)
        log.info('beanstalk\turl:%s\ttube:%s' % (rsp.url, tube))
    except Exception as e:
        log.error('beanstalk put error url:' + rsp.url + '\ttube:' + tube)


if __name__ == "__main__":
    url = 'dsfsdf'
    redirect_url = 'sdfdfsfsfs'
    status = 1
    http_code = 200
    rsp = DownLoadRsp(url=url,
                      redirect_url=redirect_url,
                      status=status,
                      http_code=http_code)
    beanstalk_conf = {}
    beanstalk_conf['host'] = '101.201.102.37'
    beanstalk_conf['port'] = 11300
    beanstalk_conf['input_tube'] = 'download_rsp_test'
    log = LogHandler('download_test')
    put_beanstalked(beanstalk_conf, log, rsp=rsp)
Example 9
def test_parser_config():
    if not request.json:
        return jsonify({'status': 'failed', 'data': 'request error'})
    req_datas = request.json
    download_req = DownLoadReq()
    download_req.method = req_datas.get('request_method', 'get')
    download_req.url = req_datas.get('request_url')
    download_req.download_type = req_datas.get('download_type')
    download_req.post_data = {}
    download_req.http_header = {}
    try:
        download_req.http_header = json.loads(req_datas.get('headers'))
    except Exception as e:
        download_req.http_header = None
    post_data = None
    try:
        post_data = json.loads(req_datas.get('request_params'))
    except Exception as e:
        pass
    parser_id = req_datas.get('parser_id', "-1")
    page_source = req_datas.get('page_source', '').strip()

    if page_source not in ['cache', 'downloader', 'pagedb', 'input']:
        page_source = 'cache'
    hz_url = download_req.url
    if post_data and download_req.method == "post":
        hz_url = build_hzpost_url(download_req.url, post_data)
        download_req.url = hz_url
    spend_time = {}
    try:
        page_id = get_md5_i64(hz_url)
        download_rsp = None
        stime = time.time()
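        # Obtain the page from pagedb, the local cache, the raw request input, or a live download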

        if page_source == 'pagedb':
            download_rsp = current_app.config['crawler_merge'].select_one(
                hz_url)
            if download_rsp.status == 1:
                download_rsp = None
        elif page_source == 'cache':
            download_rsp = get_page_cache(page_id)
        elif page_source == 'input':
            download_rsp = DownLoadRsp()
            download_rsp.url = hz_url
            download_rsp.status = 0
            download_rsp.content_type = "text"
            download_rsp.http_code = 200
            download_rsp.download_time = 0
            download_rsp.content = req_datas.get('input_page',
                                                 "").encode('utf8')
            download_rsp.src_type = "input"
            download_rsp.elapsed = 50
        if not download_rsp:
            downloader = current_app.config['downloader']
            download_rsp = downloader.download(hz_url, download_req)
            download_rsp.url = hz_url
        spend_time['download_spend'] = (time.time() - stime) * 1000
        set_page_cache(page_id, download_rsp)
        is_save = req_datas.get('is_save', 'false')
        if is_save == "true":
            download_rsp.parse_extends = json.dumps({'parser_id': parser_id})
            download_rsp_tube = current_app.config[
                'put_beanstald_server'].get_tube_by_name('download_rsp_tube')
            if download_rsp_tube:
                current_app.config['put_beanstald_server'].save_record({
                    'tube_name': download_rsp_tube,
                    'obj': download_rsp
                })
        # Copy download_rsp so concurrent threads cannot modify the shared object
        download_rsp = deepcopy(download_rsp)
        download_rsp.parse_extends = json.dumps({
            "parser_id": parser_id,
            "debug": True
        })
        extractor = current_app.config['extractor']
        stime = time.time()
        extract_rsp = extractor.extract(download_rsp)
        spend_time['extract_spend'] = (time.time() - stime) * 1000
        # Entity extraction results
        entity_datas = None
        # Schema check result
        schema_check_result = None
        entity_rsps = None
        cur_datetime = str(datetime.datetime.now())
        try:
            stime = time.time()
            extract_data = extract_rsp.extract_info.extract_data
            if extract_data:
                extract_data_dict = json.loads(extract_data)
                _src = {
                    "url": extract_rsp.base_info.url,
                    "site_id": extract_rsp.base_info.site_id,
                    "site": extract_rsp.base_info.site
                }
                if "datas" in extract_data_dict:
                    datas = extract_data_dict['datas']
                    tmp_datas = []
                    for d in datas:
                        d['_src'] = [_src]
                        tmp_datas.append(d)
                    extract_data_dict['datas'] = tmp_datas
                else:
                    extract_data_dict['_src'] = [_src]
                extract_rsp.extract_info.extract_data = json.dumps(
                    extract_data_dict)
                entity_rsps = current_app.config[
                    'entity_extractor'].entity_extract(extract_rsp)
                spend_time['entity_spend'] = (time.time() - stime) * 1000
                entity_datas = []
                for data in entity_rsps.entity_data_list:
                    if data:
                        entity_datas.append(json.loads(data.entity_data))
                    else:
                        entity_datas.append(None)
        except Exception as e:
            if entity_rsps:
                entity_datas = {
                    'sys_error': e.message,
                    'error_message': entity_rsps.msg
                }
            else:
                entity_datas = {'sys_error': e.message}
        final_data = {}
        try:
            if entity_rsps and entity_rsps.entity_data_list:
                entity_json = {
                    "topic_id": entity_rsps.entity_data_list[0].topic_id,
                    "data": json.loads(entity_rsps.entity_data_list[0].entity_data)
                }
                datasaver_resp = current_app.config['data_saver'].check_data(
                    json.dumps(entity_json))
                final_data = json.loads(datasaver_resp.data)
        except Exception as e:
            final_data = {'sys_error': e.message}
        return jsonify({
            'status': True,
            'data': build_test_parser_config_rsp(extract_rsp, entity_datas,
                                                 final_data, spend_time)
        })
    except Exception as e:
        current_app.config['logger'].error(hz_url)
        current_app.config['logger'].info(traceback.format_exc())
        return jsonify({'status': False, 'data': e.message})
Example 10
    def do_task(self, body):
        try:
            download_req = json.loads(body)
            self.log.info("request_msg\t%s" % download_req)

            target_extractor_id = self.type_extractor_map[download_req['_type']]

            name = download_req['name'].encode('utf-8')
            target_url = 'http://%(site)s/gongshang_search?%(query)s' % {
                'site': prov_site_map[download_req['province']],
                'query': urllib.urlencode({
                    'name': name,
                    'original_query': json.dumps(download_req)
                })
            }
            self.log.info('Requesting company via proxy: name = {name}'.format(name=name))

            response = requests.get(target_url, proxies={'http': self.smart_proxy_url})
            if response.status_code != 200:
                download_rsp = DownLoadRsp(status=CrawlStatus.CRAWL_FAILT)
                return self.to_string(download_rsp)

            self.log.debug(response.text)

            resp_json = response.json()

            url = resp_json['url']

            # Assemble the DownLoadRsp
            resp = dict()
            resp['url'] = str_obj(url)
            resp['download_time'] = resp_json.get('entitySrcDownloadTime', 0)
            resp['pages'] = []
            resp['content'] = str_obj(resp_json['html'])
            if resp['content'] is None:
                resp['content'] = '<html></html>'
            resp['data_extends'] = str_obj(json.dumps(resp_json['entity']))
            resp['parse_extends'] = str_obj(json.dumps({"parser_id": target_extractor_id}))
            resp['page_size'] = len(resp['content'])
            resp['content_type'] = 'text/html'
            resp['src_type'] = 'webpage'
            # resp['info'] = request.info
            # resp['scheduler'] = request.scheduler
            # resp['parse_extends'] = request.parse_extends
            resp['http_code'] = response.status_code
            resp['elapsed'] = int(response.elapsed.microseconds / 1000.0)
            resp['status'] = CrawlStatus.CRAWL_SUCCESS
            download_rsp = DownLoadRsp(**resp)

            self.log.info('Sending to parser: name = {name} url = {url}'.format(name=name, url=resp['url']))

            # self.log.info(download_rsp)

            # Report the result to the gongshang (business registration) scheduler
            company_name = resp_json['entity'].get('company')
            self.out_beanstalk.put(self.output_tube_scheduler, json.dumps({
                'company': company_name,
                'crawl_online': resp_json['crawlStatus'].get('crawl_online'),
                'crawl_online_time': resp_json['crawlStatus'].get('crawl_online_time'),
                'query': resp_json['crawlSeed'],
            }))
            self.log.info('Sent company name to the gongshang scheduler queue: company = {company}'.format(company=company_name.encode('utf-8')))
            return self.to_string(download_rsp)
        except Exception as err:
            self.log.error("process failed, err[%s]" % (repr(err)))
            self.log.exception(err)

            download_rsp = DownLoadRsp(status=CrawlStatus.CRAWL_FAILT)
            return self.to_string(download_rsp)