Beispiel #1
0
def put_beanstalked(beanstalk_conf, log, rsp):
    """Serialize ``rsp`` and put it on the configured beanstalk input tube.

    Args:
        beanstalk_conf: mapping providing ``'host'``, ``'port'`` and
            ``'input_tube'``.
        log: logger; passed to ``to_string`` and used for reporting here.
        rsp: response object; its ``url`` attribute is used in log lines.

    A failure of ``beanstalk.put`` is logged and swallowed, so the caller
    never sees it.
    """
    beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
    tube = beanstalk_conf['input_tube']
    str_page_info = to_string(log, rsp)
    try:
        beanstalk.put(tube, str_page_info)
    except Exception as e:
        # %-formatting (not '+') so a non-str url cannot raise inside the
        # handler, and %r so the cause of the failure is not lost.
        log.error('beanstalk put error url:%s\ttube:%s\terror:%r'
                  % (rsp.url, tube, e))
    else:
        log.info('beanstalk\turl:%s\ttube:%s' % (rsp.url, tube))
Beispiel #2
0
def deliver_req():
    """Forward queued requests to the ``online_download_req`` tube forever.

    Items are taken from ``index_queue`` (defined outside this function) as
    ``(priority, reqs)`` pairs; ``reqs`` is serialized with
    ``req_to_string`` and put on beanstalk.  This function never returns.
    """
    out_beanstalk = PyBeanstalk('172.18.180.223', 11300)
    while True:
        try:
            _priority, reqs = index_queue.get_nowait()
        except Empty:
            # Back off while the queue is empty.  The sleep has to come
            # before ``continue``; placed after it, it is unreachable and
            # the loop spins at full CPU.
            time.sleep(6)
            continue
        out_beanstalk.put('online_download_req', req_to_string(reqs))
Beispiel #3
0
class PutBeanstaldServer(threading.Thread):
    """Daemon thread that drains an internal queue into beanstalk tubes.

    Producers call ``save_record`` with a dict holding ``'tube_name'`` and
    ``'obj'``; the thread serializes ``obj`` and puts it on that tube.
    """

    def __init__(self, beanstalk_conf, log):
        """Create the internal queue and the beanstalk client.

        Args:
            beanstalk_conf: mapping providing ``'host'`` and ``'port'``;
                also the mapping consulted by ``get_tube_by_name``.
            log: logger used for all reporting.
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.running = True
        self._queue = Queue()
        self._log = log
        self.beanstalk_conf = beanstalk_conf
        self.beanstalk = self._connect()

    def _connect(self):
        """Return a new beanstalk client for the configured host/port."""
        return PyBeanstalk(self.beanstalk_conf['host'],
                           self.beanstalk_conf['port'])

    def _reconnect(self):
        """Replace the client; a failed attempt is logged, not raised."""
        try:
            self.beanstalk = self._connect()
        except Exception:
            # Keep the old client; the next put will fail and retry this.
            self._log.error('beanstalk reconnect failed, {}'.format(
                traceback.format_exc()))

    def to_string(self, page_info):
        """Serialize ``page_info`` into an in-memory buffer.

        Returns:
            The buffer contents, or ``None`` when serialization fails.
        """
        try:
            buf = TMemoryBuffer()
            page_info.write(TBinaryServerProtocol(buf))
            return buf.getvalue()
        except Exception:
            # Any serialization error (not only EOFError) must stay inside
            # this method: it is called from the worker loop, where an
            # uncaught exception would end the thread.
            self._log.warning("cann't write data to string, {}".format(
                traceback.format_exc()))
            return None

    def put_beanstalkd(self, tube_name, obj):
        """Serialize ``obj`` and put it on ``tube_name``.

        Failures are logged and never raised.  On ``SocketError`` the
        client is re-created; the current record is not retried.
        """
        payload = self.to_string(obj)
        if payload is None:
            # Serialization failed and was already logged; there is
            # nothing meaningful to enqueue.
            return
        try:
            self.beanstalk.put(tube_name, payload)
            self._log.info('put beanstalk \ttube:%s success' % (tube_name, ))
        except SocketError as e:
            self._log.error('beanstalk connect failed, {}'.format(e))
            self._reconnect()
        except Exception:
            self._log.info('beanstalk put tube{} error {}'.format(
                tube_name, str(traceback.format_exc())))

    def run(self):
        """Process queued records until ``self.running`` is cleared.

        ``Queue.get`` blocks, so a cleared flag is only noticed after the
        next record arrives.
        """
        while self.running:
            record = self._queue.get()
            self._build_record_and_put(record)

    def get_tube_by_name(self, tube_name):
        """Return ``beanstalk_conf[tube_name]``, or ``None`` if absent."""
        return self.beanstalk_conf.get(tube_name, None)

    def _build_record_and_put(self, data):
        """Put ``data['obj']`` on ``data['tube_name']``.

        Records with a missing or falsy tube name or object are dropped.
        """
        tube_name = data.get('tube_name', None)
        if not tube_name:
            return
        obj = data.get('obj', None)
        if not obj:
            return
        self.put_beanstalkd(tube_name, obj)

    def save_record(self, data):
        """Enqueue ``data`` for the worker thread."""
        self._queue.put(data)
Beispiel #4
0
def thrput_task():
    """Push 100 rounds of generated requests onto the ``download_req`` tube.

    Each round builds one request per key of ``pro`` with ``getreq``,
    serializes it with ``to_string`` and puts it on beanstalk, then prints
    the time elapsed since the start.  A failure on a single request is
    printed and does not stop the run.
    """
    input_tube = 'download_req'
    beanstalk = PyBeanstalk('101.201.102.37', 11300)
    client, transport = getclient()
    start = time.time()
    try:
        for _ in range(100):
            for proa in pro.keys():
                try:
                    req = getreq(proa=proa)
                    beanstalk.put(input_tube, to_string(req))
                except Exception as e:
                    print(e)
            print('usetime:{}'.format(time.time() - start))
    finally:
        # Close the transport even when the run is interrupted.
        closetransport(transport)
Beispiel #5
0
class CrawlSelector(threading.Thread):
    """Daemon thread that forwards scheduled requests to a beanstalk tube.

    While running, it asks ``scheduler.dispatch()`` for requests,
    serializes each one and puts it on ``beanstalk_conf['output_tube']``.
    """

    def __init__(self, log, selector_conf, beanstalk_conf, scheduler=None):
        """Store the configuration and open the output beanstalk client.

        Args:
            log: logger used for all reporting.
            selector_conf: mapping; ``run`` reads
                ``'select_seed_sleep_time'`` from it.
            beanstalk_conf: mapping providing ``'host'``, ``'port'`` and
                ``'output_tube'``.
            scheduler: optional object exposing ``dispatch()``; when it is
                falsy, ``run`` only sleeps.
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.running = False
        self.log = log
        # Download statistics.
        self.site_static = {}
        self.scheduler = scheduler
        self.download_req_num = 0
        # Downloader configuration.
        self.downloader_num = 0
        # Selector configuration.
        self.selector_conf = selector_conf
        # Beanstalk queue settings.
        self.beanstalk_conf = beanstalk_conf
        self.out_beanstalk = PyBeanstalk(beanstalk_conf['host'],
                                         beanstalk_conf['port'])
        self.output_tube = beanstalk_conf['output_tube']
        self.wlock = threading.Lock()

    def req_to_string(self, req):
        """Serialize ``req`` into an in-memory buffer.

        Returns:
            The buffer contents, or ``""`` when serialization fails (the
            traceback is logged).
        """
        str_req = ""
        try:
            tMemory_b = TMemoryBuffer()
            tBinaryProtocol_b = TBinaryProtocol(tMemory_b)
            req.write(tBinaryProtocol_b)
            str_req = tMemory_b.getvalue()
        except Exception:
            self.log.error('crawled_failt\terror:%s' %
                           (traceback.format_exc()))
        return str_req

    def run(self):
        """Dispatch and enqueue requests until ``self.running`` is cleared.

        After each batch the thread sleeps for
        ``selector_conf['select_seed_sleep_time']``.  On ``SocketError`` it
        waits 30 seconds and asks the client to reconnect; every other
        error is logged and the loop continues.
        """
        self.running = True
        while self.running:
            # URL of the request being handled, for the error log below.
            url = None
            try:
                reqs = self.scheduler.dispatch() if self.scheduler else None
                if reqs:
                    for req in reqs:
                        url = getattr(req, 'url', None)
                        req_str = self.req_to_string(req)
                        if not req_str:
                            # Serialization failed and was logged; do not
                            # enqueue an empty job body.
                            continue
                        self.out_beanstalk.put(self.output_tube, req_str)
                        self.log.info(
                            'start_crawl\turl:%s\tdownload_type:%s\tsession:%s'
                            % (req.url, req.download_type, req.session_commit))
                time.sleep(self.selector_conf['select_seed_sleep_time'])
            except SocketError:
                # Log before sleeping so the timestamp matches the failure.
                self.log.error('beanstalk\tconnect\tfail\tstart\treconnect')
                time.sleep(30)
                try:
                    self.out_beanstalk.reconnect()
                    self.log.error('beanstalk\treconnect\tsuccess')
                except Exception:
                    self.log.error('beanstalk\treconnect\tfail')
            except Exception:
                self.log.error('crawled_failt\turl:%s\terror:%s' %
                               (url, traceback.format_exc()))
Beispiel #6
0
class SelectProcessor(object):
    """Look up stored web pages in MongoDB and re-emit them as download responses."""

    def __init__(self, conf):
        """Open the MongoDB and beanstalk clients described by ``conf``.

        Args:
            conf: mapping providing ``'log'``, ``'beanstalk_conf'`` and
                ``'webpage_db'`` (``host``, ``port``, ``db``, ``username``,
                ``password``).

        Connection errors are logged, not raised; in that case the
        affected client attribute stays ``None``.
        """
        self.log = conf['log']
        self.conf = conf
        self.beanstalk_conf = conf['beanstalk_conf']
        # Define both attributes up front so a failed connection leaves the
        # object in a known state instead of without the attributes.
        self.mongo_client_web = None
        self.beanstalk_client = None
        try:
            webpage_db = self.conf['webpage_db']
            self.mongo_client_web = PyMongo(
                webpage_db['host'], webpage_db['port'], webpage_db['db'],
                webpage_db['username'], webpage_db['password'])
            self.beanstalk_client = PyBeanstalk(self.beanstalk_conf['host'],
                                                self.beanstalk_conf['port'])
        except Exception:
            self.log.error(traceback.format_exc())

    def get_download_rsp(self, result):
        """Build a successful ``DownLoadRsp`` from a stored page record.

        Args:
            result: mapping with ``'url'`` and ``'content'``; an optional
                ``'content_type'`` defaults to ``'text/html'``.
        """
        url = result['url']
        content = result['content'].encode('utf-8')
        content_type = result.get('content_type', 'text/html')
        page_size = len(content)
        return DownLoadRsp(url=url,
                           download_time=int(time.time()),
                           status=0,
                           content_type=content_type,
                           page_size=page_size,
                           elapsed=100,
                           content=content,
                           redirect_url=url,
                           src_type='webpage',
                           http_code=200)

    # Batch query by url_format; the results are sent to the queue by
    # select_webpage_to_mq.
    def select_webpage(self, site, url_format, limit, start, extra_filter):
        """Return a cursor over stored pages of ``site``.

        The collection is the first collection name that ``site`` equals or
        ends with (optionally preceded by a dot).

        Returns:
            The cursor from ``select_by_url_format``, or ``None`` when no
            collection matches or the lookup fails (the error is logged).
        """
        try:
            collection_names = self.mongo_client_web.get_collection_names()
            # TODO: i_util should provide a function that computes the main
            # domain.
            domain = ""
            for collection_name in collection_names:
                prefix_domain = "." + collection_name
                if site.endswith(collection_name) or site.endswith(
                        prefix_domain):
                    domain = collection_name
                    break
            if domain:
                item_cursor = self.mongo_client_web.select_by_url_format(
                    domain, site, url_format, limit, start, extra_filter)
                return item_cursor
        except Exception:
            self.log.error(
                "select_webpage\tsite:{0}\turl_format\t{1}\terror:{2}".format(
                    site, url_format, traceback.format_exc()))
        self.log.info(
            "select_webpage\tfinish\tsite:{0}\turl_format:{1}".format(
                site, url_format))
        return None

    def select_webpage_to_mq(self, condition):
        """Query pages matching ``condition`` and put them on the output tube.

        Args:
            condition: mapping with optional ``'url_format'``, ``'site'``,
                ``'limit'`` (default -1), ``'start'`` (default 0) and
                ``'extra_filter'`` (default ``'{}'``).  Nothing is queried
                when ``'site'`` is empty.

        Items without content are counted but not sent; progress is logged
        once per 100 items.
        """
        url_format = condition.get('url_format', "")
        site = condition.get('site', "")
        limit = int(condition.get('limit', -1))
        start = int(condition.get('start', 0))
        extra_filter = condition.get('extra_filter', '{}')
        self.log.info(
            "select_webpage_mq\tstart\tsite:{0}\turl_format:{1}".format(
                site, url_format))
        req_num = 0
        all_num = start
        if site:
            item_cursor = self.select_webpage(site, url_format, limit, start,
                                              extra_filter)
            if item_cursor:
                download_time = ""
                for item in item_cursor:
                    download_time = item.get("download_time", "")
                    all_num += 1
                    if item.get('content'):
                        download_rsp = self.get_download_rsp(item)
                        download_str = self.to_string(download_rsp)
                        # to_string returns None when serialization failed
                        # (already logged); skip rather than enqueue None.
                        if download_str is not None:
                            req_num += 1
                            self.beanstalk_client.put(
                                self.beanstalk_conf['output_tube'],
                                download_str)
                    if all_num % 100 == 1:
                        self.log.info(
                            "select_webpage_mq\trunning\tsite:{0}\turl_format:{1}\tall_num:{2}\treq_num:{3}\tdownload_time:{4}"
                            .format(site, url_format, all_num, req_num,
                                    download_time))
        self.log.info(
            "select_webpage_mq\tfinish\tsite:{0}\turl_format:{1}\treq_num:{2}".
            format(site, url_format, req_num))

    def select_webpage_to_list(self, condition):
        """Placeholder; always returns ``None``."""
        return None

    # Query a single record by URL.
    def select_webpage_by_url(self, url):
        """Return a ``DownLoadRsp`` for ``url`` from the stored pages.

        Returns:
            A response built from the stored record when one with content
            exists; otherwise a response with ``status=1``, no content and
            ``http_code=0``.  Lookup errors are logged, not raised.
        """
        self.log.info("select_webpage_by_url start\turl:{}".format(url))
        url = url_encode(url)
        download_result = DownLoadRsp(url=url,
                                      download_time=int(time.time()),
                                      status=1,
                                      content_type='text/html',
                                      page_size=0,
                                      elapsed=100,
                                      content=None,
                                      redirect_url=url,
                                      src_type='webpage',
                                      http_code=0)
        try:
            query_item = {'url': url}
            domain = get_url_info(url).get('domain')
            result = self.mongo_client_web.find_first(domain, query_item)
            if result and (result.get('content')):
                download_result = self.get_download_rsp(result)
        except Exception:
            self.log.error("select_webpage_by_url\turl\t{0}\terror:{1}".format(
                url, traceback.format_exc()))
        self.log.info("select_webpage_by_url finish\turl:{}".format(url))
        return download_result

    def to_string(self, link_info):
        """Serialize ``link_info`` into an in-memory buffer.

        Returns:
            The buffer contents, or ``None`` when an ``EOFError`` is raised
            while writing.
        """
        str_entity = None
        try:
            tMemory_b = TMemoryBuffer()
            tBinaryProtocol_b = TBinaryProtocol.TBinaryProtocol(tMemory_b)
            link_info.write(tBinaryProtocol_b)
            str_entity = tMemory_b.getvalue()
        except EOFError:
            self.log.warning("can't write LinkAttr to string")
        return str_entity
class DownloaderProccessor(NormalProccessor):
    """Fetch company data through the smart proxy and build download responses."""

    def __init__(self, log, conf):
        """Store the configuration and open the scheduler beanstalk client.

        Args:
            log: logger; must not be ``None``.
            conf: dict providing ``'type_extractor_map'``,
                ``'smart_proxy_url'`` and ``'beanstalk_conf'`` (``host``,
                ``port``, ``output_tube_scheduler``).
        """
        # Validate before the values are used.
        assert log is not None
        assert isinstance(conf, dict)

        self.log = log
        self.conf = conf

        self.type_extractor_map = self.conf['type_extractor_map']
        self.smart_proxy_url = self.conf['smart_proxy_url']

        beanstalk_conf = self.conf['beanstalk_conf']
        self.out_beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
        self.output_tube_scheduler = beanstalk_conf['output_tube_scheduler']

    def to_string(self, download_rsp):
        """Serialize ``download_rsp`` into an in-memory buffer.

        Returns:
            The buffer contents, or ``None`` when an ``EOFError`` is raised
            while writing.
        """
        str_rsq = None
        try:
            t_memory_b = TMemoryBuffer()
            t_binary_protocol_b = TBinaryProtocol(t_memory_b)
            download_rsp.write(t_binary_protocol_b)
            str_rsq = t_memory_b.getvalue()
        except EOFError:
            self.log.warning("cann't write PageParseInfo to string")
        return str_rsq

    def do_task(self, body):
        """Handle one download request and return a serialized ``DownLoadRsp``.

        Args:
            body: JSON text with at least ``'_type'``, ``'name'`` and
                ``'province'``.

        The request is sent to the province site's ``gongshang_search``
        endpoint through the smart proxy.  On success the crawl status is
        also put on the scheduler tube.  Any error, or a non-200 reply,
        yields a serialized response with ``CrawlStatus.CRAWL_FAILT``.
        """
        try:
            download_req = json.loads(body)
            self.log.info("request_msg\t%s" % download_req)

            target_extractor_id = self.type_extractor_map[download_req['_type']]

            name = download_req['name'].encode('utf-8')
            target_url = 'http://%(site)s/gongshang_search?%(query)s' % {
                'site': prov_site_map[download_req['province']],
                'query': urllib.urlencode({
                    'name': name,
                    'original_query': json.dumps(download_req)
                })
            }
            self.log.info('请求代理企业名称: name = {name}'.format(name=name))

            # NOTE(review): no timeout is passed, so a stalled proxy can
            # block this call indefinitely.
            response = requests.get(target_url, proxies={'http': self.smart_proxy_url})
            if response.status_code != 200:
                download_rsp = DownLoadRsp(status=CrawlStatus.CRAWL_FAILT, )
                return self.to_string(download_rsp)

            self.log.debug(response.text)

            resp_json = response.json()

            url = resp_json['url']

            # Assemble the DownLoadRsp fields.
            resp = dict()
            resp['url'] = str_obj(url)
            resp['download_time'] = resp_json.get('entitySrcDownloadTime', 0)
            resp['pages'] = []
            resp['content'] = str_obj(resp_json['html'])
            if resp['content'] is None:
                resp['content'] = '<html></html>'
            resp['data_extends'] = str_obj(json.dumps(resp_json['entity']))
            resp['parse_extends'] = str_obj(json.dumps({"parser_id": target_extractor_id}))
            resp['page_size'] = len(resp['content'])
            resp['content_type'] = 'text/html'
            resp['src_type'] = 'webpage'
            resp['http_code'] = response.status_code
            # ``timedelta.microseconds`` is only the sub-second component;
            # total_seconds() also covers requests that take over a second.
            resp['elapsed'] = int(response.elapsed.total_seconds() * 1000)
            resp['status'] = CrawlStatus.CRAWL_SUCCESS
            download_rsp = DownLoadRsp(**resp)

            self.log.info('发送到解析器的 name = {name} url = {url}'.format(name=name, url=resp['url']))

            # Report the crawl status to the scheduler tube.
            company_name = resp_json['entity'].get('company')
            self.out_beanstalk.put(self.output_tube_scheduler, json.dumps({
                'company': company_name,
                'crawl_online': resp_json['crawlStatus'].get('crawl_online'),
                'crawl_online_time': resp_json['crawlStatus'].get('crawl_online_time'),
                'query': resp_json['crawlSeed'],
            }))
            # A missing company must not turn this log line into an
            # AttributeError: the scheduler message is already sent, and the
            # handler below would then return a failure response.
            company_for_log = company_name.encode('utf-8') if company_name is not None else None
            self.log.info('发送企业名称到工商调度消息队列: comapny = {company}'.format(company=company_for_log))
            return self.to_string(download_rsp)
        except Exception as err:
            self.log.error("process failed, err[%s]" % (repr(err)))
            self.log.exception(err)

            download_rsp = DownLoadRsp(status=CrawlStatus.CRAWL_FAILT, )
            return self.to_string(download_rsp)

    def do_output(self, body):
        """Always returns ``True``; ``body`` is ignored."""
        return True