def _task_status_update(self):
    while True:
        task = ParserQueues.TASK_STATUS.get()
        TDDCLogging.debug('[{}:{}:{}]'.format(task.platform, task.url, task.status))
        self._successed_num += 1
        self._successed_pre_min += 1
def start(self):
    while True:
        for infos in self._src_apis:
            try:
                platform = infos.get('platform')
                api = infos.get('api')
                parse_mould = infos.get('parse_mould')
                rsp = requests.get(api)
                if not rsp:
                    TDDCLogging.error('[TDDC_PROXY_SOURCE_UPDATER] Exception(%s): ' % platform + api)
                    continue
                if not parse_mould:
                    TDDCLogging.error('[TDDC_PROXY_SOURCE_UPDATER] Exception: parse_mould is None.')
                    continue
                all_ips = parse_mould(rsp.text)
                http_ips = self._proxy_active_check(all_ips.get('HTTP', []))
                self._ip_pool.smadd('tddc:test:proxy:ip_src:http', http_ips)
                TDDCLogging.info('[TDDC_PROXY_SOURCE_UPDATER] Source IPS(HTTP) Growth:%d' % len(http_ips))
                https_ips = self._proxy_active_check(all_ips.get('HTTPS', []))
                self._ip_pool.smadd('tddc:test:proxy:ip_src:https', https_ips)
                # HTTPS-capable proxies can also serve plain HTTP traffic.
                self._ip_pool.smadd('tddc:test:proxy:ip_src:http', https_ips)
                TDDCLogging.info('[TDDC_PROXY_SOURCE_UPDATER] Source IPS(HTTPS) Growth:%d' % len(https_ips))
            except Exception as e:
                TDDCLogging.error('[TDDC_PROXY_SOURCE_UPDATER] Exception[IP_SOURCE]:' + str(e))
        gevent.sleep(10)
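# A minimal sketch of a `_src_apis` entry, assuming `parse_mould` is any
# callable that takes the raw response body and returns proxies keyed by
# scheme ('HTTP'/'HTTPS'), matching how `start` consumes it above. The API
# URL and the regex-based parser below are hypothetical placeholders.
import re

def demo_parse_mould(body):
    # Collect "ip:port" pairs from a plain-text proxy list (assumed format).
    ips = re.findall(r'\d{1,3}(?:\.\d{1,3}){3}:\d{2,5}', body)
    return {'HTTP': ips, 'HTTPS': []}

demo_src_api = {
    'platform': 'demo_source',
    'api': 'http://example.com/proxy-list.txt',
    'parse_mould': demo_parse_mould,
}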
def _spider_opened(self, spider):
    if not self._spider:
        self._spider = spider
        self._spider_mqs = spider.crawler.engine.slot.scheduler.mqs
        gevent.spawn(self._task_dispatch)
        gevent.sleep()
        TDDCLogging.info('-->Spider Was Ready.')
def __init__(self):
    TDDCLogging.info('->Parser Is Starting.')
    super(ParserManager, self).__init__()
    self._storager = ParseStorager()
    self._parser = Parser()
    self._task_manager = ParseTaskManager()
    TDDCLogging.info('->Parser Was Ready.')
def parse(self, response):
    TDDCLogging.debug('Download Success. ' + response.url)
    task, _ = response.request.meta.get('item')
    rsp_info = {'rsp': [response.url, response.status],
                'content': response.body}
    if self.signals_callback:
        self.signals_callback(self, SingleSpider.SIGNAL_STORAGE, [task, rsp_info])
def _get_status(self):
    while True:
        cur_time = time.time()
        keys = self.keys(MonitorSite.STATUS_HSET_PREFIX + '.*')
        for key in keys:
            h_len = self.hlen(key)
            platform, status = key.split('.')[-2:]
            if not self._status.get(platform):
                self._status[platform] = {}
            self._status[platform][status] = h_len
            items = self.hscan_iter(key)
            for index, (url, task) in enumerate(items):
                task = Task(**json.loads(task))
                task_time = task.timestamp
                if int(task_time) < cur_time - 20:
                    MonitorQueues.EXCEPTION_TASK.put(task)
                    TDDCLogging.debug('{} : {} : {} : {} : {} : Crawl Again.'.format(
                        index, task.platform, url, task.status, task_time))
                    # Remove the stalled task from the hash it was scanned from.
                    self.hdel(key, url)
        gevent.sleep(60)
        TDDCLogging.debug(json.dumps(self._status, sort_keys=True, indent=4))
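# A minimal sketch of the Task model these managers pass around, inferred
# from its usage in this module (Task(**json.loads(...)), task.to_json(),
# task.platform, task.url, task.status, task.timestamp, task.method,
# task.row_key). Field defaults and the exact attribute set are assumptions.
import json

class Task(object):
    def __init__(self, platform=None, url=None, status=None,
                 timestamp=0, method='GET', row_key=None, **kwargs):
        self.platform = platform
        self.url = url
        self.status = status
        self.timestamp = timestamp
        self.method = method
        self.row_key = row_key
        # Tolerate extra keys so Task(**json.loads(msg)) round-trips.
        self.__dict__.update(kwargs)

    def to_json(self):
        return json.dumps(self.__dict__)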
def _push(self):
    cnt = 0
    platform_rows = {}
    while True:
        try:
            task, storage_info = PublicQueues.STORAGE.get()
            items = {self.FAMILY: storage_info,
                     'task': {'task': task.to_json()}}
            table = task.platform + BaseSite.PLATFORM_SUFFIX
            if not platform_rows.get(table):
                platform_rows[table] = {}
            platform_rows[table][task.row_key] = items
            cnt += 1
            if PublicQueues.STORAGE.qsize() and not cnt % 5:
                gevent.sleep(0.01)
                continue
            if self._db.puts_to_hbase(platform_rows):
                self._pushed(platform_rows, True)
            else:
                self._pushed(platform_rows, False)
            gevent.sleep(1)
            platform_rows = {}
        except Exception as e:
            TDDCLogging.error(str(e))
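# The batch handed to `puts_to_hbase` is shaped as built above:
# {table: {row_key: {column_family: {qualifier: value}}}}. Every concrete
# name below (table, suffix, row key, payloads) is illustrative only.
demo_batch = {
    'demo_platform_crawl': {                  # task.platform + BaseSite.PLATFORM_SUFFIX
        'row-0001': {
            'source': {'rsp': '["http://example.com", 200]',
                       'content': '<html>...</html>'},     # storage_info under self.FAMILY
            'task': {'task': '{"url": "http://example.com", "status": 1}'},  # task.to_json()
        },
    },
}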
def create_table_to_hbase(self, table, families):
    try:
        with self._hb_pool.connection() as connection:
            connection.create_table(table, families)
            return True
    except Exception as e:
        TDDCLogging.error(str(e))
        return False
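# Example call, assuming happybase semantics where each column family maps
# to an options dict (empty dict = defaults). The family names match those
# used by _auto_create_table below; the variable name is hypothetical.
storager.create_table_to_hbase('demo_platform_crawl',
                               {'source': {}, 'valuable': {}, 'task': {}})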
def __init__(self):
    '''
    Constructor
    '''
    TDDCLogging.info('-->Task Manager Is Starting.')
    super(CrawlTaskManager, self).__init__()
    self._start_mq_server()
    TDDCLogging.info('-->Task Manager Was Ready.')
def __init__(self): ''' Constructor ''' setproctitle.setproctitle("TDDC_CRAWLER") TDDCLogging.info('->Crawler Starting.') TDDCLogging.info('->Crawler Was Ready.')
def __init__(self):
    '''
    Constructor
    '''
    TDDCLogging.info('->Monitor Is Starting.')
    self._exception_manager = ExceptionManager()
    self._status_manager = StatusManager()
    TDDCLogging.info('->Monitor Was Started.')
def add_task(self, task, is_retry=False, times=1):
    if not is_retry:
        TDDCLogging.debug('Add New Task: ' + task.url)
    headers = self._init_request_headers(task)
    req = (self._make_get_request(task, headers, times)
           if not task.method or task.method.upper() == 'GET'
           else self._make_post_request(task, headers, times))
    self.crawler.engine.schedule(req, self)
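# A hypothetical call site: schedule a crawl through the running spider.
# `spider` and every Task field here are placeholders.
task = Task(platform='demo_platform', url='http://example.com/item/1',
            method='POST', row_key='row-0001')
spider.add_task(task, is_retry=False)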
def _dispatch(self):
    while True:
        event = self._event_queue.get()
        callback = self._event_call.get(event.event_type, None)
        if callback:
            callback(event)
        else:
            TDDCLogging.warning('Event Exception: %d Not Registered.' % event.event_type)
def __init__(self):
    '''
    Constructor
    '''
    TDDCLogging.info('-->Parser Is Starting.')
    self._rules_updater = ParsePackagesManager()
    gevent.spawn(self._parse)
    gevent.sleep()
    TDDCLogging.info('-->Parser Was Ready.')
def _auto_create_table(self, connection, table):
    for cnt in range(2):
        if table not in self._tables:
            if cnt == 1:
                connection.create_table(table, {k: {} for k in ['source', 'valuable', 'task']})
                TDDCLogging.warning('Create New Table(%s) to HBase.' % table)
            self._tables = connection.tables()
        else:
            break
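# A sketch of a write path inside the same manager that relies on the lazy
# creation above, using happybase's documented connection/table API; the
# table name and cell value are placeholders.
with self._hb_pool.connection() as connection:
    self._auto_create_table(connection, 'demo_platform_crawl')
    table = connection.table('demo_platform_crawl')
    table.put('row-0001', {'source:content': '<html>...</html>'})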
def __init__(self):
    '''
    Constructor
    '''
    TDDCLogging.info('-->Status Manager Is Starting.')
    self._status = {}
    super(StatusManager, self).__init__(MonitorSite.REDIS_NODES)
    gevent.spawn(self._get_status)
    gevent.sleep()
    TDDCLogging.info('-->Status Manager Was Started.')
def _push_new_crawl_task(self):
    TDDCLogging.info('--->Parser Task Producer Was Ready.')
    while True:
        task = ParserQueues.CRAWL.get()
        # if not self._filter.setget(task.url):
        #     TDDCLogging.debug('New Task [%s:%s] Was Filtered.' % (task.platform, task.url))
        #     continue
        msg = json.dumps(task.__dict__)
        if msg:
            self._push_task(ParserSite.CRAWL_TOPIC, task, msg)
def __init__(self):
    '''
    Constructor
    '''
    TDDCLogging.info('-->Exception Manager Is Starting.')
    self._exception_producer = KafkaHelper.make_producer(BaseSite.KAFKA_NODES)
    gevent.spawn(self._send)
    gevent.sleep()
    TDDCLogging.info('-->Exception Manager Was Ready.')
def _process(self):
    while True:
        exception = MonitorQueues.EXCEPTION.get()
        cls = self._exception_process.get(exception.code)
        if not cls:
            TDDCLogging.warning('No Matching Processor For Exception: {exp_id}'.format(
                exp_id=exception.id))
            continue
        cls(exception)
def _subscribe(self):
    items = self._ip_pool.psubscribe(CrawlerSite.PROXY_PUBSUB_PATTERN)
    for item in items:
        if item.get('type') == 'psubscribe':
            TDDCLogging.info('---->Subscribe: %s' % item.get('channel'))
            continue
        platform = item.get('channel', '').split(':')[-1]
        data = item.get('data')
        if not CrawlerQueues.PLATFORM_PROXY.get(platform):
            CrawlerQueues.PLATFORM_PROXY[platform] = set()
        CrawlerQueues.PLATFORM_PROXY[platform].add(data)
def __init__(self):
    '''
    Constructor
    '''
    TDDCLogging.info('--->Messages Send And Recv Plugin Is Starting.')
    super(ExceptionMessageSR, self).__init__(status_logger=False)
    self._models_table = {}
    self._load_exception_models()
    gevent.spawn(self._recv)
    gevent.sleep()
    TDDCLogging.info('--->Messages Send And Recv Plugin Was Ready.')
def psubscribe(self, pattern):
    '''
    Pattern-based subscribe.
    '''
    ps = self.pubsub()
    ps.psubscribe(pattern)
    TDDCLogging.info('--->Pubsub Was Ready.')
    for item in ps.listen():
        yield item
    ps.punsubscribe(pattern)
    TDDCLogging.info('-->Pubsub Exited.')
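# Usage, mirroring the proxy pool's _subscribe above: iterate the generator
# and skip the initial 'psubscribe' confirmation (redis-py message dicts
# carry 'type', 'channel', and 'data'). The pattern and handler below are
# placeholders.
for item in ip_pool.psubscribe('tddc:*:proxy:*'):
    if item.get('type') == 'psubscribe':
        continue
    handle_proxy(item.get('channel'), item.get('data'))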
def __init__(self):
    '''
    Constructor
    '''
    TDDCLogging.info('-->Exception Manager Is Starting.')
    self._exception_process = {}
    self._load_exception_models()
    self._task_manager = ExceptionMessageSR()
    gevent.spawn(self._process)
    gevent.sleep()
    TDDCLogging.info('-->Exception Manager Was Started.')
def __init__(self): ''' Constructor ''' setproctitle.setproctitle("TDDC_PROXY_CHECKER") TDDCLogging.info('->Proxy Checker Is Starting') self._checker = Checker() # self._rules_updater = ProxyCheckerRulesUpdater() self._proxy_mq_manager = ProxyMQManager() self._proxy_manager = ProxyManager() TDDCLogging.info('->Proxy Checker Was Ready.')
def _consume_msg_exp(self, exp_type, info, exception=None):
    if 'JSON_ERR' in exp_type:
        TDDCLogging.error('*' * 5 + exp_type + '*' * 5 + '\n' +
                          'Exception: ' + info + '\n' +
                          str(exception) + '\n' +
                          '*' * (10 + len(exp_type)) + '\n')
    elif 'TASK_ERR' in exp_type or 'EVENT_ERR' in exp_type:
        TDDCLogging.error('*' * 5 + exp_type + '*' * 5 + '\n' +
                          'Exception: ' +
                          'item={item}\n'.format(item=info) +
                          'item_type={item_type}\n'.format(item_type=type(info)) +
                          '*' * (10 + len(exp_type)) + '\n')
def __init__(self):
    '''
    Constructor
    '''
    TDDCLogging.info('-->Proxy Manager Is Starting.')
    self._ip_pool = IPPool(RedisSite.REDIS_NODES)
    gevent.spawn(self._src_ip_fetch)
    gevent.sleep()
    gevent.spawn(self._useful_push)
    gevent.sleep()
    TDDCLogging.info('-->Proxy Manager Was Started.')
def _rules_update(self):
    while True:
        rule = ProxyCheckerQueues.RULES_MOULDS_UPDATE.get()
        TDDCLogging.debug('{}:{}:{}'.format(rule.platform, rule.package, rule.moulds))
        module = importlib.import_module(rule.package)
        for cls_name in rule.moulds:
            cls = getattr(module, cls_name, None)
            if not cls:
                TDDCLogging.error('Exception: import rule failed: ' + cls_name)
                continue
            self._rules_moulds.setdefault(cls.proxy_type, {})[cls_name] = cls
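# A hypothetical rule mould the updater could load: the class is looked up
# by name in `rule.package` and registered under its `proxy_type`. Any
# attribute beyond `proxy_type` is an assumption.
class DemoHttpProxyRule(object):
    proxy_type = 'HTTP'

    @staticmethod
    def check(ip_port):
        # Placeholder liveness rule; a real mould would probe the proxy.
        return ':' in ip_port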
def __init__(self):
    '''
    Constructor
    '''
    TDDCLogging.info('-->Crawl Proxy Pool Is Starting.')
    self._ip_pool = IPPool(CrawlerSite.REDIS_NODES)
    self._init_proxy()
    gevent.spawn(self._subscribe)
    gevent.sleep()
    gevent.spawn(self._proxy_unuseful_feedback)
    gevent.sleep()
    TDDCLogging.info('-->Crawl Proxy Pool Was Ready.')
def __init__(self):
    '''
    Constructor
    '''
    TDDCLogging.info('->Crawler Starting.')
    super(CrawlerManager, self).__init__()
    self._crawler = Crawler()
    self._storager = CrawlStorager()
    self._proxy_pool = CrawlProxyPool()
    self._cookies = CookiesManager()
    self._task_manager = CrawlTaskManager()
    TDDCLogging.info('->Crawler Was Ready.')
def __init__(self, push=True, pull=False):
    '''
    Constructor
    '''
    TDDCLogging.info('-->Storager Manager Is Starting.')
    self._db = DBManager(BaseSite.random_hbase_node())
    if push:
        gevent.spawn(self._push)
        gevent.sleep()
    if pull:
        gevent.spawn(self._pull)
        gevent.sleep()
    TDDCLogging.info('-->Storager Manager Was Ready.')