Ejemplo n.º 1
0
 def _task_status_update(self):
     '''
     Consume the task-status queue forever.

     Every task taken from the queue is written to the debug log as
     ``[platform:url:status]`` and both success counters are incremented.
     '''
     while True:
         finished = ParserQueues.TASK_STATUS.get()
         summary = '[{}:{}:{}]'.format(finished.platform,
                                       finished.url,
                                       finished.status)
         TDDCLogging.debug(summary)
         self._successed_num += 1
         self._successed_pre_min += 1
Ejemplo n.º 2
0
 def start(self):
     '''
     Poll every configured proxy source forever and store the live proxies.

     For each entry of ``self._src_apis`` the API is fetched, the response
     text is handed to the entry's ``parse_mould`` callable, and the
     returned 'HTTP' / 'HTTPS' candidates are filtered through
     ``_proxy_active_check`` before being added to the redis source sets.
     HTTPS proxies are added to the HTTP set as well. A failure in one
     source is logged and does not stop the remaining sources; the loop
     sleeps 10 seconds between rounds.
     '''
     while True:
         for infos in self._src_apis:
             try:
                 platform = infos.get('platform')
                 api = infos.get('api')
                 parse_mould = infos.get('parse_mould')
                 rsp = requests.get(api)
                 if not rsp:
                     # Format both values so a missing api cannot raise here.
                     TDDCLogging.error('[TDDC_PROXY_SOURCE_UPDATER] Exception(%s): %s' % (platform, api))
                     continue
                 if not parse_mould:
                     TDDCLogging.error('[TDDC_PROXY_SOURCE_UPDATER] Exception: parse_mould is None.')
                     continue
                 all_ips = parse_mould(rsp.text)
                 http_ips = self._proxy_active_check(all_ips.get('HTTP', []))
                 self._ip_pool.smadd('tddc:test:proxy:ip_src:http', http_ips)
                 TDDCLogging.info('[TDDC_PROXY_SOURCE_UPDATER] Source IPS(HTTP) Growth:%d' % len(http_ips))
                 https_ips = self._proxy_active_check(all_ips.get('HTTPS', []))
                 self._ip_pool.smadd('tddc:test:proxy:ip_src:https', https_ips)
                 self._ip_pool.smadd('tddc:test:proxy:ip_src:http', https_ips)
                 TDDCLogging.info('[TDDC_PROXY_SOURCE_UPDATER] Source IPS(HTTPS) Growth:%d' % len(https_ips))
             except Exception as e:
                 # Concatenating the exception object itself raises TypeError
                 # inside the handler and would end this loop for good, so
                 # the exception is converted to text first.
                 TDDCLogging.error('[TDDC_PROXY_SOURCE_UPDATER] Exception[IP_SOURCE]:' + str(e))
         gevent.sleep(10)
Ejemplo n.º 3
0
 def _spider_opened(self, spider):
     if not self._spider:
         self._spider = spider
         self._spider_mqs = spider.crawler.engine.slot.scheduler.mqs
         gevent.spawn(self._task_dispatch)
         gevent.sleep()
         TDDCLogging.info('-->Spider Was Ready.')
Ejemplo n.º 4
0
 def __init__(self):
     '''
     Build the parser manager and its collaborators.

     Runs the parent initializer, then creates the storager, the parser and
     the task manager in that order, logging a message before and after.
     '''
     TDDCLogging.info('->Parser Is Starting')
     super(ParserManager, self).__init__()
     self._storager = ParseStorager()
     self._parser = Parser()
     self._task_manager = ParseTaskManager()
     TDDCLogging.info('->Parser Was Ready.')
Ejemplo n.º 5
0
 def parse(self, response):
     '''
     Handle a downloaded response and pass it on for storage.

     The task is the first element of the pair stored under ``'item'`` in
     the request meta. When ``signals_callback`` is set, it is called with
     this spider, ``SIGNAL_STORAGE`` and ``[task, rsp_info]``.
     '''
     TDDCLogging.debug('Download Success. ' + response.url)
     task, _ignored = response.request.meta.get('item')
     rsp_info = dict(rsp=[response.url, response.status],
                     content=response.body)
     notify = self.signals_callback
     if notify:
         notify(self, SingleSpider.SIGNAL_STORAGE, [task, rsp_info])
Ejemplo n.º 6
0
 def _get_status(self):
     '''
     Refresh per-platform status counts forever and re-queue stale tasks.

     Every 60 seconds each hash whose key matches the status prefix is
     visited. The last two dot-separated parts of the key are taken as
     platform and status, and the hash length is stored in
     ``self._status[platform][status]``. Any task in the hash whose
     timestamp is more than 20 seconds old is put on the exception-task
     queue and removed from that hash. After the sleep the collected
     counts are dumped to the debug log.

     NOTE(review): the reference time used to be a hard-coded 2017
     timestamp with ``time.time()`` left in a comment, which meant no
     current task could ever be considered stale; the real clock is used
     now.
     '''
     # Local import: the previous body bound a local named ``time``, so the
     # module could not be used inside this function.
     import time

     while True:
         now = time.time()
         for key in self.keys(MonitorSite.STATUS_HSET_PREFIX + '.*'):
             platform, status = key.split('.')[-2:]
             self._status.setdefault(platform, {})[status] = self.hlen(key)
             for index, (url, raw_task) in enumerate(self.hscan_iter(key)):
                 task = Task(**json.loads(raw_task))
                 timestamp = task.timestamp
                 if int(timestamp) < now - 20:
                     MonitorQueues.EXCEPTION_TASK.put(task)
                     TDDCLogging.debug(
                         str(index) + ' : ' + task.platform + ' : ' + url +
                         ' : ' + str(task.status) + ' : ' + str(timestamp) +
                         ' : ' + 'Crawl Again.')
                     # Delete from the hash that was actually scanned; the
                     # bare prefix is not the name of any status hash.
                     self.hdel(key, url)
         gevent.sleep(60)
         TDDCLogging.debug(
             json.dumps(self._status, sort_keys=True, indent=4))
Ejemplo n.º 7
0
 def _push(self):
     '''
     Move storage items from the storage queue into HBase forever.

     Each queue entry is a ``(task, storage_info)`` pair. Rows are grouped
     per table (task platform plus the platform suffix) and keyed by the
     task's row key. The accumulated rows are written with
     ``puts_to_hbase`` and the outcome is reported through ``_pushed``;
     after a failed write the loop waits one second. Any exception is
     logged and the loop carries on.
     '''
     cnt = 0
     platform_rows = {}
     while True:
         try:
             task, storage_info = PublicQueues.STORAGE.get()
             table = task.platform + BaseSite.PLATFORM_SUFFIX
             items = {
                 self.FAMILY: storage_info,
                 'task': {
                     'task': task.to_json()
                 }
             }
             platform_rows.setdefault(table, {})[task.row_key] = items
             cnt += 1
             # NOTE(review): this defers the write only on every 5th item
             # while more are queued; if the intent was to batch five rows
             # per write the modulo test looks inverted - confirm.
             if PublicQueues.STORAGE.qsize() and not cnt % 5:
                 gevent.sleep(0.01)
                 continue
             if self._db.puts_to_hbase(platform_rows):
                 self._pushed(platform_rows, True)
             else:
                 self._pushed(platform_rows, False)
                 gevent.sleep(1)
             platform_rows = {}
         except Exception as e:
             TDDCLogging.error(e)
Ejemplo n.º 8
0
 def create_table_to_hbase(self, table, families):
     '''
     Create ``table`` with the column ``families`` in HBase.

     A connection is borrowed from ``self._hb_pool`` for the call.

     Returns True when the table was created, and False when the attempt
     raised (the exception is logged, not propagated).
     '''
     try:
         with self._hb_pool.connection() as connection:
             connection.create_table(table, families)
     except Exception as e:
         TDDCLogging.error(e)
         return False
     # Success used to fall through and return None, which callers could
     # not tell apart from the False failure result.
     return True
Ejemplo n.º 9
0
 def __init__(self):
     '''
     Set up the crawl task manager.

     Runs the parent initializer and then calls ``_start_mq_server``,
     logging a message before and after.
     '''
     TDDCLogging.info('-->Task Manager Is Starting.')
     super(CrawlTaskManager, self).__init__()
     self._start_mq_server()
     TDDCLogging.info('-->Task Manager Was Ready.')
Ejemplo n.º 10
0
    def __init__(self):
        '''
        Set the process title to "TDDC_CRAWLER" and log the start-up
        messages.
        '''
        setproctitle.setproctitle("TDDC_CRAWLER")
        TDDCLogging.info('->Crawler Starting.')

        # Nothing else is created between the two log lines here.
        TDDCLogging.info('->Crawler Was Ready.')
Ejemplo n.º 11
0
 def __init__(self):
     '''
     Create the monitor's two managers.

     The exception manager is built first, then the status manager, with a
     log message before and after.
     '''
     TDDCLogging.info('->Monitor Is Starting.')
     self._exception_manager = ExceptionManager()
     self._status_manager = StatusManager()
     TDDCLogging.info('->Monitor Was Started.')
Ejemplo n.º 12
0
 def add_task(self, task, is_retry=False, times=1):
     '''
     Turn ``task`` into a request and schedule it on the crawler engine.

     task: object providing ``url`` and ``method``. A missing/empty method
         or any spelling of "GET" produces a GET request; every other
         method produces a POST request.
     is_retry: when false, the task is logged as a new task first.
     times: passed through unchanged to the request builder.
     '''
     if not is_retry:
         TDDCLogging.debug('Add New Task: ' + task.url)
     headers = self._init_request_headers(task)
     method = task.method
     # str.upper() replaces the free function upper(), which only exists
     # in the Python 2 ``string`` module.
     if not method or method.upper() == 'GET':
         req = self._make_get_request(task, headers, times)
     else:
         req = self._make_post_request(task, headers, times)
     self.crawler.engine.schedule(req, self)
Ejemplo n.º 13
0
 def _dispatch(self):
     while True:
         event = self._event_queue.get()
         callback = self._event_call.get(event.event_type, None)
         if callback:
             callback(event)
         else:
             TDDCLogging.warning('Event Exception: %d Not Register.' %
                                 event.event_type)
Ejemplo n.º 14
0
 def __init__(self):
     '''
     Create the packages manager and start the ``_parse`` greenlet.
     '''
     TDDCLogging.info('-->Parser Is Starting.')
     self._rules_updater = ParsePackagesManager()
     gevent.spawn(self._parse)
     # Yield once so the spawned greenlet gets a chance to start.
     gevent.sleep()
     TDDCLogging.info('-->Parser Was Ready.')
Ejemplo n.º 15
0
 def _auto_create_table(self, connection, table):
     for cnt in range(2):
         if table not in self._tables:
             if cnt == 1:
                 connection.create_table(table, {k:{} for k in ['source', 'valuable', 'task']})
                 TDDCLogging.warning('Create New Table(%s) to HBase.' % table)
             self._tables = connection.tables()
         else:
             break
Ejemplo n.º 16
0
 def __init__(self):
     '''
     Initialise the status table, connect through the parent class using
     ``MonitorSite.REDIS_NODES`` and start the ``_get_status`` greenlet.
     '''
     TDDCLogging.info('-->Status Manager Is Starting.')
     # Filled by _get_status as {platform: {status: count}}.
     self._status = {}
     super(StatusManager, self).__init__(MonitorSite.REDIS_NODES)
     gevent.spawn(self._get_status)
     # Yield once so the spawned greenlet gets a chance to start.
     gevent.sleep()
     TDDCLogging.info('-->Status Manager Was Started.')
Ejemplo n.º 17
0
 def _push_new_crawl_task(self):
     '''
     Forward every task taken from the crawl queue to the crawl topic.

     The task's attribute dict is serialised to JSON and handed to
     ``_push_task`` together with the task itself.
     '''
     TDDCLogging.info('--->Parser Task Producer Was Ready.')
     while True:
         task = ParserQueues.CRAWL.get()
         # URL filtering is currently switched off:
         #     if not self._filter.setget(task.url):
         #         TDDCLogging.debug('New Task [%s:%s] Was Filter.' % (task.platform, task.url))
         #         continue
         payload = json.dumps(task.__dict__)
         if not payload:
             continue
         self._push_task(ParserSite.CRAWL_TOPIC, task, payload)
Ejemplo n.º 18
0
 def __init__(self):
     '''
     Create the Kafka producer for exceptions and start the ``_send``
     greenlet.
     '''
     TDDCLogging.info('-->Exception Manager Is Starting.')
     self._exception_producer = KafkaHelper.make_producer(
         BaseSite.KAFKA_NODES)
     gevent.spawn(self._send)
     # Yield once so the spawned greenlet gets a chance to start.
     gevent.sleep()
     TDDCLogging.info('-->Exception Manager Was Ready.')
Ejemplo n.º 19
0
 def _process(self):
     '''
     Route queued exceptions to their processors forever.

     The processor is looked up in ``self._exception_process`` by the
     exception's ``code`` and called with the exception; an exception
     without a processor only produces a warning.
     '''
     while True:
         exception = MonitorQueues.EXCEPTION.get()
         processor = self._exception_process.get(exception.code)
         if processor:
             processor(exception)
         else:
             TDDCLogging.warning(
                 'No Match Process To Exception: {exp_id}'.format(
                     exp_id=exception.id))
Ejemplo n.º 20
0
 def _subscribe(self):
     '''
     Collect proxies published on the proxy pubsub pattern.

     Subscription confirmations are only logged. For every other message
     the platform is the last ':'-separated part of the channel name and
     the message data is added to that platform's proxy set.
     '''
     for message in self._ip_pool.psubscribe(CrawlerSite.PROXY_PUBSUB_PATTERN):
         if item_is_confirmation(message) if False else message.get('type') == 'psubscribe':
             TDDCLogging.info('---->Subscribe: %s' % message.get('channel'))
             continue
         channel = message.get('channel', '')
         platform = channel.split(':')[-1]
         data = message.get('data')
         proxies = CrawlerQueues.PLATFORM_PROXY
         if not proxies.get(platform):
             proxies[platform] = set()
         proxies[platform].add(data)
Ejemplo n.º 21
0
 def __init__(self):
     '''
     Initialise the message plugin, load the exception models and start
     the ``_recv`` greenlet.

     The parent initializer is called with ``status_logger=False``.
     '''
     TDDCLogging.info('--->Messages Send And Recv Plugin Is Starting.')
     super(ExceptionMessageSR, self).__init__(status_logger=False)
     # Starts empty and exists before _load_exception_models() is called.
     self._models_table = {}
     self._load_exception_models()
     gevent.spawn(self._recv)
     # Yield once so the spawned greenlet gets a chance to start.
     gevent.sleep()
     TDDCLogging.info('--->Messages Send And Recv Plugin Was Ready.')
Ejemplo n.º 22
0
 def psubscribe(self, pattern):
     '''
     Pattern-subscribe and yield every pubsub message as it arrives.

     pattern: channel pattern handed to the pubsub object's
         ``psubscribe``.

     This is a generator. When it finishes - because ``listen()`` ends or
     raises, or because the consumer stops iterating and the generator is
     closed - the pattern subscription is removed again and the exit is
     logged.
     '''
     ps = self.pubsub()
     ps.psubscribe(pattern)
     TDDCLogging.info('--->Pubsub Was Ready.')
     try:
         for item in ps.listen():
             yield item
     finally:
         # Undo exactly what was subscribed above. The old cleanup
         # unsubscribed an unrelated channel name and was skipped whenever
         # the loop did not end normally.
         ps.punsubscribe(pattern)
         TDDCLogging.info('-->Pubsub Is Exit.')
Ejemplo n.º 23
0
 def __init__(self):
     '''
     Load the exception models, create the message plugin and start the
     ``_process`` greenlet.
     '''
     TDDCLogging.info('-->Exception Manager Is Starting.')
     # _process looks processors up in this dict by exception code.
     self._exception_process = {}
     self._load_exception_models()
     self._task_manager = ExceptionMessageSR()
     gevent.spawn(self._process)
     # Yield once so the spawned greenlet gets a chance to start.
     gevent.sleep()
     TDDCLogging.info('-->Exception Manager Was Started.')
Ejemplo n.º 24
0
 def __init__(self):
     '''
     Set the process title to "TDDC_PROXY_CHECKER" and create the checker,
     the proxy MQ manager and the proxy manager, in that order.
     '''
     setproctitle.setproctitle("TDDC_PROXY_CHECKER")
     TDDCLogging.info('->Proxy Checker Is Starting')
     self._checker = Checker()
     # The rules updater is currently disabled:
     #         self._rules_updater = ProxyCheckerRulesUpdater()
     self._proxy_mq_manager = ProxyMQManager()
     self._proxy_manager = ProxyManager()
     TDDCLogging.info('->Proxy Checker Was Ready.')
Ejemplo n.º 25
0
 def _consume_msg_exp(self, exp_type, info, exception=None):
     if 'JSON_ERR' in exp_type:
         TDDCLogging.error('*' * 5 + exp_type + '*' * 5 + '\nException: ' +
                           info + '\n' + exception.message + '\n' + '*' *
                           (10 + len(exp_type)) + '\n')
     elif 'TASK_ERR' in exp_type or 'EVENT_ERR' in exp_type:
         TDDCLogging.error('*' * 5 + exp_type + '*' * 5 + '\nException: ' +
                           'item={item}\n'.format(item=info) +
                           'item_type={item_type}\n'.format(
                               item_type=type(info)) + '*' *
                           (10 + len(exp_type)) + '\n')
Ejemplo n.º 26
0
Archivo: proxy.py Proyecto: slmzhi/tddc
 def __init__(self):
     '''
     Connect to the IP pool and launch the two proxy worker greenlets.

     ``_src_ip_fetch`` is spawned first, then ``_useful_push``; control is
     yielded after each spawn so the new greenlet can start.
     '''
     TDDCLogging.info('-->Proxy Manager Is Starting.')
     self._ip_pool = IPPool(RedisSite.REDIS_NODES)
     for worker in (self._src_ip_fetch, self._useful_push):
         gevent.spawn(worker)
         gevent.sleep()
     TDDCLogging.info('-->Proxy Manager Was Started.')
Ejemplo n.º 27
0
 def _rules_update(self):
     '''
     Load rule classes announced on the rules-update queue forever.

     Each queued rule names a package and a list of class names
     (``moulds``). Every class is imported from the package and registered
     in ``self._rules_moulds``; a class that cannot be found in the
     package is logged and skipped.
     '''
     while True:
         rule = ProxyCheckerQueues.RULES_MOULDS_UPDATE.get()
         print(rule.platform, rule.package, rule.moulds)
         for cls_name in rule.moulds:
             module = importlib.import_module(rule.package)
             # Without a default getattr raises AttributeError, so the
             # "import rule failed" branch below could never run.
             cls = getattr(module, cls_name, None)
             if not cls:
                 TDDCLogging.error('Exception: import rule failed: ' +
                                   cls_name)
                 continue
             # NOTE(review): both indexes use cls.proxy_type; the outer key
             # was possibly meant to be something else (e.g. the rule's
             # platform) - confirm against the readers of _rules_moulds.
             self._rules_moulds[cls.proxy_type][cls.proxy_type] = cls
Ejemplo n.º 28
0
 def __init__(self):
     '''
     Connect to the IP pool, run ``_init_proxy`` and launch the two
     background greenlets.

     ``_subscribe`` is spawned first, then ``_proxy_unuseful_feedback``;
     control is yielded after each spawn so the new greenlet can start.
     '''
     TDDCLogging.info('-->Crawl Proxy Pool Is Starting.')
     self._ip_pool = IPPool(CrawlerSite.REDIS_NODES)
     self._init_proxy()
     for worker in (self._subscribe, self._proxy_unuseful_feedback):
         gevent.spawn(worker)
         gevent.sleep()
     TDDCLogging.info('-->Crawl Proxy Pool Was Ready.')
Ejemplo n.º 29
0
 def __init__(self):
     '''
     Build the crawler manager and its collaborators.

     Runs the parent initializer, then creates the crawler, the storager,
     the proxy pool, the cookies manager and the task manager in that
     order, logging a message before and after.
     '''
     TDDCLogging.info('->Crawler Starting.')
     super(CrawlerManager, self).__init__()
     self._crawler = Crawler()
     self._storager = CrawlStorager()
     self._proxy_pool = CrawlProxyPool()
     self._cookies = CookiesManager()
     self._task_manager = CrawlTaskManager()
     TDDCLogging.info('->Crawler Was Ready.')
Ejemplo n.º 30
0
 def __init__(self, push=True, pull=False):
     '''
     Create the database manager and start the requested worker greenlets.

     push: when true, spawn ``self._push``.
     pull: when true, spawn ``self._pull``.
     '''
     TDDCLogging.info('-->Storager Manager Is Starting.')
     self._db = DBManager(BaseSite.random_hbase_node())
     if push:
         gevent.spawn(self._push)
         # Yield once so the spawned greenlet gets a chance to start.
         gevent.sleep()
     if pull:
         gevent.spawn(self._pull)
         # Yield once so the spawned greenlet gets a chance to start.
         gevent.sleep()
     TDDCLogging.info('-->Storager Manager Was Ready.')