Example #1
 def __init__(self):
     TDDCLogging.info('->Parser Is Starting')
     super(ParserManager, self).__init__()
     self._storager = ParseStorager()
     self._parser = Parser()
     self._task_manager = ParseTaskManager()
     TDDCLogging.info('->Parser Was Ready.')
Example #2
 def _parse(self):
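     # Consume parse tasks forever: resolve a parser class by (platform,
     # feature), run it, store the extracted items, and fan out follow-ups.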
     while True:
         task, body = ParserQueues.WAITING_PARSE.get()
         cls = self._rules_updater.get_parse_model(task.platform,
                                                   task.feature)
         if not cls:
             fmt = 'Parse No Match: [P:{platform}][F:{feature}][K:{row_key}]'
             TDDCLogging.warning(
                 fmt.format(platform=task.platform,
                            feature=task.feature,
                            row_key=task.row_key))
             continue
         try:
             ret = cls(task, body)
         except Exception as e:
             TDDCLogging.error(e)
             continue
         self._storage(task, ret.items)
         self._new_task_push(ret.tasks)
         fmt = 'Parsed: [{platform}:{row_key}:{feature}][S:{items}][N:{tasks}]'
         TDDCLogging.info(
             fmt.format(platform=task.platform,
                        feature=task.feature,
                        row_key=task.row_key,
                        items=len(ret.items),
                        tasks=len(ret.tasks)))
         ParserQueues.TASK_STATUS.put(task)
Example #3
 def _spider_opened(self, spider):
     if not self._spider:
         self._spider = spider
         self._spider_mqs = spider.crawler.engine.slot.scheduler.mqs
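         # Spawn the dispatch greenlet; the gevent.sleep() below yields
         # control so the new greenlet gets scheduled before we return.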
         gevent.spawn(self._task_dispatch)
         gevent.sleep()
         TDDCLogging.info('-->Spider Was Ready.')
Example #4
 def start(self):
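     # Poll each proxy-source API in a loop, liveness-check the returned
     # IPs, and feed working ones into the per-scheme Redis sets.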
     while True:
         for infos in self._src_apis:
             try:
                 platform = infos.get('platform')
                 api = infos.get('api')
                 parse_mould = infos.get('parse_mould')
                 rsp = requests.get(api)
                 if not rsp:
                     TDDCLogging.error('[TDDC_PROXY_SOURCE_UPDATER] Exception(%s): %s' % (platform, api))
                     continue
                 if not parse_mould:
                     TDDCLogging.error('[TDDC_PROXY_SOURCE_UPDATER] Exception: parse_mould is None.')
                     continue
                 all_ips = parse_mould(rsp.text)
                 http_ips = self._proxy_active_check(all_ips.get('HTTP', []))
                 self._ip_pool.smadd('tddc:test:proxy:ip_src:http', http_ips)
                 TDDCLogging.info('[TDDC_PROXY_SOURCE_UPDATER] Source IPS(HTTP) Growth:%d' % len(http_ips))
                 https_ips = self._proxy_active_check(all_ips.get('HTTPS', []))
                 self._ip_pool.smadd('tddc:test:proxy:ip_src:https', https_ips)
                 self._ip_pool.smadd('tddc:test:proxy:ip_src:http', https_ips)
                 TDDCLogging.info('[TDDC_PROXY_SOURCE_UPDATER] Source IPS(HTTPS) Growth:%d' % len(https_ips))
             except Exception as e:
                 TDDCLogging.error('[TDDC_PROXY_SOURCE_UPDATER] Exception[IP_SOURCE]: %s' % e)
         gevent.sleep(10)
Example #5
 def __init__(self):
     '''
     Constructor
     '''
     TDDCLogging.info('-->Task Manager Is Starting.')
     super(CrawlTaskManager, self).__init__()
     self._start_mq_server()
     TDDCLogging.info('-->Task Manager Was Ready.')
Example #6
    def __init__(self):
        '''
        Constructor
        '''
        setproctitle.setproctitle("TDDC_CRAWLER")
        TDDCLogging.info('->Crawler Starting.')

        TDDCLogging.info('->Crawler Was Ready.')
Example #7
 def __init__(self):
     '''
     Constructor
     '''
     TDDCLogging.info('->Monitor Is Starting.')
     self._exception_manager = ExceptionManager()
     self._status_manager = StatusManager()
     TDDCLogging.info('->Monitor Was Started.')
Example #8
 def __init__(self):
     '''
     Constructor
     '''
     TDDCLogging.info('-->Parser Is Starting.')
     self._rules_updater = ParsePackagesManager()
     gevent.spawn(self._parse)
     gevent.sleep()
     TDDCLogging.info('-->Parser Was Ready.')
Example #9
 def __init__(self):
     '''
     Constructor
     '''
     TDDCLogging.info('-->Status Manager Is Starting.')
     self._status = {}
     super(StatusManager, self).__init__(MonitorSite.REDIS_NODES)
     gevent.spawn(self._get_status)
     gevent.sleep()
     TDDCLogging.info('-->Status Manager Was Started.')
Example #10
 def _push_new_crawl_task(self):
     TDDCLogging.info('--->Parser Task Producer Was Ready.')
     while True:
         task = ParserQueues.CRAWL.get()
         # if not self._filter.setget(task.url):
         #     TDDCLogging.debug('New Task [%s:%s] Was Filtered.' % (task.platform, task.url))
         #     continue
         msg = json.dumps(task.__dict__)
         if msg:
             self._push_task(ParserSite.CRAWL_TOPIC, task, msg)
Example #11
 def __init__(self):
     '''
     Constructor
     '''
     TDDCLogging.info('-->Exception Manager Is Starting.')
     self._exception_producer = KafkaHelper.make_producer(
         BaseSite.KAFKA_NODES)
     gevent.spawn(self._send)
     gevent.sleep()
     TDDCLogging.info('-->Exception Manager Was Ready.')
Example #12
 def __init__(self):
     '''
     Constructor
     '''
     setproctitle.setproctitle("TDDC_PROXY_CHECKER")
     TDDCLogging.info('->Proxy Checker Is Starting')
     self._checker = Checker()
     # self._rules_updater = ProxyCheckerRulesUpdater()
     self._proxy_mq_manager = ProxyMQManager()
     self._proxy_manager = ProxyManager()
     TDDCLogging.info('->Proxy Checker Was Ready.')
Example #13
 def __init__(self):
     '''
     Constructor
     '''
     TDDCLogging.info('-->Proxy Manager Is Starting.')
     self._ip_pool = IPPool(RedisSite.REDIS_NODES)
     gevent.spawn(self._src_ip_fetch)
     gevent.sleep()
     gevent.spawn(self._useful_push)
     gevent.sleep()
     TDDCLogging.info('-->Proxy Manager Was Started.')
Example #14
 def __init__(self):
     '''
     Constructor
     '''
     TDDCLogging.info('-->Exception Manager Is Starting.')
     self._exception_process = {}
     self._load_exception_models()
     self._task_manager = ExceptionMessageSR()
     gevent.spawn(self._process)
     gevent.sleep()
     TDDCLogging.info('-->Exception Manager Was Started.')
Example #15
 def _subscribe(self):
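     # Items of type 'psubscribe' are subscription acks; every other item
     # carries the platform in its channel name and a proxy address as data.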
     items = self._ip_pool.psubscribe(CrawlerSite.PROXY_PUBSUB_PATTERN)
     for item in items:
         if item.get('type') == 'psubscribe':
             TDDCLogging.info('---->Subscribe: %s' % item.get('channel'))
             continue
         platform = item.get('channel', '').split(':')[-1]
         data = item.get('data')
         if not CrawlerQueues.PLATFORM_PROXY.get(platform):
             CrawlerQueues.PLATFORM_PROXY[platform] = set()
         CrawlerQueues.PLATFORM_PROXY[platform].add(data)
Example #16
 def psubscribe(self, pattern):
     '''
     Pattern-based subscription.
     '''
     ps = self.pubsub()
     ps.psubscribe(pattern)
     TDDCLogging.info('--->Pubsub Was Ready.')
     for item in ps.listen():
         yield item
     ps.punsubscribe(pattern)
     TDDCLogging.info('-->Pubsub Is Exit.')
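A minimal consumption sketch for the generator above (the pattern string and the ip_pool client are hypothetical stand-ins):

 for item in ip_pool.psubscribe('tddc:proxy:use:*'):
     if item.get('type') == 'psubscribe':
         continue  # subscription ack, not a message
     platform = item.get('channel', '').split(':')[-1]
     print(platform, item.get('data'))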
Example #17
 def __init__(self):
     '''
     Constructor
     '''
     TDDCLogging.info('--->Messages Send And Recv Plugin Is Starting.')
     super(ExceptionMessageSR, self).__init__(status_logger=False)
     self._models_table = {}
     self._load_exception_models()
     gevent.spawn(self._recv)
     gevent.sleep()
     TDDCLogging.info('--->Messages Send And Recv Plugin Was Ready.')
Example #18
 def __init__(self):
     '''
     Constructor
     '''
     TDDCLogging.info('->Crawler Starting.')
     super(CrawlerManager, self).__init__()
     self._crawler = Crawler()
     self._storager = CrawlStorager()
     self._proxy_pool = CrawlProxyPool()
     self._cookies = CookiesManager()
     self._task_manager = CrawlTaskManager()
     TDDCLogging.info('->Crawler Was Ready.')
Example #19
 def __init__(self):
     '''
     Constructor
     '''
     TDDCLogging.info('-->Crawl Proxy Pool Is Starting.')
     self._ip_pool = IPPool(CrawlerSite.REDIS_NODES)
     self._init_proxy()
     gevent.spawn(self._subscribe)
     gevent.sleep()
     gevent.spawn(self._proxy_unuseful_feedback)
     gevent.sleep()
     TDDCLogging.info('-->Crawl Proxy Pool Was Ready.')
Example #20
 def __init__(self, push=True, pull=False):
     '''
     Constructor
     '''
     TDDCLogging.info('-->Storager Manager Is Starting.')
     self._db = DBManager(BaseSite.random_hbase_node())
     if push:
         gevent.spawn(self._push)
         gevent.sleep()
     if pull:
         gevent.spawn(self._pull)
         gevent.sleep()
     TDDCLogging.info('-->Storager Manager Was Ready.')
Example #21
 def __init__(self):
     '''
     Constructor
     '''
     TDDCLogging.info('-->Spider Is Starting.')
     self._spider = None
     self._spider_mqs = None
     self._signals_list = {
         signals.spider_opened: self._spider_opened,
         SingleSpider.SIGNAL_STORAGE: self._storage
     }
     self._process = crawler_process
     self._process.crawl(SingleSpider, callback=self._spider_signals)
     EventCenter().register(EventType.Crawler.MODULE, self._rule_update)
Example #22
 def __init__(self):
     '''
     Constructor
     '''
     TDDCLogging.info('-->Event Manager Is Starting.')
     self._init_event()
     self._event_consumer = KafkaHelper.make_consumer(
         self.NODES, self.TOPIC, self.GROUP)
     self._event_queue = gevent.queue.Queue()
     self._event_call = {}
     gevent.spawn(self._recv)
     gevent.sleep()
     gevent.spawn(self._dispatch)
     gevent.sleep()
     TDDCLogging.info('-->Event Manager Was Ready.')
Example #23
 def __init__(self):
     '''
     Constructor
     '''
     setproctitle.setproctitle("TDDC_PROXY_SOURCE_UPDATER")
     TDDCLogging.info('->[TDDC_PROXY_SOURCE_UPDATER] Proxy Source Updater Is Starting.')
     self._ip_pool = IPPool(RedisSite.REDIS_NODES)
     self._src_apis = [{'platform': 'kuaidaili',
                        'api':('http://dev.kuaidaili.com/api/getproxy/'
                               '?orderid=999310215091675&num=100&'
                               'b_pcchrome=1&b_pcie=1&b_pcff=1&'
                               'protocol=1&method=1&an_an=1&'
                               'an_ha=1&sp1=1&sp2=1&sp3=1&f_pr=1'
                               '&format=json&sep=1'),
                        'parse_mould': self._parse_kuaidaili}]
     TDDCLogging.info('->[TDDC_PROXY_SOURCE_UPDATER] Proxy Source Updater Was Started.')
Example #24
 def _push_parse_task(self):
     TDDCLogging.info('--->Parse Task Producer Was Ready.')
     while True:
         task, status = CrawlerQueues.PARSE.get()
         if not isinstance(task, Task):
             TDDCLogging.error('Invalid Parse Task: %s' % str(task))
             continue
         # Push a copy with the updated status so the queued original stays
         # untouched.
         tmp = Task(**task.__dict__)
         tmp.status = Task.Status.CRAWL_SUCCESS
         if not self._push_task(CrawlerSite.PARSE_TOPIC, tmp):
             TDDCLogging.error('Push Parse Task Failed: [%s:%s]' %
                               (task.platform, task.row_key))
         else:
             CrawlerQueues.TASK_STATUS_REMOVE.put(tmp)
             TDDCLogging.debug('[%s:%s] Crawled Successfully (%d).' %
                               (task.platform, task.row_key, status))
             self._successed_num += 1
             self._successed_pre_min += 1
Example #25
 def __init__(self):
     '''
     Constructor
     '''
     TDDCLogging.info('-->Checker Is Starting.')
     self._init_rules()
     gevent.spawn(self._rules_update)
     gevent.sleep()
     for i in range(ProxyCheckerSite.CONCURRENT):
         gevent.spawn(self._check, i, 'http',
                      ProxyCheckerQueues.HTTP_SOURCE_PROXY)
         gevent.sleep()
     for i in range(ProxyCheckerSite.CONCURRENT):
         gevent.spawn(self._check, i, 'https',
                      ProxyCheckerQueues.HTTPS_SOURCE_PROXY)
         gevent.sleep()
     TDDCLogging.info('-->Checker Was Started.')
Example #26
 def _src_ip_fetch(self):
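     # Refill loop: whenever a source queue falls below half the checker
     # concurrency, pop a fresh batch of candidate proxies from Redis.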
     while True:
         if (ProxyCheckerQueues.HTTP_SOURCE_PROXY.qsize() <
                 ProxyCheckerSite.CONCURRENT / 2):
             ret = self._ip_pool.smpop(
                 ProxyCheckerSite.HTTP_SOURCE_PROXY_SET_KEY,
                 ProxyCheckerSite.CONCURRENT * 2)
             ret = [item for item in ret if item]
             TDDCLogging.info('HTTP Add New: %d' % len(ret))
             for ip in ret:
                 ProxyCheckerQueues.HTTP_SOURCE_PROXY.put(
                     IPInfo(ip_port=ip))
         if (ProxyCheckerQueues.HTTPS_SOURCE_PROXY.qsize() <
                 ProxyCheckerSite.CONCURRENT / 2):
             ret = self._ip_pool.smpop(
                 ProxyCheckerSite.HTTPS_SOURCE_PROXY_SET_KEY,
                 ProxyCheckerSite.CONCURRENT * 2)
             ret = [item for item in ret if item]
             TDDCLogging.info('HTTPS Add New: %d' % len(ret))
             for ip in ret:
                 ProxyCheckerQueues.HTTPS_SOURCE_PROXY.put(
                     IPInfo(ip_port=ip, http_or_https='https'))
         gevent.sleep(5)
Example #27
 def __init__(self, host_port=None):
     '''
     Constructor
     params:
         host_port:
             EXP: 'localhost:8888'
             DES: HBase host (IP) and port
     '''
     TDDCLogging.info('---->DB Manager Is Starting.')
     self._tables = []
     host, port = host_port.split(':')
     self._hb_pool = happybase.ConnectionPool(size=8,
                                              host=host,
                                              port=int(port),
                                              transport='framed',
                                              protocol='compact')
     TDDCLogging.info('----->HBase(%s:%s) Was Ready.' % (host, port))
     TDDCLogging.info('---->DB Manager Was Ready.')
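A minimal read/write sketch against the connection pool created above (table and column names are hypothetical):

 with self._hb_pool.connection() as connection:
     table = connection.table('crawl_pages')
     table.put('row-1', {'cf:source': 'page body'})
     print(table.row('row-1'))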
Example #28
 def _fetch(self):
     TDDCLogging.info('--->Parsing Task Consumer Was Ready.')
     pause = False
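     # Backpressure: when the local parse queue backs up, commit offsets and
     # unsubscribe from the topic; resubscribe once the queue drains.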
     while True:
         if (ParserQueues.PARSE.qsize() >
                 ParserSite.FETCH_SOURCE_CONCURRENT * 4):
             if not pause:
                 self._consumer.commit()
                 self._consumer.unsubscribe()
                 pause = True
                 TDDCLogging.info('Parsing Task Consumer Was Paused.')
             gevent.sleep(1)
             continue
         if pause:
             self._consumer.subscribe(ParserSite.PARSE_TOPIC)
             pause = False
             TDDCLogging.info('Parsing Task Consumer Was Resumed.')
         partition_records = self._consumer.poll(2000, 16)
         if not len(partition_records):
             gevent.sleep(1)
             continue
         for _, records in partition_records.items():
             for record in records:
                 self._record_proc(record)
Example #29
 def _fetch_crawl_task(self):
     TDDCLogging.info('--->Crawl Task Consumer Was Ready.')
     pause = False
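     # Same pause/resume backpressure as the parse consumer, but with an
     # explicit low-water mark before resubscribing.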
     while True:
         if CrawlerQueues.CRAWL.qsize() > CrawlerSite.CONCURRENT * 4:
             if not pause:
                 self._crawl_task_consumer.commit()
                 self._crawl_task_consumer.unsubscribe()
                 pause = True
                 TDDCLogging.info('Crawl Task Consumer Was Paused.')
             gevent.sleep(1)
             continue
         if (pause and CrawlerQueues.CRAWL.qsize() <
                 CrawlerSite.CONCURRENT / 2):
             self._crawl_task_consumer.subscribe(CrawlerSite.CRAWL_TOPIC)
             pause = False
             TDDCLogging.info('Crawl Task Consumer Was Resumed.')
         partition_records = self._crawl_task_consumer.poll(2000, 16)
         if not len(partition_records):
             gevent.sleep(1)
             continue
         for _, records in partition_records.items():
             for record in records:
                 self._record_proc(record)
Example #30
            current_host_port = ':'.join(self._current_host_port)
            self._host_ports_pool.remove(current_host_port)
            if len(self._host_ports_pool) > 0:
                TDDCLogging.warning(
                    'HBase Server Exception. Now Is Reconnecting.')
            else:
                TDDCLogging.warning(
                    'HBase Server Fatal Error. Please Check It.')
                gevent.sleep(30)
                self._host_ports_pool = list(self._host_ports)
                TDDCLogging.warning('Retry Connecting HBase.')
            self._reconnect()
        else:
            self._host_ports_pool = list(self._host_ports)
            self._status = True
            TDDCLogging.info('----->HBase Is Connected.(%s)' %
                             ':'.join(self._current_host_port))
            self._hbase_was_ready()

    def _hbase_was_ready(self):
        if self._callback:
            self._callback()

    def _keep_alive(self):
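        # Liveness probe: every 15s read a sentinel cell; a failed read
        # raises TTransportException so the reconnect path takes over.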
        while True:
            gevent.sleep(15)
            try:
                if self._status:
                    if not self.get('keep_alive', 'ping')[0]:
                        raise TTransportException
            except TTransportException as e:
                if not self._status: