コード例 #1
0
 def _parse(self):
     """Consume crawled pages forever, parse them and publish the results.

     Every entry taken from ``ParserQueues.WAITING_PARSE`` is a
     ``(task, body)`` pair. The parser class is looked up through
     ``self._rules_updater`` by the task's platform and feature. Entries
     with no matching parser, or whose parser raises while being built,
     are logged and skipped (nothing is stored or pushed for them).
     For a successfully parsed entry the items are stored, the follow-up
     tasks are pushed, and the task is put on ``ParserQueues.TASK_STATUS``.
     """
     while True:
         task, body = ParserQueues.WAITING_PARSE.get()
         cls = self._rules_updater.get_parse_model(task.platform,
                                                   task.feature)
         if not cls:
             fmt = 'Parse No Match: [P:{platform}][F:{feature}][K:{row_key}]'
             TDDCLogging.warning(
                 fmt.format(platform=task.platform,
                            feature=task.feature,
                            row_key=task.row_key))
             continue
         try:
             # Constructing the parser performs the parse itself; the
             # results are read from ``ret.items`` / ``ret.tasks`` below.
             ret = cls(task, body)
         except Exception as e:
             # A broken page or rule must not kill the worker loop.
             TDDCLogging.error(e)
             continue
         self._storage(task, ret.items)
         self._new_task_push(ret.tasks)
         fmt = 'Parsed: [{platform}:{row_key}:{feature}][S:{items}][N:{tasks}]'
         TDDCLogging.info(
             fmt.format(platform=task.platform,
                        feature=task.feature,
                        row_key=task.row_key,
                        items=len(ret.items),
                        tasks=len(ret.tasks)))
         ParserQueues.TASK_STATUS.put(task)
コード例 #2
0
 def _auto_create_table(self, connection, table):
     """Make sure ``table`` is listed in ``self._tables``, creating it if needed.

     The cached list is trusted first. On a miss it is refreshed from
     ``connection.tables()``; only if the table is still absent is it
     created (with the ``source``, ``valuable`` and ``task`` families),
     after which the cache is refreshed once more.
     """
     # Fast path: the cached listing already knows the table.
     if table in self._tables:
         return
     # The cache may simply be stale, so refresh before creating anything.
     self._tables = connection.tables()
     if table in self._tables:
         return
     families = {'source': {}, 'valuable': {}, 'task': {}}
     connection.create_table(table, families)
     TDDCLogging.warning('Create New Table(%s) to HBase.' % table)
     self._tables = connection.tables()
コード例 #3
0
 def _dispatch(self):
     """Pull events off ``self._event_queue`` forever and run their handlers.

     Handlers are looked up in ``self._event_call`` by ``event.event_type``;
     an event without a registered handler is only logged.
     """
     while True:
         event = self._event_queue.get()
         handler = self._event_call.get(event.event_type)
         if not handler:
             TDDCLogging.warning('Event Exception: %d Not Register.' %
                                 event.event_type)
             continue
         handler(event)
コード例 #4
0
 def _process(self):
     """Drain ``MonitorQueues.EXCEPTION`` forever, handing each entry to its processor.

     The processor is looked up in ``self._exception_process`` by the
     entry's ``code`` and is invoked with the entry itself; entries with
     no registered processor are only logged.
     """
     while True:
         exception = MonitorQueues.EXCEPTION.get()
         processor = self._exception_process.get(exception.code)
         if processor:
             processor(exception)
         else:
             message = 'No Match Process To Exception: {exp_id}'.format(
                 exp_id=exception.id)
             TDDCLogging.warning(message)
コード例 #5
0
ファイル: hbase_manager.py プロジェクト: slmzhi/tddc
 def get(self, table_name, row_key, family=None, qualifier=None):
     """Fetch one row (optionally narrowed to a single column) from HBase.

     Args:
         table_name: Table to read from.
         row_key: Key of the row to fetch.
         family: Optional column family to restrict the read to.
         qualifier: Optional column qualifier; only applied when
             ``family`` is also given.

     Returns:
         ``(True, result)`` with whatever the Thrift client returned on
         success, or ``(False, None)`` when ``self._status`` is falsy or
         the client call raised.
     """
     if not self._status:
         TDDCLogging.warning(
             '[Get Operation Was Failed] HBase Server Is Exception.')
         return False, None
     request = TGet()
     request.row = row_key
     if family:
         column = TColumn()
         column.family = family
         if qualifier:
             column.qualifier = qualifier
         request.columns = [column]
     try:
         ret = self._client.get(table_name, request)
     except Exception as e:
         TDDCLogging.error(e)
         return False, None
     # Mirror the failure paths' (ok, value) shape so callers can always
     # unpack the result.
     return True, ret
コード例 #6
0
ファイル: hbase_manager.py プロジェクト: slmzhi/tddc
 def put(self, table, row_key, items=None):
     """Write column values for one row to HBase.

     Args:
         table: Table to write to.
         row_key: Key of the row to write.
         items: Mapping of column family -> ``{qualifier: value}``.
             ``list`` and ``dict`` values are JSON-encoded before being
             stored. ``None`` is treated as an empty mapping.

     Returns:
         ``True`` when the client call completed, ``False`` when
         ``self._status`` is falsy or the client call raised.

     Raises:
         TypeError: If the value stored under a family is not a ``dict``.
     """
     if not self._status:
         TDDCLogging.warning(
             '[Put Operation Was Failed] HBase Server Is Exception.')
         return False
     cvs = []
     # ``items`` defaults to None; iterate an empty mapping in that case
     # instead of failing on ``None.items()``.
     for family, info in (items or {}).items():
         if not isinstance(info, dict):
             raise TypeError
         for k, v in info.items():
             if isinstance(v, (list, dict)):
                 # Containers are serialised to JSON text before storing.
                 v = json.dumps(v)
             cvs.append(TColumnValue(family, k, v))
     tp = TPut(row_key, cvs)
     try:
         self._client.put(table, tp)
     except Exception as e:
         TDDCLogging.error(e)
         return False
     # Explicit success value: falling off the end would return None,
     # which is falsy just like the failure result.
     return True
コード例 #7
0
 def error_back(self, response):
     """Handle a failed crawl request and decide how to retry it.

     ``response`` is the failure object handed to the errback; the code
     reads ``response.type``, ``response.value`` and ``response.request``
     from it (presumably a Twisted ``Failure`` with the request attached
     -- confirm against the caller).

     Behaviour by failure kind:
       * HTTP 5xx / 408: log and re-queue the task (without passing the
         retry counter to ``add_task``).
       * HTTP 404: retry until ``task.retry`` (default 3) attempts have
         been made, then report the task as failed and remove its status.
       * Any other HTTP status and all network-level errors: report the
         proxy (if any) as unusable, log, and re-queue with the current
         retry counter.
     """
     task, times = response.request.meta['item']
     proxy = response.request.meta.get('proxy', None)
     if response.type == httperror.HttpError:
         status = response.value.response.status
         if status >= 500 or status == 408:
             fmt = '[%s][%s] Crawled Failed(\033[0m %d \033[1;37;43m| %s ). Will Retry After While.'
             TDDCLogging.warning(fmt % (task.platform,
                                        task.url,
                                        status,
                                        proxy))
             self.add_task(task, True)
             return
         elif status == 404:
             retry_times = task.retry if task.retry else 3
             if times >= retry_times:
                 exception = CrawlerTaskFailedException(task)
                 CrawlerQueues.EXCEPTION.put(exception)
                 CrawlerQueues.TASK_STATUS_REMOVE.put(task)
                 fmt = '[%s:%s] Crawled Failed(\033[0m 404 \033[1;37;43m| %s ). Not Retry.'
                 TDDCLogging.warning(fmt % (task.platform,
                                            task.url,
                                            proxy))
                 return
             times += 1
             fmt = '[%s:%s] Crawled Failed(\033[0m %d \033[1;37;43m| %s ). Will Retry After While.'
             TDDCLogging.warning(fmt % (task.platform,
                                        task.url,
                                        status,
                                        proxy))
             self.add_task(task, True, times)
             return
         # Any other HTTP status (e.g. 403) falls through to the generic
         # retry path below; give it a message so that path does not hit
         # an unbound ``err_msg``.
         err_msg = '%d' % status
     elif response.type == internet_err.TimeoutError:
         err_msg = 'TimeoutError'
     elif response.type in [internet_err.ConnectionRefusedError,
                            internet_err.TCPTimedOutError]:
         # %s rather than %d: formatting must not fail when osError is
         # not an integer (e.g. None).
         err_msg = '%s:%s' % (response.value.osError, response.value.message)
     elif response.type == newclient_err.ResponseNeverReceived:
         err_msg = 'ResponseNeverReceived'
     else:
         err_msg = '%s' % (response.value)
     if proxy:
         # Strip the scheme ("http://host:port" -> "host:port"); [-1]
         # keeps a scheme-less proxy string intact instead of raising.
         proxy = proxy.split('//')[-1]
         CrawlerQueues.UNUSEFUL_PROXY_FEEDBACK.put([task.platform, proxy])
     fmt = '[%s][%s] Crawled Failed(\033[0m %s \033[1;37;43m| %s ). Will Retry After While.'
     TDDCLogging.warning(fmt % (task.platform,
                                task.url,
                                err_msg,
                                proxy))
     self.add_task(task, True, times)
コード例 #8
0
ファイル: hbase_manager.py プロジェクト: slmzhi/tddc
 def _connect(self):
     """Open a Thrift connection to a randomly chosen HBase host.

     A ``host:port`` entry is picked from ``self._host_ports_pool`` and a
     framed, compact-protocol client is built on top of it. If anything
     in that sequence raises, the chosen entry is removed from the pool
     and ``self._reconnect()`` is invoked. Once the pool is exhausted the
     method sleeps 30 seconds and refills the pool from
     ``self._host_ports`` before reconnecting.
     """
     try:
         self._current_host_port = random.choice(
             self._host_ports_pool).split(':')
         self._sock = TSocket.TSocket(host=self._current_host_port[0],
                                      port=self._current_host_port[1])
         self._transport = TTransport.TFramedTransport(self._sock)
         self._protocol = TCompactProtocol(self._transport)
         self._client = THBaseService.Client(self._protocol)
         self._transport.open()
     except Exception as e:
         TDDCLogging.error(e)
         # Drop the host that just failed so the next attempt picks
         # a different one.
         current_host_port = ':'.join(self._current_host_port)
         self._host_ports_pool.remove(current_host_port)
         if len(self._host_ports_pool) > 0:
             TDDCLogging.warning(
                 'HBase Server Exception. Now Is Reconnecting.')
         else:
             TDDCLogging.warning(
                 'HBase Server Fatal Error. Please Check It.')
             # Every known host failed: back off, then start over with
             # the full host list.
             gevent.sleep(30)
             self._host_ports_pool = list(self._host_ports)
             TDDCLogging.warning('Retry Connecting HHase.')
         self._reconnect()
コード例 #9
0
                self._models_table[cls.EXCEPTION_TYPE] = cls

    def _recv(self):
        """Poll the exception topic forever and process every record received.

        A consumer is created from the ``MonitorSite`` Kafka settings and
        stored on ``self._consumer``. Each poll result is a mapping whose
        values are record batches; every record is passed to
        ``self._record_proc``. An empty poll result triggers a one-second
        gevent sleep before polling again.
        """
        self._consumer = KafkaHelper.make_consumer(MonitorSite.KAFKA_NODES,
                                                   MonitorSite.EXCEPTION_TOPIC,
                                                   MonitorSite.EXCEPTION_GROUP)
        while True:
            # presumably (timeout_ms, max_records) -- confirm against the
            # consumer's poll() signature.
            batches = self._consumer.poll(2000, 16)
            if len(batches):
                # Only the record lists matter; the partition keys are unused.
                for records in batches.values():
                    for record in records:
                        self._record_proc(record)
            else:
                gevent.sleep(1)

    def _record_proc(self, record):
        """Decode one consumed record and queue it as an exception model.

        ``record.value`` is parsed as JSON. Undecodable payloads are
        reported through ``self._consume_msg_exp``. A decoded payload must
        be a JSON object with a truthy ``code`` that maps to a class in
        ``self._models_table``; otherwise a warning is logged and the
        record is dropped. Matching payloads are instantiated with the
        decoded fields as keyword arguments and put on
        ``MonitorQueues.EXCEPTION``.
        """
        try:
            exception = json.loads(record.value)
        except Exception as e:
            self._consume_msg_exp('PARSE_TASK_JSON_ERR', record.value, e)
            return
        # Valid JSON is not necessarily an object (it may be a list, a
        # number, ...); only a dict supports ``.get`` and ``**`` below.
        code = exception.get('code') if isinstance(exception, dict) else None
        if not code:
            TDDCLogging.warning('This Exception Is Not Type Of `ExceptionModel`.')
            return
        cls = self._models_table.get(code)
        if not cls:
            TDDCLogging.warning('This Exception Is No Match Model.')
            return
        MonitorQueues.EXCEPTION.put(cls(**exception))