def _parse(self):
    while True:
        # Block until a fetched page is ready to be parsed.
        task, body = ParserQueues.WAITING_PARSE.get()
        # Look up the parse-model class registered for this platform/feature.
        cls = self._rules_updater.get_parse_model(task.platform, task.feature)
        if not cls:
            fmt = 'Parse No Match: [P:{platform}][F:{feature}][K:{row_key}]'
            TDDCLogging.warning(fmt.format(platform=task.platform,
                                           feature=task.feature,
                                           row_key=task.row_key))
            continue
        try:
            # Instantiating the model performs the actual parsing.
            ret = cls(task, body)
        except Exception as e:
            TDDCLogging.error(e)
            continue
        # Persist extracted items and push any follow-up crawl tasks.
        self._storage(task, ret.items)
        self._new_task_push(ret.tasks)
        fmt = 'Parsed: [{platform}:{row_key}:{feature}][S:{items}][N:{tasks}]'
        TDDCLogging.info(fmt.format(platform=task.platform,
                                    feature=task.feature,
                                    row_key=task.row_key,
                                    items=len(ret.items),
                                    tasks=len(ret.tasks)))
        ParserQueues.TASK_STATUS.put(task)
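# A minimal sketch (assumed, not from the source) of a parse-model class that
# `get_parse_model` could return for the loop above. `_parse` only relies on
# the class being constructed with (task, body) and then exposing `items` and
# `tasks`; the class name and the extraction logic here are illustrative.
class DemoDetailParseModel(object):

    def __init__(self, task, body):
        # `items`: data handed to self._storage(task, ret.items)
        # `tasks`: follow-up crawl tasks for self._new_task_push(ret.tasks)
        self.items = {'title': self._extract_title(body)}
        self.tasks = []

    @staticmethod
    def _extract_title(body):
        # Placeholder extraction; a real model would apply lxml/bs4 rules here.
        start = body.find('<title>')
        end = body.find('</title>')
        if start == -1 or end == -1:
            return ''
        return body[start + len('<title>'):end]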
def _auto_create_table(self, connection, table):
    # Probe at most twice: first refresh the cached table list; if the table
    # is still missing on the second pass, create it with the default families.
    for cnt in range(2):
        if table not in self._tables:
            if cnt == 1:
                connection.create_table(table,
                                        {k: {} for k in ['source', 'valuable', 'task']})
                TDDCLogging.warning('Created New Table(%s) In HBase.' % table)
            self._tables = connection.tables()
        else:
            break
def _dispatch(self):
    while True:
        # Pop the next event and route it to its registered callback.
        event = self._event_queue.get()
        callback = self._event_call.get(event.event_type, None)
        if callback:
            callback(event)
        else:
            TDDCLogging.warning('Event Exception: %d Not Registered.' % event.event_type)
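# Illustrative sketch (assumed, not from the source) of how a handler could be
# registered for `_dispatch`. `EventType.CONF_UPDATED` and `_on_conf_updated`
# are hypothetical names; the only contract the loop relies on is the mapping
# self._event_call[event_type] -> callable(event).
def _register_event_callbacks(self):
    self._event_call = {
        EventType.CONF_UPDATED: self._on_conf_updated,
    }

def _on_conf_updated(self, event):
    TDDCLogging.info('Conf Updated Event Received: %d' % event.event_type)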
def _process(self):
    while True:
        exception = MonitorQueues.EXCEPTION.get()
        # Find the processor class registered for this exception code.
        cls = self._exception_process.get(exception.code)
        if not cls:
            TDDCLogging.warning(
                'No Matching Processor For Exception: {exp_id}'.format(
                    exp_id=exception.id))
            continue
        # Instantiating the processor handles the exception.
        cls(exception)
def get(self, table_name, row_key, family=None, qualifier=None):
    if not self._status:
        TDDCLogging.warning('[Get Operation Failed] HBase Server Exception.')
        return False, None
    get = TGet()
    get.row = row_key
    # Restrict the get to a single column family/qualifier when requested.
    if family:
        tc = TColumn()
        tc.family = family
        if qualifier:
            tc.qualifier = qualifier
        get.columns = [tc]
    try:
        ret = self._client.get(table_name, get)
    except Exception as e:
        TDDCLogging.error(e)
        return False, None
    # Success: hand back the raw TResult for the caller to unpack.
    return True, ret
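# Hedged usage sketch (assumed, not from the source): the (ok, result) pair
# returned by get() above would be consumed roughly like this. `hb_client`,
# the table name and the row key are illustrative placeholders.
ok, result = hb_client.get('platform_source', 'row_key_001', family='source')
if ok and result and result.columnValues:
    for cv in result.columnValues:
        TDDCLogging.info('%s:%s = %s' % (cv.family, cv.qualifier, cv.value))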
def put(self, table, row_key, items=None):
    if not self._status:
        TDDCLogging.warning('[Put Operation Failed] HBase Server Exception.')
        return False
    if not items:
        # Nothing to write.
        return False
    cvs = []
    # `items` is expected to be laid out as {family: {qualifier: value}}.
    for family, info in items.items():
        if not isinstance(info, dict):
            raise TypeError
        for k, v in info.items():
            # Serialize nested structures so HBase stores plain strings.
            if isinstance(v, (list, dict)):
                v = json.dumps(v)
            cvs.append(TColumnValue(family, k, v))
    tp = TPut(row_key, cvs)
    try:
        self._client.put(table, tp)
    except Exception as e:
        TDDCLogging.error(e)
        return False
    return True
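# Hedged usage sketch (assumed, not from the source): put() above expects the
# nested {family: {qualifier: value}} layout, matching the families created by
# _auto_create_table. `hb_client`, the table name and the row key are
# illustrative placeholders.
items = {
    'source': {'html': '<html>...</html>'},
    'task': {'status': 'parsed', 'urls': ['http://example.com/1']},  # list -> JSON
}
if not hb_client.put('platform_source', 'row_key_001', items):
    TDDCLogging.warning('Put Failed, Will Retry Later.')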
def error_back(self, response):
    # `response` is the twisted Failure produced by a failed download.
    task, times = response.request.meta['item']
    proxy = response.request.meta.get('proxy', None)
    if response.type == httperror.HttpError:
        status = response.value.response.status
        if status >= 500 or status == 408:
            # Server-side/timeout statuses: retry without counting the attempt.
            fmt = '[%s][%s] Crawled Failed(\033[0m %d \033[1;37;43m| %s ). Will Retry After While.'
            TDDCLogging.warning(fmt % (task.platform, task.url, status, proxy))
            self.add_task(task, True)
            return
        elif status == 404:
            retry_times = task.retry if task.retry else 3
            if times >= retry_times:
                # Give up: report the failure and drop the task from tracking.
                exception = CrawlerTaskFailedException(task)
                CrawlerQueues.EXCEPTION.put(exception)
                CrawlerQueues.TASK_STATUS_REMOVE.put(task)
                fmt = '[%s:%s] Crawled Failed(\033[0m 404 \033[1;37;43m| %s ). Not Retry.'
                TDDCLogging.warning(fmt % (task.platform, task.url, proxy))
                return
        # Any other HTTP error (or a 404 with retries left): retry and count it.
        times += 1
        fmt = '[%s:%s] Crawled Failed(\033[0m %d \033[1;37;43m| %s ). Will Retry After While.'
        TDDCLogging.warning(fmt % (task.platform, task.url, status, proxy))
        self.add_task(task, True, times)
        return
    elif response.type == internet_err.TimeoutError:
        err_msg = 'TimeoutError'
    elif response.type in [internet_err.ConnectionRefusedError,
                           internet_err.TCPTimedOutError]:
        err_msg = '%d:%s' % (response.value.osError, response.value.message)
    elif response.type == newclient_err.ResponseNeverReceived:
        err_msg = 'ResponseNeverReceived'
    else:
        err_msg = '%s' % (response.value)
    if proxy:
        # Connection-level failures usually mean a bad proxy: feed it back.
        proxy = proxy.split('//')[1]
        CrawlerQueues.UNUSEFUL_PROXY_FEEDBACK.put([task.platform, proxy])
    fmt = '[%s][%s] Crawled Failed(\033[0m %s \033[1;37;43m| %s ). Will Retry After While.'
    TDDCLogging.warning(fmt % (task.platform, task.url, err_msg, proxy))
    self.add_task(task, True, times)
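# Hedged sketch (assumed, not from the source) of how a download could be
# scheduled so that error_back() finds the (task, times) pair and the proxy it
# expects in request.meta. A Scrapy-style Request is assumed; `_make_request`
# and `success_back` are hypothetical names.
import scrapy

def _make_request(self, task, times=0, proxy=None):
    meta = {'item': (task, times)}
    if proxy:
        meta['proxy'] = proxy  # e.g. 'http://1.2.3.4:8080'
    return scrapy.Request(url=task.url,
                          callback=self.success_back,
                          errback=self.error_back,
                          meta=meta,
                          dont_filter=True)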
def _connect(self):
    try:
        # Pick a random 'host:port' entry from the pool and open a framed,
        # compact-protocol connection to the HBase Thrift2 service.
        self._current_host_port = random.choice(self._host_ports_pool).split(':')
        self._sock = TSocket.TSocket(host=self._current_host_port[0],
                                     port=self._current_host_port[1])
        self._transport = TTransport.TFramedTransport(self._sock)
        self._protocol = TCompactProtocol(self._transport)
        self._client = THBaseService.Client(self._protocol)
        self._transport.open()
    except Exception as e:
        TDDCLogging.error(e)
        # Drop the failing endpoint from the pool before reconnecting.
        current_host_port = ':'.join(self._current_host_port)
        self._host_ports_pool.remove(current_host_port)
        if len(self._host_ports_pool) > 0:
            TDDCLogging.warning('HBase Server Exception. Now Is Reconnecting.')
        else:
            # Every endpoint failed: wait, refill the pool, and start over.
            TDDCLogging.warning('HBase Server Fatal Error. Please Check It.')
            gevent.sleep(30)
            self._host_ports_pool = list(self._host_ports)
            TDDCLogging.warning('Retry Connecting HBase.')
        self._reconnect()
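# Hedged sketch (assumed, not from the source) of the state _connect() relies
# on: `_host_ports` holds the configured 'host:port' endpoints and
# `_host_ports_pool` is the working copy that failing endpoints are removed
# from. The default endpoint value and this __init__ shape are illustrative.
def __init__(self, host_ports=('127.0.0.1:9090',)):
    self._host_ports = list(host_ports)              # configured endpoints
    self._host_ports_pool = list(self._host_ports)   # endpoints not yet failed
    self._status = False
    self._connect()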
self._models_table[cls.EXCEPTION_TYPE] = cls

def _recv(self):
    self._consumer = KafkaHelper.make_consumer(MonitorSite.KAFKA_NODES,
                                               MonitorSite.EXCEPTION_TOPIC,
                                               MonitorSite.EXCEPTION_GROUP)
    while True:
        # Poll Kafka for exception records; back off briefly when idle.
        partition_records = self._consumer.poll(2000, 16)
        if not len(partition_records):
            gevent.sleep(1)
            continue
        for _, records in partition_records.items():
            for record in records:
                self._record_proc(record)

def _record_proc(self, record):
    try:
        exception = json.loads(record.value)
    except Exception as e:
        self._consume_msg_exp('PARSE_TASK_JSON_ERR', record.value, e)
    else:
        code = exception.get('code')
        if not code:
            TDDCLogging.warning('This Exception Is Not Type Of `ExceptionModel`.')
            return
        # Rebuild the registered exception model and queue it for processing.
        cls = self._models_table.get(code)
        if not cls:
            TDDCLogging.warning('No Matching Model For This Exception.')
            return
        MonitorQueues.EXCEPTION.put(cls(**exception))
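# Hedged sketch (assumed, not from the source) of an exception model that the
# registration line and _record_proc() above could work with: EXCEPTION_TYPE
# doubles as the JSON 'code' used for lookup, and the constructor accepts the
# decoded JSON fields via cls(**exception). Class and field names are
# illustrative.
class CrawlerTaskFailedExceptionModel(object):

    EXCEPTION_TYPE = 'CRAWLER_TASK_FAILED'

    def __init__(self, code=None, platform=None, url=None, **kwargs):
        self.code = code
        self.platform = platform
        self.url = url
        # Tolerate extra JSON fields so cls(**exception) never raises.
        for key, value in kwargs.items():
            setattr(self, key, value)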