def create_table_to_hbase(self, table, families):
    """Create an HBase table with the given column families.

    :param table: table name to create
    :param families: column-family spec passed through to happybase
    :returns: True on success, False on any failure (error is logged)
    """
    try:
        with self._hb_pool.connection() as connection:
            connection.create_table(table, families)
        # Bug fix: the original fell off the end and returned None on
        # success, so truthiness checks by callers always saw "failure".
        return True
    except Exception as e:
        TDDCLogging.error(e)
        return False
def _parse(self):
    """Worker loop: pull (task, body) pairs and run the matching parse model.

    For each item it looks up the rule class by (platform, feature); a
    missing rule or a parse exception is logged and the item is skipped.
    Parsed items are stored, follow-up tasks are pushed, and the task is
    reported to TASK_STATUS.
    """
    while True:
        task, body = ParserQueues.WAITING_PARSE.get()
        cls = self._rules_updater.get_parse_model(task.platform, task.feature)
        if not cls:
            fmt = 'Parse No Match: [P:{platform}][F:{feature}][K:{row_key}]'
            TDDCLogging.warning(fmt.format(platform=task.platform,
                                           feature=task.feature,
                                           row_key=task.row_key))
            continue
        try:
            # Instantiating the rule class performs the actual parse.
            ret = cls(task, body)
        except Exception as e:
            TDDCLogging.error(e)
            continue
        self._storage(task, ret.items)
        self._new_task_push(ret.tasks)
        fmt = 'Parsed: [{platform}:{row_key}:{feature}][S:{items}][N:{tasks}]'
        TDDCLogging.info(fmt.format(platform=task.platform,
                                    feature=task.feature,
                                    row_key=task.row_key,
                                    items=len(ret.items),
                                    tasks=len(ret.tasks)))
        ParserQueues.TASK_STATUS.put(task)
def _push(self):
    """Accumulate storage items per platform table and batch-write to HBase.

    Items are grouped by ``platform + PLATFORM_SUFFIX`` (one HBase table
    per platform). While the source queue is non-empty the loop keeps
    accumulating, yielding briefly every 5th item; otherwise it flushes
    the accumulated rows and reports success/failure via ``_pushed``.
    """
    cnt = 0
    platform_rows = {}
    while True:
        try:
            task, storage_info = PublicQueues.STORAGE.get()
            items = {
                self.FAMILY: storage_info,
                'task': {'task': task.to_json()},
            }
            # Hoist the repeated key expression; setdefault replaces the
            # manual "check then create" dance on the nested dict.
            table_name = task.platform + BaseSite.PLATFORM_SUFFIX
            platform_rows.setdefault(table_name, {})[task.row_key] = items
            cnt += 1
            if PublicQueues.STORAGE.qsize() and not cnt % 5:
                # More work queued and we hit a multiple of 5: yield to
                # other greenlets and keep batching before flushing.
                gevent.sleep(0.01)
                continue
            if self._db.puts_to_hbase(platform_rows):
                self._pushed(platform_rows, True)
            else:
                self._pushed(platform_rows, False)
                # Back off after a failed batch write.
                gevent.sleep(1)
            # NOTE(review): rows are dropped after a failed push as well;
            # retry responsibility is assumed to live in _pushed — confirm.
            platform_rows = {}
        except Exception as e:
            TDDCLogging.error(e)
def _consume_msg_exp(self, exp_type, info, exception=None):
    """Log a framed error report for a failed message consumption.

    :param exp_type: error-type tag, e.g. containing 'JSON_ERR',
        'TASK_ERR' or 'EVENT_ERR', which selects the report layout
    :param info: the offending payload (raw string or decoded item)
    :param exception: optional exception for the JSON_ERR layout
    """
    banner = '*' * 5 + exp_type + '*' * 5
    footer = '*' * (10 + len(exp_type))
    if 'JSON_ERR' in exp_type:
        # Bug fix: the original read exception.message, which is
        # deprecated (removed in Py3) and crashed when exception is None.
        detail = str(exception) if exception is not None else ''
        TDDCLogging.error(banner + '\nException: ' + info + '\n' +
                          detail + '\n' + footer + '\n')
    elif 'TASK_ERR' in exp_type or 'EVENT_ERR' in exp_type:
        TDDCLogging.error(banner + '\nException: ' +
                          'item={item}\n'.format(item=info) +
                          'item_type={item_type}\n'.format(
                              item_type=type(info)) +
                          footer + '\n')
def _rules_update(self):
    """Hot-load rule classes announced on RULES_MOULDS_UPDATE.

    Each update names a package and a list of class names; the classes
    are imported and registered into ``self._rules_moulds``.
    """
    while True:
        rule = ProxyCheckerQueues.RULES_MOULDS_UPDATE.get()
        print(rule.platform, rule.package, rule.moulds)
        # Loop-invariant: the package is fixed per update, import it once.
        module = importlib.import_module(rule.package)
        for cls_name in rule.moulds:
            # Bug fix: getattr without a default raises AttributeError for
            # a missing class, so the error branch below was unreachable.
            cls = getattr(module, cls_name, None)
            if not cls:
                TDDCLogging.error('Exception: import rule failed: ' + cls_name)
                continue
            # NOTE(review): both keys are cls.proxy_type — the second was
            # probably meant to be cls_name; left as-is pending confirmation.
            self._rules_moulds[cls.proxy_type][cls.proxy_type] = cls
def get_from_hbase(self, table, row_key, family=None, qualifier=None):
    """Read one row (restricted to family[:qualifier]) from HBase.

    :returns: (True, row_dict) on success; (False, None) when no family
        is given or on any error (errors are logged).
    """
    try:
        with self._hb_pool.connection() as connection:
            # Renamed local: the original rebound the 'table' parameter.
            hb_table = connection.table(table)
            if family and qualifier:
                cf = family + ':' + qualifier
            elif family:
                cf = family
            else:
                # A qualifier without a family is not addressable.
                return False, None
            return True, hb_table.row(row_key, columns=[cf])
    except Exception as e:
        TDDCLogging.error(e)
        return False, None
def send_mail(subject, content):
    """Send a plain-text alert mail via SMTP-SSL, rate-limited to 1/min.

    :param subject: mail subject line
    :param content: plain-text body (utf-8)
    :returns: False when rate-limited; otherwise None
    """
    cur_time = time.time()
    # NOTE(review): last_send_time is read here but never updated in this
    # function — presumably the caller updates it; confirm, otherwise the
    # rate limit never engages.
    if EMailManager.last_send_time > cur_time - 60:
        return False
    msg = MIMEText(content, 'plain', 'utf-8')
    msg['Subject'] = subject
    msg['From'] = MonitorSite.MAIL_USER
    msg['To'] = ';'.join(MonitorSite.MAIL_TO)
    server = smtplib.SMTP_SSL(MonitorSite.MAIL_HOST, MonitorSite.MAIL_PORT)
    try:
        server.login(MonitorSite.MAIL_USER, MonitorSite.MAIL_PWD)
        server.sendmail(MonitorSite.MAIL_USER, MonitorSite.MAIL_TO,
                        msg.as_string())
    except Exception as e:
        TDDCLogging.error(e)
    finally:
        # Bug fix: the original leaked the SMTP connection on every call.
        server.quit()
def _keep_alive(self):
    """Ping the HBase thrift server every 15s; mark down and reconnect on failure.

    A failed ping raises TTransportException: the connection is flagged
    dead (``self._status = False``) and a reconnect is attempted while
    hosts remain in the pool. If ``self._status`` is already False when
    the exception fires, another path is handling it and this loop exits.
    """
    while True:
        gevent.sleep(15)
        try:
            if self._status:
                if not self.get('keep_alive', 'ping')[0]:
                    raise TTransportException
        except TTransportException as e:
            if not self._status:
                return
            TDDCLogging.error(e)
            self._status = False
            if len(self._host_ports_pool):
                self._reconnect()
        except Exception as e:
            TDDCLogging.error(e)
def _push_parse_task(self):
    """Forward successfully crawled tasks to the parse topic.

    A snapshot (``tmp``) of the task is taken *before* the status is
    flipped to CRAWL_SUCCESS, and that snapshot is what gets pushed and
    used for TASK_STATUS_REMOVE.
    """
    TDDCLogging.info('--->Parse Task Producer Was Ready.')
    while True:
        task, status = CrawlerQueues.PARSE.get()
        # Bug fix: the type check originally ran *after* task.__dict__ and
        # task.status were already used; validate first.
        if not isinstance(task, Task):
            TDDCLogging.error('Invalid parse task: %s' % type(task))
            continue
        tmp = Task(**task.__dict__)      # snapshot keeps the pre-success status
        task.status = Task.Status.CRAWL_SUCCESS
        if not self._push_task(CrawlerSite.PARSE_TOPIC, tmp):
            # Bug fix: the original logged an empty string here, which
            # produced a useless log record.
            TDDCLogging.error('Push parse task failed: [%s:%s]'
                              % (task.platform, task.row_key))
        else:
            CrawlerQueues.TASK_STATUS_REMOVE.put(tmp)
            TDDCLogging.debug('[%s:%s] Crawled Successed(%d).'
                              % (task.platform, task.row_key, status))
            self._successed_num += 1
            self._successed_pre_min += 1
def get(self, table_name, row_key, family=None, qualifier=None):
    """Fetch one row via the thrift client.

    :returns: (True, result) on success, (False, None) when the server is
        down or the fetch raises. Other methods in this file use the same
        (ok, value) tuple convention.
    """
    if not self._status:
        TDDCLogging.warning(
            '[Get Operation Was Failed] HBase Server Is Exception.')
        return False, None
    get = TGet()
    get.row = row_key
    if family:
        tc = TColumn()
        tc.family = family
        if qualifier:
            tc.qualifier = qualifier
        get.columns = [tc]
    try:
        ret = self._client.get(table_name, get)
    except Exception as e:
        TDDCLogging.error(e)
        return False, None
    # Bug fix: the original discarded the fetched result and fell off the
    # end, returning None even when the get succeeded.
    return True, ret
def put(self, table, row_key, items=None):
    """Write one row via the thrift client.

    :param items: {family: {qualifier: value}}; list/dict values are
        JSON-encoded. A non-dict family value raises TypeError.
    :returns: True on success, False on any failure.
    """
    if not self._status:
        TDDCLogging.warning(
            '[Put Operation Was Failed] HBase Server Is Exception.')
        return False
    if not items:
        # Bug fix: the default items=None previously crashed below with
        # AttributeError; treat "nothing to write" as a failed put.
        return False
    cvs = []
    for family, info in items.items():
        if not isinstance(info, dict):
            raise TypeError
        for k, v in info.items():
            if isinstance(v, (list, dict)):
                v = json.dumps(v)
            cvs.append(TColumnValue(family, k, v))
    tp = TPut(row_key, cvs)
    try:
        self._client.put(table, tp)
    except Exception as e:
        TDDCLogging.error(e)
        return False
    # Bug fix: the original returned None on success; put_to_hbase in this
    # file returns True, so match that convention.
    return True
def _pull(self):
    """Fetch stored page content for each parse task and hand it to the parser.

    Tasks missing platform/row_key are logged and dropped. When HBase is
    unreachable the task is requeued after a short back-off. Only the
    first cell of the returned row is forwarded.
    """
    while True:
        task = ParserQueues.PARSE.get()
        if not task:
            continue
        if not (task.platform and task.row_key):
            TDDCLogging.error('Task Exception(Parse DB Manager): [%s:%s]'
                              % (task.platform, task.row_key))
            continue
        table_name = task.platform + ParserSite.PLATFORM_SUFFIX
        success, row = self._db.get_from_hbase(table_name, task.row_key,
                                               'source', 'content')
        if not success:
            # DB unavailable: requeue and back off briefly.
            ParserQueues.PARSE.put(task)
            gevent.sleep(1)
            continue
        if not row:
            continue
        for _, content in row.items():
            ParserQueues.WAITING_PARSE.put((task, content))
            break  # only the first returned cell is used
def start(self):
    """Poll every configured proxy-source API and refill the redis IP pools.

    Each source yields raw proxy lists which are liveness-checked and
    added to the http/https pools; HTTPS-capable proxies are added to
    both. Per-source failures are logged and do not stop the loop.
    """
    while True:
        for infos in self._src_apis:
            try:
                platform = infos.get('platform')
                api = infos.get('api')
                parse_mould = infos.get('parse_mould')
                rsp = requests.get(api)
                if not rsp:
                    TDDCLogging.error('[TDDC_PROXY_SOURCE_UPDATER] Exception(%s): ' % platform + api)
                    continue
                if not parse_mould:
                    TDDCLogging.error('[TDDC_PROXY_SOURCE_UPDATER] Exception: parse_mould is None.')
                    continue
                all_ips = parse_mould(rsp.text)
                http_ips = self._proxy_active_check(all_ips.get('HTTP', []))
                self._ip_pool.smadd('tddc:test:proxy:ip_src:http', http_ips)
                TDDCLogging.info('[TDDC_PROXY_SOURCE_UPDATER] Source IPS(HTTP) Growth:%d' % len(http_ips))
                https_ips = self._proxy_active_check(all_ips.get('HTTPS', []))
                self._ip_pool.smadd('tddc:test:proxy:ip_src:https', https_ips)
                # HTTPS-capable proxies also serve HTTP, so feed both pools.
                self._ip_pool.smadd('tddc:test:proxy:ip_src:http', https_ips)
                TDDCLogging.info('[TDDC_PROXY_SOURCE_UPDATER] Source IPS(HTTPS) Growth:%d' % len(https_ips))
            except Exception as e:
                # Bug fix: the original concatenated the exception object
                # directly ('...' + e), raising TypeError inside the handler.
                TDDCLogging.error('[TDDC_PROXY_SOURCE_UPDATER] Exception[IP_SOURCE]:' + str(e))
        gevent.sleep(10)
def puts_to_hbase(self, table_rows):
    """Batch-store rows into HBase, one happybase batch per table.

    :param table_rows: mapping of table name -> {row_key: {family:
        {column: data}}} (tables are auto-created on first use)
    :returns: True when every batch was sent, False on any error
    """
    try:
        with self._hb_pool.connection() as connection:
            for table_name, rows in table_rows.items():
                self._auto_create_table(connection, table_name)
                # Renamed local: the original rebound the loop variable.
                hb_table = connection.table(table_name)
                batch = hb_table.batch()
                self._puts(batch, rows)
                batch.send()
        return True
    except Exception as e:
        TDDCLogging.error(e)
        return False
def put_to_hbase(self, table, row_key, items):
    """Store a single row into HBase (table auto-created if missing).

    :param items: {'familyxxx': {'column': data},
                   'familyooo': {'column': data}};
        list/dict values are JSON-encoded before writing
    :returns: True on success, False on any error (error is logged)
    """
    try:
        with self._hb_pool.connection() as connection:
            self._auto_create_table(connection, table)
            # Renamed local: the original rebound the 'table' parameter.
            hb_table = connection.table(table)
            for family, data in items.items():
                cf_fmt = family + ':'
                values = {}
                for column, value in data.items():
                    if isinstance(value, (dict, list)):
                        value = json.dumps(value)
                    values[cf_fmt + column] = value
                hb_table.put(row_key, values)
        return True
    except Exception as e:
        TDDCLogging.error(e)
        return False
def _connect(self):
    """Open a thrift connection to a randomly chosen host:port from the pool.

    On failure the dead endpoint is removed from the pool; when the pool
    is exhausted it is refilled from the full host list after a 30s
    pause. Either way a reconnect is attempted afterwards.
    """
    try:
        self._current_host_port = random.choice(
            self._host_ports_pool).split(':')
        # NOTE(review): the port is passed as a string from split(':') —
        # presumably TSocket accepts that; confirm before changing.
        self._sock = TSocket.TSocket(host=self._current_host_port[0],
                                     port=self._current_host_port[1])
        self._transport = TTransport.TFramedTransport(self._sock)
        self._protocol = TCompactProtocol(self._transport)
        self._client = THBaseService.Client(self._protocol)
        self._transport.open()
    except Exception as e:
        TDDCLogging.error(e)
        current_host_port = ':'.join(self._current_host_port)
        self._host_ports_pool.remove(current_host_port)
        if len(self._host_ports_pool) > 0:
            TDDCLogging.warning(
                'HBase Server Exception. Now Is Reconnecting.')
        else:
            TDDCLogging.warning(
                'HBase Server Fatal Error. Please Check It.')
            gevent.sleep(30)
            self._host_ports_pool = list(self._host_ports)
            TDDCLogging.warning('Retry Connecting HHase.')  # sic: typo kept
        self._reconnect()
# NOTE(review): this gevent.sleep(5) is the tail of a definition that starts
# before this chunk; its enclosing scope is not visible from here.
gevent.sleep(5)

def _event_parse(self, record):
    # Decode one consumed record (JSON in record.value) into an event object
    # and queue it for _dispatch. Malformed payloads are reported through
    # _consume_msg_exp instead of raising.
    try:
        item = json.loads(record.value)
    except Exception, e:
        self._consume_msg_exp('EVENT_JSON_ERR', record.value, e)
    else:
        if item and isinstance(item, dict):
            event_type = item.get('event_type')
            if not event_type:
                self._consume_msg_exp('EVENT_ERR', item)
                return
            # Look up the event class registered for this type.
            cls = self._events_cls.get(event_type)
            if not cls:
                TDDCLogging.error('Undefine Event Type: %d <%s>'
                                  % (event_type, json.dumps(item)))
                return
            event = cls(**item)
            self._event_queue.put(event)
        else:
            self._consume_msg_exp('EVENT_ERR', item)

def _dispatch(self):
    # Event loop: pull events and invoke the callback registered for each
    # event_type; unregistered types are logged and dropped.
    while True:
        event = self._event_queue.get()
        callback = self._event_call.get(event.event_type, None)
        if callback:
            callback(event)
        else:
            TDDCLogging.warning('Event Exception: %d Not Register.'
                                % event.event_type)