コード例 #1
0
 def create_table_to_hbase(self, table, families):
     '''
     Create an HBase table through a connection taken from the pool.

     params:
         table: name of the table to create.
         families: column-family specification, passed unchanged to
             ``connection.create_table``.

     returns:
         True when the table was created, False when any exception was
         raised (the exception is logged, not propagated).
     '''
     try:
         with self._hb_pool.connection() as connection:
             connection.create_table(table, families)
     except Exception as e:
         TDDCLogging.error(e)
         return False
     # Report success explicitly, like put_to_hbase / puts_to_hbase do.
     return True
コード例 #2
0
 def _parse(self):
     '''
     Endless worker loop: take (task, body) pairs from
     ``ParserQueues.WAITING_PARSE`` and run the matching parse model.

     For each pair the parse model is looked up by the task's platform and
     feature. When no model matches, a warning is logged and the pair is
     dropped. When the model raises, the error is logged and the pair is
     dropped. Otherwise the model's ``items`` are handed to
     ``self._storage``, its ``tasks`` to ``self._new_task_push``, and the
     task is put on ``ParserQueues.TASK_STATUS``.
     '''
     while True:
         task, body = ParserQueues.WAITING_PARSE.get()
         cls = self._rules_updater.get_parse_model(task.platform,
                                                   task.feature)
         if not cls:
             fmt = 'Parse No Match: [P:{platform}][F:{feature}][K:{row_key}]'
             TDDCLogging.warning(
                 fmt.format(platform=task.platform,
                            feature=task.feature,
                            row_key=task.row_key))
             continue
         try:
             # Instantiating the model performs the parse.
             ret = cls(task, body)
         except Exception as e:
             TDDCLogging.error(e)
             continue
         self._storage(task, ret.items)
         self._new_task_push(ret.tasks)
         fmt = 'Parsed: [{platform}:{row_key}:{feature}][S:{items}][N:{tasks}]'
         TDDCLogging.info(
             fmt.format(platform=task.platform,
                        feature=task.feature,
                        row_key=task.row_key,
                        items=len(ret.items),
                        tasks=len(ret.tasks)))
         ParserQueues.TASK_STATUS.put(task)
コード例 #3
0
ファイル: storager_base.py プロジェクト: slmzhi/tddc
 def _push(self):
     '''
     Endless worker loop: drain ``PublicQueues.STORAGE`` and write the
     collected rows to HBase through ``self._db.puts_to_hbase``.

     Rows are grouped as ``{table_name: {row_key: items}}`` where the table
     name is ``task.platform + BaseSite.PLATFORM_SUFFIX``. After each write
     attempt ``self._pushed`` is called with the rows and a success flag,
     and the accumulated rows are discarded. Any exception raised inside
     one iteration is logged and the loop continues.
     '''
     cnt = 0
     platform_rows = {}
     while True:
         try:
             task, storage_info = PublicQueues.STORAGE.get()
             items = {
                 self.FAMILY: storage_info,
                 'task': {
                     'task': task.to_json()
                 }
             }
             table_name = task.platform + BaseSite.PLATFORM_SUFFIX
             platform_rows.setdefault(table_name, {})[task.row_key] = items
             cnt += 1
             # Keep accumulating only while more work is queued AND the
             # running count is a multiple of 5; otherwise flush now.
             # NOTE(review): as written this flushes after most single
             # items - confirm this is the intended batching rule.
             if PublicQueues.STORAGE.qsize() and not cnt % 5:
                 gevent.sleep(0.01)
                 continue
             pushed = bool(self._db.puts_to_hbase(platform_rows))
             self._pushed(platform_rows, pushed)
             if not pushed:
                 # Back off briefly after a failed write.
                 gevent.sleep(1)
             platform_rows = {}
         except Exception as e:
             TDDCLogging.error(e)
コード例 #4
0
 def _consume_msg_exp(self, exp_type, info, exception=None):
     '''
     Log a framed error report for a message that could not be consumed.

     params:
         exp_type: error tag; only tags containing 'JSON_ERR', 'TASK_ERR'
             or 'EVENT_ERR' produce output, anything else is ignored.
         info: the offending payload (raw value or decoded item).
         exception: optional exception whose text is included in the
             'JSON_ERR' report.
     '''
     header = '*' * 5 + exp_type + '*' * 5
     footer = '*' * (10 + len(exp_type)) + '\n'
     if 'JSON_ERR' in exp_type:
         if exception is None:
             detail = ''
         else:
             # ``message`` only exists on Python 2 exceptions; fall back to
             # str() so the report never fails on the error path itself.
             detail = getattr(exception, 'message', '') or str(exception)
         # %-formatting accepts non-string payloads, unlike ``+``.
         TDDCLogging.error('%s\nException: %s\n%s\n%s' %
                           (header, info, detail, footer))
     elif 'TASK_ERR' in exp_type or 'EVENT_ERR' in exp_type:
         TDDCLogging.error(header + '\nException: ' +
                           'item={item}\n'.format(item=info) +
                           'item_type={item_type}\n'.format(
                               item_type=type(info)) + footer)
コード例 #5
0
 def _rules_update(self):
     '''
     Endless worker loop: take rule updates from
     ``ProxyCheckerQueues.RULES_MOULDS_UPDATE`` and register the classes
     they name.

     For every class name in ``rule.moulds`` the class is looked up in the
     module ``rule.package``. A missing class is logged and skipped; a
     found class is stored in ``self._rules_moulds``.
     '''
     while True:
         rule = ProxyCheckerQueues.RULES_MOULDS_UPDATE.get()
         print(rule.platform, rule.package, rule.moulds)
         for cls_name in rule.moulds:
             module = importlib.import_module(rule.package)
             # Default to None so a missing class reaches the error branch
             # below instead of raising AttributeError.
             cls = getattr(module, cls_name, None)
             if not cls:
                 TDDCLogging.error('Exception: import rule failed: ' +
                                   cls_name)
                 continue
             # NOTE(review): both keys are cls.proxy_type - confirm the
             # outer key is not meant to be something else (e.g. platform).
             self._rules_moulds[cls.proxy_type][cls.proxy_type] = cls
コード例 #6
0
 def get_from_hbase(self, table, row_key, family=None, qualifier=None):
     '''
     Read one row from an HBase table, restricted to a single column
     family or a single ``family:qualifier`` column.

     params:
         table: table name.
         row_key: key of the row to read.
         family: column family; required.
         qualifier: optional column qualifier within the family.

     returns:
         (True, row) on success, where row is whatever ``table.row``
         returns; (False, None) when no family was given or when any
         exception was raised (the exception is logged).
     '''
     if not family:
         # Nothing to select: do not take a pooled connection at all.
         return False, None
     try:
         cf = family + ':' + qualifier if qualifier else family
         with self._hb_pool.connection() as connection:
             return True, connection.table(table).row(row_key, columns=[cf])
     except Exception as e:
         TDDCLogging.error(e)
         return False, None
コード例 #7
0
 def send_mail(subject, content):
     '''
     Send a plain-text UTF-8 mail to ``MonitorSite.MAIL_TO`` over SMTP/SSL,
     at most once every 60 seconds.

     params:
         subject: mail subject.
         content: mail body.

     returns:
         True when the mail was handed to the server; False when the call
         was throttled or when login/sending failed (the error is logged).
         Errors while opening the SMTP connection still propagate.
     '''
     cur_time = time.time()
     if EMailManager.last_send_time > cur_time - 60:
         return False
     msg = MIMEText(content, 'plain', 'utf-8')
     msg['Subject'] = subject
     msg['From'] = MonitorSite.MAIL_USER
     msg['To'] = ';'.join(MonitorSite.MAIL_TO)
     server = smtplib.SMTP_SSL(MonitorSite.MAIL_HOST, MonitorSite.MAIL_PORT)
     try:
         server.login(MonitorSite.MAIL_USER, MonitorSite.MAIL_PWD)
         server.sendmail(MonitorSite.MAIL_USER, MonitorSite.MAIL_TO,
                         msg.as_string())
     except Exception as e:
         TDDCLogging.error(e)
         return False
     finally:
         # Always release the connection; fall back to a hard close when
         # the polite QUIT cannot be delivered.
         try:
             server.quit()
         except Exception:
             server.close()
     # Record the send time so the 60-second throttle above takes effect.
     EMailManager.last_send_time = cur_time
     return True
コード例 #8
0
ファイル: hbase_manager.py プロジェクト: slmzhi/tddc
 def _keep_alive(self):
     '''
     Heartbeat loop: every 15 seconds, while the connection is marked
     healthy, read row 'ping' of table 'keep_alive' through ``self.get``.

     A failed read (or a ``TTransportException``) marks the connection as
     down by clearing ``self._status`` and, when hosts remain in
     ``self._host_ports_pool``, triggers ``self._reconnect``. The loop
     returns if the status was already cleared when the transport error
     is handled. Any other exception is logged and the loop continues.
     '''
     while True:
         gevent.sleep(15)
         try:
             if self._status and not self.get('keep_alive', 'ping')[0]:
                 raise TTransportException
         except TTransportException as e:
             if not self._status:
                 return
             TDDCLogging.error(e)
             self._status = False
             if self._host_ports_pool:
                 self._reconnect()
         except Exception as e:
             TDDCLogging.error(e)
コード例 #9
0
 def _push_parse_task(self):
     '''
     Endless worker loop: take (task, status) pairs from
     ``CrawlerQueues.PARSE`` and publish a copy of each task to
     ``CrawlerSite.PARSE_TOPIC``.

     The copy is made before the original task's status is set to
     ``Task.Status.CRAWL_SUCCESS``, so the published copy keeps the
     previous status. On a successful push the copy is also put on
     ``CrawlerQueues.TASK_STATUS_REMOVE`` and the success counters are
     incremented.
     '''
     TDDCLogging.info('--->Parse Task Producer Was Ready.')
     while True:
         task, status = CrawlerQueues.PARSE.get()
         # Validate before touching task.__dict__ / task.status.
         if not isinstance(task, Task):
             TDDCLogging.error('Parse Task Exception: Not A Task(%r).' %
                               (task,))
             continue
         tmp = Task(**task.__dict__)
         task.status = Task.Status.CRAWL_SUCCESS
         if not self._push_task(CrawlerSite.PARSE_TOPIC, tmp):
             TDDCLogging.error('[%s:%s] Push Parse Task Failed.' %
                               (task.platform, task.row_key))
         else:
             CrawlerQueues.TASK_STATUS_REMOVE.put(tmp)
             TDDCLogging.debug('[%s:%s] Crawled Successed(%d).' %
                               (task.platform, task.row_key, status))
             self._successed_num += 1
             self._successed_pre_min += 1
コード例 #10
0
ファイル: hbase_manager.py プロジェクト: slmzhi/tddc
 def get(self, table_name, row_key, family=None, qualifier=None):
     '''
     Read one row through the Thrift client.

     params:
         table_name: table to read from.
         row_key: key of the row.
         family: optional column family to restrict the read to.
         qualifier: optional qualifier; only used together with family.

     returns:
         (True, result) on success, where result is what
         ``self._client.get`` returned; (False, None) when the connection
         is marked down or the call raised (the exception is logged).
     '''
     if not self._status:
         TDDCLogging.warning(
             '[Get Operation Was Failed] HBase Server Is Exception.')
         return False, None
     tget = TGet()
     tget.row = row_key
     if family:
         tc = TColumn()
         tc.family = family
         if qualifier:
             tc.qualifier = qualifier
         tget.columns = [tc]
     try:
         ret = self._client.get(table_name, tget)
     except Exception as e:
         TDDCLogging.error(e)
         return False, None
     # Same (ok, value) shape as the failure paths; _keep_alive indexes [0].
     return True, ret
コード例 #11
0
ファイル: hbase_manager.py プロジェクト: slmzhi/tddc
 def put(self, table, row_key, items=None):
     '''
     Write one row through the Thrift client.

     params:
         table: table to write to.
         row_key: key of the row.
         items: mapping ``{family: {qualifier: value}}``; list and dict
             values are stored as JSON text. None is treated as empty.

     returns:
         True when the put succeeded; False when the connection is marked
         down or the call raised (the exception is logged).

     raises:
         TypeError: when a family's value is not a dict.
     '''
     if not self._status:
         TDDCLogging.warning(
             '[Put Operation Was Failed] HBase Server Is Exception.')
         return False
     cvs = []
     # ``items`` defaults to None; guard so the default does not crash.
     for family, info in (items or {}).items():
         if not isinstance(info, dict):
             raise TypeError('family %r must map to a dict' % (family,))
         for k, v in info.items():
             if isinstance(v, (list, dict)):
                 v = json.dumps(v)
             cvs.append(TColumnValue(family, k, v))
     tp = TPut(row_key, cvs)
     try:
         self._client.put(table, tp)
     except Exception as e:
         TDDCLogging.error(e)
         return False
     return True
コード例 #12
0
 def _pull(self):
     '''
     Endless worker loop: take tasks from ``ParserQueues.PARSE``, load
     the 'source:content' column of the task's row from HBase and forward
     (task, value) to ``ParserQueues.WAITING_PARSE``.

     Falsy tasks are skipped; tasks without platform or row key are logged
     and skipped. When the read fails the task is re-queued after a
     one-second pause. Only the first cell of a non-empty result is
     forwarded.
     '''
     while True:
         task = ParserQueues.PARSE.get()
         if not task:
             continue
         if not (task.platform and task.row_key):
             message = 'Task Exception(Parse DB Manager): [%s:%s]' % (
                 task.platform, task.row_key)
             TDDCLogging.error(message)
             continue
         table_name = task.platform + ParserSite.PLATFORM_SUFFIX
         fetched, row = self._db.get_from_hbase(table_name, task.row_key,
                                                'source', 'content')
         if not fetched:
             # Read failed: retry this task later.
             ParserQueues.PARSE.put(task)
             gevent.sleep(1)
         elif row:
             first_cell = next(iter(row.items()), None)
             if first_cell is not None:
                 ParserQueues.WAITING_PARSE.put((task, first_cell[1]))
コード例 #13
0
ファイル: src_proxies_updater.py プロジェクト: slmzhi/tddc
 def start(self):
     '''
     Endless loop: every 10 seconds, fetch each configured proxy source
     API, parse the response and add the proxies that pass
     ``self._proxy_active_check`` to the source IP sets.

     Each entry of ``self._src_apis`` supplies 'platform', 'api' and
     'parse_mould'. HTTP proxies go to the http set; HTTPS proxies go to
     both the https and the http set. A failure while handling one source
     is logged and does not stop the loop.
     '''
     while True:
         for infos in self._src_apis:
             try:
                 platform = infos.get('platform')
                 api = infos.get('api')
                 parse_mould = infos.get('parse_mould')
                 rsp = requests.get(api)
                 if not rsp:
                     TDDCLogging.error('[TDDC_PROXY_SOURCE_UPDATER] Exception(%s): ' % platform + api)
                     continue
                 if not parse_mould:
                     TDDCLogging.error('[TDDC_PROXY_SOURCE_UPDATER] Exception: parse_mould is None.')
                     continue
                 all_ips = parse_mould(rsp.text)
                 http_ips = self._proxy_active_check(all_ips.get('HTTP', []))
                 self._ip_pool.smadd('tddc:test:proxy:ip_src:http', http_ips)
                 TDDCLogging.info('[TDDC_PROXY_SOURCE_UPDATER] Source IPS(HTTP) Growth:%d' % len(http_ips))
                 https_ips = self._proxy_active_check(all_ips.get('HTTPS', []))
                 self._ip_pool.smadd('tddc:test:proxy:ip_src:https', https_ips)
                 self._ip_pool.smadd('tddc:test:proxy:ip_src:http', https_ips)
                 TDDCLogging.info('[TDDC_PROXY_SOURCE_UPDATER] Source IPS(HTTPS) Growth:%d' % len(https_ips))
             except Exception as e:
                 # Format the exception; ``str + exception`` raises
                 # TypeError and would terminate this loop.
                 TDDCLogging.error('[TDDC_PROXY_SOURCE_UPDATER] Exception[IP_SOURCE]:%s' % (e,))
         gevent.sleep(10)
コード例 #14
0
 def puts_to_hbase(self, table_rows):
     '''
     Batch storage: write several rows to several tables.

     params:
         table_rows: mapping of table name to rows, e.g.
             {'platformxxx': {'row_key1': {'familyxxx': {'column': data},
                                           'familyooo': {'column': data}},
                              'row_key2': {'familyxxx': {'column': data},
                                           'familyooo': {'column': data}}}}

     returns:
         True when every table's batch was sent, False when any exception
         was raised (the exception is logged; batches already sent for
         earlier tables are not undone here).
     '''
     try:
         with self._hb_pool.connection() as connection:
             for table_name, rows in table_rows.items():
                 self._auto_create_table(connection, table_name)
                 batch = connection.table(table_name).batch()
                 self._puts(batch, rows)
                 batch.send()
             return True
     except Exception as e:
         TDDCLogging.error(e)
         return False
コード例 #15
0
 def put_to_hbase(self, table, row_key, items):
     '''
     Single storage: write one row to one table.

     params:
         table: table name.
         row_key: key of the row to write.
         items: mapping of column family to columns, e.g.
             {'familyxxx': {'column': data},
              'familyooo': {'column': data}}
             dict and list values are stored as JSON text.

     returns:
         True when all families were written, False when any exception was
         raised (the exception is logged).
     '''
     try:
         with self._hb_pool.connection() as connection:
             self._auto_create_table(connection, table)
             hb_table = connection.table(table)
             # One put per column family, with 'family:column' keys.
             for family, data in items.items():
                 prefix = family + ':'
                 values = {}
                 for column, value in data.items():
                     if isinstance(value, (dict, list)):
                         value = json.dumps(value)
                     values[prefix + column] = value
                 hb_table.put(row_key, values)
             return True
     except Exception as e:
         TDDCLogging.error(e)
         return False
コード例 #16
0
ファイル: hbase_manager.py プロジェクト: slmzhi/tddc
 def _connect(self):
     '''
     Open a Thrift connection to a randomly chosen 'host:port' entry of
     ``self._host_ports_pool``.

     On failure the chosen entry is removed from the pool and
     ``self._reconnect`` is called. When the pool is exhausted, the method
     waits 30 seconds and refills the pool from ``self._host_ports``
     before reconnecting.
     '''
     try:
         self._current_host_port = random.choice(
             self._host_ports_pool).split(':')
         self._sock = TSocket.TSocket(host=self._current_host_port[0],
                                      port=self._current_host_port[1])
         self._transport = TTransport.TFramedTransport(self._sock)
         self._protocol = TCompactProtocol(self._transport)
         self._client = THBaseService.Client(self._protocol)
         self._transport.open()
     except Exception as e:
         TDDCLogging.error(e)
         # The failure may have happened before a host was chosen (empty
         # pool), so the attribute can be missing or stale; never let the
         # cleanup itself raise.
         chosen = getattr(self, '_current_host_port', None) or []
         current_host_port = ':'.join(chosen)
         if current_host_port in self._host_ports_pool:
             self._host_ports_pool.remove(current_host_port)
         if len(self._host_ports_pool) > 0:
             TDDCLogging.warning(
                 'HBase Server Exception. Now Is Reconnecting.')
         else:
             TDDCLogging.warning(
                 'HBase Server Fatal Error. Please Check It.')
             gevent.sleep(30)
             self._host_ports_pool = list(self._host_ports)
             TDDCLogging.warning('Retry Connecting HHase.')
         self._reconnect()
コード例 #17
0
            gevent.sleep(5)

    def _event_parse(self, record):
        '''
        Decode one consumed record into an event object and queue it.

        ``record.value`` must be JSON for a non-empty dict with a truthy
        'event_type' that is registered in ``self._events_cls``. The
        matching class is instantiated with the dict's entries as keyword
        arguments and put on ``self._event_queue``. Every rejected record
        is reported (via ``self._consume_msg_exp`` or the error log) and
        dropped.
        '''
        try:
            item = json.loads(record.value)
        except Exception as e:
            self._consume_msg_exp('EVENT_JSON_ERR', record.value, e)
            return
        if not item or not isinstance(item, dict):
            self._consume_msg_exp('EVENT_ERR', item)
            return
        event_type = item.get('event_type')
        if not event_type:
            self._consume_msg_exp('EVENT_ERR', item)
            return
        cls = self._events_cls.get(event_type)
        if not cls:
            # %s rather than %d: the type arrives from JSON and may not be
            # numeric, and logging must not raise here.
            TDDCLogging.error('Undefine Event Type: %s <%s>' %
                              (event_type, json.dumps(item)))
            return
        self._event_queue.put(cls(**item))

    def _dispatch(self):
        '''
        Endless loop: take events from ``self._event_queue`` and pass each
        one to the callback registered for its ``event_type`` in
        ``self._event_call``. Events without a callback are reported with
        a warning.
        '''
        while True:
            event = self._event_queue.get()
            handler = self._event_call.get(event.event_type, None)
            if not handler:
                TDDCLogging.warning('Event Exception: %d Not Register.' %
                                    event.event_type)
                continue
            handler(event)