def __UA_IP(self, signal):
    '''
    Request fake UA/IP data and arm the one-shot "get task" timer.

    Two SockCUA_IP packages are sent: the first with state 1 and
    num 5, the second with state 2 (num unchanged). Unless a timer
    has already been created, a 10 second threading.Timer is then
    started; when it fires it sends a SockHead request with type 1
    and opcode 20000.

    signal: the triggering signal; it is not read here.
    '''
    ua_ip = SockCUA_IP()
    ua_ip.state = 1
    ua_ip.num = 5
    self.__send(ua_ip.make_package())
    ua_ip.state = 2
    self.__send(ua_ip.make_package())
    if self.heart_cnt % 3 == 2:
        # Log only on every third heartbeat count to limit output.
        print_plus('send UA IP')

    # The timer is created at most once per instance.
    if self.task_timer:
        return

    def task():
        ''' timer callback: send the "get task" request '''
        package = SockHead()
        package.type = 1
        package.opcode = 20000
        self.__send(package.make_package())
        print_plus('send get task')

    timer = threading.Timer(10, task)
    # The original called setName(True) on the timer and setName(...)
    # on self, which named the timer "True", never made it a daemon and
    # renamed the owner instead. Configure the timer itself.
    timer.setDaemon(True)
    timer.setName('get_task_timer')
    self.task_timer = timer
    timer.start()
def __cookies(self, package):
    '''
    Handle a received cookies package.

    The raw package is parsed with SockSCookies and queued through
    __add_signal as a crawler_model / cookies signal.
    '''
    parsed = SockSCookies(package)
    cookie_signal = KidSignal(
        sub_model_opcode=SubModelOpcode.crawler_model,
        opcode=CrawlerOpcode.cookies,
        data=parsed,
    )
    self.__add_signal(cookie_signal)
    print_plus('recv SockSCookies')
def __storage(self, dic):
    '''
    Store one crawled item and queue a storage report.

    dic: mapping with a 'task' entry and an 'item' entry. The item
         may be None; otherwise item['basic']['key'] is read.

    Task state codes sent through __task_state_feedback: 10 when there
    is no item or the write fails, 5 before writing, 6 after a
    successful write (meanings of the codes are defined elsewhere).
    Once storage_list holds 10 or more entries, __send_msg is called.
    '''
    task = dic['task']
    item = dic['item']
    if item is None:
        self.__task_state_feedback(task, 10)
        return

    key = item['basic']['key']
    self.__task_state_feedback(task, 5)

    table_name = str(task.attr_id)
    pid = str(task.pid)
    if kid_setting.CRAWLER_TYPE == 2:
        table_name = table_name + '_detail'

    written = storage_manager.write_data(item, table_name, pid, task.storage)
    if not written:
        print_plus('Storage result failed', level=2)
        self.__task_state_feedback(task, 10)
        return

    self.__task_state_feedback(task, 6)
    if task.storage == 2:  # FTP
        # For FTP storage the reported name also carries the pid.
        table_name = table_name + '/' + pid

    report = {
        'job_id': task.job_id,
        'attr_id': task.attr_id,
        'table': table_name,
        'key': key,
        'depth': task.depth,
        'cur_depth': task.cur_depth,
        'storage': task.storage,
    }
    self.storage_list.append(report)
    if len(self.storage_list) >= 10:
        self.__send_msg()
def __get_cookies(self, signal):
    '''
    Send a cookies request.

    Builds a SockCCookies package with amount 5 and attr_id taken
    from signal.data, logs it and sends it.
    '''
    request = SockCCookies()
    request.amount = 5
    request.attr_id = signal.data
    payload = request.make_package()
    print_plus('send SockCCookies:%d' % request.attr_id)
    self.__send(payload)
def __manager_registting(self, signal):
    '''
    Send the manager registration package.

    signal: the triggering signal; it is not read here.
    '''
    payload = SockCManagerREG().make_package()
    self.__send(payload)
    print_plus('send SockCManagerREG')
def __device_info(self, signal):
    '''
    Send a device info package.

    signal: the triggering signal; it is not read here.
    '''
    payload = SockCDeviceInfo().make_package()
    self.__send(payload)
    print_plus('send SockCDeviceInfo')
def __task_num(self, signal):
    '''
    Report the task number.

    signal.data is copied into a SockCTaskNum package, which is then
    sent and logged.
    '''
    report = SockCTaskNum()
    report.task_num = signal.data
    payload = report.make_package()
    self.__send(payload)
    print_plus('send SockCTaskNum: %d' % report.task_num)
def __task_amount(self, signal):
    '''
    Send a crawling-amount package.

    signal: the triggering signal; it is not read here.
    '''
    payload = SockCCrawlingAmount().make_package()
    self.__send(payload)
    print_plus('send SockCCrawlingAmount')
def __heart(self, signal):
    '''
    Send a heartbeat.

    signal.data is sent as-is. heart_cnt is incremented on every
    call, and a log line is written when heart_cnt % 3 == 2.
    '''
    should_log = self.heart_cnt % 3 == 2
    if should_log:
        print_plus('send heart')
    self.heart_cnt = self.heart_cnt + 1
    self.__send(signal.data)
def __manager_reg_success(self, package):
    '''
    Handle a successful manager registration reply.

    Stores the manager id and token from the parsed SockSManagerREG
    on current_device_manager, then calls __getting_UA_IP.
    '''
    reply = SockSManagerREG(package)
    current_device_manager.set_manager_id(reply.manager_id)
    current_device_manager.set_token(reply.token)
    print_plus('recv SockSManagerREG')
    self.__getting_UA_IP()
def __task_process(self, package):
    '''
    Handle a received crawling-amount package.

    Parses the package with SockSCrawlingAmount and prints its
    opcode, manager_id and job_id, one per line.
    '''
    progress = SockSCrawlingAmount(package)
    for field_name in ('opcode', 'manager_id', 'job_id'):
        print(getattr(progress, field_name))
    print_plus('recv SockSCrawlingAmount')
def __sock_work(self, signal):
    '''
    Dispatch a signal to its send method.

    signal.opcode is looked up in send_selector. An unknown opcode is
    logged as 'sock_err'; a known opcode whose entry is falsy is
    silently ignored.
    '''
    opcode = signal.opcode
    if opcode not in self.send_selector:
        print_plus('sock_err', file_line=True, level=2)
        return
    handler = self.send_selector[opcode]
    if handler:
        handler(signal)
def __send(self, package):
    '''
    Send a package to the server.

    Nothing is sent when self.caller.caller or package is falsy.
    Any exception raised while sending is logged, not propagated.
    '''
    if not (self.caller.caller and package):
        return
    try:
        handle = self.caller.caller.transport.getHandle()
        handle.sendall(package)
    except Exception as e:
        print_plus('send except:%s' % e, file_line=True, level=2)
def __distribute_task(self, package):
    '''
    Handle a received task distribution package.

    The raw package is parsed with SockSDistributeTask and queued
    through __add_signal as a crawler_model / task_distribute signal.
    '''
    parsed = SockSDistributeTask(package)
    task_signal = KidSignal(
        sub_model_opcode=SubModelOpcode.crawler_model,
        opcode=CrawlerOpcode.task_distribute,
        data=parsed,
    )
    self.__add_signal(task_signal)
    print_plus('recv SockSDistributeTask')
def __hbase_daemon(self): cnt = 0 while True: if cnt >= 5: try: self.client.getTableNames() except Exception, e: print_plus(content='Hbase Check Alive Failed', level=1) print_plus(content=e, level=1) self.__reconnect_hbase() finally:
def get_row(self, table=None, rowkey=None):
    '''
    Fetch one row.

    Returns the result of client.getRow(table, rowkey, None).
    Returns None when self.state is falsy, when table or rowkey is
    missing, or when the lookup raises (the error is logged).
    '''
    if not self.state:
        return None
    if not (table and rowkey):
        return None
    try:
        return self.client.getRow(table, rowkey, None)
    except Exception as e:
        print_plus('GetRowExcept: table:%s %s' % (table, e), level=2)
    return None
def __getting_device_info(self, package):
    '''
    Handle a received device info request.

    Parses and prints the package, then queues a sock_send_model /
    c_device_info signal through __add_signal.
    '''
    request = SockSDeviceInfo(package)
    print(request)
    print_plus('recv SockSDeviceInfo')
    # TODO: send the device's current usage
    reply_signal = KidSignal(
        sub_model_opcode=SubModelOpcode.sock_send_model,
        opcode=KidSockOpcode.c_device_info,
    )
    self.__add_signal(reply_signal)
def get(self, table=None, rowkey=None):
    '''
    Fetch one row.

    Returns the result of client.getRow(table, rowkey, None).
    Returns HbaseManager.ConnectError when self.state is falsy,
    HbaseManager.GetError when the lookup raises (the error is
    logged), and None when table or rowkey is missing.
    '''
    if not self.state:
        return HbaseManager.ConnectError
    if not (table and rowkey):
        return None
    try:
        return self.client.getRow(table, rowkey, None)
    except Exception as e:
        message = 'GetRowExcept: table:%s %s' % (table, e)
        print_plus(content=message, level=1)
        return HbaseManager.GetError
def __write(self, table, key, value):
    '''
    Write one row.

    value: two-level mapping; each column name is built as
           'outer_key:inner_key' and takes the inner value
           (presumably column family / qualifier - confirm).

    All mutations go out in a single client.mutateRow call. Errors
    from that call are logged, not propagated.
    '''
    mutations = [
        Mutation(column='%s:%s' % (outer, inner), value=cell)
        for outer, columns in value.items()
        for inner, cell in columns.items()
    ]
    try:
        self.client.mutateRow(table, key, mutations, None)
        print_plus('write to hbase success: %s<=>%s' % (table, key))
    except Exception as e:
        print_plus(e, level=2)
def __init__(self, host=None, port=None, timeout=15000):
    '''
    Build the Thrift client stack and try to open the connection.

    host, port: passed to TSocket.TSocket.
    timeout:    passed to TSocket.setTimeout (milliseconds per the
                Thrift TSocket API - confirm).

    self.state is 1 when the transport opened and 0 otherwise. A
    failed open is logged and never raised to the caller.
    '''
    self.state = 0
    raw_socket = TSocket.TSocket(host, port)
    raw_socket.setTimeout(timeout)
    self.transport = TTransport.TBufferedTransport(raw_socket)
    self.protocol = TBinaryProtocol.TBinaryProtocol(self.transport)
    self.client = Hbase.Client(self.protocol)
    try:
        self.transport.open()
        self.state = 1
        print_plus('HBase Init Succssed')
    except Exception as e:
        # A bare except would also swallow KeyboardInterrupt/SystemExit
        # and hide the reason for the failure; log the cause instead.
        print_plus('HBase Init Failed', level=1)
        print_plus(e, level=1)
def main():
    '''
    Start the crawler client.

    Sets the process title, connects the Twisted reactor to the
    configured server with a KidClientFactory, sets the socket send
    buffer to 4096 * 100 bytes, logs the settings and runs the reactor.
    '''
    setproctitle.setproctitle(kid_setting.CLIENT_PROC_NAME)
    factory = KidClientFactory()
    reactor.__init__()  # @UndefinedVariable
    reactor.suggestThreadPoolSize(25)  # @UndefinedVariable
    connector = reactor.connectTCP(  # @UndefinedVariable
        kid_setting.SERVER_IP, kid_setting.SERVER_PORT, factory)
    handle = connector.transport.getHandle()
    handle.setsockopt(socket.SOL_SOCKET, socket.SO_SNDBUF, 4096 * 100)
    settings = (kid_setting.SERVER_IP, kid_setting.SERVER_PORT,
                kid_setting.HBASE_HOST, kid_setting.CRAWLER_TYPE)
    print_plus('IP:%s\tPort:%s\tHBase IP:%s\tCrawler Type:%d' % settings)
    reactor.run()  # @UndefinedVariable
def __manage_task(self, signal):
    '''
    Accept distributed tasks and start executing them.

    Tasks in signal.data.tasks whose job_id is not yet in
    task_id_list are added to task_pool. State 2 is fed back for
    every task in the package, including ones already known.
    Errors from __exec_task are logged, not propagated.
    '''
    incoming = signal.data.tasks
    for job in incoming:
        if job.job_id not in self.task_id_list:
            self.task_pool.append(job)
            self.task_id_list.add(job.job_id)
    for job in incoming:
        self.__task_state_feedback(job, 2)
    # NOTE(review): this adds the size of the whole pool, not just the
    # newly accepted tasks - confirm that is intended.
    self.task_cnt += len(self.task_pool)
    try:
        self.__exec_task()
    except Exception as e:
        print_plus('%s' % e, True, True, 2)
def __timer_manager(self):
    '''
    One-second tick loop driving the periodic jobs.

    Runs while kid_setting.CONNECT is truthy and self.is_alive().
    On each tick, keyed on the tick counter:
      every 59: log that the loop is alive
      every 29: clear cookies_request_list
      every 4:  flush storage_list through __send_msg if non-empty
      every 14: report the scheduler queue length (c_task_num) and
                request UA/IP data (c_achieve_UA_IP)
    '''
    tick = 1
    while kid_setting.CONNECT and self.is_alive():
        if tick % 59 == 0:
            print_plus('__timer_manager is alive')
        if tick % 29 == 0:
            self.cookies_request_list = []
        if tick % 4 == 0 and len(self.storage_list) > 0:
            self.__send_msg()
        if tick % 14 == 0:
            scheduler = self.spider.crawler.engine.slot.scheduler
            queue = scheduler.mqs.queues.get(0, None)
            # Changed to report the current number of queued tasks.
            # (Earlier code, since disabled, reported
            # kid_setting.DEVICE_MAX_TASK minus the queue length when
            # the queue was shorter than that maximum.)
            pending = len(queue.q) if queue else 0
            count_signal = KidSignal(
                sub_model_opcode=SubModelOpcode.sock_send_model,
                opcode=KidSockOpcode.c_task_num,
                data=pending)
            self.caller.add_signal(count_signal)
            ua_ip_signal = KidSignal(
                sub_model_opcode=SubModelOpcode.sock_send_model,
                opcode=KidSockOpcode.c_achieve_UA_IP)
            self.caller.add_signal(ua_ip_signal)
        time.sleep(1)
        tick += 1
def __create_table(self, table, columns_name):
    '''
    Create a table if it does not exist yet.

    The cached self.tables list is checked first; on a miss it is
    refreshed from the server and checked again. A missing table is
    created with one ColumnDescriptor per entry of columns_name
    (name 'entry:', maxVersions 1).

    Returns None when the table exists or was created,
    HbaseManager.CreateError when createTable raises, and
    HbaseManager.ConnectError for any other exception.
    '''
    try:
        if table in self.tables:
            return None
        # Cache miss: the table may have been created elsewhere.
        self.tables = self.client.getTableNames()
        if table in self.tables:
            return None
        descriptors = [
            ColumnDescriptor(name='%s:' % column_name, maxVersions=1)
            for column_name in columns_name
        ]
        try:
            self.client.createTable(table, descriptors)
        except Exception as e:
            print_plus(content=e, level=1)
            return HbaseManager.CreateError
    except Exception as e:
        print_plus(content=e, level=1)
        return HbaseManager.ConnectError
def __data_saved_info(self, signal):
    '''
    Report stored data to the server.

    signal.data: iterable of dicts with the keys 'key', 'table',
                 'job_id', 'attr_id', 'depth', 'cur_depth' and
                 'storage'.

    Each dict becomes one StorageInfo in a SockCDataSaveInfo package.
    If any entry has storage == 3, the package opcode is switched to
    KidSockOpcode.c_ftp_save_info.
    '''
    report = SockCDataSaveInfo()
    for entry in signal.data:
        record = StorageInfo()
        record.key = entry['key']
        record.name = entry['table']
        record.job_id = entry['job_id']
        record.attr_id = entry['attr_id']
        record.depth = entry['depth']
        record.cur_depth = entry['cur_depth']
        report.storages.append(record)
        if entry['storage'] == 3:
            report.opcode = KidSockOpcode.c_ftp_save_info
    payload = report.make_package()
    self.__send(payload)
    sizes = (len(report.storages), len(payload))
    print_plus('send SockCDataSaveInfo [%d][%d]' % sizes)
def __init__(self, host=None, port=None, timeout=15000):
    '''
    Build the Thrift client stack, try to connect and start the
    connection-check daemon thread.

    host, port: passed to TSocket.TSocket.
    timeout:    passed to TSocket.setTimeout (milliseconds per the
                Thrift TSocket API - confirm).

    self.state is 1 when the transport opened and the table list was
    read, 0 otherwise. A failed connection is logged and never raised
    to the caller. The 'hbase_daemon' thread is started either way.
    '''
    self.state = 0
    # Always define the table cache, so code that reads self.tables
    # does not hit AttributeError when the first connection attempt
    # failed and the connection is only restored later.
    self.tables = []
    self.sock = TSocket.TSocket(host, port)
    self.sock.setTimeout(timeout)
    self.transport = TTransport.TBufferedTransport(self.sock)
    self.protocol = TBinaryProtocol.TBinaryProtocol(self.transport)
    self.client = Hbase.Client(self.protocol)
    try:
        self.transport.open()
        self.state = 1
        self.tables = self.client.getTableNames()
        print_plus(content='HBase Init Succssed')
    except Exception as e:
        # A bare except would also swallow KeyboardInterrupt/SystemExit
        # and hide the reason for the failure; log the cause instead.
        self.state = 0
        print_plus(content='HBase Init Failed', level=1)
        print_plus(content=e, level=1)
    finally:
        self.connection_check_thread = threading.Thread(
            target=self.__hbase_daemon, name='hbase_daemon')
        self.connection_check_thread.setDaemon(True)
        self.connection_check_thread.start()
def __reconnect_hbase(self):
    '''
    Close and reopen the transport.

    self.state becomes 1 when the reopen succeeds and 0 when it
    fails; a failure is logged together with its cause.
    '''
    link = self.transport
    link.close()
    try:
        link.open()
        self.state = 1
        print_plus(content='HBase Init Succssed')
    except Exception as e:
        for detail in ('HBase Init Failed', e):
            print_plus(content=detail, level=1)
        self.state = 0
def task():
    '''
    Send a "get task" request: a SockHead with type 1 and opcode
    20000. Uses `self` from the enclosing scope.
    '''
    request = SockHead()
    request.type = 1
    request.opcode = 20000
    payload = request.make_package()
    self.__send(payload)
    print_plus('send get task')
self.client.scannerClose(scanner) return results def get_row(self, table=None, rowkey=None): ''' get row ''' if not self.state: return if table and rowkey: try: return self.client.getRow(table, rowkey, None) except Exception,e: print_plus('GetRowExcept: table:%s %s' % (table, e), level=2) else: print_plus('get row error', level=2) def __create_table(self, table, columns_name): ''' create table ''' tables = self.client.getTableNames() if table not in tables: cols = [] for column_name in columns_name: col = ColumnDescriptor(name='%s:'%column_name, maxVersions=1) cols.append(col) try: self.client.createTable(table, cols) except Exception,e: print e
def __error(self, package):
    '''
    Handle a received error package: parse it with SockSError and
    log its error_code.
    '''
    parsed = SockSError(package)
    print_plus('sock error:%d' % parsed.error_code)