def run(self):
    '''
    Loop: read one pending task and hand it to a Downloader, sleeping a
    configurable interval between iterations.
    :return:
    '''
    # Create the process pool
    pool = Pool()
    while True:
        # Fetch one ready task and mark it as "doing"
        task = self.taskUtil.get_ready()
        # Only dispatch when we actually got a task (a stray debug "or True"
        # here used to force a dispatch on every iteration)
        if task is not None and len(task) > 0:
            Log.i('-----------------------------')
            # Start a Downloader in the process pool
            pool.apply_async(self.run_downloader, args=(task, ))
        # Sleep for n seconds (read from the config file)
        items = ConfigUtil.getItems('scheduler')
        interval_min = items['interval_min']
        interval_max = items['interval_max']
        seconds = random.randint(int(interval_min), int(interval_max))
        Log.i('Start sleep ' + str(seconds) + ' seconds')
        time.sleep(seconds)
    # Unreachable while the loop above runs forever; kept as the shutdown path
    pool.close()
    pool.join()
    Log.i('All subprocesses done.')
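# The config-driven randomized sleep above recurs in every scheduler loop in
# this section. A minimal sketch of that pattern as a reusable helper, assuming
# ConfigUtil.getItems returns a dict of strings as it does above
# (sleep_for_section is a hypothetical name, not part of the project):
import random
import time

def sleep_for_section(section):
    # Read the [section] interval bounds from the config file and sleep a
    # random number of seconds within them, to spread out request bursts.
    items = ConfigUtil.getItems(section)
    seconds = random.randint(int(items['interval_min']), int(items['interval_max']))
    Log.i('%s sleep %d seconds' % (section, seconds))
    time.sleep(seconds)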
def run(self):
    '''
    Consumer process loop: receives URL data from the pipe, deduplicates it
    against MongoDB, downloads the page, and saves the result, sleeping a
    configurable interval between iterations.
    :return:
    '''
    Log.i('ProducerUrl.run() in {0}'.format(time.ctime()))
    while True:
        # Listen for incoming data on the pipe
        DictData = self.pipeDictData.recv()
        if DictData is None:
            continue
        # Convert the raw dict into the URL entity class
        self.URL_inf.dict2class(DictData)
        # Lazily create the Mongo connection
        if self.mogodbControl is None:
            self.mogodbControl = Mongodb_Operator(DbdataCCGPDFZB["host"],
                                                  DbdataCCGPDFZB["port"],
                                                  DbdataCCGPDFZB["db_name"],
                                                  DbdataCCGPDFZB["default_collection"])
        # Lazily create the Kafka connection
        if self.KafkaOperator is None:
            self.KafkaOperator = localKafkaUrlinformation()
        # Deduplicate: look the uuid up in Mongo; a hit means it was already processed
        uuid = self.get_md5(self.URL_inf.Urlname, self.URL_inf.title)
        item = {"uuid": uuid}
        value = self.mogodbControl.findone(item, self.__Sendcollection)  # returns the document if found
        # TODO: inserting into the database still has issues
        if value is not None:
            continue
        # Fetch the home page content
        self.URL_inf = self.downLoadHtml()
        if self.URL_inf is None:
            continue
        # Save the data asynchronously
        self.savedata(self.URL_inf)
        # Sleep for n seconds (read from the config file)
        items = ConfigUtil.getItems('consumerScheduler')
        interval_min = items['interval_min']
        interval_max = items['interval_max']
        seconds = random.randint(int(interval_min), int(interval_max))
        Log.i('StartProducerUrl sleep ' + str(seconds) + ' seconds')
        time.sleep(seconds)
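# The dedup key above comes from self.get_md5, which is not shown in this
# section. A minimal sketch of what it plausibly does, assuming it hashes the
# URL and title together into the hex digest stored as Mongo's "uuid" field
# (the exact concatenation is an assumption):
import hashlib

def get_md5(self, url, title):
    # Hash URL + title so the same article seen twice maps to the same
    # uuid and gets skipped by the findone() check above.
    m = hashlib.md5()
    m.update((str(url) + str(title)).encode('utf-8'))
    return m.hexdigest()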
def run(self):
    '''
    Free-IP-proxy process loop: fetches proxies and pushes them onto the
    shared queue, sleeping a configurable interval between iterations.
    :return:
    '''
    Log.i('proxyIpPool.run() in {0}'.format(time.ctime()))
    while True:
        # Call the local and remote free-IP-proxy APIs and push the results
        # onto the IP message queue
        proxyIpPool = getIpProxyPool()
        # Unified to the local source for now
        proxyIpPoolFromeRemote = getIpProxyPool()
        # proxyIpPoolFromeRemote = getIpProxyPoolFromeRemote()
        if proxyIpPool is not None:
            self.queueDictData.put(proxyIpPool)
        if proxyIpPoolFromeRemote is not None:
            self.queueDictData.put(proxyIpPoolFromeRemote)
        # Sleep for n seconds (read from the config file)
        items = ConfigUtil.getItems('proxyIpScheduler')
        interval_min = items['interval_min']
        interval_max = items['interval_max']
        seconds = random.randint(int(interval_min), int(interval_max))
        Log.i('proxyIpPool sleep ' + str(seconds) + ' seconds')
        time.sleep(seconds)
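# On the other side of queueDictData, a downloader would pop a proxy before
# each request. A minimal sketch of that consumer, assuming queueDictData is a
# multiprocessing.Queue holding proxy dicts (the dict shape and the helper
# name next_proxy are assumptions, not confirmed by this section):
import queue

def next_proxy(queueDictData, timeout=5):
    # Block briefly for a fresh proxy; fall back to None (direct connection)
    # if the pool is empty, so the crawl does not stall.
    try:
        return queueDictData.get(timeout=timeout)
    except queue.Empty:
        return None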
def run(self):
    '''
    Producer process loop: reads the source/task URL lists and publishes them
    to Kafka, either once per run via crontab or on a configurable sleep
    interval.
    :return:
    '''
    Log.i('ProducerUrl.run() in {0}'.format(time.ctime()))
    while True:
        # Produce URLs
        if USE_SOURCEURL_TYPE is True:
            if USE_ASYNCTASK_TYPE is True:
                urlInformationList = ConfigUtil.readSourceListByParams(self.begin, self.end)
            else:
                urlInformationList = ConfigUtil.readSourceList()
        else:
            urlInformationList = ConfigUtil.readTaskList()
        if urlInformationList is None:
            continue
        for urlInfor in urlInformationList:
            data = urlInfor.class2dict()
            diststrjson = json.dumps(data)
            Log.i(diststrjson)
            KafkaOperator = kafkaUrlinformation()
            KafkaOperator.producerUrl(diststrjson)
        # Runs once per day, so no sleep is needed: driven by a crontab job
        if self.crontab == 1:
            os._exit(0)
        else:
            # Sleep for n seconds (read from the config file)
            items = ConfigUtil.getItems('producerScheduler')
            interval_min = items['interval_min']
            interval_max = items['interval_max']
            seconds = random.randint(int(interval_min), int(interval_max))
            Log.i('StartProducerUrl sleep ' + str(seconds) + ' seconds')
            time.sleep(seconds)
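# The producer relies on class2dict/dict2class to shuttle the URL entity
# through JSON and Kafka. A minimal sketch of that pair, assuming the entity
# keeps its state in plain instance attributes (UrlInformation here is a
# hypothetical stand-in for the project's entity class):
class UrlInformation(object):
    def class2dict(self):
        # Plain attributes serialize directly to a JSON-safe dict.
        return dict(self.__dict__)

    def dict2class(self, data):
        # Restore attributes from a dict received off the pipe or Kafka.
        self.__dict__.update(data)
        return self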
def simpleRun(self):
    '''
    Producer process loop: reads the real-time source list, crawls each home
    page, walks its pagination, extracts child links, deduplicates them
    against MongoDB, and publishes new ones to Kafka.
    :return:
    '''
    Log.i('ProducerUrl.run() in {0}'.format(time.ctime()))
    while True:
        # Resource check
        # KafkaOperator = kafkaUrlinformation()
        KafkaOperator = localKafkaUrlinformation()
        # if self.mogodbControl is None:
        #     self.mogodbControl = Mongodb_Operator(Dbdata["host"], Dbdata["port"], Dbdata["db_name"],
        #                                           Dbdata["default_collection"])
        # Parse the data sources
        # if USE_SOURCEURL_TYPE is True:
        #     if USE_ASYNCTASK_TYPE is True:
        #         urlInformationList = ConfigUtil.readSourceListRealTime()
        #     else:
        #         urlInformationList = ConfigUtil.readSourceList()
        # else:
        #     urlInformationList = ConfigUtil.readTaskList()
        urlInformationList = ConfigUtil.readSourceListRealTime()
        # Crawl and parse child URLs
        if urlInformationList is None:
            continue
        for urlInfor in urlInformationList:
            data = urlInfor.class2dict()
            # Fetch the home page content
            dowloadData = self.downLoadHtml(data)
            if dowloadData is None:
                continue
            # Parse out the pagination URLs
            pageData = self.getPageNumFromHome(dowloadData)
            if pageData is None:
                continue
            for pageIndex in pageData:
                # Fetch each page's content
                dowloadPageData = self.downLoadHtml(pageIndex.class2dict())
                if dowloadPageData is None:
                    continue
                # Extract child links
                # self.URL_inf.dict2class(pageIndex)
                ccgpChildrenLink = self.getChildrenLink(dowloadPageData)
                if ccgpChildrenLink is None:
                    continue
                # Publish the child links via Kafka
                for link in ccgpChildrenLink:
                    # Lazily create the Mongo connection
                    if self.mogodbControl is None:
                        self.mogodbControl = Mongodb_Operator(DbdataCCGPDFZB["host"],
                                                              DbdataCCGPDFZB["port"],
                                                              DbdataCCGPDFZB["db_name"],
                                                              DbdataCCGPDFZB["default_collection"])
                    # Deduplicate so we don't resend to Kafka and waste resources
                    if link.title is None:
                        # Skip links with an empty title
                        continue
                    uuid = self.get_md5(link.Urlname, link.title)
                    item = {"uuid": uuid}
                    value = self.mogodbControl.findone(item, self.__Sendcollection)  # returns the document if found
                    # TODO: inserting into the database still has issues
                    if value is not None:
                        # Already in the database; skip it
                        continue
                    # Yu Hao asked us not to send him parent links
                    if link.DeepNum >= 0:
                        producerData = json.dumps(link.class2dict())
                        Log.i("produce<<" + producerData)
                        KafkaOperator.producerUrl(producerData)
        # Runs once per day, so no sleep is needed: driven by a crontab job
        if self.crontab == 1:
            os._exit(0)
        else:
            # Sleep for n seconds (read from the config file)
            items = ConfigUtil.getItems('producerScheduler')
            interval_min = items['interval_min']
            interval_max = items['interval_max']
            seconds = random.randint(int(interval_min), int(interval_max))
            Log.i('StartProducerUrl sleep ' + str(seconds) + ' seconds')
            time.sleep(seconds)
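# Mongodb_Operator and its findone are project wrappers not shown in this
# section. A minimal sketch of the dedup lookup they appear to perform, written
# against pymongo directly (the field and parameter names follow the calls
# above; the wrapper's real internals are an assumption):
from pymongo import MongoClient

def find_seen(host, port, db_name, collection, uuid):
    # Return the stored document for this uuid, or None if the link is new.
    client = MongoClient(host, port)
    return client[db_name][collection].find_one({"uuid": uuid})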