コード例 #1
0
ファイル: SchedulerX.py プロジェクト: yajie100/ScrapyX
 def run(self):
     '''
     Poll for ready tasks forever and hand each one to a Downloader worker.

     Every pass asks ``self.taskUtil.get_ready()`` for a task, submits it
     to a process pool when it is non-empty, and then sleeps for a random
     number of seconds between the ``interval_min`` and ``interval_max``
     values of the ``scheduler`` config section.

     The loop only ends when an exception (e.g. KeyboardInterrupt)
     propagates; the pool is then closed and joined before the exception
     continues upwards.

     :return: never returns normally
     '''
     # Worker pool that runs the downloaders.
     pool = Pool()
     try:
         while True:
             # Fetch one pending task (get_ready presumably also marks it
             # as "doing" -- confirm against taskUtil).
             task = self.taskUtil.get_ready()
             # Only dispatch real work. The previous condition ended in
             # "or True", which made it always true and sent None/empty
             # tasks to the downloader.
             if task is not None and len(task) > 0:
                 Log.i('-----------------------------')
                 pool.apply_async(self.run_downloader, args=(task, ))
             # Sleep interval is re-read from the config on every pass.
             items = ConfigUtil.getItems('scheduler')
             interval_min = items['interval_min']
             interval_max = items['interval_max']
             seconds = random.randint(int(interval_min), int(interval_max))
             Log.i('Start sleep ' + str(seconds) + ' seconds')
             time.sleep(seconds)
     finally:
         # Previously placed after the infinite loop and therefore
         # unreachable; also used the undefined name "log".
         pool.close()
         pool.join()
         Log.i('All subprocesses done.')
コード例 #2
0
    def run(self):
        '''
        Consume URL records from the pipe forever, skipping duplicates and
        downloading and saving new ones.

        Each iteration blocks on ``self.pipeDictData.recv()``, loads the
        record into ``self.URL_inf``, lazily creates the Mongo and Kafka
        helpers, looks the record's md5 uuid up in Mongo, downloads the
        page via ``self.downLoadHtml()`` and passes the result to
        ``self.savedata``. After a record has been saved the loop sleeps
        for a random number of seconds taken from the ``consumerScheduler``
        config section; skipped records do not sleep.

        :return: never returns (infinite loop)
        '''
        Log.i('ProducerUrl.run() in {0}'.format(time.ctime()))
        while True:
            # Wait for the next record from the pipe.
            DictData = self.pipeDictData.recv()
            if DictData is None:
                continue
            # Load the raw dict into the entity object.
            self.URL_inf.dict2class(DictData)
            # Create the Mongo helper on first use.
            if self.mogodbControl is None:
                self.mogodbControl = Mongodb_Operator(
                    DbdataCCGPDFZB["host"], DbdataCCGPDFZB["port"],
                    DbdataCCGPDFZB["db_name"],
                    DbdataCCGPDFZB["default_collection"])
            # Create the Kafka helper on first use.
            if self.KafkaOperator is None:
                self.KafkaOperator = localKafkaUrlinformation()
            # Duplicate check: findone returns a document when the uuid
            # is already stored.
            record_uuid = self.get_md5(self.URL_inf.Urlname,
                                       self.URL_inf.title)
            item = {"uuid": record_uuid}
            value = self.mogodbControl.findone(item, self.__Sendcollection)
            # TODO: inserting into the database is reported as problematic.
            if value is not None:
                continue
            # Download the page content.
            downloaded = self.downLoadHtml()
            if downloaded is None:
                # Keep the current entity object: the next iteration calls
                # self.URL_inf.dict2class(), which would raise
                # AttributeError if self.URL_inf had been replaced by None.
                continue
            self.URL_inf = downloaded
            # Persist the downloaded data.
            self.savedata(self.URL_inf)

            # Sleep for a random interval read from the config file.
            items = ConfigUtil.getItems('consumerScheduler')
            interval_min = items['interval_min']
            interval_max = items['interval_max']
            seconds = random.randint(int(interval_min), int(interval_max))
            Log.i('StartProducerUrl sleep ' + str(seconds) + ' seconds')
            time.sleep(seconds)
コード例 #3
0
 def run(self):
     '''
     Refresh the free proxy-IP queue in an endless loop.

     Every pass calls ``getIpProxyPool()`` twice (the second call stands
     in for the former remote source), pushes each non-None result onto
     ``self.queueDictData`` and then sleeps for a random number of seconds
     between the ``interval_min`` and ``interval_max`` values of the
     ``proxyIpScheduler`` config section.

     :return: never returns (infinite loop)
     '''
     Log.i('proxyIpPool.run() in {0}'.format(time.ctime()))
     while True:
         # Both fetches complete before anything is queued.
         local_batch = getIpProxyPool()
         # The remote API was replaced by a second local fetch.
         substitute_batch = getIpProxyPool()
         for batch in (local_batch, substitute_batch):
             if batch is None:
                 continue
             self.queueDictData.put(batch)

         # Pause length comes from the config file on every pass.
         scheduler_cfg = ConfigUtil.getItems('proxyIpScheduler')
         raw_low, raw_high = (scheduler_cfg['interval_min'],
                              scheduler_cfg['interval_max'])
         pause = random.randint(int(raw_low), int(raw_high))
         Log.i('proxyIpPool sleep ' + str(pause) + ' seconds')
         time.sleep(pause)
コード例 #4
0
    def run(self):
        '''
        Produce URL records to Kafka in a loop.

        Each pass reads a list of URL entities from ``ConfigUtil`` (which
        reader is used depends on ``USE_SOURCEURL_TYPE`` and
        ``USE_ASYNCTASK_TYPE``), serialises every entity to JSON and sends
        it with ``kafkaUrlinformation().producerUrl``. Afterwards the
        process exits when ``self.crontab == 1``; otherwise it sleeps for a
        random number of seconds from the ``producerScheduler`` config
        section and repeats.

        :return: never returns normally (loops forever or exits the process)
        '''
        Log.i ('ProducerUrl.run() in {0}'.format(time.ctime()))
        while True:
            # Pick the URL source.
            if USE_SOURCEURL_TYPE is True:
                if USE_ASYNCTASK_TYPE is True:
                    urlInformationList = ConfigUtil.readSourceListByParams(self.begin, self.end)
                else:
                    urlInformationList = ConfigUtil.readSourceList()
            else:
                urlInformationList = ConfigUtil.readTaskList()

            # Treat "no list" as an empty batch instead of `continue`:
            # skipping the rest of the loop body bypassed both the sleep
            # and the crontab exit, so the loop spun at full speed and a
            # crontab-driven process never terminated.
            if urlInformationList is None:
                urlInformationList = []

            for urlInfor in urlInformationList:
                data = urlInfor.class2dict()
                diststrjson = json.dumps(data)
                Log.i(diststrjson)
                # NOTE(review): a new Kafka helper is built per URL; kept
                # as-is because its lifecycle is not visible here.
                KafkaOperator = kafkaUrlinformation()
                KafkaOperator.producerUrl(diststrjson)

            # When driven by crontab (one run per day) exit right away
            # instead of sleeping.
            if self.crontab==1:
                os._exit(0)
            else:
                # Sleep for a random interval read from the config file.
                items=ConfigUtil.getItems('producerScheduler')
                interval_min = items['interval_min']
                interval_max = items['interval_max']
                seconds=random.randint(int(interval_min),int(interval_max))
                Log.i('StartProducerUrl sleep ' + str(seconds) + ' seconds')
                time.sleep(seconds)
コード例 #5
0
    def simpleRun(self):
        '''
        Crawl the source pages and publish their child links to Kafka.

        Each pass reads the source list with
        ``ConfigUtil.readSourceListRealTime()``. For every source it
        downloads the home page, extracts the paging entries with
        ``self.getPageNumFromHome``, downloads each page and extracts the
        child links with ``self.getChildrenLink``. A link is sent through
        ``localKafkaUrlinformation().producerUrl`` when it has a title, its
        md5 uuid is not yet in Mongo and ``link.DeepNum >= 0``.

        Afterwards the process exits when ``self.crontab == 1``; otherwise
        it sleeps for a random number of seconds from the
        ``producerScheduler`` config section and repeats.

        :return: never returns normally (loops forever or exits the process)
        '''
        Log.i('ProducerUrl.run() in {0}'.format(time.ctime()))
        while True:
            # A fresh local Kafka helper is created for every pass.
            KafkaOperator = localKafkaUrlinformation()

            urlInformationList = ConfigUtil.readSourceListRealTime()

            # Treat "no list" as an empty batch instead of `continue`:
            # skipping the rest of the loop body bypassed both the sleep
            # and the crontab exit, so the loop spun at full speed and a
            # crontab-driven process never terminated.
            if urlInformationList is None:
                urlInformationList = []

            for urlInfor in urlInformationList:
                data = urlInfor.class2dict()

                # Download the home page of this source.
                dowloadData = self.downLoadHtml(data)
                if dowloadData is None:
                    continue
                # Extract the paging URLs from the home page.
                pageData = self.getPageNumFromHome(dowloadData)
                if pageData is None:
                    continue
                for pageIndex in pageData:
                    # Download one listing page.
                    dowloadPageData = self.downLoadHtml(pageIndex.class2dict())
                    if dowloadPageData is None:
                        continue
                    # Extract the child links of that page.
                    ccgpChildrenLink = self.getChildrenLink(dowloadPageData)
                    if ccgpChildrenLink is None:
                        continue
                    # Publish the child links to Kafka.
                    for link in ccgpChildrenLink:
                        # Create the Mongo helper on first use.
                        if self.mogodbControl is None:
                            self.mogodbControl = Mongodb_Operator(
                                DbdataCCGPDFZB["host"], DbdataCCGPDFZB["port"],
                                DbdataCCGPDFZB["db_name"],
                                DbdataCCGPDFZB["default_collection"])
                        # Links without a title are never sent.
                        if link.title is None:
                            continue
                        # Duplicate check, so already-known links are not
                        # sent to Kafka again; findone returns a document
                        # when the uuid is already stored.
                        uuid = self.get_md5(link.Urlname, link.title)
                        item = {"uuid": uuid}
                        value = self.mogodbControl.findone(
                            item, self.__Sendcollection)
                        # TODO: inserting into the database is reported
                        # as problematic.
                        if value is not None:
                            continue
                        # The consumer asked not to receive parent links
                        # (presumably those have a negative DeepNum --
                        # confirm).
                        if link.DeepNum >= 0:
                            producerData = json.dumps(link.class2dict())
                            Log.i("produce<<" + producerData)
                            KafkaOperator.producerUrl(producerData)

            # When driven by crontab (one run per day) exit right away
            # instead of sleeping.
            if self.crontab == 1:
                os._exit(0)
            else:
                # Sleep for a random interval read from the config file.
                items = ConfigUtil.getItems('producerScheduler')
                interval_min = items['interval_min']
                interval_max = items['interval_max']
                seconds = random.randint(int(interval_min), int(interval_max))
                Log.i('StartProducerUrl sleep ' + str(seconds) + ' seconds')
                time.sleep(seconds)