Example #1
0
class Scheduler(object):
    """Crawl a URL once a day at a fixed local time and store the result."""

    def __init__(self):
        self.crawler = Crawler()
        self.db = Mongo()

    @staticmethod
    def _format_date(now):
        """Return ``now`` as ``'year-month-day'`` (no zero padding)."""
        return '{}-{}-{}'.format(now.year, now.month, now.day)

    @staticmethod
    def _seconds_until_next_minute(now):
        """Return the number of seconds from ``now`` to the next minute start."""
        return 60 - now.second - now.microsecond / 1e6

    def run(self, url, date):
        """
        Crawl ``url`` and save the result.

        :param url: URL to crawl
        :param date: crawl date, stored under the ``'date'`` key of the result
        :return: None
        """
        data = self.crawler.main(url)
        data['date'] = date
        self.db.add(data)

    def main(self, url, h=1, m=0):
        """
        Program entry point: loop forever and trigger ``run`` at ``h:m``.

        The local clock is checked once per minute. Instead of sleeping a
        fixed 60 seconds (which drifts by the loop overhead and can
        eventually step over the whole target minute), the loop sleeps until
        the next minute boundary.

        :param url: URL to crawl
        :param h: hour of the day at which to run
        :param m: minute of the hour at which to run
        :return: never returns
        """
        # Minute (truncated datetime) of the most recent run; prevents a
        # second run if a wake-up lands in the same minute again.
        last_slot = None
        while True:
            now = datetime.datetime.now()
            slot = now.replace(second=0, microsecond=0)
            if now.hour == h and now.minute == m and slot != last_slot:
                last_slot = slot
                self.run(url, self._format_date(now))
            # Re-read the clock: run() may have taken a noticeable time.
            time.sleep(self._seconds_until_next_minute(datetime.datetime.now()))
Example #2
0
class Scheduler(object):
    """Drive the crawler from a queue of URLs held in the ``Mongo`` store."""

    def __init__(self):
        self.crawler = Crawler()
        self.db = Mongo(MONGO_DB)

    def main(self):
        """
        Main program loop.

        Seeds the URL collection with ``START_URL``, then keeps removing one
        URL at a time and crawling it until the URL collection is empty.
        For each crawl, the user info (if any) is stored and every newly
        discovered URL is queued. When the crawl yields neither user info
        nor new URLs, a placeholder record for that URL is stored instead.

        :return: None
        """
        self.db.add(MONGO_COLLECTION_URL, {'url': START_URL})
        while self.db.count(MONGO_COLLECTION_URL) > 0:
            url = self.db.remove_one(MONGO_COLLECTION_URL)['url']
            userinfo, new_urls = self.crawler.main(url)

            if not (userinfo or new_urls):
                # Nothing came back at all; the 'declare' text says the
                # account may have been deregistered.
                self.db.add(MONGO_COLLECTION_USERINFO, {
                    'user_url': url,
                    'declare': '该账户可能已经注销'
                })
                continue

            # Either value may be empty/None on its own, so guard each one
            # separately instead of storing an empty record or iterating None.
            if userinfo:
                self.db.add(MONGO_COLLECTION_USERINFO, userinfo)
            for new_url in new_urls or ():
                self.db.add(MONGO_COLLECTION_URL, {'url': new_url})