class Scheduler(object):
    """Crawl a URL at a fixed wall-clock time each day and store the result."""

    def __init__(self):
        self.crawler = Crawler()
        self.db = Mongo()

    def run(self, url, date):
        """
        Crawl one URL and save what the crawler returns.

        :param url: URL to crawl
        :param date: value stored under the ``'date'`` key of the crawled data
        :return: None
        """
        data = self.crawler.main(url)
        data['date'] = date
        self.db.add(data)

    @staticmethod
    def _seconds_to_next_minute(now):
        """Return the seconds, in (0, 60], from *now* to the next minute boundary."""
        return 60 - now.second - now.microsecond / 1e6

    def main(self, url, h=1, m=0):
        """
        Program entry point: loop forever and call ``run`` when the local
        time reaches ``h:m``.

        :param url: URL to crawl
        :param h: hour of the day at which to run
        :param m: minute of the hour at which to run
        :return: never returns
        """
        # Calendar day of the most recent run; keeps a wake-up that lands a
        # hair before the minute boundary from firing a second time.
        last_run_day = None
        while True:
            now = datetime.datetime.now()
            today = now.date()
            if now.hour == h and now.minute == m and today != last_run_day:
                # Same non-zero-padded "Y-M-D" text as plain str() concatenation.
                date = '{}-{}-{}'.format(now.year, now.month, now.day)
                self.run(url, date)
                last_run_day = today
                # run() may have taken a while; re-read the clock so the
                # sleep below is measured from the current time.
                now = datetime.datetime.now()
            # Sleep to the next minute boundary instead of a flat 60 seconds:
            # a fixed interval plus loop overhead drifts and can step over
            # the target minute entirely.
            time.sleep(self._seconds_to_next_minute(now))
class Scheduler(object):
    """Drive the crawler from a URL queue kept in the database."""

    def __init__(self):
        self.crawler = Crawler()
        self.db = Mongo(MONGO_DB)

    def main(self):
        """
        Main program loop.

        Seeds the URL collection with ``START_URL``, then keeps taking one
        URL at a time from that collection and crawling it until the
        collection reports a count of zero.

        :return: None
        """
        self.db.add(MONGO_COLLECTION_URL, {'url': START_URL})

        while self.db.count(MONGO_COLLECTION_URL) > 0:
            pending = self.db.remove_one(MONGO_COLLECTION_URL)
            url = pending['url']
            userinfo, new_urls = self.crawler.main(url)

            if not (userinfo or new_urls):
                # Nothing came back for this page: store a placeholder record
                # (the note reads "this account may have been cancelled").
                self.db.add(MONGO_COLLECTION_USERINFO, {
                    'user_url': url,
                    'declare': '该账户可能已经注销'
                })
                continue

            self.db.add(MONGO_COLLECTION_USERINFO, userinfo)
            # Queue every newly discovered URL for a later iteration.
            for discovered in new_urls:
                self.db.add(MONGO_COLLECTION_URL, {'url': discovered})