def main():
    """Command-line entry point of the crawler.

    Two invocation forms are handled:

    * ``run.py check [YYYY-MM-DD]`` -- build a ``Checker`` (with the parsed
      date, or ``None`` when no date is given), run it and return.
    * ``run.py <max_crawl_time> all|<province>...`` -- crawl every key of
      ``province_crawler`` through a ``MyPool``, or crawl the listed
      provinces one after another in this process.

    ``max_crawl_time`` is a whole number of minutes.  It is stored on
    ``settings.max_crawl_time`` as a ``timedelta`` and also arms a watchdog
    timer that calls ``force_exit`` once that much time has elapsed.

    With fewer than two arguments after the script name (and not in check
    mode) the usage text is printed and the function returns.  A
    non-integer ``max_crawl_time`` is logged and terminates the process via
    ``os._exit(1)``.
    """
    config_logging()
    if not os.path.exists(settings.json_restore_path):
        CrawlerUtils.make_dir(settings.json_restore_path)
    set_codecracker()

    # "check" mode: optional third argument is the date to check.
    if len(sys.argv) >= 2 and sys.argv[1] == "check":
        dt = None
        if len(sys.argv) == 3:
            dt = datetime.datetime.strptime(sys.argv[2], "%Y-%m-%d")
        Checker(dt).run()
        return

    if len(sys.argv) < 3:
        # Single parenthesised string: prints identically as a Python 2
        # print statement and as a print() call.
        print('usage: run.py [check] [max_crawl_time(minutes) province...] \n\tmax_crawl_time 最大爬取分钟数,以分钟计;\n\tprovince 是所要爬取的省份列表 用空格分开, all表示爬取全部)')
        return

    try:
        max_crawl_time = int(sys.argv[1])
    except ValueError:
        settings.logger.error('invalid max_crawl_time, should be a integer')
        os._exit(1)
    settings.max_crawl_time = datetime.timedelta(minutes=max_crawl_time)

    # threading.Timer takes seconds.  The limit is expressed in minutes, so
    # convert it; passing the raw integer made the watchdog fire 60 times
    # earlier than the limit stored in settings.max_crawl_time.
    timer = threading.Timer(settings.max_crawl_time.total_seconds(), force_exit)
    timer.start()
    try:
        settings.logger.info(u'即将开始爬取,最长爬取时间为 %d 分钟', max_crawl_time)
        settings.start_crawl_time = datetime.datetime.now()

        if sys.argv[2] == 'all':
            # NOTE(review): this pool is local to main(); the module-level
            # ``process_pool`` is not rebound here (the original had no
            # ``global`` statement either).  Confirm whether force_exit
            # expects to find the live pool in that global.
            pool = MyPool()
            pool.map(crawl_province, sorted(province_crawler))
            pool.close()
            settings.logger.info("wait processes....")
            pool.join()
        else:
            for p in sys.argv[2:]:
                if p not in province_crawler:
                    settings.logger.warning('province %s is not supported currently' % p)
                    continue
                crawl_province(p)
    finally:
        # The timer thread is not a daemon: left armed it would keep the
        # process alive after the crawl is done and then run force_exit.
        timer.cancel()
'chongqing': ChongqingClawer, 'zhejiang': ZhejiangCrawler, 'liaoning': LiaoningCrawler, 'gansu': GansuClawer, 'guangxi': GuangxiCrawler, 'shanxi': ShanxiCrawler, 'qinghai': QinghaiCrawler, 'hubei': HubeiCrawler, 'guizhou': GuizhouCrawler, 'jilin': JilinCrawler, 'hainan': HainanCrawler, 'xizang': XizangCrawler, } process_pool = None cur_date = CrawlerUtils.get_cur_y_m_d() def set_codecracker(): for province in sorted(province_crawler.keys()): try: province_crawler.get(province).code_cracker = CaptchaRecognition( province) except Exception, e: settings.logger.warn("init captcha recognition of %s", province) def config_logging(): settings.logger = logging.getLogger('enterprise-crawler') settings.logger.setLevel(settings.log_level) fh = logging.FileHandler(settings.log_file)