def run(self):
    values = configdata.get(DetailSpiderConst.DetailStatusSettings, {})
    values[const.DETAIL_LIST] = self.cis
    values.update(**{
        const.CONFIG_DATA: self.configdata,
    })
    if ScrapyConst.Console in values:
        if values[ScrapyConst.Console] == u'1':  # log to console
            values[ScrapyConst.LOG_FILE] = None
        else:
            log_dir = values.get(ScrapyConst.LOG_DIR, os.getcwd())
            if not os.path.exists(log_dir):
                os.makedirs(log_dir)
            if ScrapyConst.LOG_FILE in values:
                log_file = values[ScrapyConst.LOG_FILE]
                values[ScrapyConst.LOG_FILE] = os.sep.join([log_dir, log_file])
    settings_path = u'crawler.shc.fe.settings'
    module_import = __import__(settings_path, {}, {}, [''])
    settings = CrawlerSettings(module_import, values=values)
    execute(argv=["scrapy", "crawl", 'CarStatusSpider'], settings=settings)
def fetch51anonymousfreeproxy():
    values = configdata.get(FetchProxySpiderConst.FetchFOAnonymousProxySettings, {})
    values[ScrapyConst.DOWNLOAD_TIMEOUT] = int(values.get(ScrapyConst.DOWNLOAD_TIMEOUT, 0))
    if ScrapyConst.Console in values:
        if values[ScrapyConst.Console] == u'1':  # log to console
            values[ScrapyConst.LOG_FILE] = None
        else:
            log_dir = values.get(ScrapyConst.LOG_DIR, os.getcwd())
            if ScrapyConst.LOG_FILE in values:
                log_file = values[ScrapyConst.LOG_FILE]
                values[ScrapyConst.LOG_FILE] = os.sep.join([log_dir, log_file])
    settings = CrawlerSettings(None, values=values)
    execute(argv=["scrapy", "crawl", "FOAnonymousSpider"], settings=settings)
def run(self):
    feconfig = self.configdata[const.FE_CONFIG]
    try:
        # Use the per-city config if it parses; otherwise fall back to the defaults.
        city_config = eval(feconfig[self.city_name])  # expects a dict literal stored as a string
    except Exception:
        city_config = {}
    start_page = city_config.get(const.START_PAGE, feconfig[const.DEFAULT_START_PAGE])
    end_page = city_config.get(const.END_PAGE, feconfig[const.DEFAULT_END_PAGE])
    values = configdata.get(ListSpiderConst.ListSettings, {})
    values.update(**{
        const.CONFIG_DATA: self.configdata,
        const.START_PAGE: int(start_page),
        const.END_PAGE: int(end_page),
    })
    if ScrapyConst.Console in values:
        if values[ScrapyConst.Console] == u'1':  # log to console
            values[ScrapyConst.LOG_FILE] = None
        else:
            log_dir = values.get(ScrapyConst.LOG_DIR, os.getcwd())
            if ScrapyConst.LOG_FILE in values:
                log_file = values[ScrapyConst.LOG_FILE]
                values[ScrapyConst.LOG_FILE] = os.sep.join([log_dir, log_file])
    settings_path = u'crawler.shc.fe.settings'
    module_import = __import__(settings_path, {}, {}, [''])
    settings = CrawlerSettings(module_import, values=values)
    execute(argv=["scrapy", "crawl", 'SHCSpider'], settings=settings)
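# Illustration only: the eval() above implies that each per-city entry in
# feconfig is a dict literal stored as a string. The key names below are
# hypothetical guesses at what const.START_PAGE / const.END_PAGE resolve to.
example_feconfig = {
    'default_start_page': '1',
    'default_end_page': '100',
    'beijing': u"{'start_page': '1', 'end_page': '20'}",  # per-city override
}
example_city_config = eval(example_feconfig['beijing'])  # -> {'start_page': '1', 'end_page': '20'}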
def run(self):
    values = configdata.get(const.vpsettings, {})
    values[AppConst.proxies] = self.proxies
    values[const.DOWNLOAD_TIMEOUT] = int(values.get(const.DOWNLOAD_TIMEOUT, 5))
    if const.Console in values:
        if values[const.Console] == u'1':  # log to console
            values[const.LOG_FILE] = None
        else:
            log_dir = values.get(const.LOG_DIR, os.getcwd())
            if const.LOG_FILE in values:
                # Prefix the log file with a timestamp so concurrent runs do not collide.
                logfile_prefix = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
                log_file = '%s_%s' % (logfile_prefix, values[const.LOG_FILE])
                values[const.LOG_FILE] = os.sep.join([log_dir, log_file])
    # NOTE: valid_urls is not defined in this method; it must be supplied by
    # the enclosing module's scope.
    values[const.RETRY_TIMES] = len(valid_urls)
    settings_path = u'vp.settings'
    module_import = __import__(settings_path, {}, {}, [''])
    settings = CrawlerSettings(module_import, values=values)
    execute(argv=["scrapy", "crawl", 'SOSOSpider'], settings=settings)
def run(self):
    if self.proxies:
        values = configdata.get(const.vpsettings, {})
        values[AppConst.proxies] = self.proxies
        values[const.DOWNLOAD_TIMEOUT] = int(values.get(const.DOWNLOAD_TIMEOUT, 5))
        if const.Console in values:
            if values[const.Console] == u'1':  # log to console
                values[const.LOG_FILE] = None
            else:
                log_dir = values.get(const.LOG_DIR, os.getcwd())
                if const.LOG_FILE in values:
                    logfile_prefix = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
                    log_file = '%s_%s' % (logfile_prefix, values[const.LOG_FILE])
                    values[const.LOG_FILE] = os.sep.join([log_dir, log_file])
        settings = CrawlerSettings(None, values=values)
        execute(argv=["scrapy", "crawl", 'SOSOSpider'], settings=settings)
def run():
    appconfig = configdata.get(AppConst.app_config, {})
    frequence = int(appconfig.get(AppConst.app_config_frequence, 1800))
    volume_per_time = int(appconfig.get(AppConst.volumepertime, 1000))
    ps = []
    while True:
        proxy_ids = []
        proxies = get_proxies(d=datetime.date.today())
        print u'get %s proxies' % len(proxies)
        for proxy in proxies:
            proxy_ids.append(proxy)
            if len(proxy_ids) == volume_per_time:
                # A full batch: hand it to a worker process and start a new batch.
                p = ValidProcess(proxy_ids)
                ps.append(p)
                print u'%s %s start %s' % (datetime.datetime.now(), p.name, len(proxy_ids))
                p.start()
                proxy_ids = []
        # Flush the last, partially filled batch.
        if proxy_ids:
            p = ValidProcess(proxy_ids)
            ps.append(p)
            print u'%s %s start %s' % (datetime.datetime.now(), p.name, len(proxy_ids))
            p.start()
        print u'%s valid proxy .. sleep %s seconds' % (datetime.datetime.now(), frequence)
        time.sleep(frequence)
        # After the sleep window, reap whatever workers are still alive.
        while ps:
            p = ps.pop()
            try:
                p.terminate()
                print u'%s terminate one process %s' % (datetime.datetime.now(), p.name)
            except Exception:
                pass
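# ValidProcess is not defined in this snippet. Given the p.start() /
# p.terminate() / p.name usage above, a minimal sketch, assuming it wraps the
# proxy-validation crawl (the SOSOSpider run() shown earlier) in a
# multiprocessing.Process; the project-internal names (configdata, const,
# AppConst) come from the surrounding modules.
import multiprocessing

from scrapy.cmdline import execute
from scrapy.settings import CrawlerSettings


class ValidProcess(multiprocessing.Process):

    def __init__(self, proxies):
        super(ValidProcess, self).__init__()
        self.proxies = proxies  # one batch handed over by the dispatcher loop

    def run(self):
        # execute() blocks until the crawl finishes, so the crawl runs inside
        # this child process rather than in the dispatcher loop.
        values = configdata.get(const.vpsettings, {})
        values[AppConst.proxies] = self.proxies
        settings = CrawlerSettings(None, values=values)
        execute(argv=["scrapy", "crawl", 'SOSOSpider'], settings=settings)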
def fetch51freeproxy():
    values = configdata.get(const.vpsettings, {})
    settings = CrawlerSettings(values=values)
    execute(argv=["scrapy", "crawl", "FOSpider"], settings=settings)
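# Assumed entry point: execute() hands control to Scrapy and does not return,
# so each helper above is meant to own a whole process. Pick one crawl per
# invocation, e.g.:
if __name__ == '__main__':
    fetch51freeproxy()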