def test_calc_next_action_time(self):
    conf_dict_str = '\
"MIN_TIME": 15,\n\
"MAX_TIME": 10080,\n\
"INITIAL_NEXT_ACTION_FACTOR": 10,\n\
"ZERO_ACTIONS_FACTOR_CHANGE": 20,\n\
"FACTOR_CHANGE_FACTOR": 1.3,\n'
    sched = Scheduler(conf_dict_str)

    # Successful action, not-initialized next action factor
    result = sched.calc_next_action_time(True, None, 0)
    self.assertEqual(result, (datetime.timedelta(minutes=115), 7.692, 0))

    # Successful action
    result = sched.calc_next_action_time(True, 13, 9)
    self.assertEqual(result, (datetime.timedelta(minutes=150), 10, 0))

    # Successful action, new time delta under min time
    result = sched.calc_next_action_time(True, 1, 9)
    self.assertEqual(result, (datetime.timedelta(minutes=15), 0.769, 0))

    # Unsuccessful action, not-initialized next action factor
    result = sched.calc_next_action_time(False, None, 0)
    self.assertEqual(result, (datetime.timedelta(minutes=150), 10, 1))

    # Unsuccessful action, no new action factor
    result = sched.calc_next_action_time(False, 10, 18)
    self.assertEqual(result, (datetime.timedelta(minutes=150), 10, 19))

    # Unsuccessful action, new action factor
    result = sched.calc_next_action_time(False, 10, 19)
    self.assertEqual(result, (datetime.timedelta(minutes=195), 13, 0))
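The expected tuples above fully determine the scheduling arithmetic. Below is a minimal, self-contained sketch of a Scheduler that reproduces those values; the class and method names mirror the test, but the conf parsing and the update rules are assumptions reconstructed from the expected results, not the library's actual implementation.

import datetime
import json


class Scheduler(object):
    """Hypothetical scheduler reconstructed from the expectations in the test above."""

    def __init__(self, conf_dict_str):
        # The conf string is a series of '"KEY": value,' lines; wrapping it in
        # braces and dropping the trailing comma makes it valid JSON (assumption).
        self.conf = json.loads('{' + conf_dict_str.rstrip().rstrip(',') + '}')

    def calc_next_action_time(self, was_successful, next_action_factor, num_zero_actions):
        if next_action_factor is None:
            next_action_factor = self.conf['INITIAL_NEXT_ACTION_FACTOR']

        if was_successful:
            # A successful action shrinks the factor and resets the zero-action counter.
            next_action_factor = round(next_action_factor / self.conf['FACTOR_CHANGE_FACTOR'], 3)
            num_zero_actions = 0
        else:
            # Unsuccessful actions are counted; once ZERO_ACTIONS_FACTOR_CHANGE of them
            # have accumulated, the factor grows and the counter resets.
            num_zero_actions += 1
            if num_zero_actions >= self.conf['ZERO_ACTIONS_FACTOR_CHANGE']:
                next_action_factor = round(next_action_factor * self.conf['FACTOR_CHANGE_FACTOR'], 3)
                num_zero_actions = 0

        # Next action time = factor * MIN_TIME minutes, clamped to [MIN_TIME, MAX_TIME].
        minutes = int(next_action_factor * self.conf['MIN_TIME'])
        minutes = max(self.conf['MIN_TIME'], min(minutes, self.conf['MAX_TIME']))
        return datetime.timedelta(minutes=minutes), next_action_factor, num_zero_actions

With the factors from the test, calc_next_action_time(False, 10, 19) trips the counter at 20, grows the factor to 10 * 1.3 = 13 and yields 13 * 15 = 195 minutes, matching the last assertion.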
def from_crawler(cls, crawler, *args, **kwargs):
    spider = cls(*args, **kwargs)
    spider._set_crawler(crawler)
    spider._set_config(**kwargs)
    spider._set_request_kwargs()

    for cp_path in spider.conf['CUSTOM_PROCESSORS']:
        try:
            custom_processors = importlib.import_module(cp_path)
        except ImportError:
            msg = "Custom processors from {path} could not be imported, processors won't be applied".format(
                path=cp_path,
            )
            spider.log(msg, logging.WARNING)

    post_save.connect(spider._post_save_tasks, sender=spider.scraped_obj_class)

    spider._set_start_urls(spider.scrape_url)
    spider.scheduler = Scheduler(
        spider.scraper.scraped_obj_class.scraper_scheduler_conf)
    spider.from_page = 'MP'
    spider.loader = None
    spider.dummy_loader = None
    spider.items_read_count = 0
    spider.items_save_count = 0

    msg = 'Spider for {roc} "{ro}" ({pk}) initialized.'.format(
        roc=spider.ref_object.__class__.__name__,
        ro=str(spider.ref_object),
        pk=str(spider.ref_object.pk),
    )
    spider.log(msg, logging.INFO)

    return spider
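Each entry in CUSTOM_PROCESSORS is a dotted module path handed to importlib.import_module(), and because the import is wrapped in try/except ImportError, a path that cannot be imported only logs a warning and the processors are silently skipped. A hypothetical processor module such a path could point to; the module name, function name and the (text, loader_context) signature are all assumptions for illustration, not part of the code above:

# myproject/processors.py -- hypothetical module referenced from CUSTOM_PROCESSORS

def clean_price(text, loader_context):
    """Illustrative processor: strip a currency symbol and surrounding whitespace."""
    return text.replace('$', '').strip()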
def __init__(self, *args, **kwargs):
    self.mandatory_vars.append('scraped_obj_class')
    self.mandatory_vars.append('scraped_obj_item_class')

    super(DjangoSpider, self).__init__(*args, **kwargs)
    self._set_config(**kwargs)
    self._set_request_kwargs()

    for cp_path in self.conf['CUSTOM_PROCESSORS']:
        try:
            custom_processors = importlib.import_module(cp_path)
        except ImportError:
            msg = "Custom processors from {path} could not be imported, processors won't be applied".format(
                path=cp_path,
            )
            self.log(msg, logging.WARNING)

    post_save.connect(self._post_save_tasks, sender=self.scraped_obj_class)

    self._set_start_urls(self.scrape_url)
    self.scheduler = Scheduler(self.scraper.scraped_obj_class.scraper_scheduler_conf)
    self.from_page = 'MP'
    self.loader = None
    self.dummy_loader = None
    self.items_read_count = 0
    self.items_save_count = 0

    msg = 'Spider for {roc} "{ro}" ({pk}) initialized.'.format(
        roc=self.ref_object.__class__.__name__,
        ro=str(self.ref_object),
        pk=str(self.ref_object.pk),
    )
    self.log(msg, logging.INFO)
def __init__(self, *args, **kwargs):
    self.mandatory_vars.append('scraped_obj_class')
    self.mandatory_vars.append('scraped_obj_item_class')

    super(DjangoSpider, self).__init__(*args, **kwargs)
    self._set_config(**kwargs)
    self._set_request_kwargs()

    post_save.connect(self._post_save_tasks, sender=self.scraped_obj_class)

    self._set_start_urls(self.scrape_url)
    self.scheduler = Scheduler(
        self.scraper.scraped_obj_class.scraper_scheduler_conf)
    self.from_page = 'MP'
    self.loader = None
    self.dummy_loader = None
    self.items_read_count = 0
    self.items_save_count = 0

    msg = 'Spider for {roc} "{ro}" ({pk}) initialized.'.format(
        roc=self.ref_object.__class__.__name__,
        ro=str(self.ref_object),
        pk=str(self.ref_object.pk),
    )
    self.log(msg, logging.INFO)
def __init__(self, *args, **kwargs):
    self.mandatory_vars.append('scraped_obj_class')
    self.mandatory_vars.append('scraped_obj_item_class')

    super(DjangoSpider, self).__init__(*args, **kwargs)
    self._set_config(**kwargs)
    self._set_request_kwargs()

    # form_data must be empty or a JSON object, otherwise the spider is closed.
    if self.scraper.form_data != u'':
        try:
            form_data = json.loads(self.scraper.form_data)
        except ValueError:
            raise CloseSpider(
                "Incorrect form_data attribute: not a valid JSON dict!")
        if not isinstance(form_data, dict):
            raise CloseSpider(
                "Incorrect form_data attribute: not a valid JSON dict!")
        self.form_data = form_data

    self._set_start_urls(self.scrape_url)
    self.scheduler = Scheduler(
        self.scraper.scraped_obj_class.scraper_scheduler_conf)
    self.from_detail_page = False
    self.loader = None
    self.items_read_count = 0
    self.items_save_count = 0

    msg = "Spider for " + self.ref_object.__class__.__name__ + " \"" + \
        str(self.ref_object) + "\" (" + str(self.ref_object.pk) + ") initialized."
    self.log(msg, log.INFO)
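The guard above accepts an empty string or a JSON object and closes the spider on anything else, whether the value is invalid JSON or valid JSON that is not a dict. A small standalone illustration of the same check, with hypothetical names and a plain ValueError standing in for CloseSpider:

import json

def parse_form_data(raw):
    """Mirror of the validation above: return a dict, or raise on anything else."""
    if raw == u'':
        return {}
    try:
        form_data = json.loads(raw)
    except ValueError:
        raise ValueError("Incorrect form_data attribute: not a valid JSON dict!")
    if not isinstance(form_data, dict):
        raise ValueError("Incorrect form_data attribute: not a valid JSON dict!")
    return form_data

print(parse_form_data('{"user": "jane", "page": "1"}'))  # {'user': 'jane', 'page': '1'}
try:
    parse_form_data('["not", "a", "dict"]')
except ValueError as e:
    print(e)  # Incorrect form_data attribute: not a valid JSON dict!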
def __init__(self, *args, **kwargs):
    super(DjangoChecker, self).__init__(*args, **kwargs)
    self._set_config(**kwargs)
    self._check_checker_config()

    self.start_urls.append(self.scrape_url)
    self.scheduler = Scheduler(self.scraper.scraped_obj_class.checker_scheduler_conf)
    dispatcher.connect(self.response_received, signal=signals.response_received)

    msg = "Checker for " + self.ref_object.__class__.__name__ + " \"" + \
        str(self.ref_object) + "\" (" + str(self.ref_object.pk) + ") initialized."
    self.log(msg, log.INFO)
def __init__(self, *args, **kwargs):
    self.mandatory_vars.append('scraped_obj_class')
    self.mandatory_vars.append('scraped_obj_item_class')

    super(DjangoSpider, self).__init__(*args, **kwargs)
    self._set_config(**kwargs)
    self._check_scraper_config()

    self._set_start_urls(self.scrape_url)
    self.scheduler = Scheduler(self.scraper.scraped_obj_class.scraper_scheduler_conf)
    self.from_detail_page = False
    self.loader = None
    self.items_read_count = 0
    self.items_save_count = 0

    msg = "Spider for " + self.ref_object.__class__.__name__ + " \"" + \
        str(self.ref_object) + "\" (" + str(self.ref_object.pk) + ") initialized."
    self.log(msg, log.INFO)
def from_crawler(cls, crawler, *args, **kwargs):
    spider = cls(*args, **kwargs)
    spider._set_crawler(crawler)
    spider._set_config(**kwargs)
    spider._check_checker_config()
    spider._set_request_kwargs()
    spider._set_meta_splash_args()

    spider.scheduler = Scheduler(
        spider.scraper.scraped_obj_class.checker_scheduler_conf)
    dispatcher.connect(spider.response_received, signal=signals.response_received)

    msg = "Checker for " + spider.ref_object.__class__.__name__ + " \"" + \
        str(spider.ref_object) + "\" (" + str(spider.ref_object.pk) + ") initialized."
    spider.log(msg, logging.INFO)

    return spider
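Connecting to signals.response_received gives the checker a callback for every response the engine receives; Scrapy dispatches that signal with response, request and spider arguments. A hypothetical handler in the same spirit; the 404 logging below is purely illustrative and not the checker's actual logic:

def response_received(self, **kwargs):
    # Scrapy passes response, request and spider as keyword arguments; only
    # the response status is inspected in this illustrative sketch.
    response = kwargs.get('response')
    if response is not None and response.status == 404:
        self.log("Checked URL returned 404: %s" % response.url, logging.WARNING)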