Example #1
    def test_calc_next_action_time(self):
        conf_dict_str = '\
"MIN_TIME": 15,\n\
"MAX_TIME": 10080,\n\
"INITIAL_NEXT_ACTION_FACTOR": 10,\n\
"ZERO_ACTIONS_FACTOR_CHANGE": 20,\n\
"FACTOR_CHANGE_FACTOR": 1.3,\n'
        sched = Scheduler(conf_dict_str)
        
        # Successful action, not-initialized next action factor
        result = sched.calc_next_action_time(True, None, 0)
        self.assertEqual(result, (datetime.timedelta(minutes=115), 7.692, 0))
        
        # Successful action
        result = sched.calc_next_action_time(True, 13, 9)
        self.assertEqual(result, (datetime.timedelta(minutes=150), 10, 0))
        
        # Successful action, new time delta under min time
        result = sched.calc_next_action_time(True, 1, 9)
        self.assertEqual(result, (datetime.timedelta(minutes=15), 0.769, 0))
        
        # Successful action, not-initialized next action factor
        result = sched.calc_next_action_time(False, None, 0)
        self.assertEqual(result, (datetime.timedelta(minutes=150), 10, 1))
        
        # Unsuccessful action, no new action factor
        result = sched.calc_next_action_time(False, 10, 18)
        self.assertEqual(result, (datetime.timedelta(minutes=150), 10, 19))
        
        # Unsuccessful action, new action factor
        result = sched.calc_next_action_time(False, 10, 19)
        self.assertEqual(result, (datetime.timedelta(minutes=195), 13, 0))
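Note that conf_dict_str above is deliberately not a complete JSON document: it is a bare run of "KEY": value pairs ending in a trailing comma. A minimal sketch of how such a string could be turned into a config dict (the brace wrapping and comma stripping are assumptions for illustration, not the library's actual parsing code):

import json

def parse_scheduler_conf(conf_dict_str):
    # Hypothetical parser: drop the trailing comma and wrap the bare
    # key/value pairs in braces so the result is valid JSON.
    body = conf_dict_str.strip().rstrip(',')
    return json.loads('{' + body + '}')

conf = parse_scheduler_conf('"MIN_TIME": 15,\n"MAX_TIME": 10080,\n')
assert conf == {"MIN_TIME": 15, "MAX_TIME": 10080}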
Example #2
    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = cls(*args, **kwargs)
        spider._set_crawler(crawler)

        spider._set_config(**kwargs)
        spider._set_request_kwargs()

        # Verify that each configured custom processors module is importable;
        # failures are logged as warnings and those processors are skipped.
        for cp_path in spider.conf['CUSTOM_PROCESSORS']:
            try:
                custom_processors = importlib.import_module(cp_path)
            except ImportError:
                msg = "Custom processors from {path} could not be imported, processors won't be applied".format(
                    path=cp_path)
                spider.log(msg, logging.WARNING)

        post_save.connect(spider._post_save_tasks,
                          sender=spider.scraped_obj_class)

        spider._set_start_urls(spider.scrape_url)
        spider.scheduler = Scheduler(
            spider.scraper.scraped_obj_class.scraper_scheduler_conf)
        spider.from_page = 'MP'
        spider.loader = None
        spider.dummy_loader = None
        spider.items_read_count = 0
        spider.items_save_count = 0

        msg = 'Spider for {roc} "{ro}" ({pk}) initialized.'.format(
            roc=spider.ref_object.__class__.__name__,
            ro=str(spider.ref_object),
            pk=str(spider.ref_object.pk),
        )
        spider.log(msg, logging.INFO)

        return spider
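Each entry in CUSTOM_PROCESSORS is a dotted module path handed to importlib.import_module. The modules themselves are not shown in these examples; a hypothetical module that would satisfy the import (the module name and processor function are illustrative assumptions, not part of the library):

# myproject/processors.py -- hypothetical module referenced by a
# CUSTOM_PROCESSORS entry such as 'myproject.processors'

def strip_whitespace(text, loader_context=None):
    # Illustrative loader-style processor: collapse runs of whitespace
    # in a scraped value.
    return ' '.join(text.split())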
Example #3
    def __init__(self, *args, **kwargs):
        self.mandatory_vars.append('scraped_obj_class')
        self.mandatory_vars.append('scraped_obj_item_class')

        super(DjangoSpider, self).__init__(*args, **kwargs)
        self._set_config(**kwargs)
        self._set_request_kwargs()

        # Verify that each configured custom processors module is importable.
        for cp_path in self.conf['CUSTOM_PROCESSORS']:
            try:
                custom_processors = importlib.import_module(cp_path)
            except ImportError:
                msg = "Custom processors from {path} could not be imported, processors won't be applied".format(
                    path=cp_path)
                self.log(msg, logging.WARNING)

        post_save.connect(self._post_save_tasks, sender=self.scraped_obj_class)

        self._set_start_urls(self.scrape_url)
        self.scheduler = Scheduler(self.scraper.scraped_obj_class.scraper_scheduler_conf)
        self.from_page = 'MP'
        self.loader = None
        self.dummy_loader = None
        self.items_read_count = 0
        self.items_save_count = 0

        msg = 'Spider for {roc} "{ro}" ({pk}) initialized.'.format(
            roc=self.ref_object.__class__.__name__,
            ro=str(self.ref_object),
            pk=str(self.ref_object.pk),
        )
        self.log(msg, logging.INFO)
Example #4
    def __init__(self, *args, **kwargs):
        self.mandatory_vars.append('scraped_obj_class')
        self.mandatory_vars.append('scraped_obj_item_class')

        super(DjangoSpider, self).__init__(*args, **kwargs)
        self._set_config(**kwargs)
        self._set_request_kwargs()

        post_save.connect(self._post_save_tasks, sender=self.scraped_obj_class)

        self._set_start_urls(self.scrape_url)
        self.scheduler = Scheduler(
            self.scraper.scraped_obj_class.scraper_scheduler_conf)
        self.from_page = 'MP'
        self.loader = None
        self.dummy_loader = None
        self.items_read_count = 0
        self.items_save_count = 0

        msg = 'Spider for {roc} "{ro}" ({pk}) initialized.'.format(
            roc=self.ref_object.__class__.__name__,
            ro=str(self.ref_object),
            pk=str(self.ref_object.pk),
        )
        self.log(msg, logging.INFO)
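post_save.connect(...) registers _post_save_tasks as a Django signal receiver for the scraped model class. The receiver itself is not part of these snippets; a sketch of the shape Django expects for a bound-method receiver (the body is an assumption):

def _post_save_tasks(self, sender, instance, created=False, **kwargs):
    # Django post_save receivers are called with the model class
    # (sender), the saved object (instance) and a created flag; what a
    # spider does here is project-specific and only sketched.
    if created:
        pass  # hypothetical: post-save bookkeeping or task scheduling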
Example #5
    def __init__(self, *args, **kwargs):
        self.mandatory_vars.append('scraped_obj_class')
        self.mandatory_vars.append('scraped_obj_item_class')

        super(DjangoSpider, self).__init__(*args, **kwargs)
        self._set_config(**kwargs)
        self._set_request_kwargs()
        if self.scraper.form_data != u'':
            try:
                form_data = json.loads(self.scraper.form_data)
            except ValueError:
                raise CloseSpider(
                    "Incorrect form_data attribute: not a valid JSON dict!")
            if not isinstance(form_data, dict):
                raise CloseSpider(
                    "Incorrect form_data attribute: not a valid JSON dict!")
            self.form_data = form_data

        self._set_start_urls(self.scrape_url)
        self.scheduler = Scheduler(
            self.scraper.scraped_obj_class.scraper_scheduler_conf)
        self.from_detail_page = False
        self.loader = None
        self.items_read_count = 0
        self.items_save_count = 0

        msg = "Spider for " + self.ref_object.__class__.__name__ + " \"" + str(
            self.ref_object) + "\" (" + str(
                self.ref_object.pk) + ") initialized."
        self.log(msg, log.INFO)
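The form_data attribute must decode to a JSON object, otherwise the spider is closed. A quick illustration of values that pass or fail the check above (the field names are made up):

import json

ok = '{"username": "jane", "password": "secret"}'
assert isinstance(json.loads(ok), dict)  # passes both checks

bad_json = '{"username": "jane"'     # raises ValueError -> CloseSpider
not_a_dict = '["username", "jane"]'  # valid JSON but a list -> CloseSpider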
Example #6
    def test_calc_next_action_time(self):
        conf_dict_str = '\
"MIN_TIME": 15,\n\
"MAX_TIME": 10080,\n\
"INITIAL_NEXT_ACTION_FACTOR": 10,\n\
"ZERO_ACTIONS_FACTOR_CHANGE": 20,\n\
"FACTOR_CHANGE_FACTOR": 1.3,\n'
        sched = Scheduler(conf_dict_str)

        # Successful action, not-initialized next action factor
        result = sched.calc_next_action_time(True, None, 0)
        self.assertEqual(result, (datetime.timedelta(minutes=115), 7.692, 0))

        # Successful action
        result = sched.calc_next_action_time(True, 13, 9)
        self.assertEqual(result, (datetime.timedelta(minutes=150), 10, 0))

        # Successful action, new time delta under min time
        result = sched.calc_next_action_time(True, 1, 9)
        self.assertEqual(result, (datetime.timedelta(minutes=15), 0.769, 0))

        # Successful action, not-initialized next action factor
        result = sched.calc_next_action_time(False, None, 0)
        self.assertEqual(result, (datetime.timedelta(minutes=150), 10, 1))

        # Unsuccessful action, no new action factor
        result = sched.calc_next_action_time(False, 10, 18)
        self.assertEqual(result, (datetime.timedelta(minutes=150), 10, 19))

        # Unsuccessful action, new action factor
        result = sched.calc_next_action_time(False, 10, 19)
        self.assertEqual(result, (datetime.timedelta(minutes=195), 13, 0))
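calc_next_action_time itself is not shown anywhere on this page. The following sketch is reconstructed purely from the assertions above; the signature, the rounding to three decimals and the clamping behavior are inferred, not taken from the library:

import datetime

CONF = {"MIN_TIME": 15, "MAX_TIME": 10080, "INITIAL_NEXT_ACTION_FACTOR": 10,
        "ZERO_ACTIONS_FACTOR_CHANGE": 20, "FACTOR_CHANGE_FACTOR": 1.3}

def calc_next_action_time(conf, successful, factor, num_zero_actions):
    # Inferred: fall back to the initial factor when none is set yet.
    if factor is None:
        factor = conf['INITIAL_NEXT_ACTION_FACTOR']
    if successful:
        # Success shrinks the factor and resets the zero-action counter.
        factor = round(factor / conf['FACTOR_CHANGE_FACTOR'], 3)
        num_zero_actions = 0
    else:
        # Failures are counted; once the threshold is reached the
        # factor grows and the counter starts over.
        num_zero_actions += 1
        if num_zero_actions >= conf['ZERO_ACTIONS_FACTOR_CHANGE']:
            factor = round(factor * conf['FACTOR_CHANGE_FACTOR'], 3)
            num_zero_actions = 0
    # Next interval: MIN_TIME scaled by the factor, clamped to the
    # configured [MIN_TIME, MAX_TIME] window (in minutes).
    minutes = max(conf['MIN_TIME'],
                  min(conf['MAX_TIME'], int(conf['MIN_TIME'] * factor)))
    return datetime.timedelta(minutes=minutes), factor, num_zero_actions

# Reproduces the second assertion above:
assert calc_next_action_time(CONF, True, 13, 9) == \
    (datetime.timedelta(minutes=150), 10, 0)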
Example #7
    def __init__(self, *args, **kwargs):
        super(DjangoChecker, self).__init__(*args, **kwargs)
        self._set_config(**kwargs)
        self._check_checker_config()

        self.start_urls.append(self.scrape_url)
        self.scheduler = Scheduler(self.scraper.scraped_obj_class.checker_scheduler_conf)
        dispatcher.connect(self.response_received, signal=signals.response_received)

        msg = "Checker for " + self.ref_object.__class__.__name__ + " \"" + str(
            self.ref_object) + "\" (" + str(
                self.ref_object.pk) + ") initialized."
        self.log(msg, log.INFO)
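dispatcher.connect(...) subscribes the checker to Scrapy's response_received signal, which is sent with the response, the originating request and the spider. The handler is not shown; a plausible shape (the 404 handling is an assumption about what a checker might do):

def response_received(self, response, request, spider):
    # Called for every response the checker receives; a checker would
    # typically inspect the status here.
    if response.status == 404:
        pass  # hypothetical: flag self.ref_object as gone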
Example #8
    def __init__(self, *args, **kwargs):
        self.mandatory_vars.append('scraped_obj_class')
        self.mandatory_vars.append('scraped_obj_item_class')

        super(DjangoSpider, self).__init__(*args, **kwargs)
        self._set_config(**kwargs)
        self._check_scraper_config()

        self._set_start_urls(self.scrape_url)
        self.scheduler = Scheduler(self.scraper.scraped_obj_class.scraper_scheduler_conf)
        self.from_detail_page = False
        self.loader = None
        self.items_read_count = 0
        self.items_save_count = 0

        msg = "Spider for " + self.ref_object.__class__.__name__ + " \"" + str(
            self.ref_object) + "\" (" + str(
                self.ref_object.pk) + ") initialized."
        self.log(msg, log.INFO)
Example #9
    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = cls(*args, **kwargs)
        spider._set_crawler(crawler)

        spider._set_config(**kwargs)
        spider._check_checker_config()
        spider._set_request_kwargs()
        spider._set_meta_splash_args()

        spider.scheduler = Scheduler(
            spider.scraper.scraped_obj_class.checker_scheduler_conf)
        dispatcher.connect(spider.response_received,
                           signal=signals.response_received)

        msg = "Checker for " + spider.ref_object.__class__.__name__ + " \"" + str(
            spider.ref_object) + "\" (" + str(
                spider.ref_object.pk) + ") initialized."
        spider.log(msg, logging.INFO)

        return spider