def crawl_params(self, **options):
    """Return the parameters to crawl and persist the crawler checkpoint.

    This is the demo implementation: it does not inspect any remote
    source; it hands back a range of integers and records in the
    checkpoint how far the generated index has advanced.

    Args:
        **options: Crawler options. ``current_execution_date`` is read
            here (falling back to ``datetime.utcnow()``); the whole
            mapping is also forwarded to ``get_from_date``.

    Returns:
        range: ``range(last_index + 20)``, where ``last_index`` is the
        value stored under ``LAST_GENERATED_INDEX`` in the checkpoint
        (``0`` when there is none).
    """
    now = options.get("current_execution_date", datetime.utcnow())
    # Lazy %-style arguments: the message is only rendered when DEBUG
    # logging is actually enabled.
    _logger.debug("%s: Current execution time: %s", CRAWLER_NAME, now)

    # Check if there is an internal checkpoint; it is used to save the
    # status of the crawler between executions.
    checkpoint_data = get_checkpoint_data(
        CRAWLER_NAME, CRAWLER_FILE_CTL, default={})

    # Not consumed by the demo logic below; kept so the real crawler logic
    # that replaces the demo has the starting date at hand.
    from_date = get_from_date(options, checkpoint_data)  # noqa: F841

    # The crawler logic to obtain the objects to be crawled goes here.
    # BEGIN DEMO
    last_index = checkpoint_data.get(LAST_GENERATED_INDEX, 0)
    # NOTE(review): the range always starts at 0, so indexes generated by
    # previous executions are returned again. If only the new 20 are
    # wanted this should be range(last_index, last_index + 20) -- confirm
    # the intent before changing it.
    crawling_params = range(last_index + 20)
    checkpoint_data[LAST_GENERATED_INDEX] = last_index + 20
    # END DEMO

    # Signal that everything up to 'now' has been processed, so the next
    # execution can pick up from this point.
    checkpoint_data[LAST_EXECUTION_DATE_CTL_FIELD] = now
    put_checkpoint_data(CRAWLER_NAME, CRAWLER_FILE_CTL, checkpoint_data)

    return crawling_params
def crawl_params(self, producer, **options):
    """Process the Bovespa listings and files, then save the checkpoint.

    Unlike a crawler that returns its parameters, this method returns
    nothing: ``producer`` is handed to ``process_companies_files``.

    Args:
        producer: Passed through unchanged to ``process_companies_files``.
        **options: Crawler options. ``current_execution_date`` is read
            here (falling back to ``datetime.utcnow()``); the whole
            mapping is forwarded to the date and processing helpers.
    """
    execution_time = options.get(
        "current_execution_date", datetime.utcnow())

    # Prefer the crawler's own stored checkpoint, when there is one, over
    # the date sent by the crawler system.
    checkpoint = get_checkpoint_data(
        BOVESPA_CRAWLER,
        BOVESPA_FILE_CTL,
        default={},
    )

    # Date window handed to the file-processing step.
    start_date = get_from_date(options, checkpoint)
    end_date = get_to_date(options)

    process_listed_companies(options, checkpoint, execution_time)
    process_companies_files(
        options,
        checkpoint,
        execution_time,
        start_date,
        end_date,
        producer,
    )

    # Persist the checkpoint to signal that the files up to the execution
    # time have been handled, so the next run only deals with newer ones.
    put_checkpoint_data(BOVESPA_CRAWLER, BOVESPA_FILE_CTL, checkpoint)