def scraper_worker_handler(event, context):
    console = Console(file=sys.stdout, record=True)
    run_log = settings.RUN_LOGGER(start=datetime.datetime.utcnow())

    # The queue message body tells us which council to scrape and which
    # scraper type to run.
    message = json.loads(event["Records"][0]["body"])
    council = message["council"]
    command_name = message["scraper_type"]

    console.log(f"Fetching Scraper for: {council}")
    scraper_cls = load_scraper(council, command_name)
    if not scraper_cls:
        return

    console.log(f"Begin attempting to scrape: {council}")
    options = {"council": council, "verbose": True, "aws_lambda": True}
    scraper = scraper_cls(options, console)
    try:
        if not scraper.disabled:
            scraper.run(run_log)
        else:
            console.log(f"Scraper for {council} is disabled")
    except Exception as e:
        scraper.console.log(e)
        run_log.error = traceback.format_exc()
        # This probably means aws_tidy_up hasn't been called.
        # Let's do that ourselves, then.
        scraper.aws_tidy_up(run_log)
    console.log(f"Finished running scraper for: {council}")
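For context, the handler expects an SQS-style event whose first record carries a JSON string with "council" and "scraper_type" keys. A minimal local smoke-test sketch follows; the payload values ("ABC", "councillors") and the fake_event name are hypothetical, and it assumes the module's own imports and settings are in place:

import json

# Hypothetical payload; real messages are produced by the queueing side.
fake_event = {
    "Records": [
        {"body": json.dumps({"council": "ABC", "scraper_type": "councillors"})}
    ]
}

# Lambda passes a context object, but the handler above never touches it,
# so None is fine for a local test.
scraper_worker_handler(fake_event, None)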
def handle(self, options):
    self.options = options
    for council in self.councils_to_run():
        self.options["council"] = council
        scraper_cls = load_scraper(council, self.command_name)
        with scraper_cls(self.options) as scraper:
            should_run = True
            # Never run disabled scrapers.
            if scraper.disabled:
                should_run = False
            # When refreshing, skip scrapers that report a recent run.
            if should_run and options["refresh"]:
                if scraper.run_since():
                    should_run = False
            # When tags are given, the scraper must carry every required tag.
            if should_run and options["tags"]:
                required_tags = set(options["tags"].split(","))
                scraper_tags = set(scraper.get_tags)
                if not required_tags.issubset(scraper_tags):
                    should_run = False
            if should_run:
                if options.get("verbose"):
                    print(council)
                self._run_single(scraper)
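Note the options contract this implies: "refresh" and "tags" are read with [] and so must always be present (even if falsy), while "verbose" is read with .get() and may be omitted. A minimal sketch of a call, where command is a hypothetical instance of the class defining handle and all values are illustrative:

command.handle(
    {
        "refresh": True,  # when truthy, skip scrapers whose run_since() reports a recent run
        "tags": "daily",  # comma-separated; the scraper must carry every listed tag
        "verbose": True,  # optional; prints each council as it runs
    }
)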
def disabled(self):
    disabled_councils = []
    for council in self.all_councils:
        scraper = load_scraper(council.council_id, self.command_name)
        if scraper and scraper.disabled:
            council_info = {
                "code": council.council_id,
                "name": council.metadata["official_name"],
            }
            disabled_councils.append(council_info)
    return sorted(disabled_councils, key=lambda d: d["code"])
def missing(self):
    missing_councils = []
    for council in self.all_councils:
        # Non-current councils are never classed as missing
        if not council.current:
            continue
        scraper = load_scraper(council.council_id, self.command_name)
        if not scraper:
            council_info = {
                "code": council.council_id,
                "name": council.metadata["official_name"],
            }
            missing_councils.append(council_info)
    return sorted(missing_councils, key=lambda d: d["code"])
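Both report helpers return the same shape: a list of {"code", "name"} dicts sorted by council code. A sketch of the return value, with entirely hypothetical councils:

[
    {"code": "ABC", "name": "Anytown Borough Council"},
    {"code": "XYZ", "name": "Exampleshire County Council"},
]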
def run_council(self, council):
    self.options["council"] = council
    self.options["council_info"] = load_council_info(council)
    scraper_cls = load_scraper(council, self.command_name)
    if not scraper_cls:
        return
    with scraper_cls(self.options, self.console) as scraper:
        should_run = True
        if scraper.disabled:
            should_run = False
        if should_run and self.options["refresh"]:
            if scraper.run_since():
                should_run = False
        if should_run and self.options["tags"]:
            required_tags = set(self.options["tags"].split(","))
            scraper_tags = set(scraper.get_tags)
            if not required_tags.issubset(scraper_tags):
                should_run = False
        if should_run:
            self._run_single(scraper)
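run_council repeats handle's gating logic for a single council, additionally loading council metadata via load_council_info and passing the shared console through. Because it mutates self.options in place per council, calls must not overlap. A minimal driver sketch, where runner and the council codes are hypothetical:

# Run a handful of councils sequentially; run_council mutates
# runner.options in place, so concurrent calls on one instance are unsafe.
for council in ["ABC", "DEF"]:
    runner.run_council(council)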