def start_requests(self):
    """Schedule one confirmed-cases request per day, from start_date until today.

    Each request carries the target date in ``meta["row"]`` so the callback
    can attach it to the parsed records.
    """
    for current in date_range(self.start_date, today()):
        request_meta = {"row": {"date": current}}
        yield self.make_state_confirmed_request(
            current,
            callback=self.parse_state_confirmed,
            meta=request_meta,
        )
class TotalDeathsSpider(BaseRegistroCivilSpider):
    """Scrape monthly death totals per state from the Registro Civil API."""

    name = "obitos_totais"
    base_url = "https://transparencia.registrocivil.org.br/api/record/death"
    start_date = datetime.date(2015, 1, 1)
    end_date = today()

    def make_state_request(self, start_date, end_date, state, callback, dont_cache=False):
        """Build a request for death totals for `state` between the two dates.

        The query-string parameters are mirrored into ``meta["row"]`` so the
        callback knows which period/state the response belongs to.
        """
        data = [
            ("start_date", str(start_date)),
            ("end_date", str(end_date)),
            ("state", state),
        ]
        return self.make_request(
            url=urljoin(self.base_url, "?" + urlencode(data)),
            callback=callback,
            meta={"row": qs_to_dict(data), "dont_cache": dont_cache},
        )

    def start_requests_after_login(self):
        """Yield one request per (month, state); only old months are cached."""
        one_day = datetime.timedelta(days=1)
        # BUG FIX: the original did `today = today()`, which makes `today` a
        # local variable and raises UnboundLocalError when calling today().
        # Use a distinct local name instead.
        current_date = today()
        non_cache_period = datetime.timedelta(days=30)
        # `date_range` excludes the last, so we need to add one day to
        # `end_date`.
        for date in date_range(self.start_date, self.end_date + one_day, interval="monthly"):
            # Won't cache dates from 30 days ago until today (only historical
            # ones which are unlikely to change).
            should_cache = current_date - date > non_cache_period
            for state in STATES:
                yield self.make_state_request(
                    start_date=date,
                    end_date=next_month(date) - one_day,
                    state=state,
                    callback=self.parse,
                    dont_cache=not should_cache,
                )

    def parse(self, response):
        """Parse the API JSON, renaming `name` -> `city` and `total` -> `deaths_total`.

        The request metadata (period/state) is merged into every yielded row.
        """
        meta = response.meta["row"]
        data = json.loads(response.body)["data"]
        for row in data:
            row.update(meta)
            row["city"] = row.pop("name")
            row["deaths_total"] = row.pop("total")
            yield row
def start_requests_after_login(self):
    """Emit monthly per-state requests; only months older than 30 days are cached."""
    one_day = datetime.timedelta(days=1)
    non_cache_period = datetime.timedelta(days=30)
    # `date_range` excludes the last, so we need to add one day to
    # `end_date`.
    month_starts = date_range(self.start_date, self.end_date + one_day, interval="monthly")
    for month_start in month_starts:
        # Won't cache dates from 30 days ago until today (only historical
        # ones which are unlikely to change).
        is_recent = today() - month_start <= non_cache_period
        for state in STATES:
            yield self.make_state_request(
                start_date=month_start,
                end_date=next_month(month_start) - one_day,
                state=state,
                callback=self.parse,
                dont_cache=is_recent,
            )
def main():
    """Merge the given input files into a single CSV, showing progress.

    Reads every row from each input via `get_data_greedy` (no lower date
    bound, upper bound = today) and streams them to the output file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("input_filenames", nargs="+")
    parser.add_argument("output_filename")
    args = parser.parse_args()

    writer = rows.utils.CsvLazyDictWriter(args.output_filename)
    progress = tqdm()
    start_date, end_date = None, today()
    for filename in args.input_filenames:
        for record in get_data_greedy(filename, start_date, end_date):
            writer.writerow(record)
            progress.update()
    writer.close()
    progress.close()
def parse_application_date(value):
    """Normalize an application date, discarding out-of-range values.

    Dates before/equal to 2020-01-01 or at/after today are treated as data
    errors and replaced with None. The comparison is done on ISO-format
    strings, where lexicographic order matches chronological order.
    """
    value = parse_date(value)
    # Robustness fix: if `parse_date` fails and returns None, the original
    # comparison `None <= "2020-01-01"` would raise TypeError.
    if value is None or value <= "2020-01-01" or value >= str(today()):
        # Invalid value
        value = None
    return value
async def tasks(self):
    """Yield one greedy-read Task per configured input file.

    Each task covers all dates up to today (no lower bound).
    """
    end_date = today()
    for path in self.input_filenames:
        yield Task(function=get_data_greedy, args=(path, None, end_date))
def parse_csv(self, response):
    """Parse the state's CSV bulletin into per-city case/death rows.

    Special rows ("Outros países", "Ignorado", "Outros estados") are merged
    into a single "Importados/Indefinidos" pseudo-city. A final state-level
    row carries the grand totals.
    """
    reader = csv.DictReader(
        io.StringIO(response.body.decode("iso-8859-1")), delimiter=";"
    )
    city_name_key = "Município"
    city_code_key = "Cód IBGE"
    confirmed_key = "Mun_Total de casos"
    deaths_key = "Mun_Total de óbitos"
    capture_date = today()
    total_confirmed = total_deaths = 0
    # BUG FIX: default the special-row counters to 0. The original only bound
    # them inside the matching `if` branches, so a bulletin missing any of the
    # special rows raised NameError at the summation below.
    confirmed_imported = deaths_imported = 0
    confirmed_undefined = deaths_undefined = 0
    confirmed_other_states = deaths_other_states = 0
    for row in reader:
        city = row[city_name_key]
        city_ibge_code = int(row[city_code_key]) if row[city_code_key] else None
        confirmed = int(row[confirmed_key])
        deaths = int(row[deaths_key])
        if city == "Outros países":
            confirmed_imported = confirmed
            deaths_imported = deaths
            continue
        elif city == "Ignorado":
            confirmed_undefined = confirmed
            deaths_undefined = deaths
            continue
        elif city == "Outros estados":
            confirmed_other_states = confirmed
            deaths_other_states = deaths
            continue
        else:
            # Canonical city name from the IBGE-code lookup table.
            city = self.cities[city_ibge_code]
        total_confirmed += confirmed
        total_deaths += deaths
        yield {
            "city": city.city,
            "city_ibge_code": city_ibge_code,
            "confirmed": confirmed,
            "date": capture_date,
            "deaths": deaths,
            "place_type": "city",
            "state": self.name,
        }
    # Merge the three special categories into one pseudo-city row.
    confirmed = confirmed_imported + confirmed_undefined + confirmed_other_states
    deaths = deaths_imported + deaths_undefined + deaths_other_states
    total_confirmed += confirmed
    total_deaths += deaths
    yield {
        "city": "Importados/Indefinidos",
        "city_ibge_code": None,
        "confirmed": confirmed,
        "date": capture_date,
        "deaths": deaths,
        "place_type": "city",
        "state": self.name,
    }
    yield {
        "city": None,
        "city_ibge_code": self.state_ibge_code,
        "confirmed": total_confirmed,
        "date": capture_date,
        "deaths": total_deaths,
        "place_type": "state",
        "state": self.name,
    }
def read_files(input_filenames):
    """Yield the greedy-read result for each input file.

    No lower date bound is applied; the upper bound is today. Note this
    yields one result object per file, not the individual records.
    """
    end_date = today()
    for path in input_filenames:
        yield get_data_greedy(path, None, end_date)
def convert_file(filename):
    """Convert a per-(date, state, cause) deaths CSV into wide per-(date, state) rows.

    Reads `filename` with columns date/state/cause/total, groups by
    (state, day-of-year), and yields one dict per group containing the
    epidemiological weeks, the new deaths per cause/year, and the
    accumulated (year-to-date) deaths per cause/year and per-year totals.
    """
    # There are some missing data on the registral, so default all to None
    # Multiple passes to keep the same column ordering
    all_keys = []
    for prefix in PREFIX_CHOICES:
        all_keys.extend(year_causes_keys(prefix, YEAR_CHOICES))
        all_keys.extend([f"{prefix}_total_{year}" for year in YEAR_CHOICES])
    base_row = {}
    for key in all_keys:
        # Accumulated deaths_* columns default to 0; new_deaths_* to None.
        base_row[key] = 0 if key.startswith("deaths_") else None
    table_types = {
        "date": rows.fields.DateField,
        "state": rows.fields.TextField,
        "cause": rows.fields.TextField,
        "total": rows.fields.IntegerField,
    }
    table = rows.import_from_csv(filename, force_types=table_types)
    # Key by (state, same day projected onto 2020) so rows from different
    # years for the same calendar day group together; sort first so groupby
    # sees contiguous runs.
    row_key = lambda row: (row.state, datetime.date(2020, row.date.month, row.date.day))
    table = sorted(table, key=row_key)
    accumulated = Counter()
    last_day = today()
    for key, state_data in groupby(table, key=row_key):
        state, date = key
        row = {
            "date": date,
            "state": state,
        }
        try:
            this_day_in_2019 = datetime.date(2019, date.month, date.day)
        except ValueError:
            # This day does not exist in 2019 (29 February)
            yesterday = date - one_day
            this_day_in_2019 = datetime.date(2019, yesterday.month, yesterday.day)
        row["epidemiological_week_2019"] = brazilian_epidemiological_week(
            this_day_in_2019)[1]
        row["epidemiological_week_2020"] = brazilian_epidemiological_week(
            date)[1]
        row.update(base_row)
        # Zero sum of new deaths for this state in all years (will accumulate)
        for year in YEAR_CHOICES:
            accumulated[(year, state, "new-total")] = 0
        # For each death cause in this date/state, fill `row` and accumulate
        filled_causes = set()
        for item in state_data:
            cause = item.cause
            year = item.date.year
            key_new = get_death_cause_key("new_deaths", cause, year)
            new_deaths = item.total
            if key_new is None:
                # No output column exists for this (cause, year); only log
                # when data would have been lost.
                if new_deaths > 0:
                    # raise RuntimeError(f"Cannot have new_deaths > 0 when key for (new_deaths, {cause}, {year}) is None")
                    print(
                        f"ERROR converting {item}: new_deaths > 0 but key is None"
                    )
                    continue
                else:
                    continue
            accumulated_key = (year, state, cause)
            accumulated_key_total = (year,
                state, "total")
            accumulated_key_new_total = (year, state, "new-total")
            accumulated[accumulated_key] += new_deaths
            accumulated[accumulated_key_total] += new_deaths
            accumulated[accumulated_key_new_total] += new_deaths
            row[key_new] = new_deaths
            row[get_death_cause_key("deaths", cause, year)] = accumulated[accumulated_key]
            filled_causes.add((year, cause))
        # Fill other deaths_* (accumulated) values with the last available data
        # if not filled by the state_data for this date.
        for cause in RESPIRATORY_DEATH_CAUSES:
            for year in YEAR_CHOICES:
                if (year, cause) in filled_causes:
                    continue
                accumulated_key = (year, state, cause)
                key_name = get_death_cause_key("deaths", cause, year)
                if key_name is None:
                    continue
                row[key_name] = accumulated[accumulated_key]
        # Fill year totals (new and accumulated) for state
        for year in YEAR_CHOICES:
            # Current-year dates beyond today have no data yet: emit None
            # rather than a misleading 0 for new deaths.
            if year == last_day.year and date > last_day:
                new_total = None
            else:
                new_total = accumulated[(year, state, "new-total")]
            total = accumulated[(year, state, "total")]
            row[get_death_cause_key("new_deaths", "total", year)] = new_total
            row[get_death_cause_key("deaths", "total", year)] = total
        yield row