import csv
from datetime import date, datetime, timedelta
from http import HTTPStatus

import datapackage
import requests

# Session and the ORM models (JohnHopkinsData, OWDData, TimeSeriesData,
# CountryAggregated, WorldwideAggregated) are defined elsewhere in this
# project; import them from the module that declares them, e.g.:
# from models import (Session, JohnHopkinsData, OWDData, TimeSeriesData,
#                     CountryAggregated, WorldwideAggregated)


class JohnHopkinsDataCrawler:
    """Crawls the Johns Hopkins CSSE daily-report CSVs into the database."""

    def __init__(self):
        self.session = Session()
        # First date on which the daily reports use the current, fixed column
        # structure; everything from then until today is crawled.
        self.start_date = date(2020, 3, 22)
        self.end_date = date.today()

    def crawl_data(self):
        # Clear out previously crawled rows before re-importing.
        self.session.query(JohnHopkinsData).delete(synchronize_session=False)
        self.session.commit()
        for single_date in daterange(self.start_date, self.end_date):
            self.crawl_individual_csv(single_date)
        print("Successfully crawled raw data from the Johns Hopkins repo")

    def crawl_individual_csv(self, date_to_crawl: date):
        csv_base_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports"
        date_str = date_to_crawl.strftime("%m-%d-%Y")
        csv_file = f"{csv_base_url}/{date_str}.csv"
        print(f"[START] Crawling data for {date_str}")
        try:
            data_to_store = []
            with requests.get(csv_file, stream=True) as response:
                if response.status_code != HTTPStatus.NOT_FOUND:
                    lines = (line.decode('utf-8') for line in response.iter_lines())
                    for idx, row in enumerate(csv.reader(lines)):
                        if idx == 0:
                            continue  # skip the header row
                        try:
                            data_to_store.append(
                                JohnHopkinsData(
                                    fips=(row[0] if row[0] != '' else None),
                                    date=date_to_crawl,
                                    admin2=row[1],
                                    province_state=row[2],
                                    country_region=row[3],
                                    last_update=row[4],
                                    lat=(row[5] if row[5] != '' else 0.0),
                                    long=(row[6] if row[6] != '' else 0.0),
                                    confirmed=(row[7] if row[7] != '' else 0),
                                    death=(row[8] if row[8] != '' else 0),
                                    recovered=(row[9] if row[9] != '' else 0),
                                    combined_key=row[10]))
                        except Exception as e:
                            print(e)
                    self.session.add_all(data_to_store)
                    self.session.commit()
                    print(f"[END] Successfully crawled {len(data_to_store)} rows for {date_to_crawl}")
                else:
                    print(f"Can't find data for {date_str}")
        except Exception as e:
            print(e)
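
# crawl_data() above iterates over daterange(), a helper that is not defined in
# this file. A minimal sketch of what it presumably does (an assumption, not
# necessarily the project's actual implementation): yield every date from
# start_date up to, but not including, end_date.
def daterange(start_date: date, end_date: date):
    """Yield each date in [start_date, end_date)."""
    for offset in range((end_date - start_date).days):
        yield start_date + timedelta(days=offset)
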
class OWDCrawler:
    """Crawls the Our World in Data (OWD) COVID-19 CSV into the database."""

    def __init__(self):
        self.session = Session()

    def crawl_data(self):
        print("[START] Crawling data from the OWD dataset")
        file_url = "https://covid.ourworldindata.org/data/owid-covid-data.csv"
        # Clean up the existing table first.
        self.session.query(OWDData).delete(synchronize_session=False)
        with requests.get(file_url, stream=True) as response:
            if response.status_code != HTTPStatus.NOT_FOUND:
                lines = (line.decode('utf-8') for line in response.iter_lines())
                data_to_store = []
                for idx, row in enumerate(csv.reader(lines)):
                    if idx == 0:
                        continue  # skip the header row
                    data_to_store.append(
                        OWDData(iso_code=row[0],
                                continent=row[1],
                                location=row[2],
                                date=row[3],
                                total_cases=row[4],
                                new_cases=row[5],
                                total_deaths=row[6],
                                new_deaths=row[7],
                                total_cases_per_million=parseToFloat(row[8], 0.0),
                                new_cases_per_million=parseToFloat(row[9], 0.0),
                                total_deaths_per_million=parseToFloat(row[10], 0.0),
                                new_deaths_per_million=parseToFloat(row[11], 0.0),
                                total_tests=parseToFloat(row[12], 0.0),
                                new_tests=parseToFloat(row[13], 0.0),
                                total_tests_per_thousand=parseToFloat(row[14], 0.0),
                                new_tests_per_thousand=parseToFloat(row[15], 0.0),
                                tests_unit=row[16],
                                last_updated=datetime.now()))
                self.session.add_all(data_to_store)
                self.session.commit()
                print(f"[END] Successfully crawled {len(data_to_store)} rows from the OWD dataset")
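
# OWDCrawler.crawl_data() relies on a parseToFloat() helper that is not defined
# in this file. A minimal sketch under the assumption that it simply falls back
# to the given default when the CSV field is empty or not numeric:
def parseToFloat(value: str, default: float) -> float:
    """Parse a CSV field as a float, returning `default` if it is empty or invalid."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return default
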
class DatahubCrawler:
    """Crawls the DataHub covid-19 data package (combined time series plus
    country and worldwide aggregates) into the database."""

    def __init__(self):
        data_url = 'https://datahub.io/core/covid-19/datapackage.json'
        # Load the Data Package descriptor and keep only its tabular resources.
        package = datapackage.Package(data_url)
        resources = package.resources
        self.time_series_csv = ""
        self.country_aggregate_csv = ""
        self.world_aggregate_csv = ""
        self.session = Session()
        print("Fetching dataset from DataHub")
        for resource in resources:
            if resource.tabular:
                name = resource.descriptor.get("name")
                if name == "time-series-19-covid-combined":
                    self.time_series_csv = resource.descriptor['path']
                elif name == "countries-aggregated":
                    self.country_aggregate_csv = resource.descriptor['path']
                elif name == "worldwide-aggregated":
                    self.world_aggregate_csv = resource.descriptor['path']

    def crawl_data(self):
        self.crawl_time_series_data(self.time_series_csv)
        self.crawl_country_aggregated_data(self.country_aggregate_csv)
        self.crawl_world_aggregated_data(self.world_aggregate_csv)
        print("Finished running the DataHub crawler")

    def crawl_time_series_data(self, file_url: str):
        tsc_data = []
        print("[START] Inserting time series data")
        print(f"Crawling data using {self.time_series_csv}")
        with requests.get(file_url, stream=True) as response:
            lines = (line.decode('utf-8') for line in response.iter_lines())
            self.session.query(TimeSeriesData).delete()
            for idx, row in enumerate(csv.reader(lines)):
                if idx == 0 or len(row) == 0:
                    continue  # skip the header row and blank lines
                confirmed = (row[5] if row[5] != '' else '0')
                recovered = (row[6] if row[6] != '' else '0')
                death = (row[7] if row[7] != '' else '0')
                tsc_data.append(
                    TimeSeriesData(date=row[0],
                                   country=row[1],
                                   state=row[2],
                                   lat=row[3],
                                   long=row[4],
                                   confirmed=confirmed,
                                   recovered=recovered,
                                   death=death))
            self.session.add_all(tsc_data)
            self.session.commit()
        print(f"[END] Time series data inserted. Successfully inserted {len(tsc_data)} records")

    def crawl_country_aggregated_data(self, file_url: str):
        ca_data = []
        print("[START] Inserting country aggregated data")
        print(f"Crawling data using {self.country_aggregate_csv}")
        with requests.get(file_url, stream=True) as response:
            lines = (line.decode('utf-8') for line in response.iter_lines())
            self.session.query(CountryAggregated).delete()
            for idx, row in enumerate(csv.reader(lines)):
                if idx == 0 or len(row) == 0:
                    continue  # skip the header row and blank lines
                confirmed = (row[2] if row[2] != '' else '0')
                recovered = (row[3] if row[3] != '' else '0')
                death = (row[4] if row[4] != '' else '0')
                ca_data.append(
                    CountryAggregated(date=row[0],
                                      country=row[1],
                                      confirmed=confirmed,
                                      recovered=recovered,
                                      death=death))
            self.session.add_all(ca_data)
            self.session.commit()
        print(f"[END] Country aggregated data inserted. Successfully inserted {len(ca_data)} records")

    def crawl_world_aggregated_data(self, file_url: str):
        wwa_data = []
        print("[START] Inserting worldwide aggregated data")
        print(f"Crawling data using {self.world_aggregate_csv}")
        with requests.get(file_url, stream=True) as response:
            lines = (line.decode('utf-8') for line in response.iter_lines())
            self.session.query(WorldwideAggregated).delete()
            for idx, row in enumerate(csv.reader(lines)):
                if idx == 0 or len(row) == 0:
                    continue  # skip the header row and blank lines
                confirmed = (row[1] if row[1] != '' else '0')
                recovered = (row[2] if row[2] != '' else '0')
                death = (row[3] if row[3] != '' else '0')
                wwa_data.append(
                    WorldwideAggregated(date=row[0],
                                        confirmed=confirmed,
                                        recovered=recovered,
                                        death=death))
            self.session.add_all(wwa_data)
            self.session.commit()
        print(f"[END] Worldwide aggregated data inserted. Successfully inserted {len(wwa_data)} records")
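
# The entry point that wires these crawlers together is not shown in this file.
# A minimal, assumed usage example: instantiate each crawler and run it in turn.
if __name__ == "__main__":
    for crawler in (JohnHopkinsDataCrawler(), OWDCrawler(), DatahubCrawler()):
        crawler.crawl_data()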