Example #1
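# The classes below rely on the imports and project-local objects shown here.
# The standard-library and third-party imports are the ones the code actually
# uses (timedelta is only needed by the daterange sketch further down).
# `Session` and the ORM model classes are defined outside this snippet; the
# SQLAlchemy setup below is an assumed sketch of their shape, with guessed
# column types and a placeholder connection string. The remaining models
# (OWDData, TimeSeriesData, CountryAggregated, WorldwideAggregated) would
# follow the same pattern.
import csv
from datetime import date, datetime, timedelta
from http import HTTPStatus

import datapackage
import requests
from sqlalchemy import Column, Date, Float, Integer, String, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

engine = create_engine("sqlite:///covid.db")  # placeholder connection string
Session = sessionmaker(bind=engine)
Base = declarative_base()


class JohnHopkinsData(Base):
    """Assumed shape of the daily-report model used by JohnHopkinsDataCrawler."""
    __tablename__ = "john_hopkins_data"
    id = Column(Integer, primary_key=True)
    fips = Column(String, nullable=True)
    date = Column(Date)
    admin2 = Column(String)
    province_state = Column(String)
    country_region = Column(String)
    last_update = Column(String)
    lat = Column(Float)
    long = Column(Float)
    confirmed = Column(Integer)
    death = Column(Integer)
    recovered = Column(Integer)
    combined_key = Column(String)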


class JohnHopkinsDataCrawler:
    def __init__(self):
        self.session = Session()
        # Earliest date from which the daily reports keep a fixed CSV structure
        self.start_date = date(2020, 3, 22)
        self.end_date = date.today()

    def crawl_data(self):
        self.session.query(JohnHopkinsData).delete(synchronize_session=False)
        self.session.commit()
        for single_date in daterange(self.start_date, self.end_date):
            self.crawl_individual_csv(single_date)
        print("Success crawl raw data from JohnHopkins Repo")

    def crawl_individual_csv(self, date_to_crawl: date):
        csv_base_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports"

        date_str = date_to_crawl.strftime("%m-%d-%Y")
        csv_file = f"{csv_base_url}/{date_str}.csv"
        print(f"[START]Crawl data for {date_str}")

        try:
            data_to_store = []
            with requests.get(csv_file, stream=True) as f:
                if f.status_code != HTTPStatus.NOT_FOUND:
                    lines = (line.decode('utf-8') for line in f.iter_lines())
                    idx = 0

                    for row in csv.reader(lines):
                        if idx > 0:
                            try:
                                data_to_store.append(
                                    JohnHopkinsData(
                                        fips=(row[0]
                                              if row[0] != '' else None),
                                        date=date_to_crawl,
                                        admin2=row[1],
                                        province_state=row[2],
                                        country_region=row[3],
                                        last_update=row[4],
                                        lat=(row[5] if row[5] != '' else 0.0),
                                        long=(row[6] if row[6] != '' else 0.0),
                                        confirmed=(row[7]
                                                   if row[7] != '' else 0),
                                        death=(row[8] if row[8] != '' else 0),
                                        recovered=(row[9]
                                                   if row[9] != '' else 0),
                                        combined_key=row[10]))
                            except Exception as e:
                                print(e)
                        idx += 1

                    self.session.add_all(data_to_store)
                    self.session.commit()
                    print(f"[END]Success crawl {idx} data for {date_to_crawl}")
                else:
                    print(f"Can't find data for {date_str}")
        except Exception as e:
            print(e)
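

# `crawl_data` above iterates with `daterange`, which is not defined in this
# snippet. A minimal sketch, assuming it yields each date from start_date
# (inclusive) up to end_date (exclusive); the real helper may differ.
# `timedelta` comes from the imports at the top.
def daterange(start_date: date, end_date: date):
    for offset in range((end_date - start_date).days):
        yield start_date + timedelta(days=offset)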


class OWDCrawler:
    def __init__(self):
        self.session = Session()

    def crawl_data(self):
        print(f"[START]Crawl data from OWD Dataset")
        file_url = "https://covid.ourworldindata.org/data/owid-covid-data.csv"
        # clean up existing table first
        self.session.query(OWDData).delete(synchronize_session=False)
        with requests.get(file_url, stream=True) as f:
            if f.status_code != HTTPStatus.NOT_FOUND:
                lines = (line.decode('utf-8') for line in f.iter_lines())
                idx = 0
                data_to_store = []
                for row in csv.reader(lines):
                    if idx > 0:
                        data_to_store.append(
                            OWDData(iso_code=row[0],
                                    continent=row[1],
                                    location=row[2],
                                    date=row[3],
                                    total_cases=row[4],
                                    new_cases=row[5],
                                    total_deaths=row[6],
                                    new_deaths=row[7],
                                    total_cases_per_million=parseToFloat(
                                        row[8], 0.0),
                                    new_cases_per_million=parseToFloat(
                                        row[9], 0.0),
                                    total_deaths_per_million=parseToFloat(
                                        row[10], 0.0),
                                    new_deaths_per_million=parseToFloat(
                                        row[11], 0.0),
                                    total_tests=parseToFloat(row[12], 0.0),
                                    new_tests=parseToFloat(row[13], 0.0),
                                    total_tests_per_thousand=parseToFloat(
                                        row[14], 0.0),
                                    new_tests_per_thousand=parseToFloat(
                                        row[15], 0.0),
                                    tests_unit=row[16],
                                    last_updated=datetime.now()))
                    idx += 1
                self.session.add_all(data_to_store)
                self.session.commit()
                print(f"[END]Success crawl {idx} data from OWD Dataset")


class DatahubCrawler:
    def __init__(self):
        data_url = 'https://datahub.io/core/covid-19/datapackage.json'

        # load the Data Package descriptor
        package = datapackage.Package(data_url)

        # the relevant tabular resources are picked out in the loop below
        resources = package.resources
        self.time_series_csv = ""
        self.country_aggregate_csv = ""
        self.world_aggregate_csv = ""
        self.session = Session()
        print("Fetching dataset from datahub")
        for resource in resources:
            if resource.tabular:
                if resource.descriptor.get(
                        "name") == "time-series-19-covid-combined":
                    self.time_series_csv = resource.descriptor['path']
                if resource.descriptor.get("name") == "countries-aggregated":
                    self.country_aggregate_csv = resource.descriptor['path']
                if resource.descriptor.get("name") == "worldwide-aggregated":
                    self.world_aggregate_csv = resource.descriptor['path']

    def crawl_data(self):
        self.crawl_time_series_data(self.time_series_csv)
        self.crawl_country_aggregated_data(self.country_aggregate_csv)
        self.crawl_world_aggregated_data(self.world_aggregate_csv)

    def crawl_time_series_data(self, file_url: str):
        idx = 0
        tsc_data = []
        print("[START]Insert time series data")
        print(f"Crawl data using {self.time_series_csv}")
        with requests.get(file_url, stream=True) as tsc:
            lines = (line.decode('utf-8') for line in tsc.iter_lines())
            self.session.query(TimeSeriesData).delete()
            for row in csv.reader(lines):
                if idx > 0 and len(row) > 0:
                    confirmed = (row[5] if row[5] != '' else '0')
                    recovered = (row[6] if row[6] != '' else '0')
                    death = (row[7] if row[7] != '' else '0')
                    tsc_data.append(
                        TimeSeriesData(date=row[0],
                                       country=row[1],
                                       state=row[2],
                                       lat=row[3],
                                       long=row[4],
                                       confirmed=confirmed,
                                       recovered=recovered,
                                       death=death))
                idx += 1
            self.session.add_all(tsc_data)
            self.session.commit()
        print(f"[END]Insert time series data. Success inserting {idx} records")

    def crawl_country_aggregated_data(self, file_url: str):
        ca_data = []
        idx = 0
        print("[START]Insert country aggregated data")
        print(f"Crawl data using {self.country_aggregate_csv}")
        with requests.get(file_url, stream=True) as ca:
            lines = (line.decode('utf-8') for line in ca.iter_lines())
            self.session.query(CountryAggregated).delete()
            for row in csv.reader(lines):
                if idx > 0 and len(row) > 0:
                    confirmed = (row[2] if row[2] != '' else '0')
                    recovered = (row[3] if row[3] != '' else '0')
                    death = (row[4] if row[4] != '' else '0')
                    ca_data.append(
                        CountryAggregated(date=row[0],
                                          country=row[1],
                                          confirmed=confirmed,
                                          recovered=recovered,
                                          death=death))
                idx += 1
            self.session.add_all(ca_data)
            self.session.commit()
        print(
            f"[END] Country aggregated data: {len(ca_data)} records inserted"
        )

    def crawl_world_aggregated_data(self, file_url: str):
        wwa_data = []
        idx = 0
        print("[START]Insert world aggregated data")
        print(f"Crawl data using {self.world_aggregate_csv}")
        with requests.get(file_url, stream=True) as wwa:
            lines = (line.decode('utf-8') for line in wwa.iter_lines())
            self.session.query(WorldwideAggregated).delete()
            for row in csv.reader(lines):
                if idx > 0 and len(row) > 0:
                    confirmed = (row[1] if row[1] != '' else '0')
                    recovered = (row[2] if row[2] != '' else '0')
                    death = (row[3] if row[3] != '' else '0')
                    wwa_data.append(
                        WorldwideAggregated(date=row[0],
                                            confirmed=confirmed,
                                            recovered=recovered,
                                            death=death))
                idx += 1
            self.session.add_all(wwa_data)
            self.session.commit()
        print(
            f"[END] World aggregated data: {len(wwa_data)} records inserted"
        )
        print("Finished running the DataHub crawler")