Example #1
0
def _apply_row_cfr_100(row):
    try:
        if pd.notnull(row['total_cases']) and row['total_cases'] >= 100:
            return row['cfr']
        print('\npd.NA\n')
        return pd.notna
    except Exception as e:
        trace.getException(e)
Example #2
0
def _load_merged(filename):
    """Load the release in *filename* and left-join the locations table on
    'countriesAndTerritories'."""
    try:
        print('start _load_merged %s' % colored("debug", 'blue'))
        data = load_data(filename)
        print('finish load_data from filename %s' % colored("debug", 'blue'))
        locations = load_locations()
        print('finish load_locations() %s' % colored("debug", 'blue'))
        merged = data.merge(
            locations, how='left', on=['countriesAndTerritories'])
        return merged
    except Exception as e:
        trace.getException(e)
Example #3
0
def load_locations():
    """Read the locations CSV and map its columns onto the ECDC schema."""
    try:
        print('start return load_locations %s' % colored('debug', 'blue'))
        # Rename the standardizer's columns to match the ECDC data frame.
        column_map = {
            'Country': 'countriesAndTerritories',
            'Our World In Data Name': 'location',
        }
        # keep_default_na=False: country names like "Namibia" must never
        # be parsed as missing values.
        locations = pd.read_csv(LOCATIONS_CSV_PATH, keep_default_na=False)
        return locations.rename(columns=column_map)
    except Exception as e:
        trace.getException(e)
Example #4
0
def check_data_correctness(filename):
    """Validate the merged ECDC dataset for *filename*.

    Checks that every country maps to an OWID location and that there are
    no duplicate (dateRep, location) rows; entities missing from the
    population dataset only produce a warning.

    Returns:
        True when no errors were found, False otherwise (including when
        the merged data could not be loaded at all).
    """
    errors = 0
    try:
        df_merged = _load_merged(filename)
    except Exception as e:
        trace.getException(e)
        # BUG FIX: the original logged the exception and fell through with
        # df_merged unbound, crashing with a NameError below. Fail the
        # check instead.
        return False
    # _load_merged swallows its own exceptions and returns None in that
    # case — treat that as a failed check too.
    if df_merged is None:
        return False
    print('finish _load_merged at check_data_correctness %s' %
          colored("debug", 'blue'))
    df_uniq = df_merged[['countriesAndTerritories', 'geoId',
                         'location']].drop_duplicates()
    if df_uniq['location'].isnull().any():
        print("\n" + ERROR + " Could not find OWID names for:")
        print(df_uniq[df_uniq['location'].isnull()])
        csv_path = os.path.join(TMP_PATH, 'ecdc.csv')
        # os.makedirs is portable and avoids spawning a shell
        # (was: os.system('mkdir -p ...')).
        os.makedirs(os.path.abspath(TMP_PATH), exist_ok=True)
        df_uniq[['countriesAndTerritories']] \
            .drop_duplicates() \
            .rename(columns={'countriesAndTerritories': 'Country'}) \
            .to_csv(csv_path, index=False)
        print(
            "\nSaved CSV file to be standardized at %s. \nRun it through the OWID standardizer and save in %s"
            % (colored(os.path.abspath(csv_path), 'magenta'),
               colored(os.path.abspath(LOCATIONS_CSV_PATH), 'magenta')))
        errors += 1
    # Drop missing locations for the further checks – that error is addressed above
    df_merged = df_merged.dropna(subset=['location'])
    if df_merged.duplicated(subset=['dateRep', 'location']).any():
        print("\n" + ERROR + " Found duplicate rows:")
        print(df_merged[df_merged.duplicated(subset=['dateRep', 'location'])])
        print(
            "\nPlease " +
            colored("fix or remove the duplicate rows", 'magenta') +
            " in the Excel file, and then save it again but under a new name, e.g. 2020-03-20-modified.xlsx"
        )
        print("Also please " +
              colored("note down any changes you made", 'magenta') +
              " in %s" % os.path.abspath(os.path.join(INPUT_PATH, 'NOTES.md')))
        errors += 1
    df_pop = load_population()
    pop_entity_diff = set(df_uniq['location']) - set(df_pop['location'])
    if len(pop_entity_diff) > 0:
        # this is not an error, so don't increment errors variable
        print("\n" + WARNING +
              " These entities were not found in the population dataset:")
        print(pop_entity_diff)
        print()
    # Idiom: the boolean expression replaces `True if ... else False`.
    return errors == 0
Example #5
0
def load_data(filename):
    """Load one ECDC release, coerce column types, and fill per-country
    date gaps so every country has a row for every day."""
    try:
        print('start load_data %s' % colored("debug", 'blue'))
        frame = read_file(filename)
        print("filename:%s" % colored(filename, 'cyan'))
        # Nullable integer dtype keeps missing counts as <NA>.
        for count_col in ('cases', 'deaths'):
            frame[count_col] = frame[count_col].astype("Int64")
        frame['dateRep'] = pd.to_datetime(
            frame['dateRep'], format="%d/%m/%Y", utc=True)
        # Resample each country's series to daily frequency to fill gaps.
        frame = (
            frame.set_index(['dateRep'])
            .groupby('countriesAndTerritories', as_index=True)
            .resample('D')
            .first()
            .drop(columns=['countriesAndTerritories'])
            .reset_index()
        )
        frame['dateRep'] = frame['dateRep'].dt.date
        return frame
    except Exception as e:
        trace.getException(e)
Example #6
0
        # Interactive prompt: let the user pick which release file to use
        # from the `filenames` list (defined earlier in this function).
        questions = [
            {
                'type': 'list',
                'name': 'filename',
                'message': 'Which release to use?',
                'choices': filenames,
                'default': 0
            }
            # List('filename',
            #         message='Which release to use?',
            #         choices=filenames,
            #         default=0)
        ]
        answers = prompt(questions)

        filename = answers['filename']

        # Abort (exit code 1) before exporting if validation fails.
        if check_data_correctness(filename):
            print("Data correctness check %s.\n" % colored("passed", 'green'))
        else:
            print("Data correctness check %s.\n" % colored("failed", 'red'))
            sys.exit(1)

        # NOTE(review): assumes export() returns truthy on success — confirm.
        if export(filename):
            print("Successfully exported CSVs to %s\n" %
                  colored(os.path.abspath(OUTPUT_PATH), 'magenta'))
        else:
            print("ECDC Export failed.\n")
    except Exception as e:
        trace.getException(e)