def _apply_row_cfr_100(row):
    try:
        # Only report a case fatality rate once a country has at least 100 cases
        if pd.notnull(row['total_cases']) and row['total_cases'] >= 100:
            return row['cfr']
        # pd.NA (the missing-value singleton), not pd.notna (a function object)
        return pd.NA
    except Exception as e:
        trace.getException(e)
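
# Sketch (not part of the pipeline): the helper above is meant to be applied
# row-wise to a frame that has 'total_cases' and 'cfr' columns, e.g.
#
#     df['cfr_100_cases'] = df.apply(_apply_row_cfr_100, axis=1)
#
# ('cfr_100_cases' is an illustrative column name) so the CFR is only
# reported once a country has reached 100 confirmed cases.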
def _load_merged(filename):
    try:
        print('start _load_merged %s' % colored('debug', 'blue'))
        df_data = load_data(filename)
        print('finish load_data from filename %s' % colored('debug', 'blue'))
        df_locs = load_locations()
        print('finish load_locations() %s' % colored('debug', 'blue'))
        return df_data.merge(
            df_locs,
            how='left',
            on=['countriesAndTerritories']
        )
    except Exception as e:
        trace.getException(e)
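
# Note: the left merge keeps every ECDC row; countries missing from the
# locations file come through with location == NaN, which is exactly what
# check_data_correctness() looks for below.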
def load_locations():
    try:
        print('start return load_locations %s' % colored('debug', 'blue'))
        # keep_default_na=False prevents values such as the string "NA"
        # from being parsed as missing values
        return pd.read_csv(
            LOCATIONS_CSV_PATH,
            keep_default_na=False
        ).rename(columns={
            'Country': 'countriesAndTerritories',
            'Our World In Data Name': 'location'
        })
    except Exception as e:
        trace.getException(e)
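
# The locations CSV is expected to carry (at least) the two columns that the
# rename above maps onto the ECDC column names; illustratively:
#
#     Country,Our World In Data Name
#     Afghanistan,Afghanistan
#     ...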
def check_data_correctness(filename):
    errors = 0

    try:
        df_merged = _load_merged(filename)
    except Exception as e:
        trace.getException(e)
        # Without the merged dataset none of the checks below can run
        return False
    print('finish _load_merged at check_data_correctness %s' % colored('debug', 'blue'))

    # Check for countries that haven't been mapped to an OWID location name
    df_uniq = df_merged[['countriesAndTerritories', 'geoId', 'location']].drop_duplicates()
    if df_uniq['location'].isnull().any():
        print("\n" + ERROR + " Could not find OWID names for:")
        print(df_uniq[df_uniq['location'].isnull()])
        csv_path = os.path.join(TMP_PATH, 'ecdc.csv')
        os.system('mkdir -p %s' % os.path.abspath(TMP_PATH))
        df_uniq[['countriesAndTerritories']] \
            .drop_duplicates() \
            .rename(columns={'countriesAndTerritories': 'Country'}) \
            .to_csv(csv_path, index=False)
        print(
            "\nSaved CSV file to be standardized at %s.\nRun it through the OWID standardizer and save in %s"
            % (colored(os.path.abspath(csv_path), 'magenta'),
               colored(os.path.abspath(LOCATIONS_CSV_PATH), 'magenta')))
        errors += 1

    # Drop missing locations for the further checks – that error is addressed above
    df_merged = df_merged.dropna(subset=['location'])

    if df_merged.duplicated(subset=['dateRep', 'location']).any():
        print("\n" + ERROR + " Found duplicate rows:")
        print(df_merged[df_merged.duplicated(subset=['dateRep', 'location'])])
        print(
            "\nPlease " + colored("fix or remove the duplicate rows", 'magenta') +
            " in the Excel file, and then save it again but under a new name, e.g. 2020-03-20-modified.xlsx")
        print(
            "Also please " + colored("note down any changes you made", 'magenta') +
            " in %s" % os.path.abspath(os.path.join(INPUT_PATH, 'NOTES.md')))
        errors += 1

    df_pop = load_population()
    pop_entity_diff = set(df_uniq['location']) - set(df_pop['location'])
    if len(pop_entity_diff) > 0:
        # This is a warning, not an error, so don't increment the errors counter
        print("\n" + WARNING + " These entities were not found in the population dataset:")
        print(pop_entity_diff)
        print()

    return errors == 0
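
# Note: DataFrame.duplicated(subset=...) marks every repeat after the first
# occurrence (keep='first' is the default), so the printout above lists the
# extra rows to delete, not the rows to keep.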
def load_data(filename):
    try:
        print('start load_data %s' % colored('debug', 'blue'))
        df = read_file(filename)
        print('filename:%s' % colored(filename, 'cyan'))
        # Use nullable integer types so missing counts survive the cast
        df['cases'] = df['cases'].astype("Int64")
        df['deaths'] = df['deaths'].astype("Int64")
        df['dateRep'] = pd.to_datetime(df['dateRep'], format="%d/%m/%Y", utc=True)
        # Fill time gaps: resample each country to daily frequency so that
        # unreported days appear as rows with missing cases/deaths
        df = df.set_index(['dateRep']) \
            .groupby('countriesAndTerritories', as_index=True) \
            .resample('D').first() \
            .drop(columns=['countriesAndTerritories']) \
            .reset_index()
        df['dateRep'] = df['dateRep'].dt.date
        return df
    except Exception as e:
        trace.getException(e)
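
# Sketch of the gap-filling above (illustrative data): if a country reports
# on 2020-03-01 and 2020-03-03 but not 2020-03-02, resample('D').first()
# inserts the missing day with <NA> counts:
#
#     countriesAndTerritories    dateRep      cases
#     Afghanistan                2020-03-01       5
#     Afghanistan                2020-03-02    <NA>
#     Afghanistan                2020-03-03       7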
try:
    questions = [
        {
            'type': 'list',
            'name': 'filename',
            'message': 'Which release to use?',
            'choices': filenames,
            'default': 0
        }
    ]
    answers = prompt(questions)
    filename = answers['filename']

    if check_data_correctness(filename):
        print("Data correctness check %s.\n" % colored("passed", 'green'))
    else:
        print("Data correctness check %s.\n" % colored("failed", 'red'))
        sys.exit(1)

    if export(filename):
        print("Successfully exported CSVs to %s\n" % colored(os.path.abspath(OUTPUT_PATH), 'magenta'))
    else:
        print("ECDC Export failed.\n")
except Exception as e:
    trace.getException(e)
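
# Note: sys.exit(1) raises SystemExit, which subclasses BaseException rather
# than Exception, so the failure exit above is not swallowed by the handler.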