def get_place_comparison_df(place, place2, level='countries', priority='now'):
    """Return the gap-adjusted total-cases DataFrame comparing two places.

    The place with the shorter death history becomes the base place: the
    pair is reordered so the base comes first, the day gap between the two
    curves is estimated, and the adjusted frame is trimmed so it starts on
    the base place's first day with a confirmed case.
    """
    # df_orig = pd.read_csv(data_dir + 'total_cases_{}_normalized.csv'.format(level))
    df_all = Country.all_countries_as_df()

    # Rows for each place and the count of days with recorded deaths.
    rows_first = df_all[df_all['Name'] == place]
    rows_second = df_all[df_all['Name'] == place2]
    death_days_first = len(rows_first[rows_first['TotalDeaths'] > 0])
    death_days_second = len(rows_second[rows_second['TotalDeaths'] > 0])

    # `place` has to be the one with the smallest number of death values;
    # the concat order forces the place order in the combined frame.
    if death_days_first > death_days_second:
        place, place2 = place2, place
        df_pair = pd.concat([rows_second, rows_first])
    else:
        df_pair = pd.concat([rows_first, rows_second])

    gap_df = get_places_gap_df(df_pair, place, place2, priority)
    adjusted = get_total_cases_df_adjusted(df_pair, gap_df, place, place2)

    # Drop all days before the base place's first confirmed case.
    first_case_day = (
        df_pair.set_index('Name').loc[place, ].set_index('Day')['Total'] > 0).idxmax()
    adjusted = adjusted[adjusted['DayAdj'] >= first_case_day]
    return adjusted.reset_index()
def get_timeline_list(place, place2, level='countries'):
    """Build a merged, day-aligned timeline of epidemic events for two places.

    Milestone events (first confirmed case, first death, "today") are derived
    from the data; for countries, curated Situation Report highlights are
    merged in. Events landing on the same (Date, Name) are grouped, aligned on
    the adjusted day axis, and the two places' timelines are joined side by
    side (second place's columns suffixed with '2').

    Returns a list of dicts, one per adjusted day, trimmed to the window
    between the first event and the last death datum.
    """
    # df_orig = pd.read_csv(data_dir + 'total_cases_{}_normalized.csv'.format(level))
    df_orig = Country.all_countries_as_df()
    # to force place order
    df_orig_c1 = df_orig[df_orig['Name'] == place]
    df_orig_c2 = df_orig[df_orig['Name'] == place2]
    len_c1 = len(df_orig_c1[df_orig_c1['TotalDeaths'] > 0])
    len_c2 = len(df_orig_c2[df_orig_c2['TotalDeaths'] > 0])
    # place has to be the one with smallest number of values for Deaths
    if len_c1 > len_c2:
        place, place2 = place2, place
        df_orig = pd.concat([df_orig_c2, df_orig_c1])
    else:
        df_orig = pd.concat([df_orig_c1, df_orig_c2])
    df_places_gap = get_places_gap_df(df_orig, place, place2)
    df_total_cases_top = get_total_cases_df_adjusted(df_orig, df_places_gap,
                                                    place, place2)
    places = [place, place2]
    df_places_to_show = df_total_cases_top.loc[places, :]
    places_to_show = list(df_places_to_show.index.unique())

    # Collect milestone events as plain records and build the frame once:
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
    event_columns = ['Date', 'Name', 'Desc', 'FullText', 'Highlight']
    owd_events = []
    today = df_places_to_show['Date'].max()
    for c in places_to_show:
        df_place = df_places_to_show.loc[c, ]
        # First day where the running total turns positive.
        owd_events.append([(df_place.set_index('Date')['Total'] > 0).idxmax(),
                           c, '1st Confirmed Case', '', 1])
        owd_events.append([(df_place.set_index('Date')['TotalDeaths'] > 0).idxmax(),
                           c, '1st Death', '', 5])
        msg = """{} is approximately {} days behind {}'s epidemic progression. 
This is an estimate based on matching their death growth curves.""".format(
            place, abs(df_places_gap.loc[place2, 'gap']), place2)
        owd_events.append([today, c, 'Today', msg, 1])
    df_events_owd = pd.DataFrame(owd_events, columns=event_columns)
    df_events_owd['Source'] = 'Our World in Data'

    # Adding data from Situation Reports
    if level == 'countries':
        df_events_sr = pd.read_csv(data_dir +
                                   'situation_reports_countries_highlight.csv')
    else:
        df_events_sr = pd.DataFrame({'Name': []})
    df_events_sr = df_events_sr[df_events_sr['Name'].isin([place, place2])]
    df_events = pd.concat([df_events_owd, df_events_sr], sort=True)

    # Groups events that happen on the same day
    df_events_grouped = pd.DataFrame(
        df_events.groupby(['Date', 'Name'])['Desc'].apply(lambda x: "\n".join(x)))
    df_events_grouped['FullText'] = df_events.groupby(
        ['Date', 'Name'])['FullText'].apply(lambda x: "\n".join(x))
    df_events_grouped['Source'] = df_events.groupby(
        ['Date', 'Name'])['Source'].apply(lambda x: "\n".join(x))
    df_events_grouped['Highlight'] = df_events.groupby(
        ['Date', 'Name'])['Highlight'].max()

    # Attach the adjusted day axis to each event.
    df_events_adj = pd.merge(df_events_grouped,
                             df_places_to_show[['Date', 'DayAdj']].reset_index(),
                             how='left',
                             on=['Date', 'Name'])
    df_events_adj['Highlight'] = df_events_adj['Highlight'].astype(int)
    df_places_events = pd.merge(df_events_adj[[
        'Name', 'DayAdj', 'Desc', 'FullText', 'Highlight', 'Source'
    ]],
                                df_places_to_show.reset_index(),
                                how='outer',
                                on=['DayAdj', 'Name'])
    df_places_events = df_places_events.set_index('Name')

    # Join both places' rows side by side on the adjusted day.
    df_places_events_merged = pd.merge(
        df_places_events.loc[place, :].reset_index(),
        df_places_events.loc[place2, :].reset_index(),
        on='DayAdj',
        how='outer',
        suffixes=('', '2'))
    df_places_events_merged = df_places_events_merged.set_index(
        'DayAdj').sort_index()

    # Trim to the window between the first event and the last death datum.
    start_events = min(df_places_events_merged['Desc'].first_valid_index(),
                       df_places_events_merged['Desc2'].first_valid_index())
    end_events = max(df_places_events_merged['TotalDeaths'].last_valid_index(),
                     df_places_events_merged['TotalDeaths2'].last_valid_index())
    df_places_events_trimed = df_places_events_merged.loc[
        start_events:end_events]
    df_places_events_trimed = df_places_events_trimed[[
        'Name', 'Date', 'Desc', 'FullText', 'Highlight', 'Source', 'Total',
        'TotalDeaths', 'GrowthRate', 'GrowthRateDeaths', 'DaysToDouble',
        'DaysToDoubleDeaths', 'Date2', 'Name2', 'Desc2', 'FullText2',
        'Highlight2', 'Source2', 'Total2', 'TotalDeaths2', 'GrowthRate2',
        'GrowthRateDeaths2', 'DaysToDouble2', 'DaysToDoubleDeaths2',
    ]]
    # Fill place name for 1st place
    df_places_events_trimed['Name'] = df_places_events_trimed['Name'].ffill()
    # Fill place name for 2nd place
    df_places_events_trimed['Name2'] = df_places_events_trimed['Name2'].ffill()
    # Blank out remaining gaps so the serialized records are JSON-friendly.
    df_places_events_trimed = df_places_events_trimed.fillna('').replace(
        {'NaT': ''})
    return df_places_events_trimed.to_dict('records')
def get_df_similar_places(place, level='countries'):
    """Rank places whose death curves most resemble `place`'s.

    Pivots the data to one row per place / one column per day of total
    deaths, keeps only candidates that already had deaths `days_ahead` days
    before `place`'s first death, then slides `place`'s death window over
    each candidate curve to find the best Euclidean match.

    Returns a DataFrame indexed by Name, sorted by distance, with columns:
    'gap' (day offset of best match), 'dist' (Euclidean distance; -1 marks
    the base place itself) and 'Similarity' (1 for self, otherwise
    1 - dist / dist_max).
    """
    # if level == 'cities':
    #     (city similarity is precomputed in all_{level}_similarity.csv)
    # df_orig = pd.read_csv(data_dir + 'total_cases_{}_normalized.csv'.format(level))
    df_orig = Country.all_countries_as_df()
    df_orig_piv_day = df_orig.pivot(index='Name', columns='Day',
                                    values='TotalDeaths').fillna(0)
    sr_place = df_orig_piv_day.loc[place, ]
    # First day with at least one recorded death for the base place.
    place_start = (sr_place > 0).idxmax()
    days_ahead = 14  # if level == 'countries' else 5
    # Candidates: places that already had deaths `days_ahead` days earlier.
    df_places_ahead = df_orig_piv_day[
        df_orig_piv_day.loc[:, max(place_start - days_ahead, 0)] > 0.0]
    # DataFrame.append was removed in pandas 2.0; use pd.concat instead.
    # Reverse order to keep the base place on top (first row).
    df_places_rate_norm = pd.concat([
        df_orig_piv_day.loc[df_places_ahead.index, :],
        df_orig_piv_day.loc[[place], :],
    ]).iloc[::-1]

    # Window of the base place's curve starting at its first death.
    sr_place_compare = sr_place.loc[place_start:].dropna()

    # Base place compares to itself: gap 0, dist == -1 marks "self".
    gap_rows = [[place, 0.0, -1]]
    for other_place in df_places_rate_norm.index[1:]:
        sr_other_place = df_places_rate_norm.loc[other_place, ].fillna(0)
        # Slide the base window over the candidate curve, keep best match.
        min_dist = np.inf
        min_pos = 0
        for i in range(0, 1 + len(sr_other_place) - len(sr_place_compare)):
            dist = euclidean(sr_place_compare,
                             sr_other_place[i:i + len(sr_place_compare)])
            if dist < min_dist:
                min_dist = dist
                min_pos = i
        day_place2 = sr_other_place.index[min_pos]
        gap_rows.append([other_place, day_place2 - place_start, min_dist])

    df_places_gap = pd.DataFrame(
        gap_rows, columns=['Name', 'gap', 'dist']).set_index('Name')
    similar_places = df_places_gap.sort_values('dist')
    # Normalize against the distance to an all-zero curve; the self row
    # (dist < 0) gets maximum similarity 1.
    dist_max = euclidean(sr_place_compare, np.zeros(len(sr_place_compare)))
    similar_places['Similarity'] = similar_places['dist'].apply(
        lambda x: (1.0 - x / dist_max) if x >= 0 else 1)
    return similar_places