コード例 #1
0
ファイル: compare.py プロジェクト: Luca-A-Magalhaes/himcd
def get_place_comparison_df(place, place2, level='countries', priority='now'):
    """Build the day-aligned comparison table for two places.

    The per-day table comes from ``Country.all_countries_as_df()``. Only the
    rows whose ``Name`` equals ``place`` or ``place2`` are kept, ordered so
    that the place with fewer rows where ``TotalDeaths > 0`` comes first (the
    two arguments are swapped internally when needed).

    Args:
        place: Value matched against the ``Name`` column.
        place2: Second value matched against the ``Name`` column.
        level: Accepted for interface compatibility; not read by this
            function.
        priority: Forwarded unchanged to ``get_places_gap_df``.

    Returns:
        The frame produced by ``get_total_cases_df_adjusted``, restricted to
        rows whose ``DayAdj`` is not earlier than the first ``Day`` on which
        the (possibly swapped) first place has ``Total > 0``, with its index
        reset.
    """
    everything = Country.all_countries_as_df()

    rows_first = everything[everything['Name'] == place]
    rows_second = everything[everything['Name'] == place2]

    death_rows_first = len(rows_first[rows_first['TotalDeaths'] > 0])
    death_rows_second = len(rows_second[rows_second['TotalDeaths'] > 0])

    # The first place must be the one with the fewest rows that have deaths.
    if death_rows_first > death_rows_second:
        place, place2 = place2, place
        rows_first, rows_second = rows_second, rows_first

    df_pair = pd.concat([rows_first, rows_second])

    gap_df = get_places_gap_df(df_pair, place, place2, priority)
    adjusted = get_total_cases_df_adjusted(df_pair, gap_df, place, place2)

    # idxmax on the boolean mask yields the first Day label where Total > 0.
    first_place_rows = df_pair.set_index('Name').loc[place, :]
    has_cases = first_place_rows.set_index('Day')['Total'] > 0
    first_case_day = has_cases.idxmax()

    from_first_case = adjusted['DayAdj'] >= first_case_day
    return adjusted[from_first_case].reset_index()
コード例 #2
0
ファイル: compare.py プロジェクト: Luca-A-Magalhaes/himcd
def get_timeline_list(place, place2, level='countries'):
    """Return the side-by-side event timeline of two places as records.

    The per-day table from ``Country.all_countries_as_df()`` is reduced to the
    two places and ordered so that the place with fewer rows where
    ``TotalDeaths > 0`` comes first (the arguments are swapped internally when
    needed). Three generated events per place ("1st Confirmed Case",
    "1st Death", "Today") are combined with the highlighted situation-report
    events (only read when ``level == 'countries'``), grouped per date and
    place, aligned on ``DayAdj`` and merged into one row per adjusted day.

    Args:
        place: Value matched against the ``Name`` column.
        place2: Second value matched against the ``Name`` column.
        level: ``'countries'`` loads
            ``situation_reports_countries_highlight.csv`` from ``data_dir``;
            any other value skips the situation-report events.

    Returns:
        A list of dicts (``DataFrame.to_dict('records')``), one per row of the
        trimmed timeline. Keys are ``Name``, ``Date``, ``Desc``, ``FullText``,
        ``Highlight``, ``Source``, ``Total``, ``TotalDeaths``, ``GrowthRate``,
        ``GrowthRateDeaths``, ``DaysToDouble``, ``DaysToDoubleDeaths`` for the
        first place and the same names suffixed with ``2`` for the second.
        Missing values are replaced by empty strings.
    """
    df_orig = Country.all_countries_as_df()

    # Keep only the two places, in a fixed order.
    df_orig_c1 = df_orig[df_orig['Name'] == place]
    df_orig_c2 = df_orig[df_orig['Name'] == place2]

    len_c1 = len(df_orig_c1[df_orig_c1['TotalDeaths'] > 0])
    len_c2 = len(df_orig_c2[df_orig_c2['TotalDeaths'] > 0])

    # `place` has to be the one with the smallest number of rows with deaths.
    if len_c1 > len_c2:
        place, place2 = place2, place
        df_orig_c1, df_orig_c2 = df_orig_c2, df_orig_c1
    df_orig = pd.concat([df_orig_c1, df_orig_c2])

    df_places_gap = get_places_gap_df(df_orig, place, place2)

    df_total_cases_top = get_total_cases_df_adjusted(df_orig, df_places_gap,
                                                     place, place2)

    # The code below relies on this frame being indexed by place name and
    # carrying at least the 'Date' and 'DayAdj' columns.
    df_places_to_show = df_total_cases_top.loc[[place, place2], :]
    places_to_show = list(df_places_to_show.index.unique())

    today = df_places_to_show['Date'].max()

    # The message is the same for every place, so build it once. The literal
    # (including its embedded newline and indentation) is emitted at runtime
    # and is therefore kept exactly as it was.
    msg = (
        "{} is approximately {} days behind {}'s epidemic progression. \n"
        "                      This is an estimate based on matching their "
        "death growth curves."
    ).format(place, abs(df_places_gap.loc[place2, 'gap']), place2)

    # Collect the generated events as plain rows and build the frame once;
    # DataFrame.append no longer exists in pandas >= 2.0 and copied the whole
    # frame on every call.
    event_columns = ['Date', 'Name', 'Desc', 'FullText', 'Highlight']
    event_rows = []
    for c in places_to_show:
        df_place_by_date = df_places_to_show.loc[c, :].set_index('Date')

        # idxmax on a boolean mask gives the first Date where it is True.
        first_case_date = (df_place_by_date['Total'] > 0).idxmax()
        first_death_date = (df_place_by_date['TotalDeaths'] > 0).idxmax()

        event_rows.append([first_case_date, c, '1st Confirmed Case', '', 1])
        event_rows.append([first_death_date, c, '1st Death', '', 5])
        event_rows.append([today, c, 'Today', msg, 1])

    df_events_owd = pd.DataFrame(event_rows, columns=event_columns)
    df_events_owd['Source'] = 'Our World in Data'

    # Adding data from Situation Reports
    if level == 'countries':
        df_events_sr = pd.read_csv(data_dir +
                                   'situation_reports_countries_highlight.csv')
    else:
        df_events_sr = pd.DataFrame({'Name': []})

    df_events_sr = df_events_sr[df_events_sr['Name'].isin([place, place2])]

    df_events = pd.concat([df_events_owd, df_events_sr], sort=True)

    # Group events that happen on the same day for the same place: text
    # fields are joined with newlines, Highlight keeps the largest value.
    events_by_day = df_events.groupby(['Date', 'Name'])

    df_events_grouped = pd.DataFrame(
        events_by_day['Desc'].apply(lambda x: "\n".join(x)))
    df_events_grouped['FullText'] = events_by_day['FullText'].apply(
        lambda x: "\n".join(x))
    df_events_grouped['Source'] = events_by_day['Source'].apply(
        lambda x: "\n".join(x))
    df_events_grouped['Highlight'] = events_by_day['Highlight'].max()

    # Attach the adjusted day to each grouped event.
    df_events_adj = pd.merge(df_events_grouped,
                             df_places_to_show[['Date',
                                                'DayAdj']].reset_index(),
                             how='left',
                             on=['Date', 'Name'])

    df_events_adj['Highlight'] = df_events_adj['Highlight'].astype(int)

    # Outer merge so that days without events and events without data rows
    # are both kept.
    df_places_events = pd.merge(df_events_adj[[
        'Name', 'DayAdj', 'Desc', 'FullText', 'Highlight', 'Source'
    ]],
                                df_places_to_show.reset_index(),
                                how='outer',
                                on=['DayAdj', 'Name'])

    df_places_events = df_places_events.set_index('Name')

    # Put the two places side by side; the second place's columns get the
    # suffix '2'.
    df_places_events_merged = pd.merge(
        df_places_events.loc[place, :].reset_index(),
        df_places_events.loc[place2, :].reset_index(),
        on='DayAdj',
        how='outer',
        suffixes=('', '2'))

    df_places_events_merged = df_places_events_merged.set_index(
        'DayAdj').sort_index()

    # Trim to the span from the first event of either place to the last day
    # on which either place has a TotalDeaths value.
    start_events = min(df_places_events_merged['Desc'].first_valid_index(),
                       df_places_events_merged['Desc2'].first_valid_index())

    end_events = max(
        df_places_events_merged['TotalDeaths'].last_valid_index(),
        df_places_events_merged['TotalDeaths2'].last_valid_index())

    output_columns = [
        'Name',
        'Date',
        'Desc',
        'FullText',
        'Highlight',
        'Source',
        'Total',
        'TotalDeaths',
        'GrowthRate',
        'GrowthRateDeaths',
        'DaysToDouble',
        'DaysToDoubleDeaths',
        'Date2',
        'Name2',
        'Desc2',
        'FullText2',
        'Highlight2',
        'Source2',
        'Total2',
        'TotalDeaths2',
        'GrowthRate2',
        'GrowthRateDeaths2',
        'DaysToDouble2',
        'DaysToDoubleDeaths2',
    ]

    # Copy so the column assignments below work on an independent frame
    # rather than on a slice of the merged one.
    df_places_events_trimed = df_places_events_merged.loc[
        start_events:end_events][output_columns].copy()

    # Fill the place names on rows that only exist for the other place.
    df_places_events_trimed['Name'] = df_places_events_trimed['Name'].ffill()
    df_places_events_trimed['Name2'] = df_places_events_trimed['Name2'].ffill()

    df_places_events_trimed = df_places_events_trimed.fillna('').replace(
        {'NaT': ''})

    return df_places_events_trimed.to_dict('records')
コード例 #3
0
ファイル: compare.py プロジェクト: Luca-A-Magalhaes/himcd
def get_df_similar_places(place, level='countries'):
    """Rank places by how closely their death curve matches that of ``place``.

    The table from ``Country.all_countries_as_df()`` is pivoted to one row per
    ``Name`` and one column per ``Day`` holding ``TotalDeaths`` (missing
    values become 0). The curve of ``place`` from its first day with deaths
    onwards is slid along the full curve of every candidate place; the offset
    with the smallest Euclidean distance gives that candidate's ``gap`` (in
    ``Day`` units relative to the start of ``place``) and ``dist``.

    Candidates are the places that already had deaths ``days_ahead`` (14)
    days before the first death day of ``place`` (or on day 0 if that would
    be negative). ``place`` itself is never treated as a candidate.

    Args:
        place: Value of the ``Name`` column to use as the base place.
        level: Accepted for interface compatibility; not read by this
            function.

    Returns:
        A DataFrame indexed by ``Name``, sorted by ``dist``, with columns
        ``gap``, ``dist`` and ``Similarity``. The base place is included with
        ``gap`` 0.0, ``dist`` -1 and ``Similarity`` 1; for the other rows
        ``Similarity`` is ``1 - dist / dist_max`` where ``dist_max`` is the
        distance between the base curve and an all-zero curve.
    """
    df_orig = Country.all_countries_as_df()
    df_orig_piv_day = df_orig.pivot(index='Name',
                                    columns='Day',
                                    values='TotalDeaths').fillna(0)

    sr_place = df_orig_piv_day.loc[place, :]

    # idxmax on the boolean mask gives the first Day with deaths > 0.
    place_start = (sr_place > 0).idxmax()
    sr_place_compare = sr_place.loc[place_start:].dropna()
    window = len(sr_place_compare)

    days_ahead = 14

    # Places that already had deaths `days_ahead` days before the base place.
    ahead_mask = df_orig_piv_day.loc[:, max(place_start - days_ahead, 0)] > 0.0
    df_places_ahead = df_orig_piv_day[ahead_mask]

    # Rows are collected in a list and turned into a frame once;
    # DataFrame.append no longer exists in pandas >= 2.0.
    gap_rows = [[place, 0.0, -1]]

    # Candidates are visited in reverse order, as before.
    for other_place in df_places_ahead.index[::-1]:
        # The base place can pass the filter itself when its deaths start on
        # the earliest day; comparing it with itself would only duplicate
        # its row.
        if other_place == place:
            continue

        sr_other_place = df_places_ahead.loc[other_place, :]

        min_dist = np.inf
        min_pos = 0

        # Slide the base curve along the candidate curve and keep the first
        # offset with the smallest distance.
        for i in range(0, 1 + len(sr_other_place) - window):
            sr_other_place_compare = sr_other_place.iloc[i:i + window]
            dist = euclidean(sr_place_compare, sr_other_place_compare)
            if dist < min_dist:
                min_dist = dist
                min_pos = i

        day_place2 = sr_other_place.index[min_pos]
        gap = day_place2 - place_start

        gap_rows.append([other_place, gap, min_dist])

    df_places_gap = pd.DataFrame(gap_rows, columns=['Name', 'gap', 'dist'])
    df_places_gap = df_places_gap.set_index('Name')

    similar_places = df_places_gap.sort_values('dist')

    # Distance of the base curve from an all-zero curve, used to scale dist.
    dist_max = euclidean(sr_place_compare, np.zeros(window))

    # The base place carries the sentinel dist -1 and gets Similarity 1.
    similar_places['Similarity'] = similar_places['dist'].apply(
        lambda x: (1.0 - x / dist_max) if x >= 0 else 1)

    return similar_places