Exemple #1
0
def generate_lsp_problems(target_list: list = None):
    """Write one LSP problem CSV per date, stacking one row from every country.

    For each country in the module-level ``country_list``, the frame returned
    by ``get_lsp_data`` has every ``<feature>_forecast`` column replaced by
    ``<feature>_delta`` (forecast minus ``<feature>_real``) and is then reduced
    to ``date``, the delta columns and the columns named in ``vaccine_list``.
    For the i-th entry of ``date_list``, the i-th row of every country frame is
    stacked and written to ``LSP_PROBLEMS_PATH + <date string> + '.csv'``.

    Args:
        target_list: Feature names for which ``<feature>_forecast`` and
            ``<feature>_real`` columns exist. Defaults to
            ``['cumulative_total_cases', 'cumulative_total_deaths']``.
    """
    # A ``None`` sentinel avoids sharing one mutable default list across calls.
    if target_list is None:
        target_list = ['cumulative_total_cases', 'cumulative_total_deaths']

    converter = DatetimeConverter()
    delta_cols = [feature + '_delta' for feature in target_list]
    final_target_list = ['date'] + delta_cols + vaccine_list

    dfs = []
    for country in country_list:
        df = get_lsp_data(country)
        for feature in target_list:
            forecast_col = feature + '_forecast'
            # Turn the forecast column into the forecast error, then rename it.
            df[forecast_col] -= df[feature + '_real']
            df.rename(columns={forecast_col: feature + '_delta'},
                      inplace=True)
        dfs.append(df[final_target_list])

    for idx, date in enumerate(date_list):
        # Row ``idx`` of every country frame, stacked in ``country_list`` order.
        df_result = pd.concat([df.iloc[[idx]] for df in dfs],
                              ignore_index=True)
        df_result.to_csv(LSP_PROBLEMS_PATH + converter.date2str(date) + '.csv',
                         index=False)
Exemple #2
0
def get_one_lsp_problem(date: datetime):
    """Load the LSP problem CSV stored for ``date``.

    The file is looked up at ``LSP_PROBLEMS_PATH + <date string> + '.csv'``,
    where the date string comes from ``DatetimeConverter().date2str``.

    Args:
        date: Date whose problem file should be read.

    Returns:
        The file contents as a DataFrame, with the ``date`` column converted
        through ``pd.to_datetime``.
    """
    date_str = DatetimeConverter().date2str(date)
    # The context manager closes the handle even if parsing fails.
    with open(LSP_PROBLEMS_PATH + date_str + '.csv', 'r',
              encoding='utf-8') as lsp_problem_file:
        df_lsp_problem = pd.read_csv(lsp_problem_file)
    df_lsp_problem['date'] = df_lsp_problem['date'].apply(pd.to_datetime)
    return df_lsp_problem
def augment_time(df: DataFrame,
                 source_formatter: str,
                 target_formatter: str,
                 source_freq: FreqType,
                 target_freq: FreqType,
                 total_cols: list = None,
                 item_cols: list = None,
                 date_col: str = 'date',
                 aug_method_type: str = 'linear'):
    """Expand every row of ``df`` into ``freq_rate`` finer-grained rows.

    ``freq_rate`` is ``get_freq_rate(source_freq, target_freq)``. Each
    consecutive pair of rows (current, next) yields ``freq_rate`` copies of the
    current row in which:

    * ``date_col`` is parsed with ``source_formatter``, shifted by
      ``i * seconds_num`` seconds and re-formatted with ``target_formatter``;
    * ``total_cols`` are linearly interpolated from the current row towards
      the next row and floored;
    * ``item_cols`` are split by integer division, with the remainder added
      to the last of the ``freq_rate`` rows.

    The last row of ``df`` only serves as an interpolation end point; it is
    not expanded itself.

    Args:
        df: Source frame, iterated in its existing row order.
        source_formatter: Format passed to ``DatetimeConverter`` for parsing.
        target_formatter: Format passed to ``DatetimeConverter`` for output.
        source_freq: Frequency of the rows in ``df``.
        target_freq: Frequency of the generated rows.
        total_cols: Columns to interpolate, or ``None``.
        item_cols: Columns to split evenly, or ``None``.
        date_col: Name of the date column.
        aug_method_type: Only ``'linear'`` is implemented.

    Returns:
        A new DataFrame with the columns of ``df`` and a fresh integer index,
        or ``None`` when both ``total_cols`` and ``item_cols`` are ``None`` or
        ``aug_method_type`` is not ``'linear'``.
    """
    if total_cols is None and item_cols is None:
        return None
    if aug_method_type != 'linear':
        return None

    str2date = DatetimeConverter(source_formatter).str2date
    date2str = DatetimeConverter(target_formatter).date2str
    freq_rate = get_freq_rate(source_freq, target_freq)
    # NOTE(review): presumably ``source_freq + 1`` is the next finer unit, so
    # this is the length of one generated step in seconds - confirm.
    seconds_num = get_freq_rate(source_freq + 1, FreqType.Second)
    step = timedelta(seconds=seconds_num)

    if df.shape[0] == 0:
        # Nothing to expand; previously this crashed on ``df.iloc[0]``.
        return DataFrame(columns=df.columns)

    # Rows are collected in a list and assembled once at the end:
    # ``DataFrame.append`` copied the whole frame on every call and no longer
    # exists in pandas 2.x.
    new_rows = []
    current_row = df.iloc[0]
    with tqdm(total=df.shape[0]) as tqdm_bar:
        for _, next_row in islice(df.iterrows(), 1, None):
            # Need [] to wrap date_col so the date stays a one-element Series.
            current_date = current_row[[date_col]]
            start_date = current_date.map(str2date)
            if item_cols is not None:
                current_items = current_row[item_cols]
                avg = current_items // freq_rate
                left = current_items % freq_rate
            if total_cols is not None:
                current_totals = current_row[total_cols]
                totals_gap = next_row[total_cols] - current_totals
            for i in range(freq_rate):
                new_row = current_row.copy()
                new_row[[date_col]] = (start_date + i * step).map(date2str)
                if total_cols is not None:
                    new_row[total_cols] = (
                        current_totals +
                        i * totals_gap / freq_rate).map(floor)
                if item_cols is not None:
                    # The remainder goes to the last step so the split still
                    # sums to the original value.
                    new_row[item_cols] = (avg if i != freq_rate - 1 else
                                          avg + left)
                new_rows.append(new_row)
            current_row = next_row
            tqdm_bar.update(1)
    return DataFrame(new_rows, columns=df.columns).reset_index(drop=True)
def augment_time_new_fit_rate(df: DataFrame,
                              source_formatter: str,
                              target_formatter: str,
                              source_freq: FreqType,
                              target_freq: FreqType,
                              item_cols: list = None,
                              date_col: str = 'date',
                              aug_method_type: str = 'linear'):
    """Expand every row of ``df`` into ``freq_rate`` rows, splitting ``item_cols``.

    ``freq_rate`` is ``get_freq_rate(source_freq, target_freq)``. For every
    source row, ``freq_rate`` copies are produced. ``date_col`` is parsed with
    ``source_formatter``, shifted by ``i * seconds_num`` seconds and
    re-formatted with ``target_formatter``. ``item_cols`` receive either the
    integer quotient ``avg``, ``avg + 1`` (when the random branch fires, which
    also decrements the remaining remainder), or, on the last step, ``avg``
    plus whatever remainder is left.

    Args:
        df: Source frame, iterated in its existing row order.
        source_formatter: Format passed to ``DatetimeConverter`` for parsing.
        target_formatter: Format passed to ``DatetimeConverter`` for output.
        source_freq: Frequency of the rows in ``df``.
        target_freq: Frequency of the generated rows.
        item_cols: Columns whose values are split across the generated rows.
        date_col: Name of the date column.
        aug_method_type: Only ``'linear'`` is implemented.

    Returns:
        A new DataFrame with the columns of ``df`` and a fresh integer index,
        or ``None`` when ``item_cols`` is ``None`` or ``aug_method_type`` is
        not ``'linear'``.
    """
    if item_cols is None or aug_method_type != 'linear':
        return None

    str2date = DatetimeConverter(source_formatter).str2date
    date2str = DatetimeConverter(target_formatter).date2str
    freq_rate = get_freq_rate(source_freq, target_freq)
    # NOTE(review): presumably ``source_freq + 1`` is the next finer unit, so
    # this is the length of one generated step in seconds - confirm.
    seconds_num = get_freq_rate(source_freq + 1, FreqType.Second)
    step = timedelta(seconds=seconds_num)

    # Rows are collected in a list and assembled once at the end:
    # ``DataFrame.append`` copied the whole frame on every call and no longer
    # exists in pandas 2.x.
    new_rows = []
    with tqdm(total=df.shape[0]) as tqdm_bar:
        for _, current_row in df.iterrows():
            # Need [] to wrap date_col so the date stays a one-element Series.
            current_date = current_row[[date_col]]
            start_date = current_date.map(str2date)
            current_items = current_row[item_cols]
            avg = current_items // freq_rate
            upper = avg + 1
            counter = current_items % freq_rate
            rate = counter / freq_rate
            for i in range(freq_rate):
                new_row = current_row.copy()
                new_row[[date_col]] = (start_date + i * step).map(date2str)
                rand = random()
                # NOTE(review): ``.all()`` collapses every item column into a
                # single value before the comparison, so this is not a
                # per-column test against ``rate``; a per-column draw looks
                # like what was intended. Kept as-is - confirm before changing.
                if rand > rate.all() and counter.all() > 0:
                    counter -= 1
                    new_row[item_cols] = upper
                elif i < freq_rate - 1:
                    new_row[item_cols] = avg
                else:
                    # Last step absorbs whatever remainder is still unassigned.
                    new_row[item_cols] = avg + counter
                new_rows.append(new_row)
            tqdm_bar.update(1)
    return DataFrame(new_rows, columns=df.columns).reset_index(drop=True)
def generate_data(pandemic_data_type: str = 'cumulative',
                  augmentation: bool = False):
    """Build the before/after-vaccination pandemic slices and write them to CSV.

    Steps:
      1. Read the raw pandemic CSV (column subset chosen by
         ``pandemic_data_type``) and the raw vaccination CSV.
      2. Keep only the countries in ``COUNTRY_LIST_PANDEMIC`` /
         ``COUNTRY_LIST_VACCINATION``, rename ``"United States"`` to ``"USA"``
         and the ``location`` column to ``country`` in the vaccination data.
      3. Split the pandemic rows of each country at that country's earliest
         vaccination date (rows strictly before vs. the rest).
      4. Optionally augment both slices from daily to hourly frequency.
      5. Write the two pandemic slices and the vaccination slice to
         ``PANDEMIC_BEF_VACC_PATH``, ``PANDEMIC_AFT_VACC_PATH`` and
         ``VACC_SLICE_PATH``.

    Args:
        pandemic_data_type: ``'all'``, ``'cumulative'`` or ``'new'``.
        augmentation: When true, run the matching augmentation routine on both
            pandemic slices before writing them.

    Raises:
        ValueError: If ``pandemic_data_type`` is not one of the three
            supported values.
    """
    cumulative_cols = [
        'cumulative_total_cases', 'active_cases', 'cumulative_total_deaths'
    ]
    new_cols = ['daily_new_cases', 'daily_new_deaths']
    # ``None`` makes ``read_csv`` load every column.
    usecols_by_type = {
        'all': None,
        'cumulative': ['date', 'country'] + cumulative_cols,
        'new': ['date', 'country'] + new_cols,
    }
    if pandemic_data_type not in usecols_by_type:
        # Previously this surfaced later as an unbound ``df_pandemic``.
        raise ValueError(
            'Unsupported pandemic_data_type: {!r}'.format(pandemic_data_type))

    # Read raw data; the context managers close both handles.
    with open(PANDEMIC_RAW_TO_USE_PATH, 'r',
              encoding='utf-8') as file_pandemic:
        df_pandemic = pd.read_csv(
            file_pandemic, usecols=usecols_by_type[pandemic_data_type])
    with open(VACCINATION_RAW_TO_USE_PATH, 'r',
              encoding='utf-8') as file_vaccination:
        df_vaccination = pd.read_csv(file_vaccination)

    # augmentation might need to use 0. Remember to assign the val
    df_pandemic = df_pandemic.fillna(0)

    df_pandemic = df_pandemic[df_pandemic['country'].isin(
        COUNTRY_LIST_PANDEMIC)]
    # ``.copy()`` so the column assignment below targets an independent frame
    # rather than a filtered view.
    df_vaccination = df_vaccination[df_vaccination['location'].isin(
        COUNTRY_LIST_VACCINATION)].copy()
    df_vaccination['location'] = df_vaccination['location'].map(
        lambda x: "USA" if x == "United States" else x)
    df_vaccination = df_vaccination.rename(columns={'location': 'country'})
    df_vacc_start = df_vaccination.groupby('country')['date'].min()

    # split the pandemic data
    # The parsed dates do not depend on the country, so parse them once.
    pandemic_dates = df_pandemic['date'].map(DatetimeConverter().str2date)
    df_bef_selector = pd.Series(False, index=df_pandemic.index)
    df_aft_selector = pd.Series(False, index=df_pandemic.index)
    for country, date in df_vacc_start.items():
        same_country = df_pandemic['country'] == country
        before_start = pandemic_dates < date
        df_bef_selector |= same_country & before_start
        df_aft_selector |= same_country & ~before_start

    df_pandemic_bef_vacc = df_pandemic[df_bef_selector]
    df_pandemic_aft_vacc = df_pandemic[df_aft_selector]

    if augmentation:
        source_fmt = '%Y-%m-%d'
        target_fmt = '%Y-%m-%d %H:%M:%S'
        if pandemic_data_type == 'cumulative':
            print('Start augmenting the cumulative data before pandemic...')
            df_pandemic_bef_vacc = augment_time_acc(
                df_pandemic_bef_vacc, source_fmt, target_fmt, FreqType.Day,
                FreqType.Hour, cumulative_cols)
            print('Start augmenting the cumulative after pandemic...')
            df_pandemic_aft_vacc = augment_time_acc(
                df_pandemic_aft_vacc, source_fmt, target_fmt, FreqType.Day,
                FreqType.Hour, cumulative_cols)
        elif pandemic_data_type == 'new':
            print('Start augmenting the daily data before pandemic...')
            df_pandemic_bef_vacc = augment_time_new(
                df_pandemic_bef_vacc, source_fmt, target_fmt, FreqType.Day,
                FreqType.Hour, new_cols)
            print('Start augmenting the daily data after pandemic...')
            df_pandemic_aft_vacc = augment_time_new(
                df_pandemic_aft_vacc, source_fmt, target_fmt, FreqType.Day,
                FreqType.Hour, new_cols)
        else:  # 'all'
            # ``augment_time`` interpolates ``total_cols`` between rows and
            # splits ``item_cols`` evenly, so the running totals belong in
            # ``total_cols`` and the daily counts in ``item_cols``. The
            # original positional call passed them the other way round.
            print('Start augmenting all data before pandemic...')
            df_pandemic_bef_vacc = augment_time(
                df_pandemic_bef_vacc, source_fmt, target_fmt, FreqType.Day,
                FreqType.Hour, total_cols=cumulative_cols,
                item_cols=new_cols)
            print('Start augmenting all data after pandemic...')
            df_pandemic_aft_vacc = augment_time(
                df_pandemic_aft_vacc, source_fmt, target_fmt, FreqType.Day,
                FreqType.Hour, total_cols=cumulative_cols,
                item_cols=new_cols)

    # Avoid NaN in the final output data
    df_vaccination = df_vaccination.fillna(0)

    # Don't pass file. Just pass the path, or otherwise unnecessary newlines are added.
    df_pandemic_bef_vacc.to_csv(PANDEMIC_BEF_VACC_PATH, index=False)
    df_pandemic_aft_vacc.to_csv(PANDEMIC_AFT_VACC_PATH, index=False)
    df_vaccination.to_csv(VACC_SLICE_PATH, index=False)