def generate_lsp_problems(target_list: list = [
    'cumulative_total_cases', 'cumulative_total_deaths'
]):
    converter = DatetimeConverter()
    dfs = []
    target_list_delta = list(map(lambda x: x + '_delta', target_list))
    final_target_list = ['date'] + target_list_delta + vaccine_list
    for country in country_list:
        df = get_lsp_data(country)
        for feature in target_list:
            # Store forecast minus real in the forecast column, then rename
            # '<feature>_forecast' to '<feature>_delta'.
            df[feature + '_forecast'] -= df[feature + '_real']
            df.columns = df.columns.map(
                lambda x: '_'.join(x.split('_')[:-1]) + '_delta'
                if x == feature + '_forecast' else x)
        df = df[final_target_list]
        dfs.append(df)
    for idx, date in enumerate(date_list):
        for jdx, df in enumerate(dfs):
            # Take this date's row from every country's frame and stack them
            # into one problem.
            df_row = df.iloc[[idx]]
            df_result = df_row if jdx == 0 else pd.concat(
                [df_result, df_row], ignore_index=True)
        df_result.to_csv(LSP_PROBLEMS_PATH + converter.date2str(date) + '.csv',
                         index=False)
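# Illustrative sketch (toy data, hypothetical helper, not part of the
# pipeline): how generate_lsp_problems() turns a '<target>_forecast' column
# into a '<target>_delta' column holding forecast minus real. DataFrame.rename
# is used here instead of the columns.map lambda, but the effect is the same.
def _demo_forecast_to_delta():
    import pandas as pd
    toy = pd.DataFrame({
        'cumulative_total_cases_real': [100, 110],
        'cumulative_total_cases_forecast': [102, 115],
    })
    toy['cumulative_total_cases_forecast'] -= toy['cumulative_total_cases_real']
    toy = toy.rename(columns={
        'cumulative_total_cases_forecast': 'cumulative_total_cases_delta'
    })
    return toy  # the delta column now holds [2, 5]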
def get_one_lsp_problem(date: datetime):
    converter = DatetimeConverter()
    date_str = converter.date2str(date)
    # Read the problem file generated for this date and parse its date column.
    with open(LSP_PROBLEMS_PATH + date_str + '.csv', 'r',
              encoding='utf-8') as lsp_problem_file:
        df_lsp_problem = pd.read_csv(lsp_problem_file)
    df_lsp_problem['date'] = df_lsp_problem['date'].apply(pd.to_datetime)
    return df_lsp_problem
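# Hedged usage sketch (hypothetical helper and date): load one problem file
# produced by generate_lsp_problems(). Assumes the CSV for that date already
# exists under LSP_PROBLEMS_PATH.
def _demo_load_problem():
    from datetime import datetime
    problem = get_one_lsp_problem(datetime(2021, 6, 1))
    return problem.head()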
def augment_time(df: DataFrame,
                 source_formatter: str,
                 target_formatter: str,
                 source_freq: FreqType,
                 target_freq: FreqType,
                 total_cols: list = None,
                 item_cols: list = None,
                 date_col: str = 'date',
                 aug_method_type: str = 'linear'):
    if not (total_cols is None and item_cols is None):
        if aug_method_type == 'linear':
            str2date = DatetimeConverter(source_formatter).str2date
            date2str = DatetimeConverter(target_formatter).date2str
            # Number of target-frequency rows generated per source row, and
            # the length in seconds of one generated step.
            freq_rate = get_freq_rate(source_freq, target_freq)
            seconds_num = get_freq_rate(source_freq + 1, FreqType.Second)
            new_df = DataFrame(columns=df.columns)
            current_row = df.iloc[0]
            tqdm_bar = tqdm(total=df.shape[0])
            for _, next_row in islice(df.iterrows(), 1, None):
                current_date = current_row[[date_col]]  # Need [] to wrap date_col
                current_totals = None
                next_totals = None
                totals_gap = None
                if item_cols is not None:
                    # Spread per-period counts evenly; the remainder goes into
                    # the last generated row.
                    current_items = current_row[item_cols]
                    avg = current_items // freq_rate
                    left = current_items % freq_rate
                if total_cols is not None:
                    # Cumulative columns are interpolated linearly towards the
                    # next source row.
                    current_totals = current_row[total_cols]
                    next_totals = next_row[total_cols]
                    totals_gap = next_totals - current_totals
                for i in range(freq_rate):
                    new_row = current_row.copy()
                    new_row[[date_col]] = (
                        current_date.map(str2date) +
                        i * timedelta(seconds=seconds_num)).map(date2str)
                    if total_cols is not None:
                        new_row[total_cols] = (
                            current_totals +
                            i * totals_gap / freq_rate).map(floor)
                    if item_cols is not None:
                        new_row[item_cols] = (avg if i != freq_rate - 1 else
                                              avg + left)
                    new_df = new_df.append(new_row, ignore_index=True)
                current_row = next_row
                tqdm_bar.update(1)
            tqdm_bar.close()
            return new_df
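# Illustrative sketch (toy data, hypothetical helper, not part of the
# pipeline): what the 'linear' branch of augment_time() produces for
# total_cols, i.e. a cumulative column interpolated linearly from daily to
# hourly and floored. Only pandas and the standard library are used.
def _demo_linear_total_interpolation():
    import pandas as pd
    from math import floor
    daily = pd.DataFrame({
        'date': ['2021-01-01', '2021-01-02'],
        'cumulative_total_cases': [100, 148],
    })
    freq_rate = 24  # one daily row expands into 24 hourly rows
    cur, nxt = daily.iloc[0], daily.iloc[1]
    gap = nxt['cumulative_total_cases'] - cur['cumulative_total_cases']
    rows = []
    for i in range(freq_rate):
        rows.append({
            'date': pd.Timestamp(cur['date']) + pd.Timedelta(hours=i),
            'cumulative_total_cases':
                floor(cur['cumulative_total_cases'] + i * gap / freq_rate),
        })
    return pd.DataFrame(rows)  # 24 hourly rows: 100, 102, 104, ...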
def augment_time_new_fit_rate(df: DataFrame,
                              source_formatter: str,
                              target_formatter: str,
                              source_freq: FreqType,
                              target_freq: FreqType,
                              item_cols: list = None,
                              date_col: str = 'date',
                              aug_method_type: str = 'linear'):
    if item_cols is not None:
        if aug_method_type == 'linear':
            str2date = DatetimeConverter(source_formatter).str2date
            date2str = DatetimeConverter(target_formatter).date2str
            freq_rate = get_freq_rate(source_freq, target_freq)
            seconds_num = get_freq_rate(source_freq + 1, FreqType.Second)
            new_df = DataFrame(columns=df.columns)
            tqdm_bar = tqdm(total=df.shape[0])
            for _, current_row in df.iterrows():
                current_date = current_row[[date_col]]  # Need [] to wrap date_col
                current_items = current_row[item_cols]
                avg = current_items // freq_rate
                upper = avg + 1
                counter = current_items % freq_rate
                rate = counter / freq_rate
                for i in range(freq_rate):
                    new_row = current_row.copy()
                    new_row[[date_col]] = (
                        current_date.map(str2date) +
                        i * timedelta(seconds=seconds_num)).map(date2str)
                    # A coin flip decides whether this row takes avg + 1 from
                    # the remainder; the final row absorbs whatever remainder
                    # is left.
                    rand = random()
                    if rand > rate.all() and counter.all() > 0:
                        counter -= 1
                        new_row[item_cols] = upper
                    elif i < freq_rate - 1:
                        new_row[item_cols] = avg
                    else:
                        new_row[item_cols] = avg + counter
                    new_df = new_df.append(new_row, ignore_index=True)
                tqdm_bar.update(1)
            tqdm_bar.close()
            return new_df
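# Illustrative sketch (assumption about intent, hypothetical helper, not the
# function above): the 'fit rate' idea is to split a daily count into per-hour
# counts that differ by at most one, scattering the remainder across random
# slots instead of dumping it all into the last hour. A seeded Random keeps
# the demo reproducible.
def _demo_fit_rate_split(daily_count: int, freq_rate: int = 24, seed: int = 0):
    from random import Random
    rng = Random(seed)
    avg, remainder = divmod(daily_count, freq_rate)
    slots = [avg] * freq_rate
    # Give one extra unit to `remainder` randomly chosen slots.
    for idx in rng.sample(range(freq_rate), remainder):
        slots[idx] += 1
    return slots  # sum(slots) == daily_count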
def generate_data(pandemic_data_type: str = 'cumulative',
                  augmentation: bool = False):
    # Read raw data
    file_pandemic = open(PANDEMIC_RAW_TO_USE_PATH, 'r', encoding='utf-8')
    file_vaccination = open(VACCINATION_RAW_TO_USE_PATH, 'r', encoding='utf-8')
    if pandemic_data_type == 'all':
        df_pandemic = pd.read_csv(file_pandemic)
    if pandemic_data_type == 'cumulative':
        df_pandemic = pd.read_csv(file_pandemic,
                                  usecols=[
                                      'date', 'country',
                                      'cumulative_total_cases', 'active_cases',
                                      'cumulative_total_deaths'
                                  ])
    if pandemic_data_type == 'new':
        df_pandemic = pd.read_csv(
            file_pandemic,
            usecols=['date', 'country', 'daily_new_cases', 'daily_new_deaths'])
    # Augmentation may rely on 0 instead of NaN; remember to assign the result.
    df_pandemic = df_pandemic.fillna(0)
    df_vaccination = pd.read_csv(file_vaccination)
    df_pandemic = df_pandemic[df_pandemic['country'].isin(
        COUNTRY_LIST_PANDEMIC)]
    df_vaccination = df_vaccination[df_vaccination['location'].isin(
        COUNTRY_LIST_VACCINATION)]
    # Align country and column naming between the two sources.
    df_vaccination['location'] = df_vaccination['location'].map(
        lambda x: "USA" if x == "United States" else x)
    df_vaccination.columns = df_vaccination.columns.map(
        lambda x: "country" if x == 'location' else x)
    # First vaccination date per country.
    df_vacc_start = df_vaccination.groupby('country')['date'].min()

    # Split the pandemic data into the periods before and after each country's
    # first vaccination date.
    for idx, (country, date) in enumerate(df_vacc_start.items()):
        df_same_country_operator = df_pandemic['country'] == country
        df_before_operator = df_pandemic['date'].map(
            DatetimeConverter().str2date) < date
        df_nxt_bef_selector = df_same_country_operator & df_before_operator
        df_nxt_aft_selector = df_same_country_operator & ~df_before_operator
        df_bef_selector = (df_nxt_bef_selector if idx == 0 else
                           df_bef_selector | df_nxt_bef_selector)
        df_aft_selector = (df_nxt_aft_selector if idx == 0 else
                           df_aft_selector | df_nxt_aft_selector)
    df_pandemic_bef_vacc = df_pandemic[df_bef_selector]
    df_pandemic_aft_vacc = df_pandemic[df_aft_selector]

    if augmentation:
        if pandemic_data_type == 'cumulative':
            print('Start augmenting the cumulative data before vaccination...')
            df_pandemic_bef_vacc = augment_time_acc(
                df_pandemic_bef_vacc, '%Y-%m-%d', '%Y-%m-%d %H:%M:%S',
                FreqType.Day, FreqType.Hour, [
                    'cumulative_total_cases', 'active_cases',
                    'cumulative_total_deaths'
                ])
            print('Start augmenting the cumulative data after vaccination...')
            df_pandemic_aft_vacc = augment_time_acc(
                df_pandemic_aft_vacc, '%Y-%m-%d', '%Y-%m-%d %H:%M:%S',
                FreqType.Day, FreqType.Hour, [
                    'cumulative_total_cases', 'active_cases',
                    'cumulative_total_deaths'
                ])
        if pandemic_data_type == 'new':
            print('Start augmenting the daily data before vaccination...')
            df_pandemic_bef_vacc = augment_time_new(
                df_pandemic_bef_vacc, '%Y-%m-%d', '%Y-%m-%d %H:%M:%S',
                FreqType.Day, FreqType.Hour,
                ['daily_new_cases', 'daily_new_deaths'])
            print('Start augmenting the daily data after vaccination...')
            df_pandemic_aft_vacc = augment_time_new(
                df_pandemic_aft_vacc, '%Y-%m-%d', '%Y-%m-%d %H:%M:%S',
                FreqType.Day, FreqType.Hour,
                ['daily_new_cases', 'daily_new_deaths'])
        if pandemic_data_type == 'all':
            print('Start augmenting all data before vaccination...')
            df_pandemic_bef_vacc = augment_time(
                df_pandemic_bef_vacc, '%Y-%m-%d', '%Y-%m-%d %H:%M:%S',
                FreqType.Day, FreqType.Hour,
                ['daily_new_cases', 'daily_new_deaths'], [
                    'cumulative_total_cases', 'active_cases',
                    'cumulative_total_deaths'
                ])
            print('Start augmenting all data after vaccination...')
            df_pandemic_aft_vacc = augment_time(
                df_pandemic_aft_vacc, '%Y-%m-%d', '%Y-%m-%d %H:%M:%S',
                FreqType.Day, FreqType.Hour,
                ['daily_new_cases', 'daily_new_deaths'], [
                    'cumulative_total_cases', 'active_cases',
                    'cumulative_total_deaths'
                ])

    # Avoid NaN in the final output data.
    df_vaccination = df_vaccination.fillna(0)
    # Don't pass a file handle to to_csv; pass the path, otherwise unnecessary
    # newlines are added.
    df_pandemic_bef_vacc.to_csv(PANDEMIC_BEF_VACC_PATH, index=False)
    df_pandemic_aft_vacc.to_csv(PANDEMIC_AFT_VACC_PATH, index=False)
    df_vaccination.to_csv(VACC_SLICE_PATH, index=False)
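# Hedged example entry point (assumption: this module may also be imported and
# driven from elsewhere). generate_data() only needs the raw CSV paths above;
# generate_lsp_problems() additionally assumes the per-country forecast data
# read by get_lsp_data() already exists.
if __name__ == '__main__':
    generate_data(pandemic_data_type='new', augmentation=True)
    generate_lsp_problems()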