def export_results(df, date_update_fin, date_fin):
    """Write the forecast results to CSV under forecasting/forecast/results.

    The output file name covers the day after ``date_update_fin`` (a
    '%Y-%m-%d' string) through ``date_fin`` (a datetime-like object).
    """
    ext = '.csv'
    # First predicted day = day after the last updated day.
    first_pred_day = (datetime.datetime.strptime(date_update_fin, '%Y-%m-%d')
                      + datetime.timedelta(days=1))
    date_init_pred = first_pred_day.strftime('%Y-%m-%d')
    out_name = date_init_pred + '_' + date_fin.strftime('%Y-%m-%d') + ext
    out_path = os.sep.join([
        get_dir_main(), 'forecasting', 'forecast', 'results', out_name
    ])
    df.to_csv(out_path, encoding='ansi', index=False)
def save_result_cluster(m_dem_reduce_cluster, op_red, type_day, date_init,
                        date_fin, is_train):
    """Persist the reduced per-cluster demand matrix as a CSV file.

    Results go under training/ when ``is_train`` is truthy, otherwise under
    forecasting/, inside a '<date_init>_<date_fin>' sub-directory that is
    created on demand.
    """
    stage = 'training' if is_train else 'forecasting'
    out_dir = os.sep.join([
        get_dir_main(), stage, 'cluster', 'results',
        date_init + '_' + date_fin
    ])
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    out_name = 'cluster-data_' + op_red + '_' + type_day + '.csv'
    m_dem_reduce_cluster.to_csv(out_dir + os.sep + out_name, index=True)
def forecast_train_process(date_init, date_fin, transform='decompose-Fourier',
                           list_num_decompose=(1, 4),
                           list_num_coeff_fourier=(12, 5),
                           type_decompose='additive'):
    """Launch forecast-model training for every cluster-result file of the window.

    Builds one row per (cluster CSV x hyper-parameter pair) and hands each row
    to ``dynamic_train``.

    Parameters
    ----------
    date_init, date_fin : str
        Training window bounds in '%Y-%m-%d' (rebound to datetimes below).
    transform : str
        Transformation name; only 'decompose-Fourier' attaches the
        hyper-parameter columns before training.
    list_num_decompose, list_num_coeff_fourier : int or sequence of int
        Hyper-parameter candidates; a bare int is wrapped into a list.
        NOTE(review): the two sequences become columns of the same DataFrame,
        so they must have equal length — TODO confirm with callers.
    type_decompose : str
        Decomposition mode forwarded to every training row.
    """
    # Allow scalar hyper-parameters for convenience.
    if isinstance(list_num_coeff_fourier, int):
        list_num_coeff_fourier = [list_num_coeff_fourier]
    if isinstance(list_num_decompose, int):
        list_num_decompose = [list_num_decompose]
    start_time = time.time()
    dir_train_name = os.sep.join([
        get_dir_main(), 'training', 'cluster', 'results',
        date_init + '_' + date_fin
    ])
    list_files_train = glob.glob(dir_train_name + os.sep + '*.csv')
    # The effective end of the window is the last hour of date_fin.
    date_f_fin = datetime.datetime.strptime(date_fin + ' 23:00:00',
                                            '%Y-%m-%d %H:%M:%S')
    # NOTE: the string parameters are rebound to datetime objects here.
    date_init = datetime.datetime.strptime(date_init, '%Y-%m-%d')
    date_fin = datetime.datetime.strptime(date_fin, '%Y-%m-%d')
    df_dir_train = pd.DataFrame(list_files_train, columns=['dir_name_train'])
    df_dir_train['date_init'] = date_init
    df_dir_train['date_fin'] = date_f_fin
    # Operator code and day type are encoded in the file name:
    # cluster-data_<op_red>_<type_day>.csv
    df_dir_train['cod_op_red'] = df_dir_train.dir_name_train.apply(
        lambda x: x.split(os.sep)[-1].split('_')[-2])
    df_dir_train['type_day'] = df_dir_train.dir_name_train.apply(
        lambda x: x.split(os.sep)[-1].split('_')[-1].split('.')[0])
    # Each row carries its own loaded training DataFrame.
    df_dir_train['train'] = df_dir_train.apply(
        lambda x: pd.read_csv(x.dir_name_train, sep=',', header=0,
                              encoding='ansi', parse_dates=False),
        axis=1)
    if transform == 'decompose-Fourier':
        # Each param row pairs one num_decompose with one num_coeff_fourier;
        # the constant 'key' column cross-joins params with every file row.
        df_param_transform = pd.DataFrame({
            'num_decompose': list_num_decompose,
            'num_coeff_fourier': list_num_coeff_fourier
        })
        df_param_transform['key'] = 1
        df_dir_train['key'] = 1
        df_dir_train = df_dir_train.merge(df_param_transform, how='left',
                                          left_on='key', right_on='key')
        df_dir_train.drop(columns=['key'], inplace=True)
    df_dir_train['transform_model'] = transform
    df_dir_train['type_decompose'] = type_decompose
    # One training run per row (file x hyper-parameter combination).
    df_dir_train.apply(dynamic_train, axis=1)
    print('total_time_execution_forecast_train_process(sec): ',
          abs(time.time() - start_time))
def save_cluster_model_comp_pca(model_cluster, op_red, type_day, date_init,
                                date_fin):
    """Pickle a fitted cluster model under training/cluster/models/<range>/.

    The target directory is created on demand; the file name encodes the
    operator code and day type.
    """
    dir_model_save = os.sep.join([
        get_dir_main(), 'training', 'cluster', 'models',
        date_init + '_' + date_fin
    ])
    if not os.path.exists(dir_model_save):
        os.makedirs(dir_model_save)
    filename_model = op_red + '_' + type_day + '_cluster-model.pkl'
    # Use a context manager so the handle is closed even if pickling fails
    # (the original `pickle.dump(..., open(...))` leaked the file object).
    with open(dir_model_save + os.sep + filename_model, 'wb') as f:
        pickle.dump(model_cluster, f)
def save_model_dir(pipeline, transform, num_cluster, op_red, type_day,
                   type_model, date_init, date_fin, periods_decompose=(),
                   n_decompose='', type_decompose='additive'):
    """Pickle a trained forecast pipeline under training/forecast/models/<range>/.

    The file name encodes the operator, day type, model type, cluster id and
    transform; decompose-family transforms additionally encode the
    decomposition type, periods and count.

    Raises
    ------
    ValueError
        When ``transform`` is neither decompose- nor normal/fourier-family.
    """
    ext = '.pkl'
    dir_model_save = os.sep.join([
        get_dir_main(), 'training', 'forecast', 'models',
        date_init + '_' + date_fin
    ])
    if not os.path.exists(dir_model_save):
        os.makedirs(dir_model_save)
    if 'decompose' in transform:
        filename = '_'.join([
            op_red, type_day, type_model, type_decompose,
            'pd-' + '-'.join(periods_decompose),
            'nd-' + str(n_decompose),
            'cluster-' + str(num_cluster), transform
        ])
    elif 'normal' in transform or 'fourier' in transform:
        filename = '_'.join([
            op_red, type_day, type_model,
            'cluster-' + str(num_cluster), transform
        ])
    else:
        raise ValueError('invalid variable transform {}.'.format(transform))
    # Context manager closes the handle even on pickling errors
    # (the original `pickle.dump(..., open(...))` leaked the file object).
    with open(dir_model_save + os.sep + filename + ext, 'wb') as f:
        pickle.dump(pipeline, f)
def cluster_process(directory_input_data, ops_red, types_days, date_init_train,
                    date_fin_train, date_init, date_fin, is_train=False):
    """Apply previously trained cluster models to a new data window.

    For each operator and day type: loads the demand data of the processing
    window (date_init..date_fin), projects its Fourier features with the PCA
    component count fixed at training time (date_init_train..date_fin_train),
    labels the data with the saved cluster model, aggregates demand per
    cluster, reduces small clusters and saves the result via
    ``save_result_cluster``.
    """
    dir_load_model_cluster = os.sep.join([
        get_dir_main(), 'training', 'cluster', 'models',
        date_init_train + '_' + date_fin_train
    ])
    filename_components = 'n_components_features.csv'
    # Table written by cluster_train_process(): n_components per
    # (cod_or, type_day).
    df_comp = pd.read_csv(dir_load_model_cluster + os.sep + filename_components,
                          sep=',', header=0, encoding='ansi')
    for op_red in ops_red:
        print('\n\n Executing OR: ', op_red)
        data_op_red = get_data(directory_input_data, op_red, date_init,
                               date_fin)
        for var_type_day in types_days:
            data_op_red_t_day = data_op_red.query('tipodia == @var_type_day')
            print('\t type day: ', var_type_day)
            dem_data, pv_dem_data = transform_data(data_op_red_t_day,
                                                   date_init, date_fin)
            m_features = matrix_features(pv_dem_data, features='fourier')
            stat_test(m_features)
            # Reuse the PCA component count chosen during training so the
            # feature space matches the saved cluster model.
            n_comp = df_comp.query(
                'cod_or == @op_red and type_day == @var_type_day'
            ).n_components.values[0]
            m_pca_features = get_matrix_pca(m_features, show_plot=False,
                                            dynamic_component=False,
                                            n_comp=n_comp)
            labels = get_clusters(dir_load_model_cluster, m_pca_features,
                                  var_type_day, op_red)
            m_pca_features['labels'] = labels
            m_dem_cluster = group_dem_users_cluster(
                dem_data=dem_data, m_features_labels=m_pca_features)
            # NOTE(review): threshold_dem=0.02 presumably drops clusters
            # below a 2% demand share — confirm in reduce_k_cluster.
            m_dem_reduce_cluster = reduce_k_cluster(m_dem_cluster,
                                                    threshold_dem=0.02)
            save_result_cluster(m_dem_reduce_cluster, op_red, var_type_day,
                                date_init, date_fin, is_train)
def cluster_train_process(ops_red, types_days, directory_input_data, date_init,
                          date_fin):
    """Train a cluster model per (operator, day type) over the given window.

    For each combination: loads the demand data, extracts Fourier features,
    projects them with a dynamically-sized PCA, searches k in [2, 10),
    refits k-means at the optimal k, saves the model, and finally writes the
    chosen PCA component counts to 'n_components_features.csv' so forecasting
    can reuse them.
    """
    start_time = time.time()
    filename_components = 'n_components_features.csv'
    # Accumulate per-combination rows and concat once at the end:
    # DataFrame.append is deprecated (removed in pandas 2.0) and was O(n^2).
    comp_rows = []
    for op_red in ops_red:
        print('\n\n Executing OR: ', op_red)
        data_op_red = get_data(directory_input_data, op_red, date_init,
                               date_fin)
        for var_type_day in types_days:
            print('\t type day: ', var_type_day)
            data_op_red_t_day = data_op_red.query('tipodia == @var_type_day')
            dem_data, pv_dem_data = transform_data(data_op_red_t_day,
                                                   date_init, date_fin)
            m_features = matrix_features(pv_dem_data, features='fourier')
            stat_test(m_features)
            m_pca_features = get_matrix_pca(m_features, show_plot=False,
                                            dynamic_component=True)
            df_k_opt, labels, k_means_model = cluster_kmeans(
                x_train=m_pca_features, k_min=2, k_max=10)
            k = k_optimal(df_k_opt)
            # Refit only at the chosen k.
            _, _, model_cluster = cluster_kmeans(x_train=m_pca_features,
                                                 k_min=k, k_max=k + 1)
            save_cluster_model_comp_pca(model_cluster, op_red, var_type_day,
                                        date_init, date_fin)
            comp_rows.append(pd.DataFrame({
                'cod_or': [op_red],
                'type_day': [var_type_day],
                'n_components': [m_pca_features.shape[1]]
            }))
    df_comp = (pd.concat(comp_rows, ignore_index=True)
               if comp_rows else pd.DataFrame())
    df_comp.to_csv(os.sep.join([
        get_dir_main(), 'training', 'cluster', 'models',
        date_init + '_' + date_fin, filename_components
    ]), sep=',', encoding='ansi', index=False)
    print('total_time_execution_cluster_train_process(sec): ',
          abs(time.time() - start_time))
def get_train_models(date_train_init, date_train_fin):
    """Index trained forecast model files into a grouped DataFrame.

    Scans training/forecast/models/<range>/*.pkl, parses operator code, day
    type, transform and (for decompose transforms) decomposition metadata
    from each file name, then groups the model paths (comma-joined) by those
    attributes.

    Raises
    ------
    ValueError
        When the directory holds more than one transform type (or none), or
        the single transform is not a decompose variant.
    """
    models_dir = os.sep.join([
        get_dir_main(), 'training', 'forecast', 'models',
        date_train_init + '_' + date_train_fin
    ])
    df_models = pd.DataFrame(glob.glob(models_dir + os.sep + '*.pkl'),
                             columns=['dir_model_train'])

    def _name_part(path, idx):
        # File names are underscore-delimited; idx selects one token.
        return path.split(os.sep)[-1].split('_')[idx]

    df_models['cod_op_red'] = df_models.dir_model_train.apply(
        lambda p: _name_part(p, 0))
    df_models['type_day'] = df_models.dir_model_train.apply(
        lambda p: _name_part(p, 1))
    df_models['t_transform'] = df_models.dir_model_train.apply(
        lambda p: _name_part(p, -1).split('.')[0])

    unique_transforms = df_models.t_transform.drop_duplicates()
    n_transform = unique_transforms.shape[0]
    if n_transform != 1:
        raise ValueError(
            'exist more than one or anything type transform in training models. Number types transform get {}.'
            .format(n_transform))
    transform = unique_transforms.values[0]
    if transform not in ('decompose-Fourier', 'decompose'):
        raise ValueError('invalid variable transform {}.'.format(transform))

    df_models['type_decompose'] = df_models.dir_model_train.apply(
        lambda p: _name_part(p, 3))
    df_models['num_decompose'] = df_models.dir_model_train.apply(
        lambda p: _name_part(p, -3))
    cols = ['dir_model_train', 'cod_op_red', 'type_day', 't_transform',
            'type_decompose', 'num_decompose']
    df_models = df_models[cols]
    # Collapse equal attribute combinations; model paths are comma-joined.
    df_train = df_models.groupby(by=cols[1:]).agg(
        lambda x: ",".join(x)).reset_index()
    return df_train
def get_data_train(date_train_init, date_update_fin):
    """Load all forecasting cluster-result CSVs for the window into a DataFrame.

    Returns one row per file with the parsed operator code and day type
    (taken from the file name) and the loaded CSV in the 'train' column.
    """
    results_dir = os.sep.join([
        get_dir_main(), 'forecasting', 'cluster', 'results',
        date_train_init + '_' + date_update_fin
    ])
    csv_paths = glob.glob(results_dir + os.sep + '*.csv')
    df_train = pd.DataFrame(csv_paths, columns=['dir_name_train'])
    df_train['date_train_init'] = date_train_init
    df_train['date_update_fin'] = date_update_fin
    # File names look like cluster-data_<op_red>_<type_day>.csv
    df_train['cod_op_red'] = df_train.dir_name_train.apply(
        lambda p: p.split(os.sep)[-1].split('_')[-2])
    df_train['type_day'] = df_train.dir_name_train.apply(
        lambda p: p.split(os.sep)[-1].split('_')[-1].split('.')[0])
    df_train['train'] = df_train.apply(
        lambda row: pd.read_csv(row.dir_name_train, sep=',', header=0,
                                encoding='ansi', parse_dates=False),
        axis=1)
    return df_train
# NOTE(review): this chunk previously began with stray copy-paste residue
# duplicating the tail of cluster_train_process() (starting mid-expression at
# `k_max=k + 1)`), which is not valid top-level code. Only the script entry
# point is kept; cluster_train_process() itself remains defined above.
if __name__ == '__main__':
    dir_main = get_dir_main()
    dir_input_data = dir_main + os.sep + 'data' + os.sep + 'input'
    t_days = ['ORD', 'SAB', 'FESTIVO', 'DOM']
    operadores_red = ['ORD1']
    d_i = '2017-01-01'
    d_f = '2020-01-03'
    print(dir_input_data)
    cluster_train_process(operadores_red, t_days, dir_input_data, d_i, d_f)