def main(test_case):
    """Train one delay model per consecutive gateway pair and save diagnostics.

    The original-baselines CSV is read and restricted to the programmes in
    ``test_case``; only gateways whose baseline column has no missing value
    for those programmes are kept.  For every adjacent pair of remaining
    gateways the training data is read, cleaned and stripped of the
    ``test_case`` programmes, then:

    * the correlation table of the training data is rendered and saved as
      ``correlations - <X>-<Y>.png``;
    * ``GP_model`` is called (resampling when ``is_trained_resampled`` is
      truthy);
    * the Gelman-Rubin statistics of the returned trace are rendered and
      saved as ``rhats - <X>-<Y>.png``.

    Args:
        test_case: collection of programme identifiers (index values of the
            baselines table) that are held out from training.
    """
    baselines = pd.read_csv(input_folder.format(output_original_baselines))
    baselines = baselines.set_index(col_programme)
    baselines = baselines[baselines.index.isin(test_case)]

    gateways_new = [
        gate for gate in gateways
        if gate in baselines.dropna(axis=1).columns.values
    ]

    # Walk adjacent pairs, e.g. 'Delay_PSC_Gate' -> 'Delay_J1_Gate'.
    for X_col, Y_col in zip(gateways_new[:-1], gateways_new[1:]):
        print(Y_col)

        training = clean_data(read_data(X_col, Y_col, train=True), X_col,
                              Y_col)
        held_out = training.index.isin(test_case)
        training = training[~held_out]
        # Disabled filter kept from the original (drops well-behaved cases):
        # training = training[(~(training[X_col]==0) & ~(training[Y_col]==0))]

        corr_fig, _ = render_mpl_table(training.corr(), col_width=4.0)
        corr_name = f'correlations - {X_col}-{Y_col}.png'
        corr_fig.savefig(output_folder.format(corr_name))

        data = GP_model(training,
                        X_col,
                        Y_col,
                        test_case,
                        resample=is_trained_resampled)

        rhat_values = pm.diagnostics.gelman_rubin(data['delay_trace'])
        rhat_fig, _ = render_mpl_table(pd.DataFrame([rhat_values]),
                                       rounding=4,
                                       col_width=4.0)
        rhat_name = f'rhats - {X_col}-{Y_col}.png'
        rhat_fig.savefig(output_folder.format(rhat_name))
def GP_model(data_delay, X_col, Y_col, test_case, resample=False):
    """Fit the censored Gumbel delay model, or reload a previously fitted one.

    The model artefacts live in a pickle named
    ``delay_model_train-<X_col>-<Y_col>-<joined test cases>.p`` inside
    ``output_folder``.

    Args:
        data_delay: table holding ``X_col``, ``Y_col``, ``'Complexity'`` and
            ``'isCensored?'`` columns (only read when ``resample`` is true).
        X_col: name of the predictor (earlier gateway delay) column.
        Y_col: name of the response (later gateway delay) column.
        test_case: iterable of strings, joined with ``'-'`` into the pickle
            name.
        resample: when true, build and sample the model and overwrite the
            pickle; when false, only load the existing pickle.

    Returns:
        dict with keys ``'delay_model'``, ``'delay_trace'``, ``'X_'``,
        ``'censored'``, ``'n_cols'``, ``'y_mean'`` and ``'y_std'`` when
        freshly sampled; otherwise whatever object the pickle contains.
    """
    joined_cases = '-'.join(test_case)
    pickle_path = output_folder.format(
        f'delay_model_train-{X_col}-{Y_col}-{joined_cases}.p')

    if not resample:
        # Cached path: reuse the artefacts written by an earlier fit.
        with open(pickle_path, 'rb') as buff:
            return pickle.load(buff)

    col_isCensored = 'isCensored?'
    col_complexity = 'Complexity'
    # NOTE(review): these two are assigned but never read in this function;
    # in particular the seed is not handed to pm.sample here.
    col_X_intercept = 'intercept'
    seed = 192103

    predictors = [X_col, col_complexity]

    # Design matrix: intercept column followed by the two predictors.
    design = np.empty((data_delay.shape[0], 3))
    design[:, 0] = 1
    design[:, 1:] = data_delay[predictors]
    n_cols = design.shape[1]

    response = data_delay[Y_col].values
    censored = (data_delay[col_isCensored] == 1).values

    X_ = shared(design)
    censored_ = shared(censored)

    vague_sd_prior = 1000000
    with pm.Model() as delay_model:
        beta = pm.Normal('beta', 0, vague_sd_prior, shape=n_cols)
        eta = beta.dot(X_.T)
        s = pm.HalfCauchy('s', 200)

        # Uncensored rows enter as an observed Gumbel likelihood ...
        pm.Gumbel('y_obs', eta[~censored_], s, observed=response[~censored])
        # ... censored rows contribute through the gumble_sf potential term.
        pm.Potential('y_cens',
                     gumble_sf(response[censored], eta[censored_], s))

        # Previously tried:
        # pm.sample(tune=tune_size, chains = 4, cores = 1, njobs=1)
        delay_trace = pm.sample(**sample_kwargs,
                                nuts_kwargs={'target_accept': .95})

    time.sleep(5)

    data = {
        'delay_model': delay_model,
        'delay_trace': delay_trace,
        'X_': X_,
        'censored': censored_,
        'n_cols': n_cols,
        'y_mean': response.mean(),
        'y_std': response.std(),
    }
    with open(pickle_path, 'wb') as buff:
        pickle.dump(data, buff)

    # TODO: investigate why sampling with two chains does not converge while
    # four chains does.
    return data
def main(test_case):
    """Simulate completion dates per gateway for ``test_case`` and pickle them.

    For every adjacent pair of gateways (``X_col`` -> ``Y_col``) that has a
    non-missing baseline for the selected programmes, the function loads the
    stored model through ``GP_model(..., resample=False)``, draws ``nsim``
    random trace points, samples one Gumbel value per draw, turns the samples
    into survival/cumulative/pdf curves via ``build_survival_function`` and
    appends the resulting rows to two accumulating tables.  Both tables are
    finally pickled under the keys ``'simualted_dates'`` and
    ``'distribution_df'``.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source; confirm block boundaries (in particular the
    extent of ``if past_gateway:`` and whether the pickle is written inside
    or after the gateway loop) against the original file.

    Args:
        test_case: list of programme identifiers; ``test_case[0]`` labels the
            output rows and the joined list names the output file.

    Returns:
        True when more than one gateway is available and the baselines are
        not flagged empty (the pickle has been written), False otherwise.
    """
    simulated_dates_all = pd.DataFrame()
    distribution_df_all = pd.DataFrame()
    gateways = find_first_gateways(output_planned_dates, test_case)
    data_original_baselines = pd.read_csv(
        input_folder.format(output_original_baselines)).set_index(
            col_programme)
    data_output_planned_dates = pd.read_csv(
        input_folder.format(output_planned_dates)).set_index(col_programme)
    # Keep only the rows of the programmes under test.
    data_original_baselines = data_original_baselines[
        data_original_baselines.index.isin(test_case)]
    data_output_planned_dates = data_output_planned_dates[
        data_output_planned_dates.index.isin(test_case)]
    # Drop gateways whose baseline column contains a missing value.
    gateways = [
        i for i in gateways
        if i in data_original_baselines.dropna(axis=1).columns.values
    ]
    # True when the total count of nulls equals the number of columns
    # (presumably "every baseline is missing" for a single programme row —
    # TODO confirm for multi-row selections).
    is_empty_baselines = data_original_baselines.isnull().sum().sum(
    ) == data_original_baselines.shape[1]
    # NOTE(review): `~` is bitwise NOT; it only acts as a logical negation
    # while the operand is a NumPy bool (on a Python bool, ~True == -2, which
    # is truthy). `not` would be the safer spelling.
    if len(gateways) > 1 and ~(is_empty_baselines):
        for i, _ in enumerate(gateways[:-1]):
            print(gateways[i])
            X_col = gateways[i]
            Y_col = gateways[i + 1]
            data_delay_o = read_data(X_col, Y_col, train=False)
            data_delay_o = data_delay_o[data_delay_o.index.isin(test_case)]
            cycle_plan_X_date = pd.to_datetime(data_original_baselines[X_col],
                                               format='%Y-%m-%d')
            cycle_plan_Y_date = pd.to_datetime(data_original_baselines[Y_col],
                                               format='%Y-%m-%d')
            planned_Y_date = pd.to_datetime(data_output_planned_dates[Y_col],
                                            format='%Y-%m-%d')
            planned_X_date = pd.to_datetime(data_output_planned_dates[X_col],
                                            format='%Y-%m-%d')
            # resample=False: only load the already fitted model artefacts.
            data = GP_model(data_delay_o,
                            X_col,
                            Y_col,
                            test_case,
                            resample=False)
            delay_trace = data['delay_trace']
            n_cols = data['n_cols']
            y_mean = data['y_mean']
            y_std = data['y_std']
            # Predictor matrix for the simulation: column 0 is the constant 1,
            # column 1 the delay at X_col (observed or simulated), column 2
            # the complexity value.
            X_pp = np.empty((nsim, n_cols))
            X_pp[:, 0] = 1
            X_pp[:, 2] = data_delay_o[col_complex]
            iscomplete = iscompleted_programme(output_planned_dates, test_case)
            past_gateway = False
            if i == 0:
                # First gateway: start from the recorded delay.
                X_pp[:, 1] = data_delay_o[X_col]
                delay_start = data_delay_o[X_col]
                gateway_start = gateways[i]
                past_gateway = True
            elif (not iscomplete) and i == 1:
                # if not complete, then this is the current gateway
                # if data_delay_o[X_col] <0 then we are still within given time
                if (data_delay_o[X_col] <
                        0)[0] or data_delay_o[X_col].isnull()[0]:
                    # Feed forward the samples produced by the previous
                    # iteration of this loop.
                    X_pp[:, 1] = survival_times
                    past_gateway = False
                else:
                    X_pp[:, 1] = data_delay_o[X_col]
                    delay_start = data_delay_o[X_col]
                    gateway_start = gateways[i]
                    past_gateway = False
            else:
                # Later gateways chain on the previous iteration's samples.
                X_pp[:, 1] = survival_times
                past_gateway = False
            if past_gateway:
                # Two non-prediction marker rows for the starting gateway:
                # one for the cycle-plan date, one for the planned/actual date.
                distribution_df = pd.DataFrame()
                distribution_df['dates'] = cycle_plan_X_date.values
                distribution_df['cycle_plan_date'] = 1
                distribution_df['isPrediction'] = False
                distribution_df['isComplete'] = iscomplete
                distribution_df['test_case'] = test_case[0]
                distribution_df['gateway'] = X_col
                distribution_df_all = distribution_df_all.append(
                    distribution_df)
                distribution_df = pd.DataFrame()
                distribution_df['dates'] = planned_X_date.values
                distribution_df['planned_actual_date'] = 1
                distribution_df['isPrediction'] = False
                distribution_df['isComplete'] = iscomplete
                distribution_df['test_case'] = test_case[0]
                distribution_df['gateway'] = X_col
                distribution_df_all = distribution_df_all.append(
                    distribution_df)
            # Pick nsim random (point, chain) positions from the trace.
            indices = np.random.randint(0, sample_size * nchain, nsim)
            survival_times2 = []
            # NOTE(review): this loop rebinds the outer loop variable `i`;
            # the gateway index is not read again later in this iteration.
            for i, ind in enumerate(indices):
                pointix, chainix = np.divmod(ind, nchain)
                points = delay_trace._straces[chainix].point(pointix)
                eta = points['beta'].dot(X_pp[i, :].T)
                s = points['s']
                # with pm.Model() as ppc:
                #     survival_dist = pm.Gumbel('suv', eta, s)
                survival_dist = np.random.gumbel(eta, s, 1)[0]
                survival_times2.append(survival_dist)
            t_plot, weibull_pp_surv_mean, survival_times = build_survival_function(
                survival_times2, y_mean, y_std)
            # Cumulative curve is the complement of the survival curve; the
            # pdf is its first difference, padded with a leading 0.
            weibull_pp_curv_mean = 1 - weibull_pp_surv_mean
            weibull_pp_pdf_mean = np.insert(np.diff(weibull_pp_curv_mean), 0,
                                            0)
            ##output file
            test_cases = '-'.join(test_case)
            filename = distribution_filename.format(test_cases)
            simulated_dates = pd.DataFrame()
            simulated_dates['survival_days'] = survival_times
            simulated_dates['survival_days'] = pd.to_timedelta(
                simulated_dates['survival_days'], 'D')
            simulated_dates['cycle_plan_date'] = cycle_plan_Y_date[0]
            simulated_dates['simulated_completion_dates'] = simulated_dates[
                'cycle_plan_date'] + simulated_dates['survival_days']
            simulated_dates['test_case'] = test_case[0]
            simulated_dates['gateway'] = Y_col
            distribution_df = pd.DataFrame()
            distribution_df['days'] = t_plot
            distribution_df['dates'] = cycle_plan_Y_date[0]
            distribution_df['dates'] = cycle_plan_Y_date[0] + pd.to_timedelta(
                distribution_df['days'], 'D')
            # 0/1 flags marking the rows whose date equals the cycle-plan
            # date and the planned/actual date respectively.
            distribution_df['cycle_plan_date'] = (
                distribution_df['dates'] == cycle_plan_Y_date[0]).astype(int)
            distribution_df['planned_actual_date'] = (
                distribution_df['dates'] == planned_Y_date[0]).astype(int)
            distribution_df['survival_curve'] = weibull_pp_surv_mean
            distribution_df['cumulative_curve'] = weibull_pp_curv_mean
            distribution_df['pdf_curve'] = weibull_pp_pdf_mean
            if ~cycle_plan_Y_date.isnull()[0]:
                # Flag the row matching cycle-plan date + mean simulated days.
                distribution_df['expected_date'] = (
                    ((cycle_plan_Y_date +
                      pd.to_timedelta(survival_times.mean(), 'D'))
                     ).dt.date[0] == distribution_df['dates']).astype(int)
            else:
                distribution_df['expected_date'] = 0
            distribution_df['expected_days'] = survival_times.mean()
            distribution_df['test_case'] = test_case[0]
            distribution_df['gateway'] = Y_col
            distribution_df['conservative_view'] = 0
            distribution_df['aggressive_view'] = 0
            # NOTE(review): min_val is computed but never used below.
            min_val = max(distribution_df.survival_curve.min(), 0.25)
            # First rows where the survival curve is at or below 0.25 / 0.75;
            # the [0] lookup raises IndexError when no row qualifies.
            conservative_index = distribution_df.index[
                distribution_df.survival_curve <= 0.25][0]
            aggressive_index = distribution_df.index[
                distribution_df.survival_curve <= 0.75][0]
            # NOTE(review): chained assignment (column selection followed by
            # .iloc); whether it writes through depends on the pandas version
            # — confirm.
            distribution_df['conservative_view'].iloc[conservative_index] = 1
            distribution_df['aggressive_view'].iloc[aggressive_index] = 1
            distribution_df['gateway_start'] = gateway_start
            distribution_df['delay_start'] = delay_start[0]
            distribution_df['isComplete'] = iscomplete
            distribution_df['isPrediction'] = True
            distribution_df_all = distribution_df_all.append(distribution_df)
            simulated_dates_all = simulated_dates_all.append(simulated_dates)
        with open(output_folder.format(filename), 'wb') as buff:
            # The key 'simualted_dates' is misspelled; readers of this pickle
            # must use the same spelling.
            pickle.dump(
                {
                    'simualted_dates': simulated_dates_all,
                    'distribution_df': distribution_df_all
                }, buff)
        return True
    else:
        return False
    # NOTE(review): unreachable as reconstructed — both branches above return.
    print('complete')
import os

import pandas as pd

import model.Baysien as bays
import model.Baysien_Sampling as samps
import model.Baysien_Sampling_Viz as vizs
from config.config import output_folder

# Programme list driving the batch run.  The path is assembled with
# os.path.join so it resolves on any OS; on Windows it yields the same
# '.\model\data\data_planned_dates.csv' string that was hard-coded before.
test = pd.read_csv(
    os.path.join('.', 'model', 'data', 'data_planned_dates.csv'))
test_cases = test['Program Display Name'].tolist()
# Original annotation: "for \weibull_experiment_cleaned_dates_linear_complex"
# (presumably the experiment/output folder this list targets — TODO confirm).

if __name__ == '__main__':
    # Train, simulate and collect the distribution table of every programme,
    # one programme at a time, then write all tables to a single CSV.
    frames = []
    for tc in test_cases:
        print(tc)
        try:
            bays.main([tc])
            # samps.main returns False when nothing could be simulated.
            if samps.main([tc]):
                frames.append(vizs.main([tc]))
        except ValueError as err:
            # Best effort: a programme that fails with ValueError is skipped,
            # but say so instead of dropping it silently.
            print(f'skipping {tc}: {err}')
            continue

    # Concatenate once instead of DataFrame.append per iteration (quadratic
    # copying, and removed in pandas 2.0).  pd.concat rejects an empty list,
    # so fall back to an empty frame to keep writing an (empty) CSV.
    data_all = pd.concat(frames) if frames else pd.DataFrame()
    data_all.to_csv(output_folder.format('distribution_all.csv'))
def main(test_case):
    """Load the pickled distribution table for ``test_case``.

    The file name is built from ``'-'.join(test_case)``, the same rule the
    sampling step uses when it writes the pickle, so multi-programme test
    cases resolve to the file that was actually written.  For a
    single-element list this is identical to formatting with
    ``test_case[0]``.

    Args:
        test_case: list of programme identifier strings.

    Returns:
        The object stored under the ``'distribution_df'`` key of the pickle.

    Raises:
        FileNotFoundError: if no pickle exists for this test case.
    """
    filename = distribution_filename.format('-'.join(test_case))
    path = output_folder.format(filename)
    # NOTE: pickle.load can execute arbitrary code; only point this at files
    # produced by this pipeline.
    with open(path, 'rb') as buff:
        data = pickle.load(buff)
    return data['distribution_df']