def plot_production_rate_and_injection_rate():
    """Plot a producer's series together with every injector's water volume.

    For each producer in the hard-coded list (currently only ``'PA12'``) the
    producer column and the ``'Water Vol'`` column of each injector named in
    the module-level ``injector_names`` are drawn against their ``'Date'``
    columns.  A vertical line from 0 to 50000 marks the date located 40 % of
    the way through the producer's date list; that date is also printed.
    The figure is shown with ``plt.show()`` and nothing is returned.

    Relies on the module-level names ``producers_df``, ``injectors_df``,
    ``injector_names``, ``get_real_producer_data`` and ``plt``.
    """
    for producer_name in ['PA12']:
        producer = get_real_producer_data(producers_df, producer_name)

        # One ('Water Vol', 'Date') frame per injector, in injector_names
        # order, so the legend numbering below follows that order.
        injector_frames = [
            injectors_df.loc[injectors_df['Name'] == injector_name,
                             ['Water Vol', 'Date']]
            for injector_name in injector_names
        ]

        plt.plot(producer['Date'], producer[producer_name], alpha=0.5)
        for number, frame in enumerate(injector_frames, start=1):
            plt.plot(frame['Date'],
                     frame['Water Vol'],
                     alpha=0.5,
                     label='Injector {}'.format(number))

        # Mark the date sitting 40 % of the way through the record.
        date_list = producer['Date'].tolist()
        marker_index = int(len(date_list) * 0.40)
        marker_date = date_list[marker_index]
        plt.vlines(marker_date, 0, 50000, linewidth=1, alpha=0.8)
        print(marker_date)

        plt.gcf().autofmt_xdate()
        plt.title(producer_name)
        plt.xlabel('Dates')
        plt.ylabel('Production Rate [bbls/day]')
        plt.legend()
        plt.show()
def determine_train_test_split():
    """Show 30-step predictions for a sweep of training fractions.

    For the selected producer (index 4 of the module-level
    ``producer_names``) the dataset is built once, then for each of 81
    training fractions between 0.1 and 0.9 a fresh ``CrmpBHP`` model is
    fitted on the unshuffled training part and used to predict the first
    30 test rows.  The true values and the prediction are plotted and
    shown; nothing is saved (``save=False``) and nothing is returned.

    Relies on module-level names: ``producer_names``, ``producers_df``,
    ``injectors_df``, ``get_real_producer_data``,
    ``construct_real_production_rate_dataset``, ``train_test_split``,
    ``CrmpBHP``, ``plot_helper``, ``FIG_DIR``, ``np`` and ``plt``.
    """
    # producer_names = ['PA01', 'PA02', 'PA03', 'PA09', 'PA10', 'PA12']
    candidate_sizes = np.linspace(0.1, 0.9, 81)
    for producer_index in [4]:
        # Constructing dataset
        name = producer_names[producer_index]
        print(name)
        producer = get_real_producer_data(producers_df, name, bhp=True)
        injectors = injectors_df[['Name', 'Date', 'Water Vol']]
        X, y = construct_real_production_rate_dataset(producer, injectors)

        for train_size in candidate_sizes:
            # shuffle=False keeps the chronological order of the rows.
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, train_size=train_size, shuffle=False)
            X_train, X_test = X_train.to_numpy(), X_test.to_numpy()
            y_train, y_test = y_train.to_numpy(), y_test.to_numpy()

            train_length = len(X_train)
            # NOTE(review): t_fit is never read below.
            t_fit = np.linspace(0, train_length - 1, train_length)
            t_test = np.linspace(train_length, (train_length + 29), 30)

            model = CrmpBHP().fit(X_train, y_train)
            # Start the prediction from the last observed training target.
            model.q0 = y_train[-1]
            # Column 0 is dropped for prediction.
            y_hat = model.predict(X_test[:30, 1:])

            plt.plot(t_test, y_test[:30], color='k', label='True Value')
            plt.plot(t_test, y_hat, color='r', label='Prediction')
            plot_helper(FIG_DIR,
                        title='{}: {} Train Size'.format(name, train_size),
                        xlabel='Days',
                        ylabel='Production Rate [bbls/day]',
                        legend=True,
                        save=False)
            plt.show()
def plot_delta_bhp():
    """Plot the ``'delta_p'`` column of every producer.

    For each name in the module-level ``producer_names`` the producer data
    is loaded with ``bhp=True`` and its ``'delta_p'`` column is plotted
    against a 1-based sample counter.  Each figure is handed to
    ``plot_helper`` with ``save=True``.  Nothing is returned.
    """
    for name in producer_names:
        producer = get_real_producer_data(producers_df, name, bhp=True)
        pressure_change = producer['delta_p']

        # x axis: 1, 2, ..., n_samples
        n_samples = len(pressure_change)
        time_axis = np.linspace(1, n_samples, n_samples)

        plt.plot(time_axis, pressure_change)
        plot_helper(FIG_DIR,
                    title=name,
                    xlabel='Time [days]',
                    ylabel='Change in Bottom Hole Pressure [psi]',
                    save=True)
def plot_imputed_and_original_production_rate():
    """Overlay each producer's original column and its state after imputing.

    For every name in the module-level ``producer_names`` a deep copy of
    the producer column is taken, ``impute_training_data`` is called on the
    producer frame with an all-zero target vector, and both the copy and
    the producer column as it stands afterwards are plotted against a
    1-based sample counter.  Each figure goes to ``plot_helper`` with
    ``save=True``.  Nothing is returned.
    """
    for name in producer_names:
        producer = get_real_producer_data(producers_df, name)
        # Snapshot taken before the imputation call.
        before_imputation = deepcopy(producer[name])

        n_samples = len(producer)
        placeholder_target = np.zeros(n_samples)
        # The return value is discarded; the second curve below is read
        # from `producer` itself.
        # NOTE(review): presumably impute_training_data modifies `producer`
        # in place -- confirm, otherwise both curves are identical.
        _ = impute_training_data(producer, placeholder_target, name)[0]

        time_axis = np.linspace(1, n_samples, n_samples)
        plt.plot(time_axis, before_imputation)
        plt.plot(time_axis, producer[name])
        plot_helper(FIG_DIR,
                    title='{}: Imputed Production Data'.format(name),
                    xlabel='Time [days]',
                    ylabel='Producer Rate [bbls/day]',
                    save=True)
def train_bagging_regressor_with_crmp():
    """Grid-search the bagging ``block_size`` for every producer.

    For each entry of the module-level ``producer_names`` the dataset is
    built, split chronologically using the matching entry of the local
    ``train_sizes`` list, and a ``GridSearchCV`` over ``block_size`` is run
    for an ``MBBaggingRegressor`` wrapping ``CrmpBHP``.  The folds come
    from a ``TimeSeriesSplit`` with one split per 30 training rows.  The
    best parameters and best estimator are printed; nothing is returned.

    NOTE(review): another function with this exact name is defined
    further down the file; if both live at module level, the later
    definition replaces this one.
    """
    # producer_names = ['PA01', 'PA02', 'PA03', 'PA09', 'PA10', 'PA12']
    train_sizes = [0.33, 0.735, 0.49, 0.56, 0.80, 0.45, 0.54]
    for index, name in enumerate(producer_names):
        # Constructing dataset
        print(name)
        producer = get_real_producer_data(producers_df, name, bhp=True)
        injectors = injectors_df[['Name', 'Date', 'Water Vol']]
        X, y = construct_real_production_rate_dataset(producer, injectors)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, train_size=train_sizes[index], shuffle=False)
        X_train, X_test = X_train.to_numpy(), X_test.to_numpy()
        y_train, y_test = y_train.to_numpy(), y_test.to_numpy()

        # One cross-validation split for every 30 training rows.
        fold_count = len(X_train) // 30
        splitter = TimeSeriesSplit(fold_count)
        folds = splitter.split(X_train)

        # Setting up estimator
        bagger = MBBaggingRegressor(base_estimator=CrmpBHP(),
                                    n_estimators=100,
                                    bootstrap=True,
                                    random_state=0)
        block_size_grid = {'block_size': [7, 14, 21, 28, 90]}
        search = GridSearchCV(bagger,
                              param_grid=block_size_grid,
                              scoring=scorer_for_crmp,
                              cv=folds)

        # Fitting the estimator
        search.fit(X_train, y_train)
        print(search.best_params_)
        print(search.best_estimator_)
        print()
        print()
def train_bagging_regressor_with_crmp():
    """Fit a block-bagged ``CrmpBHP`` ensemble and plot a 30-step forecast.

    For each selected producer index the dataset is built and split
    chronologically, an ``MBBaggingRegressor`` of ``CrmpBHP`` models
    (``block_size=7``, 100 estimators) is fitted, and two things are
    computed from the fitted members:

    * a one-step-ahead fit on the training rows, averaged over members,
      from which the annotated r-squared is taken;
    * a prediction for the first 30 test rows per member, summarised by
      the per-step average and the 10th / 90th percentiles.

    The last 100 training points, the averaged fit, the true test values,
    the summary curves and every individual member forecast are drawn, and
    the figure is passed to ``plot_helper`` with ``save=True``.  Nothing is
    returned.

    Changes relative to the previous revision of this function:

    * the inner sample loop no longer reuses the outer loop's index name;
    * a standalone ``CrmpBHP().fit(...)`` whose result was never read has
      been removed;
    * the ``mse`` values, which were never read, are no longer computed.

    Relies on module-level names: ``producer_names``, ``producers_df``,
    ``injectors_df``, ``get_real_producer_data``,
    ``construct_real_production_rate_dataset``, ``train_test_split``,
    ``MBBaggingRegressor``, ``CrmpBHP``, ``fit_statistics``,
    ``plot_helper``, ``FIG_DIR``, ``np`` and ``plt``.
    """
    train_sizes = [0.33, 0.735, 0.49, 0.45, 0.52, 0.66, 0.54]
    n_estimators = 100
    delta_t = 1

    # Index 5 is not part of the selection; a full sweep would be
    # range(len(producer_names) - 1).
    for producer_index in [0, 1, 2, 3, 4, 6]:
        # Constructing dataset
        name = producer_names[producer_index]
        print(name)
        producer = get_real_producer_data(producers_df, name, bhp=True)
        injectors = injectors_df[['Name', 'Date', 'Water Vol']]
        X, y = construct_real_production_rate_dataset(producer,
                                                      injectors,
                                                      delta_t=delta_t)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, train_size=train_sizes[producer_index], shuffle=False)
        X_train = X_train.to_numpy()
        X_test = X_test.to_numpy()
        y_train = y_train.to_numpy()
        y_test = y_test.to_numpy()

        train_length = len(X_train)
        t_fit = np.linspace(0, train_length - 1, train_length)
        t_test = np.linspace(train_length, (train_length + 29), 30)

        # Setting up estimator
        bgr = MBBaggingRegressor(base_estimator=CrmpBHP(delta_t=delta_t),
                                 n_estimators=n_estimators,
                                 block_size=7,
                                 bootstrap=True,
                                 n_jobs=-1,
                                 random_state=0)
        bgr.fit(X_train, y_train)

        # One-step-ahead fit: each member is re-seeded with column 0 of
        # the row before predicting from that row's remaining columns.
        y_fits = []
        for estimator in bgr.estimators_:
            member_fit = []
            for row in range(len(y_train)):
                estimator.q0 = X_train[row, 0]
                member_fit.append(
                    estimator.predict(np.array([X_train[row, 1:]])))
            y_fits.append(member_fit)
        # Rearranged so each row holds all member values for one time step.
        y_fits_by_time = np.asarray(y_fits).T.reshape(-1, n_estimators)
        y_fits_average = [np.average(step) for step in y_fits_by_time]
        r2, _ = fit_statistics(y_fits_average, y_train)

        # Getting all bootstrapped predictions, each started from the last
        # observed training target.
        y_hats = []
        for estimator in bgr.estimators_:
            estimator.q0 = y_train[-1]
            y_hats.append(estimator.predict(X_test[:30, 1:]))
        y_hats_by_time = np.asarray(y_hats).T
        p10s = [np.percentile(step, 10) for step in y_hats_by_time]
        averages = [np.average(step) for step in y_hats_by_time]
        p90s = [np.percentile(step, 90) for step in y_hats_by_time]

        # Tallest value on the figure; used for the annotation position
        # and the top of the train/test divider.
        max_train = np.amax(y_train[-100:])
        max_fit = np.amax(y_fits_average[-100:])
        max_realization = np.amax(y_hats)
        height = max(max_train, max_fit, max_realization)

        # Plotting
        plt.plot(t_fit[-100:], y_train[-100:], color='k')
        plt.plot(t_fit[-100:],
                 y_fits_average[-100:],
                 color='g',
                 label='Fitting')
        plt.plot(t_test, y_test[:30], color='k', label='True Value')
        plt.plot(t_test, averages, color='b', label='Average')
        plt.plot(t_test, p10s, color='r', alpha=0.5, label='P10 & P90')
        plt.plot(t_test, p90s, color='r', alpha=0.5)
        # Faint trace for every individual member forecast.
        for hat in y_hats:
            plt.plot(t_test, hat, color='k', alpha=0.02)
        plt.annotate('r-squared = {:.4f}'.format(r2),
                     xy=(train_length - 60, height))
        # Dashed divider at the last training sample.
        plt.vlines(train_length - 1,
                   0,
                   height,
                   linewidth=2,
                   colors='k',
                   linestyles='dashed',
                   alpha=0.8)
        plot_helper(FIG_DIR,
                    title='{}: 30 Days Prediction'.format(name),
                    xlabel='Days',
                    ylabel='Production Rate [bbls/day]',
                    legend=True,
                    save=True)
def best_worse():
    """Compare 30-step forecasts of CRMP-BHP with two regressors.

    For the selected producer (index 1 of the module-level
    ``producer_names``) three models are each wrapped in an
    ``MBBaggingRegressor`` (``block_size=7``, 100 estimators):

    * ``CrmpBHP`` -- each member predicts the first 30 test rows in one
      call from every column except column 0;
    * ``HuberRegressor`` -- fitted on log-transformed data;
    * ``LinearRegression`` -- fitted on the untransformed data.

    The two regressors are rolled forward one step at a time: column 0 of
    each test row is replaced by the previous prediction (or, for the first
    step, by the last training target).  Predictions of the log-fitted
    model are mapped back with ``exp(x) - 1`` before averaging.  The
    per-step ensemble averages (clipped at 0 for the two regressors) are
    plotted next to the true values and the figure is passed to
    ``plot_helper`` with ``save=True``.  Nothing is returned.

    Changes relative to the previous revision of this function:

    * each test row is copied before its first column is overwritten, so
      the shared test matrices are no longer modified through a view;
    * the fed-back prediction is reduced to a scalar before being written
      into the row, instead of assigning a one-element array;
    * unused locals (``t_fit``, ``y_test_scaled``, ``p50s``) are gone.

    Relies on module-level names: ``producer_names``, ``producers_df``,
    ``injectors_df``, ``get_real_producer_data``,
    ``construct_real_production_rate_dataset``, ``train_test_split``,
    ``log_transformation``, ``MBBaggingRegressor``, ``CrmpBHP``,
    ``HuberRegressor``, ``LinearRegression``, ``plot_helper``, ``FIG_DIR``,
    ``np`` and ``plt``.
    """
    train_sizes = [0.33, 0.735, 0.49, 0.45, 0.52, 0.66, 0.54]
    n_estimators = 100
    delta_t = 1
    # [estimator, fit-on-log-transformed-data] pairs, parallel to `labels`.
    models = [
        [CrmpBHP(), False],
        [HuberRegressor(alpha=0.5, epsilon=100, fit_intercept=False), True],
        [LinearRegression(fit_intercept=False, positive=True), False],
    ]
    labels = [
        'CRMP-BHP',
        'Huber Regression (Best)',
        'Linear Regression (Worst)',
    ]

    # Alternative selection kept for reference: [0, 1, 2, 3, 4, 6]
    for i in [1]:
        # Constructing dataset
        name = producer_names[i]
        print(name)
        producer = get_real_producer_data(producers_df, name, bhp=True)
        injectors = injectors_df[['Name', 'Date', 'Water Vol']]
        X, y = construct_real_production_rate_dataset(producer,
                                                      injectors,
                                                      delta_t=delta_t)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, train_size=train_sizes[i], shuffle=False)

        train_length = len(X_train)
        t_test = np.linspace(train_length, (train_length + 29), 30)
        plt.plot(t_test,
                 y_test[:30],
                 color='k',
                 label='True Value',
                 linewidth=2)

        # Log-transformed copies: only the column named after the producer
        # and the target are transformed.
        # NOTE(review): the roll-forward below writes predictions into
        # column 0, so it assumes that column is the one named `name` --
        # confirm against construct_real_production_rate_dataset.
        X_train_scaled = X_train.copy(deep=True)
        X_train_scaled[name] = log_transformation(X_train[name])
        X_test_scaled = X_test.copy(deep=True)
        X_test_scaled[name] = log_transformation(X_test[name])
        y_train_scaled = log_transformation(y_train)

        X_train = X_train.to_numpy()
        X_test = X_test.to_numpy()
        y_train = y_train.to_numpy()
        X_train_scaled = X_train_scaled.to_numpy()
        X_test_scaled = X_test_scaled.to_numpy()
        y_train_scaled = y_train_scaled.to_numpy()

        for j, ((model, use_log), label) in enumerate(zip(models, labels)):
            print(label)
            bgr = MBBaggingRegressor(base_estimator=model,
                                     n_estimators=n_estimators,
                                     block_size=7,
                                     bootstrap=True,
                                     n_jobs=-1,
                                     random_state=1)
            if use_log:
                bgr.fit(X_train_scaled, y_train_scaled)
            else:
                bgr.fit(X_train, y_train)

            if j == 0:
                # First entry (CRMP-BHP): seed each member with the last
                # training target and predict all 30 rows in one call.
                y_hats = []
                for estimator in bgr.estimators_:
                    estimator.q0 = y_train[-1]
                    y_hats.append(estimator.predict(X_test[:30, 1:]))
                y_hats_by_time = np.asarray(y_hats).T
                averages = [np.average(step) for step in y_hats_by_time]
                plt.plot(t_test,
                         averages,
                         label=label,
                         alpha=0.5,
                         linewidth=2)
                continue

            # Remaining entries: roll each member forward one row at a
            # time, feeding its own previous prediction back into column 0.
            features = X_test_scaled if use_log else X_test
            seed_value = y_train_scaled[-1] if use_log else y_train[-1]

            y_hats = []
            for estimator in bgr.estimators_:
                previous = seed_value
                member_forecast = []
                for k in range(30):
                    # Copy: features[k, :] is a view and writing into it
                    # would overwrite the shared test matrix.
                    row = features[k, :].copy()
                    row[0] = previous
                    prediction = estimator.predict(row.reshape(1, -1))
                    if use_log:
                        # Back to the original scale before storing.
                        member_forecast.append(np.exp(prediction) - 1)
                    else:
                        member_forecast.append(prediction)
                    # Feed back a plain scalar rather than a 1-element
                    # array (array-to-scalar conversion is deprecated).
                    previous = np.ravel(prediction)[0]
                y_hats.append(member_forecast)

            # Rearranged so each row holds all member values of one step.
            y_hats_by_time = np.asarray(y_hats).T.reshape(-1, n_estimators)
            averages = np.array(
                [np.average(step) for step in y_hats_by_time]).clip(min=0)

            # Plotting
            plt.plot(t_test, averages, label=label, alpha=0.5, linewidth=2)

        plt.tight_layout()
        plot_helper(
            FIG_DIR,
            title=
            '{}: 30 Days Prediction for CRMP-BHP and the Best and Worst Performing ML Estimators'
            .format(name),
            xlabel='Days',
            ylabel='Production Rate [bbls/day]',
            legend=True,
            save=True)
        print()
def evaluate_crmp_bhp_model():
    """Evaluate ``CrmpBHP`` for every producer and every initial guess.

    For each name in the module-level ``producer_names`` the dataset is
    built and split chronologically with 40 % of the rows for training.
    For each ``p0`` in the module-level ``p0s`` a ``CrmpBHP`` model is
    fitted on a deep copy of that guess and used to predict the first 30
    test rows; the resulting r2 / MSE, together with the initial and fitted
    ``tau`` and the first four gains, are appended to the module-level
    ``predict_data`` mapping.

    After all combinations have run, ``fit_data`` and ``predict_data`` are
    converted to DataFrames and written to ``fit_output_file`` and
    ``predict_output_file``.  Nothing is returned.

    NOTE(review): the dataset constructor is called here with three
    positional arguments, unlike the other call sites in this file --
    confirm which signature is current.
    """
    # (initial-guess column, fitted-value column) for gains 1..4, in the
    # order in which the columns are filled.
    gain_columns = (
        ('f1_initial', 'f1_final'),
        ('f2_initial', 'f2_final'),
        ('f3_initial', 'f3_final'),
        ('f4_initial', 'f4_final'),
    )

    iteration = 0
    for name in producer_names:
        print('Producer Name: ', name)
        producer = get_real_producer_data(producers_df, name, bhp=True)
        injectors = injectors_df[['Name', 'Date', 'Water Vol']]
        X, y = construct_real_production_rate_dataset(
            producer[['Date', name]], injectors, producer['delta_p'])
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            train_size=0.40,
                                                            shuffle=False)
        X_train, X_test = X_train.to_numpy(), X_test.to_numpy()
        y_train, y_test = y_train.to_numpy(), y_test.to_numpy()

        for p0 in p0s:
            iteration += 1
            print('Iteration: {}'.format(iteration))
            # The model receives a deep copy of the guess; the original
            # p0 values are what gets recorded below.
            crmpbhp = CrmpBHP(p0=deepcopy(p0))
            crmpbhp = crmpbhp.fit(X_train, y_train)

            # Fitting
            # Recording of training-fit statistics into fit_data is
            # switched off here; fit_data is still written out below in
            # whatever state it has.

            # Prediction
            y_hat = crmpbhp.predict(X_test[:30, 1:])
            r2, mse = fit_statistics(y_hat, y_test[:30], shutin=True)
            predict_data['Producer'].append(name)
            predict_data['Model'].append(model_namer(crmpbhp))
            predict_data['tau_initial'].append(p0[0])
            predict_data['tau_final'].append(crmpbhp.tau_)
            # p0[0] is the tau guess, so gain k is at p0[k] and gains_[k-1].
            for k, (initial_key, final_key) in enumerate(gain_columns,
                                                         start=1):
                predict_data[initial_key].append(p0[k])
                predict_data[final_key].append(crmpbhp.gains_[k - 1])
            predict_data['r2'].append(r2)
            predict_data['MSE'].append(mse)

    # Fitting
    fit_df = pd.DataFrame(fit_data)
    fit_df.to_csv(fit_output_file)

    # Prediction
    predict_df = pd.DataFrame(predict_data)
    predict_df.to_csv(predict_output_file)