def plot_production_history_with_fit_and_predict():
    """Overlay CRMP fit and prediction curves on each producer's history.

    Parameter rows are read from ``predict_file``. For every producer, each
    row's ``tau_final`` and ``f1_final``..``f4_final`` values are written onto
    a single CRMP instance, which is then evaluated on the training half
    (red, dashed) and the test half (green, dotted) of the data. Each figure
    is handed to ``plot_helper`` with ``save=True``.
    """
    params_df = pd.read_csv(predict_file)

    # One CRMP instance is fitted up front on the producer at index 1; its
    # tau_ and gains_ attributes are overwritten for every parameter row below.
    offset = producer_starting_indicies[1]
    history = producers[1][offset:]
    trimmed_injectors = [injector[offset:] for injector in injectors]
    X, y = production_rate_dataset(history, *trimmed_injectors)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, shuffle=False)
    crmp = CRMP().fit(X_train, y_train)

    for i, producer_name in enumerate(producer_names):
        rows = producer_rows_from_df(params_df, i + 1)

        offset = producer_starting_indicies[i]
        history = producers[i][offset:]
        trimmed_injectors = [injector[offset:] for injector in injectors]
        X, y = production_rate_dataset(history, *trimmed_injectors)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.5, shuffle=False)

        n_points = len(history)
        t = np.linspace(1, n_points, n_points)
        n_train = len(y_train)
        train_time = t[:n_train]
        # One extra leading sample is skipped on the test side.
        test_time = t[n_train + 1:]

        # Empty series exist only to create opaque legend entries; the real
        # curves are drawn almost transparent.
        plt.plot([], [], c='r', label='Fit')
        plt.plot([], [], c='g', label='Predict')
        plt.plot(t, history, c='k')

        curves = (
            (X_train, train_time, '--', 'r'),  # fitting
            (X_test, test_time, ':', 'g'),     # prediction
        )
        for _, row in rows.iterrows():
            tau, f1, f2, f3, f4 = (
                row[key] for key in
                ('tau_final', 'f1_final', 'f2_final', 'f3_final', 'f4_final')
            )
            crmp.tau_ = tau
            crmp.gains_ = [f1, f2, f3, f4]
            for features, times, fmt, colour in curves:
                plt.plot(times, crmp.predict(features), fmt,
                         alpha=0.02, c=colour, linewidth=2)

        # Vertical marker at the first test time.
        plt.vlines(test_time[0], 0, 1.1 * max(history),
                   linewidth=2, alpha=0.8)
        plot_helper(FIG_DIR,
                    title=producer_name,
                    xlabel='Time [days]',
                    ylabel='Production Rate [bbls/day]',
                    legend=True,
                    save=True)
def total_water_injected_and_predicted_water_cut_dimensionless_time():
    """Plot water-cut fits and step-size-12 predictions vs. dimensionless time.

    Dimensionless time is computed as ``W_t / V_p`` with ``V_p = 3.53E+07``.
    For each model, the fitted values from ``koval_fitting_file`` are followed
    by 30 prediction slots filled from ``koval_predictions_file``; the figure
    is handed to ``plot_helper`` with ``save=True``.
    """
    plt.figure()
    pore_volume = 3.53E+07
    fitting_df = pd.read_csv(koval_fitting_file)
    predictions_df = pd.read_csv(koval_predictions_file)
    step_12 = predictions_df.loc[predictions_df['Step size'] == 12]
    models = ['Koval', 'LinearRegression', 'ElasticNet']

    t_D = W_t / pore_volume

    for model in models:
        model_fit = fitting_df.loc[fitting_df['Model'] == model]
        model_predictions = step_12.loc[step_12['Model'] == model]

        # Slot k holds the prediction for time index 121 + k; slots without a
        # prediction row stay at 0.
        predicted_times = [0] * 30
        predicted_values = [0] * 30
        for _, row in model_predictions.iterrows():
            slot = int(row['t_i'] - 121)
            predicted_times[slot] = int(row['t_i'])
            predicted_values[slot] = row['Prediction']

        x = model_fit['t_i'].tolist() + predicted_times
        y = model_fit['Fit'].tolist() + predicted_values
        plt.plot(t_D[x], y, linestyle='--', linewidth=2, alpha=0.6)

    plt.axvline(x=t_D[120], color='k')
    # The data curve reuses the time indices of the last model plotted above.
    plt.plot(t_D[x], f_w[3:])

    legend = models + ['Predictions Start', 'Data']
    plot_helper(FIG_DIR,
                title='Water Cut Fitting and Predictions',
                xlabel='Dimensionless Time',
                ylabel='Estimated Water Cut',
                legend=legend,
                save=True)
def total_water_injected_and_water_cut():
    """Plot ``f_w`` against ``W_t`` and hand the figure to ``plot_helper``."""
    axis_labels = {'xlabel': 'Total Water Injected', 'ylabel': 'Water Cut'}
    plt.figure()
    plt.plot(W_t, f_w)
    plot_helper(FIG_DIR, save=True, **axis_labels)
def objective_function_contour_plot():
    """Draw a filled MSE contour over (f1, tau) for every producer.

    Rows come from ``objective_function_df``. The producer's entry in
    ``true_params`` is overlaid as a red marker, and each figure is handed to
    ``plot_helper`` with ``save=True``.
    """
    for producer in range(1, number_of_producers + 1):
        rows = producer_rows_from_df(objective_function_df, producer)
        grid_x, grid_y, grid_z = contour_params(rows,
                                                x_column='f1',
                                                y_column='tau',
                                                z_column='MSE')
        plt.contourf(grid_x, grid_y, grid_z, 15, alpha=1.0)
        plt.colorbar()

        true_x, true_y = true_params[producer]
        marker = plt.scatter(true_x, true_y, s=100, c='r',
                             label='True Value', alpha=0.4)
        plt.legend(handles=[marker], loc='upper left')
        plt.tight_layout()
        plt.ylim(0, 100)

        plot_helper(FIG_DIR,
                    title='CRMP: Producer {} Objective Function'.format(producer),
                    xlabel=xlabel,
                    ylabel=ylabel,
                    save=True)
def initial_guesses_and_mse_from_prediction():
    """Draw a filled MSE contour over the initial (f1, tau) guesses per producer.

    Rows come from ``fitting_sensitivity_analysis_df``, selected by its
    ``'Producer'`` column. The producer's entry in ``true_params`` is overlaid
    as a red marker, and each figure is handed to ``plot_helper`` with
    ``save=True``.
    """
    sensitivity_df = fitting_sensitivity_analysis_df
    for producer in range(1, number_of_producers + 1):
        rows = sensitivity_df.loc[sensitivity_df['Producer'] == producer]
        grid_x, grid_y, grid_z = contour_params(rows,
                                                x_column='f1_initial',
                                                y_column='tau_initial',
                                                z_column='MSE')
        plt.contourf(grid_x, grid_y, grid_z, 15, alpha=1.0)
        plt.colorbar()

        true_x, true_y = true_params[producer]
        marker = plt.scatter(true_x, true_y, s=100, c='r',
                             label='Actual', alpha=0.5)
        plt.legend(handles=[marker], loc='upper left')
        plt.tight_layout()
        plt.ylim(0, 100)

        title = 'CRMP: Producer {} Initial Guesses with MSEs from Prediction'.format(
            producer)
        plot_helper(FIG_DIR, title=title, xlabel=xlabel, ylabel=ylabel,
                    save=True)
def determine_train_test_split():
    """Show 30-step CrmpBHP predictions for a sweep of train sizes.

    For the producer at index 4 of ``producer_names``, the dataset is split
    without shuffling at 81 evenly spaced train fractions from 0.1 to 0.9.
    A fresh ``CrmpBHP`` is fitted on each split and its prediction over the
    first 30 test rows is shown next to the true values. Figures are
    displayed with ``plt.show()`` and ``plot_helper`` is called with
    ``save=False``.
    """
    train_sizes = np.linspace(0.1, 0.9, 81)
    for i in (4,):
        # Constructing dataset
        name = producer_names[i]
        print(name)
        producer = get_real_producer_data(producers_df, name, bhp=True)
        injectors = injectors_df[['Name', 'Date', 'Water Vol']]
        X, y = construct_real_production_rate_dataset(producer, injectors)

        for train_size in train_sizes:
            split = train_test_split(X, y, train_size=train_size, shuffle=False)
            X_train, X_test, y_train, y_test = (part.to_numpy() for part in split)

            n_train = len(X_train)
            t_test = np.linspace(n_train, n_train + 29, 30)

            model = CrmpBHP().fit(X_train, y_train)
            # Seed the model with the last observed training value.
            model.q0 = y_train[-1]
            # Column 0 of the features is excluded from the prediction input.
            y_hat = model.predict(X_test[:30, 1:])

            plt.plot(t_test, y_test[:30], color='k', label='True Value')
            plt.plot(t_test, y_hat, color='r', label='Prediction')
            plot_helper(FIG_DIR,
                        title='{}: {} Train Size'.format(name, train_size),
                        xlabel='Days',
                        ylabel='Production Rate [bbls/day]',
                        legend=True,
                        save=False)
            plt.show()
def plot_on_line_hours_per_day():
    """Save a 5-bin histogram of the ``'On-Line'`` column for every producer.

    Rows of ``producers_df`` are selected by ``'Name'`` for each entry of
    ``producer_names``; each histogram is handed to ``plot_helper`` with
    ``save=True``.
    """
    for name in producer_names:
        producer = producers_df.loc[producers_df['Name'] == name]
        on_line_hours = producer['On-Line']
        plt.hist(on_line_hours, bins=5)
        # The histogram's x-axis carries the on-line values themselves, not
        # elapsed time, so it is labelled accordingly.
        plot_helper(FIG_DIR, title=name, xlabel='On-Line Hours per Day',
                    save=True)
def plot_histogram_of_production_rates():
    """Save a 10-bin histogram of non-zero ``'Total Vol'`` for every producer."""
    for name in producer_names:
        rows = producers_df.loc[producers_df['Name'] == name]
        # Rows with a zero total volume are left out of the histogram.
        non_zero_rows = rows[rows['Total Vol'] != 0]
        plt.hist(non_zero_rows['Total Vol'], bins=10)
        plot_helper(FIG_DIR,
                    title=name,
                    xlabel='Production Rate [bbls/day]',
                    save=True)
def producers_vs_time():
    """Plot the transpose of ``producers`` against ``time`` in one figure."""
    plt.figure()
    plt.plot(time, producers.T)
    helper_kwargs = {
        'xlabel': 'Time',
        'ylabel': 'Production Rate',
        'legend': producer_names,
        'save': True,
    }
    plot_helper(FIG_DIR, **helper_kwargs)
def plot_bhp():
    """Plot the ``'Av BHP'`` column against a 1..N sample index per producer."""
    for name in producer_names:
        rows = producers_df.loc[producers_df['Name'] == name]
        pressure = rows['Av BHP']
        n_samples = len(pressure)
        sample_index = np.linspace(1, n_samples, n_samples)
        plt.plot(sample_index, pressure)
        plot_helper(FIG_DIR,
                    title=name,
                    xlabel='Time [days]',
                    ylabel='Bottom Hole Pressure [psi]',
                    save=True)
def plot_delta_bhp():
    """Plot the ``'delta_p'`` column against a 1..N sample index per producer.

    The data for each name is obtained from ``get_real_producer_data`` with
    ``bhp=True``.
    """
    for name in producer_names:
        producer_data = get_real_producer_data(producers_df, name, bhp=True)
        pressure_change = producer_data['delta_p']
        n_samples = len(pressure_change)
        sample_index = np.linspace(1, n_samples, n_samples)
        plt.plot(sample_index, pressure_change)
        plot_helper(FIG_DIR,
                    title=name,
                    xlabel='Time [days]',
                    ylabel='Change in Bottom Hole Pressure [psi]',
                    save=True)
def plot_production_rate():
    """Plot the ``'total rate'`` column against a 1..N day index per producer.

    NOTE(review): a second function named ``plot_production_rate`` is defined
    later in this file and replaces this one at import time; one of the two
    should be renamed if both are needed.
    """
    for name in producer_names:
        producer = producers_df.loc[producers_df['Name'] == name]
        production_rate = producer['total rate']
        n_days = len(production_rate)
        # Unit-spaced axis 1..N, matching the other per-day plots in this
        # file; linspace(0, N, N) would give a step of N / (N - 1).
        t = np.linspace(1, n_days, n_days)
        plt.plot(t, production_rate)
        plot_helper(FIG_DIR,
                    xlabel='Time [days]',
                    ylabel='Production Rate [bbls/day]',
                    title=name,
                    save=True)
def plot_production_rate():
    """Plot producers 'PA09' and 'PA12' from their starting index onward.

    Both curves share one figure, which is handed to ``plot_helper`` with
    ``save=True``.

    NOTE(review): an earlier function in this file has the same name; this
    later definition is the one that remains bound.
    """
    selected_names = ['PA09', 'PA12']
    for name in selected_names:
        position = producer_names.index(name)
        print(position)
        first_sample = producer_starting_indicies[position]
        series = producers[position]
        plt.plot(time[first_sample:], series[first_sample:])
    plot_helper(FIG_DIR,
                xlabel='Date',
                ylabel='Production Rate',
                legend=selected_names,
                save=True)
def producers_vs_injector():
    """Scatter every producer against each injector, one figure per injector."""
    for i in range(len(injectors)):
        injection = injectors[i]
        plt.figure()
        for production in producers:
            plt.scatter(injection, production)
        figure_title = 'Injector {}'.format(i + 1)
        plot_helper(FIG_DIR,
                    title=figure_title,
                    xlabel='Injection Rate',
                    ylabel='Production Rate',
                    legend=producer_names,
                    save=True)
def production_rate_vs_different_time_constants():
    """Plot the columns of ``tau_at_zero_df`` from index 2 on against ``'time'``.

    The legend labels one curve per time constant (1e-06 to 100).
    """
    tau_labels = [
        'Tau = 1e-06', 'Tau = 1', 'Tau = 10', 'Tau = 20', 'Tau = 50',
        'Tau = 100'
    ]
    elapsed = tau_at_zero_df['time']
    rate_columns = tau_at_zero_df.iloc[:, 2:]
    plt.plot(elapsed, rate_columns, alpha=0.5, linewidth=3)
    plot_helper(
        FIG_DIR,
        title='CRMP: Constant Injection Rate Over Different Time Constants',
        xlabel='Time',
        ylabel='Production Rate',
        legend=tau_labels,
        save=True)
def plot_fractional_flow_curve():
    """Plot water production divided by total production for every producer.

    Each series is taken from the producer's starting index onward, and NaN
    ratios are replaced with 0 before plotting.
    """
    for i, producer_name in enumerate(producer_names):
        first_sample = producer_starting_indicies[i]
        total = producers[i][first_sample:]
        water = producers_water_production[i][first_sample:]
        ratio = water / total
        ratio.fillna(0, inplace=True)
        plt.plot(time[first_sample:], ratio)
        plot_helper(FIG_DIR,
                    title=producer_name,
                    xlabel='Time [days]',
                    ylabel='Water Fraction of Total Production [unitless]',
                    save=True)
def plot_imputed_and_original_production_rate():
    """Plot each producer's column before and after ``impute_training_data``.

    A deep copy of the producer's column is taken first, so the first curve
    shows the untouched data and the second shows ``producer[name]`` after the
    imputation call.
    """
    for name in producer_names:
        producer = get_real_producer_data(producers_df, name)
        untouched = deepcopy(producer[name])
        n_samples = len(producer)
        placeholder_target = np.zeros(n_samples)
        # NOTE(review): the return value is indexed and then discarded, so
        # this relies on impute_training_data modifying `producer` in place —
        # confirm against the helper.
        impute_training_data(producer, placeholder_target, name)[0]
        sample_index = np.linspace(1, n_samples, n_samples)
        plt.plot(sample_index, untouched)
        plt.plot(sample_index, producer[name])
        plot_helper(FIG_DIR,
                    title='{}: Imputed Production Data'.format(name),
                    xlabel='Time [days]',
                    ylabel='Producer Rate [bbls/day]',
                    save=True)
def production_rate_with_predictions():
    """Plot model fits and step-size-12 predictions against producer data.

    Fits come from ``fit_file`` and predictions from ``predict_file``. For
    every producer and model, the fitted values are followed by 29 prediction
    slots; a vertical line is drawn at x=120 and the producer's series is
    drawn last. Each figure is handed to ``plot_helper`` with ``save=True``.
    """
    fit_df = pd.read_csv(fit_file)
    predict_df = pd.read_csv(predict_file)

    for i in range(len(producers)):
        producer_number = i + 1
        plt.figure()

        producer_fits = fit_df.loc[fit_df['Producer'] == producer_number]
        producer_predictions = predict_df.loc[
            predict_df['Producer'] == producer_number]
        producer = producers[i]
        step_12 = producer_predictions.loc[
            producer_predictions['Step size'] == 12]

        models = ['CRMP', 'LinearRegression', 'BayesianRidge']
        for model in models:
            model_fit = producer_fits.loc[producer_fits['Model'] == model]
            model_predictions = step_12.loc[step_12['Model'] == model]

            # Slot k holds the prediction for time index 121 + k; slots
            # without a prediction row stay at 0.
            predicted_times = [0] * 29
            predicted_values = [0] * 29
            for _, row in model_predictions.iterrows():
                slot = int(row['t_i'] - 121)
                predicted_times[slot] = int(row['t_i'])
                predicted_values[slot] = row['Prediction']

            x = model_fit['t_i'].tolist() + predicted_times
            y = model_fit['Fit'].tolist() + predicted_values
            plt.plot(x, y, linestyle='--', linewidth=2, alpha=0.6)

        plt.axvline(x=120, color='k')
        plt.plot(producer)

        legend = models + ['Predictions Start', 'Data']
        plot_helper(FIG_DIR,
                    title='Producer {}'.format(producer_number),
                    xlabel='Time',
                    ylabel='Production Rate Fitting and Predictions',
                    legend=legend,
                    save=True)
def plot_injection_rates():
    """Print shut-in statistics and plot the injection rate of every injector.

    For each name in ``injector_names``, the matching rows of ``injectors_df``
    are selected and three values are printed: the series length, the number
    of zero ``'Water Vol'`` entries, and their ratio. The rate series is then
    plotted and handed to ``plot_helper`` with ``save=True``.
    """
    for name in injector_names:
        injector = injectors_df.loc[injectors_df['Name'] == name]
        injection_rate = injector['Water Vol']
        n_samples = len(injection_rate)
        shut_in_count = (injection_rate == 0).sum()
        print('Length: {}'.format(n_samples))
        print('Count: {}'.format(shut_in_count))
        print('Shut in Fraction: {}'.format(shut_in_count * 1.0 / n_samples))

        # No early `continue` here: skipping the rest of the loop body would
        # leave the plotting below unreachable.
        # Unit-spaced axis 1..N, matching the other per-day plots in this file.
        t = np.linspace(1, n_samples, n_samples)
        plt.plot(t, injection_rate)
        plot_helper(FIG_DIR,
                    xlabel='Time [days]',
                    ylabel='Injection Rate [bbls/day]',
                    title=name,
                    save=True)
def parameter_convergence():
    """Plot initial-to-final parameter trajectories for every producer.

    For each producer, rows of ``fitting_sensitivity_analysis_df`` are turned
    into (initial, final) coordinate pairs by
    ``initial_and_final_params_from_df``. Every pair is drawn as a green 'o'
    (initial), a red 'x' (final) and a faint connecting line; the producer's
    entry in ``true_params`` is drawn as a large blue 'X'. Each figure is
    handed to ``plot_helper`` with ``save=True``.
    """
    for i in range(len(producers)):
        plt.figure(figsize=[7, 4.8])
        producer = i + 1
        producer_rows_df = producer_rows_from_df(
            fitting_sensitivity_analysis_df, producer)
        x, y = initial_and_final_params_from_df(producer_rows_df)
        x_true, y_true = true_params[producer]

        # Reset per figure: with no rows the handles would otherwise be
        # unbound (NameError) or left over from the previous producer.
        initial = final = None
        for x_pair, y_pair in zip(x, y):
            initial = plt.scatter(x_pair[0], y_pair[0], s=40, c='g',
                                  marker='o', label='Initial')
            final = plt.scatter(x_pair[1], y_pair[1], s=40, c='r',
                                marker='x', label='Final')
            plt.plot(x_pair, y_pair, c='k', alpha=0.15)

        actual = plt.scatter(x_true, y_true, s=200, c='b', marker='X',
                             label='True Value')

        title = 'CRMP: Producer {} Initial Parameter Values with Convergence'.format(
            producer)
        # Only the last scatter of each kind is needed as a legend proxy.
        handles = [h for h in (actual, initial, final) if h is not None]
        plt.legend(handles=handles,
                   bbox_to_anchor=(1.04, 1),
                   loc="upper left")
        plt.xlim(0, 1)
        plt.ylim(0, 100)
        plt.tight_layout()
        plot_helper(FIG_DIR, title=title, xlabel=xlabel, ylabel=ylabel,
                    save=True)
def plot_average_hour_production_rate():
    """Plot ``'Total Vol'`` divided by ``'On-Line'`` for every producer.

    NaN and +inf ratios are replaced with 0. Each series is plotted against
    the last ``len(series)`` points of a fixed 1..1317 axis, the y-range is
    set to 0..1.1 * max, and the figure is handed to ``plot_helper`` with
    ``save=True``.
    """
    full_axis = np.linspace(1, 1317, 1317)
    for name in producer_names:
        rows = producers_df.loc[producers_df['Name'] == name]
        hourly_rate = rows['Total Vol'] / rows['On-Line']
        hourly_rate.fillna(0, inplace=True)
        hourly_rate.replace(np.inf, 0, inplace=True)

        n_samples = len(hourly_rate)
        # Right-align the series on the fixed axis.
        plt.plot(full_axis[-n_samples:], hourly_rate)

        upper_limit = 1.1 * max(hourly_rate)
        print(upper_limit)
        plt.ylim(0, upper_limit)
        plot_helper(FIG_DIR,
                    title=name,
                    xlabel='Time [days]',
                    ylabel='Hourly Production Rate [bbls/hour]',
                    save=True)
def gradient_across_parameter_space_prediction_data():
    """Draw a filled contour of the MSE gradient over (f1, tau) per producer.

    Only the first component returned by ``np.gradient`` (the derivative
    along axis 0 of the MSE grid) is plotted.

    NOTE(review): the figure title says "ln(Gradient)" but no logarithm is
    applied here — confirm which is intended.
    """
    for producer in range(1, number_of_producers + 1):
        rows = producer_rows_from_df(objective_function_df, producer)
        grid_x, grid_y, mse_grid = contour_params(rows,
                                                  x_column='f1',
                                                  y_column='tau',
                                                  z_column='MSE')
        axis0_gradient = np.gradient(mse_grid)[0]
        plt.contourf(grid_x, grid_y, axis0_gradient, 15, alpha=1.0)
        plt.colorbar()
        plt.tight_layout()
        plt.ylim(0, 100)

        title = 'CRMP: Producer {} ln(Gradient) Across Parameter Space for MSEs from Prediction'.format(
            producer)
        plot_helper(FIG_DIR, title=title, xlabel=xlabel, ylabel=ylabel,
                    save=True)
def fitted_params_and_mean_squared_error_fitting():
    """Draw a filled MSE contour over the initial (f1, tau) values per producer.

    Rows come from ``fitting_sensitivity_analysis_df``. The producer's entry
    in ``true_params`` is overlaid as a red marker, and each figure is handed
    to ``plot_helper`` with ``save=True``.
    """
    for i in range(len(producers)):
        producer = i + 1
        rows = producer_rows_from_df(fitting_sensitivity_analysis_df, producer)
        grid_x, grid_y, grid_z = contour_params(rows,
                                                x_column='f1_initial',
                                                y_column='tau_initial',
                                                z_column='MSE')
        plt.contourf(grid_x, grid_y, grid_z)
        plt.colorbar()

        true_x, true_y = true_params[producer]
        marker = plt.scatter(true_x, true_y, c='red', label='Actual')
        plt.legend(handles=[marker])

        title = 'CRMP Producer {}: Fitted Parameter Values with ln(MSE) from Fitting'.format(
            producer)
        plot_helper(FIG_DIR, title=title, xlabel=xlabel, ylabel=ylabel,
                    save=True)
def water_cut_vs_time():
    """Plot ``f_w`` against its own index and hand the figure to ``plot_helper``."""
    axis_labels = {'xlabel': 'Time', 'ylabel': 'Water Cut'}
    plt.figure()
    plt.plot(f_w)
    plot_helper(FIG_DIR, save=True, **axis_labels)
# NOTE(review): this run of statements has no visible `def`; it reads `model`,
# `scaler`, `name`, `X_train`, `X_test`, `y_train`, `y_test` and
# `X_test_scaled` from a scope that is not shown here — confirm where it
# belongs before relying on it.

# Roll the `model` forward 30 steps, writing each prediction into the next
# position of X_test_scaled so that it becomes the following step's input.
y_hat_lstm = []
for j in range(30):
    y_hat_j = model.predict(X_test_scaled[j:(j + 1)])[0][0]
    X_test_scaled[j + 1] = y_hat_j
    y_hat_lstm.append(y_hat_j)
# Reshape to a single column and undo the scaling applied by `scaler`.
y_hat_lstm = np.array(y_hat_lstm).reshape(-1, 1)
y_hat_lstm = scaler.inverse_transform(y_hat_lstm)
r2, mse = fit_statistics(y_hat_lstm, y_test[:30])
print(mse)
# CRMP comparison: fit on the training data, predict the first 30 test rows
# with feature column 0 left out.
crmp = CRMP().fit(X_train, y_train)
y_hat_crmp = crmp.predict(X_test[:30, 1:])
r2, mse = fit_statistics(y_hat_crmp, y_test[:30])
print(mse)
# Fixed x-axis of 30 points from 76 to 105.
t = np.linspace(76, 105, 30)
plt.plot(t, y_test[:30], color='k', label='True Value', linewidth=2)
plt.plot(t, y_hat_crmp, alpha=0.5, label='CRMP', linewidth=2)
plt.plot(t, y_hat_lstm, alpha=0.5, label='LSTM Neural Network', linewidth=2)
plt.tight_layout()
plot_helper(
    FIG_DIR,
    title='{}: 30 Days Prediction for CRMP and LSTM Neural Network'.format(
        name),
    xlabel='Days',
    ylabel='Production Rate [bbls/day]',
    legend=True,
    save=True)
def best_worse():
    """Plot 30-step bagged predictions of CRMP-BHP, Huber and linear regression.

    For the selected producer, the dataset is split without shuffling. Each
    model in ``models`` is wrapped in an ``MBBaggingRegressor``, and the
    average of the per-estimator predictions over the first 30 test rows is
    plotted next to the true values. The figure is handed to ``plot_helper``
    with ``save=True``.
    """
    # Train fraction per producer index; only train_sizes[i] for the selected
    # producers is read.
    train_sizes = [0.33, 0.735, 0.49, 0.45, 0.52, 0.66, 0.54]
    n_estimators = 100
    delta_t = 1
    # Each entry is [estimator, use_log]; use_log selects the
    # log-transformed copies of the data for fitting and prediction.
    models = [
        [CrmpBHP(), False],
        [HuberRegressor(alpha=0.5, epsilon=100, fit_intercept=False), True],
        [LinearRegression(fit_intercept=False, positive=True), False],
    ]
    # Legend labels, index-aligned with `models`.
    labels = [
        'CRMP-BHP',
        'Huber Regression (Best)',
        'Linear Regression (Worst)',
    ]
    # Alternative producer selection: for i in [0, 1, 2, 3, 4, 6]:
    for i in [1]:
        # Constructing dataset
        name = producer_names[i]
        print(name)
        producer = get_real_producer_data(producers_df, name, bhp=True)
        injectors = injectors_df[['Name', 'Date', 'Water Vol']]
        X, y = construct_real_production_rate_dataset(producer,
                                                      injectors,
                                                      delta_t=delta_t)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, train_size=train_sizes[i], shuffle=False)
        train_length = len(X_train)
        # t_fit is not read again in this function.
        t_fit = np.linspace(0, train_length - 1, train_length)
        # x-axis for the 30 predicted steps that follow the training window.
        t_test = np.linspace(train_length, (train_length + 29), 30)
        plt.plot(t_test,
                 y_test[:30],
                 color='k',
                 label='True Value',
                 linewidth=2)
        # Log-transformed copies: only the column named after the producer is
        # transformed in X; the targets are transformed entirely.
        X_train_scaled = X_train.copy(deep=True)
        X_train_scaled[name] = log_transformation(X_train[name])
        X_test_scaled = X_test.copy(deep=True)
        X_test_scaled[name] = log_transformation(X_test[name])
        y_train_scaled = log_transformation(y_train)
        y_test_scaled = log_transformation(y_test)
        # From here on everything is indexed positionally as arrays.
        X_train = X_train.to_numpy()
        X_test = X_test.to_numpy()
        y_train = y_train.to_numpy()
        y_test = y_test.to_numpy()
        X_train_scaled = X_train_scaled.to_numpy()
        X_test_scaled = X_test_scaled.to_numpy()
        y_train_scaled = y_train_scaled.to_numpy()
        y_test_scaled = y_test_scaled.to_numpy()
        for j in range(len(models)):
            model = models[j][0]
            log = models[j][1]
            print(labels[j])
            bgr = MBBaggingRegressor(base_estimator=model,
                                     n_estimators=n_estimators,
                                     block_size=7,
                                     bootstrap=True,
                                     n_jobs=-1,
                                     random_state=1)
            if log:
                bgr.fit(X_train_scaled, y_train_scaled)
            else:
                bgr.fit(X_train, y_train)
            if j == 0:
                # First model (CRMP-BHP): each estimator is seeded through q0
                # and predicts all 30 steps in one call, with feature column 0
                # left out.
                y_hats = []
                for e in bgr.estimators_:
                    e.q0 = y_train[-1]
                    y_hat_i = e.predict(X_test[:30, 1:])
                    y_hats.append(y_hat_i)
                # Transposed so that each row holds all estimators' values for
                # one time step.
                y_hats_by_time = np.asarray(y_hats).T
                averages = []
                for y_hats_i in y_hats_by_time:
                    average = np.average(y_hats_i)
                    averages.append(average)
                plt.plot(t_test,
                         averages,
                         label=labels[j],
                         alpha=0.5,
                         linewidth=2)
                continue
            # Remaining models: step-by-step prediction in which each output
            # is written into feature column 0 of the next input row.
            y_hats = []
            for e in bgr.estimators_:
                # Start from the last training target.
                if log:
                    y_hat_i = y_train_scaled[-1]
                else:
                    y_hat_i = y_train[-1]
                y_hat = []
                for k in range(30):
                    if log:
                        X_test_i = X_test_scaled[k, :]
                    else:
                        X_test_i = X_test[k, :]
                    # NOTE(review): X_test_i is a row slice of the array, so
                    # this assignment appears to write into X_test /
                    # X_test_scaled itself — confirm that is intended.
                    X_test_i[0] = y_hat_i
                    X_test_i = X_test_i.reshape(1, -1)
                    y_hat_i = e.predict(X_test_i)
                    if log:
                        # presumably the inverse of log_transformation
                        # (log(1 + x)) — verify against that helper
                        y_hat.append(np.exp(y_hat_i) - 1)
                    else:
                        y_hat.append(y_hat_i)
                y_hats.append(y_hat)
            # Rearranged into rows of n_estimators values each.
            y_hats_by_time = np.asarray(y_hats).T.reshape(-1, n_estimators)
            averages = []
            p50s = []
            for y_hats_i in y_hats_by_time:
                average = np.average(y_hats_i)
                p50 = np.percentile(y_hats_i, 50)
                averages.append(average)
                p50s.append(p50)
            # Plotting
            # Negative values are clipped to zero; p50s is computed but only
            # the averages are drawn.
            p50s = np.array(p50s).clip(min=0)
            averages = np.array(averages).clip(min=0)
            plt.plot(t_test,
                     averages,
                     label=labels[j],
                     alpha=0.5,
                     linewidth=2)
        plt.tight_layout()
        plot_helper(
            FIG_DIR,
            title=
            '{}: 30 Days Prediction for CRMP-BHP and the Best and Worst Performing ML Estimators'
            .format(name),
            xlabel='Days',
            ylabel='Production Rate [bbls/day]',
            legend=True,
            save=True)
        # Interactive display is disabled: plt.show()
        print()
def train_bagging_regressor_with_crmp():
    """Fit a bagged CrmpBHP per producer and plot fit, forecast and P10/P90.

    For each selected producer, the dataset is split without shuffling and an
    ``MBBaggingRegressor`` of ``CrmpBHP`` estimators is fitted on the training
    part. The plot shows:

    * the last 100 training points and the ensemble-average one-step fit;
    * the first 30 test points together with every estimator's 30-step
      prediction, their average, and their 10th/90th percentiles;
    * the r-squared of the average fit as an annotation.

    Each figure is handed to ``plot_helper`` with ``save=True``.
    """
    # Train fraction per producer index.
    train_sizes = [0.33, 0.735, 0.49, 0.45, 0.52, 0.66, 0.54]
    n_estimators = 100
    delta_t = 1
    horizon = 30  # number of predicted steps

    for i in [0, 1, 2, 3, 4, 6]:
        # Constructing dataset
        name = producer_names[i]
        print(name)
        producer = get_real_producer_data(producers_df, name, bhp=True)
        injectors = injectors_df[['Name', 'Date', 'Water Vol']]
        X, y = construct_real_production_rate_dataset(producer,
                                                      injectors,
                                                      delta_t=delta_t)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, train_size=train_sizes[i], shuffle=False)
        X_train = X_train.to_numpy()
        X_test = X_test.to_numpy()
        y_train = y_train.to_numpy()
        y_test = y_test.to_numpy()

        train_length = len(X_train)
        t_fit = np.linspace(0, train_length - 1, train_length)
        t_test = np.linspace(train_length, train_length + horizon - 1, horizon)

        # Setting up estimator
        bgr = MBBaggingRegressor(base_estimator=CrmpBHP(delta_t=delta_t),
                                 n_estimators=n_estimators,
                                 block_size=7,
                                 bootstrap=True,
                                 n_jobs=-1,
                                 random_state=0)
        bgr.fit(X_train, y_train)

        # One-step fit on the training data: every estimator is re-seeded
        # from feature column 0 of the row it is about to predict. The row
        # index has its own name so it cannot clobber the producer index `i`.
        y_fits = []
        for e in bgr.estimators_:
            y_fit_e = []
            for row in range(len(y_train)):
                e.q0 = X_train[row, 0]
                y_fit_e.append(e.predict(np.array([X_train[row, 1:]])))
            y_fits.append(y_fit_e)
        # Rearranged into rows of n_estimators values each.
        y_fits_by_time = np.asarray(y_fits).T.reshape(-1, n_estimators)
        y_fits_average = [np.average(step) for step in y_fits_by_time]
        r2, _ = fit_statistics(y_fits_average, y_train)

        # Getting all bootstrapped predictions: each estimator is seeded with
        # the last training target and predicts the whole horizon at once.
        y_hats = []
        for e in bgr.estimators_:
            e.q0 = y_train[-1]
            y_hats.append(e.predict(X_test[:horizon, 1:]))
        # Transposed so that each row holds all estimators' values for one
        # time step.
        y_hats_by_time = np.asarray(y_hats).T
        p10s = [np.percentile(step, 10) for step in y_hats_by_time]
        averages = [np.average(step) for step in y_hats_by_time]
        p90s = [np.percentile(step, 90) for step in y_hats_by_time]

        # Tallest value on the figure, used for the annotation and the
        # vertical train/test divider.
        max_train = np.amax(y_train[-100:])
        max_fit = np.amax(y_fits_average[-100:])
        max_realization = np.amax(y_hats)
        height = max(max_train, max_fit, max_realization)

        # Plotting
        plt.plot(t_fit[-100:], y_train[-100:], color='k')
        plt.plot(t_fit[-100:],
                 y_fits_average[-100:],
                 color='g',
                 label='Fitting')
        plt.plot(t_test, y_test[:horizon], color='k', label='True Value')
        plt.plot(t_test, averages, color='b', label='Average')
        plt.plot(t_test, p10s, color='r', alpha=0.5, label='P10 & P90')
        plt.plot(t_test, p90s, color='r', alpha=0.5)
        for hat in y_hats:
            plt.plot(t_test, hat, color='k', alpha=0.02)
        plt.annotate('r-squared = {:.4f}'.format(r2),
                     xy=(train_length - 60, height))
        plt.vlines(train_length - 1,
                   0,
                   height,
                   linewidth=2,
                   colors='k',
                   linestyles='dashed',
                   alpha=0.8)
        plot_helper(FIG_DIR,
                    title='{}: 30 Days Prediction'.format(name),
                    xlabel='Days',
                    ylabel='Production Rate [bbls/day]',
                    legend=True,
                    save=True)