def random_forest_model_1(X_train, y_train, X_test, y_test, parameters: Dict):
    regressor_1 = RandomForestRegressor(random_state=0)
    regressor_1.fit(X_train, y_train)

    # print the mean squared error and R-squared of the regression model
    # training performance
    print("Train Result:\n")
    print("mean squared error: {}\n".format(
        mean_squared_error(y_train, regressor_1.predict(X_train))))
    print("R_squared: {0:.4f}\n".format(
        r2_score(y_train, regressor_1.predict(X_train))))
    # res = cross_val_score(regr, X_train, y_train, cv=10, scoring='accuracy')
    # print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
    # print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))

    # test performance
    print("Test Result:\n")
    print("mean squared error: {0:.4f}\n".format(
        mean_squared_error(y_test, regressor_1.predict(X_test))))
    print("R_squared: {0:.4f}\n".format(
        r2_score(y_test, regressor_1.predict(X_test))))

    data_set_regressor_1 = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/regressor_1.pickle")
    data_set_regressor_1.save(regressor_1)

    dummy1 = X_test
    return dummy1
def random_forest_model_2(dummy1, X_train2, y_train2, X_test2, y_test2, parameters: Dict):
    regressor_2 = RandomForestRegressor(random_state=0)
    regressor_2.fit(X_train2, y_train2)

    # print the mean squared error and R-squared of the regression model
    # training performance
    print("Train Result:\n")
    print("mean squared error: {}\n".format(
        mean_squared_error(y_train2, regressor_2.predict(X_train2))))
    print("R_squared: {0:.4f}\n".format(
        r2_score(y_train2, regressor_2.predict(X_train2))))

    # test performance
    print("Test Result:\n")
    print("mean squared error: {0:.4f}\n".format(
        mean_squared_error(y_test2, regressor_2.predict(X_test2))))
    print("R_squared: {0:.4f}\n".format(
        r2_score(y_test2, regressor_2.predict(X_test2))))

    data_set_regressor_2 = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/regressor_2.pickle")
    data_set_regressor_2.save(regressor_2)

    dummy2 = X_test2
    return dummy2
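# A minimal sketch of how the two training nodes above might be wired together with
# Kedro's Pipeline/node API, so that model 1 trains first and its dummy output forces
# model 2 to run afterwards. The catalog entry names ("X_train", "y_train", ...) and
# the function name below are illustrative assumptions, not taken from the original
# project catalog.
from kedro.pipeline import Pipeline, node


def create_model_training_pipeline(**kwargs):
    return Pipeline([
        node(random_forest_model_1,
             ["X_train", "y_train", "X_test", "y_test", "parameters"],
             "dummy1"),
        node(random_forest_model_2,
             ["dummy1", "X_train2", "y_train2", "X_test2", "y_test2", "parameters"],
             "dummy2"),
    ])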
def __init__(
        self,
        propensity_model_filename="../data/06_models/propensity_model.pickle",
        uplift_models_filename="../data/06_models/uplift_models_dict.pickle",
        df_filename="../data/07_model_output/df.csv",
        treated_sim_eval_filename="../data/08_reporting/treated__sim_eval_df.csv",
        untreated_sim_eval_filename="../data/08_reporting/untreated__sim_eval_df.csv",
        estimated_effect_filename="../data/08_reporting/estimated_effect_df.csv",
        args_raw=MemoryDataSet({}).load()):
    self.propensity_model = PickleLocalDataSet(
        filepath=propensity_model_filename, version=None)
    self.uplift_models_dict = PickleLocalDataSet(
        filepath=uplift_models_filename, version=None)
    self.df_03 = CSVLocalDataSet(
        filepath=df_filename,
        load_args=dict(index_col=["partition", "index"], float_precision="high"),
        save_args=dict(index=True, float_format="%.16e"),
        version=None,
    )
    self.treated__sim_eval_df = CSVLocalDataSet(
        filepath=treated_sim_eval_filename, version=None)
    self.untreated__sim_eval_df = CSVLocalDataSet(
        filepath=untreated_sim_eval_filename, version=None)
    self.estimated_effect_df = CSVLocalDataSet(
        filepath=estimated_effect_filename, version=None)
    self.args_raw = args_raw
def random_forest_model_2(dummy1, X_train2, y_train2, X_test2, y_test2, parameters,
                          property_names, steam_input_names, emulsion_input_names,
                          scheme_train):
    # regressor_2 = RandomForestRegressor(n_estimators=250, min_samples_split=30, min_samples_leaf=30, max_depth=7)
    regressor_2 = RandomForestRegressor(max_depth=5, n_estimators=500, random_state=0)
    # AdaBoost and XGBoost Regressors
    # regressor_2 = AdaBoostRegressor(n_estimators=500, random_state=0)
    # regressor_2 = XGBRegressor(objective='reg:squarederror')
    regressor_2.fit(X_train2, y_train2)
    # print_decision_rules(regressor_2)

    if scheme_train == 1:
        input_2_names = ['Steam [m3/d]'] + list(emulsion_input_names) + list(property_names)
    else:
        input_2_names = ['Oil [m3/d]'] + list(steam_input_names) + list(property_names)

    feat_idx = np.argsort(regressor_2.feature_importances_)[::-1]
    # import eli5
    # from eli5.sklearn import PermutationImportance
    # perm = PermutationImportance(regressor_2).fit(X_test2, y_test2)
    # feat_idx = np.argsort(eli5.show_weights(perm))[::-1]
    input_2_names = np.array(input_2_names)[feat_idx]
    input_2_names = list(input_2_names)

    print("Feature importance:\n")
    for name, importance in zip(input_2_names, regressor_2.feature_importances_[feat_idx]):
        print(name, ": {0:.3f}".format(importance))

    fig, ax = plt.subplots(1, 1, figsize=(14, 14))
    pd.Series(regressor_2.feature_importances_[feat_idx][::-1],
              index=input_2_names[::-1]).plot(kind='barh', ax=ax)
    ax.set_title('Features importance')
    fig.savefig(parameters["path_model_output_No_DWT"] + "/regressor_2_feature_importance.png")
    print("\n")

    # print the mean squared error and R-squared of the regression model
    # training performance
    print("Train Result:")
    print("mean squared error: {0:.4f}".format(
        mean_squared_error(y_train2, regressor_2.predict(X_train2))))
    print("R_squared: {0:.4f}\n".format(regressor_2.score(X_train2, y_train2)))

    # test performance
    print("Test Result:")
    print("mean squared error: {0:.4f}".format(
        mean_squared_error(y_test2, regressor_2.predict(X_test2))))
    print("R_squared: {0:.4f}\n".format(regressor_2.score(X_test2, y_test2)))

    data_set_regressor_2 = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/regressor_2.pickle")
    data_set_regressor_2.save(regressor_2)

    # fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(12, 12), dpi=800)
    # tree.plot_tree(regressor_2.estimators_[0], feature_names=input_2_names, filled=True)
    # fig.savefig(parameters["path_model_output_No_DWT"] + "/regressor_2_tree.png")
    # tree.export_graphviz(regressor_2.estimators_[0],
    #                      out_file=parameters["path_model_output_No_DWT"] + "/regressor_2_tree.dot",
    #                      feature_names=input_2_names, filled=True)

    algorithm = 1
    dummy2 = X_test2
    return [dummy2, algorithm]
def random_forest_model_1(X_train, y_train, X_test, y_test, parameters,
                          property_names, steam_input_names, emulsion_input_names,
                          scheme_train):
    regressor_1 = RandomForestRegressor(n_estimators=200, random_state=0)
    regressor_1.fit(X_train, y_train)

    if scheme_train == 1:
        input_1_names = list(steam_input_names) + list(property_names)
    else:
        input_1_names = list(emulsion_input_names) + list(property_names)

    feat_idx = np.argsort(regressor_1.feature_importances_)[::-1]
    input_1_names = np.array(input_1_names)[feat_idx]
    input_1_names = list(input_1_names)

    print("Feature importance:\n")
    for name, importance in zip(input_1_names, regressor_1.feature_importances_[feat_idx]):
        print(name, ": {0:.3f}".format(importance))

    fig, ax = plt.subplots(1, 1, figsize=(14, 14))
    pd.Series(regressor_1.feature_importances_[feat_idx][::-1],
              index=input_1_names[::-1]).plot(kind='barh', ax=ax)
    ax.set_title('Features importance')
    fig.savefig(parameters["path_model_output_No_DWT"] + "/regressor_1_feature_importance.png")
    print("\n")

    # print the mean squared error and R-squared of the regression model
    # training performance
    print("Train Result:")
    print("mean squared error: {0:.4f}".format(
        mean_squared_error(y_train, regressor_1.predict(X_train))))
    print("R_squared: {0:.4f}\n".format(
        r2_score(y_train, regressor_1.predict(X_train))))

    # test performance
    print("Test Result:")
    print("mean squared error: {0:.4f}".format(
        mean_squared_error(y_test, regressor_1.predict(X_test))))
    print("R_squared: {0:.4f}\n".format(
        r2_score(y_test, regressor_1.predict(X_test))))

    data_set_regressor_1 = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/regressor_1.pickle")
    data_set_regressor_1.save(regressor_1)

    # fig, axes = plt.subplots(1, 1, figsize=(12, 12), dpi=800)
    # tree.plot_tree(regressor_1.estimators_[0], feature_names=input_1_names, filled=True)
    # fig.savefig(parameters["path_model_output_No_DWT"] + "/regressor_1_tree.png")
    # tree.export_graphviz(regressor_1.estimators_[0],
    #                      out_file=parameters["path_model_output_No_DWT"] + "/regressor_1_tree.dot",
    #                      feature_names=input_1_names, filled=True)

    dummy1 = X_test
    return dummy1
def test_joblib_not_installed(self, filepath_pkl, mocker):
    """Check the error if 'joblib' module is not installed."""
    mocker.patch.dict("sys.modules", joblib=None)
    reload(kedro.io.pickle_local)
    # creating a pickle-based data set should be fine
    PickleLocalDataSet(filepath=filepath_pkl, backend="pickle")
    # creating a joblib-based data set should fail
    pattern = (r"selected backend \'joblib\' could not be imported\. "
               r"Make sure it is installed\.")
    with pytest.raises(ImportError, match=pattern):
        PickleLocalDataSet(filepath=filepath_pkl, backend="joblib")
def test_version_str_repr(self, load_version, save_version):
    """Test that version is in string representation of the class
    instance when applicable."""
    filepath = "test.pkl"
    ds = PickleLocalDataSet(filepath=filepath)
    ds_versioned = PickleLocalDataSet(
        filepath=filepath, version=Version(load_version, save_version))

    assert filepath in str(ds)
    assert "version" not in str(ds)

    assert filepath in str(ds_versioned)
    ver_str = "version=Version(load={}, save='{}')".format(load_version, save_version)
    assert ver_str in str(ds_versioned)
def test_parallel_runner_with_pickle_dataset(self, tmp_path, spark_in, spark_out):
    """Test ParallelRunner with SparkDataSet -> PickleDataSet -> SparkDataSet."""
    pickle_data = PickleLocalDataSet(
        filepath=str(tmp_path / "data.pkl"), backend="pickle"
    )
    catalog = DataCatalog(
        data_sets={
            "spark_in": spark_in,
            "pickle": pickle_data,
            "spark_out": spark_out,
        }
    )
    pipeline = Pipeline(
        [
            node(identity, "spark_in", "pickle"),
            node(identity, "pickle", "spark_out"),
        ]
    )
    runner = ParallelRunner()
    pattern = (
        r"The following data_sets cannot be "
        r"serialized: \[\'spark\_in\'\, \'spark\_out\'\]"
    )
    with pytest.raises(AttributeError, match=pattern):
        runner.run(pipeline, catalog)
def test_bad_backend(self):
    """Check the error when trying to instantiate with invalid backend."""
    pattern = (
        r"backend should be one of \[\'pickle\'\, \'joblib\'\]\, "
        r"got wrong\-backend"
    )
    with pytest.raises(ValueError, match=pattern):
        PickleLocalDataSet(filepath="test.pkl", backend="wrong-backend")
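# The tests above exercise PickleLocalDataSet's optional ``backend`` argument, which
# accepts "pickle" or "joblib" and rejects anything else. A minimal usage sketch of a
# save/load round trip follows; the file path and the object being persisted are
# illustrative assumptions.
from kedro.io import PickleLocalDataSet

data_set = PickleLocalDataSet(filepath="data/06_models/example.pickle", backend="pickle")
data_set.save({"weights": [0.1, 0.2, 0.3]})   # serialise an arbitrary Python object
reloaded = data_set.load()                    # deserialise it again
assert reloaded["weights"] == [0.1, 0.2, 0.3]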
def standardisation(dummy, properties: np.ndarray, files: List, parameters: Dict):
    from sklearn.preprocessing import StandardScaler

    all_wells_input = []
    all_wells_labels = []
    for file in files:
        data_set_input = CSVLocalDataSet(
            filepath=parameters["path_primary"] + "/input_DWT_coeffs_" + file)
        DWT_Aprox_coeff_input = data_set_input.load()
        all_wells_input.append(DWT_Aprox_coeff_input.values)

        data_set_labels = CSVLocalDataSet(
            filepath=parameters["path_primary"] + "/labels_" + file)
        labels = data_set_labels.load()
        all_wells_labels.append(labels.values)
    all_wells_input = np.array(all_wells_input)

    # Standardize dynamic data coeffs
    scaler_coeffs = StandardScaler()
    scaler_coeffs.fit(all_wells_input[0])  # fit based on first well record
    all_wells_standardized_input = []
    all_wells_standardized_input_flattened = []
    for well_coeffs in all_wells_input:
        std_coeffs = scaler_coeffs.transform(well_coeffs)
        all_wells_standardized_input.append(std_coeffs)
        transposed_std_coeffs = np.transpose(std_coeffs)
        flattened_std_coeffs = transposed_std_coeffs.flatten()
        all_wells_standardized_input_flattened.append(flattened_std_coeffs)
    all_wells_standardized_input = np.array(all_wells_standardized_input)
    all_wells_standardized_input_flattened = np.array(
        all_wells_standardized_input_flattened)

    data_set_scaler_coeffs = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_coeffs.pickle")
    data_set_scaler_coeffs.save(scaler_coeffs)

    input_columns = list(DWT_Aprox_coeff_input.columns)
    for std_coeffs, file in zip(all_wells_standardized_input, files):
        std_coeffs = pd.DataFrame(std_coeffs, columns=input_columns)
        data_set = CSVLocalDataSet(
            filepath=parameters["path_features"] + "/std_DWT_input_coeffs_" + file)
        data_set.save(std_coeffs)

    # Standardize static data
    scaler_static = StandardScaler()
    all_wells_standardized_properties = scaler_static.fit_transform(properties)
    data_set_scaler_static = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_static.pickle")
    data_set_scaler_static.save(scaler_static)

    return [
        all_wells_standardized_input_flattened,
        all_wells_standardized_properties,
        all_wells_labels
    ]
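# The fitted scalers are pickled above so that exactly the same transformation can be
# re-applied to unseen wells at prediction time, mirroring what the validation step
# further below does. A minimal sketch, assuming the same `parameters` dict and a
# `new_well_coeffs` array shaped like one well's DWT coefficient matrix (both names
# are illustrative):
def apply_saved_coeff_scaler(new_well_coeffs, parameters):
    scaler_coeffs = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_coeffs.pickle").load()
    return scaler_coeffs.transform(new_well_coeffs)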
def test_parallel_runner_with_pickle_dataset(self, tmp_path, spark_in, spark_out,
                                             sample_spark_df):
    """Test ParallelRunner with SparkDataSet -> PickleDataSet -> SparkDataSet."""
    pickle_data = PickleLocalDataSet(filepath=str(tmp_path / "data.pkl"),
                                     backend="pickle")
    catalog = DataCatalog(data_sets={
        "spark_in": spark_in,
        "pickle": pickle_data,
        "spark_out": spark_out,
    })
    pipeline = Pipeline([
        node(identity, "spark_in", "pickle"),
        node(identity, "pickle", "spark_out"),
    ])
    runner = ParallelRunner()
    pattern = r"{0} cannot be serialized. {1} can only be used with serializable data".format(
        str(sample_spark_df.__class__), str(pickle_data.__class__.__name__))
    with pytest.raises(DataSetError, match=pattern):
        runner.run(pipeline, catalog)
def validate(parameters: Dict):
    # def validate(dummy2, parameters: Dict):
    import glob, os

    os.chdir(parameters["path_model_input"])
    files = []
    for file in glob.glob("*.csv"):
        files.append(file)

    filenames = []
    wells_data = []
    for file in files:
        filename, extension = file.split('.')
        filenames.append(filename)
    for file, filename in zip(files, filenames):
        io = DataCatalog({
            filename:
            CSVLocalDataSet(filepath=parameters["path_model_input"] + "/" + file),
        })
        well_data = io.load(filename)
        wells_data.append(well_data)

    Raw_Data_preprocessed = []
    for well, file, filename in zip(wells_data, files, filenames):
        well = well[[
            'Date', 'Injector Bottom Hole Pressure', 'Steam Flow Rate - Outer',
            'Bottom Hole Heel Temperature', 'Emulsion Pressure',
            'Producer Bottom Hole Pressure', 'ESP Speed', 'Emulsion Flow Rate'
        ]]
        for i in range(1, len(well.columns)):
            well[well.columns[i]] = pd.to_numeric(well[well.columns[i]],
                                                  errors='coerce')
        well = well.iloc[:1399]
        well = well.fillna(well.rolling(30, min_periods=1).median())
        well = well.fillna(well.median())
        well['Date'] = pd.to_datetime(well['Date'])
        well = well.set_index('Date')
        Raw_Data_preprocessed.append(well)

    os.chdir(parameters["path_val_stats"])
    static_files = []
    for static_file in glob.glob("*.csv"):
        static_files.append(static_file)

    static_filenames = []
    statics_data = []
    for static_file in static_files:
        static_filename, others = static_file.split('_')
        static_filenames.append(static_filename)
    for static_file, static_filename in zip(static_files, static_filenames):
        io = DataCatalog({
            static_filename:
            CSVLocalDataSet(filepath=parameters["path_val_stats"] + "/" + static_file),
        })
        static_data = io.load(static_filename)
        statics_data.append(static_data)

    statics_data_new = []
    well_name_list = []
    for pad_static in statics_data:
        well_name = pad_static['WELLPAIR_NAME'].values
        well_name_list.append(well_name)
        pad_static = pad_static.set_index('WELLPAIR_NAME')
        pad_static = pad_static.drop(columns=['PLAN_NAME', 'HIGH_PRESSURE'])
        statics_data_new.append(pad_static)

    properties = []
    probabilities = []
    asset_names = []
    for pad_static, names in zip(statics_data_new, well_name_list):
        for well in names:
            prob = pad_static.loc[well, 'Forecast_Prob']
            probabilities.append(prob)
            pad_code = pad_static.loc[well, 'PAD_CODE']
            asset_name, pad = pad_code.split('_')
            asset_names.append(asset_name)
            property_ = pad_static.loc[
                well, 'SAGD_PRESSURE':'BOTTOM_WATER_THICKNESS'].values
            properties.append(property_)
    properties = np.array(properties)

    all_wells_input = []
    all_wells_labels = []
    for well_data, file in zip(Raw_Data_preprocessed, files):
        DWT_Aprox_coeff_input = []
        input_data, labels = split(well_data)
        input_columns = list(input_data.columns)
        for data_idx in input_columns:
            signal = well_data[data_idx].values
            thresh = parameters["thresh"] * np.nanmax(signal)
            coeff = pywt.wavedec(signal,
                                 wavelet=parameters["wavelet"],
                                 mode=parameters["mode1"],
                                 level=parameters["level"])
            coeff[1:] = (pywt.threshold(i, value=thresh,
                                        mode=str(parameters["mode2"]))
                         for i in coeff[1:])
            DWT_Aprox_coeff_input.append(coeff[0])
        DWT_Aprox_coeff_input = pd.DataFrame(
            np.transpose(DWT_Aprox_coeff_input), columns=input_columns)
        data_set_input = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/validation_input_DWT_coeffs_" + file)
        data_set_input.save(DWT_Aprox_coeff_input)
        all_wells_input.append(DWT_Aprox_coeff_input.values)

        data_set_labels = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/validation_labels_" + file)
        data_set_labels.save(labels)
        all_wells_labels.append(labels.values)

    # Standardize dynamic data coeffs
    data_set_scaler_coeffs = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_coeffs.pickle")
    scaler_coeffs = data_set_scaler_coeffs.load()
    all_wells_standardized_input = []
    all_wells_standardized_input_flattened = []
    for well_coeffs in all_wells_input:
        std_coeffs = scaler_coeffs.transform(well_coeffs)
        all_wells_standardized_input.append(std_coeffs)
        transposed_std_coeffs = np.transpose(std_coeffs)
        flattened_std_coeffs = transposed_std_coeffs.flatten()
        all_wells_standardized_input_flattened.append(flattened_std_coeffs)
    all_wells_standardized_input = np.array(all_wells_standardized_input)
    all_wells_standardized_input_flattened = np.array(
        all_wells_standardized_input_flattened)

    input_columns = list(DWT_Aprox_coeff_input.columns)
    for std_coeffs, file in zip(all_wells_standardized_input, files):
        std_coeffs = pd.DataFrame(std_coeffs, columns=input_columns)
        data_set = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/validation_std_DWT_input_coeffs_" + file)
        data_set.save(std_coeffs)

    # Standardize static data
    data_set_scaler_static = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_static.pickle")
    scaler_static = data_set_scaler_static.load()
    all_wells_standardized_properties = scaler_static.fit_transform(properties)

    all_wells_coeffs_reservoir_data = []
    for flattened_std_coeffs, standardized_properties in zip(
            all_wells_standardized_input_flattened,
            all_wells_standardized_properties):
        flattened_std_coeffs = list(flattened_std_coeffs)
        standardized_properties = list(standardized_properties)
        for reservoir_property in standardized_properties:
            flattened_std_coeffs.append(
                reservoir_property)  # append reservoir data to dynamic data coeffs
        all_wells_coeffs_reservoir_data.append(flattened_std_coeffs)
    all_wells_coeffs_reservoir_data = np.array(all_wells_coeffs_reservoir_data)

    well_count = np.arange(len(all_wells_coeffs_reservoir_data))
    daily_timesteps = np.arange(len(all_wells_labels[0]))
    input_data = []
    for coeff_inputs in all_wells_coeffs_reservoir_data:
        for time_lapse in daily_timesteps:
            well_inputs = [time_lapse] + list(
                coeff_inputs)  # append time lapse to input data
            input_data.append(well_inputs)
    input_data = np.array(input_data)

    data_set_regressor_1 = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/regressor_1.pickle")
    regressor_1 = data_set_regressor_1.load()
    number_of_wells = len(well_count)
    wells_steam_rate_predicted = regressor_1.predict(input_data)
    wells_steam_rate_predicted = wells_steam_rate_predicted.reshape(
        (number_of_wells, 1399)).T

    # prediction inputs to model 2
    input_data_model_2 = []
    for coeff_inputs, well in zip(all_wells_coeffs_reservoir_data, well_count):
        for time_lapse in daily_timesteps:
            well_inputs = [time_lapse] + list(
                coeff_inputs)  # append time lapse to input data
            well_inputs_model_2 = [
                wells_steam_rate_predicted[time_lapse, well]
            ] + well_inputs
            input_data_model_2.append(well_inputs_model_2)
    input_data_model_2 = np.array(input_data_model_2)

    data_set_regressor_2 = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/regressor_2.pickle")
    regressor_2 = data_set_regressor_2.load()
    wells_emulsion_rate_predicted = regressor_2.predict(input_data_model_2)
    wells_emulsion_rate_predicted = wells_emulsion_rate_predicted.reshape(
        (number_of_wells, 1399)).T

    # actual targets
    all_wells_steam_data = []
    all_wells_emulsion_data = []
    for ID in well_count:
        well_steam_data = all_wells_labels[ID][:, 0]
        well_emulsion_data = all_wells_labels[ID][:, 1]
        all_wells_steam_data = all_wells_steam_data + list(well_steam_data)
        all_wells_emulsion_data = all_wells_emulsion_data + list(well_emulsion_data)
    all_wells_steam_data = np.array(all_wells_steam_data)
    all_wells_emulsion_data = np.array(all_wells_emulsion_data)
    wells_steam_rate_actual = all_wells_steam_data.reshape(
        (number_of_wells, 1399)).T
    wells_emulsion_rate_actual = all_wells_emulsion_data.reshape(
        (number_of_wells, 1399)).T

    print("Prediction Performance:\n")
    print("Steam Flow Rate:")
    for well, file in zip(well_count, files):
        steam_rate_predicted = wells_steam_rate_predicted[:, well]
        steam_rate_actual = wells_steam_rate_actual[:, well]
        steam_rate_actual_predicted = pd.DataFrame(
            np.vstack((steam_rate_actual, steam_rate_predicted)).T,
            columns=["steam rate actual", "steam rate predicted"])
        data_set_steam_rate = CSVLocalDataSet(
            filepath=parameters["path_model_output"] + "/steam_rate_" + file)
        data_set_steam_rate.save(steam_rate_actual_predicted)
        print(file + " R_squared: {0:.4f}".format(
            r2_score(steam_rate_actual, steam_rate_predicted)))
    print("\n")

    print("Emulsion Flow Rate:")
    for well, file in zip(well_count, files):
        emulsion_rate_predicted = wells_emulsion_rate_predicted[:, well]
        emulsion_rate_actual = wells_emulsion_rate_actual[:, well]
        emulsion_rate_actual_predicted = pd.DataFrame(
            np.vstack((emulsion_rate_actual, emulsion_rate_predicted)).T,
            columns=["emulsion rate actual", "emulsion rate predicted"])
        data_set_emulsion_rate = CSVLocalDataSet(
            filepath=parameters["path_model_output"] + "/emulsion_rate_" + file)
        data_set_emulsion_rate.save(emulsion_rate_actual_predicted)
        print(file + " R_squared: {0:.4f}".format(
            r2_score(emulsion_rate_actual, emulsion_rate_predicted)))

    dummy_validate = files
    return dummy_validate
def test_str_representation(self):
    """Test string representation of the data set instance."""
    data_set = PickleLocalDataSet(filepath="test.pkl", backend="pickle")
    pattern = "PickleLocalDataSet(backend=pickle, filepath=test.pkl)"
    assert pattern in str(data_set)
def predict_oil_then_steam_RF(dummy14, well_count, number_of_wells, time_index,
                              timesteps_validation, all_wells_emulsion_input_val,
                              all_wells_steam_input_val, wells_emulsion_rate_actual,
                              properties_val, stats_val_ROIP, parameters):
    data_set_regressor_1 = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/regressor_1.pickle")
    regressor_1 = data_set_regressor_1.load()
    data_set_regressor_2 = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/regressor_2.pickle")
    regressor_2 = data_set_regressor_2.load()

    well_count = np.arange(len(all_wells_emulsion_input_val))
    wells_RF_array = []
    wells_steam_rate_predicted = []
    wells_emulsion_rate_predicted = []
    for well_predictors_steam, well_predictors_emulsion, property_, well in zip(
            all_wells_steam_input_val, all_wells_emulsion_input_val,
            properties_val, well_count):
        ROIP = stats_val_ROIP[well]
        cum_oil = wells_emulsion_rate_actual[0, well]
        RF = cum_oil / ROIP
        well_RF_array = []
        well_steam_rate_predicted = []
        well_emulsion_rate_predicted = []
        for time_lapse in time_index:
            well_inputs_1 = [RF] + list(
                well_predictors_emulsion[time_lapse]) + list(property_)
            well_inputs_1 = np.array(well_inputs_1).reshape(1, -1)
            well_RF_array.append(RF)
            # [0] converts the single-element prediction array to a scalar, e.g. [300] -> 300
            emulsion_rate_predicted = regressor_1.predict(well_inputs_1)[0]
            well_emulsion_rate_predicted.append(emulsion_rate_predicted)
            # well_inputs_2 = [emulsion_rate_predicted, RF] + list(well_predictors_steam[time_lapse]) + list(property_)
            well_inputs_2 = [emulsion_rate_predicted, RF] + list(
                well_predictors_steam[time_lapse])
            well_inputs_2 = np.array(well_inputs_2).reshape(1, -1)
            steam_rate_predicted = regressor_2.predict(well_inputs_2)[0]
            well_steam_rate_predicted.append(steam_rate_predicted)
            cum_oil = cum_oil + emulsion_rate_predicted
            RF = cum_oil / ROIP
        wells_RF_array.append(np.array(well_RF_array))
        wells_emulsion_rate_predicted.append(np.array(well_emulsion_rate_predicted))
        wells_steam_rate_predicted.append(np.array(well_steam_rate_predicted))

    wells_RF_array = np.array(wells_RF_array)
    wells_emulsion_rate_predicted = np.array(wells_emulsion_rate_predicted)
    wells_emulsion_rate_predicted = wells_emulsion_rate_predicted.reshape(
        (number_of_wells, timesteps_validation)).T
    wells_steam_rate_predicted = np.array(wells_steam_rate_predicted)
    wells_steam_rate_predicted = wells_steam_rate_predicted.reshape(
        (number_of_wells, timesteps_validation)).T

    scheme = 4
    dummy15 = scheme
    return [
        dummy15, wells_steam_rate_predicted, wells_emulsion_rate_predicted,
        wells_RF_array, scheme
    ]
def versioned_pickle_data_set(filepath_pkl, load_version, save_version):
    return PickleLocalDataSet(filepath=filepath_pkl,
                              version=Version(load_version, save_version))
def pickle_data_set_with_args(filepath_pkl):
    return PickleLocalDataSet(
        filepath=filepath_pkl,
        load_args={"fix_imports": False},
        save_args={"fix_imports": False},
    )
def pickle_data_set(filepath_pkl, request):
    return PickleLocalDataSet(filepath=filepath_pkl, backend=request.param)
def predict_oil_then_steam(dummy14, well_count, number_of_wells, time_index,
                           timesteps_validation, all_wells_emulsion_input_val,
                           all_wells_steam_input_val, properties_val, parameters,
                           algorithm):
    input_data = []
    for well_predictors_emulsion, property_ in zip(all_wells_emulsion_input_val,
                                                   properties_val):
        for time_lapse in time_index:
            well_inputs = list(
                well_predictors_emulsion[time_lapse]) + list(property_)
            input_data.append(well_inputs)
    input_data = np.array(input_data)

    if algorithm == 1:
        data_set_regressor_1 = PickleLocalDataSet(
            filepath=parameters["path_models"] + "/regressor_1.pickle")
        regressor_1 = data_set_regressor_1.load()
        wells_emulsion_rate_predicted = regressor_1.predict(input_data)
    else:
        # # standardization
        # dataset_scaler_input_1 = PickleLocalDataSet(filepath=parameters["path_models"]+"/scaler_input_1.pickle")
        # scaler_input_1 = dataset_scaler_input_1.load()
        # input_data_model_1 = scaler_input_1.transform(input_data_model_1)
        from tensorflow import keras
        model_1 = keras.models.load_model(parameters["path_models"] +
                                          "/network_model_1.h5")
        wells_emulsion_rate_predicted = model_1.predict(input_data)
    wells_emulsion_rate_predicted = wells_emulsion_rate_predicted.reshape(
        (number_of_wells, timesteps_validation)).T

    # prediction inputs to model 2
    input_data_model_2 = []
    for well_predictors_steam, property_, well in zip(all_wells_steam_input_val,
                                                      properties_val, well_count):
        for time_lapse in time_index:
            well_inputs_model_2 = [
                wells_emulsion_rate_predicted[time_lapse, well]
            ] + list(well_predictors_steam[time_lapse]) + list(property_)
            # well_inputs_model_2 = [wells_emulsion_rate_predicted[time_lapse, well]] + list(well_predictors_steam[time_lapse])
            input_data_model_2.append(well_inputs_model_2)
    input_data_model_2 = np.array(input_data_model_2)

    if algorithm == 1:
        data_set_regressor_2 = PickleLocalDataSet(
            filepath=parameters["path_models"] + "/regressor_2.pickle")
        regressor_2 = data_set_regressor_2.load()
        wells_steam_rate_predicted = regressor_2.predict(input_data_model_2)
    else:
        # # standardization
        # dataset_scaler_input_2 = PickleLocalDataSet(filepath=parameters["path_models"]+"/scaler_input_2.pickle")
        # scaler_input_1 = dataset_scaler_input_2.load()
        # input_data_model_1 = scaler_input_1.transform(input_data_model_2)
        model_2 = keras.models.load_model(parameters["path_models"] +
                                          "/network_model_2.h5")
        wells_steam_rate_predicted = model_2.predict(input_data_model_2)

    scheme = 2
    wells_steam_rate_predicted = wells_steam_rate_predicted.reshape(
        (number_of_wells, timesteps_validation)).T
    dummy15 = scheme
    return [
        dummy15, wells_steam_rate_predicted, wells_emulsion_rate_predicted, scheme
    ]
def __init__(
        self,
        train_df=None,  # type: Optional[pd.DataFrame]
        test_df=None,  # type: Optional[pd.DataFrame]
        cols_features=None,  # type: Optional[List[str]]
        col_treatment="Treatment",  # type: str
        col_outcome="Outcome",  # type: str
        col_propensity="Propensity",  # type: str
        col_cate="CATE",  # type: str
        col_recommendation="Recommendation",  # type: str
        min_propensity=0.01,  # type: float
        max_propensity=0.99,  # type: float
        verbose=2,  # type: int
        uplift_model_params=dict(
            search_cv="sklearn.model_selection.GridSearchCV",
            estimator="xgboost.XGBClassifier",
            scoring=None,
            cv=3,
            return_train_score=False,
            n_jobs=-1,
            param_grid=dict(
                random_state=[0],
                max_depth=[3],
                learning_rate=[0.1],
                n_estimators=[100],
                verbose=[0],
                objective=["binary:logistic"],
                booster=["gbtree"],
                n_jobs=[-1],
                nthread=[None],
                gamma=[0],
                min_child_weight=[1],
                max_delta_step=[0],
                subsample=[1],
                colsample_bytree=[1],
                colsample_bylevel=[1],
                reg_alpha=[0],
                reg_lambda=[1],
                scale_pos_weight=[1],
                base_score=[0.5],
                missing=[None],
            ),
        ),  # type: Union[Dict[str, List[Any]], Type[sklearn.base.BaseEstimator]]
        enable_ipw=True,  # type: bool
        propensity_model_params=dict(
            search_cv="sklearn.model_selection.GridSearchCV",
            estimator="sklearn.linear_model.LogisticRegression",
            scoring=None,
            cv=3,
            return_train_score=False,
            n_jobs=-1,
            param_grid=dict(
                random_state=[0],
                C=[0.1, 1, 10],
                class_weight=[None],
                dual=[False],
                fit_intercept=[True],
                intercept_scaling=[1],
                max_iter=[100],
                multi_class=["ovr"],
                n_jobs=[1],
                penalty=["l1", "l2"],
                solver=["liblinear"],
                tol=[0.0001],
                warm_start=[False],
            ),
        ),  # type: Dict[str, List[Any]]
        cv=3,  # type: int
        index_name="index",  # type: str
        partition_name="partition",  # type: str
        runner="SequentialRunner",  # type: str
        conditionally_skip=False,  # type: bool
        dataset_catalog=dict(
            # args_raw = CSVLocalDataSet(filepath='../data/01_raw/args_raw.csv', version=None),
            # train_df = CSVLocalDataSet(filepath='../data/01_raw/train_df.csv', version=None),
            # test_df = CSVLocalDataSet(filepath='../data/01_raw/test_df.csv', version=None),
            propensity_model=PickleLocalDataSet(
                filepath="../data/06_models/propensity_model.pickle", version=None),
            uplift_models_dict=PickleLocalDataSet(
                filepath="../data/06_models/uplift_models_dict.pickle", version=None),
            df_03=CSVLocalDataSet(
                filepath="../data/07_model_output/df.csv",
                load_args=dict(index_col=["partition", "index"],
                               float_precision="high"),
                save_args=dict(index=True, float_format="%.16e"),
                version=None,
            ),
            treated__sim_eval_df=CSVLocalDataSet(
                filepath="../data/08_reporting/treated__sim_eval_df.csv",
                version=None),
            untreated__sim_eval_df=CSVLocalDataSet(
                filepath="../data/08_reporting/untreated__sim_eval_df.csv",
                version=None),
            estimated_effect_df=CSVLocalDataSet(
                filepath="../data/08_reporting/estimated_effect_df.csv",
                version=None),
        ),  # type: Dict[str, AbstractDataSet]
        logging_config={
            "disable_existing_loggers": False,
            "formatters": {
                "json_formatter": {
                    "class": "pythonjsonlogger.jsonlogger.JsonFormatter",
                    "format": "[%(asctime)s|%(name)s|%(funcName)s|%(levelname)s] %(message)s",
                },
                "simple": {
                    "format": "[%(asctime)s|%(name)s|%(levelname)s] %(message)s"
                },
            },
            "handlers": {
                "console": {
                    "class": "logging.StreamHandler",
                    "formatter": "simple",
                    "level": "INFO",
                    "stream": "ext://sys.stdout",
                },
                "info_file_handler": {
                    "class": "logging.handlers.RotatingFileHandler",
                    "level": "INFO",
                    "formatter": "simple",
                    "filename": "./info.log",
                    "maxBytes": 10485760,  # 10MB
                    "backupCount": 20,
                    "encoding": "utf8",
                    "delay": True,
                },
                "error_file_handler": {
                    "class": "logging.handlers.RotatingFileHandler",
                    "level": "ERROR",
                    "formatter": "simple",
                    "filename": "./errors.log",
                    "maxBytes": 10485760,  # 10MB
                    "backupCount": 20,
                    "encoding": "utf8",
                    "delay": True,
                },
            },
            "loggers": {
                "anyconfig": {
                    "handlers": ["console", "info_file_handler", "error_file_handler"],
                    "level": "WARNING",
                    "propagate": False,
                },
                "kedro.io": {
                    "handlers": ["console", "info_file_handler", "error_file_handler"],
                    "level": "WARNING",
                    "propagate": False,
                },
                "kedro.pipeline": {
                    "handlers": ["console", "info_file_handler", "error_file_handler"],
                    "level": "INFO",
                    "propagate": False,
                },
                "kedro.runner": {
                    "handlers": ["console", "info_file_handler", "error_file_handler"],
                    "level": "INFO",
                    "propagate": False,
                },
                "causallift": {
                    "handlers": ["console", "info_file_handler", "error_file_handler"],
                    "level": "INFO",
                    "propagate": False,
                },
            },
            "root": {
                "handlers": ["console", "info_file_handler", "error_file_handler"],
                "level": "INFO",
            },
            "version": 1,
        },  # type: Optional[Dict[str, Any]]
):
    # type: (...) -> None
    self.runner = None  # type: Optional[str]
    self.kedro_context = None  # type: Optional[Type[FlexibleKedroContext]]
    self.args = None  # type: Optional[Type[EasyDict]]
    self.train_df = None  # type: Optional[Type[pd.DataFrame]]
    self.test_df = None  # type: Optional[Type[pd.DataFrame]]
    self.df = None  # type: Optional[Type[pd.DataFrame]]
    self.propensity_model = None  # type: Optional[Type[sklearn.base.BaseEstimator]]
    self.uplift_models_dict = None  # type: Optional[Type[EasyDict]]
    self.treatment_fractions = None  # type: Optional[Type[EasyDict]]
    self.treatment_fraction_train = None  # type: Optional[float]
    self.treatment_fraction_test = None  # type: Optional[float]
    self.treated__proba = None  # type: Optional[Type[np.array]]
    self.untreated__proba = None  # type: Optional[Type[np.array]]
    self.cate_estimated = None  # type: Optional[Type[pd.Series]]
    self.treated__sim_eval_df = None  # type: Optional[Type[pd.DataFrame]]
    self.untreated__sim_eval_df = None  # type: Optional[Type[pd.DataFrame]]
    self.estimated_effect_df = None  # type: Optional[Type[pd.DataFrame]]
    # Instance attributes were defined above.

    if logging_config:
        logging.config.dictConfig(logging_config)

    args_raw = dict(
        cols_features=cols_features,
        col_treatment=col_treatment,
        col_outcome=col_outcome,
        col_propensity=col_propensity,
        col_cate=col_cate,
        col_recommendation=col_recommendation,
        min_propensity=min_propensity,
        max_propensity=max_propensity,
        verbose=verbose,
        uplift_model_params=uplift_model_params,
        enable_ipw=enable_ipw,
        propensity_model_params=propensity_model_params,
        index_name=index_name,
        partition_name=partition_name,
        runner=runner,
        conditionally_skip=conditionally_skip,
    )
    args_raw = EasyDict(args_raw)
    args_raw.update(dataset_catalog.get("args_raw", MemoryDataSet({}).load()))

    assert args_raw.runner in {"SequentialRunner", "ParallelRunner", None}
    if args_raw.runner is None and args_raw.conditionally_skip:
        log.warning(
            "[Warning] conditionally_skip option is ignored since runner is None")

    self.kedro_context = FlexibleKedroContext(
        runner=args_raw.runner, only_missing=args_raw.conditionally_skip)

    self.runner = args_raw.runner

    if self.runner is None:
        self.df = bundle_train_and_test_data(args_raw, train_df, test_df)
        self.args = impute_cols_features(args_raw, self.df)
        self.args = schedule_propensity_scoring(self.args, self.df)
        self.treatment_fractions = treatment_fractions_(self.args, self.df)
        if self.args.need_propensity_scoring:
            self.propensity_model = fit_propensity(self.args, self.df)
            self.df = estimate_propensity(self.args, self.df,
                                          self.propensity_model)

    if self.runner:
        self.kedro_context.catalog.add_feed_dict(
            {
                "train_df": MemoryDataSet(train_df),
                "test_df": MemoryDataSet(test_df),
                "args_raw": MemoryDataSet(args_raw),
            },
            replace=True,
        )
        self.kedro_context.catalog.add_feed_dict(dataset_catalog, replace=True)
        self.kedro_context.run(tags=["011_bundle_train_and_test_data"])
        self.df = self.kedro_context.catalog.load("df_00")

        self.kedro_context.run(tags=[
            "121_prepare_args",
            "131_treatment_fractions_",
            "141_initialize_model",
        ])
        self.args = self.kedro_context.catalog.load("args")
        self.treatment_fractions = self.kedro_context.catalog.load(
            "treatment_fractions")

        if self.args.need_propensity_scoring:
            self.kedro_context.run(tags=["211_fit_propensity"])
            self.propensity_model = self.kedro_context.catalog.load(
                "propensity_model")
            self.kedro_context.run(tags=["221_estimate_propensity"])
            self.df = self.kedro_context.catalog.load("df_01")
        else:
            self.kedro_context.catalog.add_feed_dict(
                {"df_01": MemoryDataSet(self.df)}, replace=True)

    self.treatment_fraction_train = self.treatment_fractions.train
    self.treatment_fraction_test = self.treatment_fractions.test

    if self.args.verbose >= 3:
        log.info("### Treatment fraction in train dataset: {}".format(
            self.treatment_fractions.train))
        log.info("### Treatment fraction in test dataset: {}".format(
            self.treatment_fractions.test))

    self._separate_train_test()