Example 1
def random_forest_model_1(X_train, y_train, X_test, y_test, parameters: Dict):
    regressor_1 = RandomForestRegressor(random_state=0)
    regressor_1.fit(X_train, y_train)
    '''
    print the mean squared error and accuracy of regression model
    '''
    '''
    training performance
    '''
    print("Train Result:\n")
    print("mean squared error: {}\n".format(mean_squared_error(y_train, regressor_1.predict(X_train))))
    print("R_squared: {0:.4f}\n".format(r2_score(y_train, regressor_1.predict(X_train))))
#     res = cross_val_score(regr, X_train, y_train, cv=10, scoring='accuracy')
#     print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
#     print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))
    '''
    test performance
    '''
    print("Test Result:\n")        
    print("mean squared error: {0:.4f}\n".format(mean_squared_error(y_test, regressor_1.predict(X_test))))
    print("R_squared: {0:.4f}\n".format(r2_score(y_test, regressor_1.predict(X_test))))
    
    data_set_regressor_1 = PickleLocalDataSet(filepath=parameters["path_models"]+"/regressor_1.pickle")
    data_set_regressor_1.save(regressor_1)
    dummy1 = X_test
    return dummy1
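
The model pickled above is read back elsewhere on this page through the same dataset class; below is a minimal load-side sketch, assuming the same parameters["path_models"] layout the example uses (the helper name is hypothetical):

from typing import Dict

from kedro.io import PickleLocalDataSet


def load_regressor_1_and_predict(X_new, parameters: Dict):
    # Reload the RandomForestRegressor persisted by random_forest_model_1
    # above and apply it to new inputs (illustrative helper, not part of the
    # original source).
    data_set_regressor_1 = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/regressor_1.pickle")
    regressor_1 = data_set_regressor_1.load()
    return regressor_1.predict(X_new)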
Example 2
def random_forest_model_2(dummy1, X_train2, y_train2, X_test2, y_test2,
                          parameters: Dict):
    regressor_2 = RandomForestRegressor(random_state=0)
    regressor_2.fit(X_train2, y_train2)
    '''
    print the mean squared error and accuracy of regression model
    '''
    '''
    training performance
    '''
    print("Train Result:\n")
    print("mean squared error: {}\n".format(
        mean_squared_error(y_train2, regressor_2.predict(X_train2))))
    print("R_squared: {0:.4f}\n".format(
        r2_score(y_train2, regressor_2.predict(X_train2))))
    '''
    test performance
    '''
    print("Test Result:\n")
    print("mean squared error: {0:.4f}\n".format(
        mean_squared_error(y_test2, regressor_2.predict(X_test2))))
    print("R_squared: {0:.4f}\n".format(
        r2_score(y_test2, regressor_2.predict(X_test2))))
    data_set_regressor_2 = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/regressor_2.pickle")
    data_set_regressor_2.save(regressor_2)
    dummy2 = X_test2
    return dummy2
Example 3
    def __init__(
        self,
        propensity_model_filename="../data/06_models/propensity_model.pickle",
        uplift_models_filename="../data/06_models/uplift_models_dict.pickle",
        df_filename="../data/07_model_output/df.csv",
        treated_sim_eval_filename="../data/08_reporting/treated__sim_eval_df.csv",
        untreated_sim_eval_filename="../data/08_reporting/untreated__sim_eval_df.csv",
        estimated_effect_filename="../data/08_reporting/estimated_effect_df.csv",
        args_raw=MemoryDataSet({}).load()):

        self.propensity_model = PickleLocalDataSet(
            filepath=propensity_model_filename, version=None)
        self.uplift_models_dict = PickleLocalDataSet(
            filepath=uplift_models_filename, version=None)
        self.df_03 = CSVLocalDataSet(
            filepath=df_filename,
            load_args=dict(index_col=["partition", "index"],
                           float_precision="high"),
            save_args=dict(index=True, float_format="%.16e"),
            version=None,
        )
        self.treated__sim_eval_df = CSVLocalDataSet(
            filepath=treated_sim_eval_filename, version=None)
        self.untreated__sim_eval_df = CSVLocalDataSet(
            filepath=untreated_sim_eval_filename, version=None)
        self.estimated_effect_df = CSVLocalDataSet(
            filepath=estimated_effect_filename, version=None)
        self.args_raw = args_raw
Example 4
def random_forest_model_2(dummy1, X_train2, y_train2, X_test2, y_test2,
                          parameters, property_names, steam_input_names,
                          emulsion_input_names, scheme_train):
#     regressor_2  = RandomForestRegressor(n_estimators=250, min_samples_split=30, min_samples_leaf=30, max_depth=7)
    regressor_2 = RandomForestRegressor(max_depth=5, n_estimators=500,
                                        random_state=0)
    
#     AdaBoost and XGBoost Regressors
#     regressor_2  = AdaBoostRegressor(n_estimators = 500, random_state=0)
#     regressor_2 = XGBRegressor(objective='reg:squarederror')

    regressor_2.fit(X_train2, y_train2)
#     print_decision_rules(regressor_2)
    if scheme_train == 1:
        input_2_names = ['Steam [m3/d]'] + list(emulsion_input_names) + list(property_names)  
    else:
        input_2_names = ['Oil [m3/d]'] + list(steam_input_names) + list(property_names)
    feat_idx = np.argsort(regressor_2.feature_importances_)[::-1]
    
#     import eli5
#     from eli5.sklearn import PermutationImportance
#     perm = PermutationImportance(regressor_2).fit(X_test2, y_test2)
#     feat_idx = np.argsort(eli5.show_weights(perm))[::-1]
    
    input_2_names = np.array(input_2_names)[feat_idx]
    input_2_names = list(input_2_names)
    print("Feature importance:\n")
    for name, importance in zip(input_2_names, regressor_2.feature_importances_[feat_idx]):
        print(name, ": {0:.3f}".format(importance))
    
    fig, ax = plt.subplots(1, 1, figsize=(14, 14))
    pd.Series(regressor_2.feature_importances_[feat_idx][::-1],
              index=input_2_names[::-1]).plot(kind='barh', ax=ax)
    ax.set_title('Features importance')
    fig.savefig(parameters["path_model_output_No_DWT"]+"/regressor_2_feature_importance.png")
    
    print("\n")
    '''
    print the mean squared error and accuracy of regression model
    '''
    '''
    training performance
    '''
    print("Train Result:")
    print("mean squared error: {0:.4f}".format(mean_squared_error(y_train2, regressor_2.predict(X_train2))))
    print("R_squared: {0:.4f}\n".format(regressor_2.score(X_train2, y_train2)))
    '''
    test performance
    '''
    print("Test Result:")        
    print("mean squared error: {0:.4f}".format(mean_squared_error(y_test2, regressor_2.predict(X_test2))))
    print("R_squared: {0:.4f}\n".format(regressor_2.score(X_test2, y_test2)))   
    data_set_regressor_2 = PickleLocalDataSet(filepath=parameters["path_models"]+"/regressor_2.pickle")
    data_set_regressor_2.save(regressor_2)
    
#     fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (12,12), dpi=800)
#     tree.plot_tree(regressor_2.estimators_[0], feature_names = input_2_names, filled = True);
#     fig.savefig(parameters["path_model_output_No_DWT"]+"/regressor_2_tree.png")
#     tree.export_graphviz(regressor_2.estimators_[0], out_file=parameters["path_model_output_No_DWT"]+"/regressor_2_tree.dot",
#                             feature_names = input_2_names, filled = True)
    algorithm = 1                         
    dummy2 = X_test2
    return [dummy2, algorithm]
Example 5
def random_forest_model_1(X_train, y_train, X_test, y_test, parameters,
                          property_names, steam_input_names,
                          emulsion_input_names, scheme_train):
    regressor_1 = RandomForestRegressor(n_estimators=200, random_state=0)
    regressor_1.fit(X_train, y_train)
    if scheme_train == 1:
        input_1_names = list(steam_input_names) + list(property_names)
    else:
        input_1_names = list(emulsion_input_names) + list(property_names)
    feat_idx = np.argsort(regressor_1.feature_importances_)[::-1]
    input_1_names = np.array(input_1_names)[feat_idx]
    input_1_names = list(input_1_names)
    print("Feature importance:\n")
    for name, importance in zip(input_1_names,
                                regressor_1.feature_importances_[feat_idx]):
        print(name, ": {0:.3f}".format(importance))

    fig, ax = plt.subplots(1, 1, figsize=(14, 14))
    pd.Series(regressor_1.feature_importances_[feat_idx][::-1],
              index=input_1_names[::-1]).plot(kind='barh', ax=ax)
    ax.set_title('Features importance')
    fig.savefig(parameters["path_model_output_No_DWT"] +
                "/regressor_1_feature_importance.png")

    print("\n")
    '''
    print the mean squared error and accuracy of regression model
    '''
    '''
    training performance
    '''
    print("Train Result:")
    print("mean squared error: {0:.4f}".format(
        mean_squared_error(y_train, regressor_1.predict(X_train))))
    print("R_squared: {0:.4f}\n".format(
        r2_score(y_train, regressor_1.predict(X_train))))
    '''
    test performance
    '''
    print("Test Result:")
    print("mean squared error: {0:.4f}".format(
        mean_squared_error(y_test, regressor_1.predict(X_test))))
    print("R_squared: {0:.4f}\n".format(
        r2_score(y_test, regressor_1.predict(X_test))))
    data_set_regressor_1 = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/regressor_1.pickle")
    data_set_regressor_1.save(regressor_1)

    #     fig, axes = plt.subplots(1, 1,figsize = (12,12), dpi=800)
    #     tree.plot_tree(regressor_1.estimators_[0], feature_names = input_1_names, filled = True);
    #     fig.savefig(parameters["path_model_output_No_DWT"]+"/regressor_1_tree.png")
    #     tree.export_graphviz(regressor_1.estimators_[0], out_file=parameters["path_model_output_No_DWT"]+"/regressor_1_tree.dot",
    #                             feature_names = input_1_names, filled = True)

    dummy1 = X_test
    return dummy1
Example 6
    def test_joblib_not_installed(self, filepath_pkl, mocker):
        """Check the error if 'joblib' module is not installed."""
        mocker.patch.dict("sys.modules", joblib=None)
        reload(kedro.io.pickle_local)
        # creating a pickle-based data set should be fine
        PickleLocalDataSet(filepath=filepath_pkl, backend="pickle")

        # creating a joblib-based data set should fail
        pattern = (r"selected backend \'joblib\' could not be imported\. "
                   r"Make sure it is installed\.")
        with pytest.raises(ImportError, match=pattern):
            PickleLocalDataSet(filepath=filepath_pkl, backend="joblib")
Example 7
    def test_version_str_repr(self, load_version, save_version):
        """Test that version is in string representation of the class instance
        when applicable."""
        filepath = "test.pkl"
        ds = PickleLocalDataSet(filepath=filepath)
        ds_versioned = PickleLocalDataSet(filepath=filepath,
                                          version=Version(
                                              load_version, save_version))
        assert filepath in str(ds)
        assert "version" not in str(ds)

        assert filepath in str(ds_versioned)
        ver_str = "version=Version(load={}, save='{}')".format(
            load_version, save_version)
        assert ver_str in str(ds_versioned)
Example 8
    def test_parallel_runner_with_pickle_dataset(self, tmp_path, spark_in, spark_out):
        """Test ParallelRunner with SparkDataSet -> PickleDataSet -> SparkDataSet .
        """
        pickle_data = PickleLocalDataSet(
            filepath=str(tmp_path / "data.pkl"), backend="pickle"
        )
        catalog = DataCatalog(
            data_sets={
                "spark_in": spark_in,
                "pickle": pickle_data,
                "spark_out": spark_out,
            }
        )
        pipeline = Pipeline(
            [
                node(identity, "spark_in", "pickle"),
                node(identity, "pickle", "spark_out"),
            ]
        )
        runner = ParallelRunner()

        pattern = (
            r"The following data_sets cannot be "
            r"serialized: \[\'spark\_in\'\, \'spark\_out\'\]"
        )
        with pytest.raises(AttributeError, match=pattern):
            runner.run(pipeline, catalog)
Example 9
    def test_bad_backend(self):
        """Check the error when trying to instantiate with invalid backend."""
        pattern = (
            r"backend should be one of \[\'pickle\'\, \'joblib\'\]\, "
            r"got wrong\-backend"
        )
        with pytest.raises(ValueError, match=pattern):
            PickleLocalDataSet(filepath="test.pkl", backend="wrong-backend")
Example 10
def standardisation(dummy, properties: np.ndarray, files: List,
                    parameters: Dict):
    from sklearn.preprocessing import StandardScaler
    all_wells_input = []
    all_wells_labels = []

    for file in files:
        data_set_input = CSVLocalDataSet(filepath=parameters["path_primary"] +
                                         "/input_DWT_coeffs_" + file)
        DWT_Aprox_coeff_input = data_set_input.load()
        all_wells_input.append(DWT_Aprox_coeff_input.values)
        data_set_labels = CSVLocalDataSet(filepath=parameters["path_primary"] +
                                          "/labels_" + file)
        labels = data_set_labels.load()
        all_wells_labels.append(labels.values)
    all_wells_input = np.array(all_wells_input)

    #     Standardize dynamic data coeffs
    scaler_coeffs = StandardScaler()
    scaler_coeffs.fit(all_wells_input[0])  # fit based on first well record
    all_wells_standardized_input = []
    all_wells_standardized_input_flattened = []
    for well_coeffs in all_wells_input:
        std_coeffs = scaler_coeffs.transform(well_coeffs)
        all_wells_standardized_input.append(std_coeffs)
        transposed_std_coeffs = np.transpose(std_coeffs)
        flattened_std_coeffs = transposed_std_coeffs.flatten()
        all_wells_standardized_input_flattened.append(flattened_std_coeffs)

    all_wells_standardized_input = np.array(all_wells_standardized_input)
    all_wells_standardized_input_flattened = np.array(
        all_wells_standardized_input_flattened)
    data_set_scaler_coeffs = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_coeffs.pickle")
    data_set_scaler_coeffs.save(scaler_coeffs)

    input_columns = list(DWT_Aprox_coeff_input.columns)
    for std_coeffs, file in zip(all_wells_standardized_input, files):
        std_coeffs = pd.DataFrame(std_coeffs, columns=input_columns)
        data_set = CSVLocalDataSet(filepath=parameters["path_features"] +
                                   "/std_DWT_input_coeffs_" + file)
        data_set.save(std_coeffs)


#     Standardize static data
    scaler_static = StandardScaler()
    all_wells_standardized_properties = scaler_static.fit_transform(properties)
    data_set_scaler_static = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_static.pickle")
    data_set_scaler_static.save(scaler_static)
    return [
        all_wells_standardized_input_flattened,
        all_wells_standardized_properties, all_wells_labels
    ]
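
The two scalers persisted above are reloaded at prediction time (see the validate() example further down). A small sketch, under the same path assumptions, that checks the pickle round trip reproduces the fitted transform (the function name and arguments are hypothetical):

import numpy as np
from sklearn.preprocessing import StandardScaler

from kedro.io import PickleLocalDataSet


def check_scaler_roundtrip(scaler_coeffs: StandardScaler,
                           sample_coeffs: np.ndarray, parameters: dict):
    # Reload the scaler saved by standardisation() and confirm it produces
    # the same standardized coefficients as the in-memory object.
    data_set = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_coeffs.pickle")
    reloaded = data_set.load()
    assert np.allclose(scaler_coeffs.transform(sample_coeffs),
                       reloaded.transform(sample_coeffs))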
Example 11
    def test_parallel_runner_with_pickle_dataset(self, tmp_path, spark_in,
                                                 spark_out, sample_spark_df):
        """Test ParallelRunner with SparkDataSet -> PickleDataSet -> SparkDataSet .
        """
        pickle_data = PickleLocalDataSet(filepath=str(tmp_path / "data.pkl"),
                                         backend="pickle")
        catalog = DataCatalog(data_sets={
            "spark_in": spark_in,
            "pickle": pickle_data,
            "spark_out": spark_out,
        })
        pipeline = Pipeline([
            node(identity, "spark_in", "pickle"),
            node(identity, "pickle", "spark_out"),
        ])
        runner = ParallelRunner()

        pattern = r"{0} cannot be serialized. {1} can only be used with serializable data".format(
            str(sample_spark_df.__class__),
            str(pickle_data.__class__.__name__))

        with pytest.raises(DataSetError, match=pattern):
            runner.run(pipeline, catalog)
Example 12
def validate(parameters: Dict):
    # def validate(dummy2, parameters: Dict):
    import glob, os
    os.chdir(parameters["path_model_input"])
    files = []
    for file in glob.glob("*.csv"):
        files.append(file)
    filenames = []
    wells_data = []
    for file in files:
        filename, extension = file.split('.')
        filenames.append(filename)
    for file, filename in zip(files, filenames):
        io = DataCatalog({
            filename:
            CSVLocalDataSet(filepath=parameters["path_model_input"] + "/" +
                            file),
        })
        well_data = io.load(filename)
        wells_data.append(well_data)
    Raw_Data_preprocessed = []
    for well, file, filename in zip(wells_data, files, filenames):
        well = well[[
            'Date', 'Injector Bottom Hole Pressure', 'Steam Flow Rate - Outer',
            'Bottom Hole Heel Temperature', 'Emulsion Pressure',
            'Producer Bottom Hole Pressure', 'ESP Speed', 'Emulsion Flow Rate'
        ]]
        for i in range(1, len(well.columns)):
            well[well.columns[i]] = pd.to_numeric(well[well.columns[i]],
                                                  errors='coerce')

        well = well.iloc[:1399]
        well = well.fillna(well.rolling(30, min_periods=1).median())
        well = well.fillna(well.median())

        well['Date'] = pd.to_datetime(well['Date'])
        well = well.set_index('Date')
        Raw_Data_preprocessed.append(well)

    os.chdir(parameters["path_val_stats"])
    static_files = []
    for static_file in glob.glob("*.csv"):
        static_files.append(static_file)
    static_filenames = []
    statics_data = []
    for static_file in static_files:
        static_filename, others = static_file.split('_')
        static_filenames.append(static_filename)
    for static_file, static_filename in zip(static_files, static_filenames):
        io = DataCatalog({
            static_filename:
            CSVLocalDataSet(filepath=parameters["path_val_stats"] + "/" +
                            static_file),
        })
        static_data = io.load(static_filename)
        statics_data.append(static_data)
    statics_data_new = []
    well_name_list = []
    for pad_static in statics_data:
        well_name = pad_static['WELLPAIR_NAME'].values
        well_name_list.append(well_name)
        pad_static = pad_static.set_index('WELLPAIR_NAME')
        pad_static = pad_static.drop(columns=['PLAN_NAME', 'HIGH_PRESSURE'])
        statics_data_new.append(pad_static)
    properties = []
    probabilities = []
    asset_names = []
    for pad_static, names in zip(statics_data_new, well_name_list):
        for well in names:
            prob = pad_static.loc[well, 'Forecast_Prob']
            probabilities.append(prob)
            pad_code = pad_static.loc[well, 'PAD_CODE']
            asset_name, pad = pad_code.split('_')
            asset_names.append(asset_name)
            property_ = pad_static.loc[
                well, 'SAGD_PRESSURE':'BOTTOM_WATER_THICKNESS'].values
            properties.append(property_)
    properties = np.array(properties)

    all_wells_input = []
    all_wells_labels = []
    for well_data, file in zip(Raw_Data_preprocessed, files):
        DWT_Aprox_coeff_input = []
        input_data, labels = split(well_data)

        input_columns = list(input_data.columns)
        for data_idx in input_columns:
            signal = well_data[data_idx].values
            thresh = parameters["thresh"] * np.nanmax(signal)
            coeff = pywt.wavedec(signal,
                                 wavelet=parameters["wavelet"],
                                 mode=parameters["mode1"],
                                 level=parameters["level"])
            coeff[1:] = (pywt.threshold(i,
                                        value=thresh,
                                        mode=str(parameters["mode2"]))
                         for i in coeff[1:])
            DWT_Aprox_coeff_input.append(coeff[0])
        DWT_Aprox_coeff_input = pd.DataFrame(
            np.transpose(DWT_Aprox_coeff_input), columns=input_columns)
        data_set_input = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/validation_input_DWT_coeffs_" + file)
        data_set_input.save(DWT_Aprox_coeff_input)
        all_wells_input.append(DWT_Aprox_coeff_input.values)
        data_set_labels = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/validation_labels_" + file)
        data_set_labels.save(labels)
        all_wells_labels.append(labels.values)

    #     Standardize dynamic data coeffs
    data_set_scaler_coeffs = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_coeffs.pickle")
    scaler_coeffs = data_set_scaler_coeffs.load()
    all_wells_standardized_input = []
    all_wells_standardized_input_flattened = []
    for well_coeffs in all_wells_input:
        std_coeffs = scaler_coeffs.transform(well_coeffs)
        all_wells_standardized_input.append(std_coeffs)
        transposed_std_coeffs = np.transpose(std_coeffs)
        flattened_std_coeffs = transposed_std_coeffs.flatten()
        all_wells_standardized_input_flattened.append(flattened_std_coeffs)
    all_wells_standardized_input = np.array(all_wells_standardized_input)
    all_wells_standardized_input_flattened = np.array(
        all_wells_standardized_input_flattened)
    input_columns = list(DWT_Aprox_coeff_input.columns)
    for std_coeffs, file in zip(all_wells_standardized_input, files):
        std_coeffs = pd.DataFrame(std_coeffs, columns=input_columns)
        data_set = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/validation_std_DWT_input_coeffs_" + file)
        data_set.save(std_coeffs)
#     Standardize static data

    data_set_scaler_static = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_static.pickle")
    scaler_static = data_set_scaler_static.load()
    all_wells_standardized_properties = scaler_static.fit_transform(properties)
    all_wells_coeffs_reservoir_data = []
    for flattened_std_coeffs, standardized_properties in zip(
            all_wells_standardized_input_flattened,
            all_wells_standardized_properties):
        flattened_std_coeffs = list(flattened_std_coeffs)
        standardized_properties = list(standardized_properties)
        for reservoir_property in standardized_properties:
            flattened_std_coeffs.append(
                reservoir_property
            )  # append reservoir data to dynamic data coeffs
        all_wells_coeffs_reservoir_data.append(flattened_std_coeffs)
    all_wells_coeffs_reservoir_data = np.array(all_wells_coeffs_reservoir_data)

    well_count = np.arange(len(all_wells_coeffs_reservoir_data))
    daily_timesteps = np.arange(len(all_wells_labels[0]))
    input_data = []
    for coeff_inputs in all_wells_coeffs_reservoir_data:
        for time_lapse in daily_timesteps:
            well_inputs = [time_lapse] + list(
                coeff_inputs)  # append time lapse to input data
            input_data.append(well_inputs)
    input_data = np.array(input_data)
    data_set_regressor_1 = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/regressor_1.pickle")
    regressor_1 = data_set_regressor_1.load()
    number_of_wells = len(well_count)
    wells_steam_rate_predicted = regressor_1.predict(input_data)
    wells_steam_rate_predicted = wells_steam_rate_predicted.reshape(
        (number_of_wells, 1399)).T

    # prediction inputs to model 2
    input_data_model_2 = []
    for coeff_inputs, well in zip(all_wells_coeffs_reservoir_data, well_count):
        for time_lapse in daily_timesteps:
            well_inputs = [time_lapse] + list(
                coeff_inputs)  # append time lapse to input data
            well_inputs_model_2 = [
                wells_steam_rate_predicted[time_lapse, well]
            ] + well_inputs
            input_data_model_2.append(well_inputs_model_2)
    input_data_model_2 = np.array(input_data_model_2)

    data_set_regressor_2 = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/regressor_2.pickle")
    regressor_2 = data_set_regressor_2.load()
    wells_emulsion_rate_predicted = regressor_2.predict(input_data_model_2)
    wells_emulsion_rate_predicted = wells_emulsion_rate_predicted.reshape(
        (number_of_wells, 1399)).T

    # actual targets
    all_wells_steam_data = []
    all_wells_emulsion_data = []
    for ID in well_count:
        well_steam_data = all_wells_labels[ID][:, 0]
        well_emulsion_data = all_wells_labels[ID][:, 1]
        all_wells_steam_data = all_wells_steam_data + list(well_steam_data)
        all_wells_emulsion_data = all_wells_emulsion_data + list(
            well_emulsion_data)
    all_wells_steam_data = np.array(all_wells_steam_data)
    all_wells_emulsion_data = np.array(all_wells_emulsion_data)
    wells_steam_rate_actual = all_wells_steam_data.reshape(
        (number_of_wells, 1399)).T
    wells_emulsion_rate_actual = all_wells_emulsion_data.reshape(
        (number_of_wells, 1399)).T

    print("Prediction Performance:\n")
    print("Steam Flow Rate:")
    for well, file in zip(well_count, files):
        steam_rate_predicted = wells_steam_rate_predicted[:, well]
        steam_rate_actual = wells_steam_rate_actual[:, well]
        steam_rate_actual_predicted = pd.DataFrame(
            np.vstack((steam_rate_actual, steam_rate_predicted)).T,
            columns=["steam rate actual", "steam rate predicted"])
        data_set_steam_rate = CSVLocalDataSet(
            filepath=parameters["path_model_output"] + "/steam_rate_" + file)
        data_set_steam_rate.save(steam_rate_actual_predicted)
        print(file + " R_squared: {0:.4f}".format(
            r2_score(steam_rate_actual, steam_rate_predicted)))

    print("\n")
    print("Emulsion Flow Rate:")
    for well, file in zip(well_count, files):
        emulsion_rate_predicted = wells_emulsion_rate_predicted[:, well]
        emulsion_rate_actual = wells_emulsion_rate_actual[:, well]
        emulsion_rate_actual_predicted = pd.DataFrame(
            np.vstack((emulsion_rate_actual, emulsion_rate_predicted)).T,
            columns=["emulsion rate actual", "emulsion rate predicted"])
        data_set_emulsion_rate = CSVLocalDataSet(
            filepath=parameters["path_model_output"] + "/emulsion_rate_" +
            file)
        data_set_emulsion_rate.save(emulsion_rate_actual_predicted)
        print(file + " R_squared: {0:.4f}".format(
            r2_score(emulsion_rate_actual, emulsion_rate_predicted)))

    dummy_validate = files
    return dummy_validate
Example 13
    def test_str_representation(self):
        """Test string representation of the data set instance."""
        data_set = PickleLocalDataSet(filepath="test.pkl", backend="pickle")
        pattern = "PickleLocalDataSet(backend=pickle, filepath=test.pkl)"
        assert pattern in str(data_set)
Example 14
def predict_oil_then_steam_RF(dummy14, well_count, number_of_wells, time_index,
                              timesteps_validation,
                              all_wells_emulsion_input_val,
                              all_wells_steam_input_val,
                              wells_emulsion_rate_actual, properties_val,
                              stats_val_ROIP, parameters):

    data_set_regressor_1 = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/regressor_1.pickle")
    regressor_1 = data_set_regressor_1.load()
    data_set_regressor_2 = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/regressor_2.pickle")
    regressor_2 = data_set_regressor_2.load()

    well_count = np.arange(len(all_wells_emulsion_input_val))
    wells_RF_array = []
    wells_steam_rate_predicted = []
    wells_emulsion_rate_predicted = []
    for well_predictors_steam, well_predictors_emulsion, property_, well in zip(
            all_wells_steam_input_val, all_wells_emulsion_input_val,
            properties_val, well_count):
        ROIP = stats_val_ROIP[well]
        cum_oil = wells_emulsion_rate_actual[0, well]
        RF = cum_oil / ROIP
        well_RF_array = []
        well_steam_rate_predicted = []
        well_emulsion_rate_predicted = []
        for time_lapse in time_index:
            well_inputs_1 = [RF] + list(
                well_predictors_emulsion[time_lapse]) + list(property_)
            well_inputs_1 = np.array(well_inputs_1).reshape(1, -1)
            well_RF_array.append(RF)
            emulsion_rate_predicted = regressor_1.predict(
                well_inputs_1
            )[0]  # [0] turns the one-element prediction array into a scalar, e.g. [300.] -> 300.0
            well_emulsion_rate_predicted.append(emulsion_rate_predicted)

            #             well_inputs_2 = [emulsion_rate_predicted, RF] + list(well_predictors_steam[time_lapse]) + list(property_)
            well_inputs_2 = [emulsion_rate_predicted, RF] + list(
                well_predictors_steam[time_lapse])
            well_inputs_2 = np.array(well_inputs_2).reshape(1, -1)
            steam_rate_predicted = regressor_2.predict(well_inputs_2)[0]
            well_steam_rate_predicted.append(steam_rate_predicted)

            cum_oil = cum_oil + emulsion_rate_predicted
            RF = cum_oil / ROIP

        wells_RF_array.append(np.array(well_RF_array))
        wells_emulsion_rate_predicted.append(
            np.array(well_emulsion_rate_predicted))
        wells_steam_rate_predicted.append(np.array(well_steam_rate_predicted))

    wells_RF_array = np.array(wells_RF_array)
    wells_emulsion_rate_predicted = np.array(wells_emulsion_rate_predicted)
    wells_emulsion_rate_predicted = wells_emulsion_rate_predicted.reshape(
        (number_of_wells, timesteps_validation)).T

    wells_steam_rate_predicted = np.array(wells_steam_rate_predicted)
    wells_steam_rate_predicted = wells_steam_rate_predicted.reshape(
        (number_of_wells, timesteps_validation)).T

    scheme = 4
    dummy15 = scheme
    return [
        dummy15, wells_steam_rate_predicted, wells_emulsion_rate_predicted,
        wells_RF_array, scheme
    ]
Example 15
def versioned_pickle_data_set(filepath_pkl, load_version, save_version):
    return PickleLocalDataSet(filepath=filepath_pkl,
                              version=Version(load_version, save_version))
Example 16
def pickle_data_set_with_args(filepath_pkl):
    return PickleLocalDataSet(
        filepath=filepath_pkl,
        load_args={"fix_imports": False},
        save_args={"fix_imports": False},
    )
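
With the default "pickle" backend, the load_args and save_args above are presumably forwarded to the standard-library pickle calls, so the fixture is roughly equivalent to the hand-written round trip below (a sketch under that assumption; fix_imports is a regular keyword of pickle.dump and pickle.load):

import pickle


def save_by_hand(obj, filepath_pkl):
    # Roughly what PickleLocalDataSet.save does with save_args={"fix_imports": False}
    with open(filepath_pkl, "wb") as f:
        pickle.dump(obj, f, fix_imports=False)


def load_by_hand(filepath_pkl):
    # Roughly what PickleLocalDataSet.load does with load_args={"fix_imports": False}
    with open(filepath_pkl, "rb") as f:
        return pickle.load(f, fix_imports=False)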
Example 17
def pickle_data_set(filepath_pkl, request):
    return PickleLocalDataSet(filepath=filepath_pkl, backend=request.param)
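
The fixture above reads request.param, so the original test module presumably parametrizes it over the supported backends; the decorator is not shown in this excerpt. A plausible, hypothetical form (the parameter values are assumed from the backends named elsewhere on this page):

import pytest

from kedro.io import PickleLocalDataSet


@pytest.fixture(params=["pickle", "joblib"])
def pickle_data_set(filepath_pkl, request):
    # One PickleLocalDataSet per backend listed in params (assumed values).
    return PickleLocalDataSet(filepath=filepath_pkl, backend=request.param)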
Example 18
def predict_oil_then_steam(dummy14, well_count, number_of_wells, time_index,
                           timesteps_validation, all_wells_emulsion_input_val,
                           all_wells_steam_input_val, properties_val,
                           parameters, algorithm):

    input_data = []
    for well_predictors_emulsion, property_ in zip(
            all_wells_emulsion_input_val, properties_val):
        for time_lapse in time_index:
            well_inputs = list(
                well_predictors_emulsion[time_lapse]) + list(property_)
            input_data.append(well_inputs)
    input_data = np.array(input_data)
    if algorithm == 1:
        data_set_regressor_1 = PickleLocalDataSet(
            filepath=parameters["path_models"] + "/regressor_1.pickle")
        regressor_1 = data_set_regressor_1.load()
        wells_emulsion_rate_predicted = regressor_1.predict(input_data)
    else:
        #         # standardization
        #         dataset_scaler_input_1 = PickleLocalDataSet(filepath=parameters["path_models"]+"/scaler_input_1.pickle")
        #         scaler_input_1 = dataset_scaler_input_1.load()
        #         input_data_model_1 = scaler_input_1.transform(input_data_model_1)

        from tensorflow import keras
        model_1 = keras.models.load_model(parameters["path_models"] +
                                          "/network_model_1.h5")
        wells_emulsion_rate_predicted = model_1.predict(input_data)

    wells_emulsion_rate_predicted = wells_emulsion_rate_predicted.reshape(
        (number_of_wells, timesteps_validation)).T

    # prediction inputs to model 2
    input_data_model_2 = []
    for well_predictors_steam, property_, well in zip(
            all_wells_steam_input_val, properties_val, well_count):
        for time_lapse in time_index:
            well_inputs_model_2 = [
                wells_emulsion_rate_predicted[time_lapse, well]
            ] + list(well_predictors_steam[time_lapse]) + list(property_)
            #             well_inputs_model_2 = [wells_emulsion_rate_predicted[time_lapse, well]] + list(well_predictors_steam[time_lapse])
            input_data_model_2.append(well_inputs_model_2)
    input_data_model_2 = np.array(input_data_model_2)
    if algorithm == 1:
        data_set_regressor_2 = PickleLocalDataSet(
            filepath=parameters["path_models"] + "/regressor_2.pickle")
        regressor_2 = data_set_regressor_2.load()
        wells_steam_rate_predicted = regressor_2.predict(input_data_model_2)
    else:
        #         # standardization
        #         dataset_scaler_input_2 = PickleLocalDataSet(filepath=parameters["path_models"]+"/scaler_input_2.pickle")
        #         scaler_input_1 = dataset_scaler_input_2.load()
        #         input_data_model_1 = scaler_input_1.transform(input_data_model_2)

        model_2 = keras.models.load_model(parameters["path_models"] +
                                          "/network_model_2.h5")
        wells_steam_rate_predicted = model_2.predict(input_data_model_2)

    scheme = 2
    wells_steam_rate_predicted = wells_steam_rate_predicted.reshape(
        (number_of_wells, timesteps_validation)).T

    dummy15 = scheme
    return [
        dummy15, wells_steam_rate_predicted, wells_emulsion_rate_predicted,
        scheme
    ]
Example 19
    def __init__(
            self,
            train_df=None,  # type: Optional[pd.DataFrame]
            test_df=None,  # type: Optional[pd.DataFrame]
            cols_features=None,  # type: Optional[List[str]]
            col_treatment="Treatment",  # type: str
            col_outcome="Outcome",  # type: str
            col_propensity="Propensity",  # type: str
            col_cate="CATE",  # type: str
            col_recommendation="Recommendation",  # type: str
            min_propensity=0.01,  # type: float
            max_propensity=0.99,  # type: float
            verbose=2,  # type: int
            uplift_model_params=dict(
                search_cv="sklearn.model_selection.GridSearchCV",
                estimator="xgboost.XGBClassifier",
                scoring=None,
                cv=3,
                return_train_score=False,
                n_jobs=-1,
                param_grid=dict(
                    random_state=[0],
                    max_depth=[3],
                    learning_rate=[0.1],
                    n_estimators=[100],
                    verbose=[0],
                    objective=["binary:logistic"],
                    booster=["gbtree"],
                    n_jobs=[-1],
                    nthread=[None],
                    gamma=[0],
                    min_child_weight=[1],
                    max_delta_step=[0],
                    subsample=[1],
                    colsample_bytree=[1],
                    colsample_bylevel=[1],
                    reg_alpha=[0],
                    reg_lambda=[1],
                    scale_pos_weight=[1],
                    base_score=[0.5],
                    missing=[None],
                ),
            ),  # type: Union[Dict[str, List[Any]], Type[sklearn.base.BaseEstimator]]
            enable_ipw=True,  # type: bool
            propensity_model_params=dict(
                search_cv="sklearn.model_selection.GridSearchCV",
                estimator="sklearn.linear_model.LogisticRegression",
                scoring=None,
                cv=3,
                return_train_score=False,
                n_jobs=-1,
                param_grid=dict(
                    random_state=[0],
                    C=[0.1, 1, 10],
                    class_weight=[None],
                    dual=[False],
                    fit_intercept=[True],
                    intercept_scaling=[1],
                    max_iter=[100],
                    multi_class=["ovr"],
                    n_jobs=[1],
                    penalty=["l1", "l2"],
                    solver=["liblinear"],
                    tol=[0.0001],
                    warm_start=[False],
                ),
            ),  # type: Dict[str, List[Any]]
            cv=3,  # type: int
            index_name="index",  # type: str
            partition_name="partition",  # type: str
            runner="SequentialRunner",  # type: str
            conditionally_skip=False,  # type: bool
            dataset_catalog=dict(
                # args_raw = CSVLocalDataSet(filepath='../data/01_raw/args_raw.csv', version=None),
                # train_df = CSVLocalDataSet(filepath='../data/01_raw/train_df.csv', version=None),
                # test_df = CSVLocalDataSet(filepath='../data/01_raw/test_df.csv', version=None),
                propensity_model=PickleLocalDataSet(
                    filepath="../data/06_models/propensity_model.pickle",
                    version=None),
                uplift_models_dict=PickleLocalDataSet(
                    filepath="../data/06_models/uplift_models_dict.pickle",
                    version=None),
                df_03=CSVLocalDataSet(
                    filepath="../data/07_model_output/df.csv",
                    load_args=dict(index_col=["partition", "index"],
                                   float_precision="high"),
                    save_args=dict(index=True, float_format="%.16e"),
                    version=None,
                ),
                treated__sim_eval_df=CSVLocalDataSet(
                    filepath="../data/08_reporting/treated__sim_eval_df.csv",
                    version=None),
                untreated__sim_eval_df=CSVLocalDataSet(
                    filepath="../data/08_reporting/untreated__sim_eval_df.csv",
                    version=None),
                estimated_effect_df=CSVLocalDataSet(
                    filepath="../data/08_reporting/estimated_effect_df.csv",
                    version=None),
            ),  # type: Dict[str, AbstractDataSet]
            logging_config={
                "disable_existing_loggers": False,
                "formatters": {
                    "json_formatter": {
                        "class":
                        "pythonjsonlogger.jsonlogger.JsonFormatter",
                        "format":
                        "[%(asctime)s|%(name)s|%(funcName)s|%(levelname)s] %(message)s",
                    },
                    "simple": {
                        "format":
                        "[%(asctime)s|%(name)s|%(levelname)s] %(message)s"
                    },
                },
                "handlers": {
                    "console": {
                        "class": "logging.StreamHandler",
                        "formatter": "simple",
                        "level": "INFO",
                        "stream": "ext://sys.stdout",
                    },
                    "info_file_handler": {
                        "class": "logging.handlers.RotatingFileHandler",
                        "level": "INFO",
                        "formatter": "simple",
                        "filename": "./info.log",
                        "maxBytes": 10485760,  # 10MB
                        "backupCount": 20,
                        "encoding": "utf8",
                        "delay": True,
                    },
                    "error_file_handler": {
                        "class": "logging.handlers.RotatingFileHandler",
                        "level": "ERROR",
                        "formatter": "simple",
                        "filename": "./errors.log",
                        "maxBytes": 10485760,  # 10MB
                        "backupCount": 20,
                        "encoding": "utf8",
                        "delay": True,
                    },
                },
                "loggers": {
                    "anyconfig": {
                        "handlers":
                        ["console", "info_file_handler", "error_file_handler"],
                        "level":
                        "WARNING",
                        "propagate":
                        False,
                    },
                    "kedro.io": {
                        "handlers":
                        ["console", "info_file_handler", "error_file_handler"],
                        "level":
                        "WARNING",
                        "propagate":
                        False,
                    },
                    "kedro.pipeline": {
                        "handlers":
                        ["console", "info_file_handler", "error_file_handler"],
                        "level":
                        "INFO",
                        "propagate":
                        False,
                    },
                    "kedro.runner": {
                        "handlers":
                        ["console", "info_file_handler", "error_file_handler"],
                        "level":
                        "INFO",
                        "propagate":
                        False,
                    },
                    "causallift": {
                        "handlers":
                        ["console", "info_file_handler", "error_file_handler"],
                        "level":
                        "INFO",
                        "propagate":
                        False,
                    },
                },
                "root": {
                    "handlers":
                    ["console", "info_file_handler", "error_file_handler"],
                    "level":
                    "INFO",
                },
                "version": 1,
            },  # type: Optional[Dict[str, Any]]
    ):
        # type: (...) -> None

        self.runner = None  # type: Optional[str]
        self.kedro_context = None  # type: Optional[Type[FlexibleKedroContext]]
        self.args = None  # type: Optional[Type[EasyDict]]
        self.train_df = None  # type: Optional[Type[pd.DataFrame]]
        self.test_df = None  # type: Optional[Type[pd.DataFrame]]
        self.df = None  # type: Optional[Type[pd.DataFrame]]
        self.propensity_model = None  # type: Optional[Type[sklearn.base.BaseEstimator]]
        self.uplift_models_dict = None  # type: Optional[Type[EasyDict]]
        self.treatment_fractions = None  # type: Optional[Type[EasyDict]]
        self.treatment_fraction_train = None  # type: Optional[float]
        self.treatment_fraction_test = None  # type: Optional[float]

        self.treated__proba = None  # type: Optional[Type[np.array]]
        self.untreated__proba = None  # type: Optional[Type[np.array]]
        self.cate_estimated = None  # type: Optional[Type[pd.Series]]

        self.treated__sim_eval_df = None  # type: Optional[Type[pd.DataFrame]]
        self.untreated__sim_eval_df = None  # type: Optional[Type[pd.DataFrame]]
        self.estimated_effect_df = None  # type: Optional[Type[pd.DataFrame]]

        # Instance attributes were defined above.
        if logging_config:
            logging.config.dictConfig(logging_config)

        args_raw = dict(
            cols_features=cols_features,
            col_treatment=col_treatment,
            col_outcome=col_outcome,
            col_propensity=col_propensity,
            col_cate=col_cate,
            col_recommendation=col_recommendation,
            min_propensity=min_propensity,
            max_propensity=max_propensity,
            verbose=verbose,
            uplift_model_params=uplift_model_params,
            enable_ipw=enable_ipw,
            propensity_model_params=propensity_model_params,
            index_name=index_name,
            partition_name=partition_name,
            runner=runner,
            conditionally_skip=conditionally_skip,
        )

        args_raw = EasyDict(args_raw)
        args_raw.update(
            dataset_catalog.get("args_raw",
                                MemoryDataSet({}).load()))

        assert args_raw.runner in {"SequentialRunner", "ParallelRunner", None}
        if args_raw.runner is None and args_raw.conditionally_skip:
            log.warning(
                "[Warning] conditionally_skip option is ignored since runner is None"
            )

        self.kedro_context = FlexibleKedroContext(
            runner=args_raw.runner, only_missing=args_raw.conditionally_skip)

        self.runner = args_raw.runner

        if self.runner is None:
            self.df = bundle_train_and_test_data(args_raw, train_df, test_df)
            self.args = impute_cols_features(args_raw, self.df)
            self.args = schedule_propensity_scoring(self.args, self.df)
            self.treatment_fractions = treatment_fractions_(self.args, self.df)
            if self.args.need_propensity_scoring:
                self.propensity_model = fit_propensity(self.args, self.df)
                self.df = estimate_propensity(self.args, self.df,
                                              self.propensity_model)

        if self.runner:
            self.kedro_context.catalog.add_feed_dict(
                {
                    "train_df": MemoryDataSet(train_df),
                    "test_df": MemoryDataSet(test_df),
                    "args_raw": MemoryDataSet(args_raw),
                },
                replace=True,
            )
            self.kedro_context.catalog.add_feed_dict(dataset_catalog,
                                                     replace=True)

            self.kedro_context.run(tags=["011_bundle_train_and_test_data"])
            self.df = self.kedro_context.catalog.load("df_00")

            self.kedro_context.run(tags=[
                "121_prepare_args",
                "131_treatment_fractions_",
                "141_initialize_model",
            ])
            self.args = self.kedro_context.catalog.load("args")
            self.treatment_fractions = self.kedro_context.catalog.load(
                "treatment_fractions")

            if self.args.need_propensity_scoring:
                self.kedro_context.run(tags=["211_fit_propensity"])
                self.propensity_model = self.kedro_context.catalog.load(
                    "propensity_model")
                self.kedro_context.run(tags=["221_estimate_propensity"])
                self.df = self.kedro_context.catalog.load("df_01")
            else:
                self.kedro_context.catalog.add_feed_dict(
                    {"df_01": MemoryDataSet(self.df)}, replace=True)

        self.treatment_fraction_train = self.treatment_fractions.train
        self.treatment_fraction_test = self.treatment_fractions.test

        if self.args.verbose >= 3:
            log.info("### Treatment fraction in train dataset: {}".format(
                self.treatment_fractions.train))
            log.info("### Treatment fraction in test dataset: {}".format(
                self.treatment_fractions.test))

        self._separate_train_test()