Example #1
    def test_sequential_load_from_disk(self, dummy_dataframe, filepath,
                                       versioned_csv_data_set):
        """Tests if the correct load version is logged when two versions are saved in
        disk."""
        save_version_1 = "2019-01-01T23.00.00.000Z"
        save_version_2 = "2019-01-01T23.59.59.999Z"
        CSVLocalDataSet(
            filepath=filepath,
            save_args={
                "sep": ","
            },
            version=Version(None, save_version_1),
        ).save(dummy_dataframe)
        CSVLocalDataSet(
            filepath=filepath,
            save_args={
                "sep": ","
            },
            version=Version(None, save_version_2),
        ).save(dummy_dataframe)

        versioned_csv_data_set.load()
        last_load_version = versioned_csv_data_set.get_last_load_version()

        assert last_load_version == save_version_2
Example #2
    def test_save_options_csv(self, tmp_path, sample_spark_df):
        # To cross-check the Spark save operation, we save to a single Spark
        # partition in CSV format and read it back with Kedro's CSVLocalDataSet.
        temp_dir = Path(str(tmp_path / "test_data"))
        spark_data_set = SparkDataSet(
            filepath=str(temp_dir),
            file_format="csv",
            save_args={
                "sep": "|",
                "header": True
            },
        )
        spark_df = sample_spark_df.coalesce(1)
        spark_data_set.save(spark_df)

        single_csv_file = [
            f for f in temp_dir.iterdir() if f.is_file() and f.suffix == ".csv"
        ][0]

        csv_local_data_set = CSVLocalDataSet(filepath=str(single_csv_file),
                                             load_args={"sep": "|"})
        pandas_df = csv_local_data_set.load()

        assert pandas_df[pandas_df["name"] == "Alex"]["age"][0] == 31
Example #3
    def __init__(
        self,
        propensity_model_filename="../data/06_models/propensity_model.pickle",
        uplift_models_filename="../data/06_models/uplift_models_dict.pickle",
        df_filename="../data/07_model_output/df.csv",
        treated_sim_eval_filename="../data/08_reporting/treated__sim_eval_df.csv",
        untreated_sim_eval_filename="../data/08_reporting/untreated__sim_eval_df.csv",
        estimated_effect_filename="../data/08_reporting/estimated_effect_df.csv",
        args_raw=MemoryDataSet({}).load()):

        self.propensity_model = PickleLocalDataSet(
            filepath=propensity_model_filename, version=None)
        self.uplift_models_dict = PickleLocalDataSet(
            filepath=uplift_models_filename, version=None)
        self.df_03 = CSVLocalDataSet(
            filepath=df_filename,
            load_args=dict(index_col=["partition", "index"],
                           float_precision="high"),
            save_args=dict(index=True, float_format="%.16e"),
            version=None,
        )
        self.treated__sim_eval_df = CSVLocalDataSet(
            filepath=treated_sim_eval_filename, version=None)
        self.untreated__sim_eval_df = CSVLocalDataSet(
            filepath=untreated_sim_eval_filename, version=None)
        self.estimated_effect_df = CSVLocalDataSet(
            filepath=estimated_effect_filename, version=None)
        self.args_raw = args_raw
Example #4
def create_master_table(Raw_Data_dated: List[pd.DataFrame],
                        parameters: Dict) -> pd.DataFrame:
    master_table = pd.concat(Raw_Data_dated, axis=1, sort=False)
    data_set = CSVLocalDataSet(filepath=parameters["path_primary"] +
                               "/master_table.csv")
    data_set.save(master_table)
    return master_table
Example #5
def dynamic_time_warping(Raw_Data_preprocessed, parameters):
    reference_well = CSVLocalDataSet(filepath=parameters["path_raw"] +
                                     "/B03-1P.csv")
    well_ref = reference_well.load()
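    # Convert the oil rate from bbl/d to m3/d (1 m3 of oil is approx. 6.28981 bbl)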
    data = well_ref['Oil [bbl/d]'] / 6.28981
    well_ref.insert(4, 'Oil [m3/d]', data)
    well_ref_oil_data = well_ref['Oil [m3/d]'].values

    Raw_Data_preprocessed_ = []
    distance_array = []
    for well_data in Raw_Data_preprocessed:
        well_oil_data = well_data['Oil [m3/d]'].values

        distance, path = fastdtw(well_ref_oil_data,
                                 well_oil_data,
                                 dist=euclidean)
        distance_array.append(distance)
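        # fastdtw returns (distance, path); path is a list of (ref_idx, well_idx)
        # pairs, so grouping the warped well samples by ref_idx below re-samples
        # each well onto the reference well's timeline.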
        path = np.array(path)
        index_well = path[..., 1]
        index_ref_well = path[..., 0]
        well = well_data.iloc[index_well]
        well.insert(0, 'index_ref', index_ref_well)
        well = well.groupby('index_ref').mean()
        #         well = well.reset_index(drop=True)
        Raw_Data_preprocessed_.append(well)

    distance_array = np.array(distance_array)
    return [distance_array, Raw_Data_preprocessed_]
Example #6
def test_save_options_csv():
    # To cross-check the Spark save operation, we save to a single Spark
    # partition in CSV format and read it back with Kedro's CSVLocalDataSet.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = join(temp_dir, "test_data")
        spark_data_set = SparkDataSet(
            filepath=temp_path,
            file_format="csv",
            save_args={
                "sep": "|",
                "header": True
            },
        )
        spark_df = _get_sample_spark_data_frame().coalesce(1)
        spark_data_set.save(spark_df)

        single_csv_file = [
            join(temp_path, f) for f in listdir(temp_path) if f.endswith("csv")
        ][0]

        csv_local_data_set = CSVLocalDataSet(filepath=single_csv_file,
                                             load_args={"sep": "|"})
        pandas_df = csv_local_data_set.load()

        assert pandas_df[pandas_df["name"] == "Alex"]["age"][0] == 31
Example #7
    def test_load_options_csv(self, tmp_path, sample_pandas_df):
        filepath = str(tmp_path / "data")
        local_csv_data_set = CSVLocalDataSet(filepath=filepath)
        local_csv_data_set.save(sample_pandas_df)
        spark_data_set = SparkDataSet(filepath=filepath,
                                      file_format="csv",
                                      load_args={"header": True})
        spark_df = spark_data_set.load()
        assert spark_df.filter(col("Name") == "Alex").count() == 1
Example #8
def test_load_options_csv(tmpdir):
    temp_path = str(tmpdir.join("data"))
    pandas_df = _get_sample_pandas_data_frame()
    local_csv_data_set = CSVLocalDataSet(filepath=temp_path)
    local_csv_data_set.save(pandas_df)
    spark_data_set = SparkDataSet(filepath=temp_path,
                                  file_format="csv",
                                  load_args={"header": True})
    spark_df = spark_data_set.load()
    assert spark_df.filter(col("Name") == "Alex").count() == 1
Example #9
def load_data(dummy, files: List, parameters: Dict):
    all_wells_steam_input = []
    all_wells_emulsion_input = []
    all_wells_labels = []

    for file in files:
        data_set_steam_input = CSVLocalDataSet(
            filepath=parameters["path_primary"] + "/steam_inputs_" + file)
        steam_input_data = data_set_steam_input.load()
        all_wells_steam_input.append(steam_input_data.values)
        data_set_emulsion_input = CSVLocalDataSet(
            filepath=parameters["path_primary"] + "/emulsion_inputs_" + file)
        emulsion_input_data = data_set_emulsion_input.load()
        all_wells_emulsion_input.append(emulsion_input_data.values)
        data_set_labels = CSVLocalDataSet(filepath=parameters["path_primary"] +
                                          "/labels_" + file)
        labels = data_set_labels.load()
        all_wells_labels.append(labels.values)

    steam_input_names = steam_input_data.columns
    emulsion_input_names = emulsion_input_data.columns
    all_wells_steam_input = np.array(all_wells_steam_input)
    all_wells_emulsion_input = np.array(all_wells_emulsion_input)
    return [
        all_wells_steam_input, all_wells_emulsion_input, all_wells_labels,
        steam_input_names, emulsion_input_names
    ]
Example #10
def standardisation(dummy, properties: np.ndarray, files: List,
                    parameters: Dict):
    from sklearn.preprocessing import StandardScaler
    all_wells_input = []
    all_wells_labels = []

    for file in files:
        data_set_input = CSVLocalDataSet(filepath=parameters["path_primary"] +
                                         "/input_DWT_coeffs_" + file)
        DWT_Aprox_coeff_input = data_set_input.load()
        all_wells_input.append(DWT_Aprox_coeff_input.values)
        data_set_labels = CSVLocalDataSet(filepath=parameters["path_primary"] +
                                          "/labels_" + file)
        labels = data_set_labels.load()
        all_wells_labels.append(labels.values)
    all_wells_input = np.array(all_wells_input)

    #     Standardize dynamic data coeffs
    scaler_coeffs = StandardScaler()
    scaler_coeffs.fit(all_wells_input[0])  # fit based on first well record
    all_wells_standardized_input = []
    all_wells_standardized_input_flattened = []
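    # Apply the fitted scaler to every well, then flatten the transposed
    # coefficient matrix into one 1-D feature vector per well.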
    for well_coeffs in all_wells_input:
        std_coeffs = scaler_coeffs.transform(well_coeffs)
        all_wells_standardized_input.append(std_coeffs)
        transposed_std_coeffs = np.transpose(std_coeffs)
        flattened_std_coeffs = transposed_std_coeffs.flatten()
        all_wells_standardized_input_flattened.append(flattened_std_coeffs)

    all_wells_standardized_input = np.array(all_wells_standardized_input)
    all_wells_standardized_input_flattened = np.array(
        all_wells_standardized_input_flattened)
    data_set_scaler_coeffs = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_coeffs.pickle")
    data_set_scaler_coeffs.save(scaler_coeffs)

    input_columns = list(DWT_Aprox_coeff_input.columns)
    for std_coeffs, file in zip(all_wells_standardized_input, files):
        std_coeffs = pd.DataFrame(std_coeffs, columns=input_columns)
        data_set = CSVLocalDataSet(filepath=parameters["path_features"] +
                                   "/std_DWT_input_coeffs_" + file)
        data_set.save(std_coeffs)


#     Standardize static data
    scaler_static = StandardScaler()
    all_wells_standardized_properties = scaler_static.fit_transform(properties)
    data_set_scaler_static = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_static.pickle")
    data_set_scaler_static.save(scaler_static)
    return [
        all_wells_standardized_input_flattened,
        all_wells_standardized_properties, all_wells_labels
    ]
Example #11
def discrete_wavelet_transform(Raw_Data_preprocessed: pd.DataFrame,
                               parameters: Dict, files: List):
    for well_data, file in zip(Raw_Data_preprocessed, files):
        list_input_DWT_Aprox_coeff = []
        input_data, labels = split(well_data)

        input_columns = list(input_data.columns)
        for data_idx in input_columns:
            signal = well_data[data_idx].values
            thresh = parameters["thresh"] * np.nanmax(signal)
            coeff = pywt.wavedec(signal,
                                 wavelet=parameters["wavelet"],
                                 mode=parameters["mode1"],
                                 level=parameters["level"])
            coeff[1:] = (pywt.threshold(i,
                                        value=thresh,
                                        mode=str(parameters["mode2"]))
                         for i in coeff[1:])
            list_input_DWT_Aprox_coeff.append(coeff[0])
        list_input_DWT_Aprox_coeff = pd.DataFrame(
            np.transpose(list_input_DWT_Aprox_coeff), columns=input_columns)
        data_set_input = CSVLocalDataSet(filepath=parameters["path_primary"] +
                                         "/input_DWT_coeffs_" + file)
        data_set_input.save(list_input_DWT_Aprox_coeff)
        data_set_labels = CSVLocalDataSet(filepath=parameters["path_primary"] +
                                          "/labels_" + file)
        data_set_labels.save(labels)
    dummy = labels
    return dummy
Example #12
    def test_run_load_versions(self, tmp_path, dummy_context, dummy_dataframe, mocker):
        class DummyContext(KedroContext):
            project_name = "bob"
            project_version = __version__

            def _get_pipelines(self) -> Dict[str, Pipeline]:
                return {"__default__": Pipeline([node(identity, "cars", "boats")])}

        mocker.patch("logging.config.dictConfig")
        dummy_context = DummyContext(str(tmp_path))
        filepath = str(dummy_context.project_path / "cars.csv")

        old_save_version = generate_timestamp()
        old_df = pd.DataFrame({"col1": [0, 0], "col2": [0, 0], "col3": [0, 0]})
        old_csv_data_set = CSVLocalDataSet(
            filepath=filepath,
            save_args={"sep": ","},
            version=Version(None, old_save_version),
        )
        old_csv_data_set.save(old_df)

        new_save_version = generate_timestamp()
        new_csv_data_set = CSVLocalDataSet(
            filepath=filepath,
            save_args={"sep": ","},
            version=Version(None, new_save_version),
        )
        new_csv_data_set.save(dummy_dataframe)

        load_versions = {"cars": old_save_version}
        dummy_context.run(load_versions=load_versions)
        assert not dummy_context.catalog.load("boats").equals(dummy_dataframe)
        assert dummy_context.catalog.load("boats").equals(old_df)
Example #13
    def test_version_str_repr(self, load_version, save_version):
        """Test that version is in string representation of the class instance
        when applicable."""
        filepath = "test.csv"
        ds = CSVLocalDataSet(filepath=filepath)
        ds_versioned = CSVLocalDataSet(filepath=filepath,
                                       version=Version(load_version,
                                                       save_version))
        assert filepath in str(ds)
        assert "version" not in str(ds)

        assert filepath in str(ds_versioned)
        ver_str = "version=Version(load={}, save='{}')".format(
            load_version, save_version)
        assert ver_str in str(ds_versioned)
Example #14
    def test_datasets_on_add(self, data_catalog_from_config):
        """Check datasets are updated correctly after adding"""
        data_catalog_from_config.add("new_dataset", CSVLocalDataSet("some_path"))
        assert isinstance(
            data_catalog_from_config.datasets.new_dataset, CSVLocalDataSet
        )
        assert isinstance(data_catalog_from_config.datasets.boats, CSVLocalDataSet)
Example #15
def prepare_csv_data_with_tabs(context):
    context.read_csv_path = create_sample_csv()
    context.write_csv_path = create_temp_csv()
    context.csv_data_set = CSVLocalDataSet(
        filepath=context.read_csv_path,
        load_args={"sep": "\t"},
        save_args={"index": False, "sep": "\t"},
    )
Example #16
def save_well_data(Raw_Data_preprocessed, parameters, files):
    for well_data, file in zip(Raw_Data_preprocessed, files):
        steam_input_data, emulsion_input_data, labels = split(well_data)
        data_set_steam_input = CSVLocalDataSet(
            filepath=parameters["path_primary"] + "/steam_inputs_" + file)
        data_set_steam_input.save(steam_input_data)
        data_set_emulsion_input = CSVLocalDataSet(
            filepath=parameters["path_primary"] + "/emulsion_inputs_" + file)
        data_set_emulsion_input.save(emulsion_input_data)
        data_set_labels = CSVLocalDataSet(filepath=parameters["path_primary"] +
                                          "/labels_" + file)
        data_set_labels.save(labels)
    dummy = labels
    return dummy
Example #17
def multi_catalog(mocker):
    csv = CSVLocalDataSet(filepath="abc.csv")
    parq = ParquetLocalDataSet(filepath="xyz.parq")
    journal = mocker.Mock()
    return DataCatalog({"abc": csv, "xyz": parq}, journal=journal)
Example #18
    def test_fails_with_remote_path(self, path):
        with pytest.raises(ValueError) as assertion:
            CSVLocalDataSet(filepath=path, save_args={"sep": ","})

        assert "seems to be a remote file" in assertion.value.args[0]
Example #19
def versioned_csv_data_set(filepath, load_version, save_version):
    return CSVLocalDataSet(
        filepath=filepath,
        save_args={"sep": ","},
        version=Version(load_version, save_version),
    )
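A minimal usage sketch (assuming the same kedro.io API these examples use; the filepath and version string below are illustrative only): pinning the load component of Version makes load() read that exact saved version instead of the latest one, while the save component fixes the version string used by save().

from kedro.io import CSVLocalDataSet, Version

# Hypothetical version string produced by an earlier versioned save.
pinned = CSVLocalDataSet(
    filepath="data.csv",
    version=Version("2019-01-01T23.00.00.000Z", None),
)
df = pinned.load()  # reads the pinned version rather than the most recent save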
Example #20
def csv_data_set(filepath, request):
    return CSVLocalDataSet(filepath=filepath, save_args=request.param)
Example #21
def load_well_validation_data(dummy2, timesteps, parameters: Dict):
    import glob, os
    os.chdir(parameters["path_model_input"])
    files_val = []
    for file in glob.glob("*.csv"):
        files_val.append(file)
    filenames_val = []
    wells_data = []
    for file in files_val:
        filename, extension = file.split('.')
        filenames_val.append(filename)
    for file, filename in zip(files_val, filenames_val):
        io = DataCatalog({
            filename:
            CSVLocalDataSet(filepath=parameters["path_model_input"] + "/" +
                            file),
        })
        well_data = io.load(filename)
        wells_data.append(well_data)
    Raw_Data_preprocessed_val = []
    wells_life = []
    wells_data_ = []
    for well in wells_data:
        #         well = well[['Date', 'Injector Bottom Hole Pressure',
        #                    'Producer Bottom Hole Pressure', 'ESP Speed',
        #                    'Steam Flow Rate - Outer',
        #                    'Emulsion Flow Rate']]

        well = well[[
            'Date', 'Speed [Hz]', 'Current [A]', 'IBHP', 'PBHP',
            'Co-Injection [E3m3/d]', 'Oil [bbl/d]', 'Steam [m3/d]',
            'Emulsion [m3/d]'
        ]]

        for i in range(1, len(well.columns)):
            well[well.columns[i]] = pd.to_numeric(well[well.columns[i]],
                                                  errors='coerce')
        well['Prod_Date'] = pd.to_datetime(well['Date'])
        well = well.set_index('Prod_Date')
        #         well = well.dropna(axis=0)   # may change
        #         well = well.resample('7D').mean()   # weekly data
        #         well = well.resample('30D').mean()   # monthly data
        #         well = well.rolling(30, min_periods=1).mean()
        #         well = well.rolling(30, min_periods=1).mean()

        data = well['Oil [bbl/d]'] / 6.28981
        well.insert(4, 'Oil [m3/d]', data)
        time_data = np.arange(len(well))
        well.insert(0, 'Timestep', time_data)

        wells_life.append(len(well))
        wells_data_.append(well)
    min_well_length = np.min(np.array(wells_life))
    if min_well_length < timesteps:
        timesteps_validation = min_well_length
    else:
        timesteps_validation = timesteps

    for well, file, filename in zip(wells_data_, files_val, filenames_val):
        well = well.iloc[:timesteps_validation]  # daily data
        #         well = well.fillna(0)
        #         well = well.fillna(well.rolling(30, min_periods=1).median())
        #         well = well.fillna(well.median())
        Raw_Data_preprocessed_val.append(well)

    stats_validation = CSVLocalDataSet(filepath=parameters["path_val_stats"] +
                                       "/static_P50_data_validation.csv")
    stats_val = stats_validation.load()
    stats_val_ROIP = stats_val.loc[:, 'ROIP']
    stats_val = stats_val.loc[:,
                              'Effective_Length':'BottomWater_Oil_Saturation']

    # #     using only rich geostats and no bottom water properties
    #     stats_val = stats_val.loc[:, 'Effective_Length':'Rich_Oil_Saturation']

    # #     Using "Effective_Rich_Pay_Thickness" to account for standoff and rich thickness
    #     data = stats_val['Rich_Pay_Thickness'] - stats_val['Stand_Off']
    #     stats_val.insert(3, 'Effective_Rich_Pay_Thickness', data)
    #     stats_val = stats_val.drop(columns = ['Rich_Pay_Thickness', 'Stand_Off'])

    property_names_val = list(stats_val.columns)
    properties_val = list(stats_val.values)

    #     properties_val = stats.loc[:, ['Effective_Length', 'Spacing', 'Effective_Rich_Pay_Thickness', 'Non_Rich_Pay_Thickness',
    #                               'Rich_Vertical_Permeability','Non_Rich_Vertical_Permeability', 'Rich_Porosity',
    #                                       'Non_Rich_Porosity', 'Rich_Oil_Saturation', 'Non_Rich_Oil_Saturation']].values
    properties_val = np.array(properties_val)
    dummy11 = files_val
    return [
        dummy11, timesteps_validation, Raw_Data_preprocessed_val, files_val,
        filenames_val, properties_val, stats_val_ROIP, property_names_val
    ]
Example #22
    def test_sequential_save_and_load(self, dummy_dataframe, filepath):
        """Tests if the correct load version is logged when two datasets are saved
        sequentially."""

        dataset1 = CSVLocalDataSet(
            filepath=filepath,
            save_args={"sep": ","},
            version=Version(None, "2000-01-01"),
        )

        dataset2 = CSVLocalDataSet(
            filepath=filepath,
            save_args={"sep": ","},
            version=Version(None, "2001-01-01"),
        )

        dataset1.save(dummy_dataframe)
        last_save_version1 = dataset1.get_last_save_version()

        dataset2.save(dummy_dataframe)
        last_save_version2 = dataset2.get_last_save_version()

        dataset2.load()
        last_load_version = dataset2.get_last_load_version()
        assert last_save_version2 == last_load_version
        assert last_save_version1 != last_save_version2
Example #23
    def __init__(
            self,
            train_df=None,  # type: Optional[pd.DataFrame]
            test_df=None,  # type: Optional[pd.DataFrame]
            cols_features=None,  # type: Optional[List[str]]
            col_treatment="Treatment",  # type: str
            col_outcome="Outcome",  # type: str
            col_propensity="Propensity",  # type: str
            col_cate="CATE",  # type: str
            col_recommendation="Recommendation",  # type: str
            min_propensity=0.01,  # type: float
            max_propensity=0.99,  # type: float
            verbose=2,  # type: int
            uplift_model_params=dict(
                search_cv="sklearn.model_selection.GridSearchCV",
                estimator="xgboost.XGBClassifier",
                scoring=None,
                cv=3,
                return_train_score=False,
                n_jobs=-1,
                param_grid=dict(
                    random_state=[0],
                    max_depth=[3],
                    learning_rate=[0.1],
                    n_estimators=[100],
                    verbose=[0],
                    objective=["binary:logistic"],
                    booster=["gbtree"],
                    n_jobs=[-1],
                    nthread=[None],
                    gamma=[0],
                    min_child_weight=[1],
                    max_delta_step=[0],
                    subsample=[1],
                    colsample_bytree=[1],
                    colsample_bylevel=[1],
                    reg_alpha=[0],
                    reg_lambda=[1],
                    scale_pos_weight=[1],
                    base_score=[0.5],
                    missing=[None],
                ),
            ),  # type: Union[Dict[str, List[Any]], Type[sklearn.base.BaseEstimator]]
            enable_ipw=True,  # type: bool
            propensity_model_params=dict(
                search_cv="sklearn.model_selection.GridSearchCV",
                estimator="sklearn.linear_model.LogisticRegression",
                scoring=None,
                cv=3,
                return_train_score=False,
                n_jobs=-1,
                param_grid=dict(
                    random_state=[0],
                    C=[0.1, 1, 10],
                    class_weight=[None],
                    dual=[False],
                    fit_intercept=[True],
                    intercept_scaling=[1],
                    max_iter=[100],
                    multi_class=["ovr"],
                    n_jobs=[1],
                    penalty=["l1", "l2"],
                    solver=["liblinear"],
                    tol=[0.0001],
                    warm_start=[False],
                ),
            ),  # type: Dict[str, List[Any]]
            cv=3,  # type: int
            index_name="index",  # type: str
            partition_name="partition",  # type: str
            runner="SequentialRunner",  # type: str
            conditionally_skip=False,  # type: bool
            dataset_catalog=dict(
                # args_raw = CSVLocalDataSet(filepath='../data/01_raw/args_raw.csv', version=None),
                # train_df = CSVLocalDataSet(filepath='../data/01_raw/train_df.csv', version=None),
                # test_df = CSVLocalDataSet(filepath='../data/01_raw/test_df.csv', version=None),
                propensity_model=PickleLocalDataSet(
                    filepath="../data/06_models/propensity_model.pickle",
                    version=None),
                uplift_models_dict=PickleLocalDataSet(
                    filepath="../data/06_models/uplift_models_dict.pickle",
                    version=None),
                df_03=CSVLocalDataSet(
                    filepath="../data/07_model_output/df.csv",
                    load_args=dict(index_col=["partition", "index"],
                                   float_precision="high"),
                    save_args=dict(index=True, float_format="%.16e"),
                    version=None,
                ),
                treated__sim_eval_df=CSVLocalDataSet(
                    filepath="../data/08_reporting/treated__sim_eval_df.csv",
                    version=None),
                untreated__sim_eval_df=CSVLocalDataSet(
                    filepath="../data/08_reporting/untreated__sim_eval_df.csv",
                    version=None),
                estimated_effect_df=CSVLocalDataSet(
                    filepath="../data/08_reporting/estimated_effect_df.csv",
                    version=None),
            ),  # type: Dict[str, AbstractDataSet]
            logging_config={
                "disable_existing_loggers": False,
                "formatters": {
                    "json_formatter": {
                        "class":
                        "pythonjsonlogger.jsonlogger.JsonFormatter",
                        "format":
                        "[%(asctime)s|%(name)s|%(funcName)s|%(levelname)s] %(message)s",
                    },
                    "simple": {
                        "format":
                        "[%(asctime)s|%(name)s|%(levelname)s] %(message)s"
                    },
                },
                "handlers": {
                    "console": {
                        "class": "logging.StreamHandler",
                        "formatter": "simple",
                        "level": "INFO",
                        "stream": "ext://sys.stdout",
                    },
                    "info_file_handler": {
                        "class": "logging.handlers.RotatingFileHandler",
                        "level": "INFO",
                        "formatter": "simple",
                        "filename": "./info.log",
                        "maxBytes": 10485760,  # 10MB
                        "backupCount": 20,
                        "encoding": "utf8",
                        "delay": True,
                    },
                    "error_file_handler": {
                        "class": "logging.handlers.RotatingFileHandler",
                        "level": "ERROR",
                        "formatter": "simple",
                        "filename": "./errors.log",
                        "maxBytes": 10485760,  # 10MB
                        "backupCount": 20,
                        "encoding": "utf8",
                        "delay": True,
                    },
                },
                "loggers": {
                    "anyconfig": {
                        "handlers":
                        ["console", "info_file_handler", "error_file_handler"],
                        "level":
                        "WARNING",
                        "propagate":
                        False,
                    },
                    "kedro.io": {
                        "handlers":
                        ["console", "info_file_handler", "error_file_handler"],
                        "level":
                        "WARNING",
                        "propagate":
                        False,
                    },
                    "kedro.pipeline": {
                        "handlers":
                        ["console", "info_file_handler", "error_file_handler"],
                        "level":
                        "INFO",
                        "propagate":
                        False,
                    },
                    "kedro.runner": {
                        "handlers":
                        ["console", "info_file_handler", "error_file_handler"],
                        "level":
                        "INFO",
                        "propagate":
                        False,
                    },
                    "causallift": {
                        "handlers":
                        ["console", "info_file_handler", "error_file_handler"],
                        "level":
                        "INFO",
                        "propagate":
                        False,
                    },
                },
                "root": {
                    "handlers":
                    ["console", "info_file_handler", "error_file_handler"],
                    "level":
                    "INFO",
                },
                "version": 1,
            },  # type: Optional[Dict[str, Any]]
    ):
        # type: (...) -> None

        self.runner = None  # type: Optional[str]
        self.kedro_context = None  # type: Optional[Type[FlexibleKedroContext]]
        self.args = None  # type: Optional[Type[EasyDict]]
        self.train_df = None  # type: Optional[Type[pd.DataFrame]]
        self.test_df = None  # type: Optional[Type[pd.DataFrame]]
        self.df = None  # type: Optional[Type[pd.DataFrame]]
        self.propensity_model = None  # type: Optional[Type[sklearn.base.BaseEstimator]]
        self.uplift_models_dict = None  # type: Optional[Type[EasyDict]]
        self.treatment_fractions = None  # type: Optional[Type[EasyDict]]
        self.treatment_fraction_train = None  # type: Optional[float]
        self.treatment_fraction_test = None  # type: Optional[float]

        self.treated__proba = None  # type: Optional[Type[np.array]]
        self.untreated__proba = None  # type: Optional[Type[np.array]]
        self.cate_estimated = None  # type: Optional[Type[pd.Series]]

        self.treated__sim_eval_df = None  # type: Optional[Type[pd.DataFrame]]
        self.untreated__sim_eval_df = None  # type: Optional[Type[pd.DataFrame]]
        self.estimated_effect_df = None  # type: Optional[Type[pd.DataFrame]]

        # Instance attributes were defined above.
        if logging_config:
            logging.config.dictConfig(logging_config)

        args_raw = dict(
            cols_features=cols_features,
            col_treatment=col_treatment,
            col_outcome=col_outcome,
            col_propensity=col_propensity,
            col_cate=col_cate,
            col_recommendation=col_recommendation,
            min_propensity=min_propensity,
            max_propensity=max_propensity,
            verbose=verbose,
            uplift_model_params=uplift_model_params,
            enable_ipw=enable_ipw,
            propensity_model_params=propensity_model_params,
            index_name=index_name,
            partition_name=partition_name,
            runner=runner,
            conditionally_skip=conditionally_skip,
        )

        args_raw = EasyDict(args_raw)
        args_raw.update(
            dataset_catalog.get("args_raw",
                                MemoryDataSet({}).load()))

        assert args_raw.runner in {"SequentialRunner", "ParallelRunner", None}
        if args_raw.runner is None and args_raw.conditionally_skip:
            log.warning(
                "[Warning] conditionally_skip option is ignored since runner is None"
            )

        self.kedro_context = FlexibleKedroContext(
            runner=args_raw.runner, only_missing=args_raw.conditionally_skip)

        self.runner = args_raw.runner

        if self.runner is None:
            self.df = bundle_train_and_test_data(args_raw, train_df, test_df)
            self.args = impute_cols_features(args_raw, self.df)
            self.args = schedule_propensity_scoring(self.args, self.df)
            self.treatment_fractions = treatment_fractions_(self.args, self.df)
            if self.args.need_propensity_scoring:
                self.propensity_model = fit_propensity(self.args, self.df)
                self.df = estimate_propensity(self.args, self.df,
                                              self.propensity_model)

        if self.runner:
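            # Kedro-runner path: feed the inputs into the catalog, run the tagged
            # sub-pipelines, and read intermediate results back by dataset name.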
            self.kedro_context.catalog.add_feed_dict(
                {
                    "train_df": MemoryDataSet(train_df),
                    "test_df": MemoryDataSet(test_df),
                    "args_raw": MemoryDataSet(args_raw),
                },
                replace=True,
            )
            self.kedro_context.catalog.add_feed_dict(dataset_catalog,
                                                     replace=True)

            self.kedro_context.run(tags=["011_bundle_train_and_test_data"])
            self.df = self.kedro_context.catalog.load("df_00")

            self.kedro_context.run(tags=[
                "121_prepare_args",
                "131_treatment_fractions_",
                "141_initialize_model",
            ])
            self.args = self.kedro_context.catalog.load("args")
            self.treatment_fractions = self.kedro_context.catalog.load(
                "treatment_fractions")

            if self.args.need_propensity_scoring:
                self.kedro_context.run(tags=["211_fit_propensity"])
                self.propensity_model = self.kedro_context.catalog.load(
                    "propensity_model")
                self.kedro_context.run(tags=["221_estimate_propensity"])
                self.df = self.kedro_context.catalog.load("df_01")
            else:
                self.kedro_context.catalog.add_feed_dict(
                    {"df_01": MemoryDataSet(self.df)}, replace=True)

        self.treatment_fraction_train = self.treatment_fractions.train
        self.treatment_fraction_test = self.treatment_fractions.test

        if self.args.verbose >= 3:
            log.info("### Treatment fraction in train dataset: {}".format(
                self.treatment_fractions.train))
            log.info("### Treatment fraction in test dataset: {}".format(
                self.treatment_fractions.test))

        self._separate_train_test()
Example #24
def multi_catalog():
    csv = CSVLocalDataSet(filepath="abc.csv")
    parq = ParquetLocalDataSet(filepath="xyz.parq")
    return DataCatalog({"abc": csv, "xyz": parq})
Example #25
def data_set(filepath):
    return CSVLocalDataSet(filepath=filepath, save_args={"index": False})
Example #26
def save_predicted_data(dummy15, well_count, all_wells_dates_input,
                        wells_steam_rate_actual, wells_steam_rate_predicted,
                        wells_emulsion_rate_actual,
                        wells_emulsion_rate_predicted, steam_input_column,
                        all_wells_steam_input_val, emulsion_input_column,
                        all_wells_emulsion_input_val, labels_column,
                        parameters, files_val,
                        scheme):  # to input wells_RF_array for RF case

    print("Prediction Performance:\n")
    print("Steam Flow Rate:")
    for well, file in zip(well_count, files_val):
        dates = all_wells_dates_input[well].T
        steam_input = all_wells_steam_input_val[well].T
        steam_rate_actual = wells_steam_rate_actual[:, well]

        steam_rate_predicted = wells_steam_rate_predicted[:, well]
        emulsion_rate_predicted = wells_emulsion_rate_predicted[:, well]

        if scheme == 1:
            steam_rate_actual_predicted = pd.DataFrame(
                np.vstack((dates, steam_input, steam_rate_actual,
                           steam_rate_predicted)).T,
                columns=["Date"] + list(steam_input_column) + [
                    labels_column[0] + " actual",
                    labels_column[0] + " predicted"
                ])
            data_set_steam_rate = CSVLocalDataSet(
                filepath=parameters["path_model_output_No_DWT_scheme1"] +
                "/steam_rate_" + file)
        elif scheme == 2:
            steam_rate_actual_predicted = pd.DataFrame(
                np.vstack((dates, steam_input, emulsion_rate_predicted,
                           steam_rate_actual, steam_rate_predicted)).T,
                columns=["Date"] + list(steam_input_column) + [
                    labels_column[1] + " predicted", labels_column[0] +
                    " actual", labels_column[0] + " predicted"
                ])
            data_set_steam_rate = CSVLocalDataSet(
                filepath=parameters["path_model_output_No_DWT_scheme2"] +
                "/steam_rate_" + file)
        elif scheme == 3:
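            # NOTE: wells_RF_array is expected to be supplied for the RF schemes
            # (see the note in the signature) but is not defined in this snippet.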
            RF_input = wells_RF_array[well].T
            steam_rate_actual_predicted = pd.DataFrame(
                np.vstack((dates, RF_input, steam_input, steam_rate_actual,
                           steam_rate_predicted)).T,
                columns=["Date", "RF"] + list(steam_input_column) + [
                    labels_column[0] + " actual",
                    labels_column[0] + " predicted"
                ])
            data_set_steam_rate = CSVLocalDataSet(
                filepath=parameters["path_model_output_No_DWT_RF_scheme1"] +
                "/steam_rate_" + file)
        else:
            RF_input = wells_RF_array[well].T
            steam_rate_actual_predicted = pd.DataFrame(
                np.vstack(
                    (dates, RF_input, steam_input, emulsion_rate_predicted,
                     steam_rate_actual, steam_rate_predicted)).T,
                columns=["Date", "RF"] + list(steam_input_column) + [
                    labels_column[1] + " predicted", labels_column[0] +
                    " actual", labels_column[0] + " predicted"
                ])
            data_set_steam_rate = CSVLocalDataSet(
                filepath=parameters["path_model_output_No_DWT_RF_scheme2"] +
                "/steam_rate_" + file)
        data_set_steam_rate.save(steam_rate_actual_predicted)
        print(file + " R_squared: {0:.4f}".format(
            r2_score(steam_rate_actual, steam_rate_predicted)))

    print("\n")
    print("Oil Rate:")
    for well, file in zip(well_count, files_val):
        dates = all_wells_dates_input[well].T
        emulsion_input = all_wells_emulsion_input_val[well].T
        emulsion_rate_actual = wells_emulsion_rate_actual[:, well]

        steam_rate_predicted = wells_steam_rate_predicted[:, well]
        emulsion_rate_predicted = wells_emulsion_rate_predicted[:, well]

        if scheme == 1:
            emulsion_rate_actual_predicted = pd.DataFrame(
                np.vstack((dates, emulsion_input, steam_rate_predicted,
                           emulsion_rate_actual, emulsion_rate_predicted)).T,
                columns=["Date"] + list(emulsion_input_column) + [
                    labels_column[0] + " predicted", labels_column[1] +
                    " actual", labels_column[1] + " predicted"
                ])
            data_set_emulsion_rate = CSVLocalDataSet(
                filepath=parameters["path_model_output_No_DWT_scheme1"] +
                "/emulsion_rate_" + file)
        elif scheme == 2:
            emulsion_rate_actual_predicted = pd.DataFrame(
                np.vstack((dates, emulsion_input, emulsion_rate_actual,
                           emulsion_rate_predicted)).T,
                columns=["Date"] + list(emulsion_input_column) + [
                    labels_column[1] + " actual",
                    labels_column[1] + " predicted"
                ])
            data_set_emulsion_rate = CSVLocalDataSet(
                filepath=parameters["path_model_output_No_DWT_scheme2"] +
                "/emulsion_rate_" + file)
        elif scheme == 3:
            #             cum_input = wells_cum_oil_array[well].T
            RF_input = wells_RF_array[well].T
            emulsion_rate_actual_predicted = pd.DataFrame(
                np.vstack(
                    (dates, RF_input, emulsion_input, steam_rate_predicted,
                     emulsion_rate_actual, emulsion_rate_predicted)).T,
                columns=["Date", "RF"] + list(emulsion_input_column) + [
                    labels_column[0] + " predicted", labels_column[1] +
                    " actual", labels_column[1] + " predicted"
                ])
            data_set_emulsion_rate = CSVLocalDataSet(
                filepath=parameters["path_model_output_No_DWT_RF_scheme1"] +
                "/emulsion_rate_" + file)
        else:
            RF_input = wells_RF_array[well].T
            emulsion_rate_actual_predicted = pd.DataFrame(
                np.vstack((dates, RF_input, emulsion_input,
                           emulsion_rate_actual, emulsion_rate_predicted)).T,
                columns=["Date", "RF"] + list(emulsion_input_column) + [
                    labels_column[1] + " actual",
                    labels_column[1] + " predicted"
                ])
            data_set_emulsion_rate = CSVLocalDataSet(
                filepath=parameters["path_model_output_No_DWT_RF_scheme2"] +
                "/emulsion_rate_" + file)
        data_set_emulsion_rate.save(emulsion_rate_actual_predicted)
        print(file + " R_squared: {0:.4f}".format(
            r2_score(emulsion_rate_actual, emulsion_rate_predicted)))
    print("\n")
    dummy_validate = files_val
    return dummy_validate
Example #27
def preprocess_raw_data(parameters: Dict):
    import glob, os
    #     os.chdir(parameters["path_raw"])
    os.chdir(parameters["path_raw_matlab"])
    files = []
    for file in glob.glob("*.csv"):
        files.append(file)
    filenames = []
    wells_data = []
    for file in files:
        filename, extension = file.split('.')
        filenames.append(filename)
    for file, filename in zip(files, filenames):
        io = DataCatalog({
            #                 filename: CSVLocalDataSet(filepath=parameters["path_raw"]+"/"+file),
            filename:
            CSVLocalDataSet(filepath=parameters["path_raw_matlab"] + "/" +
                            file),
        })

        well_data = io.load(filename)
        wells_data.append(well_data)
    Raw_Data_preprocessed = []
    Raw_Data_dated = []
    wells_life = []
    wells_data_ = []
    for well in wells_data:
        #         well = well[['Date', 'Injector Bottom Hole Pressure',
        #                    'Producer Bottom Hole Pressure', 'ESP Speed',
        #                    'Steam Flow Rate - Outer',
        #                    'Emulsion Flow Rate']]

        well = well[[
            'Date', 'Speed [Hz]', 'Current [A]', 'IBHP', 'PBHP',
            'Co-Injection [E3m3/d]', 'Oil [bbl/d]', 'Steam [m3/d]',
            'Emulsion [m3/d]'
        ]]

        for i in range(1, len(well.columns)):
            well[well.columns[i]] = pd.to_numeric(well[well.columns[i]],
                                                  errors='coerce')
        well['Prod_Date'] = pd.to_datetime(well['Date'])
        well = well.set_index('Prod_Date')
        #         well = well.resample('7D').mean()   # weekly data
        #         well = well.resample('30D').mean()   # monthly data
        #         well = well.rolling(30, min_periods=1).mean()

        data = well['Oil [bbl/d]'] / 6.28981
        well.insert(4, 'Oil [m3/d]', data)
        time_data = np.arange(len(well))
        well.insert(0, 'Timestep', time_data)

        wells_life.append(len(well))
        wells_data_.append(well)
    min_well_length = np.min(np.array(wells_life))
    print((min_well_length, np.argmin(np.array(wells_life))))
    timesteps = min_well_length  # 1008
    #     timesteps = 371

    for well, file, filename in zip(wells_data_, files, filenames):
        well = well.iloc[:timesteps]  # daily, weekly, monthly data
        #         well = well.fillna(0)
        #         well = well.fillna(well.rolling(30, min_periods=1).median())
        #         well = well.fillna(well.median())

        well["Well"] = filename  # create a column for well name
        well = well.reset_index(drop=True)  # remove date index
        data_set = CSVLocalDataSet(filepath=parameters["path_intermediate"] +
                                   "/pre_processed_data_" + file)
        data_set.save(well)
        Raw_Data_dated.append(well)
        well = well.drop(columns=['Date', 'Well'])
        Raw_Data_preprocessed.append(well)


#     stats_training = CSVLocalDataSet(filepath=parameters["path_raw_static"]+"/static_P50_data_training.csv")
    stats_training = CSVLocalDataSet(
        filepath=parameters["path_raw_static_matlab"] +
        "/static_P50_data_training_152_wells.csv")

    stats = stats_training.load()
    stats_ROIP = stats.loc[:, 'ROIP']
    stats = stats.loc[:, 'Effective_Length':'BottomWater_Oil_Saturation']

    # #     using only rich geostats and no bottom water properties
    #     stats = stats.loc[:, 'Effective_Length':'Rich_Oil_Saturation']

    # #     Using "Effective_Rich_Pay_Thickness" to account for standoff and rich thickness
    #     data = stats['Rich_Pay_Thickness'] - stats['Stand_Off']
    #     stats.insert(3, 'Effective_Rich_Pay_Thickness', data)
    #     stats = stats.drop(columns = ['Rich_Pay_Thickness', 'Stand_Off'])

    property_names = list(stats.columns)
    properties = list(stats.values)

    properties = np.array(properties)
    return [
        timesteps, Raw_Data_preprocessed, Raw_Data_dated, files, filenames,
        properties, stats_ROIP, property_names
    ]
Example #28
def save_well_validation_data(dummy11, Raw_Data_preprocessed_val, parameters,
                              files_val):
    # def save_well_validation_data(dummy12, Raw_Data_preprocessed_val_, parameters, files_val):
    all_wells_dates_input = []
    all_wells_steam_input_val = []
    all_wells_emulsion_input_val = []
    all_wells_labels_val = []
    for well_data, file in zip(Raw_Data_preprocessed_val, files_val):
        #     for well_data, file in zip(Raw_Data_preprocessed_val_, files_val):
        steam_input_data, emulsion_input_data, labels = split(well_data)
        data_set_steam_input = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/vali_steam_inputs_" + file)
        data_set_steam_input.save(steam_input_data)
        all_wells_dates_input.append(steam_input_data['Date'].values)
        steam_input_data = steam_input_data.drop(columns='Date')
        all_wells_steam_input_val.append(steam_input_data.values)
        data_set_emulsion_input = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/vali_emulsion_inputs_" + file)
        data_set_emulsion_input.save(emulsion_input_data)
        all_wells_emulsion_input_val.append(emulsion_input_data.values)
        data_set_labels = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/validation_labels_" + file)
        data_set_labels.save(labels)
        all_wells_labels_val.append(labels.values)

    steam_input_column = steam_input_data.columns
    emulsion_input_column = emulsion_input_data.columns
    labels_column = list(labels.columns)
    all_wells_dates_input = np.array(all_wells_dates_input)
    all_wells_steam_input_val = np.array(all_wells_steam_input_val)
    all_wells_emulsion_input_val = np.array(all_wells_emulsion_input_val)

    dummy13 = files_val
    return [
        dummy13, steam_input_column, emulsion_input_column, labels_column,
        all_wells_dates_input, all_wells_steam_input_val,
        all_wells_emulsion_input_val, all_wells_labels_val
    ]
Example #29
def validate(parameters: Dict):
    # def validate(dummy2, parameters: Dict):
    import glob, os
    os.chdir(parameters["path_model_input"])
    files = []
    for file in glob.glob("*.csv"):
        files.append(file)
    filenames = []
    wells_data = []
    for file in files:
        filename, extension = file.split('.')
        filenames.append(filename)
    for file, filename in zip(files, filenames):
        io = DataCatalog({
            filename:
            CSVLocalDataSet(filepath=parameters["path_model_input"] + "/" +
                            file),
        })
        well_data = io.load(filename)
        wells_data.append(well_data)
    Raw_Data_preprocessed = []
    for well, file, filename in zip(wells_data, files, filenames):
        well = well[[
            'Date', 'Injector Bottom Hole Pressure', 'Steam Flow Rate - Outer',
            'Bottom Hole Heel Temperature', 'Emulsion Pressure',
            'Producer Bottom Hole Pressure', 'ESP Speed', 'Emulsion Flow Rate'
        ]]
        for i in range(1, len(well.columns)):
            well[well.columns[i]] = pd.to_numeric(well[well.columns[i]],
                                                  errors='coerce')

        well = well.iloc[:1399]
        well = well.fillna(well.rolling(30, min_periods=1).median())
        well = well.fillna(well.median())

        well['Date'] = pd.to_datetime(well['Date'])
        well = well.set_index('Date')
        Raw_Data_preprocessed.append(well)

    os.chdir(parameters["path_val_stats"])
    static_files = []
    for static_file in glob.glob("*.csv"):
        static_files.append(static_file)
    static_filenames = []
    statics_data = []
    for static_file in static_files:
        static_filename, others = static_file.split('_')
        static_filenames.append(static_filename)
    for static_file, static_filename in zip(static_files, static_filenames):
        io = DataCatalog({
            static_filename:
            CSVLocalDataSet(filepath=parameters["path_val_stats"] + "/" +
                            static_file),
        })
        static_data = io.load(static_filename)
        statics_data.append(static_data)
    statics_data_new = []
    well_name_list = []
    for pad_static in statics_data:
        well_name = pad_static['WELLPAIR_NAME'].values
        well_name_list.append(well_name)
        pad_static = pad_static.set_index('WELLPAIR_NAME')
        pad_static = pad_static.drop(columns=['PLAN_NAME', 'HIGH_PRESSURE'])
        statics_data_new.append(pad_static)
    properties = []
    probabilities = []
    asset_names = []
    for pad_static, names in zip(statics_data_new, well_name_list):
        for well in names:
            prob = pad_static.loc[well, 'Forecast_Prob']
            probabilities.append(prob)
            pad_code = pad_static.loc[well, 'PAD_CODE']
            asset_name, pad = pad_code.split('_')
            asset_names.append(asset_name)
            property_ = pad_static.loc[
                well, 'SAGD_PRESSURE':'BOTTOM_WATER_THICKNESS'].values
            properties.append(property_)
    properties = np.array(properties)

    all_wells_input = []
    all_wells_labels = []
    for well_data, file in zip(Raw_Data_preprocessed, files):
        DWT_Aprox_coeff_input = []
        input_data, labels = split(well_data)

        input_columns = list(input_data.columns)
        for data_idx in input_columns:
            signal = well_data[data_idx].values
            thresh = parameters["thresh"] * np.nanmax(signal)
            coeff = pywt.wavedec(signal,
                                 wavelet=parameters["wavelet"],
                                 mode=parameters["mode1"],
                                 level=parameters["level"])
            coeff[1:] = (pywt.threshold(i,
                                        value=thresh,
                                        mode=str(parameters["mode2"]))
                         for i in coeff[1:])
            DWT_Aprox_coeff_input.append(coeff[0])
        DWT_Aprox_coeff_input = pd.DataFrame(
            np.transpose(DWT_Aprox_coeff_input), columns=input_columns)
        data_set_input = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/validation_input_DWT_coeffs_" + file)
        data_set_input.save(DWT_Aprox_coeff_input)
        all_wells_input.append(DWT_Aprox_coeff_input.values)
        data_set_labels = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/validation_labels_" + file)
        data_set_labels.save(labels)
        all_wells_labels.append(labels.values)

    #     Standardize dynamic data coeffs
    data_set_scaler_coeffs = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_coeffs.pickle")
    scaler_coeffs = data_set_scaler_coeffs.load()
    all_wells_standardized_input = []
    all_wells_standardized_input_flattened = []
    for well_coeffs in all_wells_input:
        std_coeffs = scaler_coeffs.transform(well_coeffs)
        all_wells_standardized_input.append(std_coeffs)
        transposed_std_coeffs = np.transpose(std_coeffs)
        flattened_std_coeffs = transposed_std_coeffs.flatten()
        all_wells_standardized_input_flattened.append(flattened_std_coeffs)
    all_wells_standardized_input = np.array(all_wells_standardized_input)
    all_wells_standardized_input_flattened = np.array(
        all_wells_standardized_input_flattened)
    input_columns = list(DWT_Aprox_coeff_input.columns)
    for std_coeffs, file in zip(all_wells_standardized_input, files):
        std_coeffs = pd.DataFrame(std_coeffs, columns=input_columns)
        data_set = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/validation_std_DWT_input_coeffs_" + file)
        data_set.save(std_coeffs)
#     Standardize static data

    data_set_scaler_static = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_static.pickle")
    scaler_static = data_set_scaler_static.load()
    all_wells_standardized_properties = scaler_static.fit_transform(properties)
    all_wells_coeffs_reservoir_data = []
    for flattened_std_coeffs, standardized_properties in zip(
            all_wells_standardized_input_flattened,
            all_wells_standardized_properties):
        flattened_std_coeffs = list(flattened_std_coeffs)
        standardized_properties = list(standardized_properties)
        for reservoir_property in standardized_properties:
            flattened_std_coeffs.append(
                reservoir_property
            )  # append reservoir data to dynamic data coeffs
        all_wells_coeffs_reservoir_data.append(flattened_std_coeffs)
    all_wells_coeffs_reservoir_data = np.array(all_wells_coeffs_reservoir_data)

    well_count = np.arange(len(all_wells_coeffs_reservoir_data))
    daily_timesteps = np.arange(len(all_wells_labels[0]))
    input_data = []
    for coeff_inputs in all_wells_coeffs_reservoir_data:
        for time_lapse in daily_timesteps:
            well_inputs = [time_lapse] + list(
                coeff_inputs)  # append time lapse to input data
            input_data.append(well_inputs)
    input_data = np.array(input_data)
    data_set_regressor_1 = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/regressor_1.pickle")
    regressor_1 = data_set_regressor_1.load()
    number_of_wells = len(well_count)
    wells_steam_rate_predicted = regressor_1.predict(input_data)
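    # The regressor returns a flat vector ordered well-by-well; reshape to
    # (number_of_wells, 1399) and transpose so each column is one well's series.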
    wells_steam_rate_predicted = wells_steam_rate_predicted.reshape(
        (number_of_wells, 1399)).T

    # prediction inputs to model 2
    input_data_model_2 = []
    for coeff_inputs, well in zip(all_wells_coeffs_reservoir_data, well_count):
        for time_lapse in daily_timesteps:
            well_inputs = [time_lapse] + list(
                coeff_inputs)  # append time lapse to input data
            well_inputs_model_2 = [
                wells_steam_rate_predicted[time_lapse, well]
            ] + well_inputs
            input_data_model_2.append(well_inputs_model_2)
    input_data_model_2 = np.array(input_data_model_2)

    data_set_regressor_2 = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/regressor_2.pickle")
    regressor_2 = data_set_regressor_2.load()
    wells_emulsion_rate_predicted = regressor_2.predict(input_data_model_2)
    wells_emulsion_rate_predicted = wells_emulsion_rate_predicted.reshape(
        (number_of_wells, 1399)).T

    # actual targets
    all_wells_steam_data = []
    all_wells_emulsion_data = []
    for ID in well_count:
        well_steam_data = all_wells_labels[ID][:, 0]
        well_emulsion_data = all_wells_labels[ID][:, 1]
        all_wells_steam_data = all_wells_steam_data + list(well_steam_data)
        all_wells_emulsion_data = all_wells_emulsion_data + list(
            well_emulsion_data)
    all_wells_steam_data = np.array(all_wells_steam_data)
    all_wells_emulsion_data = np.array(all_wells_emulsion_data)
    wells_steam_rate_actual = all_wells_steam_data.reshape(
        (number_of_wells, 1399)).T
    wells_emulsion_rate_actual = all_wells_emulsion_data.reshape(
        (number_of_wells, 1399)).T

    print("Prediction Performance:\n")
    print("Steam Flow Rate:")
    for well, file in zip(well_count, files):
        steam_rate_predicted = wells_steam_rate_predicted[:, well]
        steam_rate_actual = wells_steam_rate_actual[:, well]
        steam_rate_actual_predicted = pd.DataFrame(
            np.vstack((steam_rate_actual, steam_rate_predicted)).T,
            columns=["steam rate actual", "steam rate predicted"])
        data_set_steam_rate = CSVLocalDataSet(
            filepath=parameters["path_model_output"] + "/steam_rate_" + file)
        data_set_steam_rate.save(steam_rate_actual_predicted)
        print(file + " R_squared: {0:.4f}".format(
            r2_score(steam_rate_actual, steam_rate_predicted)))

    print("\n")
    print("Emulsion Flow Rate:")
    for well, file in zip(well_count, files):
        emulsion_rate_predicted = wells_emulsion_rate_predicted[:, well]
        emulsion_rate_actual = wells_emulsion_rate_actual[:, well]
        emulsion_rate_actual_predicted = pd.DataFrame(
            np.vstack((emulsion_rate_actual, emulsion_rate_predicted)).T,
            columns=["emulsion rate actual", "emulsion rate predicted"])
        data_set_emulsion_rate = CSVLocalDataSet(
            filepath=parameters["path_model_output"] + "/emulsion_rate_" +
            file)
        data_set_emulsion_rate.save(emulsion_rate_actual_predicted)
        print(file + " R_squared: {0:.4f}".format(
            r2_score(emulsion_rate_actual, emulsion_rate_predicted)))

    dummy_validate = files
    return dummy_validate
Example #30
def preprocess_raw_data(parameters: Dict):
    import glob, os
    os.chdir(parameters["path_raw"])
    files = []
    for file in glob.glob("*.csv"):
        files.append(file)
    filenames = []
    wells_data = []
    for file in files:
        filename, extension = file.split('.')
        filenames.append(filename)
    for file, filename in zip(files, filenames):
        io = DataCatalog({
            filename:
            CSVLocalDataSet(filepath=parameters["path_raw"] + "/" + file),
        })
        well_data = io.load(filename)
        wells_data.append(well_data)
    Raw_Data_preprocessed = []
    Raw_Data_dated = []
    for well, file, filename in zip(wells_data, files, filenames):
        well = well[[
            'Date', 'Injector Bottom Hole Pressure', 'Steam Flow Rate - Outer',
            'Bottom Hole Heel Temperature', 'Emulsion Pressure',
            'Producer Bottom Hole Pressure', 'ESP Speed', 'Emulsion Flow Rate'
        ]]
        for i in range(1, len(well.columns)):
            well[well.columns[i]] = pd.to_numeric(well[well.columns[i]],
                                                  errors='coerce')

        well = well.iloc[:1399]
        well = well.fillna(well.rolling(30, min_periods=1).median())
        well = well.fillna(well.median())

        well_dated = well.copy()
        well_dated["Well"] = filename  # create a column for well name
        data_set = CSVLocalDataSet(filepath=parameters["path_intermediate"] +
                                   "/pre_processed_data_" + file)
        data_set.save(well_dated)
        Raw_Data_dated.append(well_dated)

        well['Date'] = pd.to_datetime(well['Date'])
        well = well.set_index('Date')
        Raw_Data_preprocessed.append(well)

    os.chdir(parameters["path_raw_static"])
    static_files = []
    for static_file in glob.glob("*.csv"):
        static_files.append(static_file)
    static_filenames = []
    statics_data = []
    for static_file in static_files:
        static_filename, others = static_file.split('_')
        static_filenames.append(static_filename)
    for static_file, static_filename in zip(static_files, static_filenames):
        io = DataCatalog({
            static_filename:
            CSVLocalDataSet(filepath=parameters["path_raw_static"] + "/" +
                            static_file),
        })
        static_data = io.load(static_filename)
        statics_data.append(static_data)
    statics_data_new = []
    well_name_list = []
    for pad_static in statics_data:
        well_name = pad_static['WELLPAIR_NAME'].values
        well_name_list.append(well_name)
        pad_static = pad_static.set_index('WELLPAIR_NAME')
        pad_static = pad_static.drop(columns=['PLAN_NAME', 'HIGH_PRESSURE'])
        statics_data_new.append(pad_static)
    properties = []
    probabilities = []
    asset_names = []
    for pad_static, names in zip(statics_data_new, well_name_list):
        for well in names:
            prob = pad_static.loc[well, 'Forecast_Prob']
            probabilities.append(prob)
            pad_code = pad_static.loc[well, 'PAD_CODE']
            asset_name, pad = pad_code.split('_')
            asset_names.append(asset_name)
            property_ = pad_static.loc[
                well, 'SAGD_PRESSURE':'BOTTOM_WATER_THICKNESS'].values
            properties.append(property_)
    properties = np.array(properties)
    return [
        Raw_Data_preprocessed, Raw_Data_dated, files, filenames, probabilities,
        asset_names, properties
    ]