def test_sequential_load_from_disk(self, dummy_dataframe, filepath,
                                   versioned_csv_data_set):
    """Tests if the correct load version is logged when two versions
    are saved to disk."""
    save_version_1 = "2019-01-01T23.00.00.000Z"
    save_version_2 = "2019-01-01T23.59.59.999Z"
    CSVLocalDataSet(
        filepath=filepath,
        save_args={"sep": ","},
        version=Version(None, save_version_1),
    ).save(dummy_dataframe)
    CSVLocalDataSet(
        filepath=filepath,
        save_args={"sep": ","},
        version=Version(None, save_version_2),
    ).save(dummy_dataframe)

    versioned_csv_data_set.load()
    last_load_version = versioned_csv_data_set.get_last_load_version()

    assert last_load_version == save_version_2
def test_save_options_csv(self, tmp_path, sample_spark_df):
    # To cross check the correct Spark save operation we save to
    # a single spark partition with csv format and retrieve it with Kedro
    # CSVLocalDataSet
    temp_dir = Path(str(tmp_path / "test_data"))
    spark_data_set = SparkDataSet(
        filepath=str(temp_dir),
        file_format="csv",
        save_args={"sep": "|", "header": True},
    )
    spark_df = sample_spark_df.coalesce(1)
    spark_data_set.save(spark_df)

    single_csv_file = [
        f for f in temp_dir.iterdir() if f.is_file() and f.suffix == ".csv"
    ][0]

    csv_local_data_set = CSVLocalDataSet(filepath=str(single_csv_file),
                                         load_args={"sep": "|"})
    pandas_df = csv_local_data_set.load()

    assert pandas_df[pandas_df["name"] == "Alex"]["age"][0] == 31
def __init__(
        self,
        propensity_model_filename="../data/06_models/propensity_model.pickle",
        uplift_models_filename="../data/06_models/uplift_models_dict.pickle",
        df_filename="../data/07_model_output/df.csv",
        treated_sim_eval_filename="../data/08_reporting/treated__sim_eval_df.csv",
        untreated_sim_eval_filename="../data/08_reporting/untreated__sim_eval_df.csv",
        estimated_effect_filename="../data/08_reporting/estimated_effect_df.csv",
        args_raw=MemoryDataSet({}).load()):
    self.propensity_model = PickleLocalDataSet(
        filepath=propensity_model_filename, version=None)
    self.uplift_models_dict = PickleLocalDataSet(
        filepath=uplift_models_filename, version=None)
    self.df_03 = CSVLocalDataSet(
        filepath=df_filename,
        load_args=dict(index_col=["partition", "index"],
                       float_precision="high"),
        save_args=dict(index=True, float_format="%.16e"),
        version=None,
    )
    self.treated__sim_eval_df = CSVLocalDataSet(
        filepath=treated_sim_eval_filename, version=None)
    self.untreated__sim_eval_df = CSVLocalDataSet(
        filepath=untreated_sim_eval_filename, version=None)
    self.estimated_effect_df = CSVLocalDataSet(
        filepath=estimated_effect_filename, version=None)
    self.args_raw = args_raw
def create_master_table(Raw_Data_dated: pd.DataFrame,
                        parameters: Dict) -> pd.DataFrame:
    master_table = pd.concat(Raw_Data_dated, axis=1, sort=False)
    data_set = CSVLocalDataSet(filepath=parameters["path_primary"] +
                               "/master_table.csv")
    data_set.save(master_table)
    return master_table
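# Hedged sketch (toy frames, not the project's data): create_master_table()
# relies on pd.concat(..., axis=1) placing the per-well DataFrames side by
# side, aligned on their shared integer index.
import pandas as pd

well_a = pd.DataFrame({"Timestep": [0, 1], "Oil [m3/d]": [10.0, 12.0]})
well_b = pd.DataFrame({"Timestep": [0, 1], "Oil [m3/d]": [7.0, 8.5]})

master = pd.concat([well_a, well_b], axis=1, sort=False)  # columns from both wells, same rows
print(master.shape)  # (2, 4)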
def dynamic_time_warping(Raw_Data_preprocessed, parameters):
    reference_well = CSVLocalDataSet(filepath=parameters["path_raw"] +
                                     "/B03-1P.csv")
    well_ref = reference_well.load()
    data = well_ref['Oil [bbl/d]'] / 6.28981
    well_ref.insert(4, 'Oil [m3/d]', data)
    well_ref_oil_data = well_ref['Oil [m3/d]'].values

    Raw_Data_preprocessed_ = []
    distance_array = []
    for well_data in Raw_Data_preprocessed:
        well_oil_data = well_data['Oil [m3/d]'].values
        distance, path = fastdtw(well_ref_oil_data, well_oil_data,
                                 dist=euclidean)
        distance_array.append(distance)
        path = np.array(path)
        index_well = path[..., 1]
        index_ref_well = path[..., 0]
        well = well_data.iloc[index_well]
        well.insert(0, 'index_ref', index_ref_well)
        well = well.groupby('index_ref').mean()
        # well = well.reset_index(drop=True)
        Raw_Data_preprocessed_.append(well)
    distance_array = np.array(distance_array)
    return [distance_array, Raw_Data_preprocessed_]
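# Hedged sketch (standalone, toy series): illustrates how a DTW path returned
# by fastdtw can re-index one series onto another, which is the alignment step
# performed per well in dynamic_time_warping() above. `ref` and `other` are
# made-up arrays; a plain absolute difference is used as the pointwise distance
# for these scalar series.
import numpy as np
import pandas as pd
from fastdtw import fastdtw

ref = np.array([0.0, 1.0, 2.0, 3.0, 2.0, 1.0])
other = np.array([0.0, 0.5, 1.5, 3.0, 3.0, 2.0, 1.0])

distance, path = fastdtw(ref, other, dist=lambda a, b: abs(a - b))
path = np.array(path)  # columns: (reference index, other-series index)

aligned = pd.DataFrame({"value": other[path[:, 1]]},
                       index=pd.Index(path[:, 0], name="index_ref"))
# Collapse repeated reference indices by averaging, as the node does with groupby.
aligned = aligned.groupby("index_ref").mean()
print(distance, len(aligned) == len(ref))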
def test_save_options_csv():
    # To cross check the correct Spark save operation we save to
    # a single spark partition with csv format and retrieve it with Kedro
    # CSVLocalDataSet
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = join(temp_dir, "test_data")
        spark_data_set = SparkDataSet(
            filepath=temp_path,
            file_format="csv",
            save_args={"sep": "|", "header": True},
        )
        spark_df = _get_sample_spark_data_frame().coalesce(1)
        spark_data_set.save(spark_df)

        single_csv_file = [
            join(temp_path, f) for f in listdir(temp_path)
            if f.endswith("csv")
        ][0]

        csv_local_data_set = CSVLocalDataSet(filepath=single_csv_file,
                                             load_args={"sep": "|"})
        pandas_df = csv_local_data_set.load()

        assert pandas_df[pandas_df["name"] == "Alex"]["age"][0] == 31
def test_load_options_csv(self, tmp_path, sample_pandas_df):
    filepath = str(tmp_path / "data")
    local_csv_data_set = CSVLocalDataSet(filepath=filepath)
    local_csv_data_set.save(sample_pandas_df)
    spark_data_set = SparkDataSet(filepath=filepath,
                                  file_format="csv",
                                  load_args={"header": True})
    spark_df = spark_data_set.load()
    assert spark_df.filter(col("Name") == "Alex").count() == 1
def test_load_options_csv(tmpdir):
    temp_path = str(tmpdir.join("data"))
    pandas_df = _get_sample_pandas_data_frame()
    local_csv_data_set = CSVLocalDataSet(filepath=temp_path)
    local_csv_data_set.save(pandas_df)
    spark_data_set = SparkDataSet(filepath=temp_path,
                                  file_format="csv",
                                  load_args={"header": True})
    spark_df = spark_data_set.load()
    assert spark_df.filter(col("Name") == "Alex").count() == 1
def load_data(dummy, files: List, parameters: Dict):
    all_wells_steam_input = []
    all_wells_emulsion_input = []
    all_wells_labels = []
    for file in files:
        data_set_steam_input = CSVLocalDataSet(
            filepath=parameters["path_primary"] + "/steam_inputs_" + file)
        steam_input_data = data_set_steam_input.load()
        all_wells_steam_input.append(steam_input_data.values)

        data_set_emulsion_input = CSVLocalDataSet(
            filepath=parameters["path_primary"] + "/emulsion_inputs_" + file)
        emulsion_input_data = data_set_emulsion_input.load()
        all_wells_emulsion_input.append(emulsion_input_data.values)

        data_set_labels = CSVLocalDataSet(filepath=parameters["path_primary"] +
                                          "/labels_" + file)
        labels = data_set_labels.load()
        all_wells_labels.append(labels.values)

    steam_input_names = steam_input_data.columns
    emulsion_input_names = emulsion_input_data.columns
    all_wells_steam_input = np.array(all_wells_steam_input)
    all_wells_emulsion_input = np.array(all_wells_emulsion_input)
    return [
        all_wells_steam_input, all_wells_emulsion_input, all_wells_labels,
        steam_input_names, emulsion_input_names
    ]
def standardisation(dummy, properties: np.ndarray, files: List,
                    parameters: Dict):
    from sklearn.preprocessing import StandardScaler

    all_wells_input = []
    all_wells_labels = []
    for file in files:
        data_set_input = CSVLocalDataSet(filepath=parameters["path_primary"] +
                                         "/input_DWT_coeffs_" + file)
        DWT_Aprox_coeff_input = data_set_input.load()
        all_wells_input.append(DWT_Aprox_coeff_input.values)

        data_set_labels = CSVLocalDataSet(filepath=parameters["path_primary"] +
                                          "/labels_" + file)
        labels = data_set_labels.load()
        all_wells_labels.append(labels.values)
    all_wells_input = np.array(all_wells_input)

    # Standardize dynamic data coeffs
    scaler_coeffs = StandardScaler()
    scaler_coeffs.fit(all_wells_input[0])  # fit based on first well record
    all_wells_standardized_input = []
    all_wells_standardized_input_flattened = []
    for well_coeffs in all_wells_input:
        std_coeffs = scaler_coeffs.transform(well_coeffs)
        all_wells_standardized_input.append(std_coeffs)
        transposed_std_coeffs = np.transpose(std_coeffs)
        flattened_std_coeffs = transposed_std_coeffs.flatten()
        all_wells_standardized_input_flattened.append(flattened_std_coeffs)
    all_wells_standardized_input = np.array(all_wells_standardized_input)
    all_wells_standardized_input_flattened = np.array(
        all_wells_standardized_input_flattened)

    data_set_scaler_coeffs = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_coeffs.pickle")
    data_set_scaler_coeffs.save(scaler_coeffs)

    input_columns = list(DWT_Aprox_coeff_input.columns)
    for std_coeffs, file in zip(all_wells_standardized_input, files):
        std_coeffs = pd.DataFrame(std_coeffs, columns=input_columns)
        data_set = CSVLocalDataSet(filepath=parameters["path_features"] +
                                   "/std_DWT_input_coeffs_" + file)
        data_set.save(std_coeffs)

    # Standardize static data
    scaler_static = StandardScaler()
    all_wells_standardized_properties = scaler_static.fit_transform(properties)
    data_set_scaler_static = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_static.pickle")
    data_set_scaler_static.save(scaler_static)

    return [
        all_wells_standardized_input_flattened,
        all_wells_standardized_properties, all_wells_labels
    ]
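# Hedged sketch (standalone, toy arrays): the fit-once / transform-many pattern
# used for the DWT coefficients above. The scaler is fitted on a single
# reference record and reused for every well so all wells share one scaling;
# plain pickle stands in here for PickleLocalDataSet so the validation pipeline
# can reuse the same scaler.
import pickle
import numpy as np
from sklearn.preprocessing import StandardScaler

reference_coeffs = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
other_coeffs = np.array([[1.5, 12.0], [2.5, 28.0]])

scaler = StandardScaler().fit(reference_coeffs)  # fit on the reference well only
scaled_other = scaler.transform(other_coeffs)    # reuse for every other well

with open("scaler_coeffs.pickle", "wb") as f:    # stand-in for the model catalog entry
    pickle.dump(scaler, f)
print(scaled_other.shape)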
def discrete_wavelet_transform(Raw_Data_preprocessed: pd.DataFrame,
                               parameters: Dict, files: List):
    for well_data, file in zip(Raw_Data_preprocessed, files):
        list_input_DWT_Aprox_coeff = []
        input_data, labels = split(well_data)
        input_columns = list(input_data.columns)
        for data_idx in input_columns:
            signal = well_data[data_idx].values
            thresh = parameters["thresh"] * np.nanmax(signal)
            coeff = pywt.wavedec(signal,
                                 wavelet=parameters["wavelet"],
                                 mode=parameters["mode1"],
                                 level=parameters["level"])
            coeff[1:] = (pywt.threshold(i, value=thresh,
                                        mode=str(parameters["mode2"]))
                         for i in coeff[1:])
            list_input_DWT_Aprox_coeff.append(coeff[0])
        list_input_DWT_Aprox_coeff = pd.DataFrame(
            np.transpose(list_input_DWT_Aprox_coeff), columns=input_columns)
        data_set_input = CSVLocalDataSet(filepath=parameters["path_primary"] +
                                         "/input_DWT_coeffs_" + file)
        data_set_input.save(list_input_DWT_Aprox_coeff)
        data_set_labels = CSVLocalDataSet(filepath=parameters["path_primary"] +
                                          "/labels_" + file)
        data_set_labels.save(labels)
    dummy = labels
    return dummy
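# Hedged sketch (toy signal and made-up parameter values, not the project's
# parameters.yml): shows what the wavedec + threshold calls above compute for
# one column. Only the level-N approximation coefficients coeff[0] are kept as
# the compressed representation; the detail coefficients are thresholded first.
import numpy as np
import pywt

signal = np.sin(np.linspace(0, 8 * np.pi, 256)) + 0.1 * np.random.randn(256)
thresh = 0.2 * np.nanmax(signal)  # assumed parameters["thresh"] = 0.2

coeff = pywt.wavedec(signal, wavelet="db4", mode="symmetric", level=4)
coeff[1:] = [pywt.threshold(c, value=thresh, mode="soft") for c in coeff[1:]]

approx = coeff[0]  # what the node stores per input column
print(len(signal), "->", len(approx), "approximation coefficients")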
def test_run_load_versions(self, tmp_path, dummy_context, dummy_dataframe,
                           mocker):
    class DummyContext(KedroContext):
        project_name = "bob"
        project_version = __version__

        def _get_pipelines(self) -> Dict[str, Pipeline]:
            return {"__default__": Pipeline([node(identity, "cars", "boats")])}

    mocker.patch("logging.config.dictConfig")
    dummy_context = DummyContext(str(tmp_path))
    filepath = str(dummy_context.project_path / "cars.csv")

    old_save_version = generate_timestamp()
    old_df = pd.DataFrame({"col1": [0, 0], "col2": [0, 0], "col3": [0, 0]})
    old_csv_data_set = CSVLocalDataSet(
        filepath=filepath,
        save_args={"sep": ","},
        version=Version(None, old_save_version),
    )
    old_csv_data_set.save(old_df)

    new_save_version = generate_timestamp()
    new_csv_data_set = CSVLocalDataSet(
        filepath=filepath,
        save_args={"sep": ","},
        version=Version(None, new_save_version),
    )
    new_csv_data_set.save(dummy_dataframe)

    load_versions = {"cars": old_save_version}
    dummy_context.run(load_versions=load_versions)

    assert not dummy_context.catalog.load("boats").equals(dummy_dataframe)
    assert dummy_context.catalog.load("boats").equals(old_df)
def test_version_str_repr(self, load_version, save_version):
    """Test that version is in string representation of the class instance
    when applicable."""
    filepath = "test.csv"
    ds = CSVLocalDataSet(filepath=filepath)
    ds_versioned = CSVLocalDataSet(filepath=filepath,
                                   version=Version(load_version, save_version))

    assert filepath in str(ds)
    assert "version" not in str(ds)

    assert filepath in str(ds_versioned)
    ver_str = "version=Version(load={}, save='{}')".format(
        load_version, save_version)
    assert ver_str in str(ds_versioned)
def test_datasets_on_add(self, data_catalog_from_config):
    """Check datasets are updated correctly after adding"""
    data_catalog_from_config.add("new_dataset", CSVLocalDataSet("some_path"))
    assert isinstance(
        data_catalog_from_config.datasets.new_dataset, CSVLocalDataSet
    )
    assert isinstance(data_catalog_from_config.datasets.boats, CSVLocalDataSet)
def prepare_csv_data_with_tabs(context):
    context.read_csv_path = create_sample_csv()
    context.write_csv_path = create_temp_csv()
    context.csv_data_set = CSVLocalDataSet(
        filepath=context.read_csv_path,
        load_args={"sep": "\t"},
        save_args={"index": False, "sep": "\t"},
    )
def save_well_data(Raw_Data_preprocessed, parameters, files):
    for well_data, file in zip(Raw_Data_preprocessed, files):
        steam_input_data, emulsion_input_data, labels = split(well_data)
        data_set_steam_input = CSVLocalDataSet(
            filepath=parameters["path_primary"] + "/steam_inputs_" + file)
        data_set_steam_input.save(steam_input_data)

        data_set_emulsion_input = CSVLocalDataSet(
            filepath=parameters["path_primary"] + "/emulsion_inputs_" + file)
        data_set_emulsion_input.save(emulsion_input_data)

        data_set_labels = CSVLocalDataSet(filepath=parameters["path_primary"] +
                                          "/labels_" + file)
        data_set_labels.save(labels)
    dummy = labels
    return dummy
def multi_catalog(mocker):
    csv = CSVLocalDataSet(filepath="abc.csv")
    parq = ParquetLocalDataSet(filepath="xyz.parq")
    journal = mocker.Mock()
    return DataCatalog({"abc": csv, "xyz": parq}, journal=journal)
def test_fails_with_remote_path(self, path):
    with pytest.raises(ValueError) as assertion:
        CSVLocalDataSet(filepath=path, save_args={"sep": ","})
    assert "seems to be a remote file" in assertion.value.args[0]
def versioned_csv_data_set(filepath, load_version, save_version):
    return CSVLocalDataSet(
        filepath=filepath,
        save_args={"sep": ","},
        version=Version(load_version, save_version),
    )
def csv_data_set(filepath, request):
    return CSVLocalDataSet(filepath=filepath, save_args=request.param)
def load_well_validation_data(dummy2, timesteps, parameters: Dict):
    import glob, os
    os.chdir(parameters["path_model_input"])
    files_val = []
    for file in glob.glob("*.csv"):
        files_val.append(file)
    filenames_val = []
    wells_data = []
    for file in files_val:
        filename, extension = file.split('.')
        filenames_val.append(filename)
    for file, filename in zip(files_val, filenames_val):
        io = DataCatalog({
            filename:
            CSVLocalDataSet(filepath=parameters["path_model_input"] + "/" +
                            file),
        })
        well_data = io.load(filename)
        wells_data.append(well_data)

    Raw_Data_preprocessed_val = []
    wells_life = []
    wells_data_ = []
    for well in wells_data:
        # well = well[['Date', 'Injector Bottom Hole Pressure',
        #              'Producer Bottom Hole Pressure', 'ESP Speed',
        #              'Steam Flow Rate - Outer',
        #              'Emulsion Flow Rate']]
        well = well[[
            'Date', 'Speed [Hz]', 'Current [A]', 'IBHP', 'PBHP',
            'Co-Injection [E3m3/d]', 'Oil [bbl/d]', 'Steam [m3/d]',
            'Emulsion [m3/d]'
        ]]
        for i in range(1, len(well.columns)):
            well[well.columns[i]] = pd.to_numeric(well[well.columns[i]],
                                                  errors='coerce')
        well['Prod_Date'] = pd.to_datetime(well['Date'])
        well = well.set_index('Prod_Date')
        # well = well.dropna(axis=0)  # may change
        # well = well.resample('7D').mean()   # weekly data
        # well = well.resample('30D').mean()  # monthly data
        # well = well.rolling(30, min_periods=1).mean()
        # well = well.rolling(30, min_periods=1).mean()
        data = well['Oil [bbl/d]'] / 6.28981
        well.insert(4, 'Oil [m3/d]', data)
        time_data = np.arange(len(well))
        well.insert(0, 'Timestep', time_data)
        wells_life.append(len(well))
        wells_data_.append(well)
    min_well_length = np.min(np.array(wells_life))
    if min_well_length < timesteps:
        timesteps_validation = min_well_length
    else:
        timesteps_validation = timesteps
    for well, file, filename in zip(wells_data_, files_val, filenames_val):
        well = well.iloc[:timesteps_validation]  # daily data
        # well = well.fillna(0)
        # well = well.fillna(well.rolling(30, min_periods=1).median())
        # well = well.fillna(well.median())
        Raw_Data_preprocessed_val.append(well)

    stats_validation = CSVLocalDataSet(filepath=parameters["path_val_stats"] +
                                       "/static_P50_data_validation.csv")
    stats_val = stats_validation.load()
    stats_val_ROIP = stats_val.loc[:, 'ROIP']
    stats_val = stats_val.loc[:, 'Effective_Length':'BottomWater_Oil_Saturation']
    # # using only rich geostats and no bottom water properties
    # stats_val = stats_val.loc[:, 'Effective_Length':'Rich_Oil_Saturation']
    # # Using "Effective_Rich_Pay_Thickness" to account for standoff and rich thickness
    # data = stats_val['Rich_Pay_Thickness'] - stats_val['Stand_Off']
    # stats_val.insert(3, 'Effective_Rich_Pay_Thickness', data)
    # stats_val = stats_val.drop(columns=['Rich_Pay_Thickness', 'Stand_Off'])
    property_names_val = list(stats_val.columns)
    properties_val = list(stats_val.values)
    # properties_val = stats.loc[:, ['Effective_Length', 'Spacing', 'Effective_Rich_Pay_Thickness', 'Non_Rich_Pay_Thickness',
    #                                'Rich_Vertical_Permeability', 'Non_Rich_Vertical_Permeability', 'Rich_Porosity',
    #                                'Non_Rich_Porosity', 'Rich_Oil_Saturation', 'Non_Rich_Oil_Saturation']].values
    properties_val = np.array(properties_val)
    dummy11 = files_val
    return [
        dummy11, timesteps_validation, Raw_Data_preprocessed_val, files_val,
        filenames_val, properties_val, stats_val_ROIP, property_names_val
    ]
def test_sequential_save_and_load(self, dummy_dataframe, filepath):
    """Tests if the correct load version is logged when two datasets are saved
    sequentially."""
    dataset1 = CSVLocalDataSet(
        filepath=filepath,
        save_args={"sep": ","},
        version=Version(None, "2000-01-01"),
    )
    dataset2 = CSVLocalDataSet(
        filepath=filepath,
        save_args={"sep": ","},
        version=Version(None, "2001-01-01"),
    )
    dataset1.save(dummy_dataframe)
    last_save_version1 = dataset1.get_last_save_version()

    dataset2.save(dummy_dataframe)
    last_save_version2 = dataset2.get_last_save_version()

    dataset2.load()
    last_load_version = dataset2.get_last_load_version()
    assert last_save_version2 == last_load_version
    assert last_save_version1 != last_save_version2
def __init__(
        self,
        train_df=None,  # type: Optional[pd.DataFrame]
        test_df=None,  # type: Optional[pd.DataFrame]
        cols_features=None,  # type: Optional[List[str]]
        col_treatment="Treatment",  # type: str
        col_outcome="Outcome",  # type: str
        col_propensity="Propensity",  # type: str
        col_cate="CATE",  # type: str
        col_recommendation="Recommendation",  # type: str
        min_propensity=0.01,  # type: float
        max_propensity=0.99,  # type: float
        verbose=2,  # type: int
        uplift_model_params=dict(
            search_cv="sklearn.model_selection.GridSearchCV",
            estimator="xgboost.XGBClassifier",
            scoring=None,
            cv=3,
            return_train_score=False,
            n_jobs=-1,
            param_grid=dict(
                random_state=[0],
                max_depth=[3],
                learning_rate=[0.1],
                n_estimators=[100],
                verbose=[0],
                objective=["binary:logistic"],
                booster=["gbtree"],
                n_jobs=[-1],
                nthread=[None],
                gamma=[0],
                min_child_weight=[1],
                max_delta_step=[0],
                subsample=[1],
                colsample_bytree=[1],
                colsample_bylevel=[1],
                reg_alpha=[0],
                reg_lambda=[1],
                scale_pos_weight=[1],
                base_score=[0.5],
                missing=[None],
            ),
        ),  # type: Union[Dict[str, List[Any]], Type[sklearn.base.BaseEstimator]]
        enable_ipw=True,  # type: bool
        propensity_model_params=dict(
            search_cv="sklearn.model_selection.GridSearchCV",
            estimator="sklearn.linear_model.LogisticRegression",
            scoring=None,
            cv=3,
            return_train_score=False,
            n_jobs=-1,
            param_grid=dict(
                random_state=[0],
                C=[0.1, 1, 10],
                class_weight=[None],
                dual=[False],
                fit_intercept=[True],
                intercept_scaling=[1],
                max_iter=[100],
                multi_class=["ovr"],
                n_jobs=[1],
                penalty=["l1", "l2"],
                solver=["liblinear"],
                tol=[0.0001],
                warm_start=[False],
            ),
        ),  # type: Dict[str, List[Any]]
        cv=3,  # type: int
        index_name="index",  # type: str
        partition_name="partition",  # type: str
        runner="SequentialRunner",  # type: str
        conditionally_skip=False,  # type: bool
        dataset_catalog=dict(
            # args_raw = CSVLocalDataSet(filepath='../data/01_raw/args_raw.csv', version=None),
            # train_df = CSVLocalDataSet(filepath='../data/01_raw/train_df.csv', version=None),
            # test_df = CSVLocalDataSet(filepath='../data/01_raw/test_df.csv', version=None),
            propensity_model=PickleLocalDataSet(
                filepath="../data/06_models/propensity_model.pickle",
                version=None),
            uplift_models_dict=PickleLocalDataSet(
                filepath="../data/06_models/uplift_models_dict.pickle",
                version=None),
            df_03=CSVLocalDataSet(
                filepath="../data/07_model_output/df.csv",
                load_args=dict(index_col=["partition", "index"],
                               float_precision="high"),
                save_args=dict(index=True, float_format="%.16e"),
                version=None,
            ),
            treated__sim_eval_df=CSVLocalDataSet(
                filepath="../data/08_reporting/treated__sim_eval_df.csv",
                version=None),
            untreated__sim_eval_df=CSVLocalDataSet(
                filepath="../data/08_reporting/untreated__sim_eval_df.csv",
                version=None),
            estimated_effect_df=CSVLocalDataSet(
                filepath="../data/08_reporting/estimated_effect_df.csv",
                version=None),
        ),  # type: Dict[str, AbstractDataSet]
        logging_config={
            "disable_existing_loggers": False,
            "formatters": {
                "json_formatter": {
                    "class": "pythonjsonlogger.jsonlogger.JsonFormatter",
                    "format":
                    "[%(asctime)s|%(name)s|%(funcName)s|%(levelname)s] %(message)s",
                },
                "simple": {
                    "format": "[%(asctime)s|%(name)s|%(levelname)s] %(message)s"
                },
            },
            "handlers": {
                "console": {
                    "class": "logging.StreamHandler",
                    "formatter": "simple",
                    "level": "INFO",
                    "stream": "ext://sys.stdout",
                },
                "info_file_handler": {
                    "class": "logging.handlers.RotatingFileHandler",
                    "level": "INFO",
                    "formatter": "simple",
                    "filename": "./info.log",
                    "maxBytes": 10485760,  # 10MB
                    "backupCount": 20,
                    "encoding": "utf8",
                    "delay": True,
                },
                "error_file_handler": {
                    "class": "logging.handlers.RotatingFileHandler",
                    "level": "ERROR",
                    "formatter": "simple",
                    "filename": "./errors.log",
                    "maxBytes": 10485760,  # 10MB
                    "backupCount": 20,
                    "encoding": "utf8",
                    "delay": True,
                },
            },
            "loggers": {
                "anyconfig": {
                    "handlers":
                    ["console", "info_file_handler", "error_file_handler"],
                    "level": "WARNING",
                    "propagate": False,
                },
                "kedro.io": {
                    "handlers":
                    ["console", "info_file_handler", "error_file_handler"],
                    "level": "WARNING",
                    "propagate": False,
                },
                "kedro.pipeline": {
                    "handlers":
                    ["console", "info_file_handler", "error_file_handler"],
                    "level": "INFO",
                    "propagate": False,
                },
                "kedro.runner": {
                    "handlers":
                    ["console", "info_file_handler", "error_file_handler"],
                    "level": "INFO",
                    "propagate": False,
                },
                "causallift": {
                    "handlers":
                    ["console", "info_file_handler", "error_file_handler"],
                    "level": "INFO",
                    "propagate": False,
                },
            },
            "root": {
                "handlers":
                ["console", "info_file_handler", "error_file_handler"],
                "level": "INFO",
            },
            "version": 1,
        },  # type: Optional[Dict[str, Any]]
):
    # type: (...) -> None

    self.runner = None  # type: Optional[str]
    self.kedro_context = None  # type: Optional[Type[FlexibleKedroContext]]
    self.args = None  # type: Optional[Type[EasyDict]]
    self.train_df = None  # type: Optional[Type[pd.DataFrame]]
    self.test_df = None  # type: Optional[Type[pd.DataFrame]]
    self.df = None  # type: Optional[Type[pd.DataFrame]]
    self.propensity_model = None  # type: Optional[Type[sklearn.base.BaseEstimator]]
    self.uplift_models_dict = None  # type: Optional[Type[EasyDict]]
    self.treatment_fractions = None  # type: Optional[Type[EasyDict]]
    self.treatment_fraction_train = None  # type: Optional[float]
    self.treatment_fraction_test = None  # type: Optional[float]
    self.treated__proba = None  # type: Optional[Type[np.array]]
    self.untreated__proba = None  # type: Optional[Type[np.array]]
    self.cate_estimated = None  # type: Optional[Type[pd.Series]]
    self.treated__sim_eval_df = None  # type: Optional[Type[pd.DataFrame]]
    self.untreated__sim_eval_df = None  # type: Optional[Type[pd.DataFrame]]
    self.estimated_effect_df = None  # type: Optional[Type[pd.DataFrame]]
    # Instance attributes were defined above.

    if logging_config:
        logging.config.dictConfig(logging_config)

    args_raw = dict(
        cols_features=cols_features,
        col_treatment=col_treatment,
        col_outcome=col_outcome,
        col_propensity=col_propensity,
        col_cate=col_cate,
        col_recommendation=col_recommendation,
        min_propensity=min_propensity,
        max_propensity=max_propensity,
        verbose=verbose,
        uplift_model_params=uplift_model_params,
        enable_ipw=enable_ipw,
        propensity_model_params=propensity_model_params,
        index_name=index_name,
        partition_name=partition_name,
        runner=runner,
        conditionally_skip=conditionally_skip,
    )
    args_raw = EasyDict(args_raw)
    args_raw.update(
        dataset_catalog.get("args_raw", MemoryDataSet({}).load()))

    assert args_raw.runner in {"SequentialRunner", "ParallelRunner", None}
    if args_raw.runner is None and args_raw.conditionally_skip:
        log.warning(
            "[Warning] conditionally_skip option is ignored since runner is None"
        )

    self.kedro_context = FlexibleKedroContext(
        runner=args_raw.runner, only_missing=args_raw.conditionally_skip)

    self.runner = args_raw.runner

    if self.runner is None:
        self.df = bundle_train_and_test_data(args_raw, train_df, test_df)
        self.args = impute_cols_features(args_raw, self.df)
        self.args = schedule_propensity_scoring(self.args, self.df)
        self.treatment_fractions = treatment_fractions_(self.args, self.df)
        if self.args.need_propensity_scoring:
            self.propensity_model = fit_propensity(self.args, self.df)
            self.df = estimate_propensity(self.args, self.df,
                                          self.propensity_model)

    if self.runner:
        self.kedro_context.catalog.add_feed_dict(
            {
                "train_df": MemoryDataSet(train_df),
                "test_df": MemoryDataSet(test_df),
                "args_raw": MemoryDataSet(args_raw),
            },
            replace=True,
        )
        self.kedro_context.catalog.add_feed_dict(dataset_catalog, replace=True)
        self.kedro_context.run(tags=["011_bundle_train_and_test_data"])
        self.df = self.kedro_context.catalog.load("df_00")

        self.kedro_context.run(tags=[
            "121_prepare_args",
            "131_treatment_fractions_",
            "141_initialize_model",
        ])
        self.args = self.kedro_context.catalog.load("args")
        self.treatment_fractions = self.kedro_context.catalog.load(
            "treatment_fractions")

        if self.args.need_propensity_scoring:
            self.kedro_context.run(tags=["211_fit_propensity"])
            self.propensity_model = self.kedro_context.catalog.load(
                "propensity_model")
            self.kedro_context.run(tags=["221_estimate_propensity"])
            self.df = self.kedro_context.catalog.load("df_01")
        else:
            self.kedro_context.catalog.add_feed_dict(
                {"df_01": MemoryDataSet(self.df)}, replace=True)

    self.treatment_fraction_train = self.treatment_fractions.train
    self.treatment_fraction_test = self.treatment_fractions.test

    if self.args.verbose >= 3:
        log.info("### Treatment fraction in train dataset: {}".format(
            self.treatment_fractions.train))
        log.info("### Treatment fraction in test dataset: {}".format(
            self.treatment_fractions.test))

    self._separate_train_test()
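# Hedged usage sketch for the constructor above (assuming it belongs to
# causallift.CausalLift; toy, randomly generated data). Only the constructor
# and attributes assigned inside __init__ are exercised; runner=None keeps
# everything in memory, dataset_catalog=dict() avoids the default file outputs,
# and logging_config=None skips dictConfig.
import numpy as np
import pandas as pd
from causallift import CausalLift

rng = np.random.RandomState(0)
df = pd.DataFrame({
    "x0": rng.normal(size=200),
    "x1": rng.normal(size=200),
    "Treatment": rng.randint(0, 2, size=200),  # default col_treatment
    "Outcome": rng.randint(0, 2, size=200),    # default col_outcome
})
train_df, test_df = df.iloc[:100], df.iloc[100:]

cl = CausalLift(train_df, test_df, enable_ipw=True, runner=None,
                dataset_catalog=dict(), logging_config=None)
print(cl.treatment_fraction_train, cl.treatment_fraction_test)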
def multi_catalog():
    csv = CSVLocalDataSet(filepath="abc.csv")
    parq = ParquetLocalDataSet(filepath="xyz.parq")
    return DataCatalog({"abc": csv, "xyz": parq})
def data_set(filepath):
    return CSVLocalDataSet(filepath=filepath, save_args={"index": False})
def save_predicted_data(dummy15, well_count, all_wells_dates_input,
                        wells_steam_rate_actual, wells_steam_rate_predicted,
                        wells_emulsion_rate_actual,
                        wells_emulsion_rate_predicted, steam_input_column,
                        all_wells_steam_input_val, emulsion_input_column,
                        all_wells_emulsion_input_val, labels_column,
                        parameters, files_val, scheme):
    # to input wells_RF_array for RF case
    print("Prediction Performance:\n")
    print("Steam Flow Rate:")
    for well, file in zip(well_count, files_val):
        dates = all_wells_dates_input[well].T
        steam_input = all_wells_steam_input_val[well].T
        steam_rate_actual = wells_steam_rate_actual[:, well]
        steam_rate_predicted = wells_steam_rate_predicted[:, well]
        emulsion_rate_predicted = wells_emulsion_rate_predicted[:, well]
        if scheme == 1:
            steam_rate_actual_predicted = pd.DataFrame(
                np.vstack((dates, steam_input, steam_rate_actual,
                           steam_rate_predicted)).T,
                columns=["Date"] + list(steam_input_column) + [
                    labels_column[0] + " actual",
                    labels_column[0] + " predicted"
                ])
            data_set_steam_rate = CSVLocalDataSet(
                filepath=parameters["path_model_output_No_DWT_scheme1"] +
                "/steam_rate_" + file)
        elif scheme == 2:
            steam_rate_actual_predicted = pd.DataFrame(
                np.vstack((dates, steam_input, emulsion_rate_predicted,
                           steam_rate_actual, steam_rate_predicted)).T,
                columns=["Date"] + list(steam_input_column) + [
                    labels_column[1] + " predicted",
                    labels_column[0] + " actual",
                    labels_column[0] + " predicted"
                ])
            data_set_steam_rate = CSVLocalDataSet(
                filepath=parameters["path_model_output_No_DWT_scheme2"] +
                "/steam_rate_" + file)
        elif scheme == 3:
            RF_input = wells_RF_array[well].T
            steam_rate_actual_predicted = pd.DataFrame(
                np.vstack((dates, RF_input, steam_input, steam_rate_actual,
                           steam_rate_predicted)).T,
                columns=["Date", "RF"] + list(steam_input_column) + [
                    labels_column[0] + " actual",
                    labels_column[0] + " predicted"
                ])
            data_set_steam_rate = CSVLocalDataSet(
                filepath=parameters["path_model_output_No_DWT_RF_scheme1"] +
                "/steam_rate_" + file)
        else:
            RF_input = wells_RF_array[well].T
            steam_rate_actual_predicted = pd.DataFrame(
                np.vstack((dates, RF_input, steam_input,
                           emulsion_rate_predicted, steam_rate_actual,
                           steam_rate_predicted)).T,
                columns=["Date", "RF"] + list(steam_input_column) + [
                    labels_column[1] + " predicted",
                    labels_column[0] + " actual",
                    labels_column[0] + " predicted"
                ])
            data_set_steam_rate = CSVLocalDataSet(
                filepath=parameters["path_model_output_No_DWT_RF_scheme2"] +
                "/steam_rate_" + file)
        data_set_steam_rate.save(steam_rate_actual_predicted)
        print(file + " R_squared: {0:.4f}".format(
            r2_score(steam_rate_actual, steam_rate_predicted)))
    print("\n")

    print("Oil Rate:")
    for well, file in zip(well_count, files_val):
        dates = all_wells_dates_input[well].T
        emulsion_input = all_wells_emulsion_input_val[well].T
        emulsion_rate_actual = wells_emulsion_rate_actual[:, well]
        steam_rate_predicted = wells_steam_rate_predicted[:, well]
        emulsion_rate_predicted = wells_emulsion_rate_predicted[:, well]
        if scheme == 1:
            emulsion_rate_actual_predicted = pd.DataFrame(
                np.vstack((dates, emulsion_input, steam_rate_predicted,
                           emulsion_rate_actual,
                           emulsion_rate_predicted)).T,
                columns=["Date"] + list(emulsion_input_column) + [
                    labels_column[0] + " predicted",
                    labels_column[1] + " actual",
                    labels_column[1] + " predicted"
                ])
            data_set_emulsion_rate = CSVLocalDataSet(
                filepath=parameters["path_model_output_No_DWT_scheme1"] +
                "/emulsion_rate_" + file)
        elif scheme == 2:
            emulsion_rate_actual_predicted = pd.DataFrame(
                np.vstack((dates, emulsion_input, emulsion_rate_actual,
                           emulsion_rate_predicted)).T,
                columns=["Date"] + list(emulsion_input_column) + [
                    labels_column[1] + " actual",
                    labels_column[1] + " predicted"
                ])
            data_set_emulsion_rate = CSVLocalDataSet(
                filepath=parameters["path_model_output_No_DWT_scheme2"] +
                "/emulsion_rate_" + file)
        elif scheme == 3:
            # cum_input = wells_cum_oil_array[well].T
            RF_input = wells_RF_array[well].T
            emulsion_rate_actual_predicted = pd.DataFrame(
                np.vstack((dates, RF_input, emulsion_input,
                           steam_rate_predicted, emulsion_rate_actual,
                           emulsion_rate_predicted)).T,
                columns=["Date", "RF"] + list(emulsion_input_column) + [
                    labels_column[0] + " predicted",
                    labels_column[1] + " actual",
                    labels_column[1] + " predicted"
                ])
            data_set_emulsion_rate = CSVLocalDataSet(
                filepath=parameters["path_model_output_No_DWT_RF_scheme1"] +
                "/emulsion_rate_" + file)
        else:
            RF_input = wells_RF_array[well].T
            emulsion_rate_actual_predicted = pd.DataFrame(
                np.vstack((dates, RF_input, emulsion_input,
                           emulsion_rate_actual,
                           emulsion_rate_predicted)).T,
                columns=["Date", "RF"] + list(emulsion_input_column) + [
                    labels_column[1] + " actual",
                    labels_column[1] + " predicted"
                ])
            data_set_emulsion_rate = CSVLocalDataSet(
                filepath=parameters["path_model_output_No_DWT_RF_scheme2"] +
                "/emulsion_rate_" + file)
        data_set_emulsion_rate.save(emulsion_rate_actual_predicted)
        print(file + " R_squared: {0:.4f}".format(
            r2_score(emulsion_rate_actual, emulsion_rate_predicted)))
    print("\n")
    dummy_validate = files_val
    return dummy_validate
def preprocess_raw_data(parameters: Dict):
    import glob, os
    # os.chdir(parameters["path_raw"])
    os.chdir(parameters["path_raw_matlab"])
    files = []
    for file in glob.glob("*.csv"):
        files.append(file)
    filenames = []
    wells_data = []
    for file in files:
        filename, extension = file.split('.')
        filenames.append(filename)
    for file, filename in zip(files, filenames):
        io = DataCatalog({
            # filename: CSVLocalDataSet(filepath=parameters["path_raw"] + "/" + file),
            filename:
            CSVLocalDataSet(filepath=parameters["path_raw_matlab"] + "/" +
                            file),
        })
        well_data = io.load(filename)
        wells_data.append(well_data)

    Raw_Data_preprocessed = []
    Raw_Data_dated = []
    wells_life = []
    wells_data_ = []
    for well in wells_data:
        # well = well[['Date', 'Injector Bottom Hole Pressure',
        #              'Producer Bottom Hole Pressure', 'ESP Speed',
        #              'Steam Flow Rate - Outer',
        #              'Emulsion Flow Rate']]
        well = well[[
            'Date', 'Speed [Hz]', 'Current [A]', 'IBHP', 'PBHP',
            'Co-Injection [E3m3/d]', 'Oil [bbl/d]', 'Steam [m3/d]',
            'Emulsion [m3/d]'
        ]]
        for i in range(1, len(well.columns)):
            well[well.columns[i]] = pd.to_numeric(well[well.columns[i]],
                                                  errors='coerce')
        well['Prod_Date'] = pd.to_datetime(well['Date'])
        well = well.set_index('Prod_Date')
        # well = well.resample('7D').mean()   # weekly data
        # well = well.resample('30D').mean()  # monthly data
        # well = well.rolling(30, min_periods=1).mean()
        data = well['Oil [bbl/d]'] / 6.28981
        well.insert(4, 'Oil [m3/d]', data)
        time_data = np.arange(len(well))
        well.insert(0, 'Timestep', time_data)
        wells_life.append(len(well))
        wells_data_.append(well)
    min_well_length = np.min(np.array(wells_life))
    print((min_well_length, np.argmin(np.array(wells_life))))
    timesteps = min_well_length  # 1008
    # timesteps = 371
    for well, file, filename in zip(wells_data_, files, filenames):
        well = well.iloc[:timesteps]  # daily, weekly, monthly data
        # well = well.fillna(0)
        # well = well.fillna(well.rolling(30, min_periods=1).median())
        # well = well.fillna(well.median())
        well["Well"] = filename  # create a column for well name
        well = well.reset_index(drop=True)  # remove date index
        data_set = CSVLocalDataSet(filepath=parameters["path_intermediate"] +
                                   "/pre_processed_data_" + file)
        data_set.save(well)
        Raw_Data_dated.append(well)
        well = well.drop(columns=['Date', 'Well'])
        Raw_Data_preprocessed.append(well)

    # stats_training = CSVLocalDataSet(filepath=parameters["path_raw_static"] + "/static_P50_data_training.csv")
    stats_training = CSVLocalDataSet(
        filepath=parameters["path_raw_static_matlab"] +
        "/static_P50_data_training_152_wells.csv")
    stats = stats_training.load()
    stats_ROIP = stats.loc[:, 'ROIP']
    stats = stats.loc[:, 'Effective_Length':'BottomWater_Oil_Saturation']
    # # using only rich geostats and no bottom water properties
    # stats = stats.loc[:, 'Effective_Length':'Rich_Oil_Saturation']
    # # Using "Effective_Rich_Pay_Thickness" to account for standoff and rich thickness
    # data = stats['Rich_Pay_Thickness'] - stats['Stand_Off']
    # stats.insert(3, 'Effective_Rich_Pay_Thickness', data)
    # stats = stats.drop(columns=['Rich_Pay_Thickness', 'Stand_Off'])
    property_names = list(stats.columns)
    properties = list(stats.values)
    properties = np.array(properties)
    return [
        timesteps, Raw_Data_preprocessed, Raw_Data_dated, files, filenames,
        properties, stats_ROIP, property_names
    ]
def save_well_validation_data(dummy11, Raw_Data_preprocessed_val, parameters,
                              files_val):
    # def save_well_validation_data(dummy12, Raw_Data_preprocessed_val_, parameters, files_val):
    all_wells_dates_input = []
    all_wells_steam_input_val = []
    all_wells_emulsion_input_val = []
    all_wells_labels_val = []
    for well_data, file in zip(Raw_Data_preprocessed_val, files_val):
        # for well_data, file in zip(Raw_Data_preprocessed_val_, files_val):
        steam_input_data, emulsion_input_data, labels = split(well_data)
        data_set_steam_input = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/vali_steam_inputs_" + file)
        data_set_steam_input.save(steam_input_data)
        all_wells_dates_input.append(steam_input_data['Date'].values)
        steam_input_data = steam_input_data.drop(columns='Date')
        all_wells_steam_input_val.append(steam_input_data.values)

        data_set_emulsion_input = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/vali_emulsion_inputs_" + file)
        data_set_emulsion_input.save(emulsion_input_data)
        all_wells_emulsion_input_val.append(emulsion_input_data.values)

        data_set_labels = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/validation_labels_" + file)
        data_set_labels.save(labels)
        all_wells_labels_val.append(labels.values)

    steam_input_column = steam_input_data.columns
    emulsion_input_column = emulsion_input_data.columns
    labels_column = list(labels.columns)
    all_wells_dates_input = np.array(all_wells_dates_input)
    all_wells_steam_input_val = np.array(all_wells_steam_input_val)
    all_wells_emulsion_input_val = np.array(all_wells_emulsion_input_val)
    dummy13 = files_val
    return [
        dummy13, steam_input_column, emulsion_input_column, labels_column,
        all_wells_dates_input, all_wells_steam_input_val,
        all_wells_emulsion_input_val, all_wells_labels_val
    ]
def validate(parameters: Dict):
    # def validate(dummy2, parameters: Dict):
    import glob, os
    os.chdir(parameters["path_model_input"])
    files = []
    for file in glob.glob("*.csv"):
        files.append(file)
    filenames = []
    wells_data = []
    for file in files:
        filename, extension = file.split('.')
        filenames.append(filename)
    for file, filename in zip(files, filenames):
        io = DataCatalog({
            filename:
            CSVLocalDataSet(filepath=parameters["path_model_input"] + "/" +
                            file),
        })
        well_data = io.load(filename)
        wells_data.append(well_data)

    Raw_Data_preprocessed = []
    for well, file, filename in zip(wells_data, files, filenames):
        well = well[[
            'Date', 'Injector Bottom Hole Pressure', 'Steam Flow Rate - Outer',
            'Bottom Hole Heel Temperature', 'Emulsion Pressure',
            'Producer Bottom Hole Pressure', 'ESP Speed', 'Emulsion Flow Rate'
        ]]
        for i in range(1, len(well.columns)):
            well[well.columns[i]] = pd.to_numeric(well[well.columns[i]],
                                                  errors='coerce')
        well = well.iloc[:1399]
        well = well.fillna(well.rolling(30, min_periods=1).median())
        well = well.fillna(well.median())
        well['Date'] = pd.to_datetime(well['Date'])
        well = well.set_index('Date')
        Raw_Data_preprocessed.append(well)

    os.chdir(parameters["path_val_stats"])
    static_files = []
    for static_file in glob.glob("*.csv"):
        static_files.append(static_file)
    static_filenames = []
    statics_data = []
    for static_file in static_files:
        static_filename, others = static_file.split('_')
        static_filenames.append(static_filename)
    for static_file, static_filename in zip(static_files, static_filenames):
        io = DataCatalog({
            static_filename:
            CSVLocalDataSet(filepath=parameters["path_val_stats"] + "/" +
                            static_file),
        })
        static_data = io.load(static_filename)
        statics_data.append(static_data)

    statics_data_new = []
    well_name_list = []
    for pad_static in statics_data:
        well_name = pad_static['WELLPAIR_NAME'].values
        well_name_list.append(well_name)
        pad_static = pad_static.set_index('WELLPAIR_NAME')
        pad_static = pad_static.drop(columns=['PLAN_NAME', 'HIGH_PRESSURE'])
        statics_data_new.append(pad_static)

    properties = []
    probabilities = []
    asset_names = []
    for pad_static, names in zip(statics_data_new, well_name_list):
        for well in names:
            prob = pad_static.loc[well, 'Forecast_Prob']
            probabilities.append(prob)
            pad_code = pad_static.loc[well, 'PAD_CODE']
            asset_name, pad = pad_code.split('_')
            asset_names.append(asset_name)
            property_ = pad_static.loc[
                well, 'SAGD_PRESSURE':'BOTTOM_WATER_THICKNESS'].values
            properties.append(property_)
    properties = np.array(properties)

    all_wells_input = []
    all_wells_labels = []
    for well_data, file in zip(Raw_Data_preprocessed, files):
        DWT_Aprox_coeff_input = []
        input_data, labels = split(well_data)
        input_columns = list(input_data.columns)
        for data_idx in input_columns:
            signal = well_data[data_idx].values
            thresh = parameters["thresh"] * np.nanmax(signal)
            coeff = pywt.wavedec(signal,
                                 wavelet=parameters["wavelet"],
                                 mode=parameters["mode1"],
                                 level=parameters["level"])
            coeff[1:] = (pywt.threshold(i, value=thresh,
                                        mode=str(parameters["mode2"]))
                         for i in coeff[1:])
            DWT_Aprox_coeff_input.append(coeff[0])
        DWT_Aprox_coeff_input = pd.DataFrame(
            np.transpose(DWT_Aprox_coeff_input), columns=input_columns)
        data_set_input = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/validation_input_DWT_coeffs_" + file)
        data_set_input.save(DWT_Aprox_coeff_input)
        all_wells_input.append(DWT_Aprox_coeff_input.values)
        data_set_labels = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/validation_labels_" + file)
        data_set_labels.save(labels)
        all_wells_labels.append(labels.values)

    # Standardize dynamic data coeffs
    data_set_scaler_coeffs = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_coeffs.pickle")
    scaler_coeffs = data_set_scaler_coeffs.load()
    all_wells_standardized_input = []
    all_wells_standardized_input_flattened = []
    for well_coeffs in all_wells_input:
        std_coeffs = scaler_coeffs.transform(well_coeffs)
        all_wells_standardized_input.append(std_coeffs)
        transposed_std_coeffs = np.transpose(std_coeffs)
        flattened_std_coeffs = transposed_std_coeffs.flatten()
        all_wells_standardized_input_flattened.append(flattened_std_coeffs)
    all_wells_standardized_input = np.array(all_wells_standardized_input)
    all_wells_standardized_input_flattened = np.array(
        all_wells_standardized_input_flattened)

    input_columns = list(DWT_Aprox_coeff_input.columns)
    for std_coeffs, file in zip(all_wells_standardized_input, files):
        std_coeffs = pd.DataFrame(std_coeffs, columns=input_columns)
        data_set = CSVLocalDataSet(
            filepath=parameters["path_val_pre_processed"] +
            "/validation_std_DWT_input_coeffs_" + file)
        data_set.save(std_coeffs)

    # Standardize static data
    data_set_scaler_static = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_static.pickle")
    scaler_static = data_set_scaler_static.load()
    all_wells_standardized_properties = scaler_static.fit_transform(properties)

    all_wells_coeffs_reservoir_data = []
    for flattened_std_coeffs, standardized_properties in zip(
            all_wells_standardized_input_flattened,
            all_wells_standardized_properties):
        flattened_std_coeffs = list(flattened_std_coeffs)
        standardized_properties = list(standardized_properties)
        for reservoir_property in standardized_properties:
            flattened_std_coeffs.append(
                reservoir_property
            )  # append reservoir data to dynamic data coeffs
        all_wells_coeffs_reservoir_data.append(flattened_std_coeffs)
    all_wells_coeffs_reservoir_data = np.array(all_wells_coeffs_reservoir_data)

    well_count = np.arange(len(all_wells_coeffs_reservoir_data))
    daily_timesteps = np.arange(len(all_wells_labels[0]))
    input_data = []
    for coeff_inputs in all_wells_coeffs_reservoir_data:
        for time_lapse in daily_timesteps:
            well_inputs = [time_lapse] + list(
                coeff_inputs)  # append time lapse to input data
            input_data.append(well_inputs)
    input_data = np.array(input_data)

    data_set_regressor_1 = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/regressor_1.pickle")
    regressor_1 = data_set_regressor_1.load()
    number_of_wells = len(well_count)
    wells_steam_rate_predicted = regressor_1.predict(input_data)
    wells_steam_rate_predicted = wells_steam_rate_predicted.reshape(
        (number_of_wells, 1399)).T

    # prediction inputs to model 2
    input_data_model_2 = []
    for coeff_inputs, well in zip(all_wells_coeffs_reservoir_data, well_count):
        for time_lapse in daily_timesteps:
            well_inputs = [time_lapse] + list(
                coeff_inputs)  # append time lapse to input data
            well_inputs_model_2 = [
                wells_steam_rate_predicted[time_lapse, well]
            ] + well_inputs
            input_data_model_2.append(well_inputs_model_2)
    input_data_model_2 = np.array(input_data_model_2)

    data_set_regressor_2 = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/regressor_2.pickle")
    regressor_2 = data_set_regressor_2.load()
    wells_emulsion_rate_predicted = regressor_2.predict(input_data_model_2)
    wells_emulsion_rate_predicted = wells_emulsion_rate_predicted.reshape(
        (number_of_wells, 1399)).T

    # actual targets
    all_wells_steam_data = []
    all_wells_emulsion_data = []
    for ID in well_count:
        well_steam_data = all_wells_labels[ID][:, 0]
        well_emulsion_data = all_wells_labels[ID][:, 1]
        all_wells_steam_data = all_wells_steam_data + list(well_steam_data)
        all_wells_emulsion_data = all_wells_emulsion_data + list(
            well_emulsion_data)
    all_wells_steam_data = np.array(all_wells_steam_data)
    all_wells_emulsion_data = np.array(all_wells_emulsion_data)
    wells_steam_rate_actual = all_wells_steam_data.reshape(
        (number_of_wells, 1399)).T
    wells_emulsion_rate_actual = all_wells_emulsion_data.reshape(
        (number_of_wells, 1399)).T

    print("Prediction Performance:\n")
    print("Steam Flow Rate:")
    for well, file in zip(well_count, files):
        steam_rate_predicted = wells_steam_rate_predicted[:, well]
        steam_rate_actual = wells_steam_rate_actual[:, well]
        steam_rate_actual_predicted = pd.DataFrame(
            np.vstack((steam_rate_actual, steam_rate_predicted)).T,
            columns=["steam rate actual", "steam rate predicted"])
        data_set_steam_rate = CSVLocalDataSet(
            filepath=parameters["path_model_output"] + "/steam_rate_" + file)
        data_set_steam_rate.save(steam_rate_actual_predicted)
        print(file + " R_squared: {0:.4f}".format(
            r2_score(steam_rate_actual, steam_rate_predicted)))
    print("\n")

    print("Emulsion Flow Rate:")
    for well, file in zip(well_count, files):
        emulsion_rate_predicted = wells_emulsion_rate_predicted[:, well]
        emulsion_rate_actual = wells_emulsion_rate_actual[:, well]
        emulsion_rate_actual_predicted = pd.DataFrame(
            np.vstack((emulsion_rate_actual, emulsion_rate_predicted)).T,
            columns=["emulsion rate actual", "emulsion rate predicted"])
        data_set_emulsion_rate = CSVLocalDataSet(
            filepath=parameters["path_model_output"] + "/emulsion_rate_" +
            file)
        data_set_emulsion_rate.save(emulsion_rate_actual_predicted)
        print(file + " R_squared: {0:.4f}".format(
            r2_score(emulsion_rate_actual, emulsion_rate_predicted)))

    dummy_validate = files
    return dummy_validate
def preprocess_raw_data(parameters: Dict):
    import glob, os
    os.chdir(parameters["path_raw"])
    files = []
    for file in glob.glob("*.csv"):
        files.append(file)
    filenames = []
    wells_data = []
    for file in files:
        filename, extension = file.split('.')
        filenames.append(filename)
    for file, filename in zip(files, filenames):
        io = DataCatalog({
            filename:
            CSVLocalDataSet(filepath=parameters["path_raw"] + "/" + file),
        })
        well_data = io.load(filename)
        wells_data.append(well_data)

    Raw_Data_preprocessed = []
    Raw_Data_dated = []
    for well, file, filename in zip(wells_data, files, filenames):
        well = well[[
            'Date', 'Injector Bottom Hole Pressure', 'Steam Flow Rate - Outer',
            'Bottom Hole Heel Temperature', 'Emulsion Pressure',
            'Producer Bottom Hole Pressure', 'ESP Speed', 'Emulsion Flow Rate'
        ]]
        for i in range(1, len(well.columns)):
            well[well.columns[i]] = pd.to_numeric(well[well.columns[i]],
                                                  errors='coerce')
        well = well.iloc[:1399]
        well = well.fillna(well.rolling(30, min_periods=1).median())
        well = well.fillna(well.median())
        well_dated = well.copy()
        well_dated["Well"] = filename  # create a column for well name
        data_set = CSVLocalDataSet(filepath=parameters["path_intermediate"] +
                                   "/pre_processed_data_" + file)
        data_set.save(well_dated)
        Raw_Data_dated.append(well_dated)
        well['Date'] = pd.to_datetime(well['Date'])
        well = well.set_index('Date')
        Raw_Data_preprocessed.append(well)

    os.chdir(parameters["path_raw_static"])
    static_files = []
    for static_file in glob.glob("*.csv"):
        static_files.append(static_file)
    static_filenames = []
    statics_data = []
    for static_file in static_files:
        static_filename, others = static_file.split('_')
        static_filenames.append(static_filename)
    for static_file, static_filename in zip(static_files, static_filenames):
        io = DataCatalog({
            static_filename:
            CSVLocalDataSet(filepath=parameters["path_raw_static"] + "/" +
                            static_file),
        })
        static_data = io.load(static_filename)
        statics_data.append(static_data)

    statics_data_new = []
    well_name_list = []
    for pad_static in statics_data:
        well_name = pad_static['WELLPAIR_NAME'].values
        well_name_list.append(well_name)
        pad_static = pad_static.set_index('WELLPAIR_NAME')
        pad_static = pad_static.drop(columns=['PLAN_NAME', 'HIGH_PRESSURE'])
        statics_data_new.append(pad_static)

    properties = []
    probabilities = []
    asset_names = []
    for pad_static, names in zip(statics_data_new, well_name_list):
        for well in names:
            prob = pad_static.loc[well, 'Forecast_Prob']
            probabilities.append(prob)
            pad_code = pad_static.loc[well, 'PAD_CODE']
            asset_name, pad = pad_code.split('_')
            asset_names.append(asset_name)
            property_ = pad_static.loc[
                well, 'SAGD_PRESSURE':'BOTTOM_WATER_THICKNESS'].values
            properties.append(property_)
    properties = np.array(properties)

    return [
        Raw_Data_preprocessed, Raw_Data_dated, files, filenames, probabilities,
        asset_names, properties
    ]