def test_sequential_save_and_load(self, dummy_dataframe, filepath):
    """Tests if the correct load version is logged when two datasets are saved sequentially."""
    dataset1 = CSVLocalDataSet(
        filepath=filepath,
        save_args={"sep": ","},
        version=Version(None, "2000-01-01"),
    )
    dataset2 = CSVLocalDataSet(
        filepath=filepath,
        save_args={"sep": ","},
        version=Version(None, "2001-01-01"),
    )

    dataset1.save(dummy_dataframe)
    last_save_version1 = dataset1.get_last_save_version()

    dataset2.save(dummy_dataframe)
    last_save_version2 = dataset2.get_last_save_version()

    dataset2.load()
    last_load_version = dataset2.get_last_load_version()

    assert last_save_version2 == last_load_version
    assert last_save_version1 != last_save_version2
def load_data(dummy, files: List, parameters: Dict):
    """Load the steam inputs, emulsion inputs and labels for every well file."""
    all_wells_steam_input = []
    all_wells_emulsion_input = []
    all_wells_labels = []
    for file in files:
        data_set_steam_input = CSVLocalDataSet(
            filepath=parameters["path_primary"] + "/steam_inputs_" + file)
        steam_input_data = data_set_steam_input.load()
        all_wells_steam_input.append(steam_input_data.values)

        data_set_emulsion_input = CSVLocalDataSet(
            filepath=parameters["path_primary"] + "/emulsion_inputs_" + file)
        emulsion_input_data = data_set_emulsion_input.load()
        all_wells_emulsion_input.append(emulsion_input_data.values)

        data_set_labels = CSVLocalDataSet(
            filepath=parameters["path_primary"] + "/labels_" + file)
        labels = data_set_labels.load()
        all_wells_labels.append(labels.values)

    # Column names are taken from the last file loaded; all wells are assumed
    # to share the same schema.
    steam_input_names = steam_input_data.columns
    emulsion_input_names = emulsion_input_data.columns

    all_wells_steam_input = np.array(all_wells_steam_input)
    all_wells_emulsion_input = np.array(all_wells_emulsion_input)
    return [
        all_wells_steam_input, all_wells_emulsion_input, all_wells_labels,
        steam_input_names, emulsion_input_names
    ]
def standardisation(dummy, properties: np.ndarray, files: List, parameters: Dict):
    """Standardise the DWT input coefficients and the static reservoir properties."""
    from sklearn.preprocessing import StandardScaler

    all_wells_input = []
    all_wells_labels = []
    for file in files:
        data_set_input = CSVLocalDataSet(filepath=parameters["path_primary"] +
                                         "/input_DWT_coeffs_" + file)
        DWT_Aprox_coeff_input = data_set_input.load()
        all_wells_input.append(DWT_Aprox_coeff_input.values)

        data_set_labels = CSVLocalDataSet(filepath=parameters["path_primary"] +
                                          "/labels_" + file)
        labels = data_set_labels.load()
        all_wells_labels.append(labels.values)
    all_wells_input = np.array(all_wells_input)

    # Standardize dynamic data coeffs
    scaler_coeffs = StandardScaler()
    scaler_coeffs.fit(all_wells_input[0])  # fit based on first well record
    all_wells_standardized_input = []
    all_wells_standardized_input_flattened = []
    for well_coeffs in all_wells_input:
        std_coeffs = scaler_coeffs.transform(well_coeffs)
        all_wells_standardized_input.append(std_coeffs)
        # Flatten column-wise so each well becomes a single feature vector
        transposed_std_coeffs = np.transpose(std_coeffs)
        flattened_std_coeffs = transposed_std_coeffs.flatten()
        all_wells_standardized_input_flattened.append(flattened_std_coeffs)
    all_wells_standardized_input = np.array(all_wells_standardized_input)
    all_wells_standardized_input_flattened = np.array(
        all_wells_standardized_input_flattened)

    # Persist the fitted scaler so the same transform can be re-applied later
    data_set_scaler_coeffs = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_coeffs.pickle")
    data_set_scaler_coeffs.save(scaler_coeffs)

    input_columns = list(DWT_Aprox_coeff_input.columns)
    for std_coeffs, file in zip(all_wells_standardized_input, files):
        std_coeffs = pd.DataFrame(std_coeffs, columns=input_columns)
        data_set = CSVLocalDataSet(filepath=parameters["path_features"] +
                                   "/std_DWT_input_coeffs_" + file)
        data_set.save(std_coeffs)

    # Standardize static data
    scaler_static = StandardScaler()
    all_wells_standardized_properties = scaler_static.fit_transform(properties)
    data_set_scaler_static = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_static.pickle")
    data_set_scaler_static.save(scaler_static)

    return [
        all_wells_standardized_input_flattened,
        all_wells_standardized_properties, all_wells_labels
    ]
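# Illustrative sketch only (not part of the pipeline above): how the pickled
# coefficient scaler saved by `standardisation` could be reloaded and applied
# to a new, unseen well, reproducing the same column-wise flattening. The file
# name "input_DWT_coeffs_new_well.csv" and the function name are hypothetical;
# the new well is assumed to have the same columns as the training coefficients.
def apply_saved_coeff_scaler_example(parameters: Dict) -> np.ndarray:
    data_set_scaler_coeffs = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_coeffs.pickle")
    scaler_coeffs = data_set_scaler_coeffs.load()

    new_coeffs = CSVLocalDataSet(
        filepath=parameters["path_primary"] +
        "/input_DWT_coeffs_new_well.csv").load()

    # transform only (no re-fit), so the new well stays on the training scale
    std_coeffs = scaler_coeffs.transform(new_coeffs.values)
    return np.transpose(std_coeffs).flatten()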
def dynamic_time_warping(Raw_Data_preprocessed, parameters):
    """Align each well's history to a reference well (B03-1P) using DTW on oil rate."""
    reference_well = CSVLocalDataSet(filepath=parameters["path_raw"] + "/B03-1P.csv")
    well_ref = reference_well.load()
    data = well_ref['Oil [bbl/d]'] / 6.28981  # convert bbl/d to m3/d (1 m3 = 6.28981 bbl)
    well_ref.insert(4, 'Oil [m3/d]', data)
    well_ref_oil_data = well_ref['Oil [m3/d]'].values

    Raw_Data_preprocessed_ = []
    distance_array = []
    for well_data in Raw_Data_preprocessed:
        well_oil_data = well_data['Oil [m3/d]'].values
        distance, path = fastdtw(well_ref_oil_data, well_oil_data, dist=euclidean)
        distance_array.append(distance)

        # The warping path pairs reference indices with well indices; rows of the
        # well that map to the same reference timestep are averaged.
        path = np.array(path)
        index_well = path[..., 1]
        index_ref_well = path[..., 0]
        well = well_data.iloc[index_well]
        well.insert(0, 'index_ref', index_ref_well)
        well = well.groupby('index_ref').mean()
        # well = well.reset_index(drop=True)
        Raw_Data_preprocessed_.append(well)

    distance_array = np.array(distance_array)
    return [distance_array, Raw_Data_preprocessed_]
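# Minimal sketch on toy data (not project data) showing the index mapping that
# `dynamic_time_warping` relies on: fastdtw returns a warping path of
# (reference_index, well_index) pairs. The series are reshaped to column
# vectors only so that scipy's `euclidean` always receives 1-D points; the
# function and variable names here are illustrative.
def dtw_alignment_example():
    import numpy as np
    from fastdtw import fastdtw
    from scipy.spatial.distance import euclidean

    ref = np.array([1.0, 2.0, 3.0, 4.0]).reshape(-1, 1)
    well = np.array([1.0, 1.5, 2.0, 3.5, 4.0]).reshape(-1, 1)

    distance, path = fastdtw(ref, well, dist=euclidean)
    path = np.array(path)           # rows are (ref_index, well_index) pairs
    index_ref_well = path[..., 0]   # which reference timestep each pair belongs to
    index_well = path[..., 1]       # which well row is matched to it
    return distance, index_ref_well, index_well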
def test_save_options_csv(self, tmp_path, sample_spark_df):
    # To cross check the correct Spark save operation we save to
    # a single spark partition with csv format and retrieve it with Kedro
    # CSVLocalDataSet
    temp_dir = Path(str(tmp_path / "test_data"))
    spark_data_set = SparkDataSet(
        filepath=str(temp_dir),
        file_format="csv",
        save_args={"sep": "|", "header": True},
    )
    spark_df = sample_spark_df.coalesce(1)
    spark_data_set.save(spark_df)

    single_csv_file = [
        f for f in temp_dir.iterdir() if f.is_file() and f.suffix == ".csv"
    ][0]

    csv_local_data_set = CSVLocalDataSet(filepath=str(single_csv_file),
                                         load_args={"sep": "|"})
    pandas_df = csv_local_data_set.load()

    assert pandas_df[pandas_df["name"] == "Alex"]["age"][0] == 31
def test_save_options_csv():
    # To cross check the correct Spark save operation we save to
    # a single spark partition with csv format and retrieve it with Kedro
    # CSVLocalDataSet
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = join(temp_dir, "test_data")
        spark_data_set = SparkDataSet(
            filepath=temp_path,
            file_format="csv",
            save_args={"sep": "|", "header": True},
        )
        spark_df = _get_sample_spark_data_frame().coalesce(1)
        spark_data_set.save(spark_df)

        single_csv_file = [
            join(temp_path, f) for f in listdir(temp_path) if f.endswith("csv")
        ][0]

        csv_local_data_set = CSVLocalDataSet(filepath=single_csv_file,
                                             load_args={"sep": "|"})
        pandas_df = csv_local_data_set.load()

        assert pandas_df[pandas_df["name"] == "Alex"]["age"][0] == 31
def preprocess_raw_data(parameters: Dict):
    """Load raw well CSVs, clean them, convert units and extract static properties."""
    import glob, os

    # os.chdir(parameters["path_raw"])
    os.chdir(parameters["path_raw_matlab"])
    files = []
    for file in glob.glob("*.csv"):
        files.append(file)

    filenames = []
    wells_data = []
    for file in files:
        filename, extension = file.split('.')
        filenames.append(filename)
    for file, filename in zip(files, filenames):
        io = DataCatalog({
            # filename: CSVLocalDataSet(filepath=parameters["path_raw"] + "/" + file),
            filename: CSVLocalDataSet(filepath=parameters["path_raw_matlab"] +
                                      "/" + file),
        })
        well_data = io.load(filename)
        wells_data.append(well_data)

    Raw_Data_preprocessed = []
    Raw_Data_dated = []
    wells_life = []
    wells_data_ = []
    for well in wells_data:
        # well = well[['Date', 'Injector Bottom Hole Pressure',
        #              'Producer Bottom Hole Pressure', 'ESP Speed',
        #              'Steam Flow Rate - Outer',
        #              'Emulsion Flow Rate']]
        well = well[[
            'Date', 'Speed [Hz]', 'Current [A]', 'IBHP', 'PBHP',
            'Co-Injection [E3m3/d]', 'Oil [bbl/d]', 'Steam [m3/d]',
            'Emulsion [m3/d]'
        ]]
        for i in range(1, len(well.columns)):
            well[well.columns[i]] = pd.to_numeric(well[well.columns[i]],
                                                  errors='coerce')
        well['Prod_Date'] = pd.to_datetime(well['Date'])
        well = well.set_index('Prod_Date')
        # well = well.resample('7D').mean()   # weekly data
        # well = well.resample('30D').mean()  # monthly data
        # well = well.rolling(30, min_periods=1).mean()
        data = well['Oil [bbl/d]'] / 6.28981  # convert bbl/d to m3/d
        well.insert(4, 'Oil [m3/d]', data)
        time_data = np.arange(len(well))
        well.insert(0, 'Timestep', time_data)
        wells_life.append(len(well))
        wells_data_.append(well)

    # Truncate every well to the shortest producing life in the training set
    min_well_length = np.min(np.array(wells_life))
    print((min_well_length, np.argmin(np.array(wells_life))))
    timesteps = min_well_length  # 1008
    # timesteps = 371
    for well, file, filename in zip(wells_data_, files, filenames):
        well = well.iloc[:timesteps]  # daily, weekly, monthly data
        # well = well.fillna(0)
        # well = well.fillna(well.rolling(30, min_periods=1).median())
        # well = well.fillna(well.median())
        well["Well"] = filename  # create a column for well name
        well = well.reset_index(drop=True)  # remove date index
        data_set = CSVLocalDataSet(filepath=parameters["path_intermediate"] +
                                   "/pre_processed_data_" + file)
        data_set.save(well)
        Raw_Data_dated.append(well)
        well = well.drop(columns=['Date', 'Well'])
        Raw_Data_preprocessed.append(well)

    # stats_training = CSVLocalDataSet(filepath=parameters["path_raw_static"] + "/static_P50_data_training.csv")
    stats_training = CSVLocalDataSet(
        filepath=parameters["path_raw_static_matlab"] +
        "/static_P50_data_training_152_wells.csv")
    stats = stats_training.load()
    stats_ROIP = stats.loc[:, 'ROIP']
    stats = stats.loc[:, 'Effective_Length':'BottomWater_Oil_Saturation']
    # # using only rich geostats and no bottom water properties
    # stats = stats.loc[:, 'Effective_Length':'Rich_Oil_Saturation']
    # # Using "Effective_Rich_Pay_Thickness" to account for standoff and rich thickness
    # data = stats['Rich_Pay_Thickness'] - stats['Stand_Off']
    # stats.insert(3, 'Effective_Rich_Pay_Thickness', data)
    # stats = stats.drop(columns=['Rich_Pay_Thickness', 'Stand_Off'])
    property_names = list(stats.columns)
    properties = list(stats.values)
    properties = np.array(properties)
    return [
        timesteps, Raw_Data_preprocessed, Raw_Data_dated, files, filenames,
        properties, stats_ROIP, property_names
    ]
def load_well_validation_data(dummy2, timesteps, parameters: Dict):
    """Load and clean validation wells, mirroring the training preprocessing."""
    import glob, os

    os.chdir(parameters["path_model_input"])
    files_val = []
    for file in glob.glob("*.csv"):
        files_val.append(file)

    filenames_val = []
    wells_data = []
    for file in files_val:
        filename, extension = file.split('.')
        filenames_val.append(filename)
    for file, filename in zip(files_val, filenames_val):
        io = DataCatalog({
            filename: CSVLocalDataSet(filepath=parameters["path_model_input"] +
                                      "/" + file),
        })
        well_data = io.load(filename)
        wells_data.append(well_data)

    Raw_Data_preprocessed_val = []
    wells_life = []
    wells_data_ = []
    for well in wells_data:
        # well = well[['Date', 'Injector Bottom Hole Pressure',
        #              'Producer Bottom Hole Pressure', 'ESP Speed',
        #              'Steam Flow Rate - Outer',
        #              'Emulsion Flow Rate']]
        well = well[[
            'Date', 'Speed [Hz]', 'Current [A]', 'IBHP', 'PBHP',
            'Co-Injection [E3m3/d]', 'Oil [bbl/d]', 'Steam [m3/d]',
            'Emulsion [m3/d]'
        ]]
        for i in range(1, len(well.columns)):
            well[well.columns[i]] = pd.to_numeric(well[well.columns[i]],
                                                  errors='coerce')
        well['Prod_Date'] = pd.to_datetime(well['Date'])
        well = well.set_index('Prod_Date')
        # well = well.dropna(axis=0)  # may change
        # well = well.resample('7D').mean()   # weekly data
        # well = well.resample('30D').mean()  # monthly data
        # well = well.rolling(30, min_periods=1).mean()
        data = well['Oil [bbl/d]'] / 6.28981  # convert bbl/d to m3/d
        well.insert(4, 'Oil [m3/d]', data)
        time_data = np.arange(len(well))
        well.insert(0, 'Timestep', time_data)
        wells_life.append(len(well))
        wells_data_.append(well)

    # Cap the validation window at the training window length
    min_well_length = np.min(np.array(wells_life))
    if min_well_length < timesteps:
        timesteps_validation = min_well_length
    else:
        timesteps_validation = timesteps
    for well, file, filename in zip(wells_data_, files_val, filenames_val):
        well = well.iloc[:timesteps_validation]  # daily data
        # well = well.fillna(0)
        # well = well.fillna(well.rolling(30, min_periods=1).median())
        # well = well.fillna(well.median())
        Raw_Data_preprocessed_val.append(well)

    stats_validation = CSVLocalDataSet(filepath=parameters["path_val_stats"] +
                                       "/static_P50_data_validation.csv")
    stats_val = stats_validation.load()
    stats_val_ROIP = stats_val.loc[:, 'ROIP']
    stats_val = stats_val.loc[:, 'Effective_Length':'BottomWater_Oil_Saturation']
    # # using only rich geostats and no bottom water properties
    # stats_val = stats_val.loc[:, 'Effective_Length':'Rich_Oil_Saturation']
    # # Using "Effective_Rich_Pay_Thickness" to account for standoff and rich thickness
    # data = stats_val['Rich_Pay_Thickness'] - stats_val['Stand_Off']
    # stats_val.insert(3, 'Effective_Rich_Pay_Thickness', data)
    # stats_val = stats_val.drop(columns=['Rich_Pay_Thickness', 'Stand_Off'])
    property_names_val = list(stats_val.columns)
    properties_val = list(stats_val.values)
    # properties_val = stats.loc[:, ['Effective_Length', 'Spacing', 'Effective_Rich_Pay_Thickness',
    #                                'Non_Rich_Pay_Thickness', 'Rich_Vertical_Permeability',
    #                                'Non_Rich_Vertical_Permeability', 'Rich_Porosity',
    #                                'Non_Rich_Porosity', 'Rich_Oil_Saturation',
    #                                'Non_Rich_Oil_Saturation']].values
    properties_val = np.array(properties_val)
    dummy11 = files_val
    return [
        dummy11, timesteps_validation, Raw_Data_preprocessed_val, files_val,
        filenames_val, properties_val, stats_val_ROIP, property_names_val
    ]
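# Illustrative sketch only: `load_well_validation_data` returns the validation
# properties unscaled, so a downstream step would presumably apply the static
# scaler fitted and pickled in `standardisation` rather than re-fitting on
# validation data. The function name and wiring here are assumptions, not part
# of the original pipeline code.
def standardise_validation_properties(properties_val: np.ndarray,
                                      parameters: Dict) -> np.ndarray:
    data_set_scaler_static = PickleLocalDataSet(
        filepath=parameters["path_models"] + "/scaler_static.pickle")
    scaler_static = data_set_scaler_static.load()
    # transform only: keeps the validation properties on the training scale
    return scaler_static.transform(properties_val)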