def fit(self, df, method: str = "min_max", columns: list = None, **kwargs):
    r"""
    Create the scaler registered under *method* and fit it on *df*.

    **Parameters**

    * **:param df:** data to fit. The code relies on ``df.values`` and
    ``df[columns]``.
    * **:param method:** (str) key of ``self.methods``, matched case-insensitively.
    * **:param columns:** (list) column names to fit on. When empty or None
    the whole *df* is used.
    * **:param kwargs:** forwarded unchanged to the scaler constructor.

    **returns**

    * Whatever the scaler's ``fit`` call returns.

    **raises**

    * **TypeError** when *method* is not a key of ``self.methods``.
    """
    scaler_key = method.lower()

    if scaler_key not in self.methods:
        # The original referenced a bare ``methods`` name here, which raised
        # NameError instead of the intended TypeError.
        raise TypeError(
            "{} method not available, availables methods: {}".format(
                method, list(self.methods)))

    # A fresh list per call: a literal ``[]`` default is shared between calls
    # and would also be shared through ``self.__columns``.
    columns = [] if columns is None else columns

    self.__scaler = self.methods[scaler_key](**kwargs)
    self.__columns = columns

    if not columns:
        column_name = Utils.get_column_names(df)

        if len(column_name) == 1:
            # Single-feature data is handed to the scaler as one 2D column.
            df = df.values.reshape(-1, 1)

        return self.__scaler.fit(df)

    if len(columns) == 1:
        # NOTE(review): this reshapes the whole of *df*, not ``df[columns]``;
        # kept as in the original - confirm that callers pass single-column data.
        df = df.values.reshape(-1, 1)

        return self.__scaler.fit(df)

    return self.__scaler.fit(df[columns])
def detect(self, df: pd.DataFrame, win_size: int = 30, step: int = 1, conf: float = 0.95, cols: list = None) -> pd.DataFrame:
    """
    Run outlier detection over sliding windows of a copy of *df*.

    The per-column work is delegated to the private first-step routine.
    According to the original documentation, detected outliers are imputed
    in the returned frame.

    **Parameters**

    * **:param df:** (pandas.DataFrame) input data; it is copied, not modified here
    * **:param win_size:** (int) window size passed to the window builder
    * **:param step:** (int) step passed to the window builder
    * **:param conf:** (float) accepted for API compatibility; not read by this method's body
    * **:param cols:** (list) columns to process, all columns when empty or None

    **returns**

    * **df:** (pandas.DataFrame) the working copy stored in ``self._df_``

    ______
    ### **Snippet code**

    ```python
    >>> import matplotlib.pyplot as plt
    >>> from rackio_AI import Outliers
    >>> df = pd.DataFrame(np.random.randn(1000,2), columns=["a", "b"])
    >>> out = Outliers()
    >>> df = out.add(df, percent=1)
    >>> df_imputed = out.detect(df, win_size=30)
    >>> ax = plt.plot(df["a"], '-r', df["b"], '-b', out.outliers["a"]["locs"], out.outliers["a"]["values"], 'rD', out.outliers["b"]["locs"], out.outliers["b"]["values"], 'bo', out.detected["a"]["locs"], out.detected["a"]["values"], 'kD', out.detected["b"]["locs"], out.detected["b"]["values"], 'ko')
    >>> ax = plt.legend(["a", "b", "a outliers", "b outliers", "a dectected", "b detected"])
    >>> plt.show()

    ```
    ![Detect Outlier](../img/impute_outliers.png)
    """
    self._df_ = df.copy()
    # Reset the results of any previous detection run.
    self.detected = {}

    target_cols = cols or Utils.get_column_names(self._df_)

    # Windows are built once and stored on the instance before the per-column pass.
    self._serie_list_ = Utils().get_windows(self._df_, win_size, step=step)
    self.__first_step_detect(target_cols, win_size=win_size, step=step)

    return self._df_
def add(self, df: pd.DataFrame, percent: float = 5, method: str = "tf", cols: list = None):
    """
    Inject outlier values into a copy of *df*.

    The insertion itself is delegated to the private first-step routine,
    which receives the selected columns plus *percent* and *method*.

    **Parameters**

    * **:param df:** (pandas.DataFrame) Data to add outliers to; it is copied first
    * **:param percent:** (float) outliers percent
    * **:param method:** (str) custom function name to calculate outlier
        * "tf": tukey-fence method
    * **:param cols:** (list) column names to add outliers, default None
        * If empty or None, outliers will be added to all columns

    **returns**

    * **df:** (pandas.DataFrame) the working copy stored in ``self._df_``

    ______
    ### **Snippet code**

    ```python
    >>> import matplotlib.pyplot as plt
    >>> from rackio_AI import Outliers
    >>> df = pd.DataFrame(np.random.randn(100,2), columns=["a", "b"])
    >>> out = Outliers()
    >>> df = out.add(df)
    >>> ax = plt.plot(df["a"], '-r', df["b"], '-b', out.outliers["a"]["locs"], out.outliers["a"]["values"], 'rD', out.outliers["b"]["locs"], out.outliers["b"]["values"], 'bD')
    >>> ax = plt.legend(["a", "b", "a outliers", "b outliers"])
    >>> plt.show()

    ```
    ![Add Outlier](../img/add_outliers.png)
    """
    # Reset the record of any previously added outliers.
    self.outliers = {}
    self._df_ = df.copy()

    target_cols = cols or Utils.get_column_names(df)
    self.__first_step_add(target_cols, percent=percent, method=method)

    return self._df_
def inverse(self, df):
    r"""
    Apply the fitted scaler's ``inverse_transform`` to *df*.

    **Parameters**

    * **:param df:** data to transform back. A pandas.DataFrame is rebuilt as
    a DataFrame with its column names; any other input is passed straight through.

    **returns**

    * pandas.DataFrame when *df* is a DataFrame, otherwise the raw result of
    ``inverse_transform``.
    """
    if not isinstance(df, pd.DataFrame):
        return self.__scaler.inverse_transform(df)

    # Column names are captured before transforming so they can be reattached.
    names = Utils.get_column_names(df)
    restored = self.__scaler.inverse_transform(df)

    return pd.DataFrame(restored, columns=names)
def add(self, df: pd.DataFrame, win_size: int = 30, method: str = "rhinehardt", cols: list = None, std_factor: float = 0.001) -> pd.DataFrame:
    """
    Add gaussian noise over subsequence windows of a copy of *df*.

    The noise generation is delegated to the private first-step routine,
    which receives the selected columns plus *win_size*, *method* and
    *std_factor*.

    **Parameters**

    * **:param df:** (pandas.DataFrame) input data; it is copied first
    * **:param win_size:** (int) window size to apply gaussian noise
    * **:param method:** (str) method to base gaussian noise
        * *rhinehardt* or *rh*
    * **:param cols:** (list) column names to add gaussian noise; all columns
    when empty or None
    * **:param std_factor:** (float) forwarded unchanged to the first-step routine

    **returns**

    * **df** (pandas.DataFrame) the working copy stored in ``self._df_``

    ______
    ### **Snippet code

    ```python
    >>> import matplotlib.pyplot as plt
    >>> from rackio_AI import Noise
    >>> df = pd.DataFrame(np.random.randn(100,2), columns=["a", "b"])
    >>> noise = Noise()
    >>> df_noisy = noise.add(df, win_size=10)
    >>> ax = plt.plot(df.index, df["a"], '-r', df.index, df["b"], '-b', df_noisy.index, df_noisy["a"], '--r', df_noisy.index, df_noisy["b"], '--b')
    >>> ax = plt.legend(["a", "b", "noisy a", "noisy b"])
    >>> plt.show()

    ```
    ![Add rhinehardt noise](../img/rhinehardt_noise.png)
    """
    self._df_ = df.copy()

    target_cols = cols or Utils.get_column_names(self._df_)
    self.__first_step_add(
        target_cols,
        win_size=win_size,
        method=method,
        std_factor=std_factor,
    )

    return self._df_
def data(self):
    """
    Return the loaded data held in ``self._data``.

    As a side effect, ``self.columns_name`` is refreshed from the current
    data on every access.

    **Parameters**

    None

    **:return:**

    * **data:** (pandas.DataFrame)
    """
    current_names = Utils.get_column_names(self._data)
    self.columns_name = current_names

    return self._data
def __change_columns(self, column_name):
    """
    Overwrite one column of ``self.data`` with the same column of ``self._data_``.

    Nothing happens when *column_name* is not among the column names of
    ``self.data``. The original documentation describes this as the
    per-column step run under a progress bar by the *change_columns*
    pipeline method.

    **Parameters**

    * **:param column_name:** column label to copy

    **returns**

    None
    """
    if column_name not in Utils.get_column_names(self.data):
        return

    replacement = self._data_.loc[:, column_name]
    self.data.loc[:, column_name] = replacement

    return
def data(self, value):
    """
    Store *value* as the loaded data.

    * On first assignment (no ``_data`` attribute yet) the column names are
    read from *value*.
    * A numpy array assigned afterwards is wrapped in a DataFrame using the
    stored ``columns_name``.
    * A DataFrame assigned afterwards replaces the data as is. If the
    previous data had MultiIndex columns, ``columns_name`` is first rebuilt
    as a MultiIndex with levels *tag*, *variable*, *unit*.

    **Parameters**

    * **:param value:** (pd.DataFrame or np.ndarray)

    **:return:**

    None

    **raises**

    * **TypeError** when *value* is neither a DataFrame nor an ndarray
    """
    if not isinstance(value, (pd.DataFrame, np.ndarray)):
        raise TypeError('value must be a pd.DataFrame or np.ndarray')

    if not hasattr(self, '_data'):
        self.columns_name = Utils.get_column_names(value)
        self._data = value
        return

    if isinstance(value, np.ndarray):
        self._data = pd.DataFrame(value, columns=self.columns_name)
        return

    # The check is made against the columns of the data being replaced.
    if isinstance(self._data.columns, pd.MultiIndex):
        self.columns_name = pd.MultiIndex.from_tuples(
            self.columns_name, names=['tag', 'variable', 'unit'])

    self._data = value
def __best(self, _iterator, **kwargs):
    """
    Score one ``(win_size, step)`` pair and append the result to
    ``self.optimizer_result``.

    Detection is run on ``kwargs['df']`` with the given pair. For each column
    a binary series of detected locations is compared with a binary series
    of known outlier locations (``self.outliers``) through the area under
    the precision-recall curve. The original documentation describes this as
    the per-item step run under a progress bar by *best_win_size_step*.

    **Parameters**

    * **:param _iterator:** pair ``(win_size, step)`` to evaluate
    * **:param kwargs:** must contain ``df``, the data to run detection on

    **returns**

    None
    """
    frame = kwargs['df']
    win_size, step = _iterator

    self.detect(frame, win_size=win_size, step=step)

    scores = {}

    for name in Utils.get_column_names(frame):
        row_index = frame[name].index

        # 1 where detection flagged a location, 0 elsewhere.
        predicted = pd.Series(0, index=row_index)
        predicted.loc[predicted.index.isin(self.detected[name]['locs'])] = 1

        # 1 where an outlier is known to be, 0 elsewhere.
        truth = pd.Series(0, index=row_index)
        truth.loc[truth.index.isin(self.outliers[name]['locs'])] = 1

        precision, recall, _ = precision_recall_curve(
            truth.values, predicted.values)

        scores[name] = {
            "win_size": win_size,
            "step": step,
            "auc": auc(recall, precision),
        }

    self.optimizer_result.append(scores)

    return
def fixnan(self, df: pd.DataFrame, key: str = "median", neighbors: int = 3, _round: bool = False, down: bool = False, decimals: int = 5) -> pd.DataFrame:
    """
    Fixes nan in dataframe columns by a key function

    The work is done by two private routines: a first step that receives
    every column name together with the options below, and a last step that
    receives the keys collected in ``self._dict_nonan_``. *df* is stored on
    the instance without being copied.

    **Parameters**

    * **:param df:** (pandas.DataFrame)
    * **:param key:** (str) Function's name to fix nan (validated case-insensitively)
        * *median*
        * *mean*
        * *std*
        * *var*
    * **:param neighbors:** (int) neighbors values to apply key function
    * **:param _round:** (bool)
        * If True the value fixed is rounded
    * **:param down:** (bool) round down if *_round* is True otherwise round up
    * **:param decimals:** (int): If *_round* is True, the value is rounded with these decimals

    **returns**

    * **df** (pandas.DataFrame) dataframe with nan values fixed

    **raises**

    * **TypeError** when *key* is not one of the supported function names

    ___
    ### **Snippet code

    ```python
    >>> import pandas as pd
    >>> import numpy as np
    >>> from rackio_AI import RackioAI
    >>> EDA = RackioAI.get(name="EDA core", _type='EDA')
    >>> df = pd.DataFrame(np.random.randn(10, 3), index=['a', 'b', 'c', 'd', 'f', 'g', 'h', 'i', 'j', 'k'], columns=['one', 'two', 'three'])
    >>> df2 = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k'])
    >>> df_fixed = EDA.fixnan(df2, _round=True)

    ```
    """
    supported = ["median", "mean", "std", "var"]

    if key.lower() not in supported:
        raise TypeError(
            "{} is key not valid, use: ['median', 'mean', 'std', 'var']".
            format(key))

    self._dict_nonan_ = {}
    self._df_ = df

    all_columns = Utils.get_column_names(df)
    self.__first_step_fixnan(
        all_columns,
        key=key,
        neighbors=neighbors,
        _round=_round,
        down=down,
        decimals=decimals,
    )

    # The last step only receives the keys collected in ``_dict_nonan_``.
    collected = list(self._dict_nonan_.keys())
    self.__last_step_fixnan(collected)

    return self._df_
def load(self, pathname: str, ext: str = ".tpl", reset_index=False, **kwargs):
    """
    Load data into DataFrame format:

    * **.tpl:** Is an [OLGA](https://www.petromehras.com/petroleum-software-directory/production-engineering-software/olga-dynamic-multiphase-flow-simulator)
    extension file.
    * **.pkl:** Numpy arrays or Pandas.DataFrame saved in pickle format.

    The loaded frame is stored in ``self._data`` and its column names in
    ``self.columns_name``. The index is rebuilt (old index dropped) when it
    contains duplicates or when *reset_index* is True.

    ___
    **Parameters**

    * **:param pathname:** (str) Filename or directory.
        * If the *pathname* is a directory, it will load all the files with extension *ext*.
        * If the *pathname* is a filename, it will load the file with a supported extension.
    * **:param ext:** (str) filename extension, it's necessary if pathname is a directory.
    Extensions supported are:
        * *.tpl* [OLGA](https://www.petromehras.com/petroleum-software-directory/production-engineering-software/olga-dynamic-multiphase-flow-simulator)
        extension file.
        * *.xls*
        * *.xlsx*
        * *.xlsm*
        * *.xlsb*
        * *.odf*
        * *.ods*
        * *.odt*
        * *.csv*
        * *.pkl* (Only if the pkl saved is a DataFrame)
    * **:param reset_index:** (bool) force a fresh index on the loaded data
    * **:param kwargs:** forwarded unchanged to ``self.reader.read``

    **:return:**

    * **data:** (pandas.DataFrame)

    ___
    ## Snippet code

    ```python
    >>> import os
    >>> from rackio_AI import RackioAI, get_directory
    >>> filename = os.path.join(get_directory('Leak'), 'Leak01.tpl')
    >>> df = RackioAI.load(filename)
    >>> print(df.head())
    tag      TIME_SERIES PT_SECTION_BRANCH_TUBERIA_PIPE_Pipe60_NR_1  ... CONTR_CONTROLLER_CONTROL_FUGA     file
    variable                                             Pressure  ...             Controller_output filename
    unit               S                                       PA  ...                                   .tpl
    0           0.000000                                 568097.3  ...                           0.0   Leak01
    1           0.502732                                 568098.2  ...                           0.0   Leak01
    2           1.232772                                 568783.2  ...                           0.0   Leak01
    3           1.653696                                 569367.3  ...                           0.0   Leak01
    4           2.200430                                 569933.5  ...                           0.0   Leak01
    <BLANKLINE>
    [5 rows x 12 columns]

    **Example loading a directory with .tpl files**

    >>> directory = os.path.join(get_directory('Leak'))
    >>> df = RackioAI.load(directory)
    >>> print(df.head())
    tag      TIME_SERIES PT_SECTION_BRANCH_TUBERIA_PIPE_Pipe60_NR_1  ... CONTR_CONTROLLER_CONTROL_FUGA     file
    variable                                             Pressure  ...             Controller_output filename
    unit               S                                       PA  ...                                   .tpl
    0           0.000000                                 568097.3  ...                           0.0   Leak01
    1           0.502732                                 568098.2  ...                           0.0   Leak01
    2           1.232772                                 568783.2  ...                           0.0   Leak01
    3           1.653696                                 569367.3  ...                           0.0   Leak01
    4           2.200430                                 569933.5  ...                           0.0   Leak01
    <BLANKLINE>
    [5 rows x 12 columns]

    **Example loading a directory with .csv files**

    >>> directory = os.path.join(get_directory('csv'), "Hysys")
    >>> df = RackioAI.load(directory, ext=".csv", _format="hysys")
    >>> print(df.head())
      (Time, [seconds]) (PIC-118 - PV, [kPa]) (PIC-118 - OP, [%]) (SPRDSHT-1 - Cell Matrix (G-16), []) (UIC-101 - OP, [%])
    1                 0               294.769                  42                              37.6105                  10
    2               0.3               294.769                  42                              37.6105                  10
    3               0.6               294.769                  42                              37.6105                  10
    4               0.9               294.769                  42                              37.6105                  10
    5               1.1               294.769                  42                              37.6105                  10

    >>> directory = os.path.join(get_directory('csv'), "VMGSim")
    >>> df = RackioAI.load(directory, ext=".csv", _format="vmgsim")
    >>> print(df.head())
      (time, s) (/Bed-1.In.MoleFlow, kmol/h) (/Bed-1.In.P, kPa)  ... (/Sep2.In.P, kPa) (/Sep3.In.P, kPa) (/Tail_Gas.In.T, C)
    1         1                  2072.582713        285.9299038  ...       315.8859771       291.4325134                 159
    2         2                  2081.622826        286.9027793  ...       315.8953772       292.3627861                 159
    3         3                   2085.98973        287.5966429  ...       316.0995398       293.0376745                 159
    4         4                  2089.323383        288.1380485  ...       316.3974799       293.5708836                 159
    5         5                  2092.214077         288.591646  ...       316.7350299       294.0200778                 159
    <BLANKLINE>
    [5 rows x 16 columns]

    **Example loading a .pkl with pandas.dataFrame**

    >>> filename = os.path.join(get_directory('pkl_files'), 'test_data.pkl')
    >>> df = RackioAI.load(filename)
    >>> print(df.head())
       Pipe-60 Totalmassflow_(KG/S)  Pipe-151 Totalmassflow_(KG/S)  Pipe-60 Pressure_(PA)  Pipe-151 Pressure_(PA)
    0                      37.83052                       37.83052               568097.3                352683.3
    1                      37.83918                       37.70243               568098.2                353449.8
    2                      37.83237                       37.67011               568783.2                353587.3
    3                      37.80707                       37.67344               569367.3                353654.8
    4                      37.76957                       37.69019               569933.5                353706.8

    ```
    """
    # check_path resolves the path and may adjust the extension it returns.
    resolved_name, resolved_ext = Utils.check_path(pathname, ext=ext)
    loaded = self.reader.read(resolved_name, ext=resolved_ext, **kwargs)

    self.columns_name = Utils.get_column_names(loaded)

    # A duplicated index is always rebuilt; otherwise only on request.
    # Dropping the index twice gives the same result as dropping it once.
    if loaded.index.has_duplicates or reset_index:
        loaded = loaded.reset_index(drop=True)

    self.columns_name = Utils.get_column_names(loaded)
    self._data = loaded

    return loaded
def split_sequences(self, df: pd.DataFrame, timesteps, stepsize: int = 1, input_cols: list = None, output_cols: list = None, maxlen=None, dtype: str = 'int32', padding: str = 'pre', truncating: str = 'pre', value: float = 0.):
    """
    Splits dataframe in a 3D numpy array format supported by LSTM architectures
    using sliding windows concept.

    This method allocates zero-filled ``self.x_sequences`` and
    ``self.y_sequences``, resets ``self.start`` to 0 and hands the window
    start positions, together with the raw input/output arrays and the
    padding options, to the private worker routine.

    **Parameters**

    * **:param df:** (pandas.DataFrame) Contains inputs and outputs data
    * **:param timesteps:** (list or int) Timestep for each input variable.
        * If timestep is an int value, all input columns will be the same timestep
        * If timestep is a list, must be same length that input_cols argument
    * **:param stepsize:** (int, default = 1) step size for the sliding window
    * **:param input_cols:** (list, default = None) Column names that represents the input variables to LSTM
        * If input_cols is None the method assumes that inputs are all column except the last one.
    * **:param output_cols:** (list, default = None) Column names that represents the output variables to LSTM
        * If output_cols is None the method assumes that output is the last column.

    The rest of parameters represent the parameters for *pad_sequences* method,
    see its description.

    **returns**

    **sequences** (3D numpy array) dimensions (df.shape[0] - max(timesteps), max(timesteps), features)

    **raises**

    * **ValueError** when *timesteps* is a list whose length differs from *input_cols*

    ```python
    >>> import numpy as np
    >>> from rackio_AI import RackioAI
    >>> a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90]).reshape(-1,1)
    >>> b = np.array([15, 25, 35, 45, 55, 65, 75, 85, 95]).reshape(-1,1)
    >>> c = np.array([a[i]+b[i] for i in range(len(a))]).reshape(-1,1)
    >>> data = np.hstack((a,b,c))
    >>> data
    array([[ 10,  15,  25],
           [ 20,  25,  45],
           [ 30,  35,  65],
           [ 40,  45,  85],
           [ 50,  55, 105],
           [ 60,  65, 125],
           [ 70,  75, 145],
           [ 80,  85, 165],
           [ 90,  95, 185]])
    >>> df = pd.DataFrame(data, columns=['a', 'b', 'c'])
    >>> preprocess = RackioAI.get("Preprocessing", _type="Preprocessing")
    >>> x, y = preprocess.lstm_data_preparation.split_sequences(df, 2)
    >>> x.shape
    (8, 2, 2)
    >>> x
    array([[[10., 15.],
            [20., 25.]],
    <BLANKLINE>
           [[20., 25.],
            [30., 35.]],
    <BLANKLINE>
           [[30., 35.],
            [40., 45.]],
    <BLANKLINE>
           [[40., 45.],
            [50., 55.]],
    <BLANKLINE>
           [[50., 55.],
            [60., 65.]],
    <BLANKLINE>
           [[60., 65.],
            [70., 75.]],
    <BLANKLINE>
           [[70., 75.],
            [80., 85.]],
    <BLANKLINE>
           [[80., 85.],
            [90., 95.]]])
    >>> y.shape
    (8, 1, 1)
    >>> y
    array([[[ 45.]],
    <BLANKLINE>
           [[ 65.]],
    <BLANKLINE>
           [[ 85.]],
    <BLANKLINE>
           [[105.]],
    <BLANKLINE>
           [[125.]],
    <BLANKLINE>
           [[145.]],
    <BLANKLINE>
           [[165.]],
    <BLANKLINE>
           [[185.]]])

    ```
    """
    # Default column split: every column but the last is an input, the last is the output.
    if not input_cols:
        input_cols = Utils.get_column_names(df)[:-1]

    if not output_cols:
        output_cols = [Utils.get_column_names(df)[-1]]

    # A scalar timestep is broadcast to one entry per input column.
    if not isinstance(timesteps, list):
        timesteps = [timesteps] * len(input_cols)
    elif len(timesteps) != len(input_cols):
        raise ValueError(
            'timesteps and input_cols arguments must be same length')

    input_data = df.loc[:, input_cols].values
    output_data = df.loc[:, output_cols].values

    longest = max(timesteps)
    n_rows = input_data.shape[0]
    # Start position of every window handed to the worker.
    iteration = list(range(0, n_rows - longest + stepsize, stepsize))

    n_windows = len(iteration)
    self.x_sequences = np.zeros((n_windows, longest, len(input_cols)))
    self.y_sequences = np.zeros((n_windows, 1, len(output_cols)))
    self.start = 0

    self.__split_sequences(
        iteration,
        output_data=output_data,
        input_data=input_data,
        timesteps=timesteps,
        maxlen=maxlen,
        dtype=dtype,
        padding=padding,
        truncating=truncating,
        value=value,
    )

    return self.x_sequences, self.y_sequences