def transformCols(self):
    """Run ``transformCol`` on every column listed in ``cols_transform``.

    Each column name is logged at DEBUG level right before it is handed
    to ``self.transformCol``.

    Return:
    -------
        None; all the work is delegated to ``self.transformCol``
    """
    columns = self.cols_transform
    for name in columns:
        message = "Transform {}".format(name)
        logger.log(message, logging.DEBUG)
        self.transformCol(name)
def readFile(filepath, encoding="utf-8-sig", sep=",", infer_datetime=True,
             decimal=',', thousands='.'):
    """Read a csv file.

    Args:
    -----
        filepath (str): the path of the data file
        encoding (str): the encoding type
        sep (char): the delimiter
        infer_datetime (bool): when true, every column is passed to
            pandas as a date candidate (``parse_dates``) and the flag is
            forwarded as ``infer_datetime_format``
        decimal (char): character recognised as the decimal point
        thousands (char): character recognised as thousands separator

    Return:
    -------
        pandas.DataFrame with data
    """
    def getColumns(dargs):
        """Get all columns names.

        Only the first five rows are parsed; ``dargs`` is left untouched.

        Arg:
        -----
            dargs (dict): args to read the csv file

        Return:
        -------
            list of all columns in the dataframe
        """
        preview_args = dict(dargs, nrows=5)
        return list(pd.read_csv(**preview_args).columns)

    dargs = {
        "encoding": encoding,
        "sep": sep,
        "decimal": decimal,
        "engine": "python",
        "filepath_or_buffer": filepath,
        "thousands": thousands,
    }
    logger.log("Read csv file: {}".format(filepath), logging.DEBUG)
    if infer_datetime:
        # The column preview is only needed to build ``parse_dates``, so
        # the file is not parsed twice when date inference is disabled.
        # NOTE(review): ``infer_datetime_format`` is deprecated in
        # pandas >= 2.0 -- confirm the targeted pandas version.
        dargs.update({
            "parse_dates": getColumns(dargs),
            "infer_datetime_format": infer_datetime,
        })
    logger.log("args: {}".format(str(dargs)), logging.DEBUG)
    return pd.read_csv(**dargs)
def fill(self):
    """Fill a copy of ``self.dataframe``.

    The copy is labelled with a temporary "Cluster" column (values come
    from ``self.clustering``, using the cluster count chosen by
    ``self.computeOptimalCluster`` from ``self.wcss``) and is then
    passed to ``self.fillCols``.

    Return:
    -------
        whatever ``self.fillCols`` returns for the labelled copy
    """
    working = self.dataframe.copy()
    optimal = self.computeOptimalCluster(self.wcss(working))
    logger.log("Optimal nb of cluster is: {}".format(optimal),
               logging.DEBUG)
    labels = self.clustering(working, optimal)
    working["Cluster"] = labels
    return self.fillCols(working)
def fillCols(self, dataframe):
    """Replace each column of ``self.cols`` by its filled version.

    Args:
    -----
        dataframe (pandas.DataFrame): data, holding a "Cluster" column

    Return:
    -------
        the result of dropping the "Cluster" column from ``dataframe``
        once every column has been reassigned from ``self.fillCol``
    """
    for name in self.cols:
        logger.log("Filling NaN, column: {}".format(name), logging.DEBUG)
        filled = self.fillCol(dataframe, name)
        dataframe[name] = filled
    without_cluster = dataframe.drop("Cluster", axis=1)
    return without_cluster
def derivateCols(self):
    """Call ``derivateCol`` on every ordered pair of distinct columns.

    Pairs are taken from the cartesian product of ``cols_transform``
    with itself; pairs whose two members compare equal are skipped.
    Each remaining pair is logged at DEBUG level before the call.

    Return:
    -------
        None; the work is delegated to ``self.derivateCol``
    """
    every_pair = itertools.product(self.cols_transform, self.cols_transform)
    distinct_pairs = (pair for pair in every_pair if pair[0] != pair[1])
    for left, right in distinct_pairs:
        message = "Derivate {} / {}".format(left, right)
        logger.log(message, logging.DEBUG)
        self.derivateCol(left, right)
def featurize(self):
    """Run the enabled feature-building steps.

    ``transformCols`` runs when ``self.transformations`` is truthy and
    ``derivateCols`` runs when ``self.derivate`` is truthy; a skipped
    step is reported at WARNING level instead.

    Return:
    -------
        ``self.dataframe``
    """
    if not self.transformations:
        logger.log("We won't transform features", logging.WARNING)
    else:
        self.transformCols()

    if not self.derivate:
        logger.log("We won't derivate features", logging.WARNING)
    else:
        self.derivateCols()

    return self.dataframe
def formatCols(df):
    """Formats all object columns of the dataframe.

    Every object-dtype column is mapped through ``formatStr`` and then
    cast to float when the cast succeeds; otherwise the mapped values
    are kept as they are. The dataframe is modified in place.

    Arg:
    ----
        df (pandas.DataFrame): datas

    Return:
    -------
        pandas.DataFrame formatted (the same object as ``df``)
    """
    cols = df.select_dtypes(include=["object"]).columns
    for col in cols:
        logger.log("Format col: {}".format(col), logging.DEBUG)
        df[col] = df[col].map(formatStr)
        try:
            as_float = df[col].astype(float)
        except Exception:
            # Deliberately broad best-effort: a column that cannot be
            # cast simply keeps its mapped values.
            continue
        # Only reached when the cast worked, so an unbound or stale
        # result can never be written back into the column.
        logger.log("Col {} has been cast into float".format(col),
                   logging.DEBUG)
        df[col] = as_float
    return df
def GetX_Y(df, col_y, col_to_remove=None):
    """Select X and y dataframes.

    ``df`` itself is never modified: X is built from a dropped copy.

    Args:
    -----
        df (pandas.DataFrame): the datas
        col_y (str): col to predict
        col_to_remove (list): columns you don't want to use; ``None``
            (the default) removes nothing

    Returns:
    -------
        pandas.DataFrame X and y
    """
    # ``None`` instead of a mutable ``[]`` default, which would be one
    # list object shared by every call.
    if col_to_remove is None:
        col_to_remove = ()

    y = df[[col_y]]
    X = df.drop([col_y], axis=1)
    for col in col_to_remove:
        if col in X:
            X = X.drop([col], axis=1)
            logger.log("Remove column {}".format(col), logging.DEBUG)
        else:
            logger.log("Col {} not in the dataframe".format(col),
                       logging.WARNING)
    return X, y