コード例 #1
0
    def regex(self, df):
        """Run the column formatting step on df and report the outcome.

            Args:
            -----
                df: data handed to formatCols

            Return:
            -------
                the object returned by formatCols
        """
        logger.log("Apply regex on the string rows", logging.INFO)
        formatted = formatCols(df)
        # The shape goes to stdout, the dtypes go to the debug log.
        print(formatted.shape)
        dtypes_message = "New dtypes is {}".format(formatted.dtypes)
        logger.log(dtypes_message, logging.DEBUG)

        return formatted
コード例 #2
0
    def date_cols(self, df):
        """Run buildColsFromDateCols on df and print the resulting shape.

            Args:
            -----
                df: data handed to buildColsFromDateCols

            Return:
            -------
                the object returned by buildColsFromDateCols
        """
        announcement = "Build new columns thanks to the datetime columns"
        logger.log(announcement, logging.INFO)
        expanded = buildColsFromDateCols(df)
        print(expanded.shape)

        return expanded
コード例 #3
0
    def read(self):
        """Load the file described by this instance through readFile.

            Uses self.filepath, self.encoding, self.sep and
            self.infer_datetime as the arguments of readFile.

            Return:
            -------
                the object returned by readFile
        """
        source = self.filepath
        logger.log("Reading the file {}".format(source), logging.INFO)
        loaded = readFile(
            source,
            self.encoding,
            self.sep,
            self.infer_datetime,
        )
        # The shape goes to stdout, the dtypes go to the debug log.
        print(loaded.shape)
        dtypes_message = "Inital dtypes is {}".format(loaded.dtypes)
        logger.log(dtypes_message, logging.DEBUG)

        return loaded
コード例 #4
0
ファイル: feature.py プロジェクト: Jwuthri/DataPreprocess
    def transformCols(self):
        """Call transformCol once for every entry of self.cols_transform.

            Each column is logged at DEBUG level right before it is handed
            to transformCol; columns are visited in iteration order.

            Return:
            -------
                None (the method has no return statement)
        """
        for name in self.cols_transform:
            message = "Transform {}".format(name)
            logger.log(message, logging.DEBUG)
            self.transformCol(name)
コード例 #5
0
ファイル: feature.py プロジェクト: Jwuthri/DataPreprocess
    def derivateCols(self):
        """Call derivateCol for every ordered pair of different columns.

            The pairs come from the cartesian product of self.cols_transform
            with itself (n * n candidates for n columns); pairs whose two
            members compare equal are skipped.

            Return:
            -------
                None (the method has no return statement)
        """
        cols = self.cols_transform
        # Filter on value equality rather than position so that repeated
        # column names are skipped as well.
        distinct_pairs = (
            (left, right)
            for left, right in itertools.product(cols, cols)
            if left != right
        )
        for left, right in distinct_pairs:
            message = "Derivate {} / {}".format(left, right)
            logger.log(message, logging.DEBUG)
            self.derivateCol(left, right)
コード例 #6
0
ファイル: fill.py プロジェクト: Jwuthri/DataPreprocess
    def fill(self):
        """Cluster a copy of self.dataframe and fill it via fillCols.

            Steps: copy self.dataframe, pass the copy to self.wcss, derive a
            cluster count with self.computeOptimalCluster, store the output
            of self.clustering in a "Cluster" column, then delegate to
            self.fillCols.

            Return:
            -------
                the object returned by self.fillCols
        """
        working = self.dataframe.copy()
        nb_cluster = self.computeOptimalCluster(self.wcss(working))
        message = "Optimal nb of cluster is: {}".format(nb_cluster)
        logger.log(message, logging.DEBUG)
        working["Cluster"] = self.clustering(working, nb_cluster)

        return self.fillCols(working)
コード例 #7
0
ファイル: fill.py プロジェクト: Jwuthri/DataPreprocess
    def fillCols(self, dataframe):
        """Replace every column listed in self.cols with its filled version.

            Args:
            -----
                dataframe (pandas.DataFrame): data holding a "Cluster" column

            Return:
            -------
                pandas.DataFrame without the "Cluster" column
        """
        for name in self.cols:
            logger.log("Filling NaN, column: {}".format(name), logging.DEBUG)
            filled = self.fillCol(dataframe, name)
            dataframe[name] = filled

        # "Cluster" is dropped from the returned frame only.
        return dataframe.drop(labels="Cluster", axis=1)
コード例 #8
0
    def scaleCols(self, dataframe):
        """Pass every column of the dataframe through self.scaleCol.

            The columns are overwritten in place on the given dataframe.

            Args:
            -----
                dataframe (pandas.DataFrame): data

            Return:
            -------
                the same pandas.DataFrame object, columns reassigned
        """
        for name in dataframe.columns:
            logger.log("Scale column {}".format(name), logging.DEBUG)
            scaled = self.scaleCol(dataframe[name], name)
            dataframe[name] = scaled

        return dataframe
コード例 #9
0
def readFile(filepath, encoding="utf-8-sig", sep=",", infer_datetime=True):
    """Read a csv file.

        When infer_datetime is truthy, the first rows are read once to learn
        the column names, and every column is then offered to pandas as a
        date candidate through parse_dates.

        Args:
        -----
            filepath (str): the path of the data file
            encoding (str): the encoding type
            sep (char): the delimiter
            infer_datetime (bool): try to optimize datetime parsing

        Return:
        -------
            pandas.DataFrame with data
    """
    dargs = {
        "encoding": encoding,
        "sep": sep,
        "engine": "python",
        "filepath_or_buffer": filepath,
    }
    logger.log("Read csv file: {}".format(filepath), logging.DEBUG)

    if infer_datetime:
        # Peek at a handful of rows only to discover the column names; nrows
        # is passed separately so the shared arguments are never mutated.
        columns = list(pd.read_csv(nrows=5, **dargs).columns)
        # NOTE(review): infer_datetime_format is deprecated in recent pandas
        # releases - confirm against the pandas version the project pins.
        dargs.update({
            "parse_dates": columns,
            "infer_datetime_format": infer_datetime,
        })
        logger.log("args: {}".format(dargs), logging.DEBUG)

    return pd.read_csv(**dargs)
コード例 #10
0
ファイル: feature.py プロジェクト: Jwuthri/DataPreprocess
    def featurize(self):
        """Run the enabled feature-building steps and return self.dataframe.

            self.transformations gates transformCols and self.derivate gates
            derivateCols; a skipped step is reported at WARNING level.

            Return:
            -------
                self.dataframe
        """
        if not self.transformations:
            logger.log("We won't transform features", logging.WARNING)
        else:
            self.transformCols()

        if not self.derivate:
            logger.log("We won't derivate features", logging.WARNING)
        else:
            self.derivateCols()

        return self.dataframe
コード例 #11
0
    def scaleCol(self, serie, col):
        """Scale the serie with a StandardScaler, best effort.

            Any exception raised while scaling is logged at ERROR level and
            the input is handed back untouched.

            Args:
            -----
                serie (pandas.Serie): serie to scale
                col: name of the column (currently unused by this method)

            Return:
            -------
                the output of StandardScaler().fit_transform, or the
                original serie when scaling failed
        """
        # Silence warnings only for the duration of this call so the
        # process-wide warning filters are restored afterwards.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            try:
                # NOTE(review): recent scikit-learn versions appear to reject
                # 1-D input here, in which case this always falls back to the
                # unscaled serie - confirm against the pinned version.
                return StandardScaler().fit_transform(serie)
            except Exception as e:
                logger.log("{}".format(e), logging.ERROR)
                return serie
コード例 #12
0
ファイル: dummy.py プロジェクト: Jwuthri/DataPreprocess
    def dummiefication(self, dataframe):
        """Prepend dummy columns for low-cardinality object columns.

            Every object-dtype column is logged; those with fewer than 10
            distinct values are encoded with pandas.get_dummies (first level
            dropped, column name used as prefix) and the result is placed in
            front of the existing columns. The source column is kept.

            Args:
            -----
                dataframe (pandas.DataFrame): data

            Return:
            --------
                pandas.DataFrame with the dummy columns added
        """
        is_object = dataframe.dtypes == object
        candidates = list(is_object[is_object].index)
        for name in candidates:
            logger.log("Dumify column {}".format(name), logging.DEBUG)
            if len(dataframe[name].unique()) >= 10:
                # Too many distinct values: leave this column alone.
                continue
            encoded = pd.get_dummies(dataframe[name],
                                     prefix=name,
                                     drop_first=True)
            dataframe = pd.concat([encoded, dataframe], axis=1)

        return dataframe
コード例 #13
0
def GetX_Y(df, col_y, col_to_remove=None):
    """Select X and y dataframes.

        The input dataframe is left untouched. Requested columns that are
        missing from X are reported at WARNING level and skipped.

        Args:
        -----
            df (pandas.DataFrame): the datas
            col_y (str): col to predict
            col_to_remove (list): columns you don't want to use;
                None (the default) means no extra column is removed

        Returns:
        -------
            pandas.DataFrame X and y
    """
    # A None default avoids sharing one mutable list between calls.
    if col_to_remove is None:
        col_to_remove = []

    y = df[[col_y]]
    X = df.drop([col_y], axis=1)
    for col in col_to_remove:
        if col in X:
            X = X.drop([col], axis=1)
            logger.log("Remove column {}".format(col), logging.DEBUG)
        else:
            logger.log("Col {} not in the dataframe".format(col),
                       logging.WARNING)

    return X, y
コード例 #14
0
    def split(self, df):
        """Split df into features and target through GetX_Y.

            self.y_col is the target column and self.col_to_drop the columns
            to discard. The shape printed is the one of the input df.

            Args:
            -----
                df: data handed to GetX_Y

            Return:
            -------
                the (X, y) pair returned by GetX_Y
        """
        logger.log("Split dataframe, and remove useless cols", logging.INFO)
        features, target = GetX_Y(df, self.y_col, self.col_to_drop)
        print(df.shape)

        return features, target
コード例 #15
0
    def feature(self, df, orginal_cols):
        """Build features by running FeatureEngineering(...).featurize().

            Args:
            -----
                df: data handed to FeatureEngineering
                orginal_cols: forwarded as the cols keyword argument

            Return:
            -------
                the object returned by featurize
        """
        logger.log("Make some feature engineering", logging.INFO)
        engineer = FeatureEngineering(df, cols=orginal_cols)
        enriched = engineer.featurize()
        print(enriched.shape)

        return enriched
コード例 #16
0
    def scale(self, df):
        """Scale df by running ScaleData(df).scale().

            Args:
            -----
                df: data handed to ScaleData

            Return:
            -------
                the object returned by ScaleData.scale
        """
        logger.log("Scale the data", logging.INFO)
        scaler = ScaleData(df)
        scaled = scaler.scale()
        print(scaled.shape)

        return scaled
コード例 #17
0
    def dummy(self, df):
        """Encode df by running Dummify(df).dummies().

            Args:
            -----
                df: data handed to Dummify

            Return:
            -------
                the object returned by Dummify.dummies
        """
        logger.log("Dummify categoric variables", logging.INFO)
        encoder = Dummify(df)
        encoded = encoder.dummies()
        print(encoded.shape)

        return encoded
コード例 #18
0
    def fill(self, df):
        """Fill df by running FillNaN(df).fill().

            Args:
            -----
                df: data handed to FillNaN

            Return:
            -------
                the object returned by FillNaN.fill
        """
        logger.log("Filling the NaN values", logging.INFO)
        filler = FillNaN(df)
        completed = filler.fill()
        print(completed.shape)

        return completed