コード例 #1
0
ファイル: audit.py プロジェクト: Wajsbrot/bnboost
def make_audit_file(infile='../data/sample_adinfo_1.csv', desc_file=None,
                    desc_dir='../out', decimal_sep='.', sep=','):
    """ Generate a per-column description file for the data in a csv file

    For every column of `infile` the report holds: its kind ('cat' when the
    column is returned by `find_categorical`, 'num' otherwise), the
    percentage of null values, the number of distinct values, the
    mean / min / max computed by `DataFrame.describe` and the 10 most
    frequent values. The report is written as a ';'-separated csv with
    floats formatted to two decimals.

    Parameters
    ----------
    infile: str
        csv file containing data
    desc_file: str
        csv file in which output is written; when None, it is built from
        `desc_dir` and the name of `infile` with its extension replaced by
        '_desc.csv'
    desc_dir: str
        directory where description file is written (only used when
        `desc_file` is None)
    decimal_sep: str
        decimal values separator
    sep: str
        csv file columns separator

    """
    # Local import so the block does not depend on the file-level imports.
    from os.path import splitext

    def top_values(df, n_values=10, sep=' ', with_count=True):
        """ Retrieve most represented modalities for each column of a dataframe

        Parameters
        ----------
        df: pandas.DataFrame
            input dataframe
        n_values: int
            numbers of top modalities kept
        sep: str
            modalities separator in output column
        with_count: bool
            if True, write count of modalities in percent next to it

        Returns
        -------
        pandas.Series
            Series with the input DataFrame column names as indices, and the
            top modalities concatenated in a single line as values

        """
        col_names = []
        summaries = []
        for col_name in df:
            top = df[col_name].value_counts(normalize=True).iloc[:n_values]
            if with_count:
                labels = [str(value) + ' (' + str(round(100 * freq, 2)) + ' %)'
                          for value, freq in zip(top.index, top.values)]
            else:
                labels = top.index.astype(str)
            col_names.append(col_name)
            summaries.append(sep.join(labels))
        # Build the Series once instead of calling Series.append in the loop:
        # append copied the whole Series at every step and no longer exists
        # in pandas >= 2.0.
        return pd.Series(summaries, index=col_names, dtype=object)

    if desc_file is None:
        # splitext handles any extension length, unlike slicing off the last
        # four characters.
        stem = splitext(basename(infile))[0]
        desc_file = join(desc_dir, stem + '_desc.csv')

    df = pd.read_csv(infile, sep=sep, decimal=decimal_sep)

    # One row per input column, one column per statistic.
    raw_desc = df.describe(include='all').T

    null_percent = 100 * df.isnull().sum() / df.shape[0]
    null_percent = null_percent.to_frame(name='null_percent')

    nunique = df.apply(pd.Series.nunique).to_frame(name='n_unique')

    top = top_values(df).to_frame(name='top_10')

    cat_cols = find_categorical(df)
    col_type = pd.Series(raw_desc.index, index=raw_desc.index,
                         name='col_type').isin(cat_cols)
    col_type = col_type.apply(lambda x: 'cat' if x else 'num')

    desc = pd.concat([raw_desc, null_percent, col_type, nunique, top],
                     axis=1, copy=False)

    kept_columns = ['col_type', 'null_percent', 'n_unique', 'mean',
                    'min', 'max', 'top_10']

    # reindex (rather than desc[kept_columns]) keeps the report layout stable
    # when describe() produced no 'mean'/'min'/'max' because the input has no
    # numeric column: the missing statistics are written as empty cells
    # instead of raising KeyError.
    desc.reindex(columns=kept_columns).to_csv(desc_file, sep=';',
                                              float_format='%.2f')
コード例 #2
0
ファイル: plots.py プロジェクト: Wajsbrot/bnboost
def make_plots(infile='../data/sample_ctxoeuv_1.csv', max_modalities=10,
               decimal_sep='.', sep=','):
    """ Plot distribution for features in infile

    Two figures are saved under '../figures/', named after `infile` without
    its extension:

    * '<name>_numerical.png': histograms of the numerical features, using
      only the non-zero values lying between the 10 % and 90 % quantiles of
      each feature;
    * '<name>_categorical.png': one bar chart of normalized value counts
      (missing values included) per categorical feature.

    A figure is skipped when there is no feature of the corresponding kind.

    Parameters
    ----------
    infile: str
        input csv file name
    max_modalities: int
        categorical features are plotted only if they have strictly fewer
        distinct values than this
    decimal_sep: str
        decimal values separator
    sep: str
        csv file columns separator

    """
    # Local import so the block does not depend on the file-level imports.
    from os.path import splitext

    df = pd.read_csv(infile, sep=sep, decimal=decimal_sep)

    df.dropna(axis=1, how='all', inplace=True)  # remove empty columns

    categorical_cols = find_categorical(df)

    # get rid of many-modalities columns
    categorical_cols = [c for c in df[categorical_cols]
                        if (df[c].value_counts().shape[0] < max_modalities)]

    # Numerical features are the int/float columns that are not plotted as
    # categorical; the file order is kept so the figure layout is stable.
    plotted_as_categorical = set(categorical_cols)
    numerical_cols = [
        c for c in df.select_dtypes(include=['int', 'float']).columns
        if c not in plotted_as_categorical]

    # Same stem for both figures (extension stripped whatever its length).
    figure_stem = '../figures/' + splitext(basename(infile))[0]

    # ----- Plot numerical features ----
    if numerical_cols:
        numerical = df[numerical_cols]
        # Build the mask from the numerical columns only, so non-numeric
        # columns never take part in the quantile computation/comparison.
        in_range = ((numerical >= numerical.quantile(0.1)) &
                    (numerical <= numerical.quantile(0.9)) &
                    (numerical != 0))
        trimmed = numerical[in_range].dropna(axis=1, how='all')
        if trimmed.shape[1] > 0:
            # NOTE(review): `normed` was removed in matplotlib 3.1; switch
            # to `density=True` once the minimum matplotlib is >= 2.1.
            trimmed.hist(figsize=(20, 20), normed=True)
            plt.savefig(figure_stem + '_numerical.png')
            plt.clf()

    # ----- Plot categorical features ----
    n = len(categorical_cols)
    if n == 0:
        return  # nothing to draw; subplots() cannot build an empty grid

    # Near-square grid with at least n cells: ceil(sqrt(n)) columns and as
    # many rows as needed (integer ceiling division).
    n_cols = int(ceil(sqrt(n)))
    n_rows = (n + n_cols - 1) // n_cols
    # squeeze=False keeps `axes` 2-D even for a single row or column.
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols,
                             figsize=(3*n_cols, 3*n_rows), squeeze=False)
    cells = axes.ravel()  # row-major: feature i is drawn in cell i

    for unused in cells[n:]:
        unused.axis('off')  # switch off unused subplots

    for ax, c in zip(cells, categorical_cols):
        df[c].value_counts(normalize=True, dropna=False).plot(kind='bar',
                                                              ax=ax)
        ax.set_title(c)
        ax.yaxis.set_major_formatter(formatter)
        ax.set_yticks([0, 0.2, 0.4, 0.6, 0.8, 1.])
    fig.tight_layout()
    plt.savefig(figure_stem + '_categorical.png')