def make_audit_file(infile='../data/sample_adinfo_1.csv', desc_file=None,
                    desc_dir='../out', decimal_sep='.', sep=','):
    """
    Generate a description file for the data contained in a csv file

    One row is written per column of the input file, with the columns
    ``col_type``, ``null_percent``, ``n_unique``, ``mean``, ``min``, ``max``
    and ``top_10``. The output uses ``;`` as separator and two decimals for
    floating point values.

    Parameters
    ----------
    infile: str
        csv file containing data
    desc_file: str
        csv file in which output is written. If None, it is built from
        `desc_dir` and the name of `infile` (extension stripped) followed by
        ``_desc.csv``
    desc_dir: str
        directory where description file is written (only used when
        `desc_file` is None)
    decimal_sep: str
        decimal values separator
    sep: str
        csv file columns separator
    """
    # Local import: only `join` and `basename` are known to be imported at
    # module level.
    from os.path import splitext

    def top_values(df, n_values=10, sep=' ', with_count=True):
        """
        Retrieve most represented modalities for each column of a dataframe

        Parameters
        ----------
        df: pandas.DataFrame
            input dataframe
        n_values: int
            number of top modalities kept
        sep: str
            modalities separator in output column
        with_count: bool
            if True, write the share of each modality (in percent) next to it

        Returns
        -------
        pandas.Series
            Series with the input DataFrame column names as indices, and the
            top modalities concatenated in a single line as values
        """
        col_names = []
        summaries = []
        for col_name in df:
            top = df[col_name].value_counts(normalize=True).iloc[:n_values]
            if with_count:
                # Pair index and values directly rather than looking each
                # label up again in the Series.
                parts = [str(label) + ' (' + str(round(100 * share, 2)) + ' %)'
                         for label, share in zip(top.index, top.values)]
            else:
                parts = top.index.astype(str)
            col_names.append(col_name)
            summaries.append(sep.join(parts))
        # Build the Series once: appending inside the loop is quadratic and
        # `Series.append` no longer exists in recent pandas versions.
        return pd.Series(summaries, index=col_names, dtype=object)

    if desc_file is None:
        # splitext copes with extensions that are not exactly 3 characters.
        stem = splitext(basename(infile))[0]
        desc_file = join(desc_dir, stem + '_desc.csv')

    df = pd.read_csv(infile, sep=sep, decimal=decimal_sep)

    raw_desc = df.describe(include='all').T
    null_percent = 100 * df.isnull().sum() / df.shape[0]
    null_percent = null_percent.to_frame(name='null_percent')
    nunique = df.apply(pd.Series.nunique).to_frame(name='n_unique')
    top = top_values(df).to_frame(name='top_10')

    # Columns reported by find_categorical are tagged 'cat', others 'num'.
    cat_cols = find_categorical(df)
    col_type = pd.Series(raw_desc.index, index=raw_desc.index,
                         name='col_type').isin(cat_cols)
    col_type = col_type.apply(lambda is_cat: 'cat' if is_cat else 'num')

    desc = pd.concat([raw_desc, null_percent, col_type, nunique, top], axis=1)

    kept_columns = ['col_type', 'null_percent', 'n_unique', 'mean', 'min',
                    'max', 'top_10']
    # reindex (rather than desc[kept_columns]) so that statistics missing
    # from `describe` (e.g. no 'mean' when the file has no numeric column)
    # yield empty cells instead of a KeyError.
    desc.reindex(columns=kept_columns).to_csv(desc_file, sep=';',
                                              float_format='%.2f')
def make_plots(infile='../data/sample_ctxoeuv_1.csv', max_modalities=10,
               decimal_sep='.', sep=','):
    """
    Plot distribution for features in infile

    Up to two figures are saved in ``../figures/``, named after the input
    file (extension stripped): ``<name>_numerical.png`` with one histogram
    per numerical feature and ``<name>_categorical.png`` with one bar chart
    per categorical feature. A figure is skipped when there is nothing to
    draw in it.

    Parameters
    ----------
    infile: str
        input csv file name
    max_modalities: int
        categorical features are plotted only when they have strictly fewer
        distinct values than this
    decimal_sep: str
        decimal values separator
    sep: str
        csv file columns separator
    """
    # Local import: only `basename` is known to be imported at module level.
    from os.path import splitext

    df = pd.read_csv(infile, sep=sep, decimal=decimal_sep)
    df.dropna(axis=1, how='all', inplace=True)  # remove empty columns
    figure_prefix = '../figures/' + splitext(basename(infile))[0]

    categorical_cols = find_categorical(df)
    # get rid of many-modalities columns
    categorical_cols = [c for c in df[categorical_cols]
                        if df[c].value_counts().shape[0] < max_modalities]

    # Numerical features are the int/float columns not plotted as
    # categorical; the dataframe column order is kept so the figure layout
    # is the same from one run to the next.
    plotted_as_categorical = set(categorical_cols)
    numerical_cols = [
        c for c in df.select_dtypes(include=['int', 'float']).columns
        if c not in plotted_as_categorical]

    # ----- Plot numerical features ----
    if numerical_cols:
        numeric = df[numerical_cols]
        # Keep, column by column, the non-zero values lying between the
        # 10th and 90th percentiles. The mask is computed on the numeric
        # columns only so text columns never enter the comparison.
        in_range = ((numeric >= numeric.quantile(0.1))
                    & (numeric <= numeric.quantile(0.9))
                    & (numeric != 0))
        trimmed = numeric.where(in_range).dropna(axis=1, how='all')
        if trimmed.shape[1] > 0:
            # `density` replaces the `normed` keyword, which matplotlib
            # has removed.
            trimmed.hist(figsize=(20, 20), density=True)
            plt.savefig(figure_prefix + '_numerical.png')
            plt.clf()

    # ----- Plot categorical features ----
    n = len(categorical_cols)
    if n == 0:
        return  # plt.subplots cannot build an empty grid

    # Near-square grid that always has at least n cells.
    n_grid_cols = int(ceil(sqrt(n)))
    n_grid_rows = (n + n_grid_cols - 1) // n_grid_cols
    # squeeze=False keeps `axes` two-dimensional even for a single row or a
    # single subplot.
    fig, axes = plt.subplots(nrows=n_grid_rows, ncols=n_grid_cols,
                             figsize=(3 * n_grid_cols, 3 * n_grid_rows),
                             squeeze=False)
    flat_axes = axes.ravel()  # row-major: one distinct cell per feature

    for unused_ax in flat_axes[n:]:
        unused_ax.axis('off')  # switch off unused subplots

    for ax, c in zip(flat_axes, categorical_cols):
        df[c].value_counts(normalize=True, dropna=False).plot(kind='bar',
                                                              ax=ax)
        ax.set_title(c)
        # `formatter` is a module-level name defined outside this function.
        ax.yaxis.set_major_formatter(formatter)
        ax.set_yticks([0, 0.2, 0.4, 0.6, 0.8, 1.])

    fig.tight_layout()
    plt.savefig(figure_prefix + '_categorical.png')