# Print a compact per-column summary taken from df.describe().
print('df.describe() stats:')
short_desc = df.describe()
# Row positions pulled out of each column's describe() output.
# Presumably count, mean, std, min and max under the default numeric
# describe() layout -- TODO confirm if custom percentiles are ever passed.
summary_rows = [0, 1, 2, 3, 7]
for col, stats in short_desc.T.iterrows():
    # Look the column up once; the isinstance guard covers the case where
    # df[col] does not come back as a Series.
    column = df[col]
    print('{} ({})'.format(col, column.dtype if isinstance(column, pd.Series) else type(column)))
    # Map the selected row labels to their values for this column.
    print(dict(zip(stats.index.values[summary_rows], stats.values[summary_rows])))


# In[ ]:


# this takes a few minutes
print('Using pandas_profiling to generate more detailed stats, including correlation between columns, skew etc')
# pandas_profiling.ProfileReport raises Tkinter exceptions before it can produce any output,
# at least describe produces a dataframe of stats
desc = pandas_profiling.describe(df)
# Bare expression: shown as the cell result in a notebook, a no-op when run as a script.
desc['table']
# for col, stats in desc['variables'].iterrows():
#     print('')
#     print(col)
#     print('{} ({})'.format(col, df[col].dtype if isinstance(df[col], pd.Series) else type(df[col])))
#     print(stats)
# and if you thought that was tough to read, try printing out all the report['freq'] dicts of histograms


# In[43]:


# Widen the notebook container and raise the number of rows pandas will display.
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_rows', 500)
# NOTE(review): the next two statements look like the tail of a loop over
# (col, stats) whose header lies outside this view -- confirm their
# indentation against the preceding lines when merging.
print('{} ({})'.format(col, df[col].dtype if isinstance(df[col], pd.Series) else type(df[col])))
print(stats)

# Render the profile as HTML and save it; the context manager makes sure the
# file handle is flushed and closed even if the write fails.
html = pandas_profiling.to_html(df.head(3), desc)
with open('report.html', 'w') as report_file:
    report_file.write(html)

# this is redundant with stats above and takes way longer than it should (30 minutes?)
# print('Column, Count, Min, Mean, Max:')
# for k, c, colmin, colmean, colmax in izip(df.columns, df.count().T, df.min().T, df.mean().T, df.max().T):
#     print('{:40s}\t{}\t{}\t{}\t{}'.format(k, c, colmin, colmean, colmax))

# this takes a few minutes
print('Trying to compute a ProfileReport, including correlation between columns, skew etc')
# pandas_profiling.ProfileReport raises Tkinter exceptions before it can produce any output,
# at least describe produces a dataframe of stats
report = dict2obj(pandas_profiling.describe(df))
print(report['table'])
print('')
# Dump the per-variable statistics, one blank-line-separated entry per column.
for col, stats in report['variables'].iterrows():
    print('')
    print(col)
    # print('{} ({})'.format(col, df[col].dtype if isinstance(df[col], pd.Series) else type(df[col])))
    print(stats)
print('')
# NOTE(review): .iteritems() only exists on Python 2 dicts and on older pandas
# objects; if report['freq'] is a plain dict under Python 3 this needs .items().
for col, stats in report['freq'].iteritems():
    print('')
    print(stats)
# In[9]:


# Print the full describe() output for every column, preceded by its dtype.
print('df.describe() stats:')
desc = df.describe()
for col, stats in desc.T.iterrows():
    print('')
    # Look the column up once; the isinstance guard covers the case where
    # df[col] does not come back as a Series.
    column = df[col]
    print('{} ({})'.format(col, column.dtype if isinstance(column, pd.Series) else type(column)))
    print(stats)


# In[10]:


# this takes a few minutes
print('Using pandas_profiling to generate more detailed stats, including correlation between columns, skew etc')
# pandas_profiling.ProfileReport raises Tkinter exceptions before it can produce any output,
# at least describe produces a dataframe of stats
report = pandas_profiling.describe(df)
print(report['table'])
print('')
# Dump the per-variable statistics, one blank-line-separated entry per column.
for col, stats in report['variables'].iterrows():
    print('')
    print(col)
    # print('{} ({})'.format(col, df[col].dtype if isinstance(df[col], pd.Series) else type(df[col])))
    print(stats)
# and if you thought that was bad, try printing out all the report['freq'] dict of histograms
# NOTE(review): this statement looks like the tail of a loop over
# (col, stats) whose header lies outside this view -- confirm its
# indentation against the preceding lines when merging.
print(stats)

# Render the profile as HTML and save it; the context manager makes sure the
# file handle is flushed and closed even if the write fails.
html = pandas_profiling.to_html(df.head(3), desc)
with open('report.html', 'w') as report_file:
    report_file.write(html)

# this is redundant with stats above and takes way longer than it should (30 minutes?)
# print('Column, Count, Min, Mean, Max:')
# for k, c, colmin, colmean, colmax in izip(df.columns, df.count().T, df.min().T, df.mean().T, df.max().T):
#     print('{:40s}\t{}\t{}\t{}\t{}'.format(k, c, colmin, colmean, colmax))

# this takes a few minutes
print(
    'Trying to compute a ProfileReport, including correlation between columns, skew etc'
)
# pandas_profiling.ProfileReport raises Tkinter exceptions before it can produce any output,
# at least describe produces a dataframe of stats
report = dict2obj(pandas_profiling.describe(df))
print(report['table'])
print('')
# Dump the per-variable statistics, one blank-line-separated entry per column.
for col, stats in report['variables'].iterrows():
    print('')
    print(col)
    # print('{} ({})'.format(col, df[col].dtype if isinstance(df[col], pd.Series) else type(df[col])))
    print(stats)
print('')
# NOTE(review): .iteritems() only exists on Python 2 dicts and on older pandas
# objects; if report['freq'] is a plain dict under Python 3 this needs .items().
for col, stats in report['freq'].iteritems():
    print('')
    print(stats)
col, df[col].dtype if isinstance(df[col], pd.Series) else type(df[col]))) print( dict( zip(list(stats.index.values[[0, 1, 2, 3, 7]].T), list(stats.values[[0, 1, 2, 3, 7]].T)))) # In[ ]: # this takes a few minutes print( 'Using pandas_profiling to generate more detailed stats, including correlation between columns, skew etc' ) # pandas_profiling.ProfileReport raises Tkinter exceptions before it can produce any output, # at least describe produces a dataframe of stats desc = pandas_profiling.describe(df) desc['table'] # for col, stats in desc['variables'].iterrows(): # print('') # print(col) # print('{} ({})'.format(col, df[col].dtype if isinstance(df[col], pd.Series) else type(df[col]))) # print(stats) # and if you thought that was tough to read, try printing out all the report['freq'] dicts of histograms # In[43]: desc['variables'] # In[38]: