Ejemplo n.º 1
0
#     print('{} ({})'.format(col, df[col].dtype if isinstance(df[col], pd.Series) else type(df[col])))
#     print(stats)

# and if you thought that was tough to read, try printing out all the report['freq'] dicts of histograms


# In[43]:


# Stretch the notebook container to full width and set pandas' row, column
# and width display limits to 500 before showing the profiling tables.
display(HTML("<style>.container { width:100% !important; }</style>"))
for display_option in ('display.max_rows', 'display.max_columns', 'display.width'):
    pd.set_option(display_option, 500)
desc['variables']


# In[38]:

# Show the 'table' entry of the profiling description.
desc['table']


# In[29]:

# desc.keys()
# Render the HTML report for the head of the frame using the profiling stats.
# `html` stays text; it is encoded to UTF-8 only at the file boundary and the
# file is opened in binary mode. Writing the encoded bytes to a text-mode
# ('w') handle, as before, raises TypeError on Python 3.
html = pandas_profiling.to_html(df.head(), desc)
with open('report.html', 'wb') as fout:
    fout.write(html.encode('utf8'))
# Display the text itself rather than the encoded bytes.
display(HTML(html))
# report = pandas_profiling.ProfileReport(df)

Ejemplo n.º 2
0
                 index_col='id',
                 compression='gzip',
                 quotechar='"',
                 quoting=pd.io.common.csv.QUOTE_NONNUMERIC,
                 low_memory=False)

# Print the pandas summary statistics one column at a time, with each
# column's name and dtype as a heading.
print('df.describe() stats:')
desc = df.describe()
for col, stats in desc.T.iterrows():
    print('')
    column = df[col]
    # Anything that is not a Series has no single dtype, so report its Python type.
    if isinstance(column, pd.Series):
        kind = column.dtype
    else:
        kind = type(column)
    print('{} ({})'.format(col, kind))
    print(stats)

# Render an HTML report for a three-row sample and save it as report.html.
# NOTE(review): `desc` here is the DataFrame returned by df.describe();
# pandas_profiling.to_html may expect the result of pandas_profiling.describe(df)
# instead -- confirm against the installed pandas_profiling version.
html = pandas_profiling.to_html(df.head(3), desc)
# Write UTF-8 bytes inside a `with` block: the handle is closed deterministically
# (the bare open(...).write(...) never closed it) and non-ASCII characters no
# longer depend on the interpreter's default encoding.
with open('report.html', 'wb') as fout:
    fout.write(html.encode('utf8'))

# this is redundant with stats above and takes way longer than it should (30 minutes?)
# print('Column, Count, Min, Mean, Max:')
# for k, c, colmin, colmean, colmax in izip(df.columns, df.count().T, df.min().T, df.mean().T, df.max().T):
#     print('{:40s}\t{}\t{}\t{}\t{}'.format(k, c, colmin, colmean, colmax))

# This step takes a few minutes.
print('Trying to compute a ProfileReport, including correlation between columns, skew etc')
# pandas_profiling.ProfileReport raises Tkinter exceptions before it can produce any output;
# pandas_profiling.describe at least produces the stats, so use that instead.
profile_stats = pandas_profiling.describe(df)
report = dict2obj(profile_stats)
print(report['table'])
Ejemplo n.º 3
0
from pug.nlp.util import dict2obj

# The round-trip to disk cleaned up the encoding issues, so the `encoding`
# option no longer needs to be specified; the file is read gzip-compressed.
# `csv` is imported directly because reaching it through `pd.io.common.csv`
# only works while that pandas-internal module happens to import `csv` itself.
import csv

df = pd.read_csv(
    os.path.join(DATA_PATH, 'cleaned_tweets.csv.gz'),
    index_col='id',
    compression='gzip',
    quotechar='"',
    quoting=csv.QUOTE_NONNUMERIC,
    low_memory=False,
)


# Print the pandas summary statistics one column at a time, with each
# column's name and dtype as a heading.
print('df.describe() stats:')
desc = df.describe()
for col, stats in desc.T.iterrows():
    print('')
    column = df[col]
    # Anything that is not a Series has no single dtype, so report its Python type.
    if isinstance(column, pd.Series):
        kind = column.dtype
    else:
        kind = type(column)
    print('{} ({})'.format(col, kind))
    print(stats)

# Render an HTML report for a three-row sample and save it as report.html.
# NOTE(review): `desc` here is the DataFrame returned by df.describe();
# pandas_profiling.to_html may expect the result of pandas_profiling.describe(df)
# instead -- confirm against the installed pandas_profiling version.
html = pandas_profiling.to_html(df.head(3), desc)
# Write UTF-8 bytes inside a `with` block: the handle is closed deterministically
# (the bare open(...).write(...) never closed it) and non-ASCII characters no
# longer depend on the interpreter's default encoding.
with open('report.html', 'wb') as fout:
    fout.write(html.encode('utf8'))

# this is redundant with stats above and takes way longer than it should (30 minutes?)
# print('Column, Count, Min, Mean, Max:')
# for k, c, colmin, colmean, colmax in izip(df.columns, df.count().T, df.min().T, df.mean().T, df.max().T):
#     print('{:40s}\t{}\t{}\t{}\t{}'.format(k, c, colmin, colmean, colmax))

# This step takes a few minutes.
print('Trying to compute a ProfileReport, including correlation between columns, skew etc')
# pandas_profiling.ProfileReport raises Tkinter exceptions before it can produce any output;
# pandas_profiling.describe at least produces the stats, so use that instead.
profile_stats = pandas_profiling.describe(df)
report = dict2obj(profile_stats)
print(report['table'])

print('')
Ejemplo n.º 4
0
    'Using pandas_profiling to generate more detailed stats, including correlation between columns, skew etc'
)
# pandas_profiling.ProfileReport raises Tkinter exceptions before it can produce any output,
#  at least describe produces a dataframe of stats
# Compute the profiling description once; the later cells index into it by
# key ('table', 'variables') and pass it to pandas_profiling.to_html.
desc = pandas_profiling.describe(df)
# Bare expression: presumably relies on the notebook echoing the value of a
# cell's last statement -- it has no effect when run as a plain script.
desc['table']
# for col, stats in desc['variables'].iterrows():
#     print('')
#     print(col)
#     print('{} ({})'.format(col, df[col].dtype if isinstance(df[col], pd.Series) else type(df[col])))
#     print(stats)

# and if you thought that was tough to read, try printing out all the report['freq'] dicts of histograms

# In[43]:

# Look at the 'variables' entry of the profiling description.
desc['variables']

# In[38]:

# Look at the 'table' entry of the profiling description.
desc['table']

# In[29]:

# desc.keys()
# Render the HTML report for the head of the frame using the profiling stats.
# `html` stays text; it is encoded to UTF-8 only at the file boundary and the
# file is opened in binary mode. Writing the encoded bytes to a text-mode
# ('w') handle, as before, raises TypeError on Python 3.
html = pandas_profiling.to_html(df.head(), desc)
with open('report.html', 'wb') as fout:
    fout.write(html.encode('utf8'))
# Display the text itself rather than the encoded bytes.
display(HTML(html))
# report = pandas_profiling.ProfileReport(df)