import pandas as pd
import Nutrients.utils as utl
import numpy as np
import scipy
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

# Modelling inputs for the 150 selected words (presumably word-occurrence
# targets and nutrient-valued predictors -- see Nutrients.utils to confirm).
targets, predictors = utl.get_wordtargets_nutrientpredictors(nwords=150)

# Keep only the nutrient rows whose id shows up as a predictor column.
nutrients = utl.load_sql_table('nutrient')
is_predictor = nutrients['id'].isin(predictors.columns)
nutrients = nutrients.loc[is_predictor, :]

# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# this file must come from a trusted source.
with open('F:/Data/forest_results_20171112.pkl', 'rb') as results_file:
    forest_clf = pickle.load(results_file)
# %% best results
# For every entry of forest_clf, collect the score and the hyper-parameters
# of its best-ranked parameter setting.
# NOTE(review): val[0] is read like a scikit-learn ``cv_results_`` mapping
# (keys rank_test_score / params / mean_test_score / std_test_score) --
# confirm against the code that wrote the pickle.
best_results = {}
for key, val in forest_clf.items():
    cv_results = val[0]
    # rank_test_score holds one rank per parameter setting (1 == best), so
    # the best setting sits at the position of the smallest rank. Taking
    # "rank of the first setting minus one" is only right when the first
    # setting happens to be the best one.
    best_idx = int(np.argmin(np.asarray(cv_results['rank_test_score'])))
    params = cv_results['params'][best_idx]
    best_results[key] = (cv_results['mean_test_score'][best_idx],
                         cv_results['std_test_score'][best_idx],
                         params['max_features'], params['n_estimators'])

# Put the column-wise mean of targets next to the per-key results. concat
# with axis=1 aligns on the index, so the forest_clf keys are expected to
# match the column labels of targets.
best_results = pd.concat(
    [targets.mean(),
     pd.DataFrame.from_dict(best_results, orient='index')],
    axis=1)
best_results.columns = [
    'term_prob', 'mean_score', 'std_score', 'max_features', 'n_estimators'
]
Example #2
0
import pandas as pd
import Nutrients.utils as utl
import itertools
from scipy.spatial.distance import cdist
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

# %% load food names and collect wordcounts
foods = utl.load_sql_table('food')
food_names = foods['name'].values
# Presumably one sequence of words per food name -- see Nutrients.utils.
words_infood = utl.split_into_words(food_names)
# Flatten the per-food word sequences and lower-case every word. A concrete
# list is built (instead of a lazy ``map`` object) so that pd.Series always
# receives a real sequence, whatever the Python / pandas version.
all_words = [
    word.lower() for word in itertools.chain.from_iterable(words_infood)
]
wordcounts = pd.Series(all_words).value_counts()
unfun_words = ['upc', 's', 'with', 'and', 'in', 'a', 'gtin',
               'to']  # hand selected words to ignore due to lack of fun
# errors='ignore': a stop word that never occurs in the data must not abort
# the script with a KeyError.
wordcounts = wordcounts.drop(unfun_words, errors='ignore')

# %% wordcount distribution
# wordcounts.head(20)
# Histogram of the per-word counts on a logarithmic y axis.
matplotlib.rcParams['font.size'] = 16
fig, ax = plt.subplots(figsize=(12, 8))
wordcounts.hist(bins=20, ax=ax)
# NOTE(review): the y label is misspelled ("occurrences"); kept unchanged
# here because it is user-visible output.
ax.set(yscale='log', xlabel='word counts', ylabel='occurences')
plt.show()

# %% word-count table
# Number of words handed to the helper below.
nwords = 150
# Presumably a table of counts for the nwords most frequent words -- the
# helper lives in Nutrients.utils; confirm its return shape there.
word_df = utl.get_word_count_df(nwords)
Example #3
0
import pandas as pd
import Nutrients.utils as utl
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn

# %% nutrient covariance matrix
nutrients = utl.load_sql_table('nutrient')
nutrients['id'] = nutrients['id'].astype('int64')
quantities = utl.load_sql_table('quantity')
# One row per food_id, one column per nutrient_id; combinations that are
# missing from the quantity table are filled with 0.
qty = quantities.pivot_table(index='food_id', columns='nutrient_id',
                             values='value', fill_value=0)

# Pairwise covariance between nutrient columns; entries pandas leaves as NaN
# are set to 0. ``.values`` replaces DataFrame.as_matrix(), which was
# deprecated in pandas 0.23 and removed in 1.0, and yields the same ndarray.
qty_cov = qty.cov(min_periods=50).fillna(0).values
# Signed log scale, sign(c) * log10(1 + |c|), to normalize for visualization.
cov_norm = np.log10(1 + np.abs(qty_cov)) * np.sign(qty_cov)
# %% distribution of diagonal vs. off-diagonal entries of cov_norm
matplotlib.rcParams['font.size'] = 16
# Disabled alternative view (heatmap of the signed-log covariance):
# fig, ax = plt.subplots(figsize=(12, 8))
# seaborn.heatmap(np.log10(np.abs(qty_cov+1)) * np.sign(qty_cov), ax=ax)
# plt.show()
fig, ax = plt.subplots(figsize=(12, 8))
# 1 everywhere except on the diagonal; cast to a boolean mask further down.
offdiag = 1 - np.eye(len(cov_norm))

# Diagonal entries (variances): bin counts normalised to fractions.
var, vbin = np.histogram(np.diag(cov_norm), bins=25)
width = 0.7 * (vbin[1] - vbin[0])
center = 0.5 * (vbin[:-1] + vbin[1:])
ax.bar(center, var / var.sum(), width=width, align='center', alpha=.5)

# Off-diagonal entries (covariances), drawn semi-transparently on the same
# axes so both distributions stay visible.
cvar, cvbin = np.histogram(cov_norm[offdiag.astype(bool)], bins=25)
width = 0.7 * (cvbin[1] - cvbin[0])
center = 0.5 * (cvbin[:-1] + cvbin[1:])
ax.bar(center, cvar / cvar.sum(), width=width, align='center', alpha=.5)
ax.set_yscale('log')