import pandas as pd
import Nutrients.utils as utl
import numpy as np
import scipy
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

# Summarise, per key of the pickled forest results, the best-scoring
# hyper-parameter combination next to the column means of `targets`.

targets, predictors = utl.get_wordtargets_nutrientpredictors(nwords=150)

# Keep only the nutrient rows whose id appears among the predictor columns.
nutrients = utl.load_sql_table('nutrient')
nutrients = nutrients.loc[nutrients['id'].isin(predictors.columns), :]

# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('F:/Data/forest_results_20171112.pkl', 'rb') as f:
    forest_clf = pickle.load(f)

# %% best results
# NOTE(review): val[0] is assumed to be a scikit-learn `cv_results_`-style
# dict (parallel sequences indexed by candidate) -- confirm against the code
# that wrote the pickle.
best_results = {}
for key, val in forest_clf.items():
    # rank_test_score[i] is the rank of candidate i, so the best candidate
    # sits at the position of the smallest rank. The previous
    # `rank_test_score[0] - 1` used the rank of candidate 0 as an index,
    # which only picks the best candidate by coincidence.
    best_idx = int(np.argmin(val[0]['rank_test_score']))
    params = val[0]['params'][best_idx]
    best_results[key] = (
        val[0]['mean_test_score'][best_idx],
        val[0]['std_test_score'][best_idx],
        params['max_features'],
        params['n_estimators'],
    )

# One row per key: column means of `targets` followed by the best-score tuple.
best_results = pd.concat(
    [targets.mean(), pd.DataFrame.from_dict(best_results, orient='index')],
    axis=1)
best_results.columns = [
    'term_prob', 'mean_score', 'std_score', 'max_features', 'n_estimators'
]
import pandas as pd
import Nutrients.utils as utl
import itertools
from scipy.spatial.distance import cdist
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

# %% load food names and collect wordcounts
foods = utl.load_sql_table('food')
food_names = foods['name'].values
words_infood = utl.split_into_words(food_names)

# Flatten the per-food word collections and lower-case every word.
# chain.from_iterable avoids unpacking the whole outer collection into call
# arguments, and the result is materialised as a list so `all_words` can be
# reused after the Series is built (a bare `map` object is single-use).
all_words = list(map(str.lower, itertools.chain.from_iterable(words_infood)))
wordcounts = pd.Series(all_words).value_counts()

# hand selected words to ignore due to lack of fun
unfun_words = ['upc', 's', 'with', 'and', 'in', 'a', 'gtin', 'to']
# errors='ignore': a word missing from this particular dataset should not
# abort the whole script with a KeyError.
wordcounts.drop(unfun_words, inplace=True, errors='ignore')

# %% wordcount distribution
# wordcounts.head(20)
matplotlib.rcParams.update({'font.size': 16})
fig, ax = plt.subplots(figsize=(12, 8))
# Draw on `ax` explicitly instead of relying on it being the current axes.
wordcounts.hist(bins=20, ax=ax)
ax.set_yscale('log')
ax.set_xlabel('word counts')
ax.set_ylabel('occurences')
plt.show()

# %%
nwords = 150
word_df = utl.get_word_count_df(nwords)
import pandas as pd
import Nutrients.utils as utl
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn

# %%
nutrients = utl.load_sql_table('nutrient')
nutrients['id'] = nutrients['id'].astype('int64')
quantities = utl.load_sql_table('quantity')

# Wide table: one row per food_id, one column per nutrient_id, missing
# combinations filled with 0.
qty = quantities.pivot_table(index='food_id', columns='nutrient_id',
                             values='value', fill_value=0)

# Covariance between nutrient columns; pairs without at least 50 shared
# observations come back as NaN and are zeroed. `.values` replaces
# `DataFrame.as_matrix()`, which was removed in pandas 1.0.
qty_cov = qty.cov(min_periods=50).fillna(0).values
# Signed log scale: log10(1 + |x|) with the sign of x restored.
cov_norm = np.log10(1 + np.abs(qty_cov)) * np.sign(qty_cov)  # normalize for visualization

# %%
matplotlib.rcParams.update({'font.size': 16})
# fig, ax = plt.subplots(figsize=(12, 8))
# seaborn.heatmap(np.log10(np.abs(qty_cov+1)) * np.sign(qty_cov), ax=ax)
# plt.show()

fig, ax = plt.subplots(figsize=(12, 8))
# 1 everywhere except the diagonal; cast to bool below to select the
# off-diagonal (covariance) entries.
offdiag = 1 - np.eye(len(cov_norm))

# Histogram of the diagonal entries (variances), drawn as fractions of the
# total count.
var, vbin = np.histogram(np.diag(cov_norm), bins=25)
width = 0.7 * (vbin[1] - vbin[0])
center = (vbin[:-1] + vbin[1:]) / 2
ax.bar(center, var / var.sum(), align='center', width=width, alpha=.5)

# Same for the off-diagonal entries (covariances), overlaid on the same axes.
cvar, cvbin = np.histogram(cov_norm[offdiag.astype('bool')], bins=25)
width = 0.7 * (cvbin[1] - cvbin[0])
center = (cvbin[:-1] + cvbin[1:]) / 2
ax.bar(center, cvar / cvar.sum(), align='center', width=width, alpha=.5)
ax.set_yscale('log')