def get_most_similar_restricted(limit=120):
    """Print nearest neighbours and summary rank statistics for ingredients.

    Ingredient names come from ``import_data()`` in ``value_counts()`` order
    (most frequent first). They are embedded with the module-level ``model``,
    ranked with ``get_nearest_neighbors`` and scored with ``calc_score``.
    The score arrays are then restricted to the found-ingredient indices
    that fall inside the score arrays before the statistics are printed.

    Args:
        limit: Number of leading ingredients passed to
            ``print_nearest_neighbors``; also forwarded to ``calc_score``.

    Prints, in order:
        1. count of ``highest_ranks <= 3`` divided by the count of finite
           ``highest_ranks`` entries,
        2. mean of the finite ``highest_ranks`` entries,
        3. mean of the finite ``avg_rankings`` entries,
        4. mean of the finite ``random_avg_rankings`` entries.
    """
    _, ingredient_df = import_data()
    ingredient_names = ingredient_df['ingredient'].value_counts().index.values

    # A disabled alternative in the original loaded these two values with
    # np.load('word2vec_embeddings.npy') instead of querying the model.
    found_ings, embeddings = retrieve_embeddings(model, ingredient_names)

    neighbor_ranks = get_nearest_neighbors(embeddings)
    print_nearest_neighbors(ingredient_names[:limit], found_ings, neighbor_ranks)

    highest_ranks, avg_rankings, random_avg_rankings = calc_score(
        neighbor_ranks,
        limit,
        print_scores=False,
        score_path='../model/scores.csv',
    )

    # Keep only found-ingredient indices that are valid positions in the
    # score arrays (presumably those covered by `limit` -- TODO confirm).
    keep = found_ings[found_ings < highest_ranks.shape[0]]
    top_ranks = highest_ranks[keep]
    mean_ranks = avg_rankings[keep]
    random_mean_ranks = random_avg_rankings[keep]

    # Single-argument print(...) emits the same text as the Python 2
    # print statement used by the rest of the file.
    within_top3 = (top_ranks <= 3).sum(dtype=float)
    print(within_top3 / np.isfinite(top_ranks).sum())

    for scores in (top_ranks, mean_ranks, random_mean_ranks):
        print(scores[np.isfinite(scores)].mean())
def main():
    """Load the RASFF data, derive cleaned columns and print summary counts.

    Steps:
        * read the category -> shelf mapping from
          ``../rasff/rasff_mapping.csv`` and apply it to the ``category``
          column, storing the result in ``category_``;
        * apply ``clean_chemical`` to every ``chemical`` value, storing the
          result in ``chemical_``;
        * look up the non-empty cleaned chemicals among the ingredient
          counts via ``search_chemicals``;
        * print the number of rows, of distinct (chemical_, product) pairs,
          of distinct cleaned chemicals, of distinct products and of
          distinct raw categories.
    """
    mapping = pd.read_csv('../rasff/rasff_mapping.csv')
    category_to_shelf = dict(zip(mapping.category.values, mapping.shelf.values))

    rasff_df = rasff.load_df()
    rasff_df['category_'] = rasff_df['category'].replace(category_to_shelf)
    rasff_df['chemical_'] = list(map(clean_chemical, rasff_df['chemical'].values))

    def _sizes_descending(keys):
        # Group sizes sorted ascending, then reversed (largest first).
        return rasff_df.groupby(keys).size().sort_values()[::-1]

    df, df_i = gather_data.import_data()
    counts = df_i['ingredient'].value_counts()
    ings = counts.index.values  # not used further in this function

    # Drop the empty string from the cleaned chemical names.
    chemicals = [name for name in rasff_df['chemical_'].unique() if name]
    found_chems, d_c = search_chemicals(counts, chemicals)

    # NOTE(review): the next three results are computed but never read in
    # this function; kept as-is in case they are inspected interactively.
    unknown_chems = [name for name in chemicals if name not in found_chems]
    chemical_counts = _sizes_descending('chemical_')
    category_counts = _sizes_descending('category_')

    pairs = _sizes_descending(['chemical_', 'product'])

    # NOTE(review): 'Unique categories' counts the raw 'category' column,
    # not the mapped 'category_' column -- confirm this is intended.
    summary = (
        ('Number of entries :', len(rasff_df)),
        ('Unique entries :', len(pairs)),
        ('Unique adulterants :', len(rasff_df['chemical_'].unique())),
        ('Unique products :', len(rasff_df['product'].unique())),
        ('Unique categories :', len(rasff_df['category'].unique())),
    )
    # '%s %s' reproduces the "label value" output of `print label, value`.
    for label, value in summary:
        print('%s %s' % (label, value))