def analyze_found_chemicals(df_, found_chems):
    """Print, for each (chemical, category) group, whether the category is known.

    Rows of ``df_`` whose ``'chemical'`` value is in ``found_chems`` are
    grouped by ``('chemical_', 'category_')``.  The first time a chemical is
    seen, element 1 of every entry returned by
    ``ing_utils.get_perc(chem, 'shelf')`` is cached (presumably the category
    name -- confirm against ``ing_utils``) and a summary line is printed.
    For every group, one line is printed with the membership test of the
    lower-cased category in that cache, followed by the group key.

    Args:
        df_: DataFrame indexed here by the columns ``'chemical'``,
            ``'chemical_'`` and ``'category_'``.
        found_chems: Values passed to ``Series.isin`` to select rows.

    Returns:
        None.  All results are printed.
    """
    # NOTE(review): rows are filtered on 'chemical' but grouped on
    # 'chemical_' -- confirm both columns exist and that this is intended.
    matching_rows = df_[df_['chemical'].isin(found_chems)]
    grouped = matching_rows.groupby(['chemical_', 'category_'])

    shelf_cats_by_chem = {}
    for group_key, _rows in grouped:
        chem, cat = group_key
        if chem not in shelf_cats_by_chem:
            print('-----')
            shelf_entries = ing_utils.get_perc(chem, 'shelf')
            shelf_cats_by_chem[chem] = np.array(
                [entry[1] for entry in shelf_entries])
            # NOTE(review): `counts` is not defined in this function; it is
            # presumably a module-level mapping keyed by chemical.
            print("Num of categories chem found in: {}, Total: {}".format(
                len(shelf_cats_by_chem[chem]), counts[chem]))
        # A single pre-formatted argument keeps the output identical to the
        # Python 2 statement `print <bool>, <key>`.
        is_known_cat = cat.lower() in shelf_cats_by_chem[chem]
        print('{} {}'.format(is_known_cat, group_key))
def _truncate_neighbors(distances, neighbors, num_neighbors):
    """Keep ``num_neighbors`` neighbor indices per row, resolving ties."""
    truncated = []
    for row_dists, row_neighbors in zip(distances, neighbors):
        # Number of neighbors tied with the first (closest) distance.
        num_closest = (row_dists == row_dists[0]).sum()
        if num_closest <= num_neighbors:
            truncated.append(row_neighbors[:num_neighbors])
        else:
            # Take the n most popular ingredients if it's a tie.
            # (Lowest indices win; presumably `ings` is ordered by
            # popularity -- confirm against how the index was built.)
            truncated.append(
                sorted(row_neighbors[:num_closest])[:num_neighbors])
    return np.array(truncated)


def _write_cat_report(cat_to_best_chems, path):
    """Write one section per category listing its (chemical, score) pairs."""
    # NOTE(review): binary mode is kept from the original Python 2 code;
    # writing `str` to a 'wb' handle raises TypeError under Python 3.
    with open(path, 'wb') as f:
        for cat, chems in cat_to_best_chems.items():
            f.write('\n=================================================================')
            f.write('\n')
            f.write(cat)
            f.write('\n-----------------------------------------------------------------')
            for chem, perc in chems:
                f.write('\n{0: <50} {1}'.format(chem, perc))


def gen_nearest_neighbors(chemicals, ings, df_, num_neighbors=5):
    """Predict categories for found chemicals from their nearest ingredients.

    Loads an object exposing ``kneighbors`` from ``'../ncim/neigh.pkl'``, the
    found chemical names from ``'../rasff/found_chems.npy'`` and their
    representations from ``'../rasff/found_chems_reps.npy'`` (all paths are
    relative to the current working directory).  For every found chemical the
    percentages returned by ``ing_utils.get_perc(neighbor, 'shelf')`` are
    summed per category over its truncated neighbor list and divided by
    ``num_neighbors``.  Progress, predictions and the actual category counts
    from ``df_`` are printed, and a report is written to
    ``'../outputs/cat_to_best_chems.txt'``.

    Args:
        chemicals: Unused.  The original only read it to build a local that
            was never used; it is kept so existing callers still work.
        ings: Sequence indexed by neighbor index to obtain an ingredient
            name; also used for a membership test in the progress output.
        df_: DataFrame with ``'chemical_'`` and ``'category_'`` columns, used
            only to print the actual category counts per chemical.
        num_neighbors: Number of neighbors kept per chemical and the divisor
            of the summed percentages.  Defaults to 5, the value that was
            previously hard-coded.

    Returns:
        A tuple ``(chem_to_best_cats, cat_to_best_chems)``:
        ``chem_to_best_cats`` maps each found chemical to a list of
        ``(category, score)`` pairs sorted by descending score, and
        ``cat_to_best_chems`` maps each category to a list of
        ``(chemical, score)`` pairs sorted by descending score.
    """
    neigh = joblib.load('../ncim/neigh.pkl')
    found_ings = np.load('../rasff/found_chems.npy')
    new_ings_reps = np.load('../rasff/found_chems_reps.npy')
    distances, neighbors = neigh.kneighbors(new_ings_reps)
    neighbors_trunc = _truncate_neighbors(distances, neighbors, num_neighbors)

    # Divide by a float so the averages are not truncated by Python 2 integer
    # division should get_perc ever yield integer percentages.
    divisor = float(num_neighbors)
    num_found = len(found_ings)

    chem_to_best_cats = {}
    cat_to_best_chems = {}
    for i, c in enumerate(found_ings):
        cat_to_perc = {}
        print('--------------')
        print('{} of {}'.format(i + 1, num_found))
        print('{} ({})'.format(c, c in ings))
        print('--------------')
        print('Neighbors:')
        for nn_idx in neighbors_trunc[i]:
            nn = ings[nn_idx]
            print(nn)
            # An empty result simply contributes nothing.
            for perc, cat, _count in ing_utils.get_perc(nn, 'shelf'):
                cat_to_perc[cat] = cat_to_perc.get(cat, 0) + perc
                chem_scores = cat_to_best_chems.setdefault(cat, {})
                chem_scores[c] = chem_scores.get(c, 0) + perc

        best_cats = sorted(cat_to_perc, key=cat_to_perc.get, reverse=True)
        # The divisor is always num_neighbors, even when some neighbors had
        # no category data.
        chem_to_best_cats[c] = [
            (cat, cat_to_perc[cat] / divisor) for cat in best_cats]
        print("=========")
        print("Predicted:")
        print(chem_to_best_cats[c][:5])
        print("=========")
        print("Actual:")
        print(df_[df_['chemical_'] == c]['category_'].value_counts())

    # Replace each per-category score dict with a ranked list, in place so
    # the mapping's iteration order is unchanged.
    for cat in list(cat_to_best_chems):
        chem_scores = cat_to_best_chems[cat]
        best_chems = sorted(chem_scores, key=chem_scores.get, reverse=True)
        cat_to_best_chems[cat] = [
            (chem, chem_scores[chem] / divisor) for chem in best_chems]

    _write_cat_report(cat_to_best_chems, '../outputs/cat_to_best_chems.txt')
    return chem_to_best_cats, cat_to_best_chems