Example #1
0
def analyze_found_chemicals(df_, found_chems):
    """Print, per (chemical, category) group, whether the category is known.

    Rows of ``df_`` whose ``chemical`` column is in ``found_chems`` are
    grouped by ``chemical_`` and ``category_``.  The first time a chemical is
    seen, the second element of every tuple returned by
    ``ing_utils.get_perc(chem, 'shelf')`` is cached, and the size of that
    cache is printed next to ``counts[chem]``.  For every group the function
    then prints whether the lower-cased category is in the cached array,
    followed by the group key.

    Args:
        df_: DataFrame with ``chemical``, ``chemical_`` and ``category_``
            columns.
        found_chems: Values of the ``chemical`` column to keep.

    Returns:
        None; the function only prints.
    """
    # NOTE(review): the filter reads 'chemical' while the grouping reads
    # 'chemical_' -- confirm both columns exist and that this is intended.
    matching_rows = df_[df_['chemical'].isin(found_chems)]

    shelf_cats_by_chem = {}
    grouped = matching_rows.groupby(['chemical_', 'category_'])
    for group_key, _ in grouped:
        chem, cat = group_key
        if chem not in shelf_cats_by_chem:
            print('-----')
            shelf_entries = ing_utils.get_perc(chem, 'shelf')
            shelf_cats_by_chem[chem] = np.array(
                [entry[1] for entry in shelf_entries])
            # `counts` is a module-level mapping defined outside this function.
            print("Num of categories chem found in: {}, Total: {}".format(
                len(shelf_cats_by_chem[chem]), counts[chem]))
        is_known = cat.lower() in shelf_cats_by_chem[chem]
        # Single formatted string: same output as printing the two values
        # separated by a comma.
        print('{} {}'.format(is_known, group_key))
Example #2
0
def gen_nearest_neighbors(chemicals, ings, df_):
    """Predict product categories for found chemicals via nearest ingredients.

    Loads a pre-fitted neighbor model (``../ncim/neigh.pkl``) plus the found
    chemicals and their representations (``../rasff/found_chems.npy`` and
    ``../rasff/found_chems_reps.npy``).  For every found chemical it takes up
    to five nearest ingredients and sums, per category, the percentages that
    ``ing_utils.get_perc(neighbor, 'shelf')`` reports.  Progress, the top
    predictions and the actual category counts from ``df_`` are printed, and
    the per-category chemical ranking is written to
    ``../outputs/cat_to_best_chems.txt``.

    Args:
        chemicals: Not used by the computation; kept so existing callers
            keep working.
        ings: Ingredient names, indexed by the neighbor indices the model
            returns.
        df_: DataFrame with ``chemical_`` and ``category_`` columns, used
            only to print the actual category counts per chemical.

    Returns:
        A tuple ``(chem_to_best_cats, cat_to_best_chems)``.
        ``chem_to_best_cats`` maps each found chemical to a list of
        ``(category, summed_perc / num_neighbors)`` pairs, highest first.
        ``cat_to_best_chems`` maps each category to a list of
        ``(chemical, summed_perc / num_neighbors)`` pairs, highest first.
    """
    num_neighbors = 5
    # Float divisor so the averages are not truncated by Python 2 integer
    # division should the percentages be integers.
    divisor = float(num_neighbors)

    # NOTE(review): these artifacts appear to be precomputed outputs of
    # ncim.run(...) and ncim.convert_ing_to_rep(...); confirm how they are
    # regenerated.
    # NOTE(security): joblib.load unpickles its input -- only load model
    # files from a trusted location.
    neigh = joblib.load('../ncim/neigh.pkl')
    found_ings = np.load('../rasff/found_chems.npy')
    new_ings_reps = np.load('../rasff/found_chems_reps.npy')
    distances, neighbors = neigh.kneighbors(new_ings_reps)

    neighbors_trunc = []
    for row_dists, row_neighbors in zip(distances, neighbors):
        # Number of neighbors tied with the closest one.
        num_closest = (row_dists == row_dists[0]).sum()
        if num_closest <= num_neighbors:
            neighbors_trunc.append(row_neighbors[:num_neighbors])
        else:
            # More ties than slots: keep the lowest indices among the tied
            # neighbors (per the original author's note these are the most
            # popular ingredients -- TODO confirm `ings` is popularity-ordered).
            tied = sorted(row_neighbors[:num_closest])
            neighbors_trunc.append(tied[:num_neighbors])
    neighbors_trunc = np.array(neighbors_trunc)

    chem_to_best_cats = {}
    # Holds {category: {chemical: summed perc}} while accumulating; the inner
    # dicts are replaced by ranked lists further down.
    cat_to_best_chems = {}
    num_found = len(found_ings)
    for i, c in enumerate(found_ings):
        cat_to_perc = {}
        print('--------------')
        print('{} of {}'.format(i + 1, num_found))
        print('{} ({})'.format(c, c in ings))
        print('--------------')
        print('Neighbors:')
        for nn_idx in neighbors_trunc[i]:
            nn = ings[nn_idx]
            print(nn)
            # An empty result simply contributes nothing.
            for perc, cat, _count in ing_utils.get_perc(nn, 'shelf'):
                cat_to_perc[cat] = cat_to_perc.get(cat, 0) + perc
                chem_scores = cat_to_best_chems.setdefault(cat, {})
                chem_scores[c] = chem_scores.get(c, 0) + perc

        best_cats = sorted(cat_to_perc, key=cat_to_perc.get, reverse=True)
        # Always divide by num_neighbors, even when some neighbors had no
        # categories, matching the existing scoring.
        chem_to_best_cats[c] = [
            (cat, cat_to_perc[cat] / divisor) for cat in best_cats]
        print("=========")
        print("Predicted:")
        print(chem_to_best_cats[c][:5])
        print("=========")
        print("Actual:")
        print(df_[df_['chemical_'] == c]['category_'].value_counts())

    # Replace each per-category score dict with its ranked list in place, so
    # the dict (and the order of the report below) keeps its original keys.
    for cat in list(cat_to_best_chems):
        chem_scores = cat_to_best_chems[cat]
        best_chems = sorted(chem_scores, key=chem_scores.get, reverse=True)
        cat_to_best_chems[cat] = [
            (chem, chem_scores[chem] / divisor) for chem in best_chems]

    with open('../outputs/cat_to_best_chems.txt', 'wb') as f:
        for cat, ranked_chems in cat_to_best_chems.iteritems():
            f.write('\n=================================================================')
            f.write('\n')
            f.write(cat)
            f.write('\n-----------------------------------------------------------------')
            for chem, perc in ranked_chems:
                f.write('\n{0: <50} {1}'.format(chem, perc))

    return chem_to_best_cats, cat_to_best_chems