# NOTE(review): this file contains a second definition of
# recompute_and_populate further down; the later one wins at import time.
def recompute_and_populate():
    """Recompute within-style beer similarities from review text.

    Steps:
    - load the pickled review vectorizer
    - transform each style's reviews with it (one pool task per style)
    - pass every style's transformed reviews to ``__asyncable_similarity``
      (per the original notes: cosine similarity for all vector pairs)
    - data is retrieved at rev_rollup_ct = 1 (beer level)

    Returns:
        0 when no pickled vectorizer could be loaded (nothing is computed),
        otherwise None.
    """
    vec_pkl = "src/vocab/review_vectorizer.p"
    was_pkl, vec = load_vec(vec_pkl)

    # load data for styles with feature sets
    # overridden until full feature table is populated
    styles = Styles()
    top_sy = [159, 84, 157, 56, 58, 9, 128, 97, 116, 140]
    print("Comparing the top %s styles: %s"
          % (len(top_sy), ", ".join(str(s) for s in top_sy)))

    if not was_pkl:
        # No pickled vectorizer: exit before paying for the review load,
        # which would be thrown away anyway.
        return 0

    X = styles.beer_reviews_rollup(top_sy, limit=0, rev_rollup_ct=1,
                                   shuffle=False)
    n_workers = min(10, len(top_sy))

    print("Loaded pickled vectorizer.")
    print("Feature count: %s" % len(vec.get_feature_names()))
    print("Transforming reviews")

    transform_args = [(vec, sy, X[X["style_id"] == sy]) for sy in top_sy]
    trans_pool = Pool(n_workers)
    try:
        transformed = trans_pool.map(__asyncable_transform, transform_args)
    finally:
        # Always release the worker processes, even if a task raised.
        trans_pool.close()
        trans_pool.join()

    # as style keyed dict; each result is indexed as
    # (style id, beer ids, transformed matrix)
    res_t = {r[0]: {"beer_ids": r[1], "X_t": r[2]} for r in transformed}

    # NOTE(review): the message says "Truncating", but the remove_all() call
    # is disabled, so this function does not clear the table itself.
    print("Truncating similarity table")
    bs = BeerSimilarity()
    # bs.remove_all()

    # X_t exposes .shape and .data -- presumably a scipy sparse matrix;
    # TODO confirm against __asyncable_transform.
    dim1 = sum(entry["X_t"].shape[0] for entry in res_t.values())
    dim2 = sum(len(entry["X_t"].data) for entry in res_t.values())
    print("Computing similarities and saving to db %s" % dim1)
    print("Nonzero elements %s" % dim2)

    # set style RU
    # will account for symmetry in the database
    # Only same-style pairs are compared for now; the disabled alternative
    # enumerated every (i, j) style pair with j >= i.
    ru_sids = [(sy, sy) for sy in top_sy]

    pool_inp = []
    for ref_sy, comp_sy in ru_sids:
        ref = res_t[ref_sy]
        comp = res_t[comp_sy]
        # Trailing 100 is forwarded untouched to __asyncable_similarity;
        # presumably a per-beer result cap -- TODO confirm.
        pool_inp.append((bs, ref["beer_ids"], ref["X_t"],
                         comp["beer_ids"], comp["X_t"], 100))

    sim_pool = Pool(n_workers)
    try:
        b_id_res = sim_pool.map(__asyncable_similarity, pool_inp)
    finally:
        sim_pool.close()
        sim_pool.join()

    # Only results whose second element is set are reported.
    for res in b_id_res:
        if res[1] is not None:
            print("%s %s" % (", ".join(str(r) for r in res[0]), res[1]))
# NOTE(review): an earlier definition of recompute_and_populate exists in
# this file; this later one shadows it at import time.
def recompute_and_populate():
    """Recompute within-style beer similarities from review text.

    Steps:
    - load the pickled review vectorizer
    - transform each style's reviews with it (one pool task per style)
    - pass every style's transformed reviews to ``__asyncable_similarity``
      (per the original notes: cosine similarity for all vector pairs)
    - data is retrieved at rev_rollup_ct = 1 (beer level)

    Returns:
        0 when no pickled vectorizer could be loaded (nothing is computed),
        otherwise None.
    """
    vec_pkl = "src/vocab/review_vectorizer.p"
    was_pkl, vec = load_vec(vec_pkl)

    # load data for styles with feature sets
    # overridden until full feature table is populated
    styles = Styles()
    top_sy = [159, 84, 157, 56, 58, 9, 128, 97, 116, 140]
    style_list = ', '.join(str(s) for s in top_sy)
    print('Comparing the top %s styles: %s' % (len(top_sy), style_list))

    if not was_pkl:
        # No pickled vectorizer: exit before paying for the review load,
        # which would be thrown away anyway.
        return 0

    X = styles.beer_reviews_rollup(top_sy, limit=0, rev_rollup_ct=1,
                                   shuffle=False)
    worker_ct = min(10, len(top_sy))

    print("Loaded pickled vectorizer.")
    print("Feature count: %s" % len(vec.get_feature_names()))
    print("Transforming reviews")

    trans_pool = Pool(worker_ct)
    try:
        res_list = trans_pool.map(
            __asyncable_transform,
            [(vec, sy, X[X['style_id'] == sy]) for sy in top_sy])
    finally:
        # Always release the worker processes, even if a task raised.
        trans_pool.close()
        trans_pool.join()

    # as style keyed dict; each result is indexed as
    # (style id, beer ids, transformed matrix)
    res_t = {r[0]: {'beer_ids': r[1], 'X_t': r[2]} for r in res_list}

    # NOTE(review): the message says "Truncating", but the remove_all() call
    # is disabled, so this function does not clear the table itself.
    print('Truncating similarity table')
    bs = BeerSimilarity()
    # bs.remove_all()

    # X_t exposes .shape and .data -- presumably a scipy sparse matrix;
    # TODO confirm against __asyncable_transform.
    matrices = [v['X_t'] for v in res_t.values()]
    dim1 = sum(m.shape[0] for m in matrices)
    dim2 = sum(len(m.data) for m in matrices)
    print('Computing similarities and saving to db %s' % dim1)
    print('Nonzero elements %s' % dim2)

    # set style RU
    # will account for symmetry in the database
    # Only same-style pairs are compared for now; the disabled alternative
    # enumerated every (i, j) style pair with j >= i.
    ru_sids = [(sy, sy) for sy in top_sy]

    # Trailing 100 is forwarded untouched to __asyncable_similarity;
    # presumably a per-beer result cap -- TODO confirm.
    pool_inp = [
        (bs,
         res_t[ref_sy]['beer_ids'], res_t[ref_sy]['X_t'],
         res_t[comp_sy]['beer_ids'], res_t[comp_sy]['X_t'],
         100)
        for ref_sy, comp_sy in ru_sids
    ]

    p = Pool(worker_ct)
    try:
        b_id_res = p.map(__asyncable_similarity, pool_inp)
    finally:
        p.close()
        p.join()

    # Only results whose second element is set are reported.
    for res in b_id_res:
        if res[1] is not None:
            print('%s %s' % (', '.join(str(r) for r in res[0]), res[1]))