def recompute_and_populate():
    """Transform per-style beer reviews and run the similarity jobs.

    Steps, as performed by this function:
      - load the pickled vectorizer through ``load_vec``
      - fetch reviews for a hard-coded list of style ids with
        ``rev_rollup_ct=1`` (beer level, per the original notes)
      - transform each style's reviews in a worker pool
      - pass each style's transformed matrix to ``__asyncable_similarity``
        in a second worker pool, pairing every style only with itself
        (the original notes describe this as cosine similarity over all
        vector pairs, saved to the db -- that work happens in the helper
        and is not visible here)

    Both worker pools are closed and joined once their ``map`` call
    finishes, so worker processes do not outlive the function.

    Returns:
        0 if ``load_vec`` reports that no pickled vectorizer was loaded;
        otherwise None, after printing every worker result whose second
        element is not None.
    """
    vec_pkl = "src/vocab/review_vectorizer.p"
    was_pkl, vec = load_vec(vec_pkl)

    # Load data for styles with feature sets. The style list is hard-coded
    # (overridden) until the full feature table is populated.
    styles = Styles()
    top_sy = [159, 84, 157, 56, 58, 9, 128, 97, 116, 140]
    print("Comparing the top %s styles: %s" % (len(top_sy), ", ".join(str(s) for s in top_sy)))
    X = styles.beer_reviews_rollup(top_sy, limit=0, rev_rollup_ct=1, shuffle=False)

    if not was_pkl:
        # No pickled vectorizer available: nothing can be transformed.
        return 0

    print("Loaded pickled vectorizer.")
    print("Feature count: %s" % len(vec.get_feature_names()))
    print("Transforming reviews")

    n_workers = min(10, len(top_sy))

    # One transform task per style, each receiving only that style's rows.
    transform_args = [(vec, sy, X[X["style_id"] == sy]) for sy in top_sy]
    trans_pool = Pool(n_workers)
    try:
        transformed = trans_pool.map(__asyncable_transform, transform_args)
    finally:
        # Always release the workers, even if a task raised.
        trans_pool.close()
        trans_pool.join()

    # Re-key the worker results by style id (first element of each result).
    res_t = {r[0]: {"beer_ids": r[1], "X_t": r[2]} for r in transformed}

    # NOTE(review): the message announces a truncation, but the remove_all()
    # call below is commented out, so nothing is removed here.
    print("Truncating similarity table")
    bs = BeerSimilarity()
    #    bs.remove_all()

    dim1 = sum(v["X_t"].shape[0] for v in res_t.values())
    dim2 = sum(len(v["X_t"].data) for v in res_t.values())
    print("Computing similarities and saving to db %s" % dim1)
    print("Nonzero elements %s" % dim2)

    # Style pairs to compare. Only same-style pairs are used at present; the
    # cross-style variant (upper triangle, relying on symmetry in the
    # database) is kept for reference:
    #    ru_sids = [ (top_sy[i], top_sy[j]) for i in xrange(len(top_sy)) for j in xrange(i,len(top_sy)) ]
    ru_sids = [(sy, sy) for sy in top_sy]

    # The trailing 100 is forwarded untouched to __asyncable_similarity;
    # its meaning is defined there -- TODO confirm.
    pool_inp = [
        (bs,
         res_t[ref]["beer_ids"], res_t[ref]["X_t"],
         res_t[comp]["beer_ids"], res_t[comp]["X_t"],
         100)
        for ref, comp in ru_sids
    ]

    p = Pool(n_workers)
    try:
        b_id_res = p.map(__asyncable_similarity, pool_inp)
    finally:
        # Always release the workers, even if a task raised.
        p.close()
        p.join()

    # Report only the results that carry a non-None second element.
    for res in b_id_res:
        if res[1] is not None:
            print("%s %s" % (", ".join(str(r) for r in res[0]), res[1]))
Ejemplo n.º 2
0
    def recompute_and_populate():
        """Transform per-style beer reviews and run the similarity jobs.

        Steps, as performed by this function:
          - load the pickled vectorizer through ``load_vec``
          - fetch reviews for a hard-coded list of style ids with
            ``rev_rollup_ct=1`` (beer level, per the original notes)
          - transform each style's reviews in a worker pool
          - pass each style's transformed matrix to
            ``__asyncable_similarity`` in a second worker pool, pairing
            every style only with itself (the original notes describe this
            as cosine similarity over all vector pairs, saved to the db --
            that work happens in the helper and is not visible here)

        Both worker pools are closed and joined once their ``map`` call
        finishes, so worker processes do not outlive the function.

        Returns:
            0 if ``load_vec`` reports that no pickled vectorizer was
            loaded; otherwise None, after printing every worker result
            whose second element is not None.
        """
        vec_pkl = "src/vocab/review_vectorizer.p"
        was_pkl, vec = load_vec(vec_pkl)

        # Load data for styles with feature sets. The style list is
        # hard-coded (overridden) until the full feature table is populated.
        styles = Styles()
        top_sy = [159, 84, 157, 56, 58, 9, 128, 97, 116, 140]
        print('Comparing the top %s styles: %s' % (len(top_sy), ', '.join(
            str(s) for s in top_sy)))
        X = styles.beer_reviews_rollup(top_sy,
                                       limit=0,
                                       rev_rollup_ct=1,
                                       shuffle=False)

        if not was_pkl:
            # No pickled vectorizer available: nothing can be transformed.
            return 0

        print("Loaded pickled vectorizer.")
        print("Feature count: %s" % len(vec.get_feature_names()))
        print("Transforming reviews")

        n_workers = min(10, len(top_sy))

        # One transform task per style, each receiving only that style's rows.
        transform_args = [(vec, sy, X[X['style_id'] == sy]) for sy in top_sy]
        trans_pool = Pool(n_workers)
        try:
            transformed = trans_pool.map(__asyncable_transform, transform_args)
        finally:
            # Always release the workers, even if a task raised.
            trans_pool.close()
            trans_pool.join()

        # Re-key the worker results by style id (first element of each result).
        res_t = {r[0]: {'beer_ids': r[1], 'X_t': r[2]} for r in transformed}

        # NOTE(review): the message announces a truncation, but the
        # remove_all() call below is commented out, so nothing is removed here.
        print('Truncating similarity table')
        bs = BeerSimilarity()
        #    bs.remove_all()

        dim1 = sum(v['X_t'].shape[0] for v in res_t.values())
        dim2 = sum(len(v['X_t'].data) for v in res_t.values())
        print('Computing similarities and saving to db %s' % dim1)
        print('Nonzero elements %s' % dim2)

        # Style pairs to compare. Only same-style pairs are used at present;
        # the cross-style variant (upper triangle, relying on symmetry in the
        # database) is kept for reference:
        #    ru_sids = [ (top_sy[i], top_sy[j]) for i in xrange(len(top_sy)) for j in xrange(i,len(top_sy)) ]
        ru_sids = [(sy, sy) for sy in top_sy]

        # The trailing 100 is forwarded untouched to __asyncable_similarity;
        # its meaning is defined there -- TODO confirm.
        pool_inp = [
            (bs,
             res_t[ref]['beer_ids'], res_t[ref]['X_t'],
             res_t[comp]['beer_ids'], res_t[comp]['X_t'],
             100)
            for ref, comp in ru_sids
        ]

        p = Pool(n_workers)
        try:
            b_id_res = p.map(__asyncable_similarity, pool_inp)
        finally:
            # Always release the workers, even if a task raised.
            p.close()
            p.join()

        # Report only the results that carry a non-None second element.
        for res in b_id_res:
            if res[1] is not None:
                print('%s %s' % (', '.join(str(r) for r in res[0]), res[1]))