Example 1
    def run_cb_predictions(self):
        for cv in self.content_vector_types:
            content_path = self.directory + self.data_name + '_cv_' + cv + '.pkl'
            content_vect = sl.load_from_hadoop(content_path, self.sc).repartition(self.num_partitions)

            for uv in self.user_vector_types:
                train_ratings_loc = self.directory + self.data_name + '_uv_train_' + uv + '.pkl'
                train_ratings = sl.load_from_hadoop(train_ratings_loc, self.sc).repartition(self.num_partitions)

                for cb_pred in self.cb_predictions:
                    pred_save_loc = self.directory + self.data_name + '_predictions_' + uv + '_' + cv + '_' + cb_pred + '.pkl'
                    print(pred_save_loc)
                    # Skip algorithms whose output has already been saved.
                    if not os.path.isdir(pred_save_loc):
                        print('Running ' + cb_pred + ' for user vector ' + uv + ' and content vector ' + cv)
                        if cb_pred == 'cb_vect':
                            predictions = content_based.predict(train_ratings, content_vect, num_partitions=self.num_partitions)
                            sl.save_to_hadoop(predictions, pred_save_loc)
                        elif cb_pred == 'cb_kmeans_100':
                            predictions = content_based_kmeans.predict(train_ratings, content_vect, num_predictions=100, num_partitions=self.num_partitions)
                            sl.save_to_hadoop(predictions, pred_save_loc)
                        elif cb_pred == 'cb_kmeans_1000':
                            predictions = content_based_kmeans.predict(train_ratings, content_vect, num_predictions=1000, num_partitions=self.num_partitions)
                            sl.save_to_hadoop(predictions, pred_save_loc)
                        else:
                            # Unrecognized predictor name: stop iterating this list.
                            break
        print('All CB predictions saved')
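Every save path above follows the same naming convention: `<directory><data_name>_predictions_<user_vector>[_<content_vector>]_<algorithm>.pkl`. A minimal sketch of a helper that centralizes that convention (the function `prediction_path` is hypothetical, not part of the original class):

    import os

    def prediction_path(directory, data_name, user_vector, alg_type, content_vector=None):
        # Hypothetical helper: rebuilds the naming convention used by the
        # runner methods; content_vector is included only for CB algorithms.
        parts = [data_name, 'predictions', user_vector]
        if content_vector:
            parts.append(content_vector)
        parts.append(alg_type)
        return os.path.join(directory, '_'.join(parts) + '.pkl')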
Example 2
    def run_vectorizer(self):
        for uv in self.user_vector_types:
            # Set up the vectorizer. The data going into each is slightly
            # different: some don't have sqlCtx and some need support vectors.
            vectorizer = self.get_vectorizer(uv, self.content_vector_types[0])

            user_info = vectorizer.get_user_vector().repartition(20)
            train_ratings, test_ratings = user_info.randomSplit([0.9, 0.1], 41)

            uv_train_path = self.directory + self.data_name + '_uv_train_' + uv + '.pkl'
            uv_test_path = self.directory + self.data_name + '_uv_test_' + uv + '.pkl'

            # Ensure we don't write over the original.
            if not os.path.isdir(uv_train_path):
                print(uv_train_path)
                sl.save_to_hadoop(train_ratings, uv_train_path)
            if not os.path.isdir(uv_test_path):
                print(uv_test_path)
                sl.save_to_hadoop(test_ratings, uv_test_path)

        for cv in self.content_vector_types:
            # `uv` is reused from the last iteration of the loop above; the
            # content vector presumably does not depend on the user-vector type.
            vectorizer = self.get_vectorizer(uv, cv)
            content_vector = vectorizer.get_content_vector()

            content_path = self.directory + self.data_name + '_cv_' + cv + '.pkl'

            if not os.path.isdir(content_path):
                print(content_path)
                sl.save_to_hadoop(content_vector, content_path)
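These methods assume a runner object carrying a SparkContext (`self.sc`), a save location, and the lists of vector and algorithm names to sweep. A minimal sketch of driving the full pipeline, where the class name `ExperimentRunner`, the constructor signature, and all argument values are assumptions for illustration only:

    from pyspark import SparkContext

    sc = SparkContext(appName='recommender_experiments')

    # Hypothetical construction; the real class and its constructor are not
    # shown in the snippets above.
    runner = ExperimentRunner(
        sc=sc,
        directory='/experiments/',
        data_name='movielens',
        user_vector_types=['ratings', 'pos_ratings'],
        content_vector_types=['glove'],
        cf_predictions=['cf_mllib', 'cf_item', 'cf_user'],
        cb_predictions=['cb_vect', 'cb_kmeans_100', 'cb_kmeans_1000'],
        num_partitions=60,
    )

    runner.run_vectorizer()       # save user and content vectors
    runner.run_cf_predictions()   # collaborative-filtering sweep
    runner.run_cb_predictions()   # content-based sweep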
Example 3
    def run_cf_predictions(self):
        for uv in self.user_vector_types:
            train_ratings_loc = self.directory + self.data_name + '_uv_train_' + uv + '.pkl'
            train_ratings = sl.load_from_hadoop(train_ratings_loc, self.sc).repartition(self.num_partitions)

            for cf_pred in self.cf_predictions:
                pred_save_loc = self.directory + self.data_name + '_predictions_' + uv + '_' + cf_pred + '.pkl'

                # Skip algorithms whose output has already been saved.
                if not os.path.isdir(pred_save_loc):
                    print('Running ' + cf_pred + ' for user vector ' + uv)
                    print(pred_save_loc)
                    if cf_pred == 'cf_mllib':
                        predictions = cf.calc_cf_mllib(train_ratings, num_partitions=self.num_partitions)
                        sl.save_to_hadoop(predictions, pred_save_loc)
                    elif cf_pred == 'cf_item':
                        predictions = cf.calc_item_item_cf(train_ratings, num_partitions=self.num_partitions)
                        sl.save_to_hadoop(predictions, pred_save_loc)
                    elif cf_pred == 'cf_user':
                        predictions = cf.calc_user_user_cf2(train_ratings, num_partitions=self.num_partitions)
                        sl.save_to_hadoop(predictions, pred_save_loc)
                    else:
                        # Unrecognized predictor name: stop iterating this list.
                        break
        print('All CF predictions saved')
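Each branch repeats the same check-then-save pattern. The `os.path.isdir` guard relies on the output being a directory, which holds if `sl.save_to_hadoop` wraps Spark's `saveAsPickleFile` (an assumption; the `sl` module is not shown here), and on that path being visible to the driver's filesystem. A sketch of the pattern factored into a helper (`save_if_missing` is hypothetical):

    import os

    def save_if_missing(save_loc, compute, saver):
        # Skip entirely if the output directory already exists, making re-runs
        # idempotent. Assumes the saver writes a directory of part files, as
        # Spark's saveAsPickleFile does.
        if os.path.isdir(save_loc):
            return False
        saver(compute(), save_loc)
        return True

Each `elif` body above would then reduce to one call, e.g. `save_if_missing(pred_save_loc, lambda: cf.calc_cf_mllib(train_ratings, num_partitions=self.num_partitions), sl.save_to_hadoop)`, preserving the original behavior of checking before any computation is started.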
Example 4
    def run_single_prediction(self, user_vector, content_vector, alg_type):
        train_ratings_loc = self.directory + self.data_name + '_uv_train_' + user_vector + '.pkl'
        train_ratings = sl.load_from_hadoop(train_ratings_loc, self.sc).repartition(self.num_partitions)

        if content_vector:
            # Content-based algorithms also need the saved content vector.
            content_path = self.directory + self.data_name + '_cv_' + content_vector + '.pkl'
            content_vect = sl.load_from_hadoop(content_path, self.sc).repartition(self.num_partitions)
            print('Running ' + alg_type + ' for user vector ' + user_vector + ' and content vector ' + content_vector)

            pred_save_loc = self.directory + self.data_name + '_predictions_' + user_vector + '_' + content_vector + '_' + alg_type + '.pkl'
            print(pred_save_loc)

            if alg_type == 'cb_vect':
                predictions = content_based.predict(train_ratings, content_vect, num_partitions=self.num_partitions)
                sl.save_to_hadoop(predictions, pred_save_loc)
            elif alg_type == 'cb_kmeans_100':
                predictions = content_based_kmeans.predict(train_ratings, content_vect, num_predictions=100, num_partitions=self.num_partitions)
                sl.save_to_hadoop(predictions, pred_save_loc)
            elif alg_type == 'cb_kmeans_1000':
                predictions = content_based_kmeans.predict(train_ratings, content_vect, num_predictions=1000, num_partitions=self.num_partitions)
                sl.save_to_hadoop(predictions, pred_save_loc)

        else:
            # Collaborative-filtering algorithms use only the user vector.
            print('Running ' + alg_type + ' for user vector ' + user_vector)

            pred_save_loc = self.directory + self.data_name + '_predictions_' + user_vector + '_' + alg_type + '.pkl'
            print(pred_save_loc)

            if alg_type == 'cf_mllib':
                predictions = cf.calc_cf_mllib(train_ratings, num_partitions=self.num_partitions)
                sl.save_to_hadoop(predictions, pred_save_loc)
            elif alg_type == 'cf_item':
                predictions = cf.calc_item_item_cf(train_ratings, num_partitions=self.num_partitions)
                sl.save_to_hadoop(predictions, pred_save_loc)
            elif alg_type == 'cf_user':
                predictions = cf.calc_user_user_cf2(train_ratings, num_partitions=self.num_partitions)
                sl.save_to_hadoop(predictions, pred_save_loc)
            elif alg_type == 'cf_bayes_map':
                predictions = cf.calc_naive_bayes_map(train_ratings, self.sc)
                sl.save_to_hadoop(predictions, pred_save_loc)
            elif alg_type == 'cf_bayes_mse':
                predictions = cf.calc_naive_bayes_mse(train_ratings, self.sc)
                sl.save_to_hadoop(predictions, pred_save_loc)
            elif alg_type == 'cf_bayes_mae':
                predictions = cf.calc_naive_bayes_mae(train_ratings, self.sc)
                sl.save_to_hadoop(predictions, pred_save_loc)
            elif alg_type == 'cf_random':
                predictions = random_recommender.predict(train_ratings, self.sc)
                sl.save_to_hadoop(predictions, pred_save_loc)
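Called directly, `run_single_prediction` runs one algorithm for one vector combination rather than the full sweep. For example (reusing the hypothetical `runner` from the sketch above; pass `None` as the content vector for collaborative-filtering algorithms):

    # Collaborative filtering: no content vector needed.
    runner.run_single_prediction('ratings', None, 'cf_mllib')

    # Content-based k-means: requires a previously saved content vector.
    runner.run_single_prediction('ratings', 'glove', 'cb_kmeans_100')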