Python get_dataset_stats Examples, src.algorithms.dataset_stats.get_dataset_stats Python Examples

Example #1

0

Show file

File: hermes_run_script.py Project: agude/hermes

    def run_cf_results(self):
        """Score every saved CF prediction set and write one results file per run.

        For each entry of ``self.user_vector_types`` the train/test ratings and
        the first content vector of ``self.content_vector_types`` are loaded
        through ``sl.load_from_hadoop`` and repartitioned to
        ``self.num_partitions``. Each prediction set named in
        ``self.cf_predictions`` is then passed to
        ``performance_metrics.get_perform_metrics`` once per value of
        ``self.results_runs``; the metrics are merged with the dataset
        statistics, tagged with identifying fields, printed, and written as
        ``str(results)`` under ``self.results_directory``.
        """
        for uv in self.user_vector_types:
            data_prefix = self.directory + self.data_name

            train_ratings_loc = data_prefix + "_uv_train_" + uv + ".pkl"
            train_ratings = sl.load_from_hadoop(train_ratings_loc, self.sc).repartition(self.num_partitions)
            test_ratings_loc = data_prefix + "_uv_test_" + uv + ".pkl"
            test_ratings = sl.load_from_hadoop(test_ratings_loc, self.sc).repartition(self.num_partitions)

            # CF evaluation still needs a content vector for the metrics call;
            # the first configured one is used for every CF algorithm.
            first_cv = self.content_vector_types[0]
            content_path = data_prefix + "_cv_" + first_cv + ".pkl"
            content_vect = sl.load_from_hadoop(content_path, self.sc).repartition(self.num_partitions)

            # Dataset statistics depend only on the ratings, so compute them
            # once per user vector and reuse them for every algorithm/run.
            stats = dataset_stats.get_dataset_stats(train_ratings, test_ratings)

            for cf_pred in self.cf_predictions:
                pred_save_loc = data_prefix + "_predictions_" + uv + "_" + cf_pred + ".pkl"
                # Single-argument print(...) emits the same text under the
                # Python 2 print statement and the Python 3 print function.
                print("Getting results for: " + pred_save_loc)
                preds = sl.load_from_hadoop(pred_save_loc, self.sc).repartition(self.num_partitions)

                for run in self.results_runs:
                    results = performance_metrics.get_perform_metrics(
                        test_ratings,
                        train_ratings,
                        preds,
                        content_vect,
                        self.sqlCtx,
                        num_predictions=run,
                        num_partitions=self.num_partitions,
                    )
                    # Merge the stats (which do not change run to run) with the results
                    results.update(stats)

                    # Tag the record so it stays identifiable once separated
                    # from its file name.
                    results["N"] = run
                    results["dataset"] = self.data_name
                    results["CF_CB"] = "CF"
                    results["alg_type"] = cf_pred
                    results["user_vector"] = uv
                    results["content_vector"] = first_cv
                    print(results)

                    # NOTE(review): the file holds str(results), not a pickle,
                    # despite the .pkl suffix; the name is kept for compatibility.
                    results_path = (
                        self.results_directory
                        + self.data_name
                        + "_results_"
                        + uv
                        + "_"
                        + cf_pred
                        + "_"
                        + str(run)
                        + ".pkl"
                    )
                    # The context manager closes the handle even if the write fails.
                    with open(results_path, "w") as f:
                        f.write(str(results))
        print("All CF predictions results aquired")

Example #2

0

Show file

    def run_cf_results(self):
        """Score every saved CF prediction set and write one results file per run.

        Iterates ``self.user_vector_types`` x ``self.cf_predictions`` x
        ``self.results_runs``. Inputs are loaded with ``sl.load_from_hadoop``
        and repartitioned to ``self.num_partitions``; metrics come from
        ``performance_metrics.get_perform_metrics`` and are merged with the
        output of ``dataset_stats.get_dataset_stats``. Each merged record is
        printed and written as ``str(results)`` under
        ``self.results_directory``.
        """
        for uv in self.user_vector_types:
            data_prefix = self.directory + self.data_name

            train_ratings_loc = data_prefix + '_uv_train_' + uv + '.pkl'
            train_ratings = sl.load_from_hadoop(
                train_ratings_loc, self.sc).repartition(self.num_partitions)
            test_ratings_loc = data_prefix + '_uv_test_' + uv + '.pkl'
            test_ratings = sl.load_from_hadoop(
                test_ratings_loc, self.sc).repartition(self.num_partitions)

            # The metrics call needs a content vector even for CF algorithms;
            # the first configured one is used throughout.
            first_cv = self.content_vector_types[0]
            content_path = data_prefix + '_cv_' + first_cv + '.pkl'
            content_vect = sl.load_from_hadoop(
                content_path, self.sc).repartition(self.num_partitions)

            # Calculate statistics about the dataset once per user vector.
            stats = dataset_stats.get_dataset_stats(train_ratings,
                                                    test_ratings)

            for cf_pred in self.cf_predictions:
                pred_save_loc = data_prefix + '_predictions_' + uv + '_' + cf_pred + '.pkl'
                # Single-argument print(...) emits the same text under the
                # Python 2 print statement and the Python 3 print function.
                print('Getting results for: ' + pred_save_loc)
                preds = sl.load_from_hadoop(
                    pred_save_loc, self.sc).repartition(self.num_partitions)

                for run in self.results_runs:
                    results = performance_metrics.get_perform_metrics(
                        test_ratings, train_ratings, preds, content_vect,
                        self.sqlCtx, num_predictions=run,
                        num_partitions=self.num_partitions)
                    # Merge the stats (which do not change run to run) with the results
                    results.update(stats)

                    # Tag the record so it stays identifiable on its own.
                    results['N'] = run
                    results['dataset'] = self.data_name
                    results['CF_CB'] = 'CF'
                    results['alg_type'] = cf_pred
                    results['user_vector'] = uv
                    results['content_vector'] = first_cv
                    print(results)

                    # NOTE(review): the file holds str(results), not a pickle,
                    # despite the .pkl suffix; the name is kept for compatibility.
                    results_path = (self.results_directory + self.data_name
                                    + '_results_' + uv + '_' + cf_pred
                                    + '_' + str(run) + '.pkl')
                    # The context manager closes the handle even if the write fails.
                    with open(results_path, 'w') as f:
                        f.write(str(results))
        print('All CF predictions results aquired')

Example #3

0

Show file

    def run_single_result(self, user_vector, content_vector, alg_type,
                          algorithm, num_preds):
        """Score one saved prediction set and write its results file.

        Args:
            user_vector: user vector name used in the train/test/prediction
                file names.
            content_vector: content vector name; always loaded for the metrics
                call, and part of the file names only when ``alg_type`` is
                ``'cb'``.
            alg_type: ``'cb'`` selects the content-based file naming; any other
                value selects the naming without the content vector.
            algorithm: algorithm name used in the prediction/results file names.
            num_preds: forwarded as ``num_predictions`` to
                ``performance_metrics.get_perform_metrics``.

        The merged metrics/statistics record is printed and written as
        ``str(results)`` to a ``.csv``-suffixed file under
        ``self.results_directory``.
        """
        data_prefix = self.directory + self.data_name

        train_ratings_loc = data_prefix + '_uv_train_' + user_vector + '.pkl'
        train_ratings = sl.load_from_hadoop(
            train_ratings_loc, self.sc).repartition(self.num_partitions)
        test_ratings_loc = data_prefix + '_uv_test_' + user_vector + '.pkl'
        test_ratings = sl.load_from_hadoop(
            test_ratings_loc, self.sc).repartition(self.num_partitions)

        content_path = data_prefix + '_cv_' + content_vector + '.pkl'
        content_vect = sl.load_from_hadoop(content_path, self.sc).repartition(
            self.num_partitions)

        stats = dataset_stats.get_dataset_stats(train_ratings, test_ratings)

        # Content-based file names carry the content vector; the others do not.
        is_cb = alg_type == 'cb'
        if is_cb:
            run_tag = user_vector + '_' + content_vector + '_' + algorithm
        else:
            run_tag = user_vector + '_' + algorithm
        pred_save_loc = data_prefix + '_predictions_' + run_tag + '.pkl'
        results_path = (self.results_directory + self.data_name + '_results_'
                        + run_tag + '_' + str(num_preds) + '.csv')

        # Single-argument print(...) emits the same text under the Python 2
        # print statement and the Python 3 print function.
        print('Getting results for: ' + pred_save_loc)
        preds = sl.load_from_hadoop(pred_save_loc,
                                    self.sc).repartition(self.num_partitions)

        results = performance_metrics.get_perform_metrics(
            test_ratings, train_ratings, preds, content_vect, self.sqlCtx,
            num_predictions=num_preds, num_partitions=self.num_partitions)
        # Merge the stats (which do not change run to run) with the results
        results.update(stats)

        # Tag the record so it stays identifiable on its own. The CF_CB label
        # follows alg_type; it used to be 'CB' unconditionally, which
        # mislabeled runs taken through the non-'cb' branch.
        results['N'] = num_preds
        results['dataset'] = self.data_name
        results['CF_CB'] = 'CB' if is_cb else 'CF'
        results['alg_type'] = algorithm
        results['user_vector'] = user_vector
        results['content_vector'] = content_vector
        print(results)

        # Save the results; the context manager closes the handle even if the
        # write fails.
        print(results_path)
        with open(results_path, 'w') as f:
            f.write(str(results))

Example #4

0

Show file

File: hermes_run_script.py Project: fototo/hermes

    def run_single_result(self, user_vector, content_vector, alg_type, algorithm, num_preds):
        """Score one saved prediction set and write its results file.

        Args:
            user_vector: user vector name used in the train/test/prediction
                file names.
            content_vector: content vector name; always loaded for the metrics
                call, and part of the file names only when ``alg_type`` is
                ``'cb'``.
            alg_type: ``'cb'`` selects the content-based file naming; any other
                value selects the naming without the content vector.
            algorithm: algorithm name used in the prediction/results file names.
            num_preds: forwarded as ``num_predictions`` to
                ``performance_metrics.get_perform_metrics``.

        The merged metrics/statistics record is printed and written as
        ``str(results)`` to a ``.csv``-suffixed file under
        ``self.results_directory``.
        """
        data_prefix = self.directory + self.data_name

        train_ratings_loc = data_prefix + '_uv_train_' + user_vector + '.pkl'
        train_ratings = sl.load_from_hadoop(train_ratings_loc, self.sc).repartition(self.num_partitions)
        test_ratings_loc = data_prefix + '_uv_test_' + user_vector + '.pkl'
        test_ratings = sl.load_from_hadoop(test_ratings_loc, self.sc).repartition(self.num_partitions)

        content_path = data_prefix + '_cv_' + content_vector + '.pkl'
        content_vect = sl.load_from_hadoop(content_path, self.sc).repartition(self.num_partitions)

        stats = dataset_stats.get_dataset_stats(train_ratings, test_ratings)

        # Content-based file names carry the content vector; the others do not.
        is_cb = alg_type == 'cb'
        name_parts = [user_vector, content_vector, algorithm] if is_cb else [user_vector, algorithm]
        run_tag = '_'.join(name_parts)
        pred_save_loc = data_prefix + '_predictions_' + run_tag + '.pkl'
        results_path = self.results_directory + self.data_name + '_results_' + run_tag \
            + '_' + str(num_preds) + '.csv'

        # Single-argument print(...) emits the same text under the Python 2
        # print statement and the Python 3 print function.
        print('Getting results for: ' + pred_save_loc)
        preds = sl.load_from_hadoop(pred_save_loc, self.sc).repartition(self.num_partitions)

        results = performance_metrics.get_perform_metrics(
            test_ratings, train_ratings, preds, content_vect, self.sqlCtx,
            num_predictions=num_preds, num_partitions=self.num_partitions)
        # Merge the stats (which do not change run to run) with the results
        results.update(stats)

        # Tag the record so it stays identifiable on its own. The CF_CB label
        # follows alg_type; it used to be 'CB' unconditionally, which
        # mislabeled runs taken through the non-'cb' branch.
        results['N'] = num_preds
        results['dataset'] = self.data_name
        results['CF_CB'] = 'CB' if is_cb else 'CF'
        results['alg_type'] = algorithm
        results['user_vector'] = user_vector
        results['content_vector'] = content_vector
        print(results)

        # Save the results; the context manager closes the handle even if the
        # write fails.
        print(results_path)
        with open(results_path, 'w') as f:
            f.write(str(results))

Example #5

0

Show file

File: hermes_run_script.py Project: fototo/hermes

    def run_cf_results(self):
        """Score every saved CF prediction set and write one results file per run.

        Loops over ``self.user_vector_types``, then ``self.cf_predictions``,
        then ``self.results_runs``. Ratings, the first content vector and the
        predictions are loaded via ``sl.load_from_hadoop`` and repartitioned to
        ``self.num_partitions``. The metrics returned by
        ``performance_metrics.get_perform_metrics`` are merged with the dataset
        statistics, tagged, printed, and written as ``str(results)`` under
        ``self.results_directory``.
        """
        for uv in self.user_vector_types:
            data_prefix = self.directory + self.data_name

            train_ratings_loc = data_prefix + '_uv_train_' + uv + '.pkl'
            train_ratings = sl.load_from_hadoop(train_ratings_loc, self.sc).repartition(self.num_partitions)
            test_ratings_loc = data_prefix + '_uv_test_' + uv + '.pkl'
            test_ratings = sl.load_from_hadoop(test_ratings_loc, self.sc).repartition(self.num_partitions)

            # The metrics call needs a content vector even for CF algorithms;
            # the first configured one is used throughout.
            first_cv = self.content_vector_types[0]
            content_path = data_prefix + '_cv_' + first_cv + '.pkl'
            content_vect = sl.load_from_hadoop(content_path, self.sc).repartition(self.num_partitions)

            # Calculate statistics about the dataset once per user vector.
            stats = dataset_stats.get_dataset_stats(train_ratings, test_ratings)

            for cf_pred in self.cf_predictions:
                pred_save_loc = data_prefix + '_predictions_' + uv + '_' + cf_pred + '.pkl'
                # Single-argument print(...) emits the same text under the
                # Python 2 print statement and the Python 3 print function.
                print('Getting results for: ' + pred_save_loc)
                preds = sl.load_from_hadoop(pred_save_loc, self.sc).repartition(self.num_partitions)

                for run in self.results_runs:
                    results = performance_metrics.get_perform_metrics(
                        test_ratings, train_ratings, preds, content_vect, self.sqlCtx,
                        num_predictions=run, num_partitions=self.num_partitions)
                    # Merge the stats (which do not change run to run) with the results
                    results.update(stats)

                    # Tag the record so it stays identifiable on its own.
                    results['N'] = run
                    results['dataset'] = self.data_name
                    results['CF_CB'] = 'CF'
                    results['alg_type'] = cf_pred
                    results['user_vector'] = uv
                    results['content_vector'] = first_cv
                    print(results)

                    # NOTE(review): the file holds str(results), not a pickle,
                    # despite the .pkl suffix; the name is kept for compatibility.
                    results_path = self.results_directory + self.data_name + '_results_' + uv + '_' \
                        + cf_pred + '_' + str(run) + '.pkl'
                    # The context manager closes the handle even if the write fails.
                    with open(results_path, 'w') as f:
                        f.write(str(results))
        print('All CF predictions results aquired')

Example #6

0

Show file

File: hermes_run_script.py Project: fototo/hermes

    def run_cb_results(self):
        """Score every saved CB prediction set and write one results file per run.

        Loops over ``self.content_vector_types``, ``self.user_vector_types`` and
        ``self.cb_predictions``. The two k-means prediction sets
        (``'cb_kmeans_100'`` / ``'cb_kmeans_1000'``) are scored only at their
        own prediction count (100 / 1000); every other algorithm is scored once
        per value of ``self.results_runs``. Each merged metrics/statistics
        record is printed and written as ``str(results)`` to a
        ``.csv``-suffixed file under ``self.results_directory``.
        """
        # Fixed prediction counts for the k-means prediction sets.
        kmeans_runs = {'cb_kmeans_100': [100], 'cb_kmeans_1000': [1000]}

        for cv in self.content_vector_types:
            data_prefix = self.directory + self.data_name
            content_path = data_prefix + '_cv_' + cv + '.pkl'
            content_vect = sl.load_from_hadoop(content_path, self.sc)

            for uv in self.user_vector_types:
                train_ratings_loc = data_prefix + '_uv_train_' + uv + '.pkl'
                train_ratings = sl.load_from_hadoop(train_ratings_loc, self.sc).repartition(self.num_partitions)
                test_ratings_loc = data_prefix + '_uv_test_' + uv + '.pkl'
                test_ratings = sl.load_from_hadoop(test_ratings_loc, self.sc).repartition(self.num_partitions)

                # Calculate statistics about the dataset once per user vector.
                stats = dataset_stats.get_dataset_stats(train_ratings, test_ratings)

                for cb_pred in self.cb_predictions:
                    pred_save_loc = data_prefix + '_predictions_' + uv + '_' + cv + '_' \
                        + cb_pred + '.pkl'
                    # Single-argument print(...) emits the same text under the
                    # Python 2 print statement and the Python 3 print function.
                    print('Getting results for: ' + pred_save_loc)
                    preds = sl.load_from_hadoop(pred_save_loc, self.sc).repartition(self.num_partitions)

                    # If we ran the kmeans we do not need to complete every
                    # run, only the one matching its size; otherwise we do.
                    if cb_pred in kmeans_runs:
                        runs = kmeans_runs[cb_pred]
                    else:
                        runs = self.results_runs

                    for run in runs:
                        results = performance_metrics.get_perform_metrics(
                            test_ratings, train_ratings, preds, content_vect, self.sqlCtx,
                            num_predictions=run, num_partitions=self.num_partitions)
                        # Merge the stats (which do not change run to run) with the results
                        results.update(stats)

                        # Tag the record so it stays identifiable on its own.
                        results['N'] = run
                        results['dataset'] = self.data_name
                        results['CF_CB'] = 'CB'
                        results['alg_type'] = cb_pred
                        results['user_vector'] = uv
                        results['content_vector'] = cv
                        print(results)

                        # Save the results; the context manager closes the
                        # handle even if the write fails.
                        results_path = self.results_directory + self.data_name + '_results_' + uv + '_' + cv \
                            + '_' + cb_pred + '_' + str(run) + '.csv'
                        print(results_path)
                        with open(results_path, 'w') as f:
                            f.write(str(results))
        print('All CB predictions results aquired')

Example #7

0

Show file

    def run_cb_results(self):
        """Score every saved CB prediction set and write one results file per run.

        Iterates ``self.content_vector_types`` x ``self.user_vector_types`` x
        ``self.cb_predictions``. ``'cb_kmeans_100'`` and ``'cb_kmeans_1000'``
        are scored only with 100 and 1000 predictions respectively; all other
        algorithms are scored for each value in ``self.results_runs``. Every
        merged metrics/statistics record is printed and written as
        ``str(results)`` to a ``.csv``-suffixed file under
        ``self.results_directory``.
        """
        for cv in self.content_vector_types:
            data_prefix = self.directory + self.data_name
            content_path = data_prefix + '_cv_' + cv + '.pkl'
            content_vect = sl.load_from_hadoop(content_path, self.sc)

            for uv in self.user_vector_types:
                train_ratings_loc = data_prefix + '_uv_train_' + uv + '.pkl'
                train_ratings = sl.load_from_hadoop(
                    train_ratings_loc, self.sc).repartition(self.num_partitions)
                test_ratings_loc = data_prefix + '_uv_test_' + uv + '.pkl'
                test_ratings = sl.load_from_hadoop(
                    test_ratings_loc, self.sc).repartition(self.num_partitions)

                # Calculate statistics about the dataset once per user vector.
                stats = dataset_stats.get_dataset_stats(
                    train_ratings, test_ratings)

                for cb_pred in self.cb_predictions:
                    pred_save_loc = data_prefix + '_predictions_' + uv + '_' + cv + '_' \
                        + cb_pred + '.pkl'
                    # Single-argument print(...) emits the same text under the
                    # Python 2 print statement and the Python 3 print function.
                    print('Getting results for: ' + pred_save_loc)
                    preds = sl.load_from_hadoop(
                        pred_save_loc, self.sc).repartition(self.num_partitions)

                    # If we ran the kmeans we do not need to complete every
                    # run, only the one matching its size; otherwise we do.
                    if cb_pred == 'cb_kmeans_1000':
                        runs = [1000]
                    elif cb_pred == 'cb_kmeans_100':
                        runs = [100]
                    else:
                        runs = self.results_runs

                    for run in runs:
                        results = performance_metrics.get_perform_metrics(
                            test_ratings, train_ratings, preds, content_vect,
                            self.sqlCtx, num_predictions=run,
                            num_partitions=self.num_partitions)
                        # Merge the stats (which do not change run to run) with the results
                        results.update(stats)

                        # Tag the record so it stays identifiable on its own.
                        results['N'] = run
                        results['dataset'] = self.data_name
                        results['CF_CB'] = 'CB'
                        results['alg_type'] = cb_pred
                        results['user_vector'] = uv
                        results['content_vector'] = cv
                        print(results)

                        # Save the results; the context manager closes the
                        # handle even if the write fails.
                        results_path = (self.results_directory + self.data_name
                                        + '_results_' + uv + '_' + cv + '_'
                                        + cb_pred + '_' + str(run) + '.csv')
                        print(results_path)
                        with open(results_path, 'w') as f:
                            f.write(str(results))
        print('All CB predictions results aquired')

Example #8

0

Show file

File: hermes_run_script.py Project: agude/hermes

    def run_single_result(self, user_vector, content_vector, alg_type, algorithm, num_preds):
        """Score one saved prediction set and write its results file.

        Args:
            user_vector: user vector name used in the train/test/prediction
                file names.
            content_vector: content vector name; always loaded for the metrics
                call, and part of the file names only when ``alg_type`` is
                ``"cb"``.
            alg_type: ``"cb"`` selects the content-based file naming; any other
                value selects the naming without the content vector.
            algorithm: algorithm name used in the prediction/results file names.
            num_preds: forwarded as ``num_predictions`` to
                ``performance_metrics.get_perform_metrics``.

        The merged metrics/statistics record is printed and written as
        ``str(results)`` to a ``.csv``-suffixed file under
        ``self.results_directory``.
        """
        data_prefix = self.directory + self.data_name

        train_ratings_loc = data_prefix + "_uv_train_" + user_vector + ".pkl"
        train_ratings = sl.load_from_hadoop(train_ratings_loc, self.sc).repartition(self.num_partitions)
        test_ratings_loc = data_prefix + "_uv_test_" + user_vector + ".pkl"
        test_ratings = sl.load_from_hadoop(test_ratings_loc, self.sc).repartition(self.num_partitions)

        content_path = data_prefix + "_cv_" + content_vector + ".pkl"
        content_vect = sl.load_from_hadoop(content_path, self.sc).repartition(self.num_partitions)

        stats = dataset_stats.get_dataset_stats(train_ratings, test_ratings)

        # Content-based file names carry the content vector; the others do not.
        is_cb = alg_type == "cb"
        if is_cb:
            run_tag = user_vector + "_" + content_vector + "_" + algorithm
        else:
            run_tag = user_vector + "_" + algorithm
        pred_save_loc = data_prefix + "_predictions_" + run_tag + ".pkl"
        results_path = (
            self.results_directory
            + self.data_name
            + "_results_"
            + run_tag
            + "_"
            + str(num_preds)
            + ".csv"
        )

        # Single-argument print(...) emits the same text under the Python 2
        # print statement and the Python 3 print function.
        print("Getting results for: " + pred_save_loc)
        preds = sl.load_from_hadoop(pred_save_loc, self.sc).repartition(self.num_partitions)

        results = performance_metrics.get_perform_metrics(
            test_ratings,
            train_ratings,
            preds,
            content_vect,
            self.sqlCtx,
            num_predictions=num_preds,
            num_partitions=self.num_partitions,
        )
        # Merge the stats (which do not change run to run) with the results
        results.update(stats)

        # Tag the record so it stays identifiable on its own. The CF_CB label
        # follows alg_type; it used to be "CB" unconditionally, which
        # mislabeled runs taken through the non-"cb" branch.
        results["N"] = num_preds
        results["dataset"] = self.data_name
        results["CF_CB"] = "CB" if is_cb else "CF"
        results["alg_type"] = algorithm
        results["user_vector"] = user_vector
        results["content_vector"] = content_vector
        print(results)

        # Save the results; the context manager closes the handle even if the
        # write fails.
        print(results_path)
        with open(results_path, "w") as f:
            f.write(str(results))

Example #9

0

Show file

File: hermes_run_script.py Project: agude/hermes

    def run_cb_results(self):
        """Score every saved CB prediction set and write one results file per run.

        Iterates ``self.content_vector_types`` x ``self.user_vector_types`` x
        ``self.cb_predictions``. ``"cb_kmeans_100"`` and ``"cb_kmeans_1000"``
        are scored only with 100 and 1000 predictions respectively; all other
        algorithms are scored for each value in ``self.results_runs``. Every
        merged metrics/statistics record is printed and written as
        ``str(results)`` to a ``.csv``-suffixed file under
        ``self.results_directory``.
        """
        # Fixed prediction counts for the k-means prediction sets.
        kmeans_runs = {"cb_kmeans_100": [100], "cb_kmeans_1000": [1000]}

        for cv in self.content_vector_types:
            data_prefix = self.directory + self.data_name
            content_path = data_prefix + "_cv_" + cv + ".pkl"
            content_vect = sl.load_from_hadoop(content_path, self.sc)

            for uv in self.user_vector_types:
                train_ratings_loc = data_prefix + "_uv_train_" + uv + ".pkl"
                train_ratings = sl.load_from_hadoop(train_ratings_loc, self.sc).repartition(self.num_partitions)
                test_ratings_loc = data_prefix + "_uv_test_" + uv + ".pkl"
                test_ratings = sl.load_from_hadoop(test_ratings_loc, self.sc).repartition(self.num_partitions)

                # Calculate statistics about the dataset once per user vector.
                stats = dataset_stats.get_dataset_stats(train_ratings, test_ratings)

                for cb_pred in self.cb_predictions:
                    pred_save_loc = (
                        data_prefix + "_predictions_" + uv + "_" + cv + "_" + cb_pred + ".pkl"
                    )
                    # Single-argument print(...) emits the same text under the
                    # Python 2 print statement and the Python 3 print function.
                    print("Getting results for: " + pred_save_loc)
                    preds = sl.load_from_hadoop(pred_save_loc, self.sc).repartition(self.num_partitions)

                    # If we ran the kmeans we do not need to complete every
                    # run, only the one matching its size; otherwise we do.
                    if cb_pred in kmeans_runs:
                        runs = kmeans_runs[cb_pred]
                    else:
                        runs = self.results_runs

                    for run in runs:
                        results = performance_metrics.get_perform_metrics(
                            test_ratings,
                            train_ratings,
                            preds,
                            content_vect,
                            self.sqlCtx,
                            num_predictions=run,
                            num_partitions=self.num_partitions,
                        )
                        # Merge the stats (which do not change run to run) with the results
                        results.update(stats)

                        # Tag the record so it stays identifiable on its own.
                        results["N"] = run
                        results["dataset"] = self.data_name
                        results["CF_CB"] = "CB"
                        results["alg_type"] = cb_pred
                        results["user_vector"] = uv
                        results["content_vector"] = cv
                        print(results)

                        # Save the results; the context manager closes the
                        # handle even if the write fails.
                        results_path = (
                            self.results_directory
                            + self.data_name
                            + "_results_"
                            + uv
                            + "_"
                            + cv
                            + "_"
                            + cb_pred
                            + "_"
                            + str(run)
                            + ".csv"
                        )
                        print(results_path)
                        with open(results_path, "w") as f:
                            f.write(str(results))
        print("All CB predictions results aquired")