Example no. 1
0
    def run_single_fold(self, parameters):
        """Run one cross-validation fold described by ``parameters``.

        ``parameters`` must contain a ``'fold'`` key with the zero-based
        fold index; all other key/value pairs are pushed into
        ``Constants`` before the run.

        Returns the metrics produced by ``self.evaluate()`` for this fold.
        """
        fold = parameters['fold']

        # Activate the requested configuration and log it.
        Constants.update_properties(parameters)
        Constants.print_properties()

        utilities.plant_seeds()
        self.load()

        num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
        # Fraction of the data used for training (e.g. 0.8 for 5 folds).
        split = 1 - (1 / float(num_folds))
        # Work on a private copy so the original records stay pristine.
        self.records = copy.deepcopy(self.original_records)
        if Constants.SHUFFLE_DATA:
            self.shuffle(self.records)

        fold_start = time.time()
        # Starting offset of this fold's test slice within the data.
        cv_start = float(fold) / num_folds
        print('\nFold: %d/%d' % ((fold + 1), num_folds))

        self.create_tmp_file_names(0, fold)
        self.train_records, self.test_records = \
            ETLUtils.split_train_test_copy(
                self.records, split=split, start=cv_start)
        self.get_records_to_predict(True)
        if Constants.USE_CONTEXT:
            if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
                # Topic model was trained separately; load cached topics.
                self.load_cache_context_topics(None, None)
            else:
                context_extractor = self.train_topic_model(0, fold)
                self.find_reviews_topics(context_extractor, 0, fold)
        else:
            self.context_rich_topics = []
        self.predict()
        metrics = self.evaluate()

        # NOTE: the original accumulated fold_time into an unused
        # total_cycle_time local; that dead accumulator was removed.
        fold_time = time.time() - fold_start
        self.clear()
        print("Total fold %d time = %f seconds" % ((fold + 1), fold_time))

        return metrics
Example no. 2
0
    def perform_cross_validation(self):
        """Run the full N-cycle, K-fold cross-validation experiment.

        Averages the evaluation metric over
        ``NUM_CYCLES * CROSS_VALIDATION_NUM_FOLDS`` runs and writes the
        summary to the CSV and JSON result files.
        """
        # Use the public accessor instead of reading the private
        # Constants._properties dict directly.
        Constants.print_properties()

        self.plant_seeds()

        total_metric = 0.0
        total_cycle_time = 0.0
        num_cycles = Constants.NUM_CYCLES
        num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
        total_iterations = num_cycles * num_folds
        # Fraction of the records used for training in each fold.
        split = 1 - (1 / float(num_folds))

        self.load()

        for i in range(num_cycles):

            print('\n\nCycle: %d/%d' % ((i + 1), num_cycles))

            if Constants.SHUFFLE_DATA:
                self.shuffle()
            # Restore a clean copy of the data at the start of each cycle.
            self.records = copy.deepcopy(self.original_records)

            for j in range(num_folds):

                fold_start = time.time()
                # Offset of the test slice for fold j.
                cv_start = float(j) / num_folds
                print('\nFold: %d/%d' % ((j + 1), num_folds))

                self.create_tmp_file_names()
                self.train_records, self.test_records =\
                    ETLUtils.split_train_test_copy(
                        self.records, split=split, start=cv_start)
                self.get_records_to_predict()
                if Constants.USE_CONTEXT:
                    lda_based_context = self.train_topic_model(i, j)
                    self.find_reviews_topics(lda_based_context)
                self.predict()
                metric = self.evaluate()
                total_metric += metric

                fold_time = time.time() - fold_start
                total_cycle_time += fold_time
                self.clear()
                print("Total fold %d time = %f seconds" % ((j + 1), fold_time))

        metric_average = total_metric / total_iterations
        average_cycle_time = total_cycle_time / total_iterations
        print('average rmse: %f' % metric_average)
        print('average cycle time: %f' % average_cycle_time)
        print('End: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        # get_properties_copy() already returns a copy, so no manual
        # deepcopy of the private _properties dict is needed.
        results = Constants.get_properties_copy()
        results[Constants.EVALUATION_METRIC] = metric_average
        results['cycle_time'] = average_cycle_time
        results['timestamp'] = time.strftime("%Y/%m/%d-%H:%M:%S")

        write_results_to_csv(results)
        write_results_to_json(results)
Example no. 3
0
    def perform_cross_validation(self):
        """Run the N-cycle, K-fold cross-validation recall experiment.

        Averages the overall, specific and generic recall over all
        cycle x fold iterations and appends one summary row to
        ``Constants.CSV_RESULTS_FILE``.
        """
        Constants.print_properties()

        utilities.plant_seeds()

        total_recall = 0.0
        total_specific_recall = 0.0
        total_generic_recall = 0.0
        total_cycle_time = 0.0
        num_cycles = Constants.NUM_CYCLES
        num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
        total_iterations = num_cycles * num_folds
        # Fraction of the data used for training in each fold.
        split = 1 - (1/float(num_folds))

        self.load()

        for i in range(num_cycles):

            print('\n\nCycle: %d/%d' % ((i+1), num_cycles))

            if Constants.SHUFFLE_DATA:
                self.shuffle()
            # Start every cycle from pristine copies of the data.
            self.records = copy.deepcopy(self.original_records)
            self.reviews = copy.deepcopy(self.original_reviews)

            for j in range(num_folds):

                fold_start = time.time()
                # Offset of the test slice for fold j.
                cv_start = float(j) / num_folds
                print('\nFold: %d/%d' % ((j+1), num_folds))

                self.create_tmp_file_names()
                self.train_records, self.test_records = \
                    ETLUtils.split_train_test_copy(
                        self.records, split=split, start=cv_start)
                self.train_reviews, self.test_reviews = \
                    ETLUtils.split_train_test_copy(
                        self.reviews, split=split, start=cv_start)
                self.export()
                if Constants.USE_CONTEXT:
                    lda_based_context = self.train_word_model()
                    self.find_reviews_topics(lda_based_context)
                self.prepare()
                self.predict()
                self.evaluate()
                total_recall += self.top_n_evaluator.recall
                total_specific_recall += self.top_n_evaluator.specific_recall
                total_generic_recall += self.top_n_evaluator.generic_recall

                fold_time = time.time() - fold_start
                total_cycle_time += fold_time
                self.clear()
                print("Total fold %d time = %f seconds" % ((j+1), fold_time))

        average_recall = total_recall / total_iterations
        average_specific_recall = total_specific_recall / total_iterations
        average_generic_recall = total_generic_recall / total_iterations
        average_cycle_time = total_cycle_time / total_iterations
        print('average recall: %f' % average_recall)
        print('average specific recall: %f' % average_specific_recall)
        print('average generic recall: %f' % average_generic_recall)
        print('average cycle time: %f' % average_cycle_time)
        print('End: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        results = Constants.get_properties_copy()
        results['recall'] = average_recall
        results['specific_recall'] = average_specific_recall
        results['generic_recall'] = average_generic_recall
        results['cycle_time'] = average_cycle_time
        results['timestamp'] = time.strftime("%Y/%m/%d-%H:%M:%S")

        # Open once in binary append mode ('ab', matching the Python 2
        # csv 'wb' convention used on creation) and write the header only
        # when the file is new.  The previous code created the file in
        # 'wb' but appended in text-mode 'a', which inserts blank rows on
        # Windows and duplicated the DictWriter setup.
        write_header = not os.path.exists(Constants.CSV_RESULTS_FILE)
        with open(Constants.CSV_RESULTS_FILE, 'ab') as f:
            w = csv.DictWriter(f, sorted(results.keys()))
            if write_header:
                w.writeheader()
            w.writerow(results)
Example no. 4
0
    def perform_cross_validation(self):
        """Run the N-cycle, K-fold cross-validation experiment.

        Averages the overall, specific and generic values of the
        configured evaluation metric over all cycle x fold iterations
        and writes the summary to the CSV and JSON result files.
        """
        # Use the public printer rather than reading the private
        # Constants._properties dict directly.
        Constants.print_properties()

        self.plant_seeds()

        total_metric = 0.0
        total_specific_metric = 0.0
        total_generic_metric = 0.0
        total_cycle_time = 0.0
        num_cycles = Constants.NUM_CYCLES
        num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
        total_iterations = num_cycles * num_folds
        # Fraction of the records used for training in each fold.
        split = 1 - (1/float(num_folds))

        self.load()

        for i in range(num_cycles):

            print('\n\nCycle: %d/%d' % ((i+1), num_cycles))

            if Constants.SHUFFLE_DATA:
                self.shuffle()
            # Restore a pristine copy of the data for every cycle.
            self.records = copy.deepcopy(self.original_records)

            for j in range(num_folds):

                fold_start = time.time()
                # Offset of the test slice for fold j.
                cv_start = float(j) / num_folds
                print('\nFold: %d/%d' % ((j+1), num_folds))

                self.create_tmp_file_names()
                self.train_records, self.test_records =\
                    ETLUtils.split_train_test_copy(
                        self.records, split=split, start=cv_start)
                self.get_records_to_predict()
                if Constants.USE_CONTEXT:
                    lda_based_context = self.train_topic_model(i, j)
                    self.find_reviews_topics(lda_based_context)
                self.predict()
                metric, specific_metric, generic_metric = self.evaluate()
                total_metric += metric
                total_specific_metric += specific_metric
                total_generic_metric += generic_metric

                fold_time = time.time() - fold_start
                total_cycle_time += fold_time
                self.clear()
                print("Total fold %d time = %f seconds" % ((j+1), fold_time))

        metric_name = Constants.EVALUATION_METRIC
        metric_average = total_metric / total_iterations
        average_specific_metric = total_specific_metric / total_iterations
        average_generic_metric = total_generic_metric / total_iterations
        average_cycle_time = total_cycle_time / total_iterations
        print('average %s: %f' % (metric_name, metric_average))
        print(
            'average specific %s: %f' % (metric_name, average_specific_metric))
        print('average generic %s: %f' % (metric_name, average_generic_metric))
        print('average cycle time: %f' % average_cycle_time)
        print('End: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        # get_properties_copy() already returns a copy, so no manual
        # deepcopy of the private _properties dict is needed.
        results = Constants.get_properties_copy()
        results[Constants.EVALUATION_METRIC] = metric_average
        results['specific_' + metric_name] = average_specific_metric
        results['generic_' + metric_name] = average_generic_metric
        results['cycle_time'] = average_cycle_time
        results['timestamp'] = time.strftime("%Y/%m/%d-%H:%M:%S")

        write_results_to_csv(results)
        write_results_to_json(results)
Example no. 5
0
    def perform_cross_validation(self, records):
        """Cross-validate over ``records`` and summarize the metrics.

        Runs ``NUM_CYCLES`` cycles of ``CROSS_VALIDATION_NUM_FOLDS``
        folds each, collects every fold's metrics dict, writes the
        summary to the CSV/JSON result files and returns it.
        """
        Constants.print_properties()

        num_cycles = Constants.NUM_CYCLES
        num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
        total_iterations = num_cycles * num_folds
        split = 1 - (1 / float(num_folds))
        metric_name = Constants.EVALUATION_METRIC
        metrics_list = []
        total_cycle_time = 0.0

        for cycle in range(num_cycles):

            print('\n\nCycle: %d/%d' % ((cycle + 1), num_cycles))

            # Each cycle works on its own copy of the input records.
            self.records = copy.deepcopy(records)
            if Constants.SHUFFLE_DATA:
                self.shuffle(self.records)

            for fold in range(num_folds):

                fold_start = time.time()
                cv_start = float(fold) / num_folds
                print('\nFold: %d/%d' % ((fold + 1), num_folds))

                self.create_tmp_file_names(cycle, fold)
                train_test = ETLUtils.split_train_test_copy(
                    self.records, split=split, start=cv_start)
                self.train_records, self.test_records = train_test
                self.get_records_to_predict(True)
                if not Constants.USE_CONTEXT:
                    self.context_rich_topics = []
                elif Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
                    self.load_cache_context_topics(None, None)
                else:
                    context_extractor = self.train_topic_model(cycle, fold)
                    self.find_reviews_topics(context_extractor, cycle, fold)
                self.predict()
                fold_metrics = self.evaluate()

                metrics_list.append(fold_metrics)
                running_mean = numpy.mean(
                    [m[metric_name] for m in metrics_list])
                print('Accumulated %s: %f' % (metric_name, running_mean))

                fold_time = time.time() - fold_start
                total_cycle_time += fold_time
                self.clear()
                print("Total fold %d time = %f seconds" %
                      ((fold + 1), fold_time))

        results = self.summarize_results(metrics_list)

        average_cycle_time = total_cycle_time / total_iterations
        results['cycle_time'] = average_cycle_time
        print('average cycle time: %f' % average_cycle_time)

        write_results_to_csv(results)
        write_results_to_json(results)

        return results