def run_single_fold(self, parameters):
    """Run a single cross-validation fold and return its evaluation metrics.

    :param parameters: dict with fold configuration; must contain the key
        ``'fold'`` (zero-based fold index). The whole dict is pushed into
        ``Constants`` via ``update_properties`` before the run.
    :return: the metrics object produced by ``self.evaluate()``

    Fix applied: removed the unused ``total_cycle_time`` accumulator (it was
    incremented but never read in this single-fold variant) and deleted stale
    commented-out code.
    """
    fold = parameters['fold']
    Constants.update_properties(parameters)
    Constants.print_properties()
    utilities.plant_seeds()

    self.load()
    records = self.original_records

    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    # Train on (num_folds - 1) / num_folds of the data, test on the rest.
    split = 1 - (1 / float(num_folds))
    # Work on a deep copy so shuffling never mutates the original records.
    self.records = copy.deepcopy(records)
    if Constants.SHUFFLE_DATA:
        self.shuffle(self.records)

    fold_start = time.time()
    # Offset of this fold's test window within the (cyclic) record list.
    cv_start = float(fold) / num_folds
    print('\nFold: %d/%d' % ((fold + 1), num_folds))
    self.create_tmp_file_names(0, fold)
    self.train_records, self.test_records = \
        ETLUtils.split_train_test_copy(
            self.records, split=split, start=cv_start)
    self.get_records_to_predict(True)

    if Constants.USE_CONTEXT:
        if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
            # Topic model was trained on a separate review set; load it.
            self.load_cache_context_topics(None, None)
        else:
            context_extractor = self.train_topic_model(0, fold)
            self.find_reviews_topics(context_extractor, 0, fold)
    else:
        self.context_rich_topics = []

    self.predict()
    metrics = self.evaluate()

    fold_time = time.time() - fold_start
    self.clear()
    print("Total fold %d time = %f seconds" % ((fold + 1), fold_time))

    return metrics
def perform_cross_validation(self):
    """Run ``NUM_CYCLES`` x ``CROSS_VALIDATION_NUM_FOLDS`` cross-validation
    and export the averaged evaluation metric to CSV and JSON.

    Side effects: mutates ``self.records`` / ``self.train_records`` /
    ``self.test_records``, and writes result files via
    ``write_results_to_csv`` / ``write_results_to_json``.

    Bug fix: the assignment ``results = copy.deepcopy(Constants._properties)``
    had been commented out while ``results[...]`` was still written to below,
    which raised ``NameError`` at runtime. The initialization is restored.
    """
    print(Constants._properties)
    self.plant_seeds()

    total_metric = 0.0
    total_cycle_time = 0.0
    num_cycles = Constants.NUM_CYCLES
    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    total_iterations = num_cycles * num_folds
    # Train on (num_folds - 1) / num_folds of the data, test on the rest.
    split = 1 - (1 / float(num_folds))
    self.load()

    for i in range(num_cycles):
        print('\n\nCycle: %d/%d' % ((i + 1), num_cycles))

        if Constants.SHUFFLE_DATA:
            self.shuffle()
        self.records = copy.deepcopy(self.original_records)

        for j in range(num_folds):
            fold_start = time.time()
            # Offset of this fold's test window within the record list.
            cv_start = float(j) / num_folds

            print('\nFold: %d/%d' % ((j + 1), num_folds))
            self.create_tmp_file_names()
            self.train_records, self.test_records = \
                ETLUtils.split_train_test_copy(
                    self.records, split=split, start=cv_start)
            self.get_records_to_predict()
            if Constants.USE_CONTEXT:
                lda_based_context = self.train_topic_model(i, j)
                self.find_reviews_topics(lda_based_context)
            self.predict()
            metric = self.evaluate()
            total_metric += metric

            fold_end = time.time()
            fold_time = fold_end - fold_start
            total_cycle_time += fold_time
            self.clear()
            print("Total fold %d time = %f seconds" % ((j + 1), fold_time))

    metric_average = total_metric / total_iterations
    # NOTE(review): despite the name, this is the average *fold* time
    # (total time divided by the number of folds run), kept for
    # consistency with the existing output format.
    average_cycle_time = total_cycle_time / total_iterations
    print('average rmse: %f' % metric_average)
    print('average cycle time: %f' % average_cycle_time)
    print('End: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    # Restored: without this line `results` is undefined (NameError).
    results = copy.deepcopy(Constants._properties)
    results[Constants.EVALUATION_METRIC] = metric_average
    results['cycle_time'] = average_cycle_time
    results['timestamp'] = time.strftime("%Y/%m/%d-%H:%M:%S")

    write_results_to_csv(results)
    write_results_to_json(results)
def perform_cross_validation(self):
    """Run repeated cross-validation, reporting recall (overall, specific
    and generic) averaged over every cycle/fold, then append the results
    to the CSV file at ``Constants.CSV_RESULTS_FILE``.

    Bug fix: the results file was opened in binary mode (``'wb'``) while
    ``csv.DictWriter`` writes ``str`` — under Python 3 this raises
    ``TypeError``. The csv module requires text mode with ``newline=''``
    (which also prevents blank interleaved rows on Windows); both the
    create and append branches are fixed accordingly.
    """
    Constants.print_properties()
    utilities.plant_seeds()

    total_recall = 0.0
    total_specific_recall = 0.0
    total_generic_recall = 0.0
    total_cycle_time = 0.0
    num_cycles = Constants.NUM_CYCLES
    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    total_iterations = num_cycles * num_folds
    # Train on (num_folds - 1) / num_folds of the data, test on the rest.
    split = 1 - (1 / float(num_folds))
    self.load()

    for i in range(num_cycles):
        print('\n\nCycle: %d/%d' % ((i + 1), num_cycles))

        if Constants.SHUFFLE_DATA:
            self.shuffle()
        self.records = copy.deepcopy(self.original_records)
        self.reviews = copy.deepcopy(self.original_reviews)

        for j in range(num_folds):
            fold_start = time.time()
            # Offset of this fold's test window within the record list.
            cv_start = float(j) / num_folds

            print('\nFold: %d/%d' % ((j + 1), num_folds))
            self.create_tmp_file_names()
            self.train_records, self.test_records = \
                ETLUtils.split_train_test_copy(
                    self.records, split=split, start=cv_start)
            self.train_reviews, self.test_reviews = \
                ETLUtils.split_train_test_copy(
                    self.reviews, split=split, start=cv_start)
            self.export()
            if Constants.USE_CONTEXT:
                lda_based_context = self.train_word_model()
                self.find_reviews_topics(lda_based_context)
            self.prepare()
            self.predict()
            self.evaluate()
            recall = self.top_n_evaluator.recall
            specific_recall = self.top_n_evaluator.specific_recall
            generic_recall = self.top_n_evaluator.generic_recall
            total_recall += recall
            total_specific_recall += specific_recall
            total_generic_recall += generic_recall

            fold_end = time.time()
            fold_time = fold_end - fold_start
            total_cycle_time += fold_time
            self.clear()
            print("Total fold %d time = %f seconds" % ((j + 1), fold_time))

    average_recall = total_recall / total_iterations
    average_specific_recall = total_specific_recall / total_iterations
    average_generic_recall = total_generic_recall / total_iterations
    # NOTE(review): despite the name, this is the average *fold* time.
    average_cycle_time = total_cycle_time / total_iterations
    print('average recall: %f' % average_recall)
    print('average specific recall: %f' % average_specific_recall)
    print('average generic recall: %f' % average_generic_recall)
    print('average cycle time: %f' % average_cycle_time)
    print('End: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    results = Constants.get_properties_copy()
    results['recall'] = average_recall
    results['specific_recall'] = average_specific_recall
    results['generic_recall'] = average_generic_recall
    results['cycle_time'] = average_cycle_time
    results['timestamp'] = time.strftime("%Y/%m/%d-%H:%M:%S")

    # csv.DictWriter needs a text-mode file opened with newline=''
    # (was 'wb', which breaks under Python 3).
    if not os.path.exists(Constants.CSV_RESULTS_FILE):
        with open(Constants.CSV_RESULTS_FILE, 'w', newline='') as f:
            w = csv.DictWriter(f, sorted(results.keys()))
            w.writeheader()
            w.writerow(results)
    else:
        with open(Constants.CSV_RESULTS_FILE, 'a', newline='') as f:
            w = csv.DictWriter(f, sorted(results.keys()))
            w.writerow(results)
def perform_cross_validation(self):
    """Run repeated cross-validation, averaging the configured evaluation
    metric (overall, specific and generic variants) over all cycles and
    folds, then export the results to CSV and JSON.

    Bug fix: the assignment ``results = copy.deepcopy(Constants._properties)``
    had been commented out while ``results[...]`` was still written to below,
    which raised ``NameError`` at runtime. The initialization is restored.
    """
    print(Constants._properties)
    self.plant_seeds()

    total_metric = 0.0
    total_specific_metric = 0.0
    total_generic_metric = 0.0
    total_cycle_time = 0.0
    num_cycles = Constants.NUM_CYCLES
    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    total_iterations = num_cycles * num_folds
    # Train on (num_folds - 1) / num_folds of the data, test on the rest.
    split = 1 - (1 / float(num_folds))
    self.load()

    for i in range(num_cycles):
        print('\n\nCycle: %d/%d' % ((i + 1), num_cycles))

        if Constants.SHUFFLE_DATA:
            self.shuffle()
        self.records = copy.deepcopy(self.original_records)

        for j in range(num_folds):
            fold_start = time.time()
            # Offset of this fold's test window within the record list.
            cv_start = float(j) / num_folds

            print('\nFold: %d/%d' % ((j + 1), num_folds))
            self.create_tmp_file_names()
            self.train_records, self.test_records = \
                ETLUtils.split_train_test_copy(
                    self.records, split=split, start=cv_start)
            self.get_records_to_predict()
            if Constants.USE_CONTEXT:
                lda_based_context = self.train_topic_model(i, j)
                self.find_reviews_topics(lda_based_context)
            self.predict()
            metric, specific_metric, generic_metric = self.evaluate()
            total_metric += metric
            total_specific_metric += specific_metric
            total_generic_metric += generic_metric

            fold_end = time.time()
            fold_time = fold_end - fold_start
            total_cycle_time += fold_time
            self.clear()
            print("Total fold %d time = %f seconds" % ((j + 1), fold_time))

    metric_name = Constants.EVALUATION_METRIC
    metric_average = total_metric / total_iterations
    average_specific_metric = total_specific_metric / total_iterations
    average_generic_metric = total_generic_metric / total_iterations
    # NOTE(review): despite the name, this is the average *fold* time.
    average_cycle_time = total_cycle_time / total_iterations
    print('average %s: %f' % (metric_name, metric_average))
    print(
        'average specific %s: %f' % (metric_name, average_specific_metric))
    print('average generic %s: %f' % (metric_name, average_generic_metric))
    print('average cycle time: %f' % average_cycle_time)
    print('End: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    # Restored: without this line `results` is undefined (NameError).
    results = copy.deepcopy(Constants._properties)
    results[Constants.EVALUATION_METRIC] = metric_average
    results['specific_' + metric_name] = average_specific_metric
    results['generic_' + metric_name] = average_generic_metric
    results['cycle_time'] = average_cycle_time
    results['timestamp'] = time.strftime("%Y/%m/%d-%H:%M:%S")

    write_results_to_csv(results)
    write_results_to_json(results)
def perform_cross_validation(self, records):
    """Evaluate the recommender with repeated k-fold cross-validation.

    :param records: the full list of records; a fresh deep copy is taken
        at the start of each cycle so shuffling never mutates the caller's
        data.
    :return: the summary dict produced by ``self.summarize_results`` over
        all per-fold metrics, with the average fold wall-clock time added
        under ``'cycle_time'``. The same dict is written to CSV and JSON.
    """
    Constants.print_properties()

    num_cycles = Constants.NUM_CYCLES
    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    metric_name = Constants.EVALUATION_METRIC
    # Train on (num_folds - 1) / num_folds of the data, test on the rest.
    train_fraction = 1 - (1 / float(num_folds))
    fold_metrics = []
    elapsed_total = 0.0

    for cycle in range(num_cycles):
        print('\n\nCycle: %d/%d' % ((cycle + 1), num_cycles))

        self.records = copy.deepcopy(records)
        if Constants.SHUFFLE_DATA:
            self.shuffle(self.records)

        for fold in range(num_folds):
            started = time.time()
            # Offset of this fold's test window within the record list.
            test_offset = float(fold) / num_folds

            print('\nFold: %d/%d' % ((fold + 1), num_folds))
            self.create_tmp_file_names(cycle, fold)
            self.train_records, self.test_records = \
                ETLUtils.split_train_test_copy(
                    self.records, split=train_fraction, start=test_offset)
            self.get_records_to_predict(True)

            if Constants.USE_CONTEXT:
                if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
                    # Topic model trained on a separate review set; load it.
                    self.load_cache_context_topics(None, None)
                else:
                    context_extractor = \
                        self.train_topic_model(cycle, fold)
                    self.find_reviews_topics(
                        context_extractor, cycle, fold)
            else:
                self.context_rich_topics = []

            self.predict()
            fold_metrics.append(self.evaluate())
            print('Accumulated %s: %f' % (metric_name, numpy.mean(
                [k[metric_name] for k in fold_metrics])))

            fold_elapsed = time.time() - started
            elapsed_total += fold_elapsed
            self.clear()
            print("Total fold %d time = %f seconds" %
                  ((fold + 1), fold_elapsed))

    results = self.summarize_results(fold_metrics)

    average_cycle_time = elapsed_total / (num_cycles * num_folds)
    results['cycle_time'] = average_cycle_time
    print('average cycle time: %f' % average_cycle_time)

    write_results_to_csv(results)
    write_results_to_json(results)

    return results