def train_and_save(classifier_list, classifier_name_list, training_data):
    """Train all given classifiers, persist the outcome, and return it.

    Runs the full training pass first, then writes the aggregated results
    and each fitted classifier to disk, so nothing is persisted if
    training itself fails.
    """
    training_results = train_multiple_classifiers(
        classifier_list, classifier_name_list, training_data)
    # Persist aggregated scores/metadata and the trained models themselves.
    util.save_object(training_results, RESULTS_PATH)
    util.save_classifier_list(classifier_list, classifier_name_list,
                              CLASSIFIERS_AND_RESULTS_DIR_PATH)
    return training_results
def train_and_save(classifier_list, classifier_name_list, training_data):
    """Train all given classifiers, accumulate into the global ``results``,
    persist everything to disk, and return the accumulator.

    Side effect: appends one entry (the full output of
    ``train_multiple_classifiers``) to the module-level ``results`` list,
    which must already exist.

    NOTE(review): this shadows/duplicates another ``train_and_save``
    definition in this file; consider consolidating the two.
    """
    global results
    results.append(
        train_multiple_classifiers(classifier_list, classifier_name_list,
                                   training_data))
    util.save_object(results, RESULTS_PATH)
    util.save_classifier_list(classifier_list, classifier_name_list,
                              CLASSIFIERS_AND_RESULTS_DIR_PATH)
    # Return the accumulator for consistency with the sibling
    # implementation of train_and_save, which returns its results.
    return results
def run_experiment(self):
    """Run one end-to-end experiment for the configured dataset/classifiers.

    Loads the dataset, splits it into train/test corpora, tokenizes,
    builds the vocabulary, computes features, trains all classifiers, and
    displays the results. Populates many instance attributes as a side
    effect (``data_df``, corpora, tokenized corpora, ``vocabulary``,
    features, ``training_data``, ``results``).
    """
    print('Running', self.exp_name, ', it.', self.classifier_iter)

    # Load dataset
    self.data_df = self.get_dataset_from_name(self.dataset_enum)
    self.avg_dataset_length = get_dataset_avg_length(self.data_df)
    print('Got dataset:', self.dataset_enum,
          'num of cat.', self.get_num_of_categories())

    # Split on train and test dataset (fixed seed for reproducibility)
    self.train_corpus, self.test_corpus, self.train_label_names, \
        self.test_label_names = \
        train_test_split(np.array(self.data_df['Clean Article']),
                         np.array(self.data_df['Target Name']),
                         test_size=self.TEST_SET_SIZE_RATIO,
                         random_state=42)

    # Tokenize both corpora
    self.tokenized_train = [tn.tokenizer.tokenize(text)
                            for text in self.train_corpus]
    self.tokenized_test = [tn.tokenizer.tokenize(text)
                           for text in self.test_corpus]

    # Build the vocabulary from every article.
    # BUGFIX: join with ' ' — the previous ''.join(...) fused the last
    # word of each article with the first word of the next, injecting
    # spurious tokens into the vocabulary.
    data_word_list = ' '.join(self.data_df['Clean Article']).split(' ')
    self.vocabulary = set(data_word_list)

    # Calculate features
    self.train_features, self.test_features = self.get_features()
    print('Train features shape:', self.train_features.shape,
          ' Test features shape:', self.test_features.shape)

    # Pack data in one class
    self.training_data = TrainingData(self.train_features,
                                      self.train_label_names,
                                      self.test_features,
                                      self.test_label_names)

    # Perform actual training
    self.results = train_multiple_classifiers(
        self.classifier_list, self.classifier_name_list, self.training_data,
        self.CLASSIFIERS_AND_RESULTS_DIR_PATH, self.classifier_iter,
        self.RESULTS_PATH)

    # Extract scores for plotting
    self.display_results()
np.array(data_df['Target Name']), test_size=TEST_SET_SIZE_RATIO, random_state=42) cv_train_features, cv_test_features = get_simple_bag_of_words_features( train_corpus, test_corpus) # # pack data in one class training_data = TrainingData(cv_train_features, train_label_names, cv_test_features, test_label_names) # # Get classifier definitions classifier_list, classifier_name_list, classifier_name_shortcut_list = \ get_chosen_classifiers() # Train and save on disk results = train_multiple_classifiers(classifier_list, classifier_name_list, training_data) # # Load from disk # classifier_list = util.load_classifier_list(classifier_name_list, # CLASSIFIERS_AND_RESULTS_DIR_PATH) # results = util.load_object(RESULTS_PATH) # create_cv_test_time_plots(results, classifier_name_shortcut_list) cv_mean_scores = [round(result[1], SCORE_DECIMAL_PLACES) for result in results] test_scores = [round(result[2], SCORE_DECIMAL_PLACES) for result in results] elapsed_times = [round(result[3], TIME_DECIMAL_PLACES) for result in results] # create_bar_plot(classifier_name_shortcut_list, 'Classifier scores', 'Accuracy', # cv_mean_scores, y_range_tuple=(0, 1)) create_2_bar_plot(classifier_name_shortcut_list, 'Classifier scores', 'Accuracy', cv_mean_scores,