Beispiel #1
0
def train_and_save(classifier_list, classifier_name_list, training_data):
    """Train the given classifiers, persist the outcome, and return it.

    Training is delegated to ``train_multiple_classifiers``. Its return value
    is stored at ``RESULTS_PATH`` via ``util.save_object``, and the
    classifiers are then written under ``CLASSIFIERS_AND_RESULTS_DIR_PATH``
    via ``util.save_classifier_list``.

    Args:
        classifier_list: Classifiers forwarded to the trainer and then saved.
        classifier_name_list: Names forwarded alongside ``classifier_list``.
        training_data: Data object forwarded to the trainer unchanged.

    Returns:
        Whatever ``train_multiple_classifiers`` returned.
    """
    training_results = train_multiple_classifiers(
        classifier_list,
        classifier_name_list,
        training_data,
    )

    # Results are written first, then the classifiers themselves.
    util.save_object(training_results, RESULTS_PATH)
    util.save_classifier_list(
        classifier_list,
        classifier_name_list,
        CLASSIFIERS_AND_RESULTS_DIR_PATH,
    )
    return training_results
Beispiel #2
0
def train_and_save(classifier_list, classifier_name_list, training_data):
    """Train the given classifiers and record the outcome in global ``results``.

    The value returned by ``train_multiple_classifiers`` is appended to the
    module-level ``results`` container. The whole container is then stored at
    ``RESULTS_PATH`` via ``util.save_object``, and the classifiers are written
    under ``CLASSIFIERS_AND_RESULTS_DIR_PATH`` via
    ``util.save_classifier_list``.

    Args:
        classifier_list: Classifiers forwarded to the trainer and then saved.
        classifier_name_list: Names forwarded alongside ``classifier_list``.
        training_data: Data object forwarded to the trainer unchanged.

    Returns:
        None. The outcome is only available through the global ``results``.
    """
    global results

    # Look the global up before training starts, so a missing ``results``
    # fails immediately rather than after a full training run.
    record_outcome = results.append
    record_outcome(
        train_multiple_classifiers(
            classifier_list,
            classifier_name_list,
            training_data,
        )
    )

    # Persist the accumulated results, then the classifiers themselves.
    util.save_object(results, RESULTS_PATH)
    util.save_classifier_list(
        classifier_list,
        classifier_name_list,
        CLASSIFIERS_AND_RESULTS_DIR_PATH,
    )
Beispiel #3
0
 def run_experiment(self):
     """Run one experiment: load, split, tokenize, featurize, train, display.

     Every intermediate product is stored on ``self`` so later steps (and
     callers) can read it:

     * ``data_df`` / ``avg_dataset_length`` -- the dataset selected by
       ``self.dataset_enum`` and the value of ``get_dataset_avg_length``.
     * ``train_corpus`` / ``test_corpus`` / ``train_label_names`` /
       ``test_label_names`` -- the ``train_test_split`` of the
       ``'Clean Article'`` and ``'Target Name'`` columns.
     * ``tokenized_train`` / ``tokenized_test`` -- each corpus entry passed
       through ``tn.tokenizer.tokenize``.
     * ``vocabulary`` -- the set of space-separated words over all articles.
     * ``train_features`` / ``test_features`` -- output of
       ``self.get_features()``.
     * ``training_data`` / ``results`` -- the packed ``TrainingData`` and the
       value returned by ``train_multiple_classifiers``.

     Returns:
         None. Results are shown through ``self.display_results()``.
     """
     print('Running', self.exp_name, ', it.', self.classifier_iter)

     # Load dataset
     self.data_df = self.get_dataset_from_name(self.dataset_enum)
     self.avg_dataset_length = get_dataset_avg_length(self.data_df)
     print('Got dataset:', self.dataset_enum, 'num of cat.',
           self.get_num_of_categories())

     # Split into train and test sets; the fixed random_state keeps the
     # split identical between runs.
     self.train_corpus, self.test_corpus, self.train_label_names, \
     self.test_label_names = \
         train_test_split(np.array(self.data_df['Clean Article']),
                          np.array(self.data_df['Target Name']),
                          test_size=self.TEST_SET_SIZE_RATIO, random_state=42)

     # Tokenize both corpora
     self.tokenized_train = [
         tn.tokenizer.tokenize(text) for text in self.train_corpus
     ]
     self.tokenized_test = [
         tn.tokenizer.tokenize(text) for text in self.test_corpus
     ]

     # Build the vocabulary one article at a time. Concatenating all
     # articles with an empty separator before splitting would glue the last
     # word of one article to the first word of the next, polluting the
     # vocabulary with words that never occur in the data.
     self.vocabulary = {
         word
         for article in self.data_df['Clean Article']
         for word in article.split(' ')
     }

     # Calculate features
     self.train_features, self.test_features = self.get_features()
     print('Train features shape:', self.train_features.shape,
           ' Test features shape:', self.test_features.shape)

     # Pack features and labels into one object for the trainer
     self.training_data = TrainingData(self.train_features,
                                       self.train_label_names,
                                       self.test_features,
                                       self.test_label_names)

     # Perform actual training
     self.results = train_multiple_classifiers(
         self.classifier_list, self.classifier_name_list,
         self.training_data, self.CLASSIFIERS_AND_RESULTS_DIR_PATH,
         self.classifier_iter, self.RESULTS_PATH)

     # Show the results of this run
     self.display_results()
Beispiel #4
0
    np.array(data_df['Target Name']),
    test_size=TEST_SET_SIZE_RATIO,
    random_state=42)

# Turn the raw train/test corpora into feature matrices.
cv_train_features, cv_test_features = get_simple_bag_of_words_features(
    train_corpus, test_corpus)
# Pack features and label names into one object for the trainer.
training_data = TrainingData(cv_train_features, train_label_names,
                             cv_test_features, test_label_names)

# Get the classifier definitions: the classifiers, their full names, and the
# short names that are later passed to the plotting call.
classifier_list, classifier_name_list, classifier_name_shortcut_list = \
    get_chosen_classifiers()

# Train every classifier.
# NOTE(review): the original comment said "train and save on disk"; whether
# anything is persisted depends on train_multiple_classifiers -- confirm.
results = train_multiple_classifiers(classifier_list, classifier_name_list,
                                     training_data)
# Alternative kept for reference: load previously saved classifiers and
# results instead of retraining.
# classifier_list = util.load_classifier_list(classifier_name_list,
#                                             CLASSIFIERS_AND_RESULTS_DIR_PATH)
# results = util.load_object(RESULTS_PATH)

# create_cv_test_time_plots(results, classifier_name_shortcut_list)
# Pull the per-classifier numbers out of each result by position and round
# them for display: index 1 -> cv_mean_scores, index 2 -> test_scores,
# index 3 -> elapsed_times.
# NOTE(review): the layout of a result entry is defined by
# train_multiple_classifiers, which is not visible here -- confirm the indices.
cv_mean_scores = [round(result[1], SCORE_DECIMAL_PLACES) for result in results]
test_scores = [round(result[2], SCORE_DECIMAL_PLACES) for result in results]
elapsed_times = [round(result[3], TIME_DECIMAL_PLACES) for result in results]
# create_bar_plot(classifier_name_shortcut_list, 'Classifier scores', 'Accuracy',
#                 cv_mean_scores, y_range_tuple=(0, 1))
create_2_bar_plot(classifier_name_shortcut_list,
                  'Classifier scores',
                  'Accuracy',
                  cv_mean_scores,