def run_number_classifier(rows=-1):
    """Run the number classification pipeline end to end.

    Loads the data via ``data_retriever.load_mnist``, splits it into
    training and test sets, fits a ``BitmapFeatureExtractor`` on the
    training data, trains a classifier on the extracted features and
    validates it against the test set. Progress is printed at each stage.

    Args:
        rows: Row limit passed to ``data_retriever.load_mnist``. The default
            of -1 means retrieving the complete set; pass a lower value
            (e.g. 5000) for faster training when testing.
    """
    print('-- Executing number classification')

    print('Loading data...')
    data, labels = data_retriever.load_mnist(rows)

    print('Splitting data...')
    # NOTE(review): run_spam_filter unpacks the result of this same helper as
    # (training_data, test_data, training_labels, test_labels). The order
    # used here differs -- confirm against split_and_shuffle_data_set which
    # of the two is correct.
    training_data, training_labels, test_data, test_labels = (
        split_and_shuffle_data_set(data, labels)
    )

    print('Extracting features...')
    extractor = BitmapFeatureExtractor()
    # Fit on the training data only; the test data is transformed with the
    # already-fitted extractor.
    extractor.fit(training_data)
    training_features = extractor.transform(training_data)
    test_features = extractor.transform(test_data)

    print('Training classifier...')
    classifier = train_classifier(training_features, training_labels)

    print('Testing classifier...')
    validate_model(
        classifier, test_data, test_features, test_labels, bitmap=True
    )
def run_spam_filter(row_count=-1):
    """Run the SMS spam filter pipeline end to end.

    Loads the data via ``data_retriever.load_sms`` (with caching disabled),
    splits it into training and test sets, fits a ``FeatureExtractor`` on
    the training data, trains a classifier on the extracted features and
    validates it against the test set.

    Args:
        row_count: Row limit passed to ``data_retriever.load_sms`` as
            ``rows``. Defaults to -1; set it to some number below 2000 if
            you are having performance problems.
    """
    print('-- Executing spam filter')
    print('-- Loading data')
    data, labels = data_retriever.load_sms(cache_data=False, rows=row_count)

    # Randomize and split the data.
    # NOTE(review): run_number_classifier unpacks the result of this same
    # helper as (training_data, training_labels, test_data, test_labels).
    # The order used here differs -- confirm against
    # split_and_shuffle_data_set which of the two is correct.
    training_data, test_data, training_labels, test_labels = (
        split_and_shuffle_data_set(data, labels)
    )

    # Fit the transformer on the training data only.
    extractor = FeatureExtractor()
    extractor.fit(training_data)

    # Extract the features from the training data.
    training_features = extractor.transform(training_data)

    # Train the classifier.
    classifier = train_classifier(training_features, training_labels)

    # Extract the test features and generate the classification report.
    test_features = extractor.transform(test_data)
    validate_model(classifier, test_data, test_features, test_labels)