def main(): """ Main entry point / top-level execution here """ labels_dict = extract_labels() raw_unlabeled_test_set_dict = read_unlabeled_test_set() processed_unlabeled_test_set = preprocess(None, raw_unlabeled_test_set_dict) raw_training_set_dict = read_training_set() processed_training_set = preprocess(labels_dict, raw_training_set_dict) split = len(processed_training_set) // CORPUS_SPLIT testing = dict(list(processed_training_set.items())[:split]) # 1/3 training = dict(list(processed_training_set.items())[split:]) # 2/3 d_print('training and testing set sizes', str(len(training)), str(len(testing)), source='main') # TODO: INSTANTIATE YOUR CLASSIFIER AND ADD IT TO THE DICT nb = naive_bayesian.NaiveBayesianClassifier() svm_clf = svm.SVMClassifier() dt = decision_tree.DecisionTreeClassifier() classifiers = {'Naive Bayesian': nb, 'SVM': svm_clf, 'Decision Tree': dt} train(classifiers, training) classify(classifiers, testing, processed_unlabeled_test_set)
def classify_all(self, emails):
    start = timer()
    ret = {}
    for key, email in emails.items():
        ret[key] = self.classify(email)[0]
    end = timer()
    d_print("Classification done, t = " + str(end - start), source="SVM")
    return ret
def extract_labels():
    """
    Extract labels.txt into a dictionary and cache the result as a JSON file.
    If labels.json already exists, read from that file instead.
    """
    if os.path.isfile('./labels.json'):
        d_print('Reading labels from the local JSON cache',
                source='extract_labels')
        with open('labels.json') as labels_json:
            return json.load(labels_json)
    else:
        d_print('Generating a local JSON cache of labels',
                source='extract_labels')
        with open('labels.txt') as labels_txt:
            labels_txt_lines = labels_txt.readlines()
        # Each line is "<label> <filename>"; key the dict on the filename.
        labels_dict = dict((label.split()[1], label.split()[0])
                           for label in labels_txt_lines)
        with open('labels.json', 'w') as labels_json:
            json.dump(labels_dict, labels_json)
        return labels_dict
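# Assumed labels.txt layout (one "<label> <filename>" pair per line; the
# exact filenames here are illustrative, and '0' is taken to mean spam given
# how classify() counts spam verdicts below), e.g.:
#
#     0 TRAIN_00000.eml
#     1 TRAIN_00001.eml
#
# which extract_labels() turns into {'TRAIN_00000.eml': '0', ...}.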
def read_unlabeled_test_set():
    """
    Read all raw test files from the directory ./TESTING into a
    dictionary, { filename: content ... }, cached as a JSON file.
    """
    if os.path.isfile('./unlabeled_test_set.json'):
        d_print('Reading unlabeled test set from the local JSON cache',
                source='read_unlabeled_test_set')
        with open('unlabeled_test_set.json') as unlabeled_test_set:
            return json.load(unlabeled_test_set)
    else:
        d_print('Generating unlabeled test set from EML files',
                source='read_unlabeled_test_set')
        test_files = os.listdir('TESTING')
        test_files_dict = {}
        for file_name in test_files:
            with open('TESTING/' + file_name, 'r', encoding='utf-8',
                      errors='ignore') as eml_file:
                test_files_dict[file_name] = eml_file.read()
        with open('unlabeled_test_set.json', 'w') as unlabeled_test_set:
            json.dump(test_files_dict, unlabeled_test_set)
        return test_files_dict
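# read_training_set() is called from main() but not shown in this section; a
# minimal sketch, assuming it mirrors read_unlabeled_test_set() over the
# ./TRAINING directory (the 'training_set.json' cache name is an assumption):
def read_training_set():
    if os.path.isfile('./training_set.json'):
        with open('training_set.json') as training_set:
            return json.load(training_set)
    training_files_dict = {}
    for file_name in os.listdir('TRAINING'):
        with open('TRAINING/' + file_name, 'r', encoding='utf-8',
                  errors='ignore') as eml_file:
            training_files_dict[file_name] = eml_file.read()
    with open('training_set.json', 'w') as training_set:
        json.dump(training_files_dict, training_set)
    return training_files_dict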
def train(classifiers, training_set):
    """ Calls training routines of the classifiers. """
    for cls_name in classifiers:
        if cls_name in EXCLUSION_LIST_FOR_LIVE_DEMO:
            d_print(cls_name, 'skipped for Live Demo',
                    source=cls_name + ' (main)')
        else:
            d_print('Starting additional pre-processing and training',
                    source=cls_name + ' (main)')
            start = timer()
            classifiers[cls_name].train(training_set)
            end = timer()
            d_print('Training complete, t =', str(end - start),
                    source=cls_name + ' (main)')
def train(self, training_set):
    self.all_features = self.all_words(training_set)
    features = []
    labels = []
    # The number of emails to use for training
    n_laps = len(training_set)
    start = timer()
    i = 0
    for _, email_data in training_set.items():
        # Build the feature vector for this email
        f_vec = self.get_feature_vector(email_data)
        labels.append(email_data["label"])
        features.append(f_vec)
        # Stop early to cap the number of training emails
        # (a no-op while n_laps equals the full set size)
        if i == n_laps:
            break
        else:
            i = i + 1
    end = timer()
    d_print("Pre-processing done, t = " + str(end - start), source="SVM")

    # Reduce the feature set to the k most discriminative features
    start = timer()
    self.feature_selection = SelectKBest(f_classif, k=self.number_of_features)
    important_features = self.feature_selection.fit_transform(features, labels)
    end = timer()
    d_print("Feature selection done, t = " + str(end - start), source="SVM")

    # Train the classifier
    start = timer()
    self.classifier = SVC()
    self.classifier.fit(important_features, labels)
    end = timer()
    d_print("Classifier training done, t = " + str(end - start), source="SVM")
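# classify() is not shown in this section, but classify_all() above expects
# it; a minimal sketch, assuming get_feature_vector() returns the same dense
# vector layout used during train():
def classify(self, email):
    f_vec = self.get_feature_vector(email)
    # Apply the SelectKBest reduction fitted in train(), then predict;
    # predict() returns an array, so classify(...)[0] yields the label.
    reduced = self.feature_selection.transform([f_vec])
    return self.classifier.predict(reduced)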
def classify(classifiers, testing_set, unlabeled_testing_set):
    """
    Calls classify routines of the classifiers using classify_all
    (in meta.py), and reports accuracy.
    """
    for cls_name in classifiers:
        print()
        if cls_name in EXCLUSION_LIST_FOR_LIVE_DEMO:
            d_print(cls_name, 'skipped for Live Demo', source='classify')
        else:
            d_print('Starting classification of labeled testing set',
                    source=cls_name + ' (main)')
            start = timer()
            result = classifiers[cls_name].classify_all(testing_set)
            end = timer()
            d_print('Classification of labeled testing set done, t =',
                    str(end - start), source=cls_name + ' (main)')

            assert len(result) == len(testing_set)
            correct_result_count = 0
            for eml_filename in result:
                if str(result[eml_filename]) == str(
                        testing_set[eml_filename]['label']):
                    correct_result_count += 1
            print('\n', correct_result_count, 'out of', len(result),
                  'cases were correct.\n', cls_name,
                  'is {:6.4f} % accurate.\n'.format(
                      correct_result_count / len(result) * 100))

            d_print('Starting classification of unlabeled testing set',
                    source=cls_name + ' (main)')
            start = timer()
            result = classifiers[cls_name].classify_all(unlabeled_testing_set)
            end = timer()
            d_print('Classification of unlabeled testing set done, t =',
                    str(end - start), source=cls_name + ' (main)')

            assert len(result) == len(unlabeled_testing_set)
            spam_result_count = 0
            for eml_filename in result:
                # Label '0' denotes spam
                if str(result[eml_filename]) == str(0):
                    spam_result_count += 1
            print('\n', spam_result_count, 'out of', len(result),
                  'unlabeled cases were reported as spam.\n', cls_name,
                  'claims {:6.4f} % of unlabeled test set is spam.\n'.format(
                      spam_result_count / len(result) * 100))

            d_print('Finished classification', source=cls_name + ' (main)')
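# d_print() and timer() come from elsewhere in the project; a minimal sketch
# under assumptions: timer is timeit.default_timer, and d_print is a debug
# print that tags each message with its source.
from timeit import default_timer as timer

def d_print(*args, source=''):
    # e.g. d_print('Training complete, t =', '1.23', source='SVM (main)')
    # prints "[SVM (main)] Training complete, t = 1.23"
    print('[' + source + ']', *args)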