def run(self, show_weights=False):
    """Train and evaluate a margin perceptron for every (learning rate, margin) pair.

    Each combination is trained for 20 epochs on the training file and its
    error rate is reported on the testing file.

    :param show_weights: when True, also print the learned weight vector.
    """
    # Fix: both data sets are identical for every hyper-parameter
    # combination, so load them once instead of inside the double loop.
    features, labels = DataSetLoader(self.training_file).load()
    test_features, test_labels = DataSetLoader(self.testing_file).load()
    for learning_rate in self.learning_rates:
        for margin in self.margins:
            print(
                '\nMargin perceptron with learning rate %.2f and margin %.2f'
                % (learning_rate, margin))
            perceptron = MarginPerceptron(learning_rate, margin)
            weights = perceptron.train(features, labels, 20)
            if show_weights:
                print('\nDetected weights')
                print(weights)
            invalid_entries = 0
            for i, x in enumerate(test_features):
                y1 = MarginPerceptron.predict(x, weights)
                y = test_labels[i]
                if y1 != y:
                    invalid_entries += 1
            error_rate = (invalid_entries / len(test_features)) * 100
            print('Invalid classified entries:', invalid_entries,
                  '-> Total entries:', len(test_features), '-> Error:',
                  str(round(error_rate, 2)) + '%\n')
def run(self):
    """Report the majority-baseline accuracy on the dev and test sets.

    The baseline's error rate is computed by
    MajorityBaselineClassifierTester.error_rate from the training split
    against each evaluation split; accuracy is 100 minus that rate.
    """
    train_features, train_labels = DataSetLoader(self.training_file).load()
    dev_features, dev_labels = DataSetLoader(self.development_file).load()
    test_features, test_labels = DataSetLoader(self.testing_file).load()
    # Fix: these locals were previously named train_error_rate and
    # dev_error_rate, but they are measured on the dev and test sets.
    dev_error_rate = \
        MajorityBaselineClassifierTester.error_rate(train_features,
                                                    train_labels,
                                                    dev_features, dev_labels)
    print('Majority Baseline accuracy for dev set: %.2f%%\n' %
          round(100 - dev_error_rate, 2))
    test_error_rate = \
        MajorityBaselineClassifierTester.error_rate(train_features,
                                                    train_labels,
                                                    test_features,
                                                    test_labels)
    print('Majority Baseline accuracy for test set: %.2f%%\n' %
          round(100 - test_error_rate, 2))
def get_cross_validation_error_rates_for(self, perceptron_cls_init_parameters):
    """Run leave-one-fold-out cross-validation for one perceptron config.

    Every training file serves once as the held-out fold while the
    remaining files are merged into the training set.

    :param perceptron_cls_init_parameters: positional args for self.cls(...).
    :return: per-fold error percentages, each rounded to two decimals.
    """
    error_rates = []
    for fold_index in range(len(self.training_files)):
        held_out = self.training_files[fold_index]
        training_folds = (self.training_files[:fold_index] +
                          self.training_files[fold_index + 1:])
        features, labels = CrossValidatorTester.get_data_for(training_folds)
        test_features, test_labels = DataSetLoader(held_out).load()
        perceptron = self.cls(*perceptron_cls_init_parameters)
        weights = perceptron.train(features, labels, 10)
        mistakes = sum(
            1 for sample, expected in zip(test_features, test_labels)
            if self.cls.predict(sample, weights) != expected)
        error_rates.append(round((mistakes / len(test_features)) * 100, 2))
    return error_rates
def calculate_error_rates_at_depth(self, depth):
    """Cross-validate a pruned decision tree of the given maximum depth.

    For every fold: build an enriched data set from the other folds, grow
    and prune a tree, classify the held-out fold, and record its error
    rate. Prints the per-fold rates with their mean and stdev.

    :param depth: maximum depth passed to DecisionTree.
    :return: list of per-fold error percentages.
    """
    error_rates = []
    for fold_index in range(len(self.files)):
        held_out = self.files[fold_index]
        training_folds = self.files[:fold_index] + self.files[fold_index + 1:]
        original_data_set = CrossValidator.get_data_set_for(training_folds)
        enricher = DataSetFeaturesEnricher(
            original_data_set, CrossValidator.feature_creation_labels)
        data_set = enricher.get_enrich_data_set()
        tree = DecisionTree(data_set, CrossValidator.feature_creation_labels,
                            depth).make_tree()
        pruned_tree = TreePruner(tree).prune()
        cls = Classifier(pruned_tree, CrossValidator.feature_creation_labels)
        dsc = DataSetClassifier(cls, enricher)
        testing_data_set = DataSetLoader(held_out).load()
        dsc.classify_data_set(testing_data_set)
        error_rates.append(round(dsc.error_rate, 2))
    print('At depth', depth, 'we got the error rates', error_rates,
          'having average', round(sum(error_rates) / len(error_rates), 2),
          '%', 'and standard deviation',
          round(statistics.stdev(error_rates), 2))
    return error_rates
def write(self):
    """Predict a label for every evaluation row and write an id/label CSV.

    Appends the prediction for each feature row to the matching id entry
    and writes the result to self.output_ids_file under an
    'Id,Prediction' header.
    """
    eval_features, _ = DataSetLoader(self.input_features_file).load()
    ids_list = self.read_ids_file()
    for i, feature in enumerate(eval_features):
        ids_list[i].append(LabelWriter.predict(feature, self.w))
    csv_body = '\n'.join(','.join(row) for row in ids_list)
    with open(self.output_ids_file, mode='w', encoding='utf-8') as out:
        out.write('Id,Prediction\n' + csv_body)
def get_data_for(files):
    """Load and concatenate features and labels from several data files.

    :param files: iterable of data-set filenames.
    :return: (features, labels) with all entries appended in file order.
    """
    features = []
    labels = []
    for filename in files:
        file_features, file_labels = DataSetLoader(filename).load()
        features.extend(file_features)
        labels.extend(file_labels)
    return features, labels
def run(self):
    """Train a bagged forest and report error rate/accuracy on both splits."""
    train_features, train_labels = DataSetLoader(
        self.training_file, self.features_count).load(True)
    test_features, test_labels = DataSetLoader(
        self.test_file, self.features_count).load(True)
    forest = BaggedForest(self.trees_count)
    trees = forest.train(train_features, train_labels)
    # Evaluate on the data the forest was fit to, then on the held-out set.
    train_error = BaggedForestValidatorTester.calculate_error_rate(
        train_features, train_labels, trees)
    print(
        '\nTraining set error rates: %.2f%%. TRAINING SET ACCURACY %.2f%%'
        % (train_error, 100 - train_error))
    test_error = BaggedForestValidatorTester.calculate_error_rate(
        test_features, test_labels, trees)
    print(
        '\nTesting set error rates: %.2f%%. TESTING SET ACCURACY %.2f%%'
        % (test_error, 100 - test_error))
def write(self):
    """Write classifier predictions, remapped to 0/1, next to their ids.

    Output goes to self.output_ids_file as CSV with an 'Id,Prediction'
    header.
    """
    eval_features, _ = DataSetLoader(self.input_features_file).load()
    ids_list = self.read_ids_file()
    predicted = self.clf.predict(eval_features)
    # The classifier emits -1/+1; the output format expects 0/1.
    for i, row in enumerate(ids_list):
        row.append('0' if predicted[i] == -1 else '1')
    csv_body = '\n'.join(','.join(row) for row in ids_list)
    with open(self.output_ids_file, mode='w', encoding='utf-8') as out:
        out.write('Id,Prediction\n' + csv_body)
def run(self):
    """Train or load cached bagged-forest trees, tune hyper-parameters,
    then train a final model and report train/test accuracy.

    The forest is expensive to build, so the trees are cached in
    'mytrees.bin' via pickle and reused when the file exists.
    """
    train_features, train_labels = DataSetLoader(
        self.training_file, self.features_count).load(True)
    test_features, test_labels = DataSetLoader(
        self.test_file, self.features_count).load(self.zeros)
    try:
        # Fix: the original opened this handle and never closed it.
        with open('mytrees.bin', 'rb') as cache:
            print('Skipping trees generation. Found previous trees in file.')
            # NOTE(review): pickle.load is acceptable here only because the
            # file is produced locally by a previous run of this program.
            trees = pickle.load(cache)
    except FileNotFoundError:
        classifier = BaggedForest(self.trees_count)
        trees = classifier.train(train_features, train_labels)
        with open('mytrees.bin', mode='wb') as cache:
            pickle.dump(trees, cache)
    features_list = self.generate_features_list(train_features, trees)
    best_hyperparameters, error_rate = self.detect_best_hyperparameters(
        features_list, train_labels)
    print('BEST HYPER-PARAMETERS: %s CROSS VALIDATION ACCURACY: %.2f%%' %
          (self.get_print_value(best_hyperparameters), 100 - error_rate))
    # Reload the training split with the configured zero-handling for the
    # final pass (the initial load above always used zeros=True).
    train_features, train_labels = DataSetLoader(
        self.training_file, self.features_count).load(self.zeros)
    w = self.train(best_hyperparameters, train_features, train_labels)
    error_rate = self.calculate_error_rate(train_features, train_labels, w)
    print(
        '\nTraining set error rates: %.2f%%. TRAINING SET ACCURACY %.2f%%'
        % (error_rate, 100 - error_rate))
    error_rate = self.calculate_error_rate(test_features, test_labels, w)
    print(
        '\nTesting set error rates: %.2f%%. TESTING SET ACCURACY %.2f%%'
        % (error_rate, 100 - error_rate))
def run(self):
    """Pick the best hyper-parameters by cross-validation, train with them,
    and report accuracy on the test file."""
    best_hyperparameters, cv_error_rate = self.detect_best_hyperparameters()
    print('BEST HYPER-PARAMETERS: %s CROSS VALIDATION ACCURACY: %.2f%%' %
          (self.get_print_value(best_hyperparameters), 100 - cv_error_rate))
    w = self.train(best_hyperparameters)
    test_features, test_labels = DataSetLoader(self.test_file).load()
    test_error_rate = self.calculate_error_rate(test_features, test_labels,
                                                w)
    print('\nTesting data error rate: %.2f%% TEST SET ACCURACY %.2f%%' %
          (test_error_rate, 100 - test_error_rate))
def train_and_test_final_tree(self, depth, test_file='dataset/test.data'):
    """Train a pruned decision tree on all folds and print its test error.

    :param depth: maximum tree depth.
    :param test_file: path of the evaluation data set. Generalized from a
        hard-coded constant; the default keeps existing callers working.
    """
    original_data_set = CrossValidator.get_data_set_for(self.files)
    enricher = DataSetFeaturesEnricher(
        original_data_set, CrossValidator.feature_creation_labels)
    data_set = enricher.get_enrich_data_set()
    tree = DecisionTree(data_set, CrossValidator.feature_creation_labels,
                        depth).make_tree()
    pruned_tree = TreePruner(tree).prune()
    cls = Classifier(pruned_tree, CrossValidator.feature_creation_labels)
    dsc = DataSetClassifier(cls, enricher)
    testing_data_set = DataSetLoader(test_file).load()
    dsc.classify_data_set(testing_data_set)
    print('The error rate for the test data is: ', round(dsc.error_rate, 2),
          '%')
def get_data_for(files, zeros, features_count):
    """Load several data files and combine them into one matrix pair.

    Feature blocks are stacked vertically and label vectors horizontally,
    matching the representation returned by DataSetLoader.load.

    :param files: data-set filenames to concatenate.
    :param zeros: forwarded to DataSetLoader.load().
    :param features_count: forwarded to the DataSetLoader constructor.
    :return: (features, labels) combined in file order.
    """
    features = None
    labels = None
    for filename in files:
        file_features, file_labels = DataSetLoader(
            filename, features_count).load(zeros)
        # None marks the first file; afterwards stack onto the accumulators.
        if features is None:
            features = file_features
            labels = file_labels
        else:
            features = vstack([features, file_features])
            labels = hstack([labels, file_labels])
    return features, labels
def get_cross_validation_error_rates_for(self,
                                         classifier_cls_init_parameters):
    """Leave-one-fold-out cross-validation for one classifier config.

    :param classifier_cls_init_parameters: positional args for self.cls(...).
    :return: list of per-fold error percentages.
    """
    error_rates = []
    for fold_index in range(len(self.training_files)):
        held_out = self.training_files[fold_index]
        training_folds = (self.training_files[:fold_index] +
                          self.training_files[fold_index + 1:])
        features, labels = CrossValidatorTester.get_data_for(
            training_folds, self.zeros, self.features)
        test_features, test_labels = DataSetLoader(
            held_out, self.features).load(self.zeros)
        classifier = self.cls(*classifier_cls_init_parameters)
        weights = classifier.train(features, labels,
                                   self.hyper_parameter_epochs)
        error_rates.append(
            self.calculate_error_rate(test_features, test_labels, weights))
    return error_rates
def get_data_set_for(files):
    """Concatenate the raw entries of several data-set files into one list."""
    combined = []
    for filename in files:
        combined.extend(DataSetLoader(filename).load())
    return combined
def train(self, perceptron_cls_init_parameters):
    """Train a perceptron for 20 epochs, tracking the best dev-set weights.

    Merges all training files, initializes small random weights, then runs
    `train_one_epoch` repeatedly, measuring the error rate on the
    development file after every epoch. The weight vector with the lowest
    development error is returned; a plot of the per-epoch error rates is
    produced via CrossValidatorTester.plot.

    :param perceptron_cls_init_parameters: positional args for self.cls(...).
    :return: the weight vector that achieved the lowest development error.
    """
    error_rates = []
    best_error_rate = float('inf')
    best_w = None
    total_updates = 0
    perceptron = self.cls(*perceptron_cls_init_parameters)
    features, labels = CrossValidatorTester.get_data_for(
        self.training_files)
    development_features, development_labels = DataSetLoader(
        self.development_file).load()
    # Small random initial weights in [-0.01, 0.01); u is the accumulator
    # used by the averaged-perceptron variant.
    w = np.array(
        [randrange(-100, 100, 1) / 10000 for _ in range(len(features[0]))])
    u = np.array([0.0 for _ in range(len(features[0]))])
    train_method_parameters = self.get_train_method_parameters(
        perceptron.train_one_epoch, {
            'train': features,
            'labels': labels,
            'w': w,
            'u': u,
            'c': 1,
            'epoch': 0
        })
    for _ in range(20):
        # The `train_one_epoch` has various signatures depending on the perceptron type used.
        # From all the possible parameters we are selecting only the ones that we infer from the method signature.
        new_train_method_parameters = perceptron.train_one_epoch(
            *train_method_parameters)
        # The epoch returns [updates_count, ...state]; strip the count and
        # re-prepend features/labels so the list can be splatted next epoch.
        updates_count = new_train_method_parameters.pop(0)
        new_train_method_parameters.insert(0, labels)
        new_train_method_parameters.insert(0, features)
        train_method_parameters = new_train_method_parameters
        # For the averaged perceptron the effective weights are u / c
        # (positions 3 and 4 after the re-insertion); otherwise position 2.
        if self.cls.__name__ == 'AveragedPerceptron':
            w = train_method_parameters[3] / train_method_parameters[4]
        else:
            w = train_method_parameters[2]
        error_rate = self.calculate_error_rate(development_features,
                                               development_labels, w)
        if error_rate < best_error_rate:
            best_error_rate = error_rate
            best_w = w
        total_updates += updates_count
        error_rates.append(error_rate)
    print('\nDevelopment set error rates: %s' %
          "% ".join(format(e, "7.2f") for e in error_rates))
    print(
        'Minimum error rate: %.2f%% Epoch: %d DEVELOPMENT SET ACCURACY %.2f%% UPDATES PERFORMED DURING TRAINING %d'
        % (min(error_rates), error_rates.index(
            min(error_rates)), 100 - min(error_rates), total_updates))
    title = 'Perceptron type: {} {} '.format(
        self.cls.__name__,
        self.get_print_value(perceptron_cls_init_parameters))
    CrossValidatorTester.plot(error_rates, title)
    return best_w
'longevity: hours', 'longevity: minutes', 'longevity: seconds', 'number of following', 'numberof followers', 'the ratio of the number of following and followers', 'the number of posted tweets', 'the number of posted tweets per day', 'the average number of links in tweets', 'the average number of unique links in tweets', 'the average numer of username in tweets', 'the average numer of unique username in tweets', 'the change rate of number of following' ] features, labels = DataSetLoader('../TwitterDataset/data-splits/data.train').load() test_features, test_labels = DataSetLoader('../TwitterDataset/data-splits/data.test').load() clf = tree.DecisionTreeClassifier() clf.fit(features, labels) predicted = clf.predict(test_features) counter = 0 for idx, val in enumerate(predicted): if predicted[idx] != test_labels[idx]: counter += 1 print('Error rate', counter / float(len(predicted)) * 100) dot_data = tree.export_graphviz( clf, out_file=None,
# This is a test to test my implementation from sklearn import tree import graphviz from data_set_loader import DataSetLoader from data_set_features_enricher import DataSetFeaturesEnricher original_data_set = DataSetLoader('dataset/training.data').load() # Create a data set using the following features. feature_creation_labels = [ 'first_name_longer_that_last_name', 'has_middle_name', 'first_name_starts_and_ends_with_same_letter', 'first_name_come_alphabetically_before_their_last_name', 'second_letter_of_their_first_name_a_vowel', 'is_the_number_of_last_name_letters_even' ] enricher = DataSetFeaturesEnricher(original_data_set, feature_creation_labels) data_set = enricher.get_enrich_data_set() features = [] labels = [] for entry in data_set: features.append(entry[:-1]) labels.append(entry[-1]) test_data = DataSetLoader('dataset/test.data').load() enricher = DataSetFeaturesEnricher(test_data, feature_creation_labels) test_data_set = enricher.get_enrich_data_set() test_features = []