    def run(self, show_weights=False):
        for learning_rate in self.learning_rates:
            for margin in self.margins:
                print(
                    '\nMargin perceptron with learning rate %.2f and margin %.2f'
                    % (learning_rate, margin))

                features, labels = DataSetLoader(self.training_file).load()
                perceptron = MarginPerceptron(learning_rate, margin)

                weights = perceptron.train(features, labels, 20)
                if show_weights:
                    print('\nLearned weights')
                    print(weights)

                test_features, test_labels = DataSetLoader(
                    self.testing_file).load()
                invalid_entries = 0

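                # Count how many test examples the learned weights misclassify.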
                for i, x in enumerate(test_features):
                    y1 = MarginPerceptron.predict(x, weights)
                    y = test_labels[i]

                    if y1 != y:
                        invalid_entries += 1

                error_rate = (invalid_entries / len(test_features)) * 100
                print('Misclassified entries:', invalid_entries,
                      '-> Total entries:', len(test_features),
                      '-> Error: %.2f%%\n' % error_rate)
    def run(self):
        train_features, train_labels = DataSetLoader(self.training_file).load()
        dev_features, dev_labels = DataSetLoader(self.development_file).load()
        test_features, test_labels = DataSetLoader(self.testing_file).load()

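        # The majority baseline always predicts the most frequent training label.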
        dev_error_rate = MajorityBaselineClassifierTester.error_rate(
            train_features, train_labels, dev_features, dev_labels)
        print('Majority Baseline accuracy for dev set: %.2f%%\n' %
              round(100 - dev_error_rate, 2))

        test_error_rate = MajorityBaselineClassifierTester.error_rate(
            train_features, train_labels, test_features, test_labels)
        print('Majority Baseline accuracy for test set: %.2f%%\n' %
              round(100 - test_error_rate, 2))

# Example 3
    def get_cross_validation_error_rates_for(self,
                                             perceptron_cls_init_parameters):
        error_rates = []

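        # Leave-one-file-out cross-validation: each training file takes a turn
        # as the held-out test fold.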
        for idx, _ in enumerate(self.training_files):
            folds = self.training_files[:]
            test_fold_filename = folds[idx]
            del folds[idx]

            features, labels = CrossValidatorTester.get_data_for(folds)
            test_features, test_labels = DataSetLoader(
                test_fold_filename).load()

            perceptron = self.cls(*perceptron_cls_init_parameters)
            weights = perceptron.train(features, labels, 10)

            invalid_entries = 0

            for i, x in enumerate(test_features):
                y1 = self.cls.predict(x, weights)
                y = test_labels[i]

                if y1 != y:
                    invalid_entries += 1

            error_rates.append(
                round((invalid_entries / len(test_features)) * 100, 2))

        return error_rates

# Example 4
    def calculate_error_rates_at_depth(self, depth):
        error_rates = []

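        # Same leave-one-fold-out scheme, here evaluating a pruned decision
        # tree capped at the given depth.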
        for idx, _ in enumerate(self.files):
            folds = self.files[:]
            test_fold_filename = folds[idx]
            del folds[idx]

            original_data_set = CrossValidator.get_data_set_for(folds)
            enricher = DataSetFeaturesEnricher(
                original_data_set, CrossValidator.feature_creation_labels)

            data_set = enricher.get_enrich_data_set()
            tree = DecisionTree(data_set,
                                CrossValidator.feature_creation_labels,
                                depth).make_tree()
            pruned_tree = TreePruner(tree).prune()

            cls = Classifier(pruned_tree,
                             CrossValidator.feature_creation_labels)

            dsc = DataSetClassifier(cls, enricher)
            testing_data_set = DataSetLoader(test_fold_filename).load()
            dsc.classify_data_set(testing_data_set)

            error_rates.append(round(dsc.error_rate, 2))

        print('At depth %d we got the error rates %s, with average %.2f%% '
              'and standard deviation %.2f' %
              (depth, error_rates, sum(error_rates) / len(error_rates),
               statistics.stdev(error_rates)))
        return error_rates

# Example 5
    def write(self):
        eval_features, _ = DataSetLoader(self.input_features_file).load()
        ids_list = self.read_ids_file()

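        # Append the predicted label to each id row before writing the CSV.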
        for i, feature in enumerate(eval_features):
            ids_list[i].append(LabelWriter.predict(feature, self.w))

        with open(self.output_ids_file, mode='w', encoding='utf-8') as myfile:
            myfile.write('Id,Prediction\n' +
                         '\n'.join([','.join(x) for x in ids_list]))

# Example 6
    def get_data_for(files):
        features = []
        labels = []

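        # Concatenate the features and labels from every fold file into one
        # training set.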
        for data_set_filename in files:
            new_features, new_labels = DataSetLoader(data_set_filename).load()
            features += new_features
            labels += new_labels

        return features, labels
    def run(self):
        train_features, train_labels = DataSetLoader(
            self.training_file, self.features_count).load(True)
        test_features, test_labels = DataSetLoader(
            self.test_file, self.features_count).load(True)

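        # Train the forest once and reuse the same trees for both evaluations.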
        classifier = BaggedForest(self.trees_count)
        trees = classifier.train(train_features, train_labels)

        error_rate = BaggedForestValidatorTester.calculate_error_rate(
            train_features, train_labels, trees)
        print(
            '\nTraining set error rates: %.2f%%. TRAINING SET ACCURACY %.2f%%'
            % (error_rate, 100 - error_rate))

        error_rate = BaggedForestValidatorTester.calculate_error_rate(
            test_features, test_labels, trees)
        print(
            '\nTesting set error rates: %.2f%%. TESTING SET ACCURACY %.2f%%' %
            (error_rate, 100 - error_rate))
    def write(self):
        eval_features, _ = DataSetLoader(self.input_features_file).load()
        ids_list = self.read_ids_file()
        predicted = self.clf.predict(eval_features)

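        # Map the classifier's -1/+1 outputs to the 0/1 labels the output file
        # expects.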
        for i in range(len(ids_list)):
            ids_list[i].append(str(0 if predicted[i] == -1 else 1))

        with open(self.output_ids_file, mode='w', encoding='utf-8') as myfile:
            myfile.write('Id,Prediction\n' +
                         '\n'.join([','.join(x) for x in ids_list]))
    def run(self):
        train_features, train_labels = DataSetLoader(
            self.training_file, self.features_count).load(True)
        test_features, test_labels = DataSetLoader(
            self.test_file, self.features_count).load(self.zeros)

        # Training the forest is expensive, so cache the trees on disk and
        # reuse a previous run's result when the pickle file exists.
        try:
            with open('mytrees.bin', mode='rb') as binary_file:
                trees = pickle.load(binary_file)
            print('Skipping trees generation. Found previous trees in file.')
        except FileNotFoundError:
            classifier = BaggedForest(self.trees_count)
            trees = classifier.train(train_features, train_labels)
            with open('mytrees.bin', mode='wb') as binary_file:
                pickle.dump(trees, binary_file)

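        # Cross-validate over the tree-derived features to pick the best
        # hyper-parameters.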
        features_list = self.generate_features_list(train_features, trees)
        best_hyperparameters, error_rate = self.detect_best_hyperparameters(
            features_list, train_labels)
        print('BEST HYPER-PARAMETERS: %s CROSS VALIDATION ACCURACY: %.2f%%' %
              (self.get_print_value(best_hyperparameters), 100 - error_rate))

        train_features, train_labels = DataSetLoader(
            self.training_file, self.features_count).load(self.zeros)
        w = self.train(best_hyperparameters, train_features, train_labels)

        error_rate = self.calculate_error_rate(train_features, train_labels, w)
        print(
            '\nTraining set error rates: %.2f%%. TRAINING SET ACCURACY %.2f%%'
            % (error_rate, 100 - error_rate))

        error_rate = self.calculate_error_rate(test_features, test_labels, w)
        print(
            '\nTesting set error rates: %.2f%%. TESTING SET ACCURACY %.2f%%' %
            (error_rate, 100 - error_rate))

# Example 10
    def run(self):
        best_hyperparameters, error_rate = self.detect_best_hyperparameters()
        print('BEST HYPER-PARAMETERS: %s CROSS VALIDATION ACCURACY: %.2f%%' %
              (self.get_print_value(best_hyperparameters), 100 - error_rate))

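        # Retrain with the best hyper-parameters and evaluate once on the
        # held-out test set.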
        w = self.train(best_hyperparameters)
        test_features, test_labels = DataSetLoader(self.test_file).load()

        test_error_rate = self.calculate_error_rate(test_features, test_labels,
                                                    w)
        print('\nTesting data error rate: %.2f%% TEST SET ACCURACY %.2f%%' %
              (test_error_rate, 100 - test_error_rate))

# Example 11
    def train_and_test_final_tree(self, depth):
        original_data_set = CrossValidator.get_data_set_for(self.files)
        enricher = DataSetFeaturesEnricher(
            original_data_set, CrossValidator.feature_creation_labels)

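        # Build the enriched features, grow a depth-limited tree, then prune it.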
        data_set = enricher.get_enrich_data_set()
        tree = DecisionTree(data_set, CrossValidator.feature_creation_labels,
                            depth).make_tree()
        pruned_tree = TreePruner(tree).prune()

        cls = Classifier(pruned_tree, CrossValidator.feature_creation_labels)

        dsc = DataSetClassifier(cls, enricher)
        testing_data_set = DataSetLoader('dataset/test.data').load()
        dsc.classify_data_set(testing_data_set)

        print('The error rate for the test data is: %.2f%%' % dsc.error_rate)

# Example 12
    def get_data_for(files, zeros, features_count):
        first_time = True
        features = None
        labels = None

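        # Stack each file's feature matrix and label vector onto the running
        # totals (vstack/hstack here come from numpy or scipy.sparse).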
        for data_set_filename in files:
            new_features, new_labels = DataSetLoader(
                data_set_filename, features_count).load(zeros)
            if first_time:
                features = new_features
                labels = new_labels
                first_time = False
            else:
                features = vstack([features, new_features])
                labels = hstack([labels, new_labels])

        return features, labels

# Example 13
    def get_cross_validation_error_rates_for(self,
                                             classifier_cls_init_parameters):
        error_rates = []

        for idx, _ in enumerate(self.training_files):
            folds = self.training_files[:]
            test_fold_filename = folds[idx]
            del folds[idx]

            features, labels = CrossValidatorTester.get_data_for(
                folds, self.zeros, self.features)
            test_features, test_labels = DataSetLoader(
                test_fold_filename, self.features).load(self.zeros)

            classifier = self.cls(*classifier_cls_init_parameters)
            weights = classifier.train(features, labels,
                                       self.hyper_parameter_epochs)

            error_rates.append(
                self.calculate_error_rate(test_features, test_labels, weights))

        return error_rates

# Example 14
    def get_data_set_for(files):
        data_set = []
        for data_set_filename in files:
            data_set += DataSetLoader(data_set_filename).load()

        return data_set

# Example 15
    def train(self, perceptron_cls_init_parameters):
        error_rates = []
        best_error_rate = float('inf')
        best_w = None
        total_updates = 0

        perceptron = self.cls(*perceptron_cls_init_parameters)

        features, labels = CrossValidatorTester.get_data_for(
            self.training_files)
        development_features, development_labels = DataSetLoader(
            self.development_file).load()
        w = np.array(
            [randrange(-100, 100, 1) / 10000 for _ in range(len(features[0]))])
        u = np.array([0.0 for _ in range(len(features[0]))])

        train_method_parameters = self.get_train_method_parameters(
            perceptron.train_one_epoch, {
                'train': features,
                'labels': labels,
                'w': w,
                'u': u,
                'c': 1,
                'epoch': 0
            })

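        # Run 20 epochs, keeping the weight vector that scores best on the
        # development set.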
        for _ in range(20):
            # `train_one_epoch` has a different signature depending on the
            # perceptron type; from all the possible parameters we pass only
            # those inferred from the method's signature.

            new_train_method_parameters = perceptron.train_one_epoch(
                *train_method_parameters)
            updates_count = new_train_method_parameters.pop(0)
            new_train_method_parameters.insert(0, labels)
            new_train_method_parameters.insert(0, features)
            train_method_parameters = new_train_method_parameters

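            # The averaged perceptron's effective weights are the running
            # average u / c; the other variants use w directly.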
            if self.cls.__name__ == 'AveragedPerceptron':
                w = train_method_parameters[3] / train_method_parameters[4]
            else:
                w = train_method_parameters[2]

            error_rate = self.calculate_error_rate(development_features,
                                                   development_labels, w)

            if error_rate < best_error_rate:
                best_error_rate = error_rate
                best_w = w

            total_updates += updates_count

            error_rates.append(error_rate)

        print('\nDevelopment set error rates: %s' %
              ' '.join('%7.2f%%' % e for e in error_rates))
        print(
            'Minimum error rate: %.2f%% Epoch: %d DEVELOPMENT SET ACCURACY %.2f%% UPDATES PERFORMED DURING TRAINING %d'
            % (min(error_rates), error_rates.index(
                min(error_rates)), 100 - min(error_rates), total_updates))

        title = 'Perceptron type: {} {} '.format(
            self.cls.__name__,
            self.get_print_value(perceptron_cls_init_parameters))
        CrossValidatorTester.plot(error_rates, title)
        return best_w

# Example 16
    'longevity: hours',
    'longevity: minutes',
    'longevity: seconds',
    'number of following',
    'number of followers',
    'the ratio of the number of following and followers',
    'the number of posted tweets',
    'the number of posted tweets per day',
    'the average number of links in tweets',
    'the average number of unique links in tweets',
    'the average number of usernames in tweets',
    'the average number of unique usernames in tweets',
    'the change rate of number of following'
]

features, labels = DataSetLoader('../TwitterDataset/data-splits/data.train').load()
test_features, test_labels = DataSetLoader('../TwitterDataset/data-splits/data.test').load()
clf = tree.DecisionTreeClassifier()
clf.fit(features, labels)

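# Measure the error rate of scikit-learn's decision tree on the test split.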
predicted = clf.predict(test_features)
counter = 0
for idx, val in enumerate(predicted):
    if val != test_labels[idx]:
        counter += 1

print('Error rate', counter / float(len(predicted)) * 100)

# With out_file=None, export_graphviz returns the DOT source as a string.
dot_data = tree.export_graphviz(clf, out_file=None)

# Example 17
# A quick scikit-learn baseline used to sanity-check my own implementation.

from sklearn import tree
import graphviz
from data_set_loader import DataSetLoader
from data_set_features_enricher import DataSetFeaturesEnricher

original_data_set = DataSetLoader('dataset/training.data').load()

# Create a data set using the following features.
feature_creation_labels = [
    'first_name_longer_that_last_name', 'has_middle_name',
    'first_name_starts_and_ends_with_same_letter',
    'first_name_come_alphabetically_before_their_last_name',
    'second_letter_of_their_first_name_a_vowel',
    'is_the_number_of_last_name_letters_even'
]
enricher = DataSetFeaturesEnricher(original_data_set, feature_creation_labels)
data_set = enricher.get_enrich_data_set()

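# Each enriched entry holds the feature values followed by the label.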
features = []
labels = []
for entry in data_set:
    features.append(entry[:-1])
    labels.append(entry[-1])

test_data = DataSetLoader('dataset/test.data').load()
enricher = DataSetFeaturesEnricher(test_data, feature_creation_labels)
test_data_set = enricher.get_enrich_data_set()

test_features = []