Beispiel #1
0
def main():
    """Train a decision tree on the rectangle data and print its accuracy.

    The data set is loaded with ``fd.prepare_rectangle_data(sys.argv)``.
    When no rectangle is returned the function does nothing.  Otherwise a
    letter ordering is chosen from the per-column letter frequencies, the
    lines are converted to numeric form using that ordering, a decision
    tree is learned and then evaluated on the very same data it was
    trained on.  Start/end timestamps and the accuracy go to stdout.
    """
    lines_rectangle, correct_responces, lengths_list = (
        fd.prepare_rectangle_data(sys.argv))
    correct_responces = np.array(correct_responces)
    if lines_rectangle is None:
        return

    line_len = lengths_list[1]
    all_columns = range(line_len)
    frequency_map = su.calc_freq_over_cols(lines_rectangle, all_columns)

    alpha_order, _ = find_good_order(su.alphabet, frequency_map)
    # Map every letter to the first position it occupies in the ordering.
    order_map = dict(
        (letter, np.where(alpha_order == letter)[0][0])
        for letter in alpha_order)

    # Only the first `line_len` columns of the transformed lines are used.
    clipped_lines = transform_lines(lines_rectangle)[:, :line_len]
    train_data = get_numeric_data(clipped_lines, order_map)

    print("Training started: {0}".format(datetime.datetime.now()))
    model = learn_decision_tree(train_data, correct_responces, alpha_order,
                                def_ratio_search_dist)
    print("Training Ended: {0}".format(datetime.datetime.now()))

    print("Evaluation started: {0}".format(datetime.datetime.now()))
    accuracy = evaluate_model(model, train_data, correct_responces)
    print("Evaluation ended: {0}".format(datetime.datetime.now()))

    print("Accuracy: {0}".format(accuracy))
Beispiel #2
0
def main():
    """Plot per-column distance-to-uniform and entropy, sorted by distance.

    The data set is loaded with ``fd.prepare_rectangle_data(sys.argv)``;
    nothing happens when no rectangle is returned.  Columns are ranked by
    the value ``su.calculate_total_var_dist`` reports for them (ascending)
    and two figures are written:

    * ``Experiments/Experiment3/DistanceToUniform.png`` - the sorted
      distances as a line plot;
    * ``Experiments/Experiment3/Entropy.png`` - the entropy of every column
      in the same order, with a fitted linear-regression line on top.
    """
    lines_rectangle, correct_responces, lengths_list = (
        fd.prepare_rectangle_data(sys.argv))
    if lines_rectangle is None:
        return

    line_len = lengths_list[1]
    total_variation_dist = su.calculate_total_var_dist(
        lines_rectangle, line_len)
    sorted_cols_indices = np.argsort(total_variation_dist)

    letter_frequency_map = su.calc_freq_over_cols(lines_rectangle,
                                                  sorted_cols_indices)
    # A list comprehension (not map()) so np.matrix always receives a real
    # list of rows; on Python 3 map() yields an iterator, which np.matrix
    # would wrap as a single object instead of a 2-D character matrix.
    lines_rectangle = np.matrix([list(line) for line in lines_rectangle])

    # Entropy of each column, visited in order of increasing distance.
    entropy_list = [
        su.calc_entropy_of_col(lines_rectangle[:, column_index],
                               letter_frequency_map)
        for column_index in sorted_cols_indices
    ]

    total_variation_dist = [
        total_variation_dist[i] for i in sorted_cols_indices
    ]

    # 1-based rank of each column on the x axis.
    x = np.arange(1, len(total_variation_dist) + 1)

    fig = plt.figure(1, figsize=(10, 6))
    fig.suptitle('Distance to uniform distribution, sorted')
    plt.plot(x, total_variation_dist, 'b')
    plt.xlabel('Sorted columns')
    plt.ylabel('Distance to uniform distribution')
    plt.savefig("Experiments/Experiment3/DistanceToUniform.png")

    # Pad the y range by 10% of the entropy spread on both sides.
    min_entropy = np.min(entropy_list)
    max_entropy = np.max(entropy_list)
    delta = (max_entropy - min_entropy) * 0.1
    min_entropy -= delta
    max_entropy += delta

    fig = plt.figure(2, figsize=(10, 6))
    fig.suptitle('Entropy of sorted by distance columns')
    axis = plt.gca()
    axis.get_yaxis().get_major_formatter().set_useOffset(False)
    axis.set_ylim([min_entropy, max_entropy])
    axis.set_xlim([-5, len(x) + 5])
    plt.plot(x, entropy_list, 'bo')
    plt.xlabel('Sorted columns')
    plt.ylabel('Entropy, bits')

    # Linear-regression trend of entropy against column rank.
    x_pred = x.reshape(-1, 1)
    model = LinearRegression(n_jobs=8)
    model.fit(x_pred, entropy_list)
    y_pred = model.predict(x_pred)
    plt.plot(x, y_pred, 'r')

    plt.savefig("Experiments/Experiment3/Entropy.png")
Beispiel #3
0
def main():
    """Histogram random-subset accuracies against the min-distance choice.

    The data set is loaded with ``fd.prepare_rectangle_data(sys.argv)``;
    nothing happens when no rectangle is returned.  For every ``vec_size``
    in ``range(exp_start, exp_end, exp_step)``:

    * ``run_experiments`` supplies the list of accuracies to histogram;
    * the ``vec_size`` columns with the smallest total-variation value are
      used to build and evaluate a mapping (the "test accuracy");
    * the histogram, the test accuracy (red) and the random-guess baseline
      (green) are saved to ``Experiments/Experiment5/k<vec_size>.png``.
    """
    lines_rectangle, correct_responces, lengths_list = (
        fd.prepare_rectangle_data(sys.argv))
    if lines_rectangle is None:
        return

    line_len = lengths_list[1]

    # Minimal distance approach.  The ranking depends only on the data set,
    # not on vec_size, so it is computed once instead of once per iteration.
    # NOTE(review): assumes run_experiments does not modify lines_rectangle
    # in place - confirm.
    total_variation_dist = su.calculate_total_var_dist(
        lines_rectangle, line_len)
    sorted_cols_indices = np.argsort(total_variation_dist)

    # Random-guess baseline.  A separate array is used on purpose: rebinding
    # `correct_responces` here would hand the helpers below an ndarray from
    # the second iteration on, but the original container on the first.
    # NOTE(review): the mask indexing counts True entries only if the
    # responses are booleans - confirm.
    responses = np.array(correct_responces)
    p = len(responses[responses]) / float(len(responses))
    random_acc = p**2 + (1.0 - p)**2

    vec_sizes = range(exp_start, exp_end, exp_step)
    for cur_figure, vec_size in enumerate(vec_sizes, 1):
        accuracy_list = run_experiments(vec_size, experiment_number,
                                        line_len, lines_rectangle,
                                        correct_responces)

        test_cols = sorted_cols_indices[:vec_size]
        test_mapping = fd.generate_mapping(lines_rectangle,
                                           correct_responces, test_cols)
        test_accuracy = fd.evaluate_mapping_accuracy(
            lines_rectangle, correct_responces, test_cols, test_mapping)

        percentile = int(find_percentile(accuracy_list, test_accuracy))

        # plotting the histogram
        fig = plt.figure(cur_figure, figsize=(10, 6))
        axis = plt.gca()
        axis.set_xlim([-0.1, 1.1])
        fig.suptitle(
            'C = {3}, K = {0}, Experiments: {4}\nTest accuracy: {1}, Percentile: {2}'
            .format(vec_size, test_accuracy, percentile, line_len,
                    experiment_number))

        plt.hist(accuracy_list, bins=bins_num)
        test_line = plt.axvline(test_accuracy,
                                c='r',
                                label='Test accuracy')
        rnd_line = plt.axvline(random_acc, c='g', label='Random guess')
        plt.xlabel('Accuracy')
        plt.ylabel('Appearances')
        plt.legend(handles=[test_line, rnd_line])
        plt.ioff()
        plt.savefig("Experiments/Experiment5/k{0}.png".format(vec_size))
Beispiel #4
0
def main():
    """Draw one 'Delta P' histogram per subset size.

    The data set is loaded with ``fd.prepare_rectangle_data(sys.argv)``;
    nothing happens when no rectangle is returned.  For every ``vec_size``
    in ``range(exp_start, exp_end, exp_step)`` the values returned by
    ``run_experiments`` are histogrammed in their own figure.  The figures
    are only created here; this function neither saves nor shows them.
    """
    lines_rectangle, correct_responces, lengths_list = (
        fd.prepare_rectangle_data(sys.argv))
    if lines_rectangle is None:
        return

    line_len = lengths_list[1]
    vec_sizes = range(exp_start, exp_end, exp_step)

    # Figures are numbered consecutively starting at 1.
    for figure_no, vec_size in enumerate(vec_sizes, 1):
        delta_list = run_experiments(vec_size, experiment_number, line_len,
                                     lines_rectangle, correct_responces)

        figure = plt.figure(figure_no, figsize=(6, 6))
        plt.gca().set_xlim([-0.05, 1.05])
        title = 'C = {1}, K = {0}, Experiments: {2}'.format(
            vec_size, line_len, experiment_number)
        figure.suptitle(title)

        plt.hist(delta_list, bins=bins_num)
        plt.xlabel('Delta P')
        plt.ylabel('Appearances')
        plt.ioff()
Beispiel #5
0
def main():
    """Histogram random-subset accuracies against the min-distance choice.

    The data set is loaded with ``fd.prepare_rectangle_data(sys.argv)``;
    nothing happens when no rectangle is returned.  For every ``vec_size``
    in ``range(exp_start, exp_end, exp_step)``:

    * ``run_experiments`` supplies the list of accuracies to histogram;
    * the ``vec_size`` columns with the smallest total-variation value are
      used to build and evaluate a mapping (the "test accuracy");
    * the histogram with the test accuracy marked in red is saved to
      ``Experiments/Experiment1/k<vec_size>.png``.
    """
    lines_rectangle, correct_responces, lengths_list = (
        fd.prepare_rectangle_data(sys.argv))
    if lines_rectangle is None:
        return

    line_len = lengths_list[1]

    # Minimal distance approach.  The ranking depends only on the data set,
    # not on vec_size, so it is computed once instead of once per iteration.
    # NOTE(review): assumes run_experiments does not modify lines_rectangle
    # in place - confirm.
    total_variation_dist = su.calculate_total_var_dist(
        lines_rectangle, line_len)
    sorted_cols_indices = np.argsort(total_variation_dist)

    vec_sizes = range(exp_start, exp_end, exp_step)
    for cur_figure, vec_size in enumerate(vec_sizes, 1):
        accuracy_list = run_experiments(vec_size, experiment_number,
                                        line_len, lines_rectangle,
                                        correct_responces)

        test_cols = sorted_cols_indices[:vec_size]
        test_mapping = fd.generate_mapping(lines_rectangle,
                                           correct_responces, test_cols)
        test_accuracy = fd.evaluate_mapping_accuracy(
            lines_rectangle, correct_responces, test_cols, test_mapping)

        percentile = int(find_percentile(accuracy_list, test_accuracy))

        # plotting the histogram
        fig = plt.figure(cur_figure, figsize=(6, 6))
        fig.suptitle(
            'C = {3}, K = {0}, Experiments: {4}\nTest accuracy: {1}, Percentile: {2}'
            .format(vec_size, test_accuracy, percentile, line_len,
                    experiment_number))

        plt.hist(accuracy_list, bins=bins_num)
        plt.axvline(test_accuracy, c='r', label='Test accuracy')
        plt.xlabel('Accuracy')
        plt.ylabel('Appearances')
        plt.ioff()
        plt.savefig("Experiments/Experiment1/k{0}.png".format(vec_size))
Beispiel #6
0
def main():
    """Sweep the search distance and histogram decision-tree accuracies.

    The data set is loaded with ``fd.prepare_rectangle_data(sys.argv)``;
    nothing happens when no rectangle is returned.  For every ``dist`` in
    ``range(dist_start, dist_end, dist_step)`` the experiment is repeated
    ``dist_experiment_count`` times: a letter ordering is picked, the lines
    are converted to numeric data, a decision tree is learned with that
    ``dist`` and evaluated on its own training data.  The accuracies are
    histogrammed together with the random-guess baseline and their mean,
    and saved to ``Experiments/Experiment7/id<dist>.png``.  Progress and
    timestamps are written to stdout.
    """
    lines_rectangle, correct_responces, lengths_list = (
        fd.prepare_rectangle_data(sys.argv))
    correct_responces = np.array(correct_responces)
    if lines_rectangle is None:
        return

    line_len = lengths_list[1]
    frequency_map = su.calc_freq_over_cols(lines_rectangle, range(line_len))

    distances = range(dist_start, dist_end, dist_step)
    # Figures are numbered consecutively starting at 1.
    for figure_no, dist in enumerate(distances, 1):
        print('Distance {0} of {1} started: {2}'.format(
            dist, dist_end, datetime.datetime.now()))
        sys.stdout.flush()

        accuracy_list = []
        with tqdm(total=dist_experiment_count) as progress:
            progress.set_description("Running experiments")
            for _run in range(dist_experiment_count):
                alpha_order, _unused = dt.find_good_order(
                    su.alphabet, frequency_map)
                # Map every letter to its first position in the ordering.
                order_map = dict(
                    (letter, np.where(alpha_order == letter)[0][0])
                    for letter in alpha_order)

                # Only the first `line_len` columns are used for training.
                lines_train = dt.transform_lines(
                    lines_rectangle)[:, :line_len]
                train_data = dt.get_numeric_data(lines_train, order_map)

                model = dt.learn_decision_tree(train_data, correct_responces,
                                               alpha_order, dist)
                accuracy_list.append(
                    dt.evaluate_model(model, train_data, correct_responces))
                progress.update(1)

        # Share of entries selected by using the responses as their own mask.
        selected = correct_responces[correct_responces]
        p = len(selected) / float(len(correct_responces))
        random_acc = p**2 + (1.0 - p)**2

        aver_acc = np.mean(accuracy_list)

        # Histogram with both reference lines.
        figure = plt.figure(figure_no, figsize=(10, 10))
        plt.gca().set_xlim([-0.05, 1.05])
        figure.suptitle(
            'Distance = {0}, Experiments = {1}, Random guess = {2}, Average accuracy = {3}'
            .format(dist, dist_experiment_count, random_acc, aver_acc))

        plt.hist(accuracy_list, bins=bins_num)
        rnd_line = plt.axvline(random_acc, c='g', label='Random guess')
        aver_line = plt.axvline(aver_acc, c='r', label='Average accuracy')
        plt.xlabel('Accuracy')
        plt.ylabel('Appearances')
        plt.legend(handles=[rnd_line, aver_line], loc=2)
        plt.ioff()
        plt.savefig("Experiments/Experiment7/id{0}.png".format(dist))

        print('Distance {0} of {1} ended: {2}'.format(
            dist, dist_end, datetime.datetime.now()))
        sys.stdout.flush()
0
def main():
    """Plot the accuracy trace of ``improve_solution`` for each subset size.

    The data set is loaded with ``fd.prepare_rectangle_data(sys.argv)``;
    nothing happens when no rectangle is returned.  Columns are ranked by
    their total-variation value.  For every ``vec_size`` in
    ``range(exp_start, exp_end, exp_step)`` a mapping is generated on the
    first ``vec_size`` ranked columns and handed to ``improve_solution``,
    whose returned accuracy list is kept.  Once all runs are done, each
    list is plotted with axis limits shared across all figures and saved
    to ``Experiments/Experiment4/2k<vec_size>.png``.
    """
    lines_rectangle, correct_responces, lengths_list = (
        fd.prepare_rectangle_data(sys.argv))
    if lines_rectangle is None:
        return

    line_len = lengths_list[1]
    total_variation_dist = su.calculate_total_var_dist(
        lines_rectangle, line_len)
    sorted_cols_indices = np.argsort(total_variation_dist)

    all_lists = {}

    # Running extrema over every run, used for the shared axis limits.
    min_acc = 1.0
    max_acc = 0.0
    max_size = 0

    for vec_size in range(exp_start, exp_end, exp_step):
        print("Experiment K={0} started: {1}".format(
            vec_size, datetime.datetime.now()))

        working_cols = sorted_cols_indices[:vec_size]
        mapping = fd.generate_mapping(lines_rectangle, correct_responces,
                                      working_cols)
        accuracy_list = improve_solution(lines_rectangle, correct_responces,
                                         working_cols, mapping, line_len)
        all_lists[vec_size] = accuracy_list

        min_acc = min(min_acc, np.min(accuracy_list))
        max_acc = max(max_acc, np.max(accuracy_list))
        max_size = max(max_size, len(accuracy_list))

        print("Experiment K={0} finished: {1}".format(
            vec_size, datetime.datetime.now()))

    # Pad the y range by 10% of the spread; leave one step of room on x.
    delta = (max_acc - min_acc) * 0.1
    min_acc -= delta
    max_acc += delta
    max_size += 1

    vec_sizes = range(exp_start, exp_end, exp_step)
    for figure_no, vec_size in enumerate(vec_sizes, 1):
        accuracy_list = all_lists[vec_size]

        figure = plt.figure(figure_no, figsize=(10, 6))
        figure.suptitle('Accuracy improvement\nC = {1}, K = {0}'.format(
            vec_size, line_len))
        axis = plt.gca()
        axis.set_ylim([min_acc, max_acc])
        axis.set_xlim([-1, max_size])
        plt.plot(range(len(accuracy_list)), accuracy_list, 'b')

        plt.ylabel('Accuracy')
        plt.xlabel('Step')

        plt.savefig("Experiments/Experiment4/2k{0}.png".format(vec_size))
Beispiel #8
0
def main():
    """Plot mapping accuracy for a window sliding over the ranked columns.

    The data set is loaded with ``fd.prepare_rectangle_data(sys.argv)``;
    nothing happens when no rectangle is returned.  Columns are ranked by
    their total-variation value.  For every ``vec_size`` in
    ``range(exp_start, exp_end, exp_step)`` a window of ``vec_size``
    consecutive ranked columns is moved over the ranking one position at a
    time; a mapping is generated and evaluated on each window.  Each
    accuracy series is then plotted as points with a linear-regression line
    on top, using axis limits shared across all figures, and saved to
    ``Experiments/Experiment2/k<vec_size>.png``.
    """
    lines_rectangle, correct_responces, lengths_list = (
        fd.prepare_rectangle_data(sys.argv))
    if lines_rectangle is None:
        return

    line_len = lengths_list[1]
    total_variation_dist = su.calculate_total_var_dist(
        lines_rectangle, line_len)
    sorted_cols_indices = np.argsort(total_variation_dist)

    all_lists = {}
    # Running extrema over every window, used for the shared axis limits.
    min_acc = 1.0
    max_acc = 0.0
    max_size = 0

    for vec_size in range(exp_start, exp_end, exp_step):
        accuracy_list = []
        # `window_end` is the exclusive end of the current column window.
        for window_end in range(vec_size, line_len + 1):
            working_cols = sorted_cols_indices[window_end - vec_size:window_end]
            test_mapping = fd.generate_mapping(lines_rectangle,
                                               correct_responces,
                                               working_cols)
            accuracy_list.append(
                fd.evaluate_mapping_accuracy(lines_rectangle,
                                             correct_responces, working_cols,
                                             test_mapping))

        min_acc = min([min_acc] + accuracy_list)
        max_acc = max([max_acc] + accuracy_list)
        max_size = max(max_size, len(accuracy_list))
        all_lists[vec_size] = accuracy_list

    # Pad the y range by 10% of the spread; leave five steps of room on x.
    delta = (max_acc - min_acc) * 0.1
    min_acc -= delta
    max_acc += delta
    max_size += 5

    vec_sizes = range(exp_start, exp_end, exp_step)
    for figure_no, vec_size in enumerate(vec_sizes, 1):
        accuracy_list = all_lists[vec_size]

        figure = plt.figure(figure_no, figsize=(10, 6))
        axis = plt.gca()
        axis.set_ylim([min_acc, max_acc])
        axis.set_xlim([-5, max_size])
        figure.suptitle('C = {1}, K = {0}'.format(vec_size, line_len))

        # 1-based window position against the accuracy obtained there.
        x = np.array(range(1, len(accuracy_list) + 1))
        y = np.array(accuracy_list)
        plt.plot(x, y, 'bo')

        # Linear-regression trend drawn over the points.
        x_pred = x.reshape(-1, 1)
        model = LinearRegression(n_jobs=8)
        model.fit(x_pred, y)
        trend = plt.plot(x, model.predict(x_pred))
        plt.setp(trend, 'color', 'r', 'linewidth', 2.0)

        plt.ylabel('Accuracy')
        plt.xlabel('First column rank')
        plt.savefig("Experiments/Experiment2/k{0}.png".format(vec_size))