def main(): lines_rectangle, correct_responces, lengths_list = fd.prepare_rectangle_data( sys.argv) correct_responces = np.array(correct_responces) if not (lines_rectangle is None): line_len = lengths_list[1] frequency_map = su.calc_freq_over_cols(lines_rectangle, range(line_len)) # for letter in su.alphabet: # print "{0} : {1}".format(letter, frequency_map[letter]) alpha_order, _ = find_good_order(su.alphabet, frequency_map) order_map = {} for letter in alpha_order: order_map[letter] = np.where(alpha_order == letter)[0][0] lines_rectangle = transform_lines(lines_rectangle) lines_rectangle = lines_rectangle[:, :line_len] train_data = get_numeric_data(lines_rectangle, order_map) #adaboost_model = train_model(train_data, correct_responces) print "Training started: {0}".format(datetime.datetime.now()) model = learn_decision_tree(train_data, correct_responces, alpha_order, def_ratio_search_dist) print "Training Ended: {0}".format(datetime.datetime.now()) print "Evaluation started: {0}".format(datetime.datetime.now()) accuracy = evaluate_model(model, train_data, correct_responces) print "Evaluation ended: {0}".format(datetime.datetime.now()) print "Accuracy: {0}".format(accuracy)
def main():
    """Plot per-column statistics of the prepared rectangle data.

    Produces two figures under Experiments/Experiment3/: the per-column
    total-variation distance to the uniform distribution (sorted ascending),
    and the entropy of every column visited in that sorted order, with a
    fitted linear-regression trend line.
    """
    lines_rectangle, correct_responces, lengths_list = fd.prepare_rectangle_data(
        sys.argv)
    if lines_rectangle is not None:
        line_len = lengths_list[1]
        total_variation_dist = su.calculate_total_var_dist(
            lines_rectangle, line_len)
        sorted_cols_indices = np.argsort(total_variation_dist)
        letter_frequency_map = su.calc_freq_over_cols(lines_rectangle,
                                                      sorted_cols_indices)
        # Split each line into characters so single columns can be sliced.
        lines_rectangle = np.matrix([list(line) for line in lines_rectangle])
        # Entropy of every column, in order of increasing distance to uniform.
        entropy_list = [
            su.calc_entropy_of_col(lines_rectangle[:, column_index],
                                   letter_frequency_map)
            for column_index in sorted_cols_indices
        ]
        total_variation_dist = [
            total_variation_dist[i] for i in sorted_cols_indices
        ]
        x = np.array(range(1, len(total_variation_dist) + 1))
        # Figure 1: sorted distances to the uniform distribution.
        fig = plt.figure(1, figsize=(10, 6))
        fig.suptitle('Distance to uniform distribution, sorted')
        plt.plot(x, total_variation_dist, 'b')
        plt.xlabel('Sorted columns')
        plt.ylabel('Distance to uniform distribution')
        plt.savefig("Experiments/Experiment3/DistanceToUniform.png")
        # Pad the y-range by 10% so extreme points are not on the border.
        min_acc = np.min(entropy_list)
        max_acc = np.max(entropy_list)
        delta = (max_acc - min_acc) * 0.1
        min_acc -= delta
        max_acc += delta
        # Figure 2: column entropies plus a linear-regression trend line.
        fig = plt.figure(2, figsize=(10, 6))
        fig.suptitle('Entropy of sorted by distance columns')
        axis = plt.gca()
        axis.get_yaxis().get_major_formatter().set_useOffset(False)
        axis.set_ylim([min_acc, max_acc])
        axis.set_xlim([-5, len(x) + 5])
        plt.plot(x, entropy_list, 'bo')
        plt.xlabel('Sorted columns')
        plt.ylabel('Entropy, bits')
        x_pred = x.reshape(-1, 1)
        model = LinearRegression(n_jobs=8)
        model.fit(x_pred, entropy_list)
        y_pred = model.predict(x_pred)
        plt.plot(x, y_pred, 'r')
        plt.savefig("Experiments/Experiment3/Entropy.png")
def main():
    # Load the input rectangle; nothing to do when preparation failed.
    lines_rectangle, correct_responces, lengths_list = fd.prepare_rectangle_data(
        sys.argv)
    if lines_rectangle is None:
        return
    figure_index = 1
    line_len = lengths_list[1]
    for vec_size in range(exp_start, exp_end, exp_step):
        accuracy_list = run_experiments(vec_size, experiment_number, line_len,
                                        lines_rectangle, correct_responces)
        # Minimal-distance approach: take the vec_size columns whose letter
        # distribution is closest to uniform and evaluate their mapping.
        total_variation_dist = su.calculate_total_var_dist(
            lines_rectangle, line_len)
        sorted_cols_indices = np.argsort(total_variation_dist)
        test_cols = sorted_cols_indices[:vec_size]
        test_mapping = fd.generate_mapping(lines_rectangle, correct_responces,
                                           test_cols)
        test_accuracy = fd.evaluate_mapping_accuracy(
            lines_rectangle, correct_responces, test_cols, test_mapping)
        percentile = int(find_percentile(accuracy_list, test_accuracy))
        # Accuracy of always guessing according to the class prior p.
        correct_responces = np.array(correct_responces)
        p = len(correct_responces[correct_responces]) / float(
            len(correct_responces))
        random_acc = p**2 + (1.0 - p)**2
        # Histogram of experiment accuracies with both baselines marked.
        fig = plt.figure(figure_index, figsize=(10, 6))
        figure_index += 1
        axis = plt.gca()
        axis.set_xlim([-0.1, 1.1])
        fig.suptitle(
            'C = {3}, K = {0}, Experiments: {4}\nTest accuracy: {1}, Percentile: {2}'
            .format(vec_size, test_accuracy, percentile, line_len,
                    experiment_number))
        plt.hist(accuracy_list, bins=bins_num)
        test_line = plt.axvline(test_accuracy, c='r', label='Test accuracy')
        rnd_line = plt.axvline(random_acc, c='g', label='Random guess')
        plt.xlabel('Accuracy')
        plt.ylabel('Appearances')
        plt.legend(handles=[test_line, rnd_line])
        plt.ioff()
        plt.savefig("Experiments/Experiment5/k{0}.png".format(vec_size))
def main():
    # Load the input rectangle; nothing to do when preparation failed.
    lines_rectangle, correct_responces, lengths_list = fd.prepare_rectangle_data(
        sys.argv)
    if lines_rectangle is None:
        return
    figure_index = 1
    line_len = lengths_list[1]
    for vec_size in range(exp_start, exp_end, exp_step):
        delta_list = run_experiments(vec_size, experiment_number, line_len,
                                     lines_rectangle, correct_responces)
        # Histogram of the delta-P values observed across the experiments.
        fig = plt.figure(figure_index, figsize=(6, 6))
        figure_index += 1
        axis = plt.gca()
        axis.set_xlim([-0.05, 1.05])
        fig.suptitle('C = {1}, K = {0}, Experiments: {2}'.format(
            vec_size, line_len, experiment_number))
        plt.hist(delta_list, bins=bins_num)
        plt.xlabel('Delta P')
        plt.ylabel('Appearances')
        plt.ioff()
        # NOTE(review): unlike the sibling experiment scripts, the figure is
        # never written out via plt.savefig (and never shown) -- confirm
        # whether a save/show step is missing here.
def main():
    # Load the input rectangle; nothing to do when preparation failed.
    lines_rectangle, correct_responces, lengths_list = fd.prepare_rectangle_data(
        sys.argv)
    if lines_rectangle is None:
        return
    figure_index = 1
    line_len = lengths_list[1]
    for vec_size in range(exp_start, exp_end, exp_step):
        accuracy_list = run_experiments(vec_size, experiment_number, line_len,
                                        lines_rectangle, correct_responces)
        # Minimal-distance approach: take the vec_size columns whose letter
        # distribution is closest to uniform and evaluate their mapping.
        total_variation_dist = su.calculate_total_var_dist(
            lines_rectangle, line_len)
        sorted_cols_indices = np.argsort(total_variation_dist)
        test_cols = sorted_cols_indices[:vec_size]
        test_mapping = fd.generate_mapping(lines_rectangle, correct_responces,
                                           test_cols)
        test_accuracy = fd.evaluate_mapping_accuracy(
            lines_rectangle, correct_responces, test_cols, test_mapping)
        percentile = int(find_percentile(accuracy_list, test_accuracy))
        # Histogram of experiment accuracies with the test accuracy marked.
        fig = plt.figure(figure_index, figsize=(6, 6))
        figure_index += 1
        fig.suptitle(
            'C = {3}, K = {0}, Experiments: {4}\nTest accuracy: {1}, Percentile: {2}'
            .format(vec_size, test_accuracy, percentile, line_len,
                    experiment_number))
        plt.hist(accuracy_list, bins=bins_num)
        plt.axvline(test_accuracy, c='r', label='Test accuracy')
        plt.xlabel('Accuracy')
        plt.ylabel('Appearances')
        plt.ioff()
        plt.savefig("Experiments/Experiment1/k{0}.png".format(vec_size))
def main(): lines_rectangle, correct_responces, lengths_list = fd.prepare_rectangle_data( sys.argv) correct_responces = np.array(correct_responces) if not (lines_rectangle is None): line_len = lengths_list[1] frequency_map = su.calc_freq_over_cols(lines_rectangle, range(line_len)) # for letter in su.alphabet: # print "{0} : {1}".format(letter, frequency_map[letter]) cur_figure = 1 for dist in range(dist_start, dist_end, dist_step): print 'Distance {0} of {1} started: {2}'.format( dist, dist_end, datetime.datetime.now()) sys.stdout.flush() accuracy_list = [] with tqdm(total=dist_experiment_count) as progress: progress.set_description("Running experiments") for experiment in range(dist_experiment_count): alpha_order, _ = dt.find_good_order( su.alphabet, frequency_map) order_map = {} for letter in alpha_order: order_map[letter] = np.where( alpha_order == letter)[0][0] lines_train = dt.transform_lines(lines_rectangle) lines_train = lines_train[:, :line_len] train_data = dt.get_numeric_data(lines_train, order_map) model = dt.learn_decision_tree(train_data, correct_responces, alpha_order, dist) #sys.stdout.flush() accuracy = dt.evaluate_model(model, train_data, correct_responces) #sys.stdout.flush() accuracy_list.append(accuracy) progress.update(1) #if experiment % 50 == 0: # print 'Experiment {0} of {1} finished: '.format(experiment, dist_experiment_count, datetime.datetime.now()) p = len(correct_responces[correct_responces]) / float( len(correct_responces)) random_acc = p**2 + (1.0 - p)**2 aver_acc = np.mean(accuracy_list) # plotting the histogram fig = plt.figure(cur_figure, figsize=(10, 10)) cur_figure += 1 axis = plt.gca() axis.set_xlim([-0.05, 1.05]) fig.suptitle( 'Distance = {0}, Experiments = {1}, Random guess = {2}, Average accuracy = {3}' .format(dist, dist_experiment_count, random_acc, aver_acc)) plt.hist(accuracy_list, bins=bins_num) rnd_line = plt.axvline(random_acc, c='g', label='Random guess') aver_line = plt.axvline(aver_acc, c='r', label='Average 
accuracy') plt.xlabel('Accuracy') plt.ylabel('Appearances') plt.legend(handles=[rnd_line, aver_line], loc=2) plt.ioff() plt.savefig("Experiments/Experiment7/id{0}.png".format(dist)) print 'Distance {0} of {1} ended: {2}'.format( dist, dist_end, datetime.datetime.now()) sys.stdout.flush()
def main(): lines_rectangle, correct_responces, lengths_list = fd.prepare_rectangle_data( sys.argv) if not (lines_rectangle is None): line_len = lengths_list[1] total_variation_dist = su.calculate_total_var_dist( lines_rectangle, line_len) sorted_cols_indices = np.argsort(total_variation_dist) all_lists = {} min_acc = 1.0 max_acc = 0.0 max_size = 0 cur_figure = 1 for vec_size in range(exp_start, exp_end, exp_step): print "Experiment K={0} started: {1}".format( vec_size, datetime.datetime.now()) working_cols = sorted_cols_indices[:vec_size] mapping = fd.generate_mapping(lines_rectangle, correct_responces, working_cols) accuracy_list = improve_solution(lines_rectangle, correct_responces, working_cols, mapping, line_len) all_lists[vec_size] = accuracy_list cur_min = np.min(accuracy_list) cur_max = np.max(accuracy_list) cur_size = len(accuracy_list) if min_acc > cur_min: min_acc = cur_min if max_acc < cur_max: max_acc = cur_max if cur_size > max_size: max_size = cur_size print "Experiment K={0} finished: {1}".format( vec_size, datetime.datetime.now()) delta = (max_acc - min_acc) * 0.1 min_acc -= delta max_acc += delta max_size += 1 for vec_size in range(exp_start, exp_end, exp_step): accuracy_list = all_lists[vec_size] # plotting fig = plt.figure(cur_figure, figsize=(10, 6)) fig.suptitle('Accuracy improvement\nC = {1}, K = {0}'.format( vec_size, line_len)) axis = plt.gca() axis.set_ylim([min_acc, max_acc]) axis.set_xlim([-1, max_size]) cur_figure += 1 plt.plot(range(len(accuracy_list)), accuracy_list, 'b') plt.ylabel('Accuracy') plt.xlabel('Step') plt.savefig("Experiments/Experiment4/2k{0}.png".format(vec_size))
def main():
    # Load the input rectangle; nothing to do when preparation failed.
    lines_rectangle, correct_responces, lengths_list = fd.prepare_rectangle_data(
        sys.argv)
    if lines_rectangle is None:
        return
    line_len = lengths_list[1]
    total_variation_dist = su.calculate_total_var_dist(
        lines_rectangle, line_len)
    sorted_cols_indices = np.argsort(total_variation_dist)
    cur_figure = 1
    all_lists = {}
    min_acc = 1.0
    max_acc = 0.0
    max_size = 0
    # First pass: slide a window of vec_size columns over the ranking (by
    # distance to uniform) and record the mapping accuracy of every window.
    for vec_size in range(exp_start, exp_end, exp_step):
        accuracy_list = []
        for right_edge in range(vec_size, line_len + 1):
            working_cols = sorted_cols_indices[right_edge - vec_size:right_edge]
            test_mapping = fd.generate_mapping(lines_rectangle,
                                               correct_responces, working_cols)
            test_accuracy = fd.evaluate_mapping_accuracy(
                lines_rectangle, correct_responces, working_cols, test_mapping)
            accuracy_list.append(test_accuracy)
            min_acc = min(min_acc, test_accuracy)
            max_acc = max(max_acc, test_accuracy)
        max_size = max(max_size, len(accuracy_list))
        all_lists[vec_size] = accuracy_list
    # Pad the shared axes so all plots are directly comparable.
    delta = (max_acc - min_acc) * 0.1
    min_acc -= delta
    max_acc += delta
    max_size += 5
    # Second pass: scatter-plot each series plus a fitted trend line.
    for vec_size in range(exp_start, exp_end, exp_step):
        accuracy_list = all_lists[vec_size]
        fig = plt.figure(cur_figure, figsize=(10, 6))
        cur_figure += 1
        axis = plt.gca()
        axis.set_ylim([min_acc, max_acc])
        axis.set_xlim([-5, max_size])
        fig.suptitle('C = {1}, K = {0}'.format(vec_size, line_len))
        x = np.array(range(1, len(accuracy_list) + 1))
        y = np.array(accuracy_list)
        plt.plot(x, y, 'bo')
        # Linear-regression trend over the window rank.
        x_pred = x.reshape(-1, 1)
        model = LinearRegression(n_jobs=8)
        model.fit(x_pred, y)
        y_pred = model.predict(x_pred)
        line = plt.plot(x, y_pred)
        plt.setp(line, 'color', 'r', 'linewidth', 2.0)
        plt.ylabel('Accuracy')
        plt.xlabel('First column rank')
        plt.savefig("Experiments/Experiment2/k{0}.png".format(vec_size))