def compute_errors_and_ks(self, m, k_first, k_last, step):
    """Compute the ERM empirical and true error for a range of k values.

    One sample of size m is drawn with self.sample_from_D, sorted by its
    first column, and reused for every k.

    Input: m - an integer, the size of the data sample.
           k_first - an integer, the first k value tried.
           k_last - an integer, bound of the k range; the values tried are
                    range(k_first, k_last + step, step).
           step - an integer, the difference between consecutive k values.

    Returns: a tuple (k_array, empirical_array, true_array) holding the k
             values tried and, per k, the empirical error and the true error
             (both float32 arrays).
    """
    k_values = range(k_first, k_last + step, step)
    k_array = np.array(list(k_values))
    empirical_array = np.zeros(len(k_values), dtype=np.float32)
    true_array = np.zeros(len(k_values), dtype=np.float32)

    sample_array = self.sample_from_D(m)
    sample_array = sample_array[sample_array[:, 0].argsort()]
    # The sample does not change between k values, so slice it once.
    xs = sample_array[:, 0]
    ys = sample_array[:, 1]

    for position, k in enumerate(k_values):
        all_intervals = intervals.find_best_interval(xs, ys, k)[0]

        # Count the points whose label disagrees with interval membership.
        error_count = 0
        for x_value, label in zip(xs, ys):
            covered = any(
                self.is_in_interval(interval, x_value)
                for interval in all_intervals)
            if (covered and label == 0) or (not covered and label == 1):
                error_count += 1

        # float() keeps this a true division on Python 2 as well.
        empirical_array[position] = error_count / float(m)
        true_array[position] = self.calculate_error_from_intervals(
            all_intervals)

    return k_array, empirical_array, true_array
def part_iii():
    """Draw the intervals found for k=2 as thick green segments at y=0.5.

    Uses the names `x` and `y` from the surrounding scope as the input to
    intervals.find_best_interval.
    """
    best_intervals, _ = intervals.find_best_interval(x, y, 2)
    for left, right in best_intervals:
        pyplot.plot(
            [left, right],
            [0.5, 0.5],
            'g-',
            linewidth=4,
            label="ERM interval",
        )
def Q2_f(output_directory):
    """Choose k by validation and plot the validation error per k.

    Fits find_best_interval for k = 1..20 on one RandomDSorted(50) sample,
    scores each hypothesis with EmpiricalError on a second RandomDSorted(50)
    sample, prints the k with the smallest validation error and saves the
    plot as "Q2f.png" inside output_directory.

    The print calls are written so that they produce the same output on
    Python 2 and Python 3.
    """
    print("::Question 2f::")
    m = 50
    k_values = range(1, 21)

    x, y = RandomDSorted(m)
    validation_x, validation_y = RandomDSorted(m)

    errors = []
    for k in k_values:
        k_intervals, _ = intervals.find_best_interval(x, y, k)
        errors.append(EmpiricalError(validation_x, validation_y, k_intervals))

    best_k = k_values[numpy.argmin(errors)]
    # Single pre-formatted argument: identical text under both print forms.
    print("\tBest k using validation: %s" % (best_k,))

    pyplot.plot([best_k, best_k], [0, 1], "r--", label="best k")
    pyplot.plot(k_values, errors, 'b^--', label="validation empirical error")
    pyplot.legend()
    pyplot.title(
        "Quetion 2f:: Empirical error of validation\nas a function of k (using ERM)"
    )
    pyplot.xlabel("k")
    pyplot.ylabel("Empirical error")
    pyplot.ylim([min(errors) - 0.01, max(errors) + 0.01])
    pyplot.savefig(os.path.join(output_directory, "Q2f.png"))
    pyplot.clf()
def Q2_c(output_directory, T=100, m_values=tuple(range(10, 101, 5))):
    """Plot average empirical and true ERM error (k=2) against sample size.

    For every m in m_values the experiment is repeated T times: a sample is
    drawn with RandomDSorted(m), find_best_interval is run with k=2, and the
    empirical error (error count / m) and TrueError of the result are
    accumulated. The averages are plotted and saved as "Q2c.png" inside
    output_directory.

    Written to run unchanged on Python 2 and Python 3 (print call, range).
    """
    print("::Question 2c::")
    k = 2
    average_true_errors = []
    average_empricial_errors = []

    for m in m_values:
        total_true_error = 0
        total_empirical_error = 0
        for _ in range(T):
            # part i: draw a sample and run ERM on it
            x, y = RandomDSorted(m)
            k_intervals, best_error = intervals.find_best_interval(x, y, k)
            # part ii: empirical error of the returned hypothesis
            total_empirical_error += best_error / float(m)
            # part iii: true error of the returned hypothesis
            total_true_error += TrueError(k_intervals)
        average_empricial_errors.append(total_empirical_error / T)
        average_true_errors.append(total_true_error / T)

    pyplot.plot(m_values, average_empricial_errors, 'ro--',
                label="empirical error")
    pyplot.plot(m_values, average_true_errors, 'b^--', label="true error")
    pyplot.legend()
    pyplot.xlabel("m (samples amount)")
    pyplot.ylabel("Average error")
    pyplot.title(
        "Question 2c:: Average error of ERM using k=2 intervals,\nas a function of samples amount"
    )
    pyplot.savefig(os.path.join(output_directory, "Q2c.png"))
    pyplot.clf()
def draw_sample_intervals(self, m, k):
    """
    Plots the data as asked in (a) i ii and iii.

    Scatters a sample of size m, draws vertical lines at x = 0.2, 0.4, 0.6
    and 0.8, and draws every interval returned by find_best_interval for
    the requested k as a thick segment at y = -0.1. The interval endpoints
    are used as the x-axis ticks.

    Input: m - an integer, the size of the data sample.
           k - an integer, the maximum number of intervals.

    Returns: the intervals returned by intervals.find_best_interval.
    """
    samples = self.sample_from_D(m)
    samples = samples[samples[:, 0].argsort()]
    xs, ys = samples[:, 0], samples[:, 1]

    plt.scatter(xs, ys)
    plt.xlim(-0.1, 1.1)
    plt.ylim(-0.1, 1.1)
    for boundary in (0.2, 0.4, 0.6, 0.8):
        plt.axvline(x=boundary)

    # Use the caller's k (previously hard-coded to 3).
    inters, _ = intervals.find_best_interval(xs, ys, k)

    # Iterate over the intervals actually returned instead of range(k), so
    # a result with a different number of intervals cannot raise IndexError
    # or lose ticks.
    flat_inters = []
    for interval in inters:
        flat_inters.append(interval[0])
        flat_inters.append(interval[1])
        dots = np.linspace(interval[0], interval[1], num=1000)
        plt.plot(dots, [-0.1] * 1000, linewidth=5)

    plt.xticks(flat_inters)
    plt.show()
    return inters
def experiment_k_range_srm(self, m, k_first, k_last, step):
    """Runs the experiment in (d) and selects k by empirical error + penalty.

    For every k in range(k_first, k_last + 1, step) the empirical error
    (error count / m), self.calc_true_error and self.calc_penalty(m, k) are
    stored in a results table; the k minimising empirical error + penalty
    is returned. The plotting call is currently commented out.

    Input: m - an integer, the size of the data sample.
           k_first - an integer, the maximum number of intervals in the
                     first experiment.
           k_last - an integer, the maximum number of intervals in the last
                    experiment.
           step - an integer, the difference between the size of k in each
                  experiment.

    Returns: The best k value (an integer) according to the SRM algorithm.
    """
    # NOTE(review): the sample is passed on as returned by sample_from_D;
    # presumably it is already sorted by x - confirm.
    samples = self.sample_from_D(m)
    k_values = range(k_first, k_last + 1, step)
    results = np.zeros((len(k_values), 3))

    best_index = 0
    for index, k in enumerate(k_values):
        best_intervals, error_count = intervals.find_best_interval(
            samples[:, 0], samples[:, 1], k)
        results[index][0] = error_count / m
        results[index][1] = self.calc_true_error(best_intervals)
        results[index][2] = self.calc_penalty(m, k)
        candidate = results[index][0] + results[index][2]
        incumbent = results[best_index][0] + results[best_index][2]
        if candidate < incumbent:
            best_index = index

    # self.plotE(k_first, k_last, step, results)

    # Map the winning row back to its k value; "index + 1" was only right
    # for k_first == 1 and step == 1.
    return k_first + best_index * step
def draw_sample_intervals(self, m, k):
    """
    Plots the data as asked in (a) i ii and iii.

    Points labelled 1 and 0 are plotted at y=1 and y=0, dashed red lines
    mark x = 0.2, 0.4, 0.6, 0.8, and each interval returned by
    find_best_interval is drawn in green just below the axis with its
    endpoints annotated. The figure is saved and closed.

    Input: m - an integer, the size of the data sample.
           k - an integer, the maximum number of intervals.

    Returns: None.
    """
    points = self.sample_from_D(m)
    points_by_x = sorted(points, key=lambda point: point[0])

    ones_x = [point[0] for point in points if point[1] == 1]
    zeros_x = [point[0] for point in points if point[1] == 0]
    plt.plot(ones_x, [1] * len(ones_x), 'o', label='one')
    plt.plot(zeros_x, [0] * len(zeros_x), 'o', label='zero')

    for boundary in (0.2, 0.4, 0.6, 0.8):
        plt.axvline(boundary, color='r', linestyle='--', linewidth=1.0)
    plt.axis([0, 1, -0.1, 1.1])

    sorted_xs = [point[0] for point in points_by_x]
    sorted_ys = [point[1] for point in points_by_x]
    best_intervals, _ = find_best_interval(sorted_xs, sorted_ys, k)

    for interval in best_intervals:
        segment = np.linspace(interval[0], interval[1], 100)
        plt.plot(segment, [-0.09] * 100, color='g', linewidth=5.0)
        # Label both ends of the segment with their x coordinate.
        for endpoint in (segment[0], segment[99]):
            plt.annotate("{:.2f}".format(endpoint), (endpoint, -0.1))

    plt.legend(loc='best')
    plt.ylabel('y')
    plt.xlabel('x')
    plt.title("best intervals (m={},k={})".format(m, k))
    plt.savefig("section a - draw_sample_intervals")
    plt.close()
def experiment_m_range_erm(self, m_first, m_last, step, k, T):
    """Runs the ERM algorithm and averages its errors over T repetitions.

    For every m in range(m_first, m_last + 1, step), T samples are drawn;
    for each one the empirical error (error count / m) and
    self.calc_true_error of the find_best_interval result are accumulated.
    The plotting call is currently commented out.

    Input: m_first - an integer, the smallest size of the data sample in
                     the range.
           m_last - an integer, the largest size of the data sample in the
                    range.
           step - an integer, the difference between the size of m in each
                  loop.
           k - an integer, the maximum number of intervals.
           T - an integer, the number of times the experiment is performed.

    Returns: np.ndarray of shape (n_steps,2).
        Column 0 is the average empirical error and column 1 the average
        true error for each m in the range accordingly.
    """
    m_values = range(m_first, m_last + 1, step)
    results = np.zeros((len(m_values), 2))

    for row, m in enumerate(m_values):
        empirical_total = 0
        true_total = 0
        for _ in range(T):
            samples = self.sample_from_D(m)
            hypothesis, error_count = intervals.find_best_interval(
                samples[:, 0], samples[:, 1], k)
            empirical_total += error_count / m
            true_total += self.calc_true_error(hypothesis)
        results[row][0] = empirical_total / T
        results[row][1] = true_total / T

    # self.plotC(m_first, m_last, step, results)
    return results
def experiment_k_range_erm(self, m, k_first, k_last, step):
    """Runs ERM on one sample for a range of k and plots both errors.

    The empirical error (error count / m) is drawn in red and the true
    error in blue as a function of k. The k with the smallest empirical
    error below 1 is tracked; ties keep the earlier k.

    Input: m - an integer, the size of the data sample.
           k_first - an integer, the maximum number of intervals in the
                     first experiment.
           k_last - an integer, the maximum number of intervals in the last
                    experiment.
           step - an integer, the difference between the size of k in each
                  experiment.

    Returns: The best k value (an integer) according to the ERM algorithm
             (0 if no k had an empirical error below 1).
    """
    sample = self.sample_from_D(m)
    k_values = list(range(k_first, k_last + 1, step))

    emp_errors, true_errors = [], []
    min_k, min_erm_error = 0, 1
    for k in k_values:
        best_intervals, error_amount = intervals.find_best_interval(
            sample[:, 0], sample[:, 1], k)
        emp_error = error_amount / m
        emp_errors.append(emp_error)
        true_errors.append(self.caluclate_true_error(best_intervals))
        if emp_error < min_erm_error:
            min_k, min_erm_error = k, emp_error

    plt.plot(k_values, emp_errors, color='red')
    plt.plot(k_values, true_errors, color='blue')
    plt.legend(['Empirical Error', 'True Error'], loc='upper right')
    # plt.show()
    return min_k
def draw_sample_intervals(self, m, k):
    """
    Plots the data as asked in (a) i ii and iii.

    The sample is sorted by x and plotted as dots, guide lines are added at
    x = 0.2, 0.4, 0.6, 0.8 and y = 0, 1, and each interval returned by
    find_best_interval is drawn as a horizontal line at y = 0.5. The figure
    is saved as 'q1a.png'.

    Input: m - an integer, the size of the data sample.
           k - an integer, the maximum number of intervals.

    Returns: None.
    """
    sorted_samples = sorted(self.sample_from_D(m), key=lambda p: p[0])
    xs = [sample[0] for sample in sorted_samples]
    ys = [sample[1] for sample in sorted_samples]
    best_intervals = intervals.find_best_interval(xs, ys, k)[0]

    plt.figure()
    plt.plot(xs, ys, '.')
    plt.xlabel('x')
    plt.ylabel('y')
    plt.axis([0, 1, -0.1, 1.1])
    plt.xticks(np.arange(0, 1, 0.2))

    for boundary in (0.2, 0.4, 0.6, 0.8):
        plt.gca().axvline(boundary)
    for level in (0, 1):
        plt.gca().axhline(level)

    for interval in best_intervals:
        plt.hlines(0.5, interval[0], interval[1])
    plt.savefig('q1a.png')
def experiment_m_range_erm(self, m_first, m_last, step, k, T):
    """Runs the ERM algorithm and plots its average errors against m.

    For every m in range(m_first, m_last + 1, step), T samples are drawn
    and sorted by x; the empirical error (error count / m) and
    self.true_error of the find_best_interval result are averaged over the
    T runs. Both averages are plotted and saved to 'Qc_new.pdf'.

    Input: m_first - an integer, the smallest size of the data sample in
                     the range.
           m_last - an integer, the largest size of the data sample in the
                    range.
           step - an integer, the difference between the size of m in each
                  loop.
           k - an integer, the maximum number of intervals.
           T - an integer, the number of times the experiment is performed.

    Returns: np.ndarray of shape (n_steps,2).
        Column 0 is the average empirical error and column 1 the average
        true error for each m in the range accordingly.
    """
    m_values = list(range(m_first, m_last + 1, step))
    emp_lst = []
    true_lst = []

    for m in m_values:
        emp_total = 0
        true_total = 0
        for _ in range(T):
            pairs = sorted(self.sample_from_D(m), key=lambda pair: pair[0])
            x = np.array([pair[0] for pair in pairs])
            y = np.array([pair[1] for pair in pairs])
            interval, emp_error = intervals.find_best_interval(x, y, k)
            emp_total += (emp_error / m)
            true_total += self.true_error(interval)
        emp_lst.append(emp_total / T)
        true_lst.append(true_total / T)

    plt.plot(m_values, emp_lst, 'ro', label='empirical error')
    plt.plot(m_values, true_lst, 'bo', label='true error')
    plt.xlabel('m')
    plt.ylabel('Error')
    plt.legend()
    plt.title('empirical vs true error')
    plt.savefig('Qc_new.pdf')
    plt.clf()
    plt.cla()

    # Pack the two lists into the (n_steps, 2) result array.
    res = np.ndarray(shape=(len(emp_lst), 2))
    for row, (emp_avg, true_avg) in enumerate(zip(emp_lst, true_lst)):
        res[row][0] = emp_avg
        res[row][1] = true_avg
    return res
def cross_validation(self, m, T):
    """Finds a k that gives a good test error.

    One sample of size m is drawn. In each of the T rounds it is shuffled,
    the first 4*m//5 points (sorted by x) are used for training and the
    rest as a holdout set; the k in 1..10 with the smallest holdout error
    gets one vote. The k with the most votes wins (ties go to the
    smallest k).

    Input: m - an integer, the size of the data sample.
           T - an integer, the number of times the experiment is performed.

    Returns: The best k value (an integer) found by the cross validation
             algorithm.
    """
    max_k = 10
    # k_counts[k] is the number of rounds won by k; slot 0 collects rounds
    # in which no k beat the initial error bound of 1.
    k_counts = [0] * (max_k + 1)
    sample = self.sample_from_D(m)
    split = 4 * m // 5

    for _ in range(T):
        np.random.shuffle(sample)
        training_set = [sample[i] for i in range(split)]
        training_set.sort(key=lambda point: point[0])
        # The training coordinates do not depend on k; build them once.
        train_x = [point[0] for point in training_set]
        train_y = [point[1] for point in training_set]
        holdout_set_x = [sample[i][0] for i in range(split, m)]
        holdout_set_y = [sample[i][1] for i in range(split, m)]

        min_k = 0
        min_error = 1
        for k in range(1, max_k + 1):
            best_intervals, _ = intervals.find_best_interval(
                train_x, train_y, k)
            # Dividing by m (a constant) does not change which k is best.
            holdout_error = self.calc_holdout_error(
                best_intervals, holdout_set_x, holdout_set_y) / m
            if holdout_error < min_error:
                min_error = holdout_error
                min_k = k
        k_counts[min_k] += 1

    # k_counts is indexed by k itself, so the winning index *is* the k.
    # The previous "index + 1" reported k + 1.
    return max(range(1, max_k + 1), key=lambda k: k_counts[k])
def draw_sample_intervals(self, m, k):
    """
    Plots the data as asked in (a) i ii and iii.

    Scatters the sample (sorted by x) in blue, draws black vertical lines
    at x = 0.2, 0.4, 0.6, 0.8 and draws each interval returned by
    find_best_interval as a red segment at y = -0.05. The figure is saved
    to 'Qa_new.pdf' and then cleared.

    Input: m - an integer, the size of the data sample.
           k - an integer, the maximum number of intervals.

    Returns: None.
    """
    # (i) the sample itself
    ordered_pairs = sorted(self.sample_from_D(m), key=lambda pair: pair[0])
    x = np.array([pair[0] for pair in ordered_pairs])
    y = np.array([pair[1] for pair in ordered_pairs])
    plt.ylabel("Labels")
    plt.ylim(-0.1, 1.1)
    plt.scatter(x, y, color='blue')

    # (ii) vertical guide lines
    for boundary in (0.2, 0.4, 0.6, 0.8):
        plt.axvline(x=boundary, color='black')

    # (iii) intervals of the ERM hypothesis
    best_intervals, _ = intervals.find_best_interval(x, y, k)
    for segment in best_intervals:
        plt.hlines(-0.05, segment[0], segment[1], 'red', lw=5)

    plt.savefig('Qa_new.pdf')
    plt.clf()
    plt.cla()
    return None
def cross_validation(self, m, T):
    """Finds a k that gives a good test error.

    One sample of size m is drawn. In each of the T rounds it is shuffled,
    the first m // 5 rows become the holdout set and the remaining rows
    (sorted by x) the training set; the k with the smallest holdout error
    is recorded. The most frequently recorded k is returned. Progress is
    printed along the way.

    Input: m - an integer, the size of the data sample.
           T - an integer, the number of times the experiment is performed.

    Returns: The best k value (an integer) found by the cross validation
             algorithm.
    """
    samples = self.sample_from_D(m)
    holdout_size = m // 5
    best_k_list = []

    for round_index in range(T):
        print('{} out of T={}'.format(round_index, T))
        np.random.shuffle(samples)
        holdout_samples = samples[:holdout_size, :]
        train_samples = np.asarray(
            sorted(samples[holdout_size:, :], key=lambda row: row[0]))

        min_k, min_k_holdout_error = 1, 1
        # NOTE(review): range(1, 10) tries k = 1..9 only - confirm that
        # k = 10 is meant to be excluded.
        for k in range(1, 10):
            print('k={}'.format(k))
            h, _ = intervals.find_best_interval(
                train_samples[:, 0], train_samples[:, 1], k)
            holdout_error = calc_holdout_error(
                holdout_size, h, holdout_samples)
            if holdout_error < min_k_holdout_error:
                min_k, min_k_holdout_error = k, holdout_error

        print('min_k {}'.format(min_k))
        best_k_list.append(min_k)

    print(best_k_list)
    return np.argmax(np.bincount(best_k_list))
def experiment_k_range_erm(self, m, k_first, k_last, step):
    """Runs ERM on one sample for a range of k and plots both errors.

    The k values are np.arange(k_first, k_last + step, step). For each one
    the hypothesis from find_best_interval is scored with
    calc_empirical_error and calc_true_error; both curves are saved to
    'results/section_d.png'.

    Input: m - an integer, the size of the data sample.
           k_first - an integer, the maximum number of intervals in the
                     first experiment.
           k_last - an integer, bound for the maximum number of intervals
                    in the last experiment.
           step - an integer, the difference between the size of k in each
                  experiment.

    Returns: The best k value (an integer) according to the ERM algorithm,
             i.e. the k with the smallest empirical error.
    """
    k_list = np.arange(k_first, k_last + step, step)
    samples = self.sample_from_D(m)

    empirical_error_list, true_error_list = [], []
    for k in k_list:
        print(k)
        h, _ = intervals.find_best_interval(samples[:, 0], samples[:, 1], k)
        empirical_error_list.append(calc_empirical_error(samples, h))
        true_error_list.append(calc_true_error(h))

    curves = (
        (empirical_error_list, "empirical errors"),
        (true_error_list, "true errors"),
    )
    for values, label in curves:
        plt.plot(k_list, values, label=label)
    plt.legend()
    plt.savefig('results/section_d.png')

    best_k = k_list[np.argmin(empirical_error_list)]
    print('best k {}'.format(best_k))
    return best_k
def experiment_k_range_erm(self, m, k_first, k_last, step):
    """Runs ERM on one sample for a range of k and picks the best k.

    For every k in range(k_first, k_last + 1, step) the empirical error
    (error count / m) and self.calc_true_error are stored in a results
    table; the k with the smallest empirical error is returned (ties keep
    the earlier k). The plotting call is currently commented out.

    Input: m - an integer, the size of the data sample.
           k_first - an integer, the maximum number of intervals in the
                     first experiment.
           k_last - an integer, the maximum number of intervals in the last
                    experiment.
           step - an integer, the difference between the size of k in each
                  experiment.

    Returns: The best k value (an integer) according to the ERM algorithm.
    """
    # NOTE(review): the sample is passed on as returned by sample_from_D;
    # presumably it is already sorted by x - confirm.
    samples = self.sample_from_D(m)
    k_values = range(k_first, k_last + 1, step)
    results = np.zeros((len(k_values), 2))

    best_index = 0
    for index, k in enumerate(k_values):
        best_intervals, error_count = intervals.find_best_interval(
            samples[:, 0], samples[:, 1], k)
        results[index][0] = error_count / m
        results[index][1] = self.calc_true_error(best_intervals)
        if results[index][0] < results[best_index][0]:
            best_index = index

    # self.plotD(k_first, k_last, step, results)

    # Map the winning row back to its k value; "index + 1" was only right
    # for k_first == 1 and step == 1.
    return k_first + best_index * step
def cross_validation(self, m, k_first, k_last, step, T):
    """Finds a k that gives a good test error.

    In each of the T rounds a training sample of size m - int(0.2 * m) and
    a holdout sample of size int(0.2 * m) are drawn with sample_from_D. For
    every k in range(k_first, k_last + 1, step) the find_best_interval
    hypothesis is scored on the holdout sample with self.predict_y; the
    misclassification counts are summed over all rounds and the k with the
    smallest total is returned.

    Input: m - an integer, the size of the data sample.
           k_first - an integer, the first k value tried.
           k_last - an integer, the last k value tried.
           step - an integer, the difference between consecutive k values.
           T - an integer, the number of times the experiment is performed.

    Returns: The best k value (an integer) found by the cross validation
             algorithm.
    """
    # One shared range for both the table size and the loop. Previously
    # the size came from int((k_last - k_first) / step + 1) while the loop
    # ran over range(k_first, k_last + step, step); these disagree when
    # the span is not a multiple of step and then index out of bounds.
    k_values = range(k_first, k_last + 1, step)
    m_ho = int(0.2 * m)  # size of holdout set
    m_t = m - m_ho  # size of train data
    error_counts = np.zeros(len(k_values), dtype=float)

    for t in range(T):
        print("t {}".format(t))
        # Samples are indexed as [0, :] for x and [1, :] for y below.
        S_t = self.sample_from_D(m_t)
        S_ho = self.sample_from_D(m_ho)
        for i, k in enumerate(k_values):
            best_intervals, _ = find_best_interval(S_t[0, :], S_t[1, :], k)
            for j in range(m_ho):
                y_pred = self.predict_y(best_intervals, S_ho[0, j])
                error_counts[i] += (S_ho[1, j] != y_pred)

    best_k = k_first + np.argmin(error_counts) * step
    print("best k value found by cross validation algorithm {}".format(
        best_k))
    return best_k
def experiment_k_range_erm(self, m, k_first, k_last, step):
    """Runs ERM on one sorted sample for a range of k and plots both errors.

    For every k in np.arange(k_first, k_last + 1, step) the empirical error
    (error count / m) and self.get_true_error are recorded, plotted as dots
    and saved to 'q1d.png'.

    Input: m - an integer, the size of the data sample.
           k_first - an integer, the maximum number of intervals in the
                     first experiment.
           k_last - an integer, the maximum number of intervals in the last
                    experiment.
           step - an integer, the difference between the size of k in each
                  experiment.

    Returns: The best k value (an integer) according to the ERM algorithm,
             i.e. the k with the smallest empirical error (ties keep the
             earlier k).
    """
    k_values = np.arange(k_first, k_last + 1, step)
    sorted_samples = sorted(self.sample_from_D(m), key=lambda p: p[0])
    # The sample does not change between k values; split it once.
    xs = [sample[0] for sample in sorted_samples]
    ys = [sample[1] for sample in sorted_samples]

    empirical_errors, true_errors = [], []
    for k in k_values:
        print("Experimenting for k =", k)
        best_intervals, best_error_count = intervals.find_best_interval(
            xs, ys, k)
        empirical_errors.append(best_error_count / (m * 1.0))
        true_errors.append(self.get_true_error(best_intervals))

    plt.figure()
    plt.plot(k_values, empirical_errors, '.')
    plt.plot(k_values, true_errors, '.')
    plt.xlabel('k')
    plt.ylabel('error')
    plt.xticks(k_values)
    plt.yticks(np.arange(0, 1.05, 0.05))
    plt.grid(True)
    plt.savefig('q1d.png')

    # Translate the winning position into its k value; "index + 1" was
    # only right for k_first == 1 and step == 1.
    best_index = empirical_errors.index(min(empirical_errors))
    return int(k_values[best_index])
def run_for_k(self, k, samples):
    """Run find_best_interval on `samples` for one k and report the errors.

    Column 0 of `samples` is passed as x and column 1 as y.

    Returns: a tuple (empirical error, true error, k, number of samples,
             best intervals), where the empirical error is the returned
             error count divided by len(samples) and the true error comes
             from self.calc_ep.
    """
    best_intervals, emp_error = intervals.find_best_interval(
        samples[:, 0], samples[:, 1], k)
    true_error = self.calc_ep(best_intervals)
    sample_count = len(samples)
    return (emp_error / sample_count, true_error, k, sample_count,
            best_intervals)
def cross_validation(self, m, T):
    """Finds a k that gives a good test error.

    One sample of size m is drawn. In each of the T rounds it is split
    80/20 with train_test_split; for k = 1..10 the find_best_interval
    hypothesis fitted on the training part (sorted by x) is scored with
    calc_holdout_error on the test part. The scores are averaged over the
    rounds and the k with the smallest average is returned (ties keep the
    smallest k).

    Input: m - an integer, the size of the data sample.
           T - an integer, the number of times the experiment is performed.

    Returns: The best k value (an integer) found by the cross validation
             algorithm.
    """
    max_k = 10
    p = self.sample_from_D(m)
    holdout_error_list = np.zeros(max_k)

    for t in range(T):
        # A constant random_state would give the identical split in every
        # round, making the T repetitions redundant. Offsetting the seed by
        # the round number keeps the run reproducible while varying the
        # split.
        train_points, test_points = train_test_split(
            p, test_size=0.2, random_state=42 + t)
        train_points = sorted(train_points, key=lambda point: point[0])
        train_x = [point[0] for point in train_points]
        train_y = [point[1] for point in train_points]
        for k in range(1, max_k + 1):
            best_intervals, _ = find_best_interval(train_x, train_y, k)
            holdout_error_list[k - 1] += (
                calc_holdout_error(test_points, best_intervals) / T)

    # argmin always yields a valid k, whereas the earlier manual search
    # returned 0 whenever no averaged error was below 1.
    return int(np.argmin(holdout_error_list)) + 1
def part_c():
    """Plot average empirical and true ERM error (k=2) against sample size.

    For m = 10, 15, ..., 100 the experiment is repeated T = 100 times: a
    sample is drawn with sample_points_from_distribution, find_best_interval
    is run with k=2, and the empirical error (error count / m) and
    calculate_true_error of the result are accumulated. The averages are
    drawn as a scatter plot and saved to 'q1_part_c.png'.
    """
    T = 100
    m_range = range(10, 101, 5)
    empirical_error_average = []
    true_error_average = []

    for m in m_range:
        sum_empirical_error = 0.0
        sum_true_error = 0.0
        for _ in range(1, T + 1):
            # (i) Draw a sample of size m and run the ERM algorithm on it
            xs, ys = sample_points_from_distribution(m)
            intervals, best_error = find_best_interval(xs, ys, k=2)
            # (ii) Empirical error for the returned hypothesis
            sum_empirical_error += float(best_error) / m
            # (iii) True error for the returned hypothesis
            sum_true_error += calculate_true_error(intervals)
        empirical_error_average.append(sum_empirical_error / T)
        true_error_average.append(sum_true_error / T)

    # Plot the averages across the T runs as a function of m
    plt.xlabel('m')
    plt.ylabel('error')
    plt.title('Empirical error vs True error')
    # FigureCanvasBase.set_window_title was removed from Matplotlib; the
    # window title is set through the figure manager instead.
    manager = plt.gcf().canvas.manager
    if manager is not None:
        manager.set_window_title('Programming Assignment: Question 1(c)')
    plt.scatter(m_range, empirical_error_average, marker='o',
                label='empirical error')
    plt.scatter(m_range, true_error_average, marker='+', label='true error')
    plt.legend()
    plt.savefig('q1_part_c.png')
    plt.clf()
def experiment_k_range_erm(self, m, k_first, k_last, step):
    """Runs ERM on one sample for a range of k and plots both errors.

    For every k in range(k_first, k_last + 1, step) the true error
    (self.calcErr) and the empirical error (error count / m) of the
    find_best_interval result are recorded and plotted against their
    position in the range.

    Input: m - an integer, the size of the data sample.
           k_first - an integer, the maximum number of intervals in the
                     first experiment.
           k_last - an integer, the maximum number of intervals in the last
                    experiment.
           step - an integer, the difference between the size of k in each
                  experiment.

    Returns: The best k value (an integer) according to the ERM algorithm,
             i.e. the k with the smallest empirical error.
    """
    # Draw the sample once so that every k is evaluated on the same data;
    # re-sampling inside the loop made the errors incomparable across k.
    vals = self.sample_from_D(m)
    x_vals = vals[:, 0]
    y_vals = vals[:, 1]

    true_err = []
    emp_err = []
    for k in range(k_first, k_last + 1, step):
        intervals_lst, eP_s = intervals.find_best_interval(
            x_vals, y_vals, k)
        true_err.append(self.calcErr(intervals_lst))
        emp_err.append(eP_s / m)

    plt.clf()
    plt.ylim((-0.1, 1.1))
    plt.xlabel("step")
    plt.ylabel("error")
    plt.plot(true_err, marker='o', color='blue', label='true_error')
    plt.plot(emp_err, marker='o', color='red', label='empirical_error')
    plt.legend()
    # plt.show()

    minimum = np.argmin(emp_err)  # index of minimal empirical error
    return minimum * step + k_first
def part_a():
    """Plot a sample of 100 points together with the k=2 ERM intervals.

    The sample from sample_points_from_distribution is scattered, red
    vertical lines mark x = 0.25, 0.5, 0.75, and every interval returned by
    find_best_interval is drawn in blue at y = 0.5 (only the first one
    carries the 'model' legend entry). The figure is saved to
    'q1_part_a.png'.
    """
    sample_size = 100
    xs, ys = sample_points_from_distribution(sample_size)
    intervals, _ = find_best_interval(xs, ys, k=2)

    plt.xticks([0, 0.25, 0.5, 0.75, 1])
    plt.yticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1, 1.1])
    plt.xlabel('x')
    plt.ylabel('y')
    plt.title('Sample points from distribution')
    # FigureCanvasBase.set_window_title was removed from Matplotlib; the
    # window title is set through the figure manager instead.
    manager = plt.gcf().canvas.manager
    if manager is not None:
        manager.set_window_title('Programming Assignment: Question 1(a)')

    plt.scatter(xs, ys, label='data points')
    for x_tick in [0.25, 0.5, 0.75]:
        plt.plot([x_tick, x_tick], [-0.1, 1.1], 'r')

    for i, interval in enumerate(intervals):
        # Label only the first segment so the legend has a single entry.
        label_kwargs = {'label': 'model'} if i == 0 else {}
        plt.plot(interval, [0.5, 0.5], 'b', **label_kwargs)

    plt.legend(loc=7)
    plt.savefig('q1_part_a.png')
    plt.clf()
def part_e():
    """Plot train, test and true error of the ERM hypothesis for k = 1..20.

    Two samples of size 50 are drawn (test first, then train). For each k
    the find_best_interval hypothesis fitted on the training sample is
    scored by its training error (error count / m), calculate_true_error,
    and calculate_empirical_error on the test sample divided by m. The
    three curves are saved to 'q1_part_e.png'.
    """
    m = 50
    k_range = range(1, 21)
    empirical_error_train = []
    empirical_error_test = []
    true_error = []

    xs_test, ys_test = sample_points_from_distribution(m)
    xs_train, ys_train = sample_points_from_distribution(m)

    # Find the best ERM hypothesis for k=1,2,...,20
    for k in k_range:
        intervals, best_error = find_best_interval(xs_train, ys_train, k=k)
        empirical_error_train.append(float(best_error) / m)
        true_error.append(float(calculate_true_error(intervals)))
        empirical_error_test.append(
            float(calculate_empirical_error(intervals, xs_test, ys_test)) / m)

    # Plot the empirical and true errors as a function of k.
    plt.xlabel('k')
    plt.ylabel('errors')
    plt.title('Empirical error vs True error')
    # FigureCanvasBase.set_window_title was removed from Matplotlib; the
    # window title is set through the figure manager instead.
    manager = plt.gcf().canvas.manager
    if manager is not None:
        manager.set_window_title('Programming Assignment: Question 1(e)')
    plt.scatter(k_range, true_error, marker='+', label='true error')
    plt.scatter(k_range, empirical_error_train, marker='o',
                label='empirical error for train')
    plt.scatter(k_range, empirical_error_test, marker='x',
                label='empirical error for test')
    plt.legend()
    plt.savefig('q1_part_e.png')
    plt.clf()
def draw_sample_intervals(self, m, k):
    """
    Plots the data as asked in (a) i ii and iii.

    Row 0 of the sample is used as x and row 1 as y. Each interval returned
    by find_best_interval is drawn as a blue horizontal line at y = 0.8,
    the sample as red dots, and vertical lines mark x = 0.2, 0.4, 0.6, 0.8.
    The figure is saved as 'sampled_intervals.png' and closed.

    Input: m - an integer, the size of the data sample.
           k - an integer, the maximum number of intervals.

    Returns: None.
    """
    title = 'sampled_intervals'
    S = self.sample_from_D(m)
    xs, ys = S[0, :], S[1, :]

    best_intervals, _ = find_best_interval(xs, ys, k)
    for inter in best_intervals:
        # horizontal line spanning the interval
        plt.hlines(0.8, inter[0], inter[1], 'b', lw=3)
    plt.plot(xs, ys, 'ro')

    # vertical guide lines
    for boundary in (0.2, 0.4, 0.6, 0.8):
        plt.axvline(x=boundary)

    plt.axis([0, 1, -0.1, 1.1])
    plt.title(title)
    plt.savefig(title + '.png')
    # plt.show()
    plt.close()
    return
def measure_empirical_error(x, y, k):
    """Return the empirical error of the find_best_interval result.

    x and y are reordered by ascending x before find_best_interval is
    called; its second return value is divided by the number of points.
    """
    # Sort the sample by x.
    order = numpy.argsort(x)
    sorted_x = x[order]
    sorted_y = y[order]
    # Run ERM and normalise the returned error count.
    _, error = find_best_interval(sorted_x, sorted_y, k)
    return float(error) / float(len(sorted_x))
def experiment_k_range_erm(self, m, k_first, k_last, step):
    """Runs ERM on one sample for a range of k and plots both errors.

    The k values are np.arange(k_first, k_last + step, step). For each one
    the empirical error (error count / m) and self.calc_true_error of the
    find_best_interval result are recorded and printed. Both curves are
    plotted, the k with the minimal true error is highlighted, and the
    figure is saved as 'Errors_wrt_K.pdf' in the current working directory.

    Input: m - an integer, the size of the data sample.
           k_first - an integer, the maximum number of intervals in the
                     first experiment.
           k_last - an integer, bound for the maximum number of intervals
                    in the last experiment.
           step - an integer, the difference between the size of k in each
                  experiment.

    Returns: The best k value (an integer) according to the ERM algorithm,
             i.e. the k with the smallest empirical error (ties keep the
             smallest k).
    """
    print('\nRunning experiment_k_range_erm\n')
    true_errs_array = []
    sample_errs_array = []
    points = self.sample_from_D(m)
    k_arr = np.arange(k_first, k_last + step, step)

    for k in k_arr:
        print(f'k={k}')
        best_intervals, besterr = intervals.find_best_interval(
            points[:, 0], points[:, 1], k)
        empirical_err = besterr / float(m)
        sample_errs_array.append(empirical_err)
        true_err = self.calc_true_error(best_intervals)
        true_errs_array.append(true_err)
        print(
            f'true_err = {round(true_err,2)}, Empirical_err = {round(empirical_err,2)}'
        )

    fig, ax = plt.subplots()
    plt.xlabel('K (number of intervals)')
    plt.ylabel('Error')
    plt.title('Errors wrt K')

    # The minimal true error is only highlighted on the plot.
    min_true_err, min_true_k = min(zip(true_errs_array, k_arr))
    ax.plot(k_arr, true_errs_array, c='blue', label='True Err')
    ax.scatter(min_true_k, min_true_err, c='green',
               label='Minimal True Err', marker='D')
    ax.plot(k_arr, sample_errs_array, c='red', label='Empirical Err')
    plt.ylim(0, max(true_errs_array + sample_errs_array) + 0.05)
    plt.xlim(k_first - 1, k_last + 1)
    ax.legend()
    plt.savefig(os.path.join(os.getcwd(), 'Errors_wrt_K.pdf'))
    print('\nErrors_wrt_K.pdf saved to CWD\n')

    # ERM selects by empirical error. Returning the minimiser of the true
    # error (as before) contradicted the documented contract.
    return k_arr[int(np.argmin(sample_errs_array))]
def experiment_m_range_erm(self, m_first, m_last, step, k, T):
    """Runs the ERM algorithm and plots its average errors against m.

    In each of the T rounds, for every m in range(m_first, m_last + 1,
    step), a sample is drawn and the empirical error (error count / m) and
    self.calc_true_error of the find_best_interval result are accumulated.
    The averages are plotted and saved as 'm_range_erm.png'.

    Input: m_first - an integer, the smallest size of the data sample in
                     the range.
           m_last - an integer, the largest size of the data sample in the
                    range.
           step - an integer, the difference between the size of m in each
                  loop.
           k - an integer, the maximum number of intervals.
           T - an integer, the number of times the experiment is performed.

    Returns: np.ndarray of shape (n_steps,2).
        Column 0 is the average empirical error and column 1 the average
        true error for each m in the range accordingly.
    """
    # One shared range for the table size, the loop and the plot. The size
    # used to be int((m_last - m_first) / step + 1) while the loop and the
    # x axis used range(m_first, m_last + step, step); these disagree when
    # the span is not a multiple of step and then index out of bounds.
    m_values = range(m_first, m_last + 1, step)
    E = np.zeros((len(m_values), 2), dtype=float)

    for t in range(T):
        print("t {}".format(t))
        for i, m in enumerate(m_values):
            # Samples are indexed as [0, :] for x and [1, :] for y.
            S = self.sample_from_D(m)
            best_intervals, besterror = find_best_interval(
                S[0, :], S[1, :], k)
            E[i, 0] += (besterror / m)
            E[i, 1] += self.calc_true_error(best_intervals)

    E /= T  # turn the accumulated sums into averages

    m_vals = np.array(list(m_values))
    plt.plot(m_vals, E[:, 0], 'r-', m_vals, E[:, 1], 'b-')
    plt.axis([m_first, m_last, 0, E.max()])
    plt.text(0.5, 0.9, 'red = Es', transform=plt.gca().transAxes,
             ha='center')
    plt.text(0.5, 0.85, 'blue = Ep', transform=plt.gca().transAxes,
             ha='center')
    title = 'm_range_erm'
    plt.xlabel('samples (m)')
    plt.ylabel('Error')
    plt.title(title)
    plt.savefig(title + '.png')
    # plt.show()
    plt.close()
    return E
def plot_2a():
    """Plot 100 samples together with the k=2 intervals and show the figure.

    Dashed vertical lines mark x = 0.25, 0.5, 0.75. The sample from
    draw_samples is plotted as black dots, and each interval returned by
    find_best_interval (run on the sample sorted by x) is printed and drawn
    as a thick black segment at y = 0.5.
    """
    for boundary in (0.25, 0.5, 0.75):
        plt.plot([boundary, boundary], [-.1, 1.1], 'k--')

    x, y = draw_samples(100)
    plt.plot(x, y, 'k.')
    plt.ylim([-.1, 1.1])

    idx = numpy.argsort(x)
    best_intervals = find_best_interval(x[idx], y[idx], k=2)[0]
    for interval in best_intervals:
        # print() with a single argument behaves the same on Python 2 and
        # Python 3; the bare print statement does not parse on Python 3.
        print(interval)
        plt.plot(interval, [0.5, 0.5], 'k', linewidth=10)
    plt.show()
def measure_intervals(m=50, k=2):
    """Draw a sample, run find_best_interval and return both errors.

    The sample from draw_samples(m) is reordered by ascending x before the
    ERM call.

    Returns: a tuple (true_error, empirical_error), where the empirical
             error is the returned error count divided by m and the true
             error comes from calculate_true_error.
    """
    # Draw the sample and sort it by x.
    x, y = draw_samples(m)
    order = numpy.argsort(x)
    sorted_x, sorted_y = x[order], y[order]

    # Run ERM on the sorted sample.
    best_intervals, error = find_best_interval(sorted_x, sorted_y, k)

    empirical_error = float(error) / float(m)
    true_error = calculate_true_error(best_intervals)
    return true_error, empirical_error