コード例 #1
0
ファイル: assignment2.py プロジェクト: BarY7/k_intervals_ex2
 def compute_errors_and_ks(self, m, k_first, k_last, step):
     """Run ERM on one sample for a range of k and record both errors.

     One sample of size m is drawn with ``self.sample_from_D`` and sorted
     by its first column. For every k in
     ``range(k_first, k_last + step, step)`` the intervals returned by
     ``intervals.find_best_interval`` are scored on the sample (empirical
     error) and with ``self.calculate_error_from_intervals`` (true error).

     Input: m - an integer, the size of the data sample.
            k_first - an integer, the first k that is evaluated.
            k_last - an integer, the last k of the requested range.
            step - an integer, the difference between consecutive k values.

     Returns: a tuple (k_array, empirical_array, true_array). k_array holds
         the evaluated k values; the two float32 arrays hold the empirical
         and the true error of the k stored at the same position.
     """
     k_values = list(range(k_first, k_last + step, step))
     k_array = np.array(k_values)
     empirical_array = np.zeros(len(k_values), dtype=np.float32)
     true_array = np.zeros(len(k_values), dtype=np.float32)

     sample_array = self.sample_from_D(m)
     sample_array = sample_array[sample_array[:, 0].argsort()]
     # The sample does not change between k values, so slice it once.
     xs = sample_array[:, 0]
     ys = sample_array[:, 1]

     for position, k in enumerate(k_values):
         all_intervals = intervals.find_best_interval(xs, ys, k)[0]

         # A point is misclassified when it is labelled 0 inside some
         # interval, or labelled 1 outside all of them.
         mistakes = 0
         for x_value, label in zip(xs, ys):
             inside = any(
                 self.is_in_interval(interval, x_value)
                 for interval in all_intervals)
             if (inside and label == 0) or (not inside and label == 1):
                 mistakes += 1

         # float() keeps this a true division on Python 2 as well.
         empirical_array[position] = mistakes / float(m)
         true_array[position] = self.calculate_error_from_intervals(
             all_intervals)
     return k_array, empirical_array, true_array
コード例 #2
0
 def part_iii():
     """Plot the intervals from find_best_interval(x, y, 2) at height 0.5.

     x and y are not parameters; they are read from the enclosing scope.
     Each interval is drawn as a thick green segment labelled
     "ERM interval".
     """
     erm_result = intervals.find_best_interval(x, y, 2)
     best_intervals, _ = erm_result
     for left, right in best_intervals:
         segment_xs = [left, right]
         segment_ys = [0.5, 0.5]
         pyplot.plot(segment_xs, segment_ys, 'g-', linewidth=4,
                     label="ERM interval")
コード例 #3
0
def Q2_f(output_directory):
    """Choose k by validation error and save the resulting plot.

    Draws one training sample and one validation sample of size 50 with
    ``RandomDSorted``, runs ``intervals.find_best_interval`` on the
    training sample for k = 1..20, and scores each hypothesis on the
    validation sample with ``EmpiricalError``. The k with the smallest
    validation error is printed and marked on the plot.

    Input: output_directory - directory in which "Q2f.png" is written.

    Returns: None.
    """
    # Parenthesised single-argument prints behave the same on Python 2 and 3.
    print("::Question 2f::")
    m = 50
    # list() keeps k_values indexable and plottable on Python 3 too.
    k_values = list(range(1, 21))
    errors = []
    x, y = RandomDSorted(m)
    validation_x, validation_y = RandomDSorted(m)
    for k in k_values:
        k_intervals, _ = intervals.find_best_interval(x, y, k)
        validation_error = EmpiricalError(validation_x, validation_y,
                                          k_intervals)
        errors.append(validation_error)
    best_k = k_values[numpy.argmin(errors)]
    # Formatting by hand reproduces the "label value" output of the old
    # print statement without relying on Python 2 syntax.
    print("\tBest k using validation: %s" % best_k)
    pyplot.plot([best_k, best_k], [0, 1], "r--", label="best k")
    pyplot.plot(k_values, errors, 'b^--', label="validation empirical error")
    pyplot.legend()
    pyplot.title(
        "Quetion 2f:: Empirical error of validation\nas a function of k (using ERM)"
    )
    pyplot.xlabel("k")
    pyplot.ylabel("Empirical error")
    # Zoom the y axis to the observed error range with a small margin.
    pyplot.ylim([min(errors) - 0.01, max(errors) + 0.01])
    pyplot.savefig(os.path.join(output_directory, "Q2f.png"))
    pyplot.clf()
コード例 #4
0
def Q2_c(output_directory, T=100, m_values=tuple(range(10, 101, 5))):
    """Plot the average empirical and true ERM error (k=2) against m.

    For every sample size in m_values the experiment is repeated T times:
    a sample is drawn with ``RandomDSorted``, ERM is run with
    ``intervals.find_best_interval`` for k=2, and the empirical error
    (error count divided by m) and ``TrueError`` of the returned intervals
    are accumulated. The per-m averages are plotted and saved.

    Input: output_directory - directory in which "Q2c.png" is written.
           T - an integer, the number of repetitions per sample size.
           m_values - the sample sizes to evaluate.

    Returns: None.
    """
    # Parenthesised single-argument print works on Python 2 and 3 alike.
    print("::Question 2c::")
    k = 2
    average_true_errors = []
    average_empirical_errors = []
    for m in m_values:
        total_true_error = 0
        total_empirical_error = 0
        # range() instead of the Python-2-only xrange(); T is small.
        for _ in range(T):
            # part i: draw a sample and run ERM on it
            x, y = RandomDSorted(m)
            k_intervals, best_error = intervals.find_best_interval(x, y, k)
            # part ii: empirical error of the returned hypothesis
            total_empirical_error += best_error / float(m)
            # part iii: true error of the returned hypothesis
            total_true_error += TrueError(k_intervals)
        average_empirical_errors.append(total_empirical_error / T)
        average_true_errors.append(total_true_error / T)
    pyplot.plot(m_values,
                average_empirical_errors,
                'ro--',
                label="empirical error")
    pyplot.plot(m_values, average_true_errors, 'b^--', label="true error")
    pyplot.legend()
    pyplot.xlabel("m (samples amount)")
    pyplot.ylabel("Average error")
    pyplot.title(
        "Question 2c:: Average error of ERM using k=2 intervals,\nas a function of samples amount"
    )
    pyplot.savefig(os.path.join(output_directory, "Q2c.png"))
    pyplot.clf()
コード例 #5
0
ファイル: assignment2.py プロジェクト: BarY7/k_intervals_ex2
    def draw_sample_intervals(self, m, k):
        """
        Plots the data as asked in (a) i ii and iii.

        Scatters a sample of size m (sorted by x), draws vertical lines at
        0.2, 0.4, 0.6 and 0.8, and draws every interval returned by
        intervals.find_best_interval for the given k just below the data.
        The interval endpoints are used as the x-axis ticks.

        Input: m - an integer, the size of the data sample.
               k - an integer, the maximum number of intervals.

        Returns: the intervals returned by intervals.find_best_interval.
        """
        samples = self.sample_from_D(m)
        samples = samples[samples[:, 0].argsort()]
        xs, ys = samples[:, 0], samples[:, 1]
        plt.scatter(xs, ys)
        plt.xlim(-0.1, 1.1)
        plt.ylim(-0.1, 1.1)
        for boundary in (0.2, 0.4, 0.6, 0.8):
            plt.axvline(x=boundary)

        # Use the caller's k (the original hard-coded 3 here) and walk the
        # intervals actually returned, so a k different from 3 neither
        # skips intervals nor indexes past the end of the result.
        inters, _ = intervals.find_best_interval(xs, ys, k)
        flat_inters = []
        for interval in inters:
            flat_inters.append(interval[0])
            flat_inters.append(interval[1])
            dots = np.linspace(interval[0], interval[1], num=1000)
            plt.plot(dots, [-0.1] * 1000, linewidth=5)
        plt.xticks(flat_inters)
        plt.show()
        return inters
コード例 #6
0
    def experiment_k_range_srm(self, m, k_first, k_last, step):
        """Runs the experiment in (d) with the SRM selection rule.

        For every k in range(k_first, k_last + 1, step) the empirical error,
        the true error and the penalty of the ERM hypothesis are stored; the
        k minimizing empirical error + penalty is selected. Plotting is
        currently disabled (see the commented-out call at the end).

        Input: m - an integer, the size of the data sample.
               k_first - an integer, the maximum number of intervals in the first experiment.
               k_last - an integer, the maximum number of intervals in the last experiment.
               step - an integer, the difference between the size of k in each experiment.

        Returns: The best k value (an integer) according to the SRM algorithm.
        """
        samples = self.sample_from_D(m)
        k_values = range(k_first, k_last + 1, step)
        # Columns: empirical error, true error, penalty.
        results = np.zeros((len(k_values), 3))
        best_index = 0
        for index, k in enumerate(k_values):
            erm_result = intervals.find_best_interval(samples[:, 0], samples[:, 1], k)
            # float() keeps this a true division on Python 2 as well.
            results[index][0] = erm_result[1] / float(m)
            results[index][1] = self.calc_true_error(erm_result[0])
            results[index][2] = self.calc_penalty(m, k)
            candidate = results[index][0] + results[index][2]
            incumbent = results[best_index][0] + results[best_index][2]
            if candidate < incumbent:
                best_index = index
        # self.plotE(k_first, k_last, step, results)
        # Map the winning row back to its k; "index + 1" was only correct
        # for k_first == 1 and step == 1.
        return k_first + best_index * step
コード例 #7
0
    def draw_sample_intervals(self, m, k):
        """
        Plots the data as asked in (a) i ii and iii.

        Points labelled 1 are drawn at height 1 and points labelled 0 at
        height 0, with dashed red lines at 0.2, 0.4, 0.6 and 0.8. The
        intervals returned by find_best_interval are drawn in green near the
        bottom and annotated with their endpoints. The figure is saved to
        "section a - draw_sample_intervals" and closed.

        Input: m - an integer, the size of the data sample.
               k - an integer, the maximum number of intervals.

        Returns: None.
        """
        points = self.sample_from_D(m)
        points_by_x = sorted(points, key=lambda point: point[0])

        xs_labelled_zero = [point[0] for point in points if point[1] == 0]
        xs_labelled_one = [point[0] for point in points if point[1] == 1]
        plt.plot(xs_labelled_one, [1] * len(xs_labelled_one), 'o', label='one')
        plt.plot(xs_labelled_zero, [0] * len(xs_labelled_zero), 'o', label='zero')

        for boundary in (0.2, 0.4, 0.6, 0.8):
            plt.axvline(boundary, color='r', linestyle='--', linewidth=1.0)
        plt.axis([0, 1, -0.1, 1.1])

        sorted_xs = [point[0] for point in points_by_x]
        sorted_ys = [point[1] for point in points_by_x]
        best_intervals, _ = find_best_interval(sorted_xs, sorted_ys, k)

        for interval in best_intervals:
            segment = np.linspace(interval[0], interval[1], 100)
            plt.plot(segment, [-0.09] * 100, color='g', linewidth=5.0)
            # Label both ends of the segment with their x value.
            for endpoint in (segment[0], segment[-1]):
                plt.annotate("{:.2f}".format(endpoint), (endpoint, -0.1))

        plt.legend(loc='best')
        plt.ylabel('y')
        plt.xlabel('x')
        plt.title("best intervals (m={},k={})".format(m, k))
        plt.savefig("section a - draw_sample_intervals")
        plt.close()
コード例 #8
0
    def experiment_m_range_erm(self, m_first, m_last, step, k, T):
        """Runs the ERM algorithm over a range of sample sizes.

        For each m in range(m_first, m_last + 1, step) the experiment is
        repeated T times and the empirical and true errors are averaged.
        Plotting is currently disabled (see the commented-out call).

        Input: m_first - an integer, the smallest size of the data sample in the range.
               m_last - an integer, the largest size of the data sample in the range.
               step - an integer, the difference between the size of m in each loop.
               k - an integer, the maximum number of intervals.
               T - an integer, the number of times the experiment is performed.

        Returns: np.ndarray of shape (n_steps,2).
            Column 0 is the average empirical error and column 1 the average
            true error for each m in the range accordingly.
        """
        sample_sizes = range(m_first, m_last + 1, step)
        results = np.zeros((len(sample_sizes), 2))
        for row, m in enumerate(sample_sizes):
            empirical_total = 0
            true_total = 0
            for _ in range(T):
                samples = self.sample_from_D(m)
                erm_result = intervals.find_best_interval(samples[:, 0], samples[:, 1], k)
                empirical_total += erm_result[1] / m
                true_total += self.calc_true_error(erm_result[0])

            results[row][0] = empirical_total / T
            results[row][1] = true_total / T

        # self.plotC(m_first, m_last, step, results)
        return results
コード例 #9
0
    def experiment_k_range_erm(self, m, k_first, k_last, step):
        """Finds the ERM hypothesis for each k in the range on one sample.

        Plots the empirical error (red) and the true error (blue) as a
        function of k; the figure is not shown or saved here.

        Input: m - an integer, the size of the data sample.
               k_first - an integer, the maximum number of intervals in the first experiment.
               k_last - an integer, the maximum number of intervals in the last experiment.
               step - an integer, the difference between the size of k in each experiment.

        Returns: The k with the smallest empirical error; 0 when no k has
            an empirical error below 1.
        """
        k_values = list(range(k_first, k_last + 1, step))
        sample = self.sample_from_D(m)

        emp_errors, true_errors = [], []
        best_k, lowest_error = 0, 1
        for k in k_values:
            best_intervals, error_amount = intervals.find_best_interval(sample[:, 0], sample[:, 1], k)
            current_error = error_amount / m
            emp_errors.append(current_error)
            true_errors.append(self.caluclate_true_error(best_intervals))
            # Strict comparison: the first k reaching the minimum is kept.
            if current_error < lowest_error:
                lowest_error, best_k = current_error, k

        plt.plot(k_values, emp_errors, color='red')
        plt.plot(k_values, true_errors, color='blue')

        plt.legend(['Empirical Error', 'True Error'], loc='upper right')

        return best_k
コード例 #10
0
ファイル: imp.py プロジェクト: RazLandau/ML
    def draw_sample_intervals(self, m, k):
        """
        Plots the data as asked in (a) i ii and iii.

        Draws the sample (sorted by x) as dots, vertical lines at 0.2, 0.4,
        0.6 and 0.8, horizontal lines at 0 and 1, and each interval returned
        by intervals.find_best_interval at height 0.5. The figure is saved
        as 'q1a.png'.

        Input: m - an integer, the size of the data sample.
               k - an integer, the maximum number of intervals.

        Returns: None.
        """
        sorted_samples = sorted(self.sample_from_D(m), key=lambda point: point[0])
        erm_result = intervals.find_best_interval(
            [point[0] for point in sorted_samples],
            [point[1] for point in sorted_samples], k)
        best_intervals = erm_result[0]

        plt.figure()
        sample_xs = [point[0] for point in sorted_samples]
        sample_ys = [point[1] for point in sorted_samples]
        plt.plot(sample_xs, sample_ys, '.')
        plt.xlabel('x')
        plt.ylabel('y')
        plt.axis([0, 1, -0.1, 1.1])
        plt.xticks(np.arange(0, 1, 0.2))
        for boundary in (0.2, 0.4, 0.6, 0.8):
            plt.gca().axvline(boundary)
        for level in (0, 1):
            plt.gca().axhline(level)
        for segment in best_intervals:
            plt.hlines(0.5, segment[0], segment[1])
        plt.savefig('q1a.png')
コード例 #11
0
    def experiment_m_range_erm(self, m_first, m_last, step, k, T):
        """Runs the ERM algorithm over a range of sample sizes.

        For each m in range(m_first, m_last + 1, step), draws T samples,
        sorts each by x, runs intervals.find_best_interval and averages the
        empirical and true errors. The averages are plotted against m and
        saved to 'Qc_new.pdf'.

        Input: m_first - an integer, the smallest size of the data sample in the range.
               m_last - an integer, the largest size of the data sample in the range.
               step - an integer, the difference between the size of m in each loop.
               k - an integer, the maximum number of intervals.
               T - an integer, the number of times the experiment is performed.

        Returns: np.ndarray of shape (n_steps,2).
            Column 0 is the average empirical error and column 1 the average
            true error for each m in the range accordingly.
        """
        sample_sizes = list(range(m_first, m_last + 1, step))
        emp_lst = []
        true_lst = []
        for m in sample_sizes:
            # Totals are per sample size, so they start fresh for every m.
            emp_total = 0
            true_total = 0
            for _ in range(T):
                pairs = sorted(self.sample_from_D(m), key=lambda pair: pair[0])
                x = np.array([pair[0] for pair in pairs])
                y = np.array([pair[1] for pair in pairs])

                best_intervals, emp_error = intervals.find_best_interval(x, y, k)
                emp_total += (emp_error / m)
                true_total += self.true_error(best_intervals)

            emp_lst.append(emp_total / T)
            true_lst.append(true_total / T)

        plt.plot(sample_sizes, emp_lst, 'ro', label='empirical error')
        plt.plot(sample_sizes, true_lst, 'bo', label='true error')
        plt.xlabel('m')
        plt.ylabel('Error')
        plt.legend()
        plt.title('empirical vs true error')
        plt.savefig('Qc_new.pdf')
        plt.clf()
        plt.cla()

        # Pack the two lists side by side into the documented (n_steps, 2) array.
        res = np.empty((len(emp_lst), 2))
        res[:, 0] = emp_lst
        res[:, 1] = true_lst

        return res
コード例 #12
0
    def cross_validation(self, m, T):
        """Finds a k that gives a good test error.

        One sample of size m is drawn; in each of the T rounds it is
        shuffled and split into a training part (the first 4*m//5 points,
        sorted by x) and a holdout part (the rest). For k = 1..10 the ERM
        hypothesis of the training part is scored on the holdout part, and
        the k with the lowest holdout error wins the round.

        Input: m - an integer, the size of the data sample.
               T - an integer, the number of times the experiment is performed.

        Returns: The k in 1..10 that won the most rounds (the smallest such
            k on ties).
        """
        # k_counts[k] counts the rounds won by k; slot 0 only collects
        # rounds in which no k got a holdout error below the initial bound.
        k_counts = [0] * 11
        sample = self.sample_from_D(m)
        train_size = 4 * m // 5
        for _ in range(T):
            np.random.shuffle(sample)
            training_set = sorted((sample[i] for i in range(train_size)),
                                  key=lambda point: point[0])
            train_xs = [point[0] for point in training_set]
            train_ys = [point[1] for point in training_set]
            holdout_set_x = [sample[i][0] for i in range(train_size, m)]
            holdout_set_y = [sample[i][1] for i in range(train_size, m)]
            min_k = 0
            min_error = 1
            for k in range(1, 11):
                best_intervals, _ = intervals.find_best_interval(
                    train_xs, train_ys, k)
                holdout_error = self.calc_holdout_error(best_intervals, holdout_set_x, holdout_set_y) / m
                if holdout_error < min_error:
                    min_error = holdout_error
                    min_k = k
            k_counts[min_k] += 1

        # The list position already is the k value, so no "+ 1" shift is
        # applied; only real k values (1..10) are eligible as the answer.
        return max(range(1, 11), key=lambda k: k_counts[k])
コード例 #13
0
    def draw_sample_intervals(self, m, k):
        """
        Plots the data as asked in (a) i ii and iii.

        Scatters the sample (sorted by x) in blue, draws black vertical
        lines at 0.2, 0.4, 0.6 and 0.8, and draws each interval returned by
        intervals.find_best_interval as a red bar at height -0.05. The
        figure is saved to 'Qa_new.pdf' and then cleared.

        Input: m - an integer, the size of the data sample.
               k - an integer, the maximum number of intervals.

        Returns: None.
        """
        # (i) the labelled sample
        pairs = sorted(self.sample_from_D(m), key=lambda pair: pair[0])
        x = np.array([pair[0] for pair in pairs])
        y = np.array([pair[1] for pair in pairs])
        plt.ylabel("Labels")
        plt.ylim(-0.1, 1.1)
        plt.scatter(x, y, color='blue')

        # (ii) vertical guide lines
        for boundary in (0.2, 0.4, 0.6, 0.8):
            plt.axvline(x=boundary, color='black')

        # (iii) the intervals chosen by ERM
        best_intervals, _ = intervals.find_best_interval(x, y, k)
        for segment in best_intervals:
            plt.hlines(-0.05, segment[0], segment[1], 'red', lw=5)
        plt.savefig('Qa_new.pdf')
        plt.clf()
        plt.cla()

        return None
コード例 #14
0
ファイル: assignment2.py プロジェクト: shanys8/hw-ML
    def cross_validation(self, m, T):
        """Finds a k that gives a good test error.

        One sample of size m is drawn; in each of the T rounds it is
        shuffled, the first m // 5 rows become the holdout set and the rest
        (sorted by the first column) the training set. For k = 1..9 the ERM
        hypothesis of the training set is scored with calc_holdout_error and
        the k with the lowest holdout error wins the round. Progress is
        printed along the way.

        Input: m - an integer, the size of the data sample.
               T - an integer, the number of times the experiment is performed.

        Returns: The k that won the most rounds.
        """

        samples = self.sample_from_D(m)
        holdout_size = m // 5
        best_k_list = []
        for round_index in range(T):
            print('{} out of T={}'.format(round_index, T))
            np.random.shuffle(samples)
            holdout_samples = samples[:holdout_size, :]
            train_samples = np.asarray(
                sorted(samples[holdout_size:, :], key=lambda row: row[0]))

            min_k = 1
            min_k_holdout_error = 1
            for k in range(1, 10):
                print('k={}'.format(k))
                hypothesis, _ = intervals.find_best_interval(
                    train_samples[:, 0], train_samples[:, 1], k)
                holdout_error = calc_holdout_error(holdout_size, hypothesis,
                                                   holdout_samples)
                if holdout_error < min_k_holdout_error:
                    min_k_holdout_error, min_k = holdout_error, k
            print('min_k {}'.format(min_k))
            best_k_list.append(min_k)

        print(best_k_list)
        # bincount position i is the number of rounds won by k == i.
        return np.argmax(np.bincount(best_k_list))
コード例 #15
0
ファイル: assignment2.py プロジェクト: shanys8/hw-ML
    def experiment_k_range_erm(self, m, k_first, k_last, step):
        """Finds the ERM hypothesis for each k in the range on one sample.

        Plots the empirical and true errors as a function of k and saves
        the figure to 'results/section_d.png'. The evaluated k values are
        np.arange(k_first, k_last + step, step).

        Input: m - an integer, the size of the data sample.
               k_first - an integer, the maximum number of intervals in the first experiment.
               k_last - an integer, the maximum number of intervals in the last experiment.
               step - an integer, the difference between the size of k in each experiment.

        Returns: The best k value (an integer) according to the ERM algorithm.
        """
        k_list = np.arange(k_first, k_last + step, step)
        samples = self.sample_from_D(m)
        empirical_error_list, true_error_list = [], []
        for k in k_list:
            print(k)
            hypothesis, _ = intervals.find_best_interval(
                samples[:, 0], samples[:, 1], k)
            empirical_error_list.append(calc_empirical_error(samples, hypothesis))
            true_error_list.append(calc_true_error(hypothesis))

        plt.plot(k_list, empirical_error_list, label="empirical errors")
        plt.plot(k_list, true_error_list, label="true errors")
        plt.legend()
        plt.savefig('results/section_d.png')

        # The ERM choice is the k with the smallest empirical error.
        lowest_position = np.argmin(empirical_error_list)
        best_k = k_list[lowest_position]
        print('best k {}'.format(best_k))
        return best_k
コード例 #16
0
    def experiment_k_range_erm(self, m, k_first, k_last, step):
        """Finds the ERM hypothesis for each k in the range on one sample.

        For every k in range(k_first, k_last + 1, step) the empirical and
        the true error of the ERM hypothesis are stored, and the k with the
        smallest empirical error is selected. Plotting is currently
        disabled (see the commented-out call at the end).

        Input: m - an integer, the size of the data sample.
               k_first - an integer, the maximum number of intervals in the first experiment.
               k_last - an integer, the maximum number of intervals in the last experiment.
               step - an integer, the difference between the size of k in each experiment.

        Returns: The best k value (an integer) according to the ERM algorithm.
        """
        samples = self.sample_from_D(m)
        k_values = range(k_first, k_last + 1, step)
        # Columns: empirical error, true error.
        results = np.zeros((len(k_values), 2))
        best_index = 0
        for index, k in enumerate(k_values):
            erm_result = intervals.find_best_interval(samples[:, 0], samples[:, 1], k)
            # float() keeps this a true division on Python 2 as well.
            results[index][0] = erm_result[1] / float(m)
            results[index][1] = self.calc_true_error(erm_result[0])
            if results[index][0] < results[best_index][0]:
                best_index = index

        # self.plotD(k_first, k_last, step, results)
        # Map the winning row back to its k; "index + 1" was only correct
        # for k_first == 1 and step == 1.
        return k_first + best_index * step
コード例 #17
0
    def cross_validation(self, m, k_first, k_last, step, T):
        """Finds a k that gives a good test error.

        In each of the T rounds a training sample (80% of m) and a holdout
        sample (20% of m) are drawn with sample_from_D. For every k in
        range(k_first, k_last + 1, step) the ERM hypothesis of the training
        sample is evaluated on each holdout point, and the mistakes are
        accumulated in E. The k with the fewest accumulated mistakes wins.

        Input: m - an integer, the size of the data sample.
               k_first - an integer, the first k that is evaluated.
               k_last - an integer, the largest k that may be evaluated.
               step - an integer, the difference between consecutive k values.
               T - an integer, the number of times the experiment is performed.

        Returns: The best k value (an integer) found by the cross validation algorithm.
        """
        # E's row count is derived from the very range that is iterated
        # below. Iterating up to k_last + step produced one row too many,
        # and an IndexError, whenever (k_last - k_first) was not a multiple
        # of step.
        k_values = range(k_first, k_last + 1, step)
        m_ho = int(0.2 * m)  # size of holdout set
        m_t = m - m_ho  # size of train data
        E = np.zeros((len(k_values), m_ho), dtype=float)

        for t in range(T):
            print("t {}".format(t))
            S_t = self.sample_from_D(m_t)
            S_ho = self.sample_from_D(m_ho)
            for i, k in enumerate(k_values):
                best_intervals, _ = find_best_interval(S_t[0, :], S_t[1, :], k)
                for j in range(m_ho):
                    y_pred = self.predict_y(best_intervals, S_ho[0, j])
                    y = S_ho[1, j]
                    # Adds 1 for a misclassified holdout point, 0 otherwise.
                    E[i, j] += (y != y_pred)

        best_k = k_first + np.argmin(np.sum(E, axis=1)) * step
        print("best k value found by cross validation algorithm {}".format(
            best_k))
        return best_k
コード例 #18
0
ファイル: imp.py プロジェクト: RazLandau/ML
    def experiment_k_range_erm(self, m, k_first, k_last, step):
        """Finds the ERM hypothesis for each k in the range on one sample.

        Plots the empirical and true errors as a function of k and saves
        the figure as 'q1d.png'.

        Input: m - an integer, the size of the data sample.
               k_first - an integer, the maximum number of intervals in the first experiment.
               k_last - an integer, the maximum number of intervals in the last experiment.
               step - an integer, the difference between the size of k in each experiment.

        Returns: The best k value (an integer) according to the ERM algorithm.
        """
        k_values = np.arange(k_first, k_last + 1, step)
        empirical_errors, true_errors = [], []
        sorted_samples = sorted(self.sample_from_D(m), key=lambda p: p[0])
        # The sample is fixed across k, so split it into xs / ys once.
        xs = [sample[0] for sample in sorted_samples]
        ys = [sample[1] for sample in sorted_samples]
        for k in k_values:
            print("Experimenting for k =", k)
            best_intervals, best_error_count = intervals.find_best_interval(
                xs, ys, k)
            empirical_errors.append(best_error_count / (m * 1.0))
            true_errors.append(self.get_true_error(best_intervals))

        plt.figure()
        plt.plot(k_values, empirical_errors, '.')
        plt.plot(k_values, true_errors, '.')
        plt.xlabel('k')
        plt.ylabel('error')
        plt.xticks(k_values)
        plt.yticks(np.arange(0, 1.05, 0.05))
        plt.grid(True)
        plt.savefig('q1d.png')

        # Return the k stored at the winning position; "index + 1" was only
        # correct for k_first == 1 and step == 1.
        best_index = empirical_errors.index(min(empirical_errors))
        return int(k_values[best_index])
コード例 #19
0
ファイル: skeleton.py プロジェクト: darc1/intro2ml
 def run_for_k(self, k, samples):
     """Run ERM with k intervals on samples and summarize the outcome.

     Input: k - the number of intervals passed to find_best_interval.
            samples - a 2-D array; column 0 is used as x, column 1 as y.

     Returns: a tuple (empirical error divided by the sample count,
         self.calc_ep of the returned intervals, k, sample count,
         the returned intervals).
     """
     erm_result = intervals.find_best_interval(samples[:, 0], samples[:, 1], k)
     best_intervals, emp_error = erm_result
     true_error = self.calc_ep(best_intervals)
     sample_count = len(samples)
     return (emp_error / sample_count, true_error, k, sample_count,
             best_intervals)
コード例 #20
0
    def cross_validation(self, m, T):
        """Finds a k that gives a good test error.

        One sample of size m is drawn and split T times into 80% training
        and 20% holdout points. For k = 1..10 the ERM hypothesis of the
        training points (sorted by x) is scored with calc_holdout_error,
        and the scores are averaged over the T splits.

        Input: m - an integer, the size of the data sample.
               T - an integer, the number of times the experiment is performed.

        Returns: The k with the smallest average holdout error, or 0 when
            no average is below 1.
        """
        p = self.sample_from_D(m)
        holdout_error_list = np.zeros(10)

        for t in range(T):
            # A constant random_state yields the identical split in every
            # round, so averaging over T rounds added nothing. Offsetting
            # the seed by the round index keeps the run reproducible (and
            # the first round unchanged) while giving each round its own
            # split.
            train_points, test_points = train_test_split(
                p, test_size=0.2, random_state=42 + t)
            train_points = sorted(train_points, key=lambda point: point[0])
            train_xs = [point[0] for point in train_points]
            train_ys = [point[1] for point in train_points]
            for k in range(1, 11):
                best_intervals, _ = find_best_interval(train_xs, train_ys, k)
                holdout_error_list[k - 1] += (
                    calc_holdout_error(test_points, best_intervals) / T)

        best = 1
        best_k = 0
        for index in range(10):
            if holdout_error_list[index] < best:
                best = holdout_error_list[index]
                best_k = index + 1  # position 0 corresponds to k = 1

        return best_k
コード例 #21
0
def part_c():
    """Plot the average empirical and true ERM error (k=2) against m.

    For every m in range(10, 101, 5) the experiment is repeated T=100
    times: a sample is drawn with sample_points_from_distribution, ERM is
    run with k=2, and the empirical error (best_error / m) and the
    calculate_true_error of the returned intervals are accumulated. The
    averages are scattered against m and saved as 'q1_part_c.png'.

    Returns: None.
    """
    T = 100
    m_range = range(10, 101, 5)
    empirical_error_average = []
    true_error_average = []
    for m in m_range:
        sum_empirical_error = 0.0
        sum_true_error = 0.0
        for _ in range(T):
            #  (i) Draw a sample of size m and run the ERM algorithm on it
            xs, ys = sample_points_from_distribution(m)
            best_intervals, best_error = find_best_interval(xs, ys, k=2)

            #  (ii) Calculate the empirical error for the returned hypothesis
            sum_empirical_error += float(best_error) / m

            #  (iii) Calculate the true error for the returned hypothesis
            sum_true_error += calculate_true_error(best_intervals)
        empirical_error_average.append(sum_empirical_error / T)
        true_error_average.append(sum_true_error / T)

    # Plot the average empirical and true errors, averaged across the T runs, as a function of m
    plt.xlabel('m')
    plt.ylabel('error')
    plt.title('Empirical error vs True error')
    fig = plt.gcf()
    # The window title is set through the figure manager: the canvas-level
    # set_window_title shortcut was deprecated and later removed from
    # Matplotlib, so calling it raises AttributeError on current releases.
    fig.canvas.manager.set_window_title('Programming Assignment: Question 1(c)')

    plt.scatter(m_range, empirical_error_average, marker='o', label='empirical error')
    plt.scatter(m_range, true_error_average, marker='+', label='true error')
    plt.legend()
    plt.savefig('q1_part_c.png')
    plt.clf()
コード例 #22
0
    def experiment_k_range_erm(self, m, k_first, k_last, step):
        """Finds the ERM hypothesis for each k in the range.

        A fresh sample of size m is drawn for every k. The true and
        empirical errors are plotted against their position in the range
        (not against k itself); the figure is not shown or saved here.

        Input: m - an integer, the size of the data sample.
               k_first - an integer, the maximum number of intervals in the first experiment.
               k_last - an integer, the maximum number of intervals in the last experiment.
               step - an integer, the difference between the size of k in each experiment.

        Returns: The best k value (an integer) according to the ERM algorithm.
        """
        true_err, emp_err = [], []

        for k in range(k_first, k_last + 1, step):
            vals = self.sample_from_D(m)
            erm_result = intervals.find_best_interval(vals[:, 0], vals[:, 1], k)
            intervals_lst, error_count = erm_result
            true_err.append(self.calcErr(intervals_lst))
            emp_err.append(error_count / m)

        plt.clf()
        plt.ylim((-0.1, 1.1))
        plt.xlabel("step")
        plt.ylabel("error")
        plt.plot(true_err, marker='o', color='blue', label='true_error')
        plt.plot(emp_err, marker='o', color='red', label='empirical_error')
        plt.legend()

        # Convert the position of the minimal empirical error back to its k.
        best_position = np.argmin(emp_err)
        return best_position * step + k_first
コード例 #23
0
def part_a():
    """Plot a sample of 100 points together with the k=2 ERM intervals.

    The sample comes from sample_points_from_distribution. Red vertical
    lines are drawn at 0.25, 0.5 and 0.75, and each interval returned by
    find_best_interval is drawn in blue at height 0.5. The figure is saved
    as 'q1_part_a.png'.

    Returns: None.
    """
    sample_size = 100
    xs, ys = sample_points_from_distribution(sample_size)
    best_intervals, _ = find_best_interval(xs, ys, k=2)

    plt.xticks([0, 0.25, 0.5, 0.75, 1])
    plt.yticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1, 1.1])
    plt.xlabel('x')
    plt.ylabel('y')
    plt.title('Sample points from distribution')
    fig = plt.gcf()
    # The window title is set through the figure manager: the canvas-level
    # set_window_title shortcut was deprecated and later removed from
    # Matplotlib, so calling it raises AttributeError on current releases.
    fig.canvas.manager.set_window_title('Programming Assignment: Question 1(a)')

    plt.scatter(xs, ys, label='data points')
    for x_tick in [0.25, 0.5, 0.75]:
        plt.plot([x_tick, x_tick], [-0.1, 1.1], 'r')
    for i, interval in enumerate(best_intervals):
        # Only the first segment carries the label, so the legend shows a
        # single 'model' entry.
        if i == 0:
            plt.plot(interval, [0.5, 0.5], 'b', label='model')
        else:
            plt.plot(interval, [0.5, 0.5], 'b')

    plt.legend(loc=7)
    plt.savefig('q1_part_a.png')
    plt.clf()
コード例 #24
0
def part_e():
    """Plot train, test and true error of the ERM hypothesis for k = 1..20.

    One train and one test sample of size 50 are drawn with
    sample_points_from_distribution. For each k the ERM hypothesis of the
    train sample is scored on the train sample (best_error / m), with
    calculate_true_error, and on the test sample
    (calculate_empirical_error / m). The figure is saved as
    'q1_part_e.png'.

    Returns: None.
    """
    m = 50
    k_range = range(1, 21)
    empirical_error_train = []
    empirical_error_test = []
    true_error = []
    xs_test, ys_test = sample_points_from_distribution(m)
    xs_train, ys_train = sample_points_from_distribution(m)

    # Find the best ERM hypothesis for k=1,2,...,20
    for k in k_range:
        best_intervals, best_error = find_best_interval(xs_train, ys_train, k=k)
        empirical_error_train.append(float(best_error) / m)
        true_error.append(float(calculate_true_error(best_intervals)))
        empirical_error_test.append(
            float(calculate_empirical_error(best_intervals, xs_test, ys_test)) / m)

    # plot the empirical and true errors as a function of k.
    plt.xlabel('k')
    plt.ylabel('errors')
    plt.title('Empirical error vs True error')
    fig = plt.gcf()
    # The window title is set through the figure manager: the canvas-level
    # set_window_title shortcut was deprecated and later removed from
    # Matplotlib, so calling it raises AttributeError on current releases.
    fig.canvas.manager.set_window_title('Programming Assignment: Question 1(e)')

    plt.scatter(k_range, true_error, marker='+', label='true error')
    plt.scatter(k_range, empirical_error_train, marker='o', label='empirical error for train')
    plt.scatter(k_range, empirical_error_test, marker='x', label='empirical error for test')
    plt.legend()
    plt.savefig('q1_part_e.png')
    plt.clf()
コード例 #25
0
    def draw_sample_intervals(self, m, k):
        """
        Plots the data as asked in (a) i ii and iii.

        The sample S is indexed as S[0, :] for x and S[1, :] for y. Each
        interval returned by find_best_interval is drawn as a blue bar at
        height 0.8, the points as red dots, and vertical lines are added at
        0.2, 0.4, 0.6 and 0.8. The figure is saved as
        'sampled_intervals.png' and closed.

        Input: m - an integer, the size of the data sample.
               k - an integer, the maximum number of intervals.

        Returns: None.
        """
        S = self.sample_from_D(m)

        best_intervals, _ = find_best_interval(S[0, :], S[1, :], k)
        for segment in best_intervals:
            # horizontal bar spanning the interval
            plt.hlines(0.8, segment[0], segment[1], 'b', lw=3)

        plt.plot(S[0, :], S[1, :], 'ro')

        title = 'sampled_intervals'
        for boundary in (0.2, 0.4, 0.6, 0.8):
            plt.axvline(x=boundary)  # vertical guide line
        plt.axis([0, 1, -0.1, 1.1])
        plt.title(title)
        plt.savefig(title + '.png')
        plt.close()
        return
コード例 #26
0
def measure_empirical_error(x, y, k):
    """Return the ERM empirical error rate on the sample (x, y).

    The sample is reordered by ascending x before being handed to
    find_best_interval; the error it reports is divided by the sample size.
    """
    order = numpy.argsort(x)
    sorted_x, sorted_y = x[order], y[order]
    _, mistakes = find_best_interval(sorted_x, sorted_y, k)
    return float(mistakes) / float(len(sorted_x))
コード例 #27
0
ファイル: ERM.py プロジェクト: yoavshosh/ML_mini_projects
    def experiment_k_range_erm(self, m, k_first, k_last, step):
        """Runs ERM for k = k_first, k_first + step, ..., k_last.

        A single sample of size m is drawn. For every k the best hypothesis
        with at most k intervals is fitted to it, and its empirical and true
        errors are printed and plotted as a function of k. The plot is saved
        as 'Errors_wrt_K.pdf' in the current working directory.

        Input: m - an integer, the size of the data sample.
               k_first - an integer, the maximum number of intervals in the first experiment.
               k_last - an integer, the maximum number of intervals in the last experiment.
               step - an integer, the difference between the size of k in each experiment.

        Returns: The k value whose fitted hypothesis has the smallest TRUE
                 error (ties are resolved in favour of the smaller k).
                 NOTE(review): this is not the k with the smallest empirical
                 error - confirm which one callers expect.
        Raises: ValueError if the requested range of k values is empty.
        """

        print('\nRunning experiment_k_range_erm\n')

        k_arr = np.arange(k_first, k_last + step, step)
        if len(k_arr) == 0:
            # Fail early with a clear message instead of deep inside max().
            raise ValueError('empty range of k values')

        points = self.sample_from_D(m)
        xs, ys = points[:, 0], points[:, 1]

        true_errs_array = []
        sample_errs_array = []
        for k in k_arr:
            print(f'k={k}')
            best_intervals, besterr = intervals.find_best_interval(xs, ys, k)
            empirical_err = besterr / float(m)
            true_err = self.calc_true_error(best_intervals)
            sample_errs_array.append(empirical_err)
            true_errs_array.append(true_err)
            print(
                f'true_err = {round(true_err,2)}, Empirical_err = {round(empirical_err,2)}'
            )

        # Smallest (true error, k) pair; a full sort is not needed for this.
        min_true_err, best_k = min(zip(true_errs_array, k_arr))

        fig, ax = plt.subplots()
        try:
            ax.set_xlabel('K (number of intervals)')
            ax.set_ylabel('Error')
            ax.set_title('Errors wrt K')

            ax.plot(k_arr, true_errs_array, c='blue', label='True Err')
            ax.scatter(best_k,
                       min_true_err,
                       c='green',
                       label='Minimal True Err',
                       marker='D')
            ax.plot(k_arr, sample_errs_array, c='red', label='Empirical Err')
            ax.set_ylim(0, max(true_errs_array + sample_errs_array) + 0.05)
            ax.set_xlim(k_first - 1, k_last + 1)
            ax.legend()

            fig.savefig(os.path.join(os.getcwd(), 'Errors_wrt_K.pdf'))
        finally:
            # pyplot keeps every figure alive until it is closed explicitly;
            # release it so repeated calls do not accumulate figures.
            plt.close(fig)
        print('\nErrors_wrt_K.pdf saved to CWD\n')

        return best_k
コード例 #28
0
    def experiment_m_range_erm(self, m_first, m_last, step, k, T):
        """Runs the ERM algorithm.
        Calculates the empirical error and the true error.
        Plots the average empirical and true errors and saves the plot as
        'm_range_erm.png'.
        Input: m_first - an integer, the smallest size of the data sample in the range.
               m_last - an integer, the largest size of the data sample in the range.
               step - an integer, the difference between the size of m in each loop.
               k - an integer, the maximum number of intervals.
               T - an integer, the number of times the experiment is performed.
        Returns: np.ndarray of shape (n_steps,2).
            A two dimensional array that contains the average empirical error
            and the average true error for each m in the range accordingly.
            The sample sizes used are m_first, m_first + step, ... and never
            exceed m_last.
        """
        # One definition of the sample sizes drives the result array, the
        # experiment loop and the plot, so their lengths always agree (also
        # when m_last - m_first is not a multiple of step).
        sample_sizes = range(m_first, m_last + 1, step)
        n_steps = len(sample_sizes)

        E = np.zeros((n_steps, 2), dtype=float)

        for t in range(T):
            print("t {}".format(t))
            for i, m in enumerate(sample_sizes):
                S = self.sample_from_D(m)
                intervals, besterror = find_best_interval(S[0, :], S[1, :], k)
                # float() keeps this a true division on Python 2 as well.
                E[i, 0] += float(besterror) / m
                E[i, 1] += self.calc_true_error(intervals)

        # Turn the accumulated sums into averages over the T repetitions.
        E /= T

        m_vals = np.asarray(sample_sizes)

        plt.plot(m_vals, E[:, 0], 'r-', m_vals, E[:, 1], 'b-')
        plt.axis([m_first, m_last, 0, E.max()])
        # Hand-made legend, positioned in axes coordinates.
        plt.text(0.5,
                 0.9,
                 'red = Es',
                 transform=plt.gca().transAxes,
                 ha='center')
        plt.text(0.5,
                 0.85,
                 'blue = Ep',
                 transform=plt.gca().transAxes,
                 ha='center')
        title = 'm_range_erm'
        plt.xlabel('samples (m)')
        plt.ylabel('Error')
        plt.title(title)
        plt.savefig(title + '.png')
        plt.close()

        return E
コード例 #29
0
ファイル: t.py プロジェクト: aranc/imlhw
def plot_2a():
    """Plot 100 sampled points together with the best 2-interval hypothesis.

    Dashed vertical guide lines are drawn at 0.25, 0.5 and 0.75. The sample
    is sorted by x before find_best_interval is called; every returned
    interval is printed and drawn as a thick horizontal bar at height 0.5.
    The figure is then shown.
    """
    for boundary in (0.25, 0.5, 0.75):
        plt.plot([boundary, boundary], [-.1, 1.1], 'k--')
    x, y = draw_samples(100)
    plt.plot(x, y, 'k.')
    plt.ylim([-.1, 1.1])
    idx = numpy.argsort(x)
    best_intervals, _ = find_best_interval(x[idx], y[idx], k=2)
    for interval in best_intervals:
        # Parenthesised form is valid on Python 3 and prints the same single
        # value on Python 2.
        print(interval)
        plt.plot(interval, [0.5, 0.5], 'k', linewidth=10)
    plt.show()
コード例 #30
0
ファイル: t.py プロジェクト: aranc/imlhw
def measure_intervals(m=50, k=2):
    """Draw m samples, run ERM with at most k intervals, report both errors.

    Returns a pair (true_error, empirical_error): the value of
    calculate_true_error for the fitted intervals, and the error reported by
    find_best_interval divided by m.
    """
    xs, ys = draw_samples(m)
    # The sample is sorted by x before it is handed to find_best_interval.
    order = numpy.argsort(xs)
    best_intervals, mistakes = find_best_interval(xs[order], ys[order], k)
    empirical = float(mistakes) / float(m)
    return calculate_true_error(best_intervals), empirical