Example #1
def retrain_in_f_with_grid(name, label_p, label_n, oracle, n_features, ftype,
                           test_x, test_y, benchmark):
    print('--------------- retrain in F with grid -----------------')
    for n_pts in range(50, 601, 50):

        online = OnlineBase(name,
                            label_p,
                            label_n,
                            oracle,
                            n_features,
                            ftype,
                            error=.1)
        online.collect_pts(n_pts, -1)
        ex = RBFKernelRetraining(
            name,
            online.get_QSV(),
            online.get_QSV_labels(),  # training data
            online.get_QSV(),
            online.get_QSV_labels(),  # validation data
            test_x,
            test_y,  # test data
            n_features)

        print('nQSV=%d, Q=%d, dim=100,' % (
            n_pts, online.get_n_query()), ex.grid_retrain_in_f(100))
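Example #1 depends on the project's OnlineBase and RBFKernelRetraining helpers, which are not shown here. As a rough, self-contained sketch of the same idea - query a black-box classifier, then fit a substitute RBF SVM on the queried points for a growing budget - here is a scikit-learn-only version. The synthetic oracle and the uniform sampling are placeholders for OnlineBase's boundary search, not the original implementation.

import numpy as np
from sklearn import svm
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=2000, n_features=10, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
oracle = svm.SVC(kernel='rbf', gamma='scale').fit(X_train, y_train)  # the "black box"

for n_pts in range(50, 601, 50):
    # "Query" the oracle on uniform random points (a stand-in for
    # OnlineBase.collect_pts, which searches near the decision boundary).
    q_x = np.random.uniform(X.min(), X.max(), (n_pts, X.shape[1]))
    q_y = oracle.predict(q_x)
    if len(np.unique(q_y)) < 2:
        continue  # need both classes before a substitute can be fit
    substitute = svm.SVC(kernel='rbf', gamma='scale').fit(q_x, q_y)
    agreement = accuracy_score(oracle.predict(X_test), substitute.predict(X_test))
    print('n_pts=%d, agreement=%.3f' % (n_pts, agreement))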
Example #2
def CAL_v(name, label_p, label_n, oracle, n_features, ftype, test_x, test_y):
    online = OnlineBase(name, label_p, label_n, oracle, n_features, ftype, error=.5)
    x, y = online.collect_pts(100, -1)
    i = 0
    q = online.get_n_query()
    C_range = np.logspace(-2, 5, 10, base=10)
    gamma_range = np.logspace(-5, 1, 10, base=10)
    param_grid = dict(gamma=gamma_range, C=C_range)
    while q < 3500:
        i += 1
        # h_ = ex.fit(x, y)

        cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
        grid = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=cv, verbose=0, n_jobs=-1)
        grid.fit(x, y)
        h_ = grid.best_estimator_

        online_ = OnlineBase('', label_p, label_n, h_.predict, n_features, ftype, error=.1)
        x_, _ = online_.collect_pts(10, 200)
        if x_ is not None and len(x_) > 0:
            x.extend(x_)
            y.extend(oracle(x_))
        q += online_.get_n_query()
        pred_y = h_.predict(test_x)
        print(len(x), q, sm.accuracy_score(test_y, pred_y))
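CAL_v above is an iterative refinement loop: collect points near the current substitute's decision boundary, label them with the true oracle, and refit. The sketch below approximates this with scikit-learn alone - OnlineBase's boundary search is replaced by "take the pool points with the smallest decision margin", and the data, pool, and budgets are illustrative assumptions, not the original setup.

import numpy as np
from sklearn import svm
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score

X, y = make_classification(n_samples=3000, n_features=8, random_state=1)
oracle = svm.SVC(kernel='rbf', gamma='scale').fit(X[:2000], y[:2000])
test_x, test_y = X[2000:], y[2000:]

rng = np.random.default_rng(0)
pool = rng.uniform(X.min(), X.max(), (5000, X.shape[1]))  # unlabeled pool
x_lab = pool[:100]
y_lab = oracle.predict(x_lab)  # initial oracle queries

for _ in range(10):
    h = svm.SVC(kernel='rbf', gamma='scale').fit(x_lab, y_lab)
    # pick the pool points nearest the substitute's boundary
    # (a placeholder for OnlineBase.collect_pts on h.predict)
    margins = np.abs(h.decision_function(pool))
    new_x = pool[np.argsort(margins)[:10]]
    x_lab = np.vstack([x_lab, new_x])
    y_lab = np.append(y_lab, oracle.predict(new_x))  # label with the oracle
    print(len(x_lab), accuracy_score(test_y, h.predict(test_x)))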
Example #3
    print('BASELINE:    %f' % baseline)

    def polynomial_map(x):
        # explicit feature map for the degree-2 polynomial kernel (gamma * u.v + c)^2
        # assume gamma=1, c=0
        n = len(x)
        r = []

        r.extend([x[i] * x[i] for i in range(n - 1, -1, -1)])
        for i in range(n - 1, -1, -1):
            for j in range(i - 1, -1, -1):
                r.append(sqrt(2) * x[i] * x[j])
        return r

    print('solve in F')
    online = OnlineBase(train_data, p, n, poly_svc.predict, n_features, f_type,
                        1e-5)
    online.collect_pts(-1, budget=5000)

    ex = PolySolver(online.get_QSV(), online.get_QSV_labels(), Xt, Yt,
                    polynomial_map, n_features)
    ex.solve_in_f()
    print('TRAIN SCORE  : %f' % ex.solve_score)
    print('TEST SCORE   : %f' % ex.calc_test_score())

    # print 'retrain in F'
    # ex = RBFKernelRetraining(train_data,
    #                   poly_svc.predict, Xt, Yt,
    #                          n_features, OfflineMethods.RT_in_F, error=1,
    #              kernel='poly', fmap=polynomial_map)
    # ex.train_SGD_for_poly_in_F()
    # ex.benchmark()
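polynomial_map above is the explicit feature map for the degree-2 polynomial kernel with gamma=1 and c=0: the squared coordinates plus sqrt(2)-scaled cross terms, so that dot(map(u), map(v)) = (u.v)^2. A quick numpy sanity check of that identity (self-contained, with the map copied from above):

import numpy as np
from math import sqrt

def polynomial_map(x):
    # explicit feature map for k(u, v) = (u.v)^2
    n = len(x)
    r = [x[i] * x[i] for i in range(n - 1, -1, -1)]
    for i in range(n - 1, -1, -1):
        for j in range(i - 1, -1, -1):
            r.append(sqrt(2) * x[i] * x[j])
    return r

u, v = np.random.randn(5), np.random.randn(5)
lhs = np.dot(polynomial_map(u), polynomial_map(v))
rhs = np.dot(u, v) ** 2
assert np.isclose(lhs, rhs)  # the explicit map reproduces the kernel
print(lhs, rhs)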
Example #4
def CAL(name, label_p, label_n, oracle, n_features, ftype, test_x, test_y):
    online = OnlineBase(name, label_p, label_n, oracle, n_features, ftype, error=.5)

    q = online.get_n_query()
    C_range = np.logspace(-2, 5, 10, base=10)
    gamma_range = np.logspace(-5, 1, 10, base=10)
    param_grid = dict(gamma=gamma_range, C=C_range)

    x, y = online.collect_pts(100, -1)

    i = 0

    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
    grid = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=cv, verbose=0, n_jobs=-1)
    grid.fit(x, y)
    h_ = grid.best_estimator_
    while q < 3500:
        i += 1
        # h_ = ex.fit(x, y)

        online_ = OnlineBase('', label_p, label_n, h_.predict, n_features, ftype, error=.1)
        x_ = online_.collect_one_pair()
        if x_ is not None and len(x_) > 0:
            for _x in x_:
                x.append(_x)
                y.append(1)
                cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
                grid = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=cv, verbose=0, n_jobs=-1)
                grid.fit(x, y)
                h1 = grid.best_estimator_
                s1 = sm.accuracy_score(y, h1.predict(x))

                y[-1] = -1
                cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
                grid = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=cv, verbose=0, n_jobs=-1)
                grid.fit(x, y)
                h2 = grid.best_estimator_
                s2 = sm.accuracy_score(y, h2.predict(x))
                if s1 >= .99 and s2 >= .99:
                    print('branch 1')
                    # both labelings fit: the point is ambiguous, so query
                    # the oracle for _x itself (not the whole pair x_)
                    y[-1] = oracle([_x])[0]
                elif s1 >= .99 and s2 < .99:
                    print('branch 2')
                    y[-1] = 1
                elif s1 < .99 and s2 >= .99:
                    print('branch 3')
                    y[-1] = -1
                else:
                    print('branch 4: ', s1, s2)
                    del x[-1]
                    del y[-1]
                    continue

            if y[-1] == 1:
                h_ = h1
            else:
                h_ = h2

        q += online_.get_n_query()
        pred_y = h_.predict(test_x)
        print(q, sm.accuracy_score(test_y, pred_y))
Example #5
def CAL(name, label_p, label_n, oracle, n_features, ftype, test_x, test_y):
    """
    Learn with adaptive learning the oracle, using an SVM
     with RBF kernel,
     prints the accuracy as function of amount of queries to
     the LOCAL MODEL (weird function).
    :param name:
    :param label_p:
    :param label_n:
    :param oracle:
    :param n_features:
    :param ftype:
    :param test_x:
    :param test_y:
    :return:
    """
    online = OnlineBase(name,
                        label_p,
                        label_n,
                        oracle,
                        n_features,
                        ftype,
                        error=.5)
    # This is weird - the count should be zero here.
    q = online.get_n_query()
    C_range = np.logspace(-2, 5, 10, base=10)
    gamma_range = np.logspace(-5, 1, 10, base=10)
    param_grid = dict(gamma=gamma_range, C=C_range)

    x, y = online.collect_pts(100, -1)

    i = 0

    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
    grid = GridSearchCV(svm.SVC(),
                        param_grid=param_grid,
                        cv=cv,
                        verbose=0,
                        n_jobs=-1)
    grid.fit(x, y)
    h_ = grid.best_estimator_
    while q < 3500:
        i += 1
        # h_ = ex.fit(x, y)
        # This is not really an online model - we set oracle=h_.predict.
        local_model = OnlineBase('',
                                 label_p,
                                 label_n,
                                 h_.predict,
                                 n_features,
                                 ftype,
                                 error=.1)
        x_ = local_model.collect_one_pair()
        if x_ is not None and len(x_) > 0:
            for _x in x_:
                x.append(_x)
                y.append(1)
                cv = StratifiedShuffleSplit(n_splits=5,
                                            test_size=0.2,
                                            random_state=42)
                grid = GridSearchCV(svm.SVC(),
                                    param_grid=param_grid,
                                    cv=cv,
                                    verbose=0,
                                    n_jobs=-1)
                grid.fit(x, y)
                h1 = grid.best_estimator_
                s1 = sm.accuracy_score(y, h1.predict(x))

                y[-1] = -1
                cv = StratifiedShuffleSplit(n_splits=5,
                                            test_size=0.2,
                                            random_state=42)
                grid = GridSearchCV(svm.SVC(),
                                    param_grid=param_grid,
                                    cv=cv,
                                    verbose=0,
                                    n_jobs=-1)
                grid.fit(x, y)
                h2 = grid.best_estimator_
                s2 = sm.accuracy_score(y, h2.predict(x))
                # Implicitly assume that the local model can reach over
                # 99% accuracy on the training set.
                # Check whether there is a reason to query the oracle about x_:
                #   * If, under a specific label, the model's accuracy on
                #     the points found so far would drop below 99%, querying
                #     the oracle is pointless - we can already guess that
                #     this label is wrong.
                #   * Otherwise we are uncertain about oracle(x_), so we
                #     query the oracle.
                # Very weird - the point is added as a training point even
                # when we merely guessed oracle(x_).
                # Note: I expect that most of the time only the first "if"
                # will actually run, because the points are very close to
                # each other.
                if s1 >= .99 and s2 >= .99:
                    print('branch 1')
                    y[-1] = oracle([_x])[0]  # query the oracle for _x itself
                elif s1 >= .99 and s2 < .99:
                    print('branch 2')
                    y[-1] = 1
                elif s1 < .99 and s2 >= .99:
                    print('branch 3')
                    y[-1] = -1
                else:
                    print('branch 4: ', s1, s2)
                    del x[-1]
                    del y[-1]
                    continue

            if y[-1] == 1:
                h_ = h1
            else:
                h_ = h2
        # This is weird - why do we count the queries to the local_model?
        # I think we should count the queries to the oracle!
        q += local_model.get_n_query()
        pred_y = h_.predict(test_x)
        print(q, sm.accuracy_score(test_y, pred_y))
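The branch logic above is the heart of the CAL-style query decision: fit one hypothesis with the candidate labeled +1 (h1) and one with it labeled -1 (h2); if both fit the history almost perfectly, the candidate is genuinely ambiguous and worth an oracle query, while if only one fits, its label can be inferred for free. Below is a compressed, self-contained sketch of just that decision - a plain SVC stands in for the grid search, the 0.99 threshold is taken from the original, and y_hist is assumed to already contain both classes.

import numpy as np
from sklearn import svm
from sklearn.metrics import accuracy_score

def query_decision(x_hist, y_hist, x_new, threshold=0.99):
    """Return 'query', +1, -1, or 'drop' for a candidate point x_new."""
    scores = {}
    for lab in (+1, -1):
        X = np.vstack([x_hist, x_new])
        y = np.append(y_hist, lab)
        h = svm.SVC(kernel='rbf', gamma='scale').fit(X, y)
        scores[lab] = accuracy_score(y, h.predict(X))
    if scores[+1] >= threshold and scores[-1] >= threshold:
        return 'query'  # both labelings are consistent: ask the oracle
    if scores[+1] >= threshold:
        return +1       # only +1 is consistent: infer the label for free
    if scores[-1] >= threshold:
        return -1
    return 'drop'       # neither labeling fits: discard the candidate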
Example #6
def CAL_v(name, label_p, label_n, oracle, n_features, ftype, test_x, test_y):
    """
    Prints the test accuracy of an RBF-kernel SVM predictor
    for a varying number of "points near the boundary"
    [the boundary of the oracle].

    :param name:
    :param label_p:
    :param label_n:
    :param oracle:
    :param n_features:
    :param ftype:
    :param test_x:
    :param test_y:
    :return:
    """
    online = OnlineBase(name,
                        label_p,
                        label_n,
                        oracle,
                        n_features,
                        ftype,
                        error=.5)
    x, y = online.collect_pts(100, -1)
    i = 0
    q = online.get_n_query()

    C_range = np.logspace(-2, 5, 10, base=10)
    gamma_range = np.logspace(-5, 1, 10, base=10)
    param_grid = dict(gamma=gamma_range, C=C_range)

    while q < 3500:
        i += 1
        # h_ = ex.fit(x, y)

        cv = StratifiedShuffleSplit(n_splits=5,
                                    test_size=0.2,
                                    random_state=42)
        grid = GridSearchCV(svm.SVC(),
                            param_grid=param_grid,
                            cv=cv,
                            verbose=0,
                            n_jobs=-1)
        grid.fit(x, y)
        h_ = grid.best_estimator_

        online_ = OnlineBase('',
                             label_p,
                             label_n,
                             h_.predict,
                             n_features,
                             ftype,
                             error=.1)
        x_, _ = online_.collect_pts(10, 200)
        if x_ is not None and len(x_) > 0:
            x.extend(x_)
            y.extend(oracle(x_))
        q += online_.get_n_query()
        pred_y = h_.predict(test_x)

        print "total amount of ", len(x), q, sm.accuracy_score(test_y, pred_y)
Example #7
def main():
    X1, Y1 = make_circles(n_samples=800, noise=0.07,
                          factor=0.4)  # defined in sklearn.datasets
    # generates a data set X1 and labels Y1 with data from two circles, an inner
    # circle and an outer circle. The labels in Y1 are 0 or 1, indicating the inner
    # or outer circle. n_samples is the number of data points, noise is the noise on
    # the data, factor is the ratio of the inner circle's radius to the outer's
    frac0 = len(np.where(Y1 == 0)[0]) / float(
        len(Y1))  # the fraction of points in the inner circle
    frac1 = len(np.where(Y1 == 1)[0]) / float(
        len(Y1))  # the fraction of points in the outer circle

    print("Percentage of '0' labels:", frac0)
    print("Percentage of '1' labels:", frac1)

    plt.figure()
    plt.subplot(121)
    plt.title(
        "Our Dataset: N=800, '0': {0} '1': {1} ".format(
            frac0, frac1),  # str.format interpolates the fractions into the title
        fontsize="large")

    plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1)
    plt.xlim((-2, 2))
    plt.ylim((-2, 2))

    clf = svm.SVC()  # creates a support vector classification object.
    clf.fit(X1, Y1)  # fits the SVC to the data given

    print(accuracy_score(Y1, clf.predict(
        X1)))  # prints the accuracy of the model on the training data

    ex = OnlineBase('circle', 1, 0, clf.predict, 2, 'uniform', .1)
    step = 6
    train_x, train_y = [], []
    val_x, val_y = [], []
    while True:
        ex.collect_pts(
            step)  # collects `step` points around the decision boundary of ex
        train_x.extend(ex.pts_near_b)  # empty on the first iteration
        train_y.extend(ex.pts_near_b_labels)  # empty on the first iteration
        #val_x.extend(ex.support_pts)
        #val_y.extend(ex.support_labels)
        try:
            e = RBFKernelRetraining(
                'circle', [train_x, train_y], [train_x, train_y], n_features=2
            )  # creates a new object every time? is this the smartest way to retrain?
            print(
                ex.get_n_query(), e.grid_retrain_in_x()
            )  # TODO I do not get how ex and e are connected; it seems that
            # grid_retrain_in_x() does indeed retrain the model, but are any
            # points ever added to pts_near_b?
        except KeyboardInterrupt:  ## TODO stop condition!! (see the sketch after this example)
            print('Done')
            break

    train_x = np.array(train_x)
    plt.subplot(122)
    plt.scatter(train_x[:, 0], train_x[:, 1], c=train_y)
    plt.xlim((-2, 2))
    plt.ylim((-2, 2))
    plt.show()
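The KeyboardInterrupt "stop condition" above never fires on its own. One simple replacement, using only the OnlineBase and RBFKernelRetraining calls already shown in this example, is to cap the number of queries; QUERY_BUDGET below is an arbitrary illustrative choice, not part of the original code.

QUERY_BUDGET = 2000
while ex.get_n_query() < QUERY_BUDGET:
    ex.collect_pts(step)
    train_x.extend(ex.pts_near_b)
    train_y.extend(ex.pts_near_b_labels)
    e = RBFKernelRetraining('circle', [train_x, train_y],
                            [train_x, train_y], n_features=2)
    print(ex.get_n_query(), e.grid_retrain_in_x())
print('Done: query budget exhausted')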
Example #8
    def do(self):
        # get some initial points
        self.ex.collect_up_to_budget(self.budget_per_round * 2)
        x, y = self.ex.pts_near_b, self.ex.pts_near_b_labels

        if len(np.unique(y)) < 2:
            return 1, 1  # degenerate: only one class was collected

        # gamma_range = np.logspace(-5, 1, 10, base=10)
        # param_grid = dict(gamma=gamma_range)

        # Grid search is disabled; fall back to a plain high-C SVC.
        # cv = StratifiedShuffleSplit(y, n_iter=5, test_size=.2)
        # grid = GridSearchCV(svm.SVC(C=1e5), param_grid=param_grid, cv=cv, n_jobs=-1)
        # grid.fit(x, y)
        # h_best = grid.best_estimator_
        h_best = svm.SVC(C=1e5)
        h_best.fit(x, y)

        for i in range(1, self.n_rounds - 1):
            online_ = OnlineBase('',
                                 +1,
                                 self.NEG,
                                 h_best.predict,
                                 self.n_features,
                                 'uniform',
                                 error=.1)
            x_, _ = online_.collect_pts(self.budget_per_round,
                                        50000)  # budget doesn't matter

            xx_ = None
            n_got = 0 if x_ is None else len(x_)
            if n_got < self.budget_per_round:
                print('Ran out of budget when getting x_')
                xx_ = np.random.uniform(
                    -1, 1, (self.budget_per_round - n_got, self.n_features))

            if x_ is not None and len(x_) > 0:
                x.extend(x_)
                y.extend(self.oracle(x_))

            if xx_ is not None:
                x.extend(xx_)
                y.extend(self.oracle(xx_))

            # Grid search is disabled here as well; refit the plain SVC.
            # cv = StratifiedShuffleSplit(y, n_iter=5, test_size=.2)
            # grid = GridSearchCV(svm.SVC(C=1e5), param_grid=param_grid, cv=cv, n_jobs=-1)
            # grid.fit(x, y)
            # h_best = grid.best_estimator_
            h_best = svm.SVC(C=1e5)
            h_best.fit(x, y)

            # h_best.fit(x, y)

        self.set_clf2(h_best)
        return self.benchmark()  # (ex.batch_predict, h_.predict, test_x, n_features)