def retrain_in_f_with_grid(name, label_p, label_n, oracle, n_features,
                           ftype, test_x, test_y, benchmark):
    # Sweep the number of query points collected near the oracle's decision
    # boundary and retrain an RBF-kernel model in feature space each time.
    # (`benchmark` is accepted but unused here.)
    print '--------------- retrain in F with grid -----------------'
    for n_pts in xrange(50, 601, 50):
        online = OnlineBase(name, label_p, label_n, oracle, n_features,
                            ftype, error=.1)
        online.collect_pts(n_pts, -1)
        ex = RBFKernelRetraining(
            name,
            online.get_QSV(), online.get_QSV_labels(),  # training data
            online.get_QSV(), online.get_QSV_labels(),  # validation data
            test_x, test_y,                             # test data
            n_features)
        print 'nQSV=%d, Q=%d, dim=100,' % (n_pts, online.get_n_query()), \
            ex.grid_retrain_in_f(100)
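# Usage sketch (my addition; assumed wiring that mirrors the make_circles
# experiment later in this file - `clf` is a trained scikit-learn classifier
# acting as the oracle, and `benchmark` is unused by the function):
# retrain_in_f_with_grid('circle', 1, 0, clf.predict, n_features=2,
#                        ftype='uniform', test_x=test_x, test_y=test_y,
#                        benchmark=None)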
print 'BASELINE: %f' % baseline


def polynomial_map(x):
    # explicit feature map for the degree-2 polynomial kernel
    # (gamma * <u, v> + c)^2, assuming gamma = 1 and c = 0
    n = len(x)
    r = []
    r.extend([x[i] * x[i] for i in range(n - 1, -1, -1)])
    for i in range(n - 1, -1, -1):
        for j in range(i - 1, -1, -1):
            r.append(sqrt(2) * x[i] * x[j])
    return r


print 'solve in F'
online = OnlineBase(train_data, p, n, poly_svc.predict, n_features, f_type, 1e-5)
online.collect_pts(-1, budget=5000)
ex = PolySolver(online.get_QSV(), online.get_QSV_labels(), Xt, Yt,
                polynomial_map, n_features)
ex.solve_in_f()
print 'TRAIN SCORE : %f' % ex.solve_score
print 'TEST SCORE : %f' % ex.calc_test_score()

# print 'retrain in F'
# ex = RBFKernelRetraining(train_data,
#                          poly_svc.predict, Xt, Yt,
#                          n_features, OfflineMethods.RT_in_F, error=1,
#                          kernel='poly', fmap=polynomial_map)
# ex.train_SGD_for_poly_in_F()
# ex.benchmark()
# ex.print_perf()
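# Sanity check (a minimal sketch, my addition - not part of the original
# pipeline): with gamma = 1 and c = 0, the explicit map above should
# reproduce the degree-2 polynomial kernel, i.e.
# <polynomial_map(u), polynomial_map(v)> == (u . v)^2.
def _check_polynomial_map():
    u = np.random.uniform(-1, 1, 5)
    v = np.random.uniform(-1, 1, 5)
    lhs = np.dot(polynomial_map(u), polynomial_map(v))
    rhs = np.dot(u, v) ** 2
    assert abs(lhs - rhs) < 1e-9, (lhs, rhs)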
def CAL(name, label_p, label_n, oracle, n_features, ftype, test_x, test_y):
    """
    Learn the oracle with active learning (CAL-style), using an SVM with an
    RBF kernel. Prints the test accuracy as a function of the number of
    queries to the LOCAL MODEL (which is odd; see the note at the bottom).

    :param name:
    :param label_p:
    :param label_n:
    :param oracle:
    :param n_features:
    :param ftype:
    :param test_x:
    :param test_y:
    :return:
    """
    online = OnlineBase(name, label_p, label_n, oracle, n_features, ftype,
                        error=.5)

    # This is weird - the count should be zero here.
    q = online.get_n_query()

    C_range = np.logspace(-2, 5, 10, base=10)
    gamma_range = np.logspace(-5, 1, 10, base=10)
    param_grid = dict(gamma=gamma_range, C=C_range)

    x, y = online.collect_pts(100, -1)
    i = 0

    cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2, random_state=42)
    grid = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=cv,
                        verbose=0, n_jobs=-1)
    grid.fit(x, y)
    h_ = grid.best_estimator_

    while q < 3500:
        i += 1
        # This is not really an online model - we set oracle=h_.predict.
        local_model = OnlineBase('', label_p, label_n, h_.predict,
                                 n_features, ftype, error=.1)
        x_ = local_model.collect_one_pair()
        if x_ is not None and len(x_) > 0:
            for _x in x_:
                x.append(_x)
                y.append(1)

            cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2,
                                        random_state=42)
            grid = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=cv,
                                verbose=0, n_jobs=-1)
            grid.fit(x, y)
            h1 = grid.best_estimator_
            s1 = sm.accuracy_score(y, h1.predict(x))

            y[-1] = -1
            cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2,
                                        random_state=42)
            grid = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=cv,
                                verbose=0, n_jobs=-1)
            grid.fit(x, y)
            h2 = grid.best_estimator_
            s2 = sm.accuracy_score(y, h2.predict(x))

            # Assume implicitly that the local model can reach over 99%
            # accuracy on the training set. Check whether there is a reason
            # to query the oracle about x_:
            # * If, for one of the two candidate labels, the model's accuracy
            #   on the points found so far degrades below 99%, it is useless
            #   to query the oracle, because we can already guess that this
            #   label is wrong.
            # * Otherwise, we are not certain about oracle(x_) - so we query
            #   the oracle.
            # Very weird - the point is added as a training point anyway,
            # even when we only guess oracle(x_).
            # Notice: I expect that most of the time only the first "if" will
            # actually fire, because the points are really close to each
            # other.
            if s1 >= .99 and s2 >= .99:
                print 'branch 1'
                y[-1] = oracle(x_)[0]
            elif s1 >= .99 and s2 < .99:
                print 'branch 2'
                y[-1] = 1
            elif s1 < .99 and s2 >= .99:
                print 'branch 3'
                y[-1] = -1
            else:
                print 'branch 4: ', s1, s2
                del x[-1]
                del y[-1]
                continue

            if y[-1] == 1:
                h_ = h1
            else:
                h_ = h2

        # This is weird - why do we count the queries of the local_model?
        # I think we should count the queries to the oracle!
        q += local_model.get_n_query()

        pred_y = h_.predict(test_x)
        print q, sm.accuracy_score(test_y, pred_y)
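# A minimal sketch (my addition, not from the original code) of counting
# oracle queries directly, addressing the note above: wrap the oracle's
# predict function in a counter and read q from the wrapper instead of
# local_model.get_n_query(). Any calls OnlineBase makes to the oracle would
# then be counted as well.
class CountingOracle(object):
    """Wraps a predict function and counts how many points it is asked to label."""

    def __init__(self, predict_fn):
        self.predict_fn = predict_fn
        self.n_queries = 0

    def __call__(self, pts):
        self.n_queries += len(pts)
        return self.predict_fn(pts)

# Usage sketch: pass CountingOracle(oracle) into CAL as the oracle, and
# replace `q += local_model.get_n_query()` with `q = counting_oracle.n_queries`.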
def CAL_v(name, label_p, label_n, oracle, n_features, ftype, test_x, test_y):
    """
    Prints the test accuracy of an RBF-kernel SVM predictor for a varying
    number of "points near the boundary" (the boundary of the oracle).

    :param name:
    :param label_p:
    :param label_n:
    :param oracle:
    :param n_features:
    :param ftype:
    :param test_x:
    :param test_y:
    :return:
    """
    online = OnlineBase(name, label_p, label_n, oracle, n_features, ftype,
                        error=.5)
    x, y = online.collect_pts(100, -1)
    i = 0
    q = online.get_n_query()

    C_range = np.logspace(-2, 5, 10, base=10)
    gamma_range = np.logspace(-5, 1, 10, base=10)
    param_grid = dict(gamma=gamma_range, C=C_range)

    while q < 3500:
        i += 1
        cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2,
                                    random_state=42)
        grid = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=cv,
                            verbose=0, n_jobs=-1)
        grid.fit(x, y)
        h_ = grid.best_estimator_

        online_ = OnlineBase('', label_p, label_n, h_.predict, n_features,
                             ftype, error=.1)
        x_, _ = online_.collect_pts(10, 200)
        if x_ is not None and len(x_) > 0:
            x.extend(x_)
            y.extend(oracle(x_))

        q += online_.get_n_query()
        pred_y = h_.predict(test_x)
        print 'total points: %d, queries: %d, accuracy: %f' % \
            (len(x), q, sm.accuracy_score(test_y, pred_y))
def main():
    # make_circles (from sklearn.datasets) generates a data set X1 with
    # labels Y1, drawn from two concentric circles. The labels in Y1 are 0
    # (outer circle) or 1 (inner circle). n_samples is the number of data
    # points, noise is the noise on the data, and factor is the ratio of the
    # inner circle's radius to the outer circle's radius.
    X1, Y1 = make_circles(n_samples=800, noise=0.07, factor=0.4)

    # the fraction of points on the outer circle
    frac0 = len(np.where(Y1 == 0)[0]) / float(len(Y1))
    # the fraction of points on the inner circle
    frac1 = len(np.where(Y1 == 1)[0]) / float(len(Y1))
    print("Percentage of '0' labels:", frac0)
    print("Percentage of '1' labels:", frac1)

    plt.figure()
    plt.subplot(121)
    plt.title("Our Dataset: N={0}, '0': {1} '1': {2}".format(len(X1), frac0, frac1),
              fontsize="large")
    plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1)
    plt.xlim((-2, 2))
    plt.ylim((-2, 2))

    clf = svm.SVC()  # a support vector classification object
    clf.fit(X1, Y1)  # fit the SVC to the given data
    # accuracy of the model on its own training data
    print(accuracy_score(Y1, clf.predict(X1)))

    ex = OnlineBase('circle', 1, 0, clf.predict, 2, 'uniform', .1)

    step = 6
    train_x, train_y = [], []
    val_x, val_y = [], []
    while True:
        # collect `step` points around the decision boundary of ex
        ex.collect_pts(step)
        train_x.extend(ex.pts_near_b)         # on the first step this list is empty
        train_y.extend(ex.pts_near_b_labels)  # on the first step this list is empty
        # val_x.extend(ex.support_pts)
        # val_y.extend(ex.support_labels)
        try:
            # creates a new object every time - is this the smartest way to
            # retrain?
            e = RBFKernelRetraining('circle',
                                    [train_x, train_y],
                                    [train_x, train_y],
                                    n_features=2)
            # TODO I do not get how ex and e are connected. It seems to me
            # that grid_retrain_in_x() indeed does something like retraining
            # the model, but are any points added to pts_near_b?
            print(ex.get_n_query(), e.grid_retrain_in_x())
        except KeyboardInterrupt:  # TODO stop condition!!
            print('Done')
            break

    train_x = np.array(train_x)
    plt.subplot(122)
    plt.scatter(train_x[:, 0], train_x[:, 1], c=train_y)
    plt.xlim((-2, 2))
    plt.ylim((-2, 2))
    plt.show()
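# A possible replacement for the KeyboardInterrupt "stop condition" above
# (an assumption of mine, not the original authors' choice): stop once the
# extracted model agrees with the target model on nearly all of a random
# probe set, or once a query budget is exhausted. Note that in a real
# extraction setting the probe queries to the target would themselves count
# against the budget.
def should_stop(target_predict, local_predict, n_query,
                n_probe=1000, agreement=0.99, budget=5000):
    # probe points drawn from the same [-2, 2]^2 box plotted above
    probe_x = np.random.uniform(-2, 2, (n_probe, 2))
    agree = np.mean(target_predict(probe_x) == local_predict(probe_x))
    return agree >= agreement or n_query >= budget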
def do(self):
    # get some initial points
    self.ex.collect_up_to_budget(self.budget_per_round * 2)
    x, y = self.ex.pts_near_b, self.ex.pts_near_b_labels

    if len(np.unique(y)) < 2:
        return 1, 1

    # Grid search is disabled here; a fixed high-C RBF SVM is used instead.
    # gamma_range = np.logspace(-5, 1, 10, base=10)
    # param_grid = dict(gamma=gamma_range)
    # cv = StratifiedShuffleSplit(y, n_iter=5, test_size=.2)
    # grid = GridSearchCV(svm.SVC(C=1e5), param_grid=param_grid, cv=cv, n_jobs=-1)
    # grid.fit(x, y)
    # h_best = grid.best_estimator_
    h_best = svm.SVC(C=1e5)
    h_best.fit(x, y)

    for i in range(1, self.n_rounds - 1):
        online_ = OnlineBase('', +1, self.NEG, h_best.predict,
                             self.n_features, 'uniform', error=.1)
        x_, _ = online_.collect_pts(self.budget_per_round, 50000)  # budget doesn't matter
        xx_ = None
        n_found = 0 if x_ is None else len(x_)
        if n_found < self.budget_per_round:
            print('Ran out of budget when getting x_')
            # pad the round with uniformly random points instead
            xx_ = np.random.uniform(-1, 1,
                                    (self.budget_per_round - n_found,
                                     self.n_features))

        if x_ is not None and len(x_) > 0:
            x.extend(x_)
            y.extend(self.oracle(x_))

        if xx_ is not None:
            x.extend(xx_)
            y.extend(self.oracle(xx_))

        # retrain on the enlarged set (grid search disabled, as above)
        h_best = svm.SVC(C=1e5)
        h_best.fit(x, y)

    self.set_clf2(h_best)
    return self.benchmark()  # (ex.batch_predict, h_.predict, test_x, n_features)
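# If hyper-parameter selection is wanted back in do(), the disabled block can
# be factored into a helper like this (a sketch of mine that reuses the same
# pre-0.18 scikit-learn API used elsewhere in this file, with the plain
# high-C SVM as the fallback the original code always took):
def fit_best_rbf(x, y):
    gamma_range = np.logspace(-5, 1, 10, base=10)
    param_grid = dict(gamma=gamma_range)
    try:
        cv = StratifiedShuffleSplit(y, n_iter=5, test_size=.2)
        grid = GridSearchCV(svm.SVC(C=1e5), param_grid=param_grid, cv=cv,
                            n_jobs=-1)
        grid.fit(x, y)
        return grid.best_estimator_
    except ValueError:
        # e.g. too few samples per class for the stratified split
        h = svm.SVC(C=1e5)
        h.fit(x, y)
        return h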