def test_alce_lr_embed5(self):
    cost_matrix = np.random.RandomState(1126).rand(3, 3)
    np.fill_diagonal(cost_matrix, 0)
    ds = Dataset(self.X + self.X_pool,
                 self.y[:3] + [None for _ in range(len(self.X_pool))])
    qs = ALCE(ds, cost_matrix, LinearRegression(), embed_dim=5,
              random_state=1126)
    qseq = run_qs(ds, qs, self.y_truth, self.quota)
    assert_array_equal(
        qseq, np.array([16, 63, 34, 122, 38, 35, 17, 24, 43, 18]))
def initialDataSetup(trainFeatures, trainClasses, testFeatures, testClasses,
                     SNLabel='0'):
    """Set up an ideal labeler.

    input: trainFeatures, array - train matrix
           trainClasses, array - train labels (SN flag in column 2)
           testFeatures, array - test (photometric) matrix
           testClasses, array - test (photometric) labels (SN flag in column 2)
           SNLabel, str - SN Ia flag
    output: tuple, (train_dataset, fullLabels, labeler)
    """
    # Concatenate features
    fullFeatures = np.vstack([trainFeatures, testFeatures])

    # Include None in place of labels from the target sample
    partialClasses = np.concatenate(
        [(trainClasses[:, 2] == SNLabel).astype(int),
         np.array([None] * testFeatures.shape[0])])

    # Complete concatenated labels for train and target samples
    fullClasses = np.concatenate(
        [(trainClasses[:, 2] == SNLabel).astype(int),
         (testClasses[:, 2] == SNLabel).astype(int)])

    # Concatenate labels
    fullLabels = np.concatenate([trainClasses, testClasses])

    # Concatenated features and class labels, with None on target data
    train_dataset = Dataset(fullFeatures, partialClasses)

    # Define the ideal labeler
    labeler = IdealLabeler(Dataset(fullFeatures, fullClasses))

    return (train_dataset, fullLabels, labeler)
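# --- A minimal usage sketch for initialDataSetup (added illustration; the toy
# arrays below are hypothetical, and np, Dataset, and IdealLabeler are assumed
# to be imported as in the function above). Column 2 of each class array holds
# the SN flag, matching the slicing inside initialDataSetup.
def demo_initialDataSetup():
    rng = np.random.RandomState(0)
    trainFeatures = rng.rand(4, 3)
    testFeatures = rng.rand(6, 3)
    # three metadata columns; column 2 is the SN Ia flag ('0' = Ia)
    trainClasses = np.array([['a', 'x', '0'], ['b', 'x', '1'],
                             ['c', 'x', '0'], ['d', 'x', '1']])
    testClasses = np.array([['e', 'x', '0'], ['f', 'x', '1'],
                            ['g', 'x', '0'], ['h', 'x', '1'],
                            ['i', 'x', '0'], ['j', 'x', '1']])
    train_dataset, fullLabels, labeler = initialDataSetup(
        trainFeatures, trainClasses, testFeatures, testClasses)
    # Only the 4 train entries start labeled; the 6 target entries are None.
    assert train_dataset.len_labeled() == 4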
def test_multilabel_with_auxiliary_learner_shlr(self):
    trn_ds = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
    qs = MultilabelWithAuxiliaryLearner(
        trn_ds,
        major_learner=BinaryRelevance(
            LogisticRegression(solver='liblinear', multi_class="ovr")),
        auxiliary_learner=BinaryRelevance(SVM(gamma="auto")),
        criterion='shlr',
        b=1.,
        random_state=1126)
    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    assert_array_equal(
        qseq,
        np.array([1258, 805, 459, 550, 783, 964, 736, 1004, 38, 750]))
def split_train_test(n_classes):
    from sklearn.datasets import load_digits

    n_labeled = 5
    digits = load_digits(n_class=n_classes)  # consider binary case
    X = digits.data
    y = digits.target
    print(np.shape(X))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    # Resplit until the first n_labeled samples cover all classes.
    while len(np.unique(y_train[:n_labeled])) < n_classes:
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            test_size=0.33)

    trn_ds = Dataset(
        X_train,
        np.concatenate([y_train[:n_labeled],
                        [None] * (len(y_train) - n_labeled)]))
    tst_ds = Dataset(X_test, y_test)

    return trn_ds, tst_ds, digits
def test_hs_subsampling(self):
    ds = Dataset(self.X, self.y[:10] + [None] * (len(self.y) - 10))
    sub_qs = UncertaintySampling(
        ds, model=SVM(gamma='auto', decision_function_shape='ovr'))
    qs = HS(ds, self.classes, subsample_qs=sub_qs, random_state=1126)
    qseq = run_qs(ds, qs, self.y, len(self.y) - 10)
    assert_array_equal(
        np.concatenate([qseq[:10], qseq[-10:]]),
        np.array([120, 50, 33, 28, 78, 133, 52, 124, 102, 109,
                  81, 108, 12, 10, 89, 114, 92, 126, 48, 25]))
def initializeAWTLPool(X_source, y_source, X_target_train, y_target_train,
                       n_labeled, sample_weights):
    # Build the training set by appending the target train sample to all
    # instances from the source domain.
    X_train = np.vstack([X_source, X_target_train])
    y_train = np.append(y_source, y_target_train)

    # train_ds holds the whole X_train, but only the first n_labeled
    # (all-source) instances are labeled.
    train_ds = le.AWTLDataset(
        X=X_train,
        y=np.concatenate([y_train[:n_labeled],
                          [None] * (len(y_train) - n_labeled)]),
        da_weights=sample_weights)

    # The fully labeled training set.
    fully_labeled_trn_ds = Dataset(X=X_train, y=y_train)
    return train_ds, fully_labeled_trn_ds
def test_multilabel_with_auxiliary_learner_mmr(self):
    trn_ds = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
    qs = MultilabelWithAuxiliaryLearner(
        trn_ds,
        major_learner=BinaryRelevance(
            LogisticRegression(solver='liblinear', multi_class="ovr")),
        auxiliary_learner=BinaryRelevance(SVM(gamma="auto")),
        criterion='mmr',
        random_state=1126)
    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    assert_array_equal(
        qseq,
        np.array([1258, 1461, 231, 1198, 1498, 1374, 955, 1367, 265, 144]))
def _E(args):
    X, y, qx, clf, label_count, sigma, model = args
    sigmoid = lambda x: 1 / (1 + np.exp(-x))
    query_point = sigmoid(clf.predict_real([qx]))
    feature_count = len(X[0])
    ret = 0.0
    for i in range(label_count):
        clf_ = copy.copy(model)
        clf_.train(Dataset(np.vstack((X, [qx])), np.append(y, i)))
        PI = sigmoid(clf_.predict_real(np.vstack((X, [qx]))))
        ret += query_point[-1][i] * _Phi(sigma, PI[:-1], X, PI[-1], qx,
                                         label_count, feature_count)
    return ret
def test_QueryByCommittee(self):
    trn_ds = Dataset(
        self.X,
        np.concatenate([self.y[:10], [None] * (len(self.y) - 10)]))
    qs = QueryByCommittee(trn_ds,
                          models=[LogisticRegression(C=1.0),
                                  LogisticRegression(C=0.01),
                                  LogisticRegression(C=100)],
                          random_state=1126)
    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    assert_array_equal(
        qseq,
        np.array([267, 210, 229, 220, 134, 252, 222, 142, 245, 228]))
def main():
    # n_labeled = 10  # number of samples that are initially labeled

    # Load dataset
    # val_data
    # X_test = np.load('../ziqi/train_data_5000.npy')
    # y_test_csv = pd.read_csv('../ziqi/train_data_label_5000.csv')
    # y_test = np.array([i for i in y_test_csv['label']])
    # tst_ds = Dataset(X_test, y_test)

    # noisy_data
    y_csv = pd.read_csv('data_label.csv')
    y_test = np.array([i for i in y_csv['label']])

    X_pool = np.load('data.npy')
    y_pool = [None] * len(X_pool)
    pool_ds = Dataset(X_pool, y_pool)

    # UncertaintySampling strategy with the entropy criterion
    model_folder = './model/vgg-x30-9982.pb'

    # Model prediction over the unlabeled pool
    print('model predict....')
    model = MyModelTF(model_folder, batch_size=256)
    score = model.predict(X_pool)
    score = pd.DataFrame(score)
    score.to_csv('noisy_predict_all.csv', index=False)

    # score = np.argmax(score, axis=1)
    # all_csv = {'indexs': y_test_csv['filename'], 'pre_label': score,
    #            'name_label': y_test}
    # all_csv = pd.DataFrame(all_csv)
    # all_csv.to_csv('../ziqi/result/train_predict_result.csv')
    # acc_test = model.score(tst_ds)
    # print('acc: ', acc_test)

    print('make query')
    qs = UncertaintySampling(pool_ds, method='entropy',
                             model=MyModelTF(model_folder, batch_size=128))
    ask_ids, labels = qs.make_query(return_label=True, n_instances=5000)
    print('ask_ids : ', np.shape(ask_ids))

    csv = {'indexs': ask_ids,
           'pre_label': labels,
           'name_label': y_test[ask_ids]}
    csv = pd.DataFrame(csv)
    csv.to_csv('./result/sample_result.csv', index=False)
def runrnn(trn_ds, tst_ds, val_ds, lbr, model, quota, best_val, batchsize):
    E_in, E_out = [], []
    print("[Important] Start the RNN Train:")
    start_time = time.time()

    # Number of query rounds; the last round may ask for fewer instances.
    if quota % batchsize == 0:
        intern = quota // batchsize
        finalnum = 0
    else:
        intern = quota // batchsize + 1
        finalnum = quota % batchsize

    for t in range(intern):
        print("[RNN] this is the " + str(t) + " time to ask")
        x_first_train = []
        y_first_train = []

        scores = model.predict_pro(trn_ds)
        unlabeled_entry_ids, X_pool = zip(*trn_ds.get_unlabeled_entries())

        # Pick the batch of lowest-scoring (most uncertain) entries.
        if t == intern - 1 and finalnum != 0:
            max_n = heapq.nsmallest(finalnum, range(len(scores)), scores.take)
        else:
            max_n = heapq.nsmallest(batchsize, range(len(scores)), scores.take)

        X, _ = zip(*trn_ds.data)
        print(max_n)
        for ask_id in max_n:
            real_id = unlabeled_entry_ids[ask_id]
            lb = lbr.label(X[real_id])
            trn_ds.update(real_id, lb)
            x_first_train.append(X[real_id])
            y_first_train.append(lb)

        first_train = Dataset(np.array(x_first_train),
                              np.array(y_first_train))
        best_val = model.retrain(trn_ds, val_ds, best_val, first_train)

        # E_in = np.append(E_in, 1 - model.score(trn_ds))
        E_out = np.append(E_out, model.score(tst_ds))
        # print(E_in)
        print(E_out)

    E_time = get_time_dif(start_time)
    return E_out, E_time
def split_train_test(train_dir, vocab_dir, test_size, n_labeled, wordslength):
    # train_dir = './data/labeled1.txt'
    # vocab_dir = './data/vocab_yinan_test_rnn4.txt'
    if not os.path.exists(vocab_dir):
        build_vocab(train_dir, vocab_dir, 1000)
    categories, cat_to_id = read_category()
    words, word_to_id = read_vocab(vocab_dir)
    x, y = process_file(train_dir, word_to_id, cat_to_id, wordslength)
    # x_rnn, y_rnn = process_file_rnn(train_dir, word_to_id, cat_to_id, 600)

    # Convert one-hot labels to class indices.
    listy = []
    for i in range(np.shape(y)[0]):
        for j in range(np.shape(y)[1]):
            if y[i][j] == 1:
                listy.append(j)
    listy = np.array(listy)

    X_train, X_test, y_train, y_test = \
        train_test_split(x, listy, test_size=test_size)
    # X_train = X_train[:(n_labeled + 24)]

    trn_ds = Dataset(X_train, np.concatenate(
        [y_train[:n_labeled], [None] * (len(y_train) - n_labeled)]))
    fully_tst_ds = Dataset(X_test, y_test)

    X_val, X_real_test, y_val, y_real_test = \
        train_test_split(X_test, y_test, test_size=0.5)
    tst_ds = Dataset(X_real_test, y_real_test)
    val_ds = Dataset(X_val, y_val)

    fully_labeled_trn_ds = Dataset(X_train, y_train)
    # print(fully_labeled_trn_ds.get_entries()[0])

    return trn_ds, tst_ds, y_train, fully_labeled_trn_ds, fully_tst_ds, val_ds
def test_multilabel_with_auxiliary_learner_hlr(self):
    trn_ds = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
    qs = MultilabelWithAuxiliaryLearner(
        trn_ds,
        major_learner=BinaryRelevance(
            LogisticRegression(solver='liblinear', multi_class="ovr",
                               random_state=1126)),
        auxiliary_learner=BinaryRelevance(SVM(gamma="auto")),
        criterion='hlr',
        random_state=1126)
    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    assert_array_equal(
        qseq,
        np.array([701, 1403, 147, 897, 974, 1266, 870, 703, 292, 1146]))
def test_QueryByCommittee(self):
    random.seed(1126)
    trn_ds = Dataset(
        self.X,
        np.concatenate([self.y[:10], [None] * (len(self.y) - 10)]))
    qs = QueryByCommittee(trn_ds,
                          models=[LogisticRegression(C=1.0),
                                  LogisticRegression(C=0.01),
                                  LogisticRegression(C=100)])
    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    assert_array_equal(
        qseq,
        np.array([11, 207, 101, 30, 116, 108, 83, 172, 211, 42]))
def test_query_by_committee_kl_divergence(self):
    trn_ds = Dataset(
        self.X,
        np.concatenate([self.y[:10], [None] * (len(self.y) - 10)]))
    qs = QueryByCommittee(trn_ds,
                          disagreement='kl_divergence',
                          models=[LogisticRegression(C=1.0),
                                  LogisticRegression(C=0.01),
                                  LogisticRegression(C=100)],
                          random_state=1126)
    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    assert_array_equal(
        qseq,
        np.array([228, 111, 162, 243, 213, 122, 110, 108, 156, 37]))
def split_train_test(X, y, test_size, n_class):
    target = np.unique(y)
    # mapping the targets to 0 to n_classes-1
    # y = np.array([np.where(target == i)[0][0] for i in data['target']])
    X_trn, X_tst, y_trn, y_tst = train_test_split(X, y, test_size=test_size)
    X_trn = X_trn[:, 1:]
    y_tst = X_tst[:, 0]
    X_tst = X_tst[:, 1:]
    y_tst = y_tst.astype('int32')

    # making sure each class appears at least once initially
    init_y_ind = np.array(
        [np.where(y_trn == i)[0][0] for i in range(len(target))])
    y_ind = np.array([i for i in range(len(X_trn)) if i not in init_y_ind])
    trn_ds = Dataset(
        np.vstack((X_trn[init_y_ind], X_trn[y_ind])),
        np.concatenate((y_trn[init_y_ind], [None] * len(y_ind))))
    tst_ds = Dataset(X_tst, y_tst)
    fully_labeled_trn_ds = Dataset(
        np.vstack((X_trn[init_y_ind], X_trn[y_ind])),
        np.concatenate((y_trn[init_y_ind], y_trn[y_ind])))

    # Absolute-difference cost matrix, scaled by the number of classes.
    cost_matrix = 1.0 * np.ones([len(target), len(target)])
    for ii in range(0, len(target)):
        for jj in range(0, len(target)):
            # cost_matrix[ii, jj] = (ii - jj) * (ii - jj)
            cost_matrix[ii, jj] = abs(ii - jj) / n_class
    np.fill_diagonal(cost_matrix, 0)

    return trn_ds, tst_ds, fully_labeled_trn_ds, cost_matrix
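# --- A hedged worked example of the cost matrix built in split_train_test
# above (added illustration only; assumes np is imported). For three classes
# and n_class=3, the off-diagonal cost is |ii - jj| / n_class and the
# diagonal is zeroed:
def demo_cost_matrix(n_class=3):
    cost_matrix = 1.0 * np.ones([n_class, n_class])
    for ii in range(n_class):
        for jj in range(n_class):
            cost_matrix[ii, jj] = abs(ii - jj) / n_class
    np.fill_diagonal(cost_matrix, 0)
    # cost_matrix is now approximately:
    # [[0.    0.333 0.667]
    #  [0.333 0.    0.333]
    #  [0.667 0.333 0.   ]]
    return cost_matrix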
def test_binary_relevance_lr(self):
    br = BinaryRelevance(base_clf=LogisticRegression(
        solver='liblinear', multi_class="ovr", random_state=1126))
    br.train(Dataset(self.X_train, self.Y_train))

    br_pred_train = br.predict(self.X_train).astype(int)
    br_pred_test = br.predict(self.X_test).astype(int)
    br_pred_proba_train = br.predict_proba(self.X_train).astype(float)
    br_pred_proba_test = br.predict_proba(self.X_test).astype(float)

    # Each per-label classifier should match an independently fitted one.
    for i in range(np.shape(self.Y_train)[1]):
        clf = sklearn.linear_model.LogisticRegression(
            solver='liblinear', multi_class="ovr", random_state=1126)
        clf.fit(self.X_train, self.Y_train[:, i])
        assert_array_almost_equal(
            clf.predict(self.X_train).astype(int), br_pred_train[:, i])
        assert_array_almost_equal(
            clf.predict(self.X_test).astype(int), br_pred_test[:, i])
        assert_array_almost_equal(
            clf.predict_proba(self.X_train)[:, 1].astype(float),
            br_pred_proba_train[:, i].astype(float))
        assert_array_almost_equal(
            clf.predict_proba(self.X_test)[:, 1].astype(float),
            br_pred_proba_test[:, i].astype(float))

    self.assertEqual(
        np.mean(np.abs(self.Y_test - br_pred_test).mean(axis=1)),
        br.score(Dataset(self.X_test, self.Y_test), 'hamming'))

    self.assertRaises(
        NotImplementedError,
        lambda: br.score(Dataset(self.X_test, self.Y_test),
                         criterion='not_exist'))
def test_query_by_committee_vote(self):
    # self.skipTest("In this version we randomize make queries")
    trn_ds = Dataset(
        self.X,
        np.concatenate([self.y[:10], [None] * (len(self.y) - 10)]))
    qs = QueryByCommittee(trn_ds,
                          disagreement='vote',
                          models=[LogisticRegression(C=1.0),
                                  LogisticRegression(C=0.01),
                                  LogisticRegression(C=100)],
                          random_state=1126)
    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    assert_array_equal(
        qseq,
        np.array([10, 12, 11, 13, 16, 14, 17, 18, 19, 21]))
def test_ActiveLearningByLearning(self):
    np.random.seed(1126)
    trn_ds = Dataset(
        self.X,
        np.concatenate([self.y[:10], [None] * (len(self.y) - 10)]))
    qs = ActiveLearningByLearning(
        trn_ds,
        T=self.quota,
        query_strategies=[
            UncertaintySampling(trn_ds, model=LogisticRegression()),
            HintSVM(trn_ds)
        ],
        model=LogisticRegression())
    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    assert_array_equal(
        qseq,
        np.array([103, 220, 118, 75, 176, 50, 247, 199, 46, 55]))
def test_ActiveLearningByLearning(self):
    trn_ds = Dataset(
        self.X,
        np.concatenate([self.y[:10], [None] * (len(self.y) - 10)]))
    qs = ActiveLearningByLearning(
        trn_ds,
        T=self.quota,
        query_strategies=[
            UncertaintySampling(trn_ds, model=LogisticRegression()),
            HintSVM(trn_ds, random_state=1126)
        ],
        model=LogisticRegression(),
        random_state=1126)
    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    assert_array_equal(
        qseq,
        np.array([173, 103, 133, 184, 187, 147, 251, 83, 93, 33]))
def test_cost_sensitive_random_pair_encoding(self):
    trn_ds = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
    model = BinaryRelevance(
        LogisticRegression(solver='liblinear', multi_class="ovr"))
    base_model = LogisticRegression(
        solver='liblinear', multi_class="ovr", random_state=1126)
    qs = CostSensitiveReferencePairEncoding(
        trn_ds,
        scoring_fn=pairwise_f1_score,
        model=model,
        base_model=base_model,
        n_models=10,
        n_jobs=1,
        random_state=1126)
    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    assert_array_equal(
        qseq,
        np.array([149, 434, 1126, 719, 983, 564, 816, 732, 101, 1242]))
def test_ALBLTestCase(self):
    trn_ds = Dataset(
        self.X,
        np.concatenate([self.y[:10], [None] * (len(self.y) - 10)]))
    qs = ActiveLearningByLearning(
        trn_ds,
        T=self.quota,
        query_strategies=[
            UncertaintySampling(
                trn_ds,
                model=SVM(kernel="linear", decision_function_shape="ovr")),
            QUIRE(trn_ds),
            RandomSampling(trn_ds)
        ],
        model=SVM(kernel="linear", decision_function_shape="ovr"),
        random_state=1126)
    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    assert_array_equal(
        qseq,
        np.array([173, 103, 133, 184, 187, 147, 251, 83, 93, 33]))
def make_query(self):
    Xlabeled, y = self.dataset.get_labeled_entries()
    Xlabeled = np.array(Xlabeled)
    y = list(y)
    unlabeled_entry_ids, X_pool = self.dataset.get_unlabeled_entries()
    label_count = self.dataset.get_num_of_labels()

    clf = copy.copy(self.model)
    clf.train(Dataset(Xlabeled, y))

    # Evaluate the expected error of each candidate in parallel.
    p = Pool(self.n_jobs)
    errors = p.map(_E, [(Xlabeled, y, x, clf, label_count, self.sigma,
                         self.model) for x in X_pool])
    p.terminate()

    return unlabeled_entry_ids[errors.index(min(errors))]
def test_density_weighted_meta_uncertainty_lc(self):
    trn_ds = Dataset(self.X[:20],
                     np.concatenate([self.y[:6], [None] * 14]))
    base_qs = UncertaintySampling(
        trn_ds, method='lc',
        model=LogisticRegression(solver='liblinear', multi_class="ovr"))
    similarity_metric = cosine_similarity
    clustering_method = KMeans(n_clusters=3, random_state=1126)
    qs = DensityWeightedMeta(
        dataset=trn_ds,
        base_query_strategy=base_qs,
        similarity_metric=similarity_metric,
        clustering_method=clustering_method,
        beta=1.0,
        random_state=1126)
    model = LogisticRegression(solver='liblinear', multi_class="ovr")
    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    assert_array_equal(
        qseq,
        np.array([13, 18, 9, 12, 8, 16, 10, 19, 15, 17]))
def train(self, X, y):
    self.n_samples = np.shape(X)[0]
    self.n_labels = np.shape(y)[1]

    # Score each sample against the two representative labels.
    score0 = self.scoring_fn(
        y, np.tile(self.rep_label[0], (self.n_samples, 1)))
    score1 = self.scoring_fn(
        y, np.tile(self.rep_label[1], (self.n_samples, 1)))
    lbl = ((score1 - score0) > 0) + 0.0

    # Weight each sample by its (normalized) score difference.
    weight = np.abs(score1 - score0)
    if np.sum(weight) > 0:
        weight = weight / np.sum(weight) * len(X)

    if len(np.unique(lbl)) == 1:
        # Degenerate case: every sample prefers the same representative.
        self.label = np.unique(lbl)[0]
        self.base_clf_ = None
    else:
        self.base_clf_ = copy.deepcopy(self.base_clf)
        self.base_clf_.train(Dataset(X, lbl), sample_weight=weight)
def make_query(self, n_queries=1, n_jobs=20):
    labeled_entries = self.dataset.get_labeled_entries()
    Xlabeled, y = zip(*labeled_entries)
    Xlabeled = np.array(Xlabeled)
    y = list(y)

    unlabeled_entries = self.dataset.get_unlabeled_entries()
    unlabeled_entry_ids, X_pool = zip(*unlabeled_entries)
    label_count = self.dataset.get_num_of_labels()

    clf = copy.copy(self.model)
    clf.train(Dataset(Xlabeled, y))

    p = Pool(n_jobs)
    errors = p.map(self.E,
                   [(Xlabeled, y, x, clf, label_count) for x in X_pool])
    p.terminate()

    return unlabeled_entry_ids[errors.index(min(errors))]
def test_query_by_committee_vote(self):
    trn_ds = Dataset(
        self.X,
        np.concatenate([self.y[:10], [None] * (len(self.y) - 10)]))
    qs = QueryByCommittee(trn_ds,
                          disagreement='vote',
                          models=[
                              LogisticRegression(C=1.0, solver="liblinear",
                                                 multi_class="ovr"),
                              LogisticRegression(C=0.01, solver="liblinear",
                                                 multi_class="ovr"),
                              LogisticRegression(C=100, solver="liblinear",
                                                 multi_class="ovr")
                          ],
                          random_state=1126)
    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    assert_array_equal(
        qseq,
        np.array([267, 210, 229, 220, 134, 252, 222, 142, 245, 228]))
def make_query(self):
    dataset = self.dataset
    X, Y = dataset.get_labeled_entries()
    Y = np.array(Y)
    unlabeled_entry_ids, X_pool = dataset.get_unlabeled_entries()
    X_pool = np.array(X_pool)

    clfs = []
    boundaries = []
    for i in range(self.n_labels):
        if len(np.unique(Y[:, i])) == 1:
            clf = DummyClf()
        else:
            clf = copy.deepcopy(self.base_clf)
        clf.train(Dataset(X, Y[:, i]))
        boundaries.append(np.abs(clf.predict_real(X_pool)[:, 1]))
        clfs.append(clf)

    # Pick the candidate closest to any per-label decision boundary,
    # breaking ties at random.
    choices = np.where(np.array(boundaries) == np.min(boundaries))[1]
    ask_id = self.random_state_.choice(choices)
    return unlabeled_entry_ids[ask_id]
def train(self, dataset):
    r"""Train model with given feature.

    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features)
        Train feature vector.

    Y : array-like, shape=(n_samples, n_labels)
        Target labels.

    Attributes
    ----------
    clfs\_ : list of :py:mod:`libact.models` object instances
        Classifier instances.

    Returns
    -------
    self : object
        Return self.
    """
    X, Y = dataset.format_sklearn()
    X = np.array(X)
    Y = np.array(Y)
    self.n_labels_ = np.shape(Y)[1]
    self.n_features_ = np.shape(X)[1]

    self.clfs_ = []
    for i in range(self.n_labels_):
        # TODO: decide whether the single-class case should be handled
        # here or before train is called.
        if len(np.unique(Y[:, i])) == 1:
            clf = DummyClf()
        else:
            clf = copy.deepcopy(self.base_clf)
        clf.train(Dataset(X, Y[:, i]))
        self.clfs_.append(clf)

    return self
def make_query(self):
    dataset = self.dataset
    X, y = zip(*dataset.get_labeled_entries())

    unlabeled_entries = dataset.get_unlabeled_entries()
    # Optionally subsample the pool to keep the retraining loop tractable.
    if isinstance(self.random_sampling, int):
        unlabeled_entries = random.sample(unlabeled_entries,
                                          k=self.random_sampling)
    elif isinstance(self.random_sampling, float):
        unlabeled_entries = random.sample(
            unlabeled_entries,
            k=int(len(unlabeled_entries) * self.random_sampling))
    unlabeled_entry_ids, X_pool = zip(*unlabeled_entries)

    classes = np.unique(y)
    n_classes = len(classes)

    self.model.train(dataset)
    proba = self.model.predict_proba(X_pool)

    scores = []
    for i, x in enumerate(X_pool):
        score = []
        for yi in range(n_classes):
            # Retrain with the candidate x labeled as class yi.
            m = copy.deepcopy(self.model)
            m.train(Dataset(np.vstack((X, [x])), y + (yi,)))
            p = m.predict_proba(X_pool)
            if self.loss == '01':  # 0/1 loss
                score.append(proba[i, yi] * np.sum(1 - np.max(p, axis=1)))
            elif self.loss == 'log':  # log loss
                score.append(proba[i, yi] * -np.sum(p * np.log(p)))
        scores.append(np.sum(score))

    choices = np.where(np.array(scores) == np.min(scores))[0]
    ask_idx = self.random_state_.choice(choices)
    return unlabeled_entry_ids[ask_idx]
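# --- A small self-contained sketch of the two candidate scores used in
# make_query above (added illustration only; the proba matrix below is made
# up, and np is assumed to be imported). The '01' term sums 1 - max_y p(y|x)
# over the pool, and the 'log' term sums the pool entropy
# -sum_y p(y|x) * log p(y|x).
def demo_expected_error_terms():
    p = np.array([[0.9, 0.1],
                  [0.6, 0.4],
                  [0.5, 0.5]])
    zero_one_term = np.sum(1 - np.max(p, axis=1))  # 0.1 + 0.4 + 0.5 = 1.0
    log_term = -np.sum(p * np.log(p))              # total predictive entropy
    return zero_one_term, log_term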