def make_query(self):
    """Select the unlabeled sample with the largest absolute hint-SVM
    decision value (binary classification only).

    Returns
    -------
    ask_id : int
        Entry id of the queried sample.
    """
    dataset = self.dataset
    unlabeled_entry_ids, unlabeled_pool = zip(
        *dataset.get_unlabeled_entries())
    labeled_pool, y = zip(*dataset.get_labeled_entries())

    if len(np.unique(y)) > 2:
        raise ValueError("HintSVM query strategy support binary class "
                         "active learning only. Found %s classes"
                         % len(np.unique(y)))

    # A random fraction p of the unlabeled pool serves as hints.
    pool_size = len(unlabeled_pool)
    hint_pool_idx = self.random_state_.choice(
        pool_size, int(pool_size * self.p))
    hint_pool = np.array(unlabeled_pool)[hint_pool_idx]

    # Labeled examples get unit weight; hints are down-weighted by ch/cl.
    n_hint = len(hint_pool)
    weight = [1.0] * len(labeled_pool) + [self.ch / self.cl] * n_hint
    y = list(y) + [0] * n_hint
    X = list(labeled_pool) + list(hint_pool)

    p_val = hintsvm_query(
        np.array(X, dtype=np.float64),
        np.array(y, dtype=np.float64),
        np.array(weight, dtype=np.float64),
        np.array(unlabeled_pool, dtype=np.float64),
        self.svm_params)

    scores = [abs(float(v[0])) for v in p_val]
    return unlabeled_entry_ids[int(np.argmax(scores))]
def retrieve_score_list(self):
    """Return a mapping from unlabeled entry id to its HintSVM score.

    Trains a hint SVM on the labeled pool plus a random subsample of the
    unlabeled pool (the "hints") and scores every unlabeled entry by its
    absolute decision value — the larger, the more informative.

    Returns
    -------
    dict
        ``{entry_id: score}`` for every unlabeled entry.
    """
    dataset = self.dataset
    unlabeled_entry_ids, unlabeled_pool = zip(
        *dataset.get_unlabeled_entries())
    labeled_pool, y = zip(*dataset.get_labeled_entries())
    if len(np.unique(y)) > 2:
        raise ValueError("HintSVM query strategy support binary class "
                         "active learning only. Found %s classes"
                         % len(np.unique(y)))
    # Draw int(pool_size * p) hint examples uniformly at random.
    hint_pool_idx = self.random_state_.choice(
        len(unlabeled_pool), int(len(unlabeled_pool) * self.p))
    hint_pool = np.array(unlabeled_pool)[hint_pool_idx]
    # Labeled data weight 1.0; hints down-weighted by the ch/cl ratio.
    weight = [1.0 for _ in range(len(labeled_pool))] + \
             [(self.ch / self.cl) for _ in range(len(hint_pool))]
    if 0 in y:
        # The solver reserves label 0 for hints, so remap the negative
        # class from 0 to -1 before appending hint labels below.
        assert -1 not in y, "we need to switch 0 to -1 in this case"
        y = map(lambda v: v if v != 0 else -1, y)
    y = list(y) + [0 for _ in range(len(hint_pool))]  # cant have zeros !!!
    X = [x for x in labeled_pool] + \
        [x for x in hint_pool]
    p_val = hintsvm_query(np.array(X, dtype=np.float64),
                          np.array(y, dtype=np.float64),
                          np.array(weight, dtype=np.float64),
                          np.array(unlabeled_pool, dtype=np.float64),
                          self.svm_params)
    # Score is the absolute decision value of each unlabeled entry.
    p_val = [abs(float(val[0])) for val in p_val]
    return dict(zip(unlabeled_entry_ids, p_val))
def make_query(self, return_score=False):
    """Return the index of the sample to be queried and labeled and
    selection score of each sample. Read-only.

    No modification to the internal states.

    Parameters
    ----------
    return_score : bool, optional (default=False)
        Also return the (entry_id, score) list for every unlabeled entry.

    Returns
    -------
    ask_id : int
        The index of the next unlabeled sample to be queried and labeled.

    score : list of (index, score) tuple
        Selection score of unlabled entries, the larger the better.
    """
    # _get_scores() already enumerates the unlabeled entries; the previous
    # extra get_unlabeled_entries() pass was dead code (its result was
    # immediately overwritten) and has been removed.
    unlabeled_entry_ids, scores = zip(*self._get_scores())
    ask_id = np.argmax(scores)

    if return_score:
        return unlabeled_entry_ids[ask_id], \
               list(zip(unlabeled_entry_ids, scores))
    return unlabeled_entry_ids[ask_id]
def retrieve_score_list(self):
    """Score every unlabeled entry by density-weighted expected error.

    Fits a density-weighted logistic regression on the cluster
    representation of the labeled entries, derives P(y|x) for the
    unlabeled pool, and weights the binary expected error by the
    density p(x).

    Returns
    -------
    dict
        ``{entry_id: score}`` for every unlabeled entry; larger means a
        more valuable query.
    """
    unlabeled_entry_ids, _ = zip(*self.dataset.get_unlabeled_entries())
    # "is not None" is the correct (PEP 8) test for "has a label";
    # "!= None" invokes __ne__ and can misfire on array-like labels.
    labeled_entry_ids = np.array([
        eid for eid, entry in enumerate(self.dataset.data)
        if entry[1] is not None])
    labels = np.array([
        entry[1] for entry in self.dataset.data
        if entry[1] is not None]).reshape(-1, 1)
    centers = self.kmeans_.cluster_centers_
    P_k_x = self.P_k_x
    p_x = self.p_x[list(unlabeled_entry_ids)]

    clf = DensityWeightedLogisticRegression(
        P_k_x[labeled_entry_ids, :], centers, self.C)
    clf.train(labeled_entry_ids, labels)
    P_y_k = clf.predict()

    # P(y|x) marginalized over the cluster assignment distribution.
    P_y_x = np.zeros(len(unlabeled_entry_ids))
    for k, center in enumerate(centers):
        P_y_x += P_y_k[k] * P_k_x[unlabeled_entry_ids, k]

    # binary case: expected error is min(P, 1 - P)
    expected_error = P_y_x
    expected_error[P_y_x >= 0.5] = 1. - P_y_x[P_y_x >= 0.5]

    scores = expected_error * p_x
    return dict(zip(unlabeled_entry_ids, scores))
def make_query(self, return_score=False):
    """Query the unlabeled sample minimizing a mix of decision margin
    and cosine-similarity diversity.

    Parameters
    ----------
    return_score : bool, optional (default=False)
        Also return the (entry_id, score) list for every unlabeled entry.

    Returns
    -------
    ask_id : int
        Entry id of the sample with the smallest combined score.
    """
    dataset = self.dataset
    self.model.train(dataset)
    unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries())

    if isinstance(self.model, ProbabilisticModel):
        dvalue = self.model.predict_proba(X_pool)
    elif isinstance(self.model, ContinuousModel):
        dvalue = self.model.predict_real(X_pool)

    # Keep only the two largest decision values per sample.
    if np.shape(dvalue)[1] > 2:
        dvalue = -(np.partition(-dvalue, 2, axis=1)[:, :2])
    dist = np.abs(dvalue[:, 0] - dvalue[:, 1])

    # Diversity term: negated cosine similarity between the two
    # top-value columns, each treated as a single row vector.
    top_col = [np.array(dvalue[:, 0]).tolist()]
    second_col = [np.array(dvalue[:, 1]).tolist()]
    div = -np.max(cosine_similarity(top_col, second_col), axis=1)

    score = self.lmbda * dist + (1 - self.lmbda) * div
    ask_id = np.argmin(score)
    if return_score:
        return unlabeled_entry_ids[ask_id], \
               list(zip(unlabeled_entry_ids, score))
    return unlabeled_entry_ids[ask_id]
def make_query(self, return_score=False):
    """Query by maximum-margin uncertainty (only method 'mm' is enabled).

    Parameters
    ----------
    return_score : bool, optional (default=False)
        Also return the (entry_id, score) list for every unlabeled entry.

    Returns
    -------
    ask_id : int
        Entry id of the highest-scoring unlabeled sample.

    Raises
    ------
    ValueError
        If ``self.method`` is not 'mm'. (Previously, the disabled 'sm'
        and 'entropy' branches were kept as no-op string literals and an
        unknown method fell through to an opaque NameError on ``score``.)
    """
    dataset = self.dataset
    self.model.train(dataset)
    unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries())

    if isinstance(self.model, ProbabilisticModel):
        dvalue = self.model.predict_proba(X_pool)
    elif isinstance(self.model, ContinuousModel):
        dvalue = self.model.predict_real(X_pool)

    if self.method == 'mm':  # max margin
        # Two largest decision values per row; their gap is the margin.
        margin = np.partition(-dvalue, 1, axis=1)
        score = -np.abs(margin[:, 0] - margin[:, 1])
    else:
        raise ValueError("Unsupported method: %r (only 'mm' is enabled)"
                         % self.method)

    ask_id = np.argmax(score)
    if return_score:
        return unlabeled_entry_ids[ask_id], \
               list(zip(unlabeled_entry_ids, score))
    return unlabeled_entry_ids[ask_id]
def make_query(self):
    """Pick the unlabeled sample with the largest absolute hint-SVM
    decision value (binary classification only).

    Returns
    -------
    ask_id : int
        Entry id of the queried sample.
    """
    dataset = self.dataset
    unlabeled_entry_ids, unlabeled_pool = zip(
        *dataset.get_unlabeled_entries())
    labeled_pool, y = zip(*dataset.get_labeled_entries())

    if len(np.unique(y)) > 2:
        raise ValueError("HintSVM query strategy support binary class "
                         "active learning only. Found %s classes"
                         % len(np.unique(y)))

    # Random subset of the unlabeled pool serves as hints.
    pool_size = len(unlabeled_pool)
    hint_pool_idx = self.random_state_.choice(
        pool_size, int(pool_size * self.p))
    hint_pool = np.array(unlabeled_pool)[hint_pool_idx]

    n_hint = len(hint_pool)
    weight = [1.0] * len(labeled_pool) + [self.ch / self.cl] * n_hint
    y = list(y) + [0] * n_hint
    X = [x.tolist() for x in labeled_pool] \
        + [x.tolist() for x in hint_pool]

    p_val = hintsvm_query(np.array(X), np.array(y), np.array(weight),
                          np.array([x.tolist() for x in unlabeled_pool]),
                          self.svm_params)

    scores = [abs(float(v[0])) for v in p_val]
    return unlabeled_entry_ids[int(np.argmax(scores))]
def make_query(self):
    """Choose the unlabeled entry maximizing the estimated loss of the
    predicted label assignment (MMC-style multi-label strategy).

    Returns
    -------
    ask_id : int
        Entry id of the queried sample (ties broken at random).
    """
    dataset = self.dataset
    labeled_pool, Y = zip(*dataset.get_labeled_entries())
    unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries())
    labeled_pool = np.array(labeled_pool)
    Y = np.array(Y)
    X_pool = np.array(X_pool)

    # One-vs-rest probabilities on both pools.
    br = BinaryRelevance(self.br_base)
    br.train(Dataset(labeled_pool, Y))
    trnf = br.predict_proba(labeled_pool)
    poolf = br.predict_proba(X_pool)
    # Map probabilities to [-1, 1] pseudo decision values.
    f = poolf * 2 - 1

    # Sort each row's probabilities descending and normalize to sum 1;
    # these become features for predicting the number of labels.
    trnf = np.sort(trnf, axis=1)[:, ::-1]
    trnf /= np.tile(trnf.sum(axis=1).reshape(-1, 1), (1, trnf.shape[1]))
    if len(np.unique(Y.sum(axis=1))) == 1:
        # Constant label cardinality: nothing to learn.
        lr = DummyClf()
    else:
        lr = self.logistic_regression_
    lr.train(Dataset(trnf, Y.sum(axis=1)))

    # Same normalization for the pool; keep the pre-sort column order.
    idx_poolf = np.argsort(poolf, axis=1)[:, ::-1]
    poolf = np.sort(poolf, axis=1)[:, ::-1]
    poolf /= np.tile(poolf.sum(axis=1).reshape(-1, 1), (1, poolf.shape[1]))
    pred_num_lbl = lr.predict(poolf).astype(int)

    # Predicted assignment: +1 for each sample's top-k labels, -1 else.
    yhat = -1 * np.ones((len(X_pool), self.n_labels), dtype=int)
    for i, p in enumerate(pred_num_lbl):
        yhat[i, idx_poolf[i, :p]] = 1

    # Hinge-style disagreement of the assignment against f, summed over
    # labels; larger disagreement means a more valuable query.
    score = ((1 - yhat * f) / 2).sum(axis=1)
    ask_id = self.random_state_.choice(np.where(score == np.max(score))[0])

    return unlabeled_entry_ids[ask_id]
def make_query(self):
    """Query the sample with the largest absolute hint-SVM decision
    value, hinting with a random fraction of the unlabeled pool.

    Returns
    -------
    ask_id : int
        Entry id of the queried sample.
    """
    dataset = self.dataset
    unlabeled_entry_ids, unlabeled_pool = zip(
        *dataset.get_unlabeled_entries())
    labeled_pool, y = zip(*dataset.get_labeled_entries())

    cl, ch, p = self.cl, self.ch, self.p

    n_unlabeled = len(unlabeled_pool)
    hint_pool_idx = self.random_state_.choice(
        n_unlabeled, int(n_unlabeled * p))
    hint_pool = np.array(unlabeled_pool)[hint_pool_idx]

    n_hint = len(hint_pool)
    weight = [1.0] * len(labeled_pool) + [ch / cl] * n_hint
    y = list(y) + [0] * n_hint
    X = [x.tolist() for x in labeled_pool] \
        + [x.tolist() for x in hint_pool]

    p_val = hintsvm_query(
        np.array(X), np.array(y), np.array(weight),
        np.array([x.tolist() for x in unlabeled_pool]),
        self.svm_params)

    scores = [abs(float(v[0])) for v in p_val]
    return unlabeled_entry_ids[int(np.argmax(scores))]
def __init__(self, dataset, classes, active_selecting=True,
             subsample_qs=None, random_state=None):
    """Build the cluster tree used by hierarchical sampling.

    Parameters
    ----------
    dataset : Dataset
        The dataset to sample from; entries that already carry a label
        are folded in via ``update`` at the end.
    classes : sequence
        All possible label values.
    active_selecting : bool
        Whether pruning selection is score-driven rather than random.
    subsample_qs : QueryStrategy, optional
        Optional sub-strategy used to pick samples inside a pruning.
    random_state : int, RandomState or None
        Seed for reproducible queries.

    Raises
    ------
    TypeError
        If ``subsample_qs`` is given but is not a QueryStrategy.
    """
    super(HierarchicalSampling, self).__init__(dataset)
    X = np.array(next(zip(*self.dataset.get_entries())))
    # Agglomerative clustering yields a binary merge tree over the n
    # samples: children_[i] holds the two nodes merged into node n + i.
    cluster = AgglomerativeClustering()
    cluster.fit(X)
    childrens = cluster.children_
    if subsample_qs is not None:
        if not isinstance(subsample_qs, QueryStrategy):
            raise TypeError("subsample_qs has to be a QueryStrategy")
        self.sub_qs = subsample_qs
    else:
        self.sub_qs = None

    self.active_selecting = active_selecting
    self.random_state_ = seed_random_state(random_state)
    # n leaves (samples); m = 2n - 1 total tree nodes.
    self.n = len(childrens) + 1
    self.m = self.n * 2 - 1
    self.num_class = len(classes)
    self.classes = list(classes)
    self.class_id = dict(zip(self.classes, range(self.num_class)))

    self.parent = np.full(self.m, NO_NODE, dtype=int)
    self.size = np.zeros(self.m, dtype=int)
    self.depth = np.zeros(self.m, dtype=int)
    for i, (left_child, right_child) in enumerate(childrens):
        parent = i + self.n
        self.parent[left_child] = parent
        self.parent[right_child] = parent
    # Leaves have no children; internal node n + i has childrens[i].
    self.left_child = np.concatenate(
        [np.full(self.n, NO_NODE), childrens[:,0]]).astype(int)
    self.right_child = np.concatenate(
        [np.full(self.n, NO_NODE), childrens[:,1]]).astype(int)

    # Walk each leaf up to the root, accumulating subtree sizes and the
    # maximum leaf depth observed below every node.
    for i in range(self.n):
        node = i
        cur_depth = 0
        while node != NO_NODE:
            assert node >= 0 and node < self.m
            self.size[node] += 1
            self.depth[node] = max(self.depth[node], cur_depth)
            cur_depth += 1
            node = self.parent[node]

    # Per-node label statistics and admissibility bookkeeping.
    self.count = np.zeros((self.m, self.num_class), dtype=int)
    self.total = np.zeros(self.m, dtype=int)
    self.upper_bound = np.ones((self.m, self.num_class), dtype=float)
    self.lower_bound = np.zeros((self.m, self.num_class), dtype=float)
    self.admissible = np.zeros((self.m, self.num_class), dtype=bool)
    self.best_label = np.full(self.m, NO_LABEL, dtype=int)
    self.split = np.zeros(self.m, dtype=bool)
    self.cost = self.size.copy()
    # Start with a single pruning at the root node.
    self.prunings = [self.m-1]

    # Fold in any labels the dataset already carries.
    # NOTE(review): "!= None" — prefer "is not None"; kept as-is here.
    for i, entry in enumerate(self.dataset.data):
        if entry[1] != None:
            self.update(i, entry[1])
def import_scipy_mat(filename):
    """Load a MATLAB file holding 'X' and 'y' into a shuffled Dataset.

    Parameters
    ----------
    filename : str
        Path to a ``.mat`` file containing variables ``X`` and ``y``.

    Returns
    -------
    Dataset
        The shuffled samples wrapped in a Dataset.
    """
    from scipy.io import loadmat
    mat = loadmat(filename)
    # Shuffle features and labels in lockstep by pairing the rows first.
    pairs = list(zip(mat['X'], mat['y']))
    np.random.shuffle(pairs)
    X, y = zip(*pairs)
    return Dataset(np.array(X), np.array(y).reshape(-1))
def _get_scores(self):
    """Compute an uncertainty score for every unlabeled entry.

    Returns
    -------
    zip of (entry_id, score)
        Pairs of unlabeled entry id and score; larger means more
        uncertain (more valuable to query).
    """
    dataset = self.dataset
    _, y = dataset.get_entries()
    unlabeled_entry_ids, X_pool = dataset.get_unlabeled_entries()
    # Default before any label exists: a single random score.
    # NOTE(review): when y has no labels the score list has length 1
    # regardless of pool size — confirm callers tolerate the short zip.
    dvalue, score = (None, [random()])
    if y.any():
        self.model.train(dataset)
        if isinstance(self.model, ProbabilisticModel):
            dvalue = self.model.predict_proba(X_pool)
        elif isinstance(self.model, ContinuousModel):
            dvalue = self.model.predict_real(X_pool)
        if self.method == 'lc':  # least confident
            score = -np.max(dvalue, axis=1)
        elif self.method == 'sm':  # smallest margin
            if np.shape(dvalue)[1] > 2:
                # Find 2 largest decision values
                dvalue = -(np.partition(-dvalue, 2, axis=1)[:, :2])
            score = -np.abs(dvalue[:, 0] - dvalue[:, 1])
        elif self.method == 'entropy':
            score = np.sum(-dvalue * np.log(dvalue), axis=1)
    return zip(unlabeled_entry_ids, score)
def make_query(self):
    """Query the unlabeled sample the committee disagrees on the most.

    Disagreement is measured either by vote entropy ('vote') or by
    average KL divergence ('kl_divergence'); ties are broken uniformly
    at random.

    Returns
    -------
    ask_id : int
        Entry id of the queried sample.
    """
    dataset = self.dataset
    unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries())

    if self.disagreement == 'vote':
        # One hard prediction per committee member.
        votes = np.zeros((len(X_pool), len(self.students)))
        for col, student in enumerate(self.students):
            votes[:, col] = student.predict(X_pool)
        scores = self._vote_disagreement(votes)
    elif self.disagreement == 'kl_divergence':
        # Stack per-student probabilities: (n_samples, n_students, n_classes).
        proba = np.array([
            student.predict_proba(X_pool) for student in self.students
        ]).transpose(1, 0, 2).astype(float)
        scores = self._kl_divergence_disagreement(proba)
    ask_idx = self.random_state_.choice(
        np.where(np.isclose(scores, np.max(scores)))[0])
    return unlabeled_entry_ids[ask_idx]
def _labeled_uniform_sample(self, sample_size):
    """Draw ``sample_size`` labeled entries uniformly (with replacement)
    and wrap them in a new Dataset."""
    labeled_entries = self.dataset.get_labeled_entries()
    n = len(labeled_entries)
    samples = [labeled_entries[self.random_state_.randint(0, n)]
               for _ in range(sample_size)]
    return Dataset(*zip(*samples))
def make_query(self):
    """Uncertainty-sampling query: least confident ('lc'), smallest
    margin ('sm'), or maximum entropy ('entropy').

    Returns
    -------
    ask_id : int
        Entry id of the most uncertain unlabeled sample.
    """
    dataset = self.dataset
    self.model.train(dataset)
    unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries())

    if isinstance(self.model, ProbabilisticModel):
        dvalue = self.model.predict_proba(X_pool)
    elif isinstance(self.model, ContinuousModel):
        dvalue = self.model.predict_real(X_pool)

    if self.method == 'lc':
        # Least confident: smallest top-class value.
        ask_id = np.argmin(np.max(dvalue, axis=1))
    elif self.method == 'sm':
        # Smallest margin between the two largest decision values.
        if np.shape(dvalue)[1] > 2:
            dvalue = -(np.partition(-dvalue, 2, axis=1)[:, :2])
        ask_id = np.argmin(np.abs(dvalue[:, 0] - dvalue[:, 1]))
    elif self.method == 'entropy':
        ask_id = np.argmax(np.sum(-dvalue * np.log(dvalue), axis=1))
    return unlabeled_entry_ids[ask_id]
def make_query(self):
    """Return the entry id with the highest score, ties broken randomly.

    Returns
    -------
    ask_id : int
        Entry id of a max-scoring unlabeled sample.
    """
    unlabeled_entry_ids, scores = zip(*self._get_scores())
    # ``zip`` yields a tuple; comparing a tuple against a scalar with
    # ``==`` produces a single False, so np.where returned an empty index
    # array and random_state_.choice crashed. Convert to an ndarray so
    # the comparison is elementwise.
    scores = np.asarray(scores)
    ask_id = self.random_state_.choice(
        np.where(scores == np.max(scores))[0])
    return unlabeled_entry_ids[ask_id]
def run(trn_ds, tst_ds, lbr, model, qs, quota, j):
    """Run an active-learning loop, querying roughly ``j`` labels in 15
    equal batches, and track the test error after each batch.

    Parameters
    ----------
    trn_ds, tst_ds : Dataset
        Training pool (updated in place) and held-out test set.
    lbr : Labeler
        Oracle answering label queries.
    model : Model
        Classifier retrained after every batch.
    qs : QueryStrategy
        Strategy supplying the next entry id to label.
    quota : unused
        Kept for interface compatibility.
    j : int
        Total label budget, split into 15 batches of ``k`` queries.

    Returns
    -------
    (E_out, n_points, trn_ds, k, j)
        Test-error trace, its length, the updated pool, batch size, and
        the original budget.
    """
    # (The previously collected E_in, counter and l locals were never
    # used and have been removed.)
    model.train(trn_ds)
    E_out = [1 - model.score(tst_ds)]
    steps = 15
    k = int(j / steps)
    if k == 0:
        k = 1  # always query at least one label per batch
    for _ in range(0, steps):
        for _ in range(0, k):
            ask_id = qs.make_query()
            X, _ = zip(*trn_ds.data)
            lb = lbr.label(X[ask_id])
            trn_ds.update(ask_id, lb)
        model.train(trn_ds)
        E_out = np.append(E_out, 1 - model.score(tst_ds))
    return E_out, len(E_out), trn_ds, k, j
def make_query(self, n=1, return_score=False):
    """Return the indexes of the samples to be queried and labeled and,
    optionally, the selection score of each sample. Read-only.

    No modification to the internal states.

    Parameters
    ----------
    n : int, optional (default=1)
        Number of entries to query, best first.
    return_score : bool, optional (default=False)
        Also return the (entry_id, score) list for every unlabeled
        entry. (Previously this flag was accepted but silently ignored,
        contradicting the docstring.)

    Returns
    -------
    ask_ids : list
        The batch of indexes of the next unlabeled samples to be
        queried and labeled.

    score : list of (index, score) tuple
        Selection score of unlabled entries, the larger the better.
    """
    unlabeled_entry_ids, scores = zip(*self._get_scores())
    # Indices of the n largest scores, best first.
    ask_ids = np.argsort(scores)[-n:][::-1]
    res = [unlabeled_entry_ids[i] for i in ask_ids]
    if return_score:
        return res, list(zip(unlabeled_entry_ids, scores))
    return res
def make_query(self, return_score=False, return_label=False,
               n_instances=1):
    """Return the indexes of the samples to be queried (batch mode),
    optionally with their scores and/or predicted labels. Read-only.

    No modification to the internal states.

    Parameters
    ----------
    return_score : bool, optional (default=False)
        Also return the uncertainty scores of the chosen entries.
    return_label : bool, optional (default=False)
        Also return the model's predicted labels of the chosen entries.
    n_instances : int, optional (default=1)
        Number of entries to query at once.

    Returns
    -------
    ask_ids : numpy array of int
        The indexes of the next unlabeled samples to be queried.
    score : numpy array, optional
        Scores of the chosen entries (when ``return_score``).
    pre_label : numpy array, optional
        Predicted labels of the chosen entries (when ``return_label``).
    """
    dataset = self.dataset
    # NOTE(review): training is intentionally disabled here; the model
    # is presumably trained by the caller — confirm before reuse.
    # self.model.train(dataset)
    unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries())
    X_pool = np.array(X_pool)
    unlabeled_entry_ids = np.array(unlabeled_entry_ids)

    if isinstance(self.model, ProbabilisticModel):
        dvalue = self.model.predict_proba(X_pool)
    elif isinstance(self.model, ContinuousModel):
        dvalue = self.model.predict_real(X_pool)

    if self.method == 'lc':  # least confident
        score = -np.max(dvalue, axis=1)
    elif self.method == 'sm':  # smallest margin
        if np.shape(dvalue)[1] > 2:
            # Find 2 largest decision values
            dvalue = -(np.partition(-dvalue, 2, axis=1)[:, :2])
        score = -np.abs(dvalue[:, 0] - dvalue[:, 1])
    elif self.method == 'entropy':
        score = np.sum(-dvalue * np.log(dvalue), axis=1)

    # Top-n_instances entries by score, best first.
    ask_ids = multi_argmax(score, n_instances=n_instances)

    if return_score and return_label:
        pre_label = np.argmax(dvalue, axis=1)
        return unlabeled_entry_ids[ask_ids], score[ask_ids], pre_label[
            ask_ids]
    elif return_score:
        return unlabeled_entry_ids[ask_ids], score[ask_ids]
    elif return_label:
        pre_label = np.argmax(dvalue, axis=1)
        return unlabeled_entry_ids[ask_ids], pre_label[ask_ids]
    else:
        return unlabeled_entry_ids[ask_ids]
def retrieve_score_list(self):
    """Map every unlabeled entry id to its committee-disagreement score.

    Returns
    -------
    dict
        ``{entry_id: score}``, larger meaning stronger disagreement.
    """
    dataset = self.dataset
    unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries())

    if self.disagreement == 'vote':
        # One hard vote per student.
        votes = np.zeros((len(X_pool), len(self.students)))
        for col, student in enumerate(self.students):
            votes[:, col] = student.predict(X_pool)
        score_list = self._vote_disagreement(votes)
    elif self.disagreement == 'kl_divergence':
        # (n_samples, n_students, n_classes) probability stack.
        proba = np.array([
            student.predict_proba(X_pool) for student in self.students
        ]).transpose(1, 0, 2).astype(float)
        score_list = self._kl_divergence_disagreement(proba)
    return dict(zip(unlabeled_entry_ids, score_list))
def make_query(self):
    """Query the sample whose predicted label-embedding is farthest
    from its nearest known class embedding.

    Returns
    -------
    ask_id : int
        Entry id of the queried sample (ties broken at random).
    """
    dataset = self.dataset
    unlabeled_entry_ids, pool_X = zip(*dataset.get_unlabeled_entries())
    # The input class should be 0-n_classes
    X, y = zip(*dataset.get_labeled_entries())

    # Regress each embedding dimension on the labeled data, then predict
    # the embedding of every pooled sample.
    pred_embed = np.zeros((len(pool_X), self.embed_dim))
    for dim in range(self.embed_dim):
        self.regressors[dim].fit(X, self.class_embed[y, dim])
        pred_embed[:, dim] = self.regressors[dim].predict(pool_X)

    # Distance to the nearest class embedding; query the farthest.
    dist = self.nn_.kneighbors(pred_embed)[0][:, 0]
    ask_idx = self.random_state_.choice(
        np.where(np.isclose(dist, np.max(dist)))[0])
    return unlabeled_entry_ids[ask_idx]
def make_query(self, n=1):
    """Pick ``n`` distinct unlabeled entry ids uniformly at random.

    Parameters
    ----------
    n : int, optional (default=1)
        Number of entries to query.

    Returns
    -------
    int or list of int
        A single entry id when ``n == 1``, otherwise a list of ids.
    """
    ids, _ = zip(*self.dataset.get_unlabeled_entries())
    chosen = self.random_state_.choice(ids, size=n, replace=False)
    if n > 1:
        return list(chosen)
    return chosen[0]
def make_query(self):
    """Return up to ``batch_size`` unlabeled entry ids in shuffled order.

    Returns
    -------
    entry_id : sequence
        The first ``batch_size`` ids of the shuffled unlabeled pool
        (fewer when the pool is smaller).
    """
    unlabeled_entry_ids, _ = zip(*self.dataset.get_unlabeled_entries())
    shuffled = shuffle(unlabeled_entry_ids)
    n_query = min(len(shuffled), self.batch_size)
    return shuffled[:n_query]
def __init__(self, *args, **kwargs):
    """Active Learning By Learning (ALBL) meta-strategy.

    Keyword Arguments
    -----------------
    query_strategies : list of QueryStrategy
        Candidate strategies; each must share this object's dataset.
    T : int
        Query budget (number of queries ALBL may spend).
    delta : float, optional (default=0.1)
        Exp4.P confidence parameter.
    uniform_sampler : bool, optional (default=True)
        Whether a uniform random sampler joins the candidate pool.
    pmin : float, optional
        Lower bound on each strategy's sampling probability.
    model : Model
        Classifier retrained along the querying process.
    random_state : int, RandomState or None
        Seed for reproducibility.

    Raises
    ------
    TypeError
        On a missing required keyword argument.
    ValueError
        On inconsistent argument values.
    """
    super(ActiveLearningByLearning, self).__init__(*args, **kwargs)
    self.query_strategies_ = kwargs.pop("query_strategies", None)
    if self.query_strategies_ is None:
        raise TypeError("__init__() missing required keyword-only argument: "
                        "'query_strategies'")
    elif not self.query_strategies_:
        raise ValueError("query_strategies list is empty")

    # check if query_strategies share the same dataset with albl
    for qs in self.query_strategies_:
        if qs.dataset != self.dataset:
            raise ValueError("query_strategies should share the same"
                             "dataset instance with albl")

    # parameters for Exp4.p
    self.delta = kwargs.pop("delta", 0.1)

    # query budget
    self.T = kwargs.pop("T", None)
    if self.T is None:
        raise TypeError("__init__() missing required keyword-only argument: 'T'")

    self.unlabeled_entry_ids, _ = zip(*self.dataset.get_unlabeled_entries())
    # Map entry id -> position in the unlabeled pool, used by Exp4.P.
    self.unlabeled_invert_id_idx = {}
    for i, entry in enumerate(self.dataset.get_unlabeled_entries()):
        self.unlabeled_invert_id_idx[entry[0]] = i

    self.uniform_sampler = kwargs.pop("uniform_sampler", True)
    if not isinstance(self.uniform_sampler, bool):
        raise ValueError("'uniform_sampler' should be {True, False}")

    self.pmin = kwargs.pop("pmin", None)
    # The bool uniform_sampler counts as one extra algorithm.
    n_algorithms = len(self.query_strategies_) + self.uniform_sampler
    # NOTE(review): this rejects pmin < 1/n_algorithms, yet the error
    # message implies pmin should be BELOW that bound — confirm which
    # direction is intended before changing.
    if self.pmin and (self.pmin < (1.0 / n_algorithms) or self.pmin < 0):
        raise ValueError("'pmin' should be 0 < pmin < "
                         "1/len(n_active_algorithm)")

    self.exp4p_ = Exp4P(
        query_strategies=self.query_strategies_,
        T=self.T,
        delta=self.delta,
        pmin=self.pmin,
        unlabeled_invert_id_idx=self.unlabeled_invert_id_idx,
        uniform_sampler=self.uniform_sampler,
    )
    self.budget_used = 0

    # classifier instance
    self.model = kwargs.pop("model", None)
    if self.model is None:
        raise TypeError("__init__() missing required keyword-only argument: 'model'")

    random_state = kwargs.pop("random_state", None)
    self.random_state_ = seed_random_state(random_state)

    # Per-query importance weights and history of queried entries.
    self.query_dist = None
    self.W = []
    self.queried_hist_ = []
def make_query(self):
    """Query the unlabeled sample with the highest committee
    disagreement, breaking ties uniformly at random.

    Returns
    -------
    ask_id : int
        Entry id of the queried sample.
    """
    dataset = self.dataset
    unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries())

    # Gather every student's hard prediction on the pool.
    votes = np.zeros((len(X_pool), len(self.students)))
    for col, student in enumerate(self.students):
        votes[:, col] = student.predict(X_pool)

    # Pair each entry id with its disagreement, highest first.
    ranked = sorted(zip(unlabeled_entry_ids, self.disagreement(votes)),
                    key=lambda pair: pair[1], reverse=True)

    top_score = ranked[0][1]
    ask_id = self.random_state_.choice(
        [eid for eid, dis in ranked if dis == top_score])
    return ask_id
def make_query(self, return_score=False):
    """Return the id of the sample to be queried and, optionally, the
    selection score of every unlabeled entry. Read-only.

    No modification to the internal states.

    Parameters
    ----------
    return_score : bool, optional (default=False)
        Also return the (entry_id, score) pairs.

    Returns
    -------
    ask_id : int
        The index of the next unlabeled sample to be queried and
        labeled.

    score : list of (index, score) tuple
        Selection score of unlabled entries, the larger the better.
    """
    dataset = self.dataset
    self.model.train(dataset)
    unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries())

    if isinstance(self.model, ProbabilisticModel):
        dvalue = self.model.predict_proba(X_pool)
    elif isinstance(self.model, ContinuousModel):
        dvalue = self.model.predict_real(X_pool)

    if self.method == 'lc':
        # Least confident: negate the top-class value.
        score = -np.max(dvalue, axis=1)
    elif self.method == 'sm':
        # Smallest margin between the two largest decision values.
        if np.shape(dvalue)[1] > 2:
            dvalue = -(np.partition(-dvalue, 2, axis=1)[:, :2])
        score = -np.abs(dvalue[:, 0] - dvalue[:, 1])
    elif self.method == 'entropy':
        score = np.sum(-dvalue * np.log(dvalue), axis=1)

    ask_id = np.argmax(score)
    if return_score:
        return unlabeled_entry_ids[ask_id], \
               list(zip(unlabeled_entry_ids, score))
    return unlabeled_entry_ids[ask_id]
def make_query(self):
    """Multi-label query by disagreement between the major and the
    auxiliary learner.

    Criteria:
      * 'hlr'  — Hamming loss reduction: mean absolute prediction gap.
      * 'mmr'  — maximum margin reduction.
      * 'shlr' — soft Hamming loss reduction with clipping bound ``b``.

    Returns
    -------
    ask_id : int
        Entry id of a maximal-score sample (ties broken at random).

    Raises
    ------
    TypeError
        If ``self.criterion`` is not one of 'hlr', 'mmr', 'shlr'.
    AttributeError
        If the auxiliary learner has neither ``predict_real`` nor
        ``predict_proba``.
    """
    dataset = self.dataset
    # (The previously-fetched labeled pool and the per-sample mean loss
    # were computed but never used; both have been removed.)
    unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries())

    major_clf = copy.deepcopy(self.major_learner)
    major_clf.train(dataset)
    aux_clf = copy.deepcopy(self.auxiliary_learner)
    aux_clf.train(dataset)

    if self.criterion == 'hlr':
        major_pred = major_clf.predict(X_pool)
        aux_pred = aux_clf.predict(X_pool)
        score = np.abs(major_pred - aux_pred).mean(axis=1)
    elif self.criterion in ['mmr', 'shlr']:
        # Scale predictions to [-1, 1] (assumes 0/1 indicator labels).
        major_pred = major_clf.predict(X_pool) * 2 - 1
        if 'predict_real' in dir(aux_clf):
            aux_pred = aux_clf.predict_real(X_pool)
        elif 'predict_proba' in dir(aux_clf):
            aux_pred = aux_clf.predict_proba(X_pool) * 2 - 1
        else:
            raise AttributeError("aux_learner did not support either"
                                 "'predict_real' or 'predict_proba'"
                                 "method")
        if self.criterion == 'mmr':
            score = np.sum((1. - major_pred * aux_pred) / 2., axis=1)
        elif self.criterion == 'shlr':
            b = self.b
            score = np.sum(
                (b - np.clip(major_pred * aux_pred, -b, b)) / 2. / b,
                axis=1)
    else:
        raise TypeError(
            "supported criterion are ['hlr', 'shlr', 'mmr'], the given "
            "one is: " + self.criterion
        )
    ask_id = self.random_state_.choice(np.where(score == np.max(score))[0])
    return unlabeled_entry_ids[ask_id]
def make_query(self):
    """Expected error reduction query, parallelized over the pool.

    Returns
    -------
    ask_id : int
        Entry id whose inclusion minimizes the estimated error.
    """
    Xlabeled, y = zip(*self.dataset.get_labeled_entries())
    Xlabeled = np.array(Xlabeled)
    y = list(y)
    unlabeled_entry_ids, X_pool = zip(
        *self.dataset.get_unlabeled_entries())
    label_count = self.dataset.get_num_of_labels()

    clf = copy.copy(self.model)
    clf.train(Dataset(Xlabeled, y))

    # Estimate the expected error of adding each candidate in parallel.
    workers = Pool(self.n_jobs)
    errors = workers.map(_E, [(Xlabeled, y, x, clf, label_count,
                               self.sigma, self.model) for x in X_pool])
    workers.terminate()

    # The smallest expected error wins.
    return unlabeled_entry_ids[errors.index(min(errors))]
def make_query(self):
    """Multi-label query by major/auxiliary learner disagreement
    ('hlr', 'mmr' or 'shlr' criterion).

    Returns
    -------
    ask_id : int
        Entry id of a maximal-score sample (ties broken at random).

    Raises
    ------
    TypeError
        If ``self.criterion`` is not one of 'hlr', 'mmr', 'shlr'.
    AttributeError
        If the auxiliary learner has neither ``predict_real`` nor
        ``predict_proba``.
    """
    dataset = self.dataset
    # (The previously-fetched labeled pool and the per-sample mean loss
    # were computed but never used; both have been removed.)
    unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries())

    major_clf = copy.deepcopy(self.major_learner)
    major_clf.train(dataset)
    aux_clf = copy.deepcopy(self.auxiliary_learner)
    aux_clf.train(dataset)

    if self.criterion == 'hlr':
        # Hamming loss reduction: mean absolute prediction gap.
        major_pred = major_clf.predict(X_pool)
        aux_pred = aux_clf.predict(X_pool)
        score = np.abs(major_pred - aux_pred).mean(axis=1)
    elif self.criterion in ['mmr', 'shlr']:
        # Scale predictions to [-1, 1] (assumes 0/1 indicator labels).
        major_pred = major_clf.predict(X_pool) * 2 - 1
        if 'predict_real' in dir(aux_clf):
            aux_pred = aux_clf.predict_real(X_pool)
        elif 'predict_proba' in dir(aux_clf):
            aux_pred = aux_clf.predict_proba(X_pool) * 2 - 1
        else:
            raise AttributeError("aux_learner did not support either"
                                 "'predict_real' or 'predict_proba'"
                                 "method")
        if self.criterion == 'mmr':
            score = np.sum((1. - major_pred * aux_pred) / 2., axis=1)
        elif self.criterion == 'shlr':
            b = self.b
            score = np.sum(
                (b - np.clip(major_pred * aux_pred, -b, b)) / 2. / b,
                axis=1)
    else:
        raise TypeError(
            "supported criterion are ['hlr', 'shlr', 'mmr'], the given "
            "one is: " + self.criterion)
    ask_id = self.random_state_.choice(np.where(score == np.max(score))[0])
    return unlabeled_entry_ids[ask_id]
def make_query(self, n_instances=1):
    """Sample ``n_instances`` distinct unlabeled entry ids uniformly.

    Parameters
    ----------
    n_instances : int, optional (default=1)
        Number of entries to query.

    Returns
    -------
    entry_ids : numpy array of int
        The ids of the sampled unlabeled entries.
    """
    dataset = self.dataset
    unlabeled_entry_ids, _ = zip(*dataset.get_unlabeled_entries())
    unlabeled_entry_ids = np.array(unlabeled_entry_ids)
    # Use the strategy's seeded RNG instead of the global ``random``
    # module so queries are reproducible under a fixed random_state.
    picks = self.random_state_.choice(
        len(unlabeled_entry_ids), size=n_instances, replace=False)
    return unlabeled_entry_ids[picks]
def _get_scores(self):
    """Weight the base strategy's scores by each sample's similarity to
    its cluster center (density weighting with exponent ``beta``).

    Returns
    -------
    zip of (entry_id, score)
        Density-weighted scores for every unlabeled entry.
    """
    dataset = self.dataset
    X, _ = zip(*dataset.data)
    # Preserve call order: base scores first, then the unlabeled pool.
    base_pairs = self.base_query_strategy._get_scores()
    _, X_pool = zip(*dataset.get_unlabeled_entries())
    unlabeled_entry_ids, base_scores = zip(*base_pairs)

    # Cluster all data, then locate each pooled sample's center.
    self.clustering_method.fit(X)
    pool_cluster = self.clustering_method.predict(X_pool)
    cluster_center = self.clustering_method.cluster_centers_

    similarity = np.asarray([
        self.similarity_metric(
            X_pool[i].reshape(1, -1),
            cluster_center[pool_cluster[i]].reshape(1, -1))[0][0]
        for i in range(len(X_pool))
    ])

    scores = base_scores * similarity**self.beta
    return zip(unlabeled_entry_ids, scores)
def make_query(self):
    """Query the point maximizing the 95% upper-confidence half-width
    minus the absolute predicted mean (high variance near the boundary).

    Returns
    -------
    ask_id : int
        Entry id of the selected sample.
    """
    dataset = self.dataset
    self.model.fit(*(dataset.format_sklearn()))
    unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries())

    mean, var = self.model.predict_mean_var(X_pool)
    # 1.96 * sigma is the 95% half-width of the predictive interval.
    uncertainty = 1.96 * np.sqrt(var) - np.abs(mean)
    return unlabeled_entry_ids[np.argmax(uncertainty)]
def make_query(self):
    """Choose the unlabeled entry whose estimated expected error after
    labeling is the smallest.

    Returns
    -------
    ask_id : int
        Entry id of the chosen sample.
    """
    X_train, y_train = zip(*self.dataset.get_labeled_entries())
    X_train = np.array(X_train)
    y_train = list(y_train)
    unlabeled_entry_ids, X_pool = zip(
        *self.dataset.get_unlabeled_entries())
    n_labels = self.dataset.get_num_of_labels()

    model_copy = copy.copy(self.model)
    model_copy.train(Dataset(X_train, y_train))

    # Fan the per-candidate error estimation out over worker processes.
    pool = Pool(self.n_jobs)
    errors = pool.map(
        _E,
        [(X_train, y_train, x, model_copy, n_labels, self.sigma,
          self.model) for x in X_pool])
    pool.terminate()

    best = errors.index(min(errors))
    return unlabeled_entry_ids[best]
def make_query(self):
    """Query the entry the trained students disagree on the most; ties
    are resolved with the seeded RNG.

    Returns
    -------
    ask_id : int
        Entry id of the queried sample.
    """
    unlabeled_entry_ids, X_pool = zip(
        *self.dataset.get_unlabeled_entries())

    # Collect one hard prediction per student.
    votes = np.zeros((len(X_pool), len(self.students)))
    for idx, student in enumerate(self.students):
        votes[:, idx] = student.predict(X_pool)

    # Rank entries by disagreement, highest first.
    pairs = sorted(zip(unlabeled_entry_ids, self.disagreement(votes)),
                   key=lambda p: p[1], reverse=True)
    best = pairs[0][1]
    candidates = [eid for eid, s in pairs if s == best]
    return self.random_state_.choice(candidates)
def make_query(self, return_score=False, n_instances=2):
    """Return the indexes of the samples to be queried (batch mode) and,
    optionally, the selection score of each sample. Read-only.

    No modification to the internal states.

    Parameters
    ----------
    return_score : bool, optional (default=False)
        Also return the (entry_id, score) pairs for the whole pool.
    n_instances : int, optional (default=2)
        Number of entries to query at once.

    Returns
    -------
    ask_ids : numpy array of int
        The indexes of the next unlabeled samples to be queried.

    score : list of (index, score) tuple
        Selection score of unlabled entries, the larger the better.
    """
    # _get_scores() already walks the unlabeled pool; the previous extra
    # get_unlabeled_entries() pass was immediately overwritten (dead
    # code) and has been removed, along with the commented-out
    # single-query variant.
    unlabeled_entry_ids, scores = zip(*self._get_scores())
    unlabeled_entry_ids = np.array(unlabeled_entry_ids)
    scores = np.array(scores)

    # Top-n_instances entries by score.
    ask_ids = multi_argmax(scores, n_instances=n_instances)

    if return_score:
        return unlabeled_entry_ids[ask_ids], \
               list(zip(unlabeled_entry_ids, scores))
    return unlabeled_entry_ids[ask_ids]
def make_query(self):
    """Query the entry closest to any per-label decision boundary
    (binary version-space minimization across labels).

    Returns
    -------
    ask_id : int
        Entry id of the chosen sample (ties broken at random).
    """
    dataset = self.dataset
    X, Y = zip(*dataset.get_labeled_entries())
    Y = np.array(Y)
    unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries())
    X_pool = np.array(X_pool)

    # One binary classifier per label; fall back to a constant
    # classifier when a label column carries a single value.
    boundaries = []
    for label_idx in range(self.n_labels):
        column = Y[:, label_idx]
        if len(np.unique(column)) == 1:
            clf = DummyClf()
        else:
            clf = copy.deepcopy(self.base_clf)
        clf.train(Dataset(X, column))
        boundaries.append(np.abs(clf.predict_real(X_pool)[:, 1]))

    # Candidates attaining the global minimum boundary distance.
    boundaries = np.array(boundaries)
    candidates = np.where(boundaries == np.min(boundaries))[1]
    ask_id = self.random_state_.choice(candidates)
    return unlabeled_entry_ids[ask_id]
def format_sklearn(self):
    """
    Returns dataset in (X, y) format for use in scikit-learn.

    Unlabeled entries are ignored.

    Returns
    -------
    X : numpy array, shape = (n_samples, n_features)
        Sample feature set.

    y : numpy array, shape = (n_samples)
        Sample labels.
    """
    features, labels = zip(*self.get_labeled_entries())
    return np.array(features), np.array(labels)
def labeled_uniform_sample(self, sample_size, replace=True):
    """Returns a Dataset object with labeled data only, which is
    resampled uniformly with given sample size.

    Parameters
    ----------
    sample_size : int
        Number of labeled entries to draw.
    replace : bool, optional (default=True)
        Sample with replacement when True, without otherwise.

    Returns
    -------
    Dataset
        A new Dataset built from the sampled entries.
    """
    pool = self.get_labeled_entries()
    if replace:
        samples = [random.choice(pool) for _ in range(sample_size)]
    else:
        samples = random.sample(pool, sample_size)
    return Dataset(*zip(*samples))
def make_query(self):
    """Uncertainty-sampling query supporting 'lc' (least confident) and
    'sm' (smallest margin).

    Returns
    -------
    ask_id : int
        Entry id of the most uncertain unlabeled sample.

    Raises
    ------
    ValueError
        If ``self.method`` is neither 'lc' nor 'sm'. (Previously an
        unsupported method left ``ask_id`` unbound and surfaced as an
        opaque NameError.)
    """
    dataset = self.dataset
    self.model.train(dataset)
    unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries())

    if self.method == "lc":  # least confident
        ask_id = np.argmin(np.max(self.model.predict_real(X_pool), axis=1))
    elif self.method == "sm":  # smallest margin
        dvalue = self.model.predict_real(X_pool)
        if np.shape(dvalue)[1] > 2:
            # Find 2 largest decision values
            dvalue = -(np.partition(-dvalue, 2, axis=1)[:, :2])
        margin = np.abs(dvalue[:, 0] - dvalue[:, 1])
        ask_id = np.argmin(margin)
    else:
        raise ValueError("method should be 'lc' or 'sm', got %r"
                         % self.method)
    return unlabeled_entry_ids[ask_id]
def make_query(self):
    """Query an entry id according to the Exp4.P query distribution.

    Repeatedly samples from the distribution; ids that are no longer
    unlabeled are fed back via ``update`` and the draw is retried until
    the budget runs out.

    Returns
    -------
    ask_id : int or None
        Entry id of the queried sample, or None when the dataset has no
        unlabeled entries left.

    Raises
    ------
    ValueError
        When the query budget ``T`` is exhausted.
    """
    dataset = self.dataset
    try:
        unlabeled_entry_ids, _ = zip(*dataset.get_unlabeled_entries())
    except ValueError:
        # might be no more unlabeled data left
        return

    while self.budget_used < self.T:
        self.calc_query()
        # Sample a pool position from the current query distribution.
        ask_idx = self.random_state_.choice(
            np.arange(len(self.unlabeled_invert_id_idx)),
            size=1,
            p=self.query_dist
        )[0]
        ask_id = self.unlabeled_entry_ids[ask_idx]

        if ask_id in unlabeled_entry_ids:
            self.budget_used += 1
            return ask_id
        else:
            # Already labeled since the pool snapshot was taken: report
            # its known label back to the bandit and draw again.
            self.update(ask_id, dataset.data[ask_id][1])

    raise ValueError("Out of query budget")
def __init__(self, X=None, y=None):
    """Initialize the dataset with optional features and labels.

    Parameters
    ----------
    X : sequence of samples, optional (default: empty)
    y : sequence of labels (None for unlabeled), optional (default: empty)
    """
    # None sentinels avoid the shared-mutable-default pitfall.
    features = [] if X is None else X
    labels = [] if y is None else y
    self.data = list(zip(features, labels))
    self.modified = True
    self._update_callback = set()
def __init__(self, X=None, y=None):
    """Initialize the dataset with optional features and labels.

    Parameters
    ----------
    X : sequence of samples, optional (default: empty)
    y : sequence of labels, optional (default: empty)
    """
    # Mutable default arguments (previously ``X=[], y=[]``) are shared
    # across every call of the function; use None sentinels instead.
    if X is None:
        X = []
    if y is None:
        y = []
    self.data = list(zip(X, y))
    self.modified = True
    self._update_callback = set()
def make_query(self):
    """Pick one unlabeled entry id uniformly at random.

    Returns
    -------
    ask_id : int
        The randomly chosen entry id.
    """
    ids, _ = zip(*self.dataset.get_unlabeled_entries())
    pick = self.random_state_.randint(0, len(ids))
    return ids[pick]