def predict_proba(self, X, mode=None):
    """Estimate label scores for the samples in ``X``.

    Two contributions are computed: one from the labeled neighbors of
    each sample and one from its unlabeled neighbors. In both cases the
    neighbor weights are l1-normalized per row before being multiplied
    with the corresponding label matrix.

    Parameters
    ----------
    X : array-like with ``shape`` and ``len``
        The samples to predict for.
    mode : {None, 'knn', 'lp', 'pair'}, default=None
        Selects what is returned:
        ``None`` -> sum of both contributions,
        ``'knn'`` -> contribution of the labeled neighbors only,
        ``'lp'`` -> contribution of the unlabeled neighbors only,
        ``'pair'`` -> tuple ``(labeled contribution, sum)``.

    Returns
    -------
    The array (or pair of arrays) selected by ``mode``.

    Raises
    ------
    ValueError
        If ``mode`` is neither ``None`` nor one of the supported modes.
    """
    modes = ['knn', 'lp', 'pair']
    mode_is_valid = mode is None or mode in modes
    if not mode_is_valid:
        raise ValueError('predict_proba can have modes: {}'.format(modes))

    n_unlabeled = self.graph.n_unlabeled
    n_labeled = self.graph.n_labeled

    logger.info('Now testing on {} samples...'.format(len(X)))

    # Contribution of the labeled neighbors
    nbrs, dists = self.graph.find_labeled_neighbors(X)
    weights = construct_weight_mat(nbrs, dists, (X.shape[0], n_labeled),
                                   self.graph.dtype)
    p_tl = normalize(weights.tocsr(), norm='l1', axis=1)
    y_from_labeled = ssdot(p_tl, self.datastore.y_labeled[:n_labeled], True)

    # Contribution of the unlabeled neighbors
    nbrs, dists = self.graph.find_unlabeled_neighbors(X)
    weights = construct_weight_mat(nbrs, dists, (X.shape[0], n_unlabeled),
                                   self.graph.dtype)
    p_tu = normalize(weights.tocsr(), norm='l1', axis=1)
    y_from_unlabeled = ssdot(p_tu, self.y_unlabeled[:n_unlabeled], True)

    y_pred_proba = y_from_labeled + y_from_unlabeled
    logger.info('Labels have been predicted.')

    # ``mode`` has been validated above, so the lookup cannot miss
    by_mode = {
        None: y_pred_proba,
        'knn': y_from_labeled,
        'lp': y_from_unlabeled,
        'pair': (y_from_labeled, y_pred_proba),
    }
    return by_mode[mode]
def train(self, tokens, images, Wvi, context=False, use_dask=False,
          n_worker=-1, n_chunk=200, verbose=False):
    """Fit word vectors and an image mapping from tokens and images.

    Co-occurrence statistics are built from ``tokens``, combined with the
    mean-centered ``images`` through ``Wvi``, and the resulting generalized
    eigenproblem ``(H, G)`` is solved with ``randomized_ghep``.

    Parameters
    ----------
    tokens : iterable
        Token stream handed to ``construct_matrix`` /
        ``construct_matrix_dask``.
    images : array-like
        Image features; the mean over axis 0 is stored in
        ``self.mean_image`` and subtracted before use.
    Wvi : matrix
        Multiplied between the vocabulary statistics and the centered
        images; presumably a (vocab x image) association matrix --
        TODO confirm against callers.
    context : bool, default=False
        If true, also build ``self.context`` from the rows of the solution
        lying between the vocabulary rows and the image rows.
    use_dask : bool, default=False
        If true, build the matrices with ``construct_matrix_dask``.
    n_worker : int, default=-1
        Forwarded to ``construct_matrix_dask``.
    n_chunk : int, default=200
        Forwarded to ``construct_matrix_dask``.
    verbose : bool, default=False
        Print progress messages; on the non-dask path the token stream is
        additionally wrapped in ``tqdm``.

    Returns
    -------
    None
        Results are stored on the instance: ``mean_image``, ``ev``,
        ``image_mapping``, the ``'wv'`` keyed vectors and optionally
        ``context``.
    """
    def verboseprint(message):
        if verbose:
            print(message)

    verboseprint('Constructing matrices...')
    if use_dask:
        tVC, tVV_diag, tCC_diag = construct_matrix_dask(
            tokens, self.window_size, self.vocab_size, self._tokens2idx,
            n_worker, n_chunk, verbose)
    else:
        # Show a progress bar whenever the in-process path is taken in
        # verbose mode (previously skipped for falsy values other than
        # False, e.g. ``use_dask=None``).
        if verbose:
            tokens = tqdm(tokens)
        tVC, tVV_diag, tCC_diag = construct_matrix(
            tokens, self.window_size, self.vocab_size)

    # Center the images; the mean is kept on the instance
    self.mean_image = np.mean(images, axis=0, keepdims=True)
    Xvis = images - self.mean_image

    verboseprint('Squashing...')
    tVC, tVV_diag, tCC_diag = self._squash_arrays(tVC, tVV_diag, tCC_diag)

    verboseprint('Preparing arrays...')
    n_tags_per_vocab = mu.sum(Wvi, axis=1)
    tVWviXvis = ssdot(ssdot(sparse.diags(tVV_diag), Wvi), Xvis)
    Gvv_diag = tVV_diag + tVV_diag * n_tags_per_vocab
    Gvis = Xvis.T @ ssdot(sparse.diags(ssdot(Wvi.T, tVV_diag)), Xvis)

    verboseprint('Calculating word vectors...')
    # Only the upper-triangular blocks are given to the symmetric container
    H = bm.block_sym_mat([[None, tVC, tVWviXvis],
                          [None, None, None],
                          [None, None, None]])
    G = bm.block_diag_mat(
        [sparse.diags(Gvv_diag), sparse.diags(tCC_diag), Gvis])
    eigenvalues, A = randomized_ghep(
        H, G,
        n_components=self.dim,
        n_oversamples=self.dim + self.oversampling,
        n_iter=self.n_iter)

    # Eigenvalues and the columns of A are stored in reversed order
    n_image_dims = Xvis.shape[1]
    self.ev = eigenvalues[::-1]
    self._set_keyedvector('wv', self.word_dict.keys(), self.dim,
                          vec=A[:self.vocab_size, ::-1])
    self.image_mapping = A[-n_image_dims:, ::-1]
    if context:
        self.context = Context(A[self.vocab_size:-n_image_dims, ::-1],
                               len(self.word_dict), self.window_size)
def _(self, other):
    """Multiply two block-diagonal operands block by block.

    The k-th diagonal block of the result is
    ``ssdot(self[k, k], other[k, k])``; it stays ``None`` when either
    factor is ``None``. The number of blocks is taken from
    ``self.block_shape[0]``.
    """
    # TODO: check dims
    n_blocks = self.block_shape[0]
    products = [None] * n_blocks
    for k in range(n_blocks):
        left = self[k, k]
        if left is None:
            continue
        right = other[k, k]
        if right is None:
            continue
        products[k] = ssdot(left, right)
    return block_diag_mat(products)
def get_next_candidates(major_changes, y_u_tent, y_u, a_rev_uu, p_uu):
    """Push label changes to reverse neighbors and collect them.

    Parameters
    ----------
    major_changes : iterable of (index, label_diff)
        The nodes whose labels changed, with the amount of change.
    y_u_tent : dict
        Tentative labels, keyed by node index. Updated in place: a missing
        entry is first seeded with a copy of ``y_u[node]``, then
        ``ssdot(p_uu[node, index], label_diff, True)`` is added to it.
    y_u : indexable
        Current labels, used only to seed missing tentative entries.
    a_rev_uu : dict
        Maps a node index to the collection of nodes to notify; indices
        without an entry have nobody to notify.
    p_uu : 2-D indexable
        Weights, indexed as ``p_uu[node, index]``.

    Returns
    -------
    set
        Every node whose tentative label was touched.
    """
    touched = set()
    for source, delta in major_changes:
        listeners = a_rev_uu.get(source, set())
        for node in listeners:
            y_u_tent.setdefault(node, y_u[node].copy())
            contribution = ssdot(p_uu[node, source], delta, True)
            y_u_tent[node] += contribution
            touched.add(node)
    return touched
def _offline_lp(self, return_iter=False, max_iter=30, tol=0.001):
    """Perform the offline label propagation until convergence of the label
    estimates of the unlabeled points.

    Each sweep computes ``y_static + ssdot(p_uu, y_unlabeled)``, where
    ``y_static = ssdot(p_ul, y_labeled)`` is computed once up front.
    ``self.y_unlabeled`` is only read, never written, by this method.

    Parameters
    ----------
    return_iter : bool, default=False
        Whether or not to return the number of iterations till convergence
        of the label estimates.
    max_iter : int, default=30
        Maximum number of sweeps; with ``0`` the current estimates are
        returned unchanged.
    tol : float, default=0.001
        Tolerance passed to ``_converged`` together with the current and
        the previous estimates.

    Returns
    -------
    y_unlabeled, num_iter : the new label estimates and optionally the
        number of iterations
    """
    logger.debug('Doing Offline LP...')

    u = self.graph.n_unlabeled
    p_ul = self.graph.subgraph_ul.transition_matrix[:u]
    p_uu = self.graph.subgraph_uu.transition_matrix[:u, :u]
    y_unlabeled = self.y_unlabeled[:u]
    y_labeled = self.datastore.y_labeled

    # Contribution of the labeled points; identical in every sweep
    y_static = ssdot(p_ul, y_labeled, dense_output=True)

    n_iter = 0
    converged = False
    while n_iter < max_iter and not converged:
        # Copy so the comparison never aliases ``self.y_unlabeled``
        y_unlabeled_prev = y_unlabeled.copy()
        y_unlabeled = y_static + ssdot(p_uu, y_unlabeled, True)
        n_iter += 1
        converged = _converged(y_unlabeled, y_unlabeled_prev, tol)

    logger.info('Offline LP took {} iterations'.format(n_iter))

    if return_iter:
        return y_unlabeled, n_iter
    return y_unlabeled
def _(self, other):
    """Multiply the block matrix ``other`` by the diagonal blocks of ``self``.

    Block ``(i, j)`` of the result is ``ssdot(other[i, j], self[j, j])``.
    It stays ``None`` when either factor is ``None``. Only the diagonal
    blocks of ``self`` are read.

    The result has ``other.block_shape[0]`` block rows and
    ``self.block_shape[1]`` block columns and is wrapped in ``block_mat``.
    """
    # TODO: check dims
    n_block_rows = other.block_shape[0]
    n_block_cols = self.block_shape[1]
    res = [[None] * n_block_cols for _row in range(n_block_rows)]
    for i, j in itertools.product(range(n_block_rows), range(n_block_cols)):
        left = other[i, j]
        if left is None:
            continue
        # Guard on the block that is actually multiplied (self[j, j]);
        # the earlier check looked at self[i, j] instead.
        right = self[j, j]
        if right is None:
            continue
        res[i][j] = ssdot(left, right)
    return block_mat(res)
def _(self, other):
    """Multiply the diagonal blocks of ``self`` with a 1-D or 2-D array.

    The result is a zero-initialised dense array with ``self.shape[0]``
    rows (and ``other.shape[1]`` columns for 2-D input). For each diagonal
    block that is not ``None``, the matching row range of the result is
    set to ``ssdot(block, other[row range])``. Row ranges are consecutive
    and sized by ``self.shape_detail[1]``.

    Returns ``NotImplemented`` for inputs that are neither 1-D nor 2-D.
    """
    # TODO: check dims
    if other.ndim not in (1, 2):
        return NotImplemented

    n_rows = self.shape[0]
    out_shape = n_rows if other.ndim == 1 else (n_rows, other.shape[1])
    result = np.zeros(out_shape)

    offset = 0
    for k in range(self.block_shape[0]):
        stop = offset + self.shape_detail[1][k]
        diag_block = self[k, k]
        if diag_block is not None:
            rows = (slice(offset, stop),)
            result[rows] = ssdot(diag_block, other[rows])
        offset = stop
    return result
def update_transitions(self, normalizer):
    """Recompute the transition matrix from the current weights.

    Sets ``self.transition_matrix`` to
    ``ssdot(normalizer, self.weight_matrix)``.

    Parameters
    ----------
    normalizer : matrix
        Left factor applied to ``self.weight_matrix``.
    """
    weights = self.weight_matrix
    self.transition_matrix = ssdot(normalizer, weights)
def _propagate_single(self, ind_new, y_new, return_iter=False):
    """Perform label propagation until convergence of the label estimates
    of the unlabeled points.

    Assume the new node has already been added to the graph, but no label
    has been estimated.

    Parameters
    ----------
    ind_new : int
        The index of the new observation determined during graph addition.
    y_new : int
        The label of the new observation (-1 if point is unlabeled).
    return_iter : bool, default=False
        Whether to return the number of iterations until convergence of
        the label estimates.

    Returns
    -------
    y_unlabeled, num_iter : returns the new label estimates and optionally
        the number of iterations

    Notes
    -----
    The returned array is ``self.y_unlabeled`` itself; for an unlabeled
    new point its row is written in place. The number of propagation
    rounds is capped at ``int(log(u) / log(k_u))`` when ``k_u > 1`` and at
    ``self.max_iter`` otherwise, where ``u`` is the number of unlabeled
    points (at least 1) and ``k_u`` is
    ``self.graph.n_neighbors_unlabeled``.
    """
    # The number of labeled and unlabeled nodes now includes the new point
    y_u = self.y_unlabeled
    y_l = self.datastore.y_labeled
    p_ul = self.graph.subgraph_ul.transition_matrix
    p_uu = self.graph.subgraph_uu.transition_matrix
    a_rev_ul = self.graph.subgraph_ul.rev_adj
    a_rev_uu = self.graph.subgraph_uu.rev_adj

    if y_new == -1:
        # Estimate the label of the new unlabeled point
        label_new = (ssdot(p_ul[ind_new], y_l, True)
                     + ssdot(p_uu[ind_new], y_u, True))
        y_u[ind_new] = label_new
        # The first LP candidates are the unlabeled samples that have
        # the new point as a nearest neighbor
        candidates = a_rev_uu.get(ind_new, set())
    else:
        # The label of the new labeled point is already in the data store
        candidates = a_rev_ul.get(ind_new, set())

    # Tentative labels (index -> label row) are the label estimates after
    # the new point insertion
    y_u_tent = {}
    for ind in candidates:
        label = ssdot(p_ul[ind], y_l, True) + ssdot(p_uu[ind], y_u, True)
        y_u_tent[ind] = label.ravel()

    k_u = self.graph.n_neighbors_unlabeled
    u = max(self.graph.n_unlabeled, 1)
    max_iter = int(np.log(u) / np.log(k_u)) if k_u > 1 else self.max_iter

    n_updates = 0
    n_iter = 0
    while len(candidates) and n_iter < max_iter:
        # Pick the ones that change significantly and change them
        updates, _ = filter_and_update(candidates, y_u_tent, y_u,
                                       self.theta)
        n_updates += len(updates)
        # Get the next set of candidates (farther from the source)
        candidates = get_next_candidates(updates, y_u_tent, y_u,
                                         a_rev_uu, p_uu)
        n_iter += 1

    # Report the total number of updates
    if n_updates:
        logger.info('Iter {:6}: {:6} updates in {:2} LP iters, '
                    'max_iter = {:2}'.format(self.n_iter_online, n_updates,
                                             n_iter, max_iter))

    if return_iter:
        return y_u, n_iter
    return y_u