def predict_proba(self, X, mode=None):

        modes = ['knn', 'lp', 'pair']
        if mode is not None and mode not in modes:
            raise ValueError('predict_proba can have modes: {}'.format(modes))

        u, l = self.graph.n_unlabeled, self.graph.n_labeled

        logger.info('Now testing on {} samples...'.format(len(X)))
        # Contribution from labeled neighbors (the kNN part): row-normalize
        # the affinities into transition probabilities and mix known labels.
        neighbors, distances = self.graph.find_labeled_neighbors(X)
        affinity_mat = construct_weight_mat(neighbors, distances,
                                            (X.shape[0], l), self.graph.dtype)
        p_tl = normalize(affinity_mat.tocsr(), norm='l1', axis=1)
        y_from_labeled = ssdot(p_tl, self.datastore.y_labeled[:l], True)

        # Contribution from unlabeled neighbors (the label-propagation part).
        neighbors, distances = self.graph.find_unlabeled_neighbors(X)
        affinity_mat = construct_weight_mat(neighbors, distances,
                                            (X.shape[0], u), self.graph.dtype)
        p_tu = normalize(affinity_mat.tocsr(), norm='l1', axis=1)
        y_from_unlabeled = ssdot(p_tu, self.y_unlabeled[:u], True)

        y_pred_proba = y_from_labeled + y_from_unlabeled
        logger.info('Labels have been predicted.')

        if mode is None:
            return y_pred_proba
        elif mode == 'knn':
            return y_from_labeled
        elif mode == 'lp':
            return y_from_unlabeled
        elif mode == 'pair':
            return y_from_labeled, y_pred_proba
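A minimal usage sketch (`model` and `X_test` are hypothetical names; the estimator is assumed to be fitted, so `graph` and `datastore` are populated):

proba = model.predict_proba(X_test)                    # kNN + LP combined
proba_knn = model.predict_proba(X_test, mode='knn')    # labeled neighbors only
proba_lp = model.predict_proba(X_test, mode='lp')      # unlabeled neighbors only
knn_part, combined = model.predict_proba(X_test, mode='pair')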
Example #2
    def train(self,
              tokens,
              images,
              Wvi,
              context=False,
              use_dask=False,
              n_worker=-1,
              n_chunk=200,
              verbose=False):
        verboseprint = lambda x: print(x) if verbose else None

        verboseprint('Constructing matrices...')
        if verbose and not use_dask:
            tokens = tqdm(tokens)

        if use_dask:
            tVC, tVV_diag, tCC_diag = construct_matrix_dask(
                tokens, self.window_size, self.vocab_size, self._tokens2idx,
                n_worker, n_chunk, verbose)
        else:
            tVC, tVV_diag, tCC_diag = construct_matrix(tokens,
                                                       self.window_size,
                                                       self.vocab_size)

        self.mean_image = np.mean(images, axis=0, keepdims=True)
        Xvis = images - self.mean_image

        verboseprint('Squashing...')
        tVC, tVV_diag, tCC_diag = self._squash_arrays(tVC, tVV_diag, tCC_diag)

        verboseprint('Preparing arrays...')
        n_tags_per_vocab = mu.sum(Wvi, axis=1)
        tVWviXvis = ssdot(ssdot(sparse.diags(tVV_diag), Wvi), Xvis)
        Gvv_diag = tVV_diag + tVV_diag * n_tags_per_vocab
        Gvis = Xvis.T @ ssdot(sparse.diags(ssdot(Wvi.T, tVV_diag)), Xvis)

        verboseprint('Calculating word vectors...')
        H = bm.block_sym_mat([[None, tVC, tVWviXvis], [None, None, None],
                              [None, None, None]])
        G = bm.block_diag_mat(
            [sparse.diags(Gvv_diag),
             sparse.diags(tCC_diag), Gvis])
        eigenvalues, A = randomized_ghep(
            H, G,
            n_components=self.dim,
            n_oversamples=self.dim + self.oversampling,
            n_iter=self.n_iter)

        self.ev = eigenvalues[::-1]
        self._set_keyedvector('wv',
                              self.word_dict.keys(),
                              self.dim,
                              vec=A[:self.vocab_size, ::-1])
        self.image_mapping = A[-Xvis.shape[1]:, ::-1]
        if context:
            self.context = Context(A[self.vocab_size:-Xvis.shape[1], ::-1],
                                   len(self.word_dict), self.window_size)
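A minimal usage sketch with toy data (`model` and the interpretation of `Wvi` as a vocabulary-by-image tag matrix are assumptions, not confirmed by the source):

import numpy as np
from scipy import sparse

tokens = [['a', 'cat', 'sits'], ['a', 'dog', 'runs']]   # toy corpus
images = np.random.rand(10, 64)                         # toy image features
Wvi = sparse.random(model.vocab_size, images.shape[0],
                    density=0.05, format='csr')         # word-image tags
model.train(tokens, images, Wvi, verbose=True)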
Example #3
def _(self, other):
    # TODO: check dims
    # Product of two block-diagonal matrices: multiply matching diagonal
    # blocks; a None on either side leaves that block empty.
    res = [None] * self.block_shape[0]
    for i in range(self.block_shape[0]):
        if self[i, i] is None or other[i, i] is None:
            continue
        res[i] = ssdot(self[i, i], other[i, i])

    return block_diag_mat(res)
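A minimal sketch of what this dispatch computes, assuming (as the snippet suggests) that block_diag_mat takes a list of diagonal blocks with None for empty ones, and that the dispatch is wired to the product operator:

from scipy import sparse

A = block_diag_mat([sparse.eye(2), None])
B = block_diag_mat([2 * sparse.eye(2), sparse.eye(3)])
C = A @ B   # diagonal blocks: [2 * I_2, None]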
Example #4
def get_next_candidates(major_changes, y_u_tent, y_u, a_rev_uu, p_uu):
    # Push each significant label change to the unlabeled nodes that have the
    # changed node as a nearest neighbor (reverse adjacency); those nodes are
    # the next propagation candidates.
    candidates = set()
    for index, label_diff in major_changes:
        back_neighbors = a_rev_uu.get(index, set())
        for neigh in back_neighbors:
            y_u_tent.setdefault(neigh, y_u[neigh].copy())
            y_u_tent[neigh] += ssdot(p_uu[neigh, index], label_diff, True)
            candidates.add(neigh)
    return candidates
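A toy sketch of one propagation hop through the reverse adjacency (all data here is hypothetical; a plain dict and numpy stand in for the sparse p_uu and ssdot):

import numpy as np

y_u = {0: np.array([0.6, 0.4]), 1: np.array([0.5, 0.5])}
y_u_tent = {}
a_rev_uu = {0: {1}}                            # node 1 has node 0 as neighbor
p_uu = {(1, 0): 0.3}                           # back-edge transition weight
major_changes = [(0, np.array([0.2, -0.2]))]   # node 0's label estimate moved

for index, label_diff in major_changes:
    for neigh in a_rev_uu.get(index, set()):
        y_u_tent.setdefault(neigh, y_u[neigh].copy())
        y_u_tent[neigh] += p_uu[(neigh, index)] * label_diff
# y_u_tent[1] == [0.56, 0.44]; node 1 is the next candidate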
Example #5
    def _offline_lp(self, return_iter=False, max_iter=30, tol=0.001):
        """Perform offline label propagation until the label estimates of
        the unlabeled points converge.

        Parameters
        ----------
        return_iter : bool, default=False
            Whether to return the number of iterations until convergence
            of the label estimates.

        max_iter : int, default=30
            Maximum number of propagation iterations.

        tol : float, default=0.001
            Convergence tolerance for the change in the label estimates.

        Returns
        -------
        y_unlabeled : ndarray
            The new label estimates.

        n_iter : int, optional
            The number of iterations, returned if ``return_iter`` is True.
        """

        logger.debug('Doing Offline LP...')

        u, l = self.graph.n_unlabeled, self.graph.n_labeled

        p_ul = self.graph.subgraph_ul.transition_matrix[:u]
        p_uu = self.graph.subgraph_uu.transition_matrix[:u, :u]
        y_unlabeled = self.y_unlabeled[:u]
        y_labeled = self.datastore.y_labeled

        # Static contribution from the labeled nodes (constant across iterations)
        y_static = ssdot(p_ul, y_labeled, dense_output=True)

        # Iterate until convergence or the iteration cap
        n_iter = 0
        converged = False
        while n_iter < max_iter and not converged:
            y_unlabeled_prev = y_unlabeled.copy()
            y_unlabeled = y_static + ssdot(p_uu, y_unlabeled, True)
            n_iter += 1

            converged = _converged(y_unlabeled, y_unlabeled_prev, tol)

        logger.info('Offline LP took {} iterations'.format(n_iter))

        if return_iter:
            return y_unlabeled, n_iter
        else:
            return y_unlabeled
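The fixed point this loop approaches is y_u = P_ul y_l + P_uu y_u. A self-contained sketch with toy row-substochastic matrices (plain scipy in place of ssdot; all names here are illustrative):

import numpy as np
from scipy import sparse
from sklearn.preprocessing import normalize

rng = np.random.default_rng(0)
u, l, c = 100, 20, 3
# Row-normalize so each row of [p_ul p_uu] sums to at most one.
p_ul = 0.7 * normalize(sparse.random(u, l, density=0.1, format='csr',
                                     random_state=0), norm='l1', axis=1)
p_uu = 0.3 * normalize(sparse.random(u, u, density=0.05, format='csr',
                                     random_state=1), norm='l1', axis=1)
y_l = rng.random((l, c))

y_u = np.zeros((u, c))
y_static = p_ul @ y_l                    # labeled contribution, computed once
for n_iter in range(1, 31):
    y_prev = y_u.copy()
    y_u = y_static + p_uu @ y_u          # one propagation step
    if np.abs(y_u - y_prev).max() < 1e-3:
        break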
Example #6
    def _(self, other):
        # TODO: check dims
        # Reflected product other @ self, where self is block-diagonal:
        # each block of the result is other[i, j] @ self[j, j].
        res = [[None] * self.block_shape[1]
               for _ in range(other.block_shape[0])]
        for i, j in itertools.product(range(other.block_shape[0]),
                                      range(self.block_shape[1])):
            if other[i, j] is None or self[j, j] is None:
                continue
            res[i][j] = ssdot(other[i, j], self[j, j])

        return block_mat(res)
Example #7
    def _(self, other):
        # TODO: check dims
        if other.ndim == 1:
            res = np.zeros(self.shape[0])
        elif other.ndim == 2:
            res = np.zeros((self.shape[0], other.shape[1]))
        else:
            return NotImplemented

        start_row = 0
        for i in range(self.block_shape[0]):
            end_row = start_row + self.shape_detail[1][i]
            if self[i, i] is None:
                start_row = end_row
                continue
            res[start_row:end_row] = ssdot(self[i, i],
                                           other[start_row:end_row])
            start_row = end_row
        return res
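A quick equivalence check of what this block-diagonal matvec computes, using plain scipy on toy blocks (illustrative only):

import numpy as np
from scipy import sparse

blocks = [2 * sparse.eye(2), 3 * sparse.eye(3)]
x = np.arange(5.0)
ref = sparse.block_diag(blocks) @ x   # -> [0., 2., 6., 9., 12.]
# The method above assembles the same result block by block:
# rows 0:2 from blocks[0] @ x[0:2], rows 2:5 from blocks[1] @ x[2:5].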
Example #8
    def update_transitions(self, normalizer):
        self.transition_matrix = ssdot(normalizer, self.weight_matrix)
Example #9
    def _propagate_single(self, ind_new, y_new, return_iter=False):
        """Perform label propagation until convergence of the label
        estimates of the unlabeled points. Assume the new node has already 
        been added to the graph, but no label has been estimated.

        Parameters
        ----------
        ind_new : int 
            The index of the new observation determined during graph addition.

        y_new : int 
            The label of the new observation (-1 if point is unlabeled).

        return_iter : bool, default=False
            Whether to return the number of iterations until convergence of 
            the label estimates.

        Returns
        -------
        y_unlabeled : ndarray
            The new label estimates.

        n_iter : int, optional
            The number of iterations, returned if ``return_iter`` is True.
        """
        # The number of labeled and unlabeled nodes now includes the new point
        y_u = self.y_unlabeled
        y_l = self.datastore.y_labeled

        p_ul = self.graph.subgraph_ul.transition_matrix
        p_uu = self.graph.subgraph_uu.transition_matrix

        a_rev_ul = self.graph.subgraph_ul.rev_adj
        a_rev_uu = self.graph.subgraph_uu.rev_adj

        if y_new == -1:
            # Estimate the label of the new unlabeled point
            label_new = ssdot(p_ul[ind_new], y_l, True) \
                        + ssdot(p_uu[ind_new], y_u, True)
            y_u[ind_new] = label_new

            # The first LP candidates are the unlabeled samples that have
            # the new point as a nearest neighbor
            candidates = a_rev_uu.get(ind_new, set())
        else:
            # The label of the new labeled point is already in the data store
            candidates = a_rev_ul.get(ind_new, set())

        # Tentative label estimates after the new point's insertion,
        # keyed by node index
        y_u_tent = {}

        # Recompute the labels of the direct candidates from scratch
        for ind in candidates:
            label = ssdot(p_ul[ind], y_l, True) + ssdot(p_uu[ind], y_u, True)
            y_u_tent[ind] = label.ravel()

        n_updates_per_iter = []
        n_iter = 0
        k_u = self.graph.n_neighbors_unlabeled
        u = max(self.graph.n_unlabeled, 1)
        # Depth bound: with k_u back-neighbors per hop, about log_{k_u}(u)
        # hops reach all u unlabeled nodes.
        max_iter = int(np.log(u) / np.log(k_u)) if k_u > 1 else self.max_iter
        while candidates and n_iter < max_iter:

            # Pick the ones that change significantly and change them
            updates, norm = filter_and_update(candidates, y_u_tent, y_u,
                                              self.theta)
            n_updates_per_iter.append(len(updates))

            # Get the next set of candidates (farther from the source)
            candidates = get_next_candidates(updates, y_u_tent, y_u, a_rev_uu,
                                             p_uu)

            n_iter += 1

        # Log the total number of updates
        n_updates = sum(n_updates_per_iter)
        if n_updates:
            logger.info('Iter {:6}: {:6} updates in {:2} LP iters, '
                        'max_iter = {:2}'.format(self.n_iter_online, n_updates,
                                                 n_iter, max_iter))

        if return_iter:
            return y_u, n_iter
        else:
            return y_u
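The max_iter heuristic above bounds the propagation depth: with k_u back-neighbors per hop, roughly k_u ** d nodes become reachable after d hops, so about log(u) / log(k_u) hops suffice to cover all u unlabeled points. A quick check of the arithmetic:

import numpy as np

u, k_u = 10_000, 10
max_iter = int(np.log(u) / np.log(k_u))   # log_10(10000) -> 4 hops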