Example 1
    def find_pseudo_labels(
            self,
            labeled_file_path: str,
            unlabeled_file_path: str,
            temperature: int = 10,
            batch_size: int = None,
            **kwargs
    ):
        labeled_data = load_data_jsonl(labeled_file_path)

        unlabeled_data = load_data_jsonl(unlabeled_file_path)

        if not batch_size:
            batch_size = len(unlabeled_data)
        unlabeled_data_chunks = chunks(unlabeled_data, batch_size)
        n_batches = len(range(0, len(unlabeled_data), batch_size))

        all_recovered = list()
        labeled_embeddings = np.array(self.embedder.embed_sentences([d['sentence'] for d in labeled_data]))

        for batch_ix, unlabeled_data_chunk in enumerate(unlabeled_data_chunks):
            logger.info(f'Finding pseudo labels for batch {batch_ix + 1}/{n_batches}')
            unlabeled_embeddings = np.array(self.embedder.embed_sentences([d['sentence'] for d in unlabeled_data_chunk]))
            embeddings = np.concatenate((labeled_embeddings, unlabeled_embeddings), axis=0)
            w = (1 - pairwise_distances(embeddings, embeddings, metric='cosine')).astype(np.float32)
            all_recovered += get_nKNN_pseudo_labels(w, labeled_data, unlabeled_data_chunk, temperature=temperature)

        return all_recovered
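
Example 1 leans on two helpers that are not shown, `load_data_jsonl` and `chunks`. Judging only from how they are called (a JSONL file of records with a 'sentence' and a 'label' field, split into fixed-size batches), a minimal sketch could look like this; the actual implementations may differ:

import json
from typing import Iterator, List


def load_data_jsonl(file_path: str) -> List[dict]:
    # One JSON object per line; records are assumed to carry at least a
    # 'sentence' (or 'input') field and, for labeled data, a 'label'
    with open(file_path, encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def chunks(data: List[dict], batch_size: int) -> Iterator[List[dict]]:
    # Yield consecutive slices of at most batch_size items
    for start in range(0, len(data), batch_size):
        yield data[start:start + batch_size]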
Example 2
    def find_pseudo_labels(
            self,
            labeled_file_path: str,
            unlabeled_file_path: str,
            batch_size: int = None,
            **kwargs
    ):
        self.fit(labeled_file_path)
        unlabeled_data = load_data_jsonl(unlabeled_file_path)

        if not batch_size:
            batch_size = len(unlabeled_data)
        unlabeled_data_chunks = chunks(unlabeled_data, batch_size)
        n_batches = len(range(0, len(unlabeled_data), batch_size))

        recovered = list()

        for batch_ix, unlabeled_data_chunk in enumerate(unlabeled_data_chunks):
            logger.info(f'Finding pseudo labels for batch {batch_ix + 1}/{n_batches}')
            X = self.embedder.embed_sentences([str(d['sentence']) for d in unlabeled_data_chunk])
            predictions = self.logreg.predict_proba(X)
            pseudo_labels = predictions.argmax(1)
            pseudo_labels_scores = predictions.max(1)

            for original_data, pseudo_label, pseudo_label_score in zip(
                    unlabeled_data_chunk, pseudo_labels, pseudo_labels_scores):
                recovered.append(dict(
                    data=original_data.copy(),
                    pseudo_label=self.labels_vocab(pseudo_label, rev=True),
                    pseudo_label_score=float(pseudo_label_score)
                ))
        return recovered
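
This variant delegates the training step to `self.fit`, which is not shown. Given that the method later calls `self.logreg.predict_proba` and maps class indices back to label strings with `self.labels_vocab(..., rev=True)`, a plausible sketch of `fit` is the following; the `Vocab` interface and the choice of `LogisticRegression` are assumptions, not the confirmed implementation:

    def fit(self, labeled_file_path: str):
        from sklearn.linear_model import LogisticRegression

        # Embed the labeled sentences and train a linear classifier on them
        labeled_data = load_data_jsonl(labeled_file_path)
        self.labels_vocab = Vocab([d['label'] for d in labeled_data])
        X = self.embedder.embed_sentences([str(d['sentence']) for d in labeled_data])
        y = [self.labels_vocab(d['label']) for d in labeled_data]
        self.logreg = LogisticRegression(max_iter=1000).fit(X, y)
        return self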
Example 3
    def find_pseudo_labels(self,
                           labeled_file_path: str,
                           unlabeled_file_path: str,
                           temperature: int = 10,
                           batch_size: int = None,
                           **kwargs):
        labeled_data = load_data_jsonl(labeled_file_path)

        unlabeled_data = load_data_jsonl(unlabeled_file_path)

        if not batch_size:
            batch_size = len(unlabeled_data)
        unlabeled_data_chunks = chunks(unlabeled_data, batch_size)
        n_batches = len(range(0, len(unlabeled_data), batch_size))

        all_recovered = list()
        labeled_embeddings = np.array(
            self.embedder.embed_sentences([d['input'] for d in labeled_data]))

        for batch_ix, unlabeled_data_chunk in enumerate(unlabeled_data_chunks):
            logger.info(
                f'Finding pseudo labels for batch {batch_ix + 1}/{n_batches}')
            unlabeled_embeddings = np.array(
                self.embedder.embed_sentences(
                    [d['input'] for d in unlabeled_data_chunk]))
            embeddings = np.concatenate(
                (labeled_embeddings, unlabeled_embeddings), axis=0)

            nn = NearestNeighbors(n_neighbors=10, metric='cosine')
            nn.fit(embeddings)
            graph = nn.kneighbors_graph().toarray()
            w = (graph.T + graph > 0).astype(int)

            # Degree matrix D and its inverse square root D^(-1/2)
            d = np.diag(w.sum(0))
            d_half = fractional_matrix_power(d, -0.5)

            # Symmetric normalized Laplacian: L_sym = I - D^(-1/2) W D^(-1/2)
            l_sym = np.eye(len(w)) - d_half @ w @ d_half

            # Eigendecomposition of L_sym, keeping eigenpairs 1..min(31, n - 1)
            # and skipping the trivial constant eigenvector
            # (the `eigvals` keyword is deprecated in recent SciPy; `subset_by_index` replaces it)
            eigs = eigh(l_sym, eigvals=(1, min(31, len(l_sym) - 1)))
            normed_eigs = eigs[1] / np.sqrt(eigs[0])

            # W_prime
            w_prime = (normed_eigs @ normed_eigs.T).astype(np.float32)

            all_recovered += get_nKNN_pseudo_labels(w_prime,
                                                    labeled_data,
                                                    unlabeled_data_chunk,
                                                    temperature=temperature)

        return all_recovered
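
Examples 1 and 3 hand the affinity matrix to `get_nKNN_pseudo_labels`, which is not listed. Example 4 below inlines what looks like the same scoring rule (mean similarity to each class's labeled points, followed by a temperature-scaled softmax), so a hedged reconstruction might be the following; the real helper may pick the label differently, for instance from a clustering step as in Example 4:

import numpy as np


def get_nKNN_pseudo_labels(w, labeled_data, unlabeled_data_chunk, temperature=10):
    # Sketch only: rows/columns of w are assumed to be ordered as
    # [labeled_data..., unlabeled_data_chunk...], as in the callers above
    labels = sorted({d['label'] for d in labeled_data})
    label_cols = {
        lab: [ix for ix, d in enumerate(labeled_data) if d['label'] == lab]
        for lab in labels
    }
    recovered = []
    n_labeled = len(labeled_data)
    for offset, item in enumerate(unlabeled_data_chunk):
        row = w[n_labeled + offset]
        # Mean affinity to each class, softmax-normalized with a temperature
        z = np.exp(np.array([row[label_cols[lab]].mean() for lab in labels]) * temperature)
        z /= z.sum()
        best = int(z.argmax())
        recovered.append(dict(
            data=item.copy(),
            pseudo_label=labels[best],
            pseudo_label_score=float(z[best]),
        ))
    return recovered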
Example 4
    def find_pseudo_labels(
            self,
            labeled_file_path: str,
            unlabeled_file_path: str,
            temperature: int = 10,
            batch_size: int = None,
            **kwargs
    ):
        labeled_data = load_data_jsonl(labeled_file_path)

        unlabeled_data = load_data_jsonl(unlabeled_file_path)
        if not batch_size:
            batch_size = len(unlabeled_data)
        unlabeled_data_chunks = chunks(unlabeled_data, batch_size)
        n_batches = len(range(0, len(unlabeled_data), batch_size))

        all_recovered = list()
        labeled_embeddings = np.array(self.embedder.embed_sentences([d['sentence'] for d in labeled_data], detached=True))

        for batch_ix, unlabeled_data_chunk in enumerate(unlabeled_data_chunks):
            logger.info(f'Finding pseudo labels for batch {batch_ix + 1}/{n_batches}')
            unlabeled_embeddings = np.array(self.embedder.embed_sentences([d['sentence'] for d in unlabeled_data_chunk], detached=True))
            labels = [d['label'] for d in labeled_data] + ['' for _ in unlabeled_data_chunk]
            labels_vocab = Vocab([d['label'] for d in labeled_data])
            embeddings = np.concatenate((labeled_embeddings, unlabeled_embeddings), axis=0)

            # Build similarity matrix
            w = (1 - pairwise_distances(embeddings, embeddings, metric='cosine')).astype(np.float32)

            # Column slices of W for each label; used below to score unlabeled points
            w_label = dict()
            for label in labels_vocab.labels:
                labelled_global_indices = [ix for ix, d in enumerate(labeled_data) if d['label'] == label]
                w_label[label] = w[:, labelled_global_indices]

            # Build hierarchical tree, bottom to top
            Z = linkage(embeddings, 'ward')
            root_tree = to_tree(Z)

            # Split tree, top to bottom
            trees = get_unique_label_trees(root_tree=root_tree, labels=labels)

            # Recover data
            recovered = list()
            for tree, path in trees:
                output = list()

                # Get all indices in the tree
                order = tree.pre_order()
                tree_labels = [labels[ix] for ix in order]

                # Skip trees whose elements are all unlabeled
                if set(tree_labels) == {''}:
                    continue

                # Remaining trees mix labeled and unlabeled samples but carry a single
                # label: take it as the pseudo label for the unlabeled members
                pseudo_label = next(l for l in tree_labels if l)

                # Iterate over items
                for ix in order:
                    # Case: the item is unlabeled, so compute a score for it
                    if labels[ix] == '':
                        # Mean similarity to the labeled points of each class
                        z_i = np.array([
                            w_label[label][ix].mean()
                            for label in labels_vocab.labels
                        ])
                        # Temperature-scaled softmax over classes
                        z_i *= temperature
                        z_i_bar = np.exp(z_i)
                        z_i_bar /= z_i_bar.sum()

                        pseudo_label_score = float(z_i_bar[labels_vocab(pseudo_label)])

                        # Output
                        dat = unlabeled_data_chunk[ix - len(labeled_data)].copy()
                        output.append(dict(
                            data=dat,
                            pseudo_label=pseudo_label,
                            pseudo_label_score=pseudo_label_score
                        ))
                recovered += output
            all_recovered += recovered
        return all_recovered
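
Examples 2 and 4 build their output records as dicts with a 'data', a 'pseudo_label' and a 'pseudo_label_score' field, and Examples 1 and 3 presumably return the same shape through `get_nKNN_pseudo_labels`. A hypothetical call, where the `model` instance and the file names are placeholders:

pseudo = model.find_pseudo_labels(
    labeled_file_path='train.jsonl',
    unlabeled_file_path='unlabeled.jsonl',
    temperature=10,
    batch_size=512,
)
for item in pseudo[:3]:
    print(item['data'], item['pseudo_label'], item['pseudo_label_score'])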