Example #1
0
    def replace_random_tokens(self, n_samples, replacement='',
                              random_state=None,
                              min_replace=1, max_replace=1.0,
                              group_size=1):
        """
        Build ``n_samples`` perturbed variants of the text by substituting
        randomly chosen tokens with ``replacement`` (``''`` by default,
        which amounts to removing them).

        Each result is a ``(text, replaced_count, mask)`` tuple.
        ``replaced_count`` is the number of positions drawn for that sample.
        With ``group_size > 1`` every drawn position is extended over the
        following ``group_size - 1`` positions as well; that extension is
        not reflected in ``replaced_count``.

        When there are no tokens, the same ``('', 0, empty_int_array)``
        tuple is repeated ``n_samples`` times.
        """
        token_count = len(self.tokens)
        if token_count == 0:
            empty_mask = np.array([], dtype=int)
            return [('', 0, empty_mask)] * n_samples

        positions = np.arange(token_count)
        lower, upper = self._get_min_max(min_replace, max_replace,
                                         token_count)
        rng = check_random_state(random_state)
        # ``high`` is upper + 1 so that ``upper`` itself can be drawn
        # (assumes a numpy-style randint with an exclusive upper bound).
        draw_counts = rng.randint(low=lower, high=upper + 1, size=n_samples)
        # Leave room for groups that run past the last token; the overhang
        # is sliced off right after the mask is built.
        padded_len = token_count + group_size - 1

        def make_sample(count):
            # Draw distinct start positions, then add one shifted copy per
            # extra group member.
            starts = rng.choice(positions, count, replace=False)
            rows = [starts]
            rows.extend(starts + offset for offset in range(1, group_size))
            chosen = np.array(rows).ravel()
            mask = indices_to_bool_mask(chosen, padded_len)[:token_count]
            masked_split = self.split.masked(mask, replacement)
            return masked_split.text, count, mask

        return [make_sample(count) for count in draw_counts]
Example #2
0
    def replace_random_tokens_bow(self, n_samples, replacement='',
                                  random_state=None,
                                  min_replace=1, max_replace=1.0):
        """
        Build ``n_samples`` perturbed variants of the text, treating it as
        a bag of words: vocabulary entries are drawn at random and every
        occurrence of a drawn word is substituted with ``replacement``
        (``''`` by default, which amounts to removing it).

        Each result is a ``(text, replaced_words_count, mask)`` tuple, where
        ``replaced_words_count`` is the number of vocabulary entries drawn
        for that sample, not the number of token positions affected.

        When the vocabulary is empty, the same ``('', 0, empty_int_array)``
        tuple is repeated ``n_samples`` times.
        """
        if not self.vocab:
            empty_mask = np.array([], dtype=int)
            return [('', 0, empty_mask)] * n_samples

        lower, upper = self._get_min_max(min_replace, max_replace,
                                         len(self.vocab))
        rng = check_random_state(random_state)
        # ``high`` is upper + 1 so that ``upper`` itself can be drawn
        # (assumes a numpy-style randint with an exclusive upper bound).
        draw_counts = rng.randint(low=lower, high=upper + 1, size=n_samples)

        def make_sample(count):
            # Distinct words to drop; a set gives fast membership checks.
            dropped_words = set(rng.choice(self.vocab, count, replace=False))
            hit_positions = [
                position
                for position, token in enumerate(self.tokens)
                if token in dropped_words
            ]
            mask = indices_to_bool_mask(hit_positions, len(self.tokens))
            # The split is masked by index list here, not by ``mask``.
            masked_split = self.split.masked(hit_positions, replacement)
            return masked_split.text, count, mask

        return [make_sample(count) for count in draw_counts]