Example #1
0
def generate_samples(
        text,  # type: TokenizedText
        n_samples=500,  # type: int
        bow=True,  # type: bool
        random_state=None,
        replacement='',  # type: str
        min_replace=1,  # type: Union[int, float]
        max_replace=1.0,  # type: Union[int, float]
        group_size=1,  # type: int
):
    # type: (...) -> Tuple[List[str], np.ndarray, np.ndarray]
    """
    Generate ``n_samples`` perturbed variants of ``text`` with some tokens
    replaced, returning the variant texts, per-variant cosine similarities
    to the original, and the stacked removal masks.

    With ``bow=True`` tokens are treated as a bag of words (position does
    not matter); with ``bow=False`` each token position is distinct and
    ``group_size`` adjacent tokens are replaced together.
    """
    common = dict(
        n_samples=n_samples,
        replacement=replacement,
        random_state=random_state,
        min_replace=min_replace,
        max_replace=max_replace,
    )
    if bow:
        # Bag-of-words: similarity is measured against the vocabulary size.
        total_tokens = len(text.vocab)
        generated = text.replace_random_tokens_bow(**common)
    else:
        # Positional: every token occurrence counts separately.
        total_tokens = len(text.tokens)
        generated = text.replace_random_tokens(group_size=group_size, **common)

    texts, num_removed, masks = zip(*generated)
    similarity = cosine_similarity_vec(total_tokens, num_removed)
    return texts, similarity, vstack(masks)
Example #2
0
def test_vstack_sparse():
    """vstack of two sparse row vectors stays sparse and stacks row-wise."""
    rows = [
        sp.csr_matrix([1, 2, 3]),
        sp.csr_matrix([4, 5, 6]),
    ]
    stacked = vstack(rows)
    assert stacked.shape == (2, 3)
    assert sp.issparse(stacked)
    expected = np.array([[1, 2, 3], [4, 5, 6]])
    assert np.array_equal(stacked.todense(), expected)
Example #3
0
 def sample_near_with_mask(self,
                           doc,         # type: str
                           n_samples=1  # type: int
                           ):
     # type: (...) -> Tuple[List[str], np.ndarray, np.ndarray, TokenizedText]
     """
     Tokenize ``doc`` and draw ``n_samples`` nearby documents from the
     configured samplers, returning the generated documents, their
     similarities to the original, the stacked replacement masks, and
     the tokenized original text.
     """
     assert n_samples >= 1
     tokenized = TokenizedText(doc, token_pattern=self.token_pattern)
     generated_docs = []  # type: List[str]
     sims_parts = []
     mask_parts = []
     # Each sampler contributes its allotted share of the n_samples budget.
     for sampler, count in self._sampler_n_samples(n_samples):
         new_docs, sims, mask, _ = sampler.sample_near_with_mask(tokenized, count)
         generated_docs += new_docs
         sims_parts.append(sims)
         mask_parts.append(mask)
     return generated_docs, np.hstack(sims_parts), vstack(mask_parts), tokenized
Example #4
0
def expanded_X_y_sample_weights(X,
                                y_proba,
                                expand_factor=10,
                                sample_weight=None,
                                shuffle=True,
                                random_state=None):
    """
    scikit-learn can't optimize cross-entropy directly if target
    probability values are not indicator vectors.
    As a workaround this function expands the dataset according to
    target probabilities. ``expand_factor=None`` means no dataset
    expansion.

    Returns the (possibly expanded and shuffled) ``X``, hard labels
    ``y``, and ``sample_weight`` (unchanged when no weights were given).
    """
    rng = check_random_state(random_state)
    if not expand_factor:
        # No expansion: collapse target probabilities to hard labels.
        y = y_proba.argmax(axis=1)
    elif sample_weight is None:
        X, y = zip(*expand_dataset(
            X, y_proba, factor=expand_factor, random_state=rng))
    else:
        # Expand weights in lockstep with the samples.
        X, y, sample_weight = zip(*expand_dataset(
            X,
            y_proba,
            factor=expand_factor,
            random_state=rng,
            extra_arrays=[sample_weight]))

    # Re-assemble sparse rows into a single sparse matrix.
    if isinstance(X, (list, tuple)) and len(X) and issparse(X[0]):
        X = vstack(X)

    if shuffle:
        if sample_weight is None:
            X, y = _shuffle(X, y, random_state=rng)
        else:
            X, y, sample_weight = _shuffle(
                X, y, sample_weight, random_state=rng)
    return X, y, sample_weight
Example #5
0
def test_vstack_dense():
    """vstack of dense 1-D arrays yields a dense 2-D array."""
    parts = [np.array([1, 2, 3]), np.array([4, 5, 6])]
    stacked = vstack(parts)
    assert stacked.shape == (2, 3)
    assert not sp.issparse(stacked)
    assert np.array_equal(stacked, np.array([[1, 2, 3], [4, 5, 6]]))
Example #6
0
def test_vstack_empty():
    """vstack of an empty list produces an empty array of shape (0,)."""
    result = vstack([])
    assert result.shape == (0, )