def generate_samples(text,             # type: TokenizedText
                     n_samples=500,    # type: int
                     bow=True,         # type: bool
                     random_state=None,
                     replacement='',   # type: str
                     min_replace=1,    # type: Union[int, float]
                     max_replace=1.0,  # type: Union[int, float]
                     group_size=1,     # type: int
                     ):
    # type: (...) -> Tuple[List[str], np.ndarray, np.ndarray]
    """
    Return ``n_samples`` changed versions of text (with some words removed),
    along with distances between the original text and the generated examples.
    If ``bow=False``, all tokens are considered unique
    (i.e. token position matters).
    """
    kwargs = dict(
        n_samples=n_samples,
        replacement=replacement,
        random_state=random_state,
        min_replace=min_replace,
        max_replace=max_replace,
    )
    if bow:
        num_tokens = len(text.vocab)
        res = text.replace_random_tokens_bow(**kwargs)
    else:
        num_tokens = len(text.tokens)
        res = text.replace_random_tokens(group_size=group_size, **kwargs)

    texts, num_removed_vec, masks = zip(*res)
    similarity = cosine_similarity_vec(num_tokens, num_removed_vec)
    return texts, similarity, vstack(masks)
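# ``cosine_similarity_vec`` is assumed here to measure how close each generated
# sample is to the original text: the original is treated as an all-ones token
# vector of length ``num_tokens`` and each sample as the same vector with
# ``num_removed`` entries zeroed. A minimal sketch under that assumption (the
# name ``cosine_similarity_vec_sketch`` is hypothetical, not the library API):
import numpy as np

def cosine_similarity_vec_sketch(num_tokens, num_removed_vec):
    # Cosine similarity between an all-ones vector of length num_tokens and
    # copies of it with num_removed entries set to zero. For such binary
    # vectors this reduces to sqrt((num_tokens - num_removed) / num_tokens).
    remaining = num_tokens - np.asarray(num_removed_vec, dtype=float)
    return np.sqrt(remaining / num_tokens)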
def test_vstack_sparse():
    res = vstack([
        sp.csr_matrix([1, 2, 3]),
        sp.csr_matrix([4, 5, 6]),
    ])
    assert res.shape == (2, 3)
    assert sp.issparse(res)
    assert np.array_equal(res.todense(), np.array([[1, 2, 3], [4, 5, 6]]))
def sample_near_with_mask(self,
                          doc,          # type: str
                          n_samples=1   # type: int
                          ):
    # type: (...) -> Tuple[List[str], np.ndarray, np.ndarray, TokenizedText]
    assert n_samples >= 1
    text = TokenizedText(doc, token_pattern=self.token_pattern)
    all_docs = []  # type: List[str]
    similarities = []
    masks = []
    for sampler, freq in self._sampler_n_samples(n_samples):
        docs, sims, mask, _text = sampler.sample_near_with_mask(text, freq)
        all_docs.extend(docs)
        similarities.append(sims)
        masks.append(mask)
    return all_docs, np.hstack(similarities), vstack(masks), text
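# ``self._sampler_n_samples`` is assumed to split the ``n_samples`` budget
# among several child samplers in proportion to their weights. A hypothetical
# sketch of such a split (names and the multinomial strategy are assumptions,
# not the library's actual helper):
import numpy as np

def _sampler_n_samples_sketch(samplers, weights, n_samples, rng):
    # Draw how many samples each child sampler should produce, proportionally
    # to its weight, and skip samplers that received a zero budget.
    p = np.asarray(weights, dtype=float)
    p = p / p.sum()
    counts = rng.multinomial(n_samples, p)
    return [(sampler, freq) for sampler, freq in zip(samplers, counts) if freq]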
def expanded_X_y_sample_weights(X, y_proba,
                                expand_factor=10,
                                sample_weight=None,
                                shuffle=True,
                                random_state=None):
    """
    scikit-learn can't optimize cross-entropy directly if target
    probability values are not indicator vectors.
    As a workaround this function expands the dataset according to
    target probabilities. ``expand_factor=None`` means no dataset
    expansion.
    """
    rng = check_random_state(random_state)
    if expand_factor:
        if sample_weight is not None:
            X, y, sample_weight = zip(*expand_dataset(
                X, y_proba,
                factor=expand_factor,
                random_state=rng,
                extra_arrays=[sample_weight]))
        else:
            X, y = zip(*expand_dataset(
                X, y_proba,
                factor=expand_factor,
                random_state=rng))
    else:
        y = y_proba.argmax(axis=1)

    if isinstance(X, (list, tuple)) and len(X) and issparse(X[0]):
        X = vstack(X)

    if shuffle:
        if sample_weight is not None:
            X, y, sample_weight = _shuffle(X, y, sample_weight,
                                           random_state=rng)
        else:
            X, y = _shuffle(X, y, random_state=rng)
    return X, y, sample_weight
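# A minimal sketch of the expansion workaround described in the docstring
# above: each example is repeated ``factor`` times and every copy receives a
# hard label drawn from its target probability vector, so ordinary log-loss
# fitting on the expanded data approximates training on soft targets. The
# name ``expand_dataset_sketch`` and the exact sampling strategy are
# assumptions for illustration, not the library's implementation.
import numpy as np
from sklearn.utils import check_random_state

def expand_dataset_sketch(X, y_proba, factor=10, random_state=None):
    # Yield (example, hard label) pairs; labels are sampled from y_proba rows.
    rng = check_random_state(random_state)
    n_classes = y_proba.shape[1]
    for x, probs in zip(X, y_proba):
        for _ in range(factor):
            yield x, rng.choice(n_classes, p=probs)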
def test_vstack_dense():
    res = vstack([np.array([1, 2, 3]), np.array([4, 5, 6])])
    assert res.shape == (2, 3)
    assert not sp.issparse(res)
    assert np.array_equal(res, np.array([[1, 2, 3], [4, 5, 6]]))
def test_vstack_empty():
    assert vstack([]).shape == (0,)
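# A behavior consistent with the three ``vstack`` tests above: empty input
# yields an empty dense array, sparse inputs stay sparse, dense inputs are
# stacked with numpy. This is a sketch under those assumptions, not the
# actual helper being tested.
import numpy as np
from scipy import sparse as sp

def vstack_sketch(blocks):
    # Dispatch on the first block: scipy keeps sparse stacks sparse,
    # numpy handles the dense (and empty) cases.
    if not blocks:
        return np.array([])
    if sp.issparse(blocks[0]):
        return sp.vstack(blocks)
    return np.vstack(blocks)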