Example #1
import numpy as np
from scipy import sparse


def get_max_squared_sum(X):
    """Maximum of the row-wise sums of squares (squared L2 norms) of X."""
    if sparse.issparse(X):
        X = X.tocsr()
        # csr_row_norms computes the squared L2 norm of each CSR row
        from sklearn.utils import sparsefuncs_fast
        return sparsefuncs_fast.csr_row_norms(X).max()
    else:
        return np.sum(X**2, axis=1).max()
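A minimal usage sketch, assuming the imports above and a scikit-learn version that still exposes sklearn.utils.sparsefuncs_fast:

X_dense = np.array([[3.0, 4.0], [1.0, 0.0]])

# the largest squared row sum comes from the row [3, 4]: 3**2 + 4**2 = 25
print(get_max_squared_sum(X_dense))                     # 25.0
print(get_max_squared_sum(sparse.csr_matrix(X_dense)))  # 25.0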
Example #2
def test_smart_tfidf_transformer(scheme):
    # `documents` is module-level test data and `scheme` a parametrized
    # SMART weighting code such as 'nnc' or 'ltc'
    tf = CountVectorizer().fit_transform(documents)

    estimator = SmartTfidfTransformer(weighting=scheme)

    X = estimator.fit_transform(tf)

    scheme_t, scheme_d, scheme_n = _validate_smart_notation(scheme)
    if scheme_d not in 'dp':
        # the resulting document term matrix should be positive
        # (unless we use probabilistic idf weighting)
        assert (X.A >= 0).all()

    # norm cannot be zero
    X_norm = csr_row_norms(X)
    assert (X_norm > 0).all()

    X_ref = None
    if scheme == 'nnn':
        X_ref = X
    elif scheme == 'nnc':
        X_ref = TfidfVectorizer(use_idf=False,
                                smooth_idf=False).fit_transform(documents)
    elif scheme == 'ntc':
        X_ref = TfidfVectorizer(use_idf=True,
                                smooth_idf=False).fit_transform(documents)
    elif scheme == 'lnn':
        X_ref = TfidfVectorizer(use_idf=False,
                                sublinear_tf=True,
                                smooth_idf=False,
                                norm=None).fit_transform(documents)
    elif scheme == 'ltc':
        X_ref = TfidfVectorizer(use_idf=True,
                                sublinear_tf=True,
                                smooth_idf=False).fit_transform(documents)
    elif scheme == 'ltl':
        X_ref = TfidfVectorizer(use_idf=True,
                                sublinear_tf=True,
                                smooth_idf=False,
                                norm='l1').fit_transform(documents)

    if X_ref is not None:
        assert_allclose(X.A, X_ref.A, rtol=1e-7, atol=1e-6)

    assert len(estimator.dl_) == tf.shape[0]
    assert len(estimator.du_) == tf.shape[0]
    if scheme_d in 'tsp':
        assert len(estimator.df_) == tf.shape[1]

    X_2 = SmartTfidfTransformer(weighting=scheme).fit(tf).transform(tf)
    assert_allclose(X.A, X_2.A, rtol=1e-6, atol=1e-6)

    if scheme_d in 'stp':
        assert estimator.df_ is not None

    sl = slice(2)
    tf_w_sl = estimator.transform(tf[sl])
    assert_allclose(X[sl].A, tf_w_sl.A)
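The scheme string is a SMART code as documented in Example #5 below: one term weighting letter, one document weighting letter, and a normalization code with an optional pivoted 'p' suffix. A minimal sketch of how such a code splits into the (scheme_t, scheme_d, scheme_n) triple used above; split_smart_notation is a hypothetical stand-in for _validate_smart_notation, which is assumed to also validate each component:

def split_smart_notation(scheme):
    # 'ltc' -> ('l', 't', 'c'); 'lnup' -> ('l', 'n', 'up')
    return scheme[0], scheme[1], scheme[2:]

assert split_smart_notation('ltc') == ('l', 't', 'c')
assert split_smart_notation('lnup') == ('l', 'n', 'up')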
Example #3

import numpy as np
import pytest
import scipy.sparse as sp
import scipy.sparse.linalg  # noqa: F401 -- makes sp.linalg accessible
from numpy.testing import assert_allclose
from sklearn.utils.sparsefuncs_fast import csr_row_norms


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_csr_row_norms(dtype):
    # check that csr_row_norms matches the square of scipy.sparse.linalg.norm,
    # and that the output dtype is the same as X.dtype
    X = sp.random(100, 10, format='csr', dtype=dtype, random_state=42)

    scipy_norms = sp.linalg.norm(X, axis=1)**2
    norms = csr_row_norms(X)

    assert norms.dtype == dtype
    rtol = 1e-6 if dtype == np.float32 else 1e-7
    assert_allclose(norms, scipy_norms, rtol=rtol)
Example #4

import numpy as np
from scipy.sparse import issparse, csr_matrix
# csr_row_norms as used here lives in sklearn.utils.sparsefuncs_fast in the
# scikit-learn versions these snippets come from
from sklearn.utils.sparsefuncs_fast import csr_row_norms


def row_norms(X, squared=False):
    """Row-wise (squared) Euclidean norm of X.

    Equivalent to np.sqrt((X * X).sum(axis=1)), but also supports sparse
    matrices and does not create an X.shape-sized temporary.

    Performs no input validation.
    """
    if issparse(X):
        if not isinstance(X, csr_matrix):
            X = csr_matrix(X)
        norms = csr_row_norms(X)
    else:
        norms = np.einsum('ij,ij->i', X, X)

    if not squared:
        # in-place square root, reusing norms as the output buffer
        np.sqrt(norms, norms)
    return norms
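A quick check of the equivalence stated in the docstring, assuming the imports above (the variable names are only for the demo):

rng = np.random.RandomState(0)
X = rng.rand(5, 3)

expected = np.sqrt((X * X).sum(axis=1))
assert np.allclose(row_norms(X), expected)
assert np.allclose(row_norms(csr_matrix(X), squared=True), expected ** 2)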
Example #5

import warnings

import numpy as np
import scipy.sparse as sp
from sklearn.exceptions import DataConversionWarning
from sklearn.preprocessing import normalize
from sklearn.utils import check_array
from sklearn.utils.sparsefuncs_fast import csr_row_norms

# _validate_smart_notation, _document_frequency and _mean_csr_nonzero_axis1
# are helpers assumed to be defined in the same module as this function.


def _smart_tfidf(tf, weighting, df=None, df_n_samples=None, norm_alpha=0.75,
                 norm_pivot=None, return_pivot=False):
    """
    Apply TF-IDF feature weighting using the SMART notation.


    Parameters
    ----------
    df : sparse csr array
      the term frequency matrix (n_documents, n_features)

    weighting : str, default='nnc'
      the SMART notation for document term weighting and normalization.
      In the form [nlabL][ntspd][nclu][p] , see
      https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System

    df : array, shape=[n_features], optional
      precomputed document frequency array (n_features,).
      If not provided, it will be recomputed if necessary. Both df and
      df_n_samples must be provided at the same time.

    df_n_samples : float, default=None
      when using a precomputed document frequency array, the number of
      documents that were used to compute the df. Both df and
      df_n_samples must be provided at the same time.

    norm_alpha : float, default=0.75
      the alpha parameter in the pivoted normalization. Only used when
      weighting='???p'.

    norm_pivot : float, default=None
      the pivot value used for the normalization. If not provided, and
      weighting='???p', it is computed as the mean of the norm(tf*idf).

    return_pivot : bool, default=False
      whether to also return the computed norm_pivot

    Returns
    -------

    X : sparse csr array
      the weighted term frequency matrix

    norm_pivot : float
      the norm pivot (only returned when return_pivot=True)

    References
    ----------
    .. [Manning2008] C.D. Manning, P. Raghavan, H. Schütze,
       `"Document and query weighting schemes"
       <https://nlp.stanford.edu/IR-book/html/htmledition/document-and-query-weighting-schemes-1.html>`_ ,
       2008
    .. [Singhal1996] A. Singhal, C. Buckley, and M. Mitra.
       `"Pivoted document length normalization."
       <https://ecommons.cornell.edu/bitstream/handle/1813/7217/95-1560.pdf?sequence=1>`_ , 1996
    """  # noqa

    tf = check_array(tf, accept_sparse=['csr'])
    if (df is None) != (df_n_samples is None):
        raise ValueError(('df is None: {}, df_n_samples is None: {}; both '
                          'should be either provided or not provided')
                         .format(df is None, df_n_samples is None))
    if df is not None:
        df = check_array(df, ensure_2d=False)
        if df.shape[0] != tf.shape[1]:
            raise ValueError(('df array provided with n_features={}, '
                              'while the tf array has n_features={}')
                             .format(df.shape[0], tf.shape[1]))

    if not 0 <= norm_alpha <= 1:
        raise ValueError('norm_alpha={} not in [0, 1]'.format(norm_alpha))

    n_samples, n_features = tf.shape
    if df_n_samples is None:
        df_n_samples = n_samples

    scheme_t, scheme_d, scheme_n = _validate_smart_notation(weighting)

    X = tf

    # term weighting
    if scheme_t == 'n':
        pass
    elif scheme_t == 'l':
        X.data = 1 + np.log(tf.data)
    elif scheme_t == 'd':
        X.data = 1 + np.log(1 + np.log(tf.data))
    elif scheme_t == 'a':
        max_tf = np.squeeze(tf.max(axis=1).A)
        # if max_tf is zero, the tf are going to be all zero anyway
        # so we set it to 1 in order to prevent overflows
        max_tf[max_tf == 0] = 1
        _max_tf_diag = sp.spdiags(1. / max_tf, diags=0, m=n_samples,
                                  n=n_samples, format='csr')
        X = 0.5 * _max_tf_diag.dot(tf)
        X.data += 0.5

    elif scheme_t == 'b':
        X.data = tf.data.astype('bool').astype('int')
    elif scheme_t == 'L':
        mean_tf = _mean_csr_nonzero_axis1(tf)
        # if mean_tf is zero, the tf are going to be all zero anyway
        # so we set it to 1 in order to prevent overflows
        mean_tf[mean_tf == 0] = 1.0
        mean_tf = (1 + np.log(mean_tf))
        _mean_tf_diag = sp.spdiags(1./mean_tf, diags=0, m=n_samples,
                                   n=n_samples, format='csr')

        X.data = (1 + np.log(tf.data))
        X = _mean_tf_diag.dot(X)
    else:
        raise ValueError('unknown term weighting: {}'.format(scheme_t))

    # document weighting
    if scheme_d == 'n':
        pass
    elif scheme_d in 'tpsd':
        if df is None:
            df = _document_frequency(tf)
        if scheme_d == 't':
            idf = np.log(float(df_n_samples) / df) + 1.0
        elif scheme_d == 's':
            idf = np.log(float(df_n_samples + 1) / (df + 1)) + 1.0
        elif scheme_d == 'p':
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore",
                                        message="divide by zero encountered in log",  # noqa
                                        category=RuntimeWarning)
                idf = np.log((float(df_n_samples) - df)/df)
        elif scheme_d == 'd':
            idf = np.log((float(df_n_samples) + 1 - df)/(df + 1))
        _idf_diag = sp.spdiags(idf, diags=0, m=n_features,
                               n=n_features, format='csr')
        X = X.dot(_idf_diag)
    else:
        raise ValueError('unknown document weighting: {}'.format(scheme_d))

    # normalization
    if scheme_n == 'n':
        pass
    elif scheme_n == 'c':
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore",
                                    category=DataConversionWarning)
            X = normalize(X, norm="l2", copy=False)
    elif scheme_n == 'l':
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore",
                                    category=DataConversionWarning)
            X = normalize(X, norm="l1", copy=False)
    elif scheme_n == 'u':
        X_norm = np.diff(X.indptr)
        X_norm[X_norm == 0] = 1.
        # empty documents (with a zero norm) don't need to be normalized
        _diag_norm = sp.spdiags(1./X_norm, diags=0, m=n_samples,
                                n=n_samples, format='csr')
        X = _diag_norm.dot(X)
    elif scheme_n in ['cp', 'lp', 'up']:
        if scheme_n == 'cp':
            X_norm = np.sqrt(csr_row_norms(X))
        elif scheme_n == 'lp':
            X_data = X.data.copy()
            X.data = np.abs(X.data)
            X_norm = np.squeeze(X.sum(axis=1).A)
            X.data = X_data
        elif scheme_n == 'up':
            X_norm = np.diff(X.indptr)

        if norm_pivot is None:
            norm_pivot = X_norm.mean()

        # empty documents (with a zero norm) don't need to be normalized
        X_norm[X_norm == 0] = 1.

        pivoted_norm = (1 - norm_alpha)*norm_pivot + norm_alpha*X_norm
        _diag_pivoted_norm = sp.spdiags(1./pivoted_norm, diags=0, m=n_samples,
                                        n=n_samples, format='csr')
        X = _diag_pivoted_norm.dot(X)
    else:
        raise ValueError('unknown normalization: {}'.format(scheme_n))
    if return_pivot:
        return X, norm_pivot
    else:
        return X
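All three pivoted schemes ('cp', 'lp', 'up') end in the same step: each row is divided by a blend of its own norm and the pivot. A standalone numpy sketch of that step, with invented norm values:

import numpy as np

X_norm = np.array([1.0, 2.0, 4.0])   # per-document norms
norm_alpha = 0.75
norm_pivot = X_norm.mean()           # the default pivot, as computed above

pivoted_norm = (1 - norm_alpha) * norm_pivot + norm_alpha * X_norm
print(pivoted_norm)  # [1.3333..., 2.0833..., 3.5833...]
# documents with a norm above the pivot are divided by less than their full
# norm, so long documents are penalized less than under plain cosine, L1, or
# unique-terms normalization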