# Example 1
    def transform(self, X):
        """Apply the normalization learned during ``fit`` to ``X``.

        :param X: list
        :return: list
        """
        X = check_array(X,
                        self.sparse_format,
                        copy=self.copy,
                        estimator='the normalize function',
                        dtype=FLOAT_DTYPES)

        # Work on the transpose when normalizing along axis 0 so the
        # row-wise code below covers both axes.
        work = X.T if self.axis == 0 else X

        if sparse.issparse(work):
            if self.norm == 'l1':
                inplace_csr_row_normalize_l1(work)
            elif self.norm == 'l2':
                inplace_csr_row_normalize_l2(work)
            elif self.norm == 'max':
                # Expand the fitted per-row norms to one value per stored
                # entry; leave rows with a zero norm untouched.
                per_entry_norms = self.norms.repeat(np.diff(work.indptr))
                nonzero = per_entry_norms != 0
                work.data[nonzero] /= per_entry_norms[nonzero]
        else:
            work /= self.norms[:, np.newaxis]

        return work.T if self.axis == 0 else work
# Example 2
 def transform(self, titles, conts):
     """Transform parallel (title, content) document pairs into an
     L2-row-normalized CSR feature matrix over the fitted vocabulary.

     :param titles: list of title strings.
     :param conts: list of content strings, same length as ``titles``.
     :return: scipy.sparse.csr_matrix of shape
         (len(titles), len(self.vocabulary_)).
     :raises ValueError: if inputs are not lists, lengths differ, or the
         feature vocabulary has not been built yet.
     """
     if not isinstance(titles, list) or not isinstance(conts, list):
         raise ValueError('List of doc string expected.')
     if len(titles) != len(conts):
         raise ValueError('Docs and titles must have the same length.')
     if not self.vocabulary_ready_:
         raise ValueError('Feature vocabulary not initialized yet! '
                          'Can not do transforming.')

     def _accumulate(counter, name_freq_pairs):
         # Fold (feature name, frequency) pairs that appear in the
         # vocabulary into the per-document counter.
         for fea_name, fea_freq in name_freq_pairs:
             if fea_name in self.vocabulary_:
                 fea_idx = self.vocabulary_[fea_name]
                 counter[fea_idx] = counter.get(fea_idx, 0) + fea_freq

     # Re-construct feature vector in raw CSR form.
     values = []
     j_indices = []
     indptr = [0]
     for i, (title, cont) in enumerate(zip(titles, conts)):
         if i % 10000 == 0:
             logging.info('Finished transforming %d lines' % i)
         feature_counter = {}
         # n-gram features from both title and content (count 1 each).
         ngrams = (self.ngram_feature_obj.get_ngrams(title, prefix=True) +
                   self.ngram_feature_obj.get_ngrams(cont, prefix=True))
         _accumulate(feature_counter, ((name, 1) for name in ngrams))
         # Named entity normalization features.
         doc = ' '.join([title, cont]).strip()
         _accumulate(feature_counter,
                     self.ne_feature_obj.transform_one(doc).items())
         # Manual features.
         _accumulate(feature_counter,
                     self.mn_feature_obj.transform_one(title, cont).items())
         # Update csr_matrix data.
         j_indices.extend(feature_counter.keys())
         values.extend(feature_counter.values())
         indptr.append(len(j_indices))

     j_indices = np.asarray(j_indices, dtype=np.intc)
     indptr = np.asarray(indptr, dtype=np.intc)
     values = np.asarray(values, dtype=np.float64)
     X = sp.csr_matrix((values, j_indices, indptr),
                       shape=(len(indptr) - 1, len(self.vocabulary_)))
     X.sort_indices()
     inplace_csr_row_normalize_l2(X)
     return X
# Example 3
    def fit(self, X):
        """Learn the per-row (or per-column) norms used by ``transform``.

        :param X: list
        :return: self, so calls can be chained as ``fit(X).transform(X)``
        :raises ValueError: if ``self.norm`` or ``self.axis`` is unsupported
        """
        if self.norm not in ('l1', 'l2', 'max'):
            raise ValueError("'%s' is not a supported norm" % self.norm)

        # axis 0 -> normalize features (column-major layout is cheaper),
        # axis 1 -> normalize samples.
        if self.axis == 0:
            self.sparse_format = 'csc'
        elif self.axis == 1:
            self.sparse_format = 'csr'
        else:
            raise ValueError("'%d' is not a supported axis" % self.axis)

        X = check_array(X,
                        self.sparse_format,
                        copy=self.copy,
                        estimator='the normalize function',
                        dtype=FLOAT_DTYPES)
        if self.axis == 0:
            X = X.T

        if sparse.issparse(X):
            # NOTE(review): for sparse l1/l2 the normalization is applied
            # here and recomputed again in transform; only the 'max' norm
            # stores fitted statistics.
            if self.norm == 'l1':
                inplace_csr_row_normalize_l1(X)
            elif self.norm == 'l2':
                inplace_csr_row_normalize_l2(X)
            elif self.norm == 'max':
                _, self.norms = min_max_axis(X, 1)
        else:
            if self.norm == 'l1':
                self.norms = np.abs(X).sum(axis=1)
            elif self.norm == 'l2':
                self.norms = row_norms(X)
            elif self.norm == 'max':
                self.norms = np.max(X, axis=1)
            # Replace zero norms so transform never divides by zero.
            self.norms = _handle_zeros_in_scale(self.norms, copy=False)

        return self
# Example 4
 def normalize(self, X, norm='l2', axis=1, copy=True):
     """Scale ``X`` so that each sample (axis=1) or each feature (axis=0)
     has unit norm.

     Parameters
     ----------
     X : array or scipy.sparse matrix with shape [n_samples, n_features]
         The data to normalize, element by element. scipy.sparse matrices
         should be in CSR format to avoid an un-necessary copy.

     norm : 'l1' or 'l2', optional ('l2' by default)
         The norm used for each non-zero sample (or non-zero feature when
         ``axis`` is 0).

     axis : 0 or 1, optional (1 by default)
         Axis to normalize along. 1 normalizes each sample independently;
         0 normalizes each feature.

     copy : boolean, optional, default is True
         Set to False to normalize in place without a copy (only possible
         when the input is already a numpy array or a scipy.sparse CSR
         matrix and ``axis`` is 1).

     See also
     --------
     :class:`sklearn.preprocessing.Normalizer` to perform normalization
     using the ``Transformer`` API (e.g. as part of a preprocessing
     :class:`sklearn.pipeline.Pipeline`)
     """
     if norm not in ('l1', 'l2'):
         raise ValueError("'%s' is not a supported norm" % norm)

     if axis == 1:
         sparse_format = 'csr'
     elif axis == 0:
         sparse_format = 'csc'
     else:
         raise ValueError("'%d' is not a supported axis" % axis)

     X = check_arrays(X, sparse_format=sparse_format, copy=copy)[0]
     warn_if_not_float(X, 'The normalize function')

     transposed = (axis == 0)
     if transposed:
         X = X.T

     if sparse.issparse(X):
         # norm was validated above, so this lookup cannot fail.
         row_normalizer = {'l1': inplace_csr_row_normalize_l1,
                           'l2': inplace_csr_row_normalize_l2}[norm]
         row_normalizer(X)
     else:
         if norm == 'l1':
             norms = np.abs(X).sum(axis=1)
         else:
             norms = row_norms(X)
         norms[norms == 0.0] = 1.0
         X /= norms[:, np.newaxis]

     return X.T if transposed else X
# Example 5
def normalize(X, norm='l2', axis=1, copy=True):
    """Scale input vectors individually to unit norm (vector length).

    Parameters
    ----------
    X : array or scipy.sparse matrix with shape [n_samples, n_features]
        The data to normalize, element by element.
        scipy.sparse matrices should be in CSR format to avoid an
        un-necessary copy.

    norm : 'l1' or 'l2', optional ('l2' by default)
        The norm to use to normalize each non zero sample (or each non-zero
        feature if axis is 0).

    axis : 0 or 1, optional (1 by default)
        axis used to normalize the data along. If 1, independently normalize
        each sample, otherwise (if 0) normalize each feature.

    copy : boolean, optional, default True
        set to False to perform inplace row normalization and avoid a
        copy (if the input is already a numpy array or a scipy.sparse
        CSR matrix and if axis is 1).

    See also
    --------
    :class:`sklearn.preprocessing.Normalizer` to perform normalization
    using the ``Transformer`` API (e.g. as part of a preprocessing
    :class:`sklearn.pipeline.Pipeline`)
    """
    if norm not in ('l1', 'l2'):
        raise ValueError("'%s' is not a supported norm" % norm)

    if axis == 0:
        sparse_format = 'csc'
    elif axis == 1:
        sparse_format = 'csr'
    else:
        raise ValueError("'%d' is not a supported axis" % axis)

    X = check_array(X, sparse_format, copy=copy)
    warn_if_not_float(X, 'The normalize function')
    if sparse.issparse(X):
        # BUGFIX: coerce to float64 BEFORE any transpose.  The in-place CSR
        # normalizers need floating point data, but running this check_array
        # after ``X = X.T`` (as before) converted the transposed matrix back
        # to ``sparse_format``, undoing the transpose's format flip and
        # normalizing along the wrong axis when axis == 0.
        X = check_array(X, accept_sparse=sparse_format, dtype=np.float64)
    if axis == 0:
        X = X.T

    if sparse.issparse(X):
        if norm == 'l1':
            inplace_csr_row_normalize_l1(X)
        elif norm == 'l2':
            inplace_csr_row_normalize_l2(X)
    else:
        if norm == 'l1':
            norms = np.abs(X).sum(axis=1)
            norms[norms == 0.0] = 1.0
        elif norm == 'l2':
            norms = row_norms(X)
            norms[norms == 0.0] = 1.0
        X /= norms[:, np.newaxis]

    if axis == 0:
        X = X.T

    return X