Ejemplo n.º 1
def test_min_max_axis1():
    """Check row-wise ``min_max_axis`` on CSR and CSC input against NumPy.

    The same matrix is checked twice: once as float64 and once after being
    cast to float32.
    """
    X = np.array([[0, 3, 0],
                  [2, -1, 0],
                  [0, 0, 0],
                  [9, 8, 7],
                  [4, 0, 5]], dtype=np.float64)

    for dense in (X, X.astype(np.float32)):
        expected_min = dense.min(axis=1)
        expected_max = dense.max(axis=1)
        # CSR is checked before CSC, as in the unrolled form of this test.
        for to_sparse in (sp.csr_matrix, sp.csc_matrix):
            row_min, row_max = min_max_axis(to_sparse(dense), axis=1)
            assert_array_equal(row_min, expected_min)
            assert_array_equal(row_max, expected_max)
Ejemplo n.º 2
def test_min_max_axis1():
    """Verify ``min_max_axis(..., axis=1)`` for CSR and CSC matrices.

    Results are compared with the dense ``min``/``max`` along axis 1, first
    for float64 data and then for the same data cast to float32.
    """
    def check_rowwise(dense):
        # One CSR and one CSC view of the same dense data.
        for matrix in (sp.csr_matrix(dense), sp.csc_matrix(dense)):
            lows, highs = min_max_axis(matrix, axis=1)
            assert_array_equal(lows, dense.min(axis=1))
            assert_array_equal(highs, dense.max(axis=1))

    X = np.array([[0, 3, 0],
                  [2, -1, 0],
                  [0, 0, 0],
                  [9, 8, 7],
                  [4, 0, 5]], dtype=np.float64)

    check_rowwise(X)
    check_rowwise(X.astype(np.float32))
Ejemplo n.º 3
def test_min_max(
    dtype,
    axis,
    sparse_format,
    missing_values,
    min_func,
    max_func,
    ignore_nan,
    large_indices,
):
    """Compare ``min_max_axis`` on sparse input with dense NumPy reducers.

    All arguments are expected to come from the test harness (presumably
    pytest parametrization; the decorators are not visible in this view).

    Parameters
    ----------
    dtype : dtype used to build the dense reference array.
    axis : axis passed both to ``min_max_axis`` and to the reference reducers.
    sparse_format : callable turning the dense array into a sparse matrix.
    missing_values : value placed in two cells of the matrix.
    min_func, max_func : dense reducers called as ``f(X, axis=axis)``.
    ignore_nan : forwarded to ``min_max_axis``.
    large_indices : when truthy, ``indices``/``indptr`` are cast to int64.
    """
    X = np.array(
        [
            [0, 3, 0],
            [2, -1, missing_values],
            [0, 0, 0],
            [9, missing_values, 7],
            [4, 0, 5],
        ],
        dtype=dtype,
    )
    X_sparse = sparse_format(X)
    if large_indices:
        # Exercise the code path that handles 64-bit index arrays.
        X_sparse.indices = X_sparse.indices.astype("int64")
        X_sparse.indptr = X_sparse.indptr.astype("int64")

    mins_sparse, maxs_sparse = min_max_axis(
        X_sparse, axis=axis, ignore_nan=ignore_nan
    )
    assert_array_equal(mins_sparse, min_func(X, axis=axis))
    assert_array_equal(maxs_sparse, max_func(X, axis=axis))
Ejemplo n.º 4
def test_min_max(dtype, axis, sparse_format, missing_values, min_func,
                 max_func, ignore_nan):
    """Compare ``min_max_axis`` on sparse input with dense NumPy reducers.

    The arguments are expected to come from the test harness (presumably
    pytest parametrization; the decorators are not visible in this view).
    ``missing_values`` is written into two cells of the matrix, ``min_func``
    and ``max_func`` compute the dense reference along ``axis``, and
    ``ignore_nan`` is forwarded to ``min_max_axis``.
    """
    X = np.array([[0, 3, 0], [2, -1, missing_values], [0, 0, 0],
                  [9, missing_values, 7], [4, 0, 5]],
                 dtype=dtype)
    X_sparse = sparse_format(X)

    mins_sparse, maxs_sparse = min_max_axis(X_sparse, axis=axis,
                                            ignore_nan=ignore_nan)
    assert_array_equal(mins_sparse, min_func(X, axis=axis))
    assert_array_equal(maxs_sparse, max_func(X, axis=axis))
Ejemplo n.º 5
def test_min_max(dtype, axis, sparse_format, missing_values, min_func,
                 max_func, ignore_nan):
    """Compare ``min_max_axis`` on sparse input with dense NumPy reducers.

    The arguments are expected to come from the test harness (presumably
    pytest parametrization; the decorators are not visible in this view).
    ``missing_values`` is written into two cells of the matrix, ``min_func``
    and ``max_func`` compute the dense reference along ``axis``, and
    ``ignore_nan`` is forwarded to ``min_max_axis``.
    """
    X = np.array([[0, 3, 0],
                  [2, -1, missing_values],
                  [0, 0, 0],
                  [9, missing_values, 7],
                  [4, 0, 5]], dtype=dtype)
    X_sparse = sparse_format(X)

    mins_sparse, maxs_sparse = min_max_axis(X_sparse, axis=axis,
                                            ignore_nan=ignore_nan)
    assert_array_equal(mins_sparse, min_func(X, axis=axis))
    assert_array_equal(maxs_sparse, max_func(X, axis=axis))
def test_min_max(dtype, axis, sparse_format, missing_values, min_func,
                 max_func, ignore_nan, large_indices):
    """Compare ``min_max_axis`` on sparse input with dense NumPy reducers.

    The arguments are expected to come from the test harness (presumably
    pytest parametrization; the decorators are not visible in this view).
    ``missing_values`` is written into two cells of the matrix, ``min_func``
    and ``max_func`` compute the dense reference along ``axis``,
    ``ignore_nan`` is forwarded to ``min_max_axis``, and a truthy
    ``large_indices`` casts the sparse index arrays to int64 first.
    """
    X = np.array([[0, 3, 0],
                  [2, -1, missing_values],
                  [0, 0, 0],
                  [9, missing_values, 7],
                  [4, 0, 5]], dtype=dtype)
    X_sparse = sparse_format(X)
    if large_indices:
        # Exercise the code path that handles 64-bit index arrays.
        X_sparse.indices = X_sparse.indices.astype('int64')
        X_sparse.indptr = X_sparse.indptr.astype('int64')

    mins_sparse, maxs_sparse = min_max_axis(X_sparse, axis=axis,
                                            ignore_nan=ignore_nan)
    assert_array_equal(mins_sparse, min_func(X, axis=axis))
    assert_array_equal(maxs_sparse, max_func(X, axis=axis))
Ejemplo n.º 7
def test_min_max_axis_errors():
    """Check that ``min_max_axis`` rejects unsupported formats and axes.

    A LIL matrix must raise ``TypeError``; ``axis=2`` and ``axis=-3`` must
    raise ``ValueError``.
    """
    # NOTE(review): the tail of this call was cut off in the source view;
    # float64 matches the sibling tests built on the same matrix -- confirm.
    X = np.array([[0, 3, 0], [2, -1, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]],
                 dtype=np.float64)
    X_csr = sp.csr_matrix(X)
    X_csc = sp.csc_matrix(X)
    with pytest.raises(TypeError):
        min_max_axis(X_csr.tolil(), axis=0)
    with pytest.raises(ValueError):
        min_max_axis(X_csr, axis=2)
    with pytest.raises(ValueError):
        min_max_axis(X_csc, axis=-3)
Ejemplo n.º 8
def _inverse_binarize_multiclass(y, classes):
    """Inverse label binarization transformation for multiclass.

    Multiclass uses the maximal score instead of a threshold.
    classes = np.asarray(classes)

    if sp.issparse(y):
        # Find the argmax for each row in y where y is a CSR matrix

        y = y.tocsr()
        n_samples, n_outputs = y.shape
        outputs = np.arange(n_outputs)
        row_max = min_max_axis(y, 1)[1]
        row_nnz = np.diff(y.indptr)

        y_data_repeated_max = np.repeat(row_max, row_nnz)
        # picks out all indices obtaining the maximum per row
        y_i_all_argmax = np.flatnonzero(y_data_repeated_max == y.data)

        # For corner case where last row has a max of 0
        if row_max[-1] == 0:
            y_i_all_argmax = np.append(y_i_all_argmax, [len(y.data)])

        # Gets the index of the first argmax in each row from y_i_all_argmax
        index_first_argmax = np.searchsorted(y_i_all_argmax, y.indptr[:-1])
        # first argmax of each row
        y_ind_ext = np.append(y.indices, [0])
        y_i_argmax = y_ind_ext[y_i_all_argmax[index_first_argmax]]
        # Handle rows of all 0
        y_i_argmax[np.where(row_nnz == 0)[0]] = 0

        # Handles rows with max of 0 that contain negative numbers
        samples = np.arange(n_samples)[(row_nnz > 0) & (row_max.ravel() == 0)]
        for i in samples:
            ind = y.indices[y.indptr[i]:y.indptr[i + 1]]
            y_i_argmax[i] = classes[np.setdiff1d(outputs, ind)][0]

        return classes[y_i_argmax]
        return classes.take(y.argmax(axis=1), mode="clip")
Ejemplo n.º 9
    def fit(self, X):
        """Compute and store the per-vector norms of ``X`` in ``self.norms``.

        Reads ``self.norm`` ('l1', 'l2' or 'max') and ``self.axis`` (0 or 1),
        and also sets ``self.sparse_format`` ('csc' for axis 0, 'csr' for
        axis 1).

        :param X: array-like or sparse matrix to compute the norms of
        :return: nothing
        :raises ValueError: if ``self.norm`` or ``self.axis`` is unsupported
        """
        if self.norm not in ('l1', 'l2', 'max'):
            raise ValueError("'%s' is not a supported norm" % self.norm)

        if self.axis == 0:
            self.sparse_format = 'csc'
        elif self.axis == 1:
            self.sparse_format = 'csr'
        else:
            raise ValueError("'%d' is not a supported axis" % self.axis)

        # NOTE(review): the arguments around `estimator=` were cut off in the
        # source view; accept_sparse/dtype below are a reconstruction --
        # confirm against the original call.
        X = check_array(X,
                        accept_sparse=self.sparse_format,
                        estimator='the normalize function',
                        dtype=(np.float64, np.float32, np.float16))
        if self.axis == 0:
            # Work row-wise from here on.
            X = X.T

        if sparse.issparse(X):
            # NOTE(review): the 'l1' and 'l2' bodies were missing from the
            # source view; these compute the same quantities as the dense
            # branch below -- confirm against the original.
            if self.norm == 'l1':
                self.norms = np.asarray(abs(X).sum(axis=1)).ravel()
            elif self.norm == 'l2':
                self.norms = row_norms(X)
            elif self.norm == 'max':
                _, self.norms = min_max_axis(X, 1)
        else:
            if self.norm == 'l1':
                self.norms = np.abs(X).sum(axis=1)
            elif self.norm == 'l2':
                self.norms = row_norms(X)
            elif self.norm == 'max':
                self.norms = np.max(X, axis=1)
            self.norms = _handle_zeros_in_scale(self.norms, copy=False)
def normalize(X, norm='l2', axis=1, copy=True, return_norm=False, shrink=0):
    """Scale input vectors individually to unit norm (vector length).

    Read more in the :ref:`User Guide <preprocessing_normalization>`.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape [n_samples, n_features]
        The data to normalize, element by element.
        scipy.sparse matrices should be in CSR format to avoid an
        un-necessary copy.

    norm : 'l1', 'l2', or 'max', optional ('l2' by default)
        The norm to use to normalize each non zero sample (or each non-zero
        feature if axis is 0).

    axis : 0 or 1, optional (1 by default)
        axis used to normalize the data along. If 1, independently normalize
        each sample, otherwise (if 0) normalize each feature.

    copy : boolean, optional, default True
        set to False to perform inplace row normalization and avoid a
        copy (if the input is already a numpy array or a scipy.sparse
        CSR matrix and if axis is 1).

    return_norm : boolean, default False
        whether to return the computed norms

    shrink : number, optional (0 by default)
        Only used for sparse input with ``norm='l2'``, where it is passed
        unchanged to ``inplace_csr_row_normalize_l2``.
        NOTE(review): its effect is defined by that helper, which is not
        visible here.

    Returns
    -------
    X : {array-like, sparse matrix}, shape [n_samples, n_features]
        Normalized input X.

    norms : array, shape [n_samples] if axis=1 else [n_features]
        An array of norms along given axis for X.
        When X is sparse, a NotImplementedError will be raised
        for norm 'l1' or 'l2'.

    Raises
    ------
    ValueError
        If ``norm`` or ``axis`` is not one of the supported values.
    NotImplementedError
        If ``return_norm`` is true for sparse ``X`` with norm 'l1' or 'l2'.

    See also
    --------
    Normalizer: Performs normalization using the ``Transformer`` API
        (e.g. as part of a preprocessing :class:`sklearn.pipeline.Pipeline`).

    Notes
    -----
    For a comparison of the different scalers, transformers, and normalizers,
    see :ref:`examples/preprocessing/plot_all_scaling.py
    <sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
    """
    if norm not in ('l1', 'l2', 'max'):
        raise ValueError("'%s' is not a supported norm" % norm)

    if axis == 0:
        sparse_format = 'csc'
    elif axis == 1:
        sparse_format = 'csr'
    else:
        raise ValueError("'%d' is not a supported axis" % axis)

    # NOTE(review): the arguments around `estimator=` were cut off in the
    # source view; this follows the upstream scikit-learn call.  A float
    # dtype is required because the data is divided in place below.
    X = check_array(X,
                    accept_sparse=sparse_format,
                    copy=copy,
                    estimator='the normalize function',
                    dtype=(np.float64, np.float32, np.float16))
    if axis == 0:
        # Work row-wise from here on; transposed back before returning.
        X = X.T

    if sparse.issparse(X):
        if return_norm and norm in ('l1', 'l2'):
            raise NotImplementedError("return_norm=True is not implemented "
                                      "for sparse matrices with norm 'l1' "
                                      "or norm 'l2'")
        if norm == 'l1':
            # NOTE(review): this statement was missing from the source view;
            # restored from upstream scikit-learn -- confirm the helper (and
            # whether it should also receive `shrink`) in this code base.
            inplace_csr_row_normalize_l1(X)
        elif norm == 'l2':
            inplace_csr_row_normalize_l2(X, shrink)
        elif norm == 'max':
            _, norms = min_max_axis(X, 1)
            # One norm per stored element, so rows can be scaled via X.data.
            norms_elementwise = norms.repeat(np.diff(X.indptr))
            mask = norms_elementwise != 0
            X.data[mask] /= norms_elementwise[mask]
    else:
        if norm == 'l1':
            norms = np.abs(X).sum(axis=1)
        elif norm == 'l2':
            norms = row_norms(X)
        elif norm == 'max':
            norms = np.max(X, axis=1)
        norms = _handle_zeros_in_scale(norms, copy=False)
        X /= norms[:, np.newaxis]

    if axis == 0:
        X = X.T

    if return_norm:
        return X, norms
    else:
        return X
Ejemplo n.º 11
def csr_summ(x):
    """Return one flat vector of per-axis-0 summary statistics for ``x``.

    The mean, variance, minimum and maximum returned by
    ``sparsefuncs.mean_variance_axis`` and ``sparsefuncs.min_max_axis`` are
    each shifted by 0.005 and concatenated in that order.
    """
    col_mean, col_var = sparsefuncs.mean_variance_axis(x, 0)
    col_min, col_max = sparsefuncs.min_max_axis(x, 0)
    shifted = [stat + 0.005 for stat in (col_mean, col_var, col_min, col_max)]
    return np.hstack(shifted)
Ejemplo n.º 12
def CorrelationThreshold(X, threshold, kind):
    """Select features so that no kept pair correlates above ``threshold``.

    Constant features are dropped first. Then the most correlated remaining
    pair is repeatedly examined, and the feature of the pair with the higher
    mean absolute correlation to the other kept features is removed, until
    the strongest remaining correlation falls below ``threshold``.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Training set to compute correlations.
    threshold : float in [0.0, 1.0]
        Absolute correlation at or above which one feature of a pair is
        removed.
    kind : {'pearson', 'spearmanr'}
        Correlation to use. Only 'pearson' is supported for sparse ``X``.

    Returns
    -------
    support_mask : Boolean array for feature selection

    Raises
    ------
    BFE
        If ``threshold`` or ``kind`` is invalid, or if ``kind`` is not
        'pearson' for sparse input.
    """
    if not (0.0 <= threshold <= 1.0):
        raise BFE.from_errors([{'0100': 'Threshold value must in [0.0, 1.0]'}])
    if kind not in ('pearson', 'spearmanr'):
        raise BFE.from_errors([{'0100': "Kind must be 'pearson' or 'spearmanr"}])
    if issparse(X) and kind != 'pearson':
        raise BFE.from_errors([{'0100': "Only pearson correlation is supported with 'sparse matrices'"}])

    X = check_array(X, accept_sparse=['csc', 'csr'], dtype=[np.float64, np.float32])
    n_features = X.shape[1]
    if threshold == 1 or (1 in X.shape):
        # Nothing can be removed: keep every feature.
        # The builtin `bool` replaces the removed `np.bool` alias.
        support_mask = np.ones(n_features, dtype=bool)
        return support_mask

    # get constant features
    if issparse(X):
        mins, maxes = min_max_axis(X, axis=0)
        peak_to_peaks = maxes - mins
        constant_mask = np.isclose(peak_to_peaks, 0.0)

        # sparse correlation
        mu, sparse_var = mean_variance_axis(X, 0)
        X_corr = sparse_correlation(X, mu, ~constant_mask)
    else:
        peak_to_peaks = np.ptp(X, axis=0)
        constant_mask = np.isclose(peak_to_peaks, 0.0)

        if kind == 'pearson':
            X_corr = np.corrcoef(X, rowvar=False)
        else:  # spearmanr
            X_corr, _ = spearmanr(X)
            # spearmanr returns a scalar when comparing two columns
            if isinstance(X_corr, float):
                X_corr = np.array([[1, X_corr], [X_corr, 1]])

    # Only the magnitude of the correlation matters.
    np.fabs(X_corr, out=X_corr)

    # Removes constant features from support_mask
    support_mask = np.ones(n_features, dtype=bool)
    upper_idx = np.triu_indices(n_features, 1)
    non_constant_features = n_features

    for i in np.flatnonzero(constant_mask):
        feat_remove_mask = np.logical_and(upper_idx[0] != i,
                                          upper_idx[1] != i)
        upper_idx = (upper_idx[0][feat_remove_mask],
                     upper_idx[1][feat_remove_mask])
        support_mask[i] = False
        non_constant_features -= 1

    for _ in range(non_constant_features - 1):
        max_idx = np.argmax(X_corr[upper_idx])
        feat1, feat2 = upper_idx[0][max_idx], upper_idx[1][max_idx]
        cur_corr = X_corr[feat1, feat2]

        # max correlation is lower than threshold
        if cur_corr < threshold:
            break

        # Temporarily remove both features to calculate the mean with other
        # features. One of the features will be selected.
        support_mask[[feat1, feat2]] = False

        # if there are no other features to compare, keep the feature with
        # the most variance
        if np.all(~support_mask):
            if issparse(X):
                # sparse precalculates variance for all features
                var = sparse_var[[feat1, feat2]]
            else:
                var = np.var(X[:, [feat1, feat2]], axis=0)

            if var[0] < var[1]:
                support_mask[feat2] = True
            else:
                support_mask[feat1] = True
            break

        # mean with other features
        feat1_mean = np.mean(X_corr[feat1, support_mask])
        feat2_mean = np.mean(X_corr[feat2, support_mask])

        # feature with lower mean is kept
        if feat1_mean < feat2_mean:
            support_mask[feat1] = True
            feat_to_remove = feat2
        else:
            support_mask[feat2] = True
            feat_to_remove = feat1

        # remove the removed feature from consideration
        upper_idx_to_keep = np.logical_and(upper_idx[0] != feat_to_remove,
                                           upper_idx[1] != feat_to_remove)
        upper_idx = (upper_idx[0][upper_idx_to_keep],
                     upper_idx[1][upper_idx_to_keep])

    return support_mask
Ejemplo n.º 13
def get_ranked_phrases(nlp,
    # NOTE(review): the rest of the signature, the closing "):" and the
    # docstring delimiters are missing from this view. The lines below are
    # the docstring text; restore the parameter list from the original
    # source rather than guessing it.
    Get phrases ranked by either TF-IDF (importance) score or BNgram (novelty) score.

    nlp : spacy.language.Language
        Spacy language model

    raw_documents : Iterable[str]
        An iterable which yields either str objects.

    timestamps : Iterable[str]
        timestamp of the documents. An iterable which
        yields datetime objects. Only used when

    include_verb_phrases : bool, default=False
        Indicator to include verb phrases also.

    minlen : int, default=1
        Minimum length of extracted multi-word phrases.
        Used for tokenizing the text.

    maxlen : int, default=8
        Maximum length of extracted multi-word phrases.
        Used for tokenizing the text.

    n_jobs : int, default=-1
        Number of processes to get noun phrases in parallel
        from documents.
            * -1: Use one process per available CPU cores
            * >0: Use `n_jobs` processes

    batch_size : int, default=1000
        Batch size for tokenizing, tagging and extracting
        noun phrases. Use smaller batch sizes on large
        number of large texts and vice-versa.

    stop_phrases : List[str], default=[]
        List of phrases to remove.

    vectorizer : str, default='bngram'
        One of ('bngram', 'tfidf').

    aggfunc : Union[str, callable, NoneType], default='sum'
        Function to aggregate over the scores per document
        for a single phrase to rank. One of ('sum', 'mean',
        'max', 'median', 'median_ignore_0', callable that
        accepts sparse matrix, None). If None, this function
        will return the vectorized documents and the vectorizer

    vectorizer_kws : dict
        Keyword arguments for TfidfVectorizer

    ranked_phrases : Union[pandas.DataFrame, Tuple[array[N, M], vectorizer]]
        If aggfunc is not None, returns the dataframe with the extracted
        n-gram / phrase and sorted descending by the aggregated bngram /
        td-idf scores, else returns the vectorized documents (where
        N=len(raw_documents) and M=len(phrases)) and the vectorizer object,
    assert vectorizer in ('bngram', 'tfidf')
    stop_phrases = set(stop_phrases)

    # get candidate phrases
    # NOTE(review): no candidate-phrase setup code is visible here, and
    # include_verb_phrases / minlen / maxlen are not used anywhere in the
    # visible part of the function -- the setup appears to be missing.

    # extract phrases
    # NOTE(review): process_chunk is defined but never called in the visible
    # code.
    def process_chunk(texts):
        return list(nlp.pipe(texts))

    logger.info('Tokenizing, tagging and extracting noun phrases '
                'per documents with spacy')
    # -1 means one process per physical core (logical=False).
    n_jobs = psutil.cpu_count(logical=False)\
        if n_jobs == -1 else n_jobs
    raw_documents = list(
        nlp.pipe(raw_documents, batch_size=batch_size, n_process=n_jobs))

    # vectorize the texts
    # NOTE(review): in both `if` bodies below, the line opening the call that
    # takes these message strings and UserWarning is missing (presumably
    # `warnings.warn(` -- confirm).
    if 'norm' in vectorizer_kws and aggfunc is not None:
            "'vectorizer_kws' should not contain 'norm'. "
            "'vectorizer_kws['norm']' will be replaced.", UserWarning)
        vectorizer_kws['norm'] = None
    if 'analyzer' in vectorizer_kws:
            "'vectorizer_kws' should not contain 'analyzer'. "
            "'vectorizer_kws['analyzer']' will be replaced.", UserWarning)
    # The analyzer yields each document's noun phrases minus the stop phrases.
    # NOTE(review): the closing "]" of this list comprehension is missing.
    vectorizer_kws['analyzer'] = lambda doc: [
        p for p in doc._.noun_phrases if p not in stop_phrases
    if vectorizer == 'bngram':
        if timestamps is None:
            # NOTE(review): the closing ")" of this raise is missing.
            raise ValueError(
                'Parameter `timestamps` cannot be None if `vectorizer=bngram`.'
        vectorizer = BngramsVectorizer(**vectorizer_kws)
        logger.info('Vectorizing documents with BNgrams')
        X = vectorizer.fit_transform(raw_documents, timestamps)
    elif vectorizer == 'tfidf':
        vectorizer = TfidfVectorizer(**vectorizer_kws)
        logger.info('Vectorizing documents with TF-IDF')
        X = vectorizer.fit_transform(raw_documents)
        # NOTE(review): an `else:` appears to be missing before this raise;
        # as written it sits inside the 'tfidf' branch.
        raise ValueError(f'Unknown vectorizer={vectorizer} given.')

    logger.info('Scoring phrases')
    # Each branch reduces X along axis 0 to one score per phrase (column).
    if aggfunc == 'sum':
        scores = np.array(X.tocsc().sum(0))[0]
    elif aggfunc == 'mean':
        scores = np.array(X.tocsc().mean(0))[0]
    elif aggfunc == 'max':
        scores = min_max_axis(X.tocsc(), axis=0, ignore_nan=True)[1]
    elif aggfunc == 'median':
        scores = csc_median_axis_0(X.tocsc())
    elif aggfunc == 'median_ignore_0':
        scores = _get_median(X.tocsc(), 0)
    elif callable(aggfunc):
        scores = aggfunc(X.tocsc())
    elif aggfunc is None:
        return X, vectorizer
        # NOTE(review): an `else:` appears to be missing before this raise;
        # as written it is unreachable after the return above.
        raise ValueError(f'Unknown method: {aggfunc}')

    logger.info('Rank phrases based on score')
    ranked_phrases = pd.DataFrame(list(
        zip(vectorizer.get_feature_names(), scores)),
                                  columns=['phrase', 'score'])
    # NOTE(review): the chain below ends in a line continuation with nothing
    # after it; the final chained call is missing from this view.
    ranked_phrases = ranked_phrases\
        .sort_values('score', ascending=False)\

    return ranked_phrases