Example 1
from typing import Optional

import scipy.sparse
import torch
from sklearn.preprocessing import normalize as sklearn_normalize


def marginal_score(
    entity_relation_batch: torch.LongTensor,
    per_entity: Optional[scipy.sparse.csr_matrix],
    per_relation: Optional[scipy.sparse.csr_matrix],
    num_entities: int,
) -> torch.FloatTensor:
    """Shared code for computing entity scores from marginals."""
    batch_size = entity_relation_batch.shape[0]

    # base case
    if per_entity is None and per_relation is None:
        return torch.full(size=(batch_size, num_entities),
                          fill_value=1 / num_entities)

    e, r = entity_relation_batch.cpu().numpy().T

    if per_relation is not None and per_entity is None:
        scores = per_relation[r]
    elif per_relation is None and per_entity is not None:
        scores = per_entity[e]
    elif per_relation is not None and per_entity is not None:
        e_score = per_entity[e]
        r_score = per_relation[r]
        scores = e_score.multiply(r_score)
        scores = sklearn_normalize(scores, norm="l1", axis=1)
    else:
        raise AssertionError  # for mypy

    # note: we have to densify here in order to return a torch tensor; otherwise
    # we could stay sparse, with a potentially huge memory benefit on large datasets!
    return torch.from_numpy(scores.todense())
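A minimal calling sketch, using marginal_score as defined above; the batch contents and the random marginals are made-up toy data, not from the original project. Each row of the batch holds an (entity, relation) index pair, and the result has one score row per batch element:

num_entities = 4
batch = torch.as_tensor([[0, 1], [2, 0]], dtype=torch.long)  # (entity, relation) pairs
# toy l1-normalized marginals: one row per entity resp. per relation
per_entity = sklearn_normalize(
    scipy.sparse.random(num_entities, num_entities, density=0.5, format="csr"), norm="l1")
per_relation = sklearn_normalize(
    scipy.sparse.random(2, num_entities, density=0.5, format="csr"), norm="l1")

scores = marginal_score(batch, per_entity, per_relation, num_entities)
print(scores.shape)  # torch.Size([2, 4])

The final l1 normalization renormalizes each non-empty score row to sum to one; all-zero rows stay zero.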
Example 2
 def test_normalize(self, matrix, norm, axis):
     if norm > 2:
         raise ValueError
     str_norm = 'l1' if norm == 1 else 'l2' if norm == 2 else 'max'
     # assert the result; a bare np.allclose(...) would let mismatches pass silently
     assert np.allclose(
         normalize(matrix, norm, axis).numpy(),
         sklearn_normalize(matrix.numpy(), norm=str_norm, axis=axis))
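For reference, the scikit-learn side of that comparison can be exercised on its own (toy matrix assumed; note that in current scikit-learn, axis is keyword-only in normalize):

import numpy as np
from sklearn.preprocessing import normalize as sklearn_normalize

m = np.array([[3.0, 4.0], [1.0, 1.0]])
print(sklearn_normalize(m, norm='l2', axis=1))  # [[0.6 0.8], [0.7071... 0.7071...]]
print(sklearn_normalize(m, norm='l1', axis=1))  # each row sums to 1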
Example 3
    def execute(cls, ctx, op):
        (x, ), device_id, xp = as_same_device(
            [ctx[inp.key] for inp in op.inputs],
            device=op.device,
            ret_extra=True)
        axis = op.axis
        return_norm = op.return_norm
        norm = op.norm
        outs = op.outputs

        with device(device_id):
            if device_id < 0 and op.use_sklearn and sklearn_normalize is not None:
                # no GPU
                try:
                    if xp is sparse:
                        if axis == 0:
                            xm = x.raw.tocsc()
                        else:
                            xm = x.raw
                    else:
                        xm = x
                    ret = sklearn_normalize(xm,
                                            norm=norm,
                                            axis=axis,
                                            return_norm=return_norm)
                    normed = None
                    if return_norm:
                        ret, normed = ret
                    if issparse(ret):
                        ret = sparse.SparseNDArray(ret)
                    ctx[outs[0].key] = ret
                    if normed is not None:
                        ctx[outs[1].key] = normed
                    return
                except NotImplementedError:
                    pass

            # fall back
            if axis == 0:
                x = x.T

            if norm == 'l1':
                norms = xp.abs(x).sum(axis=1)
            elif norm == 'l2':
                norms = xp.sqrt((x**2).sum(axis=1))
            else:
                norms = xp.max(x, axis=1)
                if issparse(norms):
                    norms = norms.toarray()
            norms[norms == 0.0] = 1.0
            x = x / norms[:, np.newaxis]

            if axis == 0:
                x = x.T

            ctx[outs[0].key] = x
            if return_norm:
                ctx[outs[1].key] = norms
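The fallback branch above re-implements scikit-learn's row-wise normalization by hand. A standalone NumPy sketch (toy data assumed) showing that the manual l1 computation matches sklearn's, including the guard for all-zero rows:

import numpy as np
from sklearn.preprocessing import normalize as sklearn_normalize

x = np.array([[1.0, 3.0], [0.0, 0.0]])
norms = np.abs(x).sum(axis=1)
norms[norms == 0.0] = 1.0                    # avoid dividing empty rows by zero
assert np.allclose(x / norms[:, np.newaxis],
                   sklearn_normalize(x, norm='l1'))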
Example 4
from sklearn.preprocessing import normalize as sklearn_normalize


def normalize(dm, norm="l2"):
    feature_types = dm.feature_types
    numerical_index = [i for i in range(len(feature_types))
                       if feature_types[i] in ("Float", "Discrete")]

    (train_x, _), (valid_x, _), (test_x, _) = dm.get_train(), dm.get_val(), dm.get_test()

    train_x[:, numerical_index] = sklearn_normalize(train_x[:, numerical_index], norm)
    dm.train_X = train_x

    if valid_x is not None:
        valid_x[:, numerical_index] = sklearn_normalize(valid_x[:, numerical_index], norm)
        dm.val_X = valid_x

    if test_x is not None:
        test_x[:, numerical_index] = sklearn_normalize(test_x[:, numerical_index], norm)
        dm.test_X = test_x

    return dm
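The core trick here is normalizing only the selected numerical columns in place. A minimal sketch on a plain NumPy array; the column split is made up for illustration:

import numpy as np
from sklearn.preprocessing import normalize as sklearn_normalize

X = np.array([[3.0, 4.0, 9.0],
              [1.0, 1.0, 7.0]])
numerical_index = [0, 1]                     # pretend column 2 is categorical
X[:, numerical_index] = sklearn_normalize(X[:, numerical_index], "l2")
print(X)  # [[0.6 0.8 9.0], [0.7071... 0.7071... 7.0]]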
Example 5
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer as sklearn_TfidfVectorizer
from sklearn.preprocessing import normalize as sklearn_normalize


def tf_idf(corpus, normalize_results=True):
    """Compute the TF-IDF on the corpus.
    Args:
        corpus (list): a list of text strings
        normalize_results (bool): Should the TF-IDF results be normalized (Optional - True default)
    Returns:
        tfidf_df (DataFrame): a data frame with the TF-IDF values
    """
    tfidf = sklearn_TfidfVectorizer()
    results = tfidf.fit_transform(corpus)
    results = results.toarray()
    if normalize_results:
        results = sklearn_normalize(results)
    tfidf_df = pd.DataFrame(
        results,
        columns=tfidf.get_feature_names(),
        index=range(1, len(corpus) + 1),
    )
    return tfidf_df
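Note that TfidfVectorizer already applies l2 normalization by default (norm="l2"), so with default settings the extra sklearn_normalize call changes nothing; l2 normalization is idempotent. A sketch of the same pipeline against a recent scikit-learn, where get_feature_names was replaced by get_feature_names_out (toy corpus assumed):

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize as sklearn_normalize

corpus = ["the cat sat", "the dog sat on the mat"]
vec = TfidfVectorizer()
results = sklearn_normalize(vec.fit_transform(corpus).toarray())
tfidf_df = pd.DataFrame(results, columns=vec.get_feature_names_out(),
                        index=range(1, len(corpus) + 1))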
Example 6
from typing import Union

import pandas as pd
from sklearn.preprocessing import normalize as sklearn_normalize


def normalize(input_matrix: Union[pd.DataFrame, pd.Series], norm="l2") -> Union[pd.DataFrame, pd.Series]:
    """
    Normalize every cell in a Pandas Series or every row in a DataFrame.

    Input can be VectorSeries or DataFrames. For sparse DataFrames,
    the sparseness is kept.

    Parameters
    ----------
    input_matrix: Pandas Series (VectorSeries) or DataFrame

    norm: str, optional, default="l2"
        One of "l1", "l2", or "max". The norm that is used.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> col = ["a","b","c", "d"]
    >>> s = pd.DataFrame([[1, 2, 3, 4],[4, 2, 7, 5],[2, 2, 3, 5],[1, 2, 9, 8]], 
    ...                   columns=col).astype("Sparse")
    >>> hero.normalize(s, norm="max") # doctest: +SKIP      
              a         b     c         d
    0  0.250000  0.500000  0.75  1.000000
    1  0.571429  0.285714  1.00  0.714286
    2  0.400000  0.400000  0.60  1.000000
    3  0.111111  0.222222  1.00  0.888889


    See Also
    --------
    Representation Series link TODO add link to tutorial

    `Norm on Wikipedia
    <https://en.wikipedia.org/wiki/Norm_(mathematics)>`_

    """
    isDataFrame = isinstance(input_matrix, pd.DataFrame)

    if isDataFrame:
        input_matrix_coo = input_matrix.sparse.to_coo()
        input_matrix_for_vectorization = input_matrix_coo.astype("float64")
    else:
        input_matrix_for_vectorization = list(input_matrix)

    result = sklearn_normalize(
        input_matrix_for_vectorization, norm=norm
    )  # Can handle sparse input.

    if isDataFrame:
        return pd.DataFrame.sparse.from_spmatrix(
            result, input_matrix.index, input_matrix.columns
        )
    else:
        return pd.Series(list(result), index=input_matrix.index)
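The "Can handle sparse input" comment is what makes the sparse round-trip work: scikit-learn's normalize returns a scipy CSR matrix when given sparse input, which pd.DataFrame.sparse.from_spmatrix can consume directly. A quick standalone check (toy matrix assumed):

import scipy.sparse
from sklearn.preprocessing import normalize as sklearn_normalize

m = scipy.sparse.csr_matrix([[1.0, 3.0], [0.0, 2.0]])
out = sklearn_normalize(m, norm="l1")
print(scipy.sparse.issparse(out))  # True: sparseness is preserved
print(out.toarray())               # [[0.25 0.75], [0. 1.]]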
Example 7
from typing import Tuple

import numpy
import scipy.sparse
from sklearn.preprocessing import normalize as sklearn_normalize


def get_csr_matrix(
    row_indices: numpy.ndarray,
    col_indices: numpy.ndarray,
    shape: Tuple[int, int],
) -> scipy.sparse.csr_matrix:
    """Create a sparse matrix for the given non-zero locations."""
    # create sparse matrix of absolute counts
    matrix = scipy.sparse.coo_matrix(
        (numpy.ones(row_indices.shape, dtype=numpy.float32),
         (row_indices, col_indices)),
        shape=shape,
    ).tocsr()
    # normalize to relative counts
    return sklearn_normalize(matrix, norm="l1")
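After the l1 normalization, every non-empty row of the returned matrix sums to 1, while rows with no entries stay all-zero. A usage sketch with made-up indices, using get_csr_matrix as defined above:

import numpy

rows = numpy.array([0, 0, 1])
cols = numpy.array([0, 2, 1])
m = get_csr_matrix(rows, cols, shape=(3, 3))
print(m.toarray())
# [[0.5 0.  0.5]
#  [0.  1.  0. ]
#  [0.  0.  0. ]]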
Example 8
import pandas as pd
from sklearn.preprocessing import normalize as sklearn_normalize


def bag_of_words(sentences, na_fill=0, normalize=False):
    """Transforms a list of sentences into a bag of words matrix
    Args:
        sentences (list): A list of sentences
        na_fill (mixed): What should fill the NA's? (Optional - 0 default)
        normalize (bool): If the bag of words should be normalized (Optional - False default)
    Returns:
        bag_of_words_df (DataFrame): Dataframe with word counts by sentence.
    """
    # Get the count of words in each sentence
    bag = dict()
    n = 0
    for sentence in sentences:
        n += 1
        sentence = sentence.lower()
        for word in sentence.split():
            # Use a tuple as the key holding the word and the sentence index number
            key = (word, n)
            # Count the word
            bag[key] = bag.get(key, 0) + 1
    # Convert the count into a long data frame
    bag_of_words_data = list()
    for k in bag.keys():
        row = {"word": k[0], "index": k[1], "count": bag[k]}
        bag_of_words_data.append(row)
    bag_of_words_df = pd.DataFrame(bag_of_words_data)
    # Convert from long to wide dataframe
    bag_of_words_df = bag_of_words_df.pivot_table(
        index="index", columns="word", values="count", fill_value=na_fill
    )
    if normalize:
        normalized_bag_of_words = sklearn_normalize(bag_of_words_df)
        bag_of_words_df = pd.DataFrame(
            normalized_bag_of_words,
            columns=bag_of_words_df.columns,
            index=bag_of_words_df.index,
        )

    return bag_of_words_df
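A usage sketch with a made-up pair of sentences (output shown approximately):

sentences = ["the cat sat", "the cat sat on the mat"]
print(bag_of_words(sentences))
# word   cat  mat  on  sat  the
# index
# 1        1    0   0    1    1
# 2        1    1   1    1    2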
Example 9
from sklearn.preprocessing import normalize as sklearn_normalize


def normalize(x, axis=1, norm='l1'):
    return sklearn_normalize(x, norm=norm, axis=axis)
Example 10
 def _normalize(self, v):
     """Normalize vector v in-place with norm or return v if norm=None."""
     if self._norm is None:
         return v
     else:
         return sklearn_normalize(v, norm=self._norm, copy=False)
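With copy=False and a compatible input (e.g. a C-contiguous float array), scikit-learn's normalize divides in place instead of allocating a new array, which is the point of passing it here. A standalone check on a toy vector (assumed data, not from the original class):

import numpy as np
from sklearn.preprocessing import normalize as sklearn_normalize

v = np.array([[3.0, 4.0]])
out = sklearn_normalize(v, norm='l2', copy=False)
print(np.shares_memory(out, v))  # True: normalized in place
print(v)                         # [[0.6 0.8]]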
Example 11
import pandas as pd
from scipy.sparse import coo_matrix
from sklearn.preprocessing import normalize as sklearn_normalize


def normalize(s: pd.Series, norm="l2") -> pd.Series:
    """
    Normalize every cell in a Pandas Series.

    Input has to be a Representation Series.

    Parameters
    ----------
    s: Pandas Series

    norm: str, optional, default="l2"
        One of "l1", "l2", or "max". The norm that is used.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> idx = pd.MultiIndex.from_tuples(
    ...             [(0, "a"), (0, "b"), (1, "c"), (1, "d")],
    ...              names=("document", "word"))
    >>> s = pd.Series([1, 2, 3, 4], index=idx)
    >>> hero.normalize(s, norm="max")
    document  word
    0         a       0.50
              b       1.00
    1         c       0.75
              d       1.00
    dtype: Sparse[float64, nan]


    See Also
    --------
    Representation Series link TODO add link to tutorial

    `Norm on Wikipedia <https://en.wikipedia.org/wiki/Norm_(mathematics)>`_

    """

    is_valid_representation = (isinstance(s.index, pd.MultiIndex)
                               and s.index.nlevels == 2)

    if not is_valid_representation:
        raise TypeError(
            "The input Pandas Series should be a Representation Pandas Series"
            " and should have a MultiIndex. The given Pandas Series does not"
            " appear to have a MultiIndex."
        )
    # TODO after merging representation: use _check_is_valid_representation instead

    if not pd.api.types.is_sparse(s):
        s = s.astype("Sparse")
    s_for_vectorization = s.sparse.to_coo()[0]

    result = sklearn_normalize(s_for_vectorization,
                               norm=norm)  # Can handle sparse input.

    result_coo = coo_matrix(result)
    s_result = pd.Series.sparse.from_coo(result_coo)
    s_result.index = s.index

    return s_result
Example 12
import numpy as np
from sklearn.preprocessing import normalize as sklearn_normalize


def standardize_input(input_matrix):
    # np.float/np.double were aliases of float/np.float64 (np.float was removed
    # in NumPy 1.24), so testing the two concrete dtypes keeps the same behavior
    if input_matrix.dtype in (np.float32, np.float64):
        return sklearn_normalize(input_matrix, norm="l1")
    else:
        return input_matrix
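A usage sketch (toy matrices assumed): integer input passes through untouched, float input gets l1-normalized row by row:

ints = np.array([[1, 3], [2, 2]])
floats = np.array([[1.0, 3.0], [2.0, 2.0]])
print(standardize_input(ints))    # returned unchanged: integer dtype
print(standardize_input(floats))  # [[0.25 0.75], [0.5 0.5]]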