def marginal_score(
    entity_relation_batch: torch.LongTensor,
    per_entity: Optional[scipy.sparse.csr_matrix],
    per_relation: Optional[scipy.sparse.csr_matrix],
    num_entities: int,
) -> torch.FloatTensor:
    """Compute dense entity scores from sparse marginal score matrices.

    :param entity_relation_batch:
        A two-column batch; the first column is used to index ``per_entity``
        and the second to index ``per_relation``.
    :param per_entity:
        Sparse score rows addressed by the first batch column, or ``None``.
    :param per_relation:
        Sparse score rows addressed by the second batch column, or ``None``.
    :param num_entities:
        Number of columns of the uniform fallback result.

    :return:
        One dense row of scores per batch element. Without any marginal a
        uniform ``1 / num_entities`` is returned; with a single marginal its
        selected rows are returned as they are; with both, the element-wise
        product is L1-normalized per row.
    """
    num_rows = entity_relation_batch.shape[0]

    # Nothing to condition on: every entity is equally likely.
    if per_entity is None and per_relation is None:
        return torch.full(size=(num_rows, num_entities), fill_value=1 / num_entities)

    entity_ids, relation_ids = entity_relation_batch.cpu().numpy().T

    if per_entity is None:
        sparse_scores = per_relation[relation_ids]
    elif per_relation is None:
        sparse_scores = per_entity[entity_ids]
    else:
        from_entity = per_entity[entity_ids]
        from_relation = per_relation[relation_ids]
        joint = from_entity.multiply(from_relation)
        sparse_scores = sklearn_normalize(joint, norm="l1", axis=1)

    # The result is densified only because a torch tensor has to be returned;
    # staying sparse would save a lot of memory on large datasets.
    return torch.from_numpy(sparse_scores.todense())
def test_normalize(self, matrix, norm, axis):
    """Check that ``normalize`` agrees with the ``sklearn_normalize`` reference.

    ``norm`` is given as a number: ``1`` selects ``'l1'``, ``2`` selects
    ``'l2'`` and any other value not greater than 2 selects ``'max'``.

    Raises:
        ValueError: if ``norm`` is greater than 2.
        AssertionError: if the two implementations disagree.
    """
    if norm > 2:
        raise ValueError
    str_norm = 'l1' if norm == 1 else 'l2' if norm == 2 else 'max'
    # np.allclose only returns a bool, so without the assert a mismatch
    # would go unnoticed and the test could never fail.
    assert np.allclose(
        normalize(matrix, norm, axis).numpy(),
        sklearn_normalize(matrix.numpy(), norm=str_norm, axis=axis))
def execute(cls, ctx, op):
    """Run the normalize operand and store its result(s) in ``ctx``.

    The single input chunk is read from ``ctx``. The normalized data is
    written under the key of the first output and, when ``op.return_norm``
    is set, the norms are written under the key of the second output.

    When no GPU is involved, ``op.use_sklearn`` is set and
    ``sklearn_normalize`` is available, the work is delegated to it. If it
    raises ``NotImplementedError``, or the conditions are not met, a manual
    implementation based on the array module ``xp`` is used instead.
    """
    (x, ), device_id, xp = as_same_device(
        [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True)
    axis = op.axis
    return_norm = op.return_norm
    norm = op.norm
    outs = op.outputs

    with device(device_id):
        if device_id < 0 and op.use_sklearn and sklearn_normalize is not None:
            # no GPU
            try:
                if xp is sparse:
                    # unwrap the raw sparse matrix; converted to CSC when
                    # normalizing along axis 0
                    if axis == 0:
                        xm = x.raw.tocsc()
                    else:
                        xm = x.raw
                else:
                    xm = x
                ret = sklearn_normalize(xm, norm=norm, axis=axis,
                                        return_norm=return_norm)
                normed = None
                if return_norm:
                    # with return_norm the call yields a (data, norms) pair
                    ret, normed = ret
                if issparse(ret):
                    # wrap sparse results again before storing them
                    ret = sparse.SparseNDArray(ret)
                ctx[outs[0].key] = ret
                if normed is not None:
                    ctx[outs[1].key] = normed
                return
            except NotImplementedError:
                # not supported for this input: continue with the manual path
                pass

        # fall back
        # Always normalize along rows; for axis 0 the data is transposed
        # first and transposed back afterwards.
        if axis == 0:
            x = x.T
        if norm == 'l1':
            norms = xp.abs(x).sum(axis=1)
        elif norm == 'l2':
            norms = xp.sqrt((x**2).sum(axis=1))
        else:
            # NOTE(review): this takes the maximum of the raw values, not of
            # their absolute values - confirm this matches the sklearn path
            # for inputs with negative entries.
            norms = xp.max(x, axis=1)
            if issparse(norms):
                norms = norms.toarray()
        # avoid division by zero for all-zero rows
        norms[norms == 0.0] = 1.0
        x = x / norms[:, np.newaxis]
        if axis == 0:
            x = x.T

        ctx[outs[0].key] = x
        if return_norm:
            ctx[outs[1].key] = norms
def _normalize_columns(x, columns, norm):
    """Normalize the selected columns of ``x`` in place and return ``x``."""
    # Without any selected column there is nothing to normalize, and the
    # normalizer rejects input that has zero features.
    if columns:
        x[:, columns] = sklearn_normalize(x[:, columns], norm)
    return x


def normalize(dm, norm="l2"):
    """Normalize the numerical feature columns of every split of ``dm``.

    Columns whose entry in ``dm.feature_types`` is ``"Float"`` or
    ``"Discrete"`` are normalized in the train split and, when present, in
    the validation and test splits. The updated matrices are stored on
    ``dm.train_X``, ``dm.val_X`` and ``dm.test_X``.

    Args:
        dm: data manager providing ``feature_types``, ``get_train()``,
            ``get_val()`` and ``get_test()``; each getter returns a pair
            whose first item is the feature matrix.
        norm: norm passed on to ``sklearn_normalize``.

    Returns:
        The same ``dm`` object. If no column is numerical the matrices are
        stored unchanged.
    """
    numerical_index = [
        i for i, feature_type in enumerate(dm.feature_types)
        if feature_type in ("Float", "Discrete")
    ]
    (train_x, _), (valid_x, _), (test_x, _) = dm.get_train(), dm.get_val(), dm.get_test()

    dm.train_X = _normalize_columns(train_x, numerical_index, norm)
    if valid_x is not None:
        dm.val_X = _normalize_columns(valid_x, numerical_index, norm)
    if test_x is not None:
        dm.test_X = _normalize_columns(test_x, numerical_index, norm)
    return dm
def tf_idf(corpus, normalize_results=True):
    """Compute the TF-IDF on the corpus.

    Args:
        corpus (list): a list of text strings
        normalize_results (bool): Should the TF-IDF results be normalized
            with ``sklearn_normalize`` (Optional - True default)

    Returns:
        tfidf_df (DataFrame): a data frame with the TF-IDF values, one row
            per document (index starting at 1) and one column per term
    """
    tfidf = sklearn_TfidfVectorizer()
    results = tfidf.fit_transform(corpus).toarray()

    # Normalize only on request; the frame below must use ``results`` as is,
    # otherwise ``normalize_results=False`` would have no effect.
    if normalize_results:
        results = sklearn_normalize(results)

    # Newer vectorizers expose get_feature_names_out(); fall back to the
    # legacy get_feature_names() when it is not available.
    if hasattr(tfidf, "get_feature_names_out"):
        feature_names = tfidf.get_feature_names_out()
    else:
        feature_names = tfidf.get_feature_names()

    tfidf_df = pd.DataFrame(
        results,
        columns=feature_names,
        index=range(1, len(corpus) + 1),
    )
    return tfidf_df
def normalize(
    input_matrix: Union[pd.DataFrame, pd.Series], norm="l2"
) -> Union[pd.DataFrame, pd.Series]:
    """
    Normalize every row of a Pandas Series of vectors or of a DataFrame.

    Input can be VectorSeries or DataFrames. For sparse DataFrames, the
    sparseness is kept. The result has the same kind as the input: a
    DataFrame for a DataFrame, a Series for a Series.

    Parameters
    ----------
    input_matrix: Pandas Series (VectorSeries) or DataFrame
        A DataFrame is read through its ``.sparse`` accessor.

    norm: str, optional, default="l2"
        One of "l1", "l2", or "max". The norm that is used.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> col = ["a","b","c", "d"]
    >>> s = pd.DataFrame([[1, 2, 3, 4],[4, 2, 7, 5],[2, 2, 3, 5],[1, 2, 9, 8]],
    ...                  columns=col).astype("Sparse")
    >>> hero.normalize(s, norm="max") # doctest: +SKIP
              a         b     c         d
    0  0.250000  0.500000  0.75  1.000000
    1  0.571429  0.285714  1.00  0.714286
    2  0.400000  0.400000  0.60  1.000000
    3  0.111111  0.222222  1.00  0.888889

    See Also
    --------
    Representation Series link TODO add link to tutorial

    `Norm on Wikipedia <https://en.wikipedia.org/wiki/Norm_(mathematics)>`_

    """
    is_dataframe = isinstance(input_matrix, pd.DataFrame)

    if is_dataframe:
        # Stay sparse: hand the COO matrix (as float64) to the normalizer.
        input_matrix_for_vectorization = input_matrix.sparse.to_coo().astype("float64")
    else:
        input_matrix_for_vectorization = list(input_matrix)

    result = sklearn_normalize(
        input_matrix_for_vectorization, norm=norm
    )  # Can handle sparse input.

    if is_dataframe:
        return pd.DataFrame.sparse.from_spmatrix(
            result, input_matrix.index, input_matrix.columns
        )
    return pd.Series(list(result), index=input_matrix.index)
def get_csr_matrix(
    row_indices: numpy.ndarray,
    col_indices: numpy.ndarray,
    shape: Tuple[int, int],
) -> scipy.sparse.csr_matrix:
    """Build a row-wise L1-normalized sparse matrix from non-zero locations.

    :param row_indices:
        Row index of every non-zero location.
    :param col_indices:
        Column index of every non-zero location.
    :param shape:
        Shape of the resulting matrix.

    :return:
        The matrix of relative counts in CSR format.
    """
    # Every given location contributes a count of one.
    ones = numpy.ones(row_indices.shape, dtype=numpy.float32)
    absolute_counts = scipy.sparse.coo_matrix(
        (ones, (row_indices, col_indices)),
        shape=shape,
    )
    # Turn absolute counts into relative counts.
    return sklearn_normalize(absolute_counts.tocsr(), norm="l1")
def bag_of_words(sentences, na_fill=0, normalize=False):
    """Transforms a list of sentences into a bag of words matrix

    Args:
        sentences (list): A list of sentences
        na_fill (mixed): What should fill the NA's? (Optional - 0 default)
        normalize: If the bag of words should be normalized (Optional - False default)

    Returns:
        bag_of_words_df (DataFrame): Dataframe with word counts by sentence.
            Rows are the 1-based sentence numbers, columns are the lowercased
            words. An empty frame is returned when no sentence contains a word.
    """
    from collections import Counter

    # Count every (word, sentence number) pair; sentences are numbered from 1.
    counts = Counter()
    for number, sentence in enumerate(sentences, start=1):
        counts.update((word, number) for word in sentence.lower().split())

    # Without any word the long frame would have no columns to pivot on.
    if not counts:
        return pd.DataFrame(
            index=pd.Index([], name="index"),
            columns=pd.Index([], name="word"),
        )

    # Convert the counts into a long data frame
    bag_of_words_df = pd.DataFrame(
        [
            {"word": word, "index": number, "count": count}
            for (word, number), count in counts.items()
        ]
    )

    # Convert from long to wide dataframe
    bag_of_words_df = bag_of_words_df.pivot_table(
        index="index", columns="word", values="count", fill_value=na_fill
    )

    if normalize:
        normalized_bag_of_words = sklearn_normalize(bag_of_words_df)
        bag_of_words_df = pd.DataFrame(
            normalized_bag_of_words,
            columns=bag_of_words_df.columns,
            index=bag_of_words_df.index,
        )
    return bag_of_words_df
def normalize(x, axis=1, norm='l1'):
    """Normalize ``x`` with ``sklearn_normalize``, defaulting to L1 along axis 1."""
    normalized = sklearn_normalize(x, axis=axis, norm=norm)
    return normalized
def _normalize(self, v):
    """Apply the configured norm to ``v`` without copying.

    ``v`` is returned untouched when ``self._norm`` is ``None``; otherwise it
    is passed to ``sklearn_normalize`` with ``copy=False``.
    """
    if self._norm is not None:
        return sklearn_normalize(v, norm=self._norm, copy=False)
    return v
def normalize(s: pd.Series, norm="l2") -> pd.Series:
    """
    Normalize every cell in a Pandas Series.

    Input has to be a Representation Series, i.e. a Series with a two-level
    MultiIndex. A Series that is not sparse yet is converted to a sparse one
    first.

    Parameters
    ----------
    s: Pandas Series

    norm: str, default to "l2"
        One of "l1", "l2", or "max". The norm that is used.

    Raises
    ------
    TypeError
        If ``s`` does not have a MultiIndex with exactly two levels.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> idx = pd.MultiIndex.from_tuples(
    ...             [(0, "a"), (0, "b"), (1, "c"), (1, "d")],
    ...             names=("document", "word"))
    >>> s = pd.Series([1, 2, 3, 4], index=idx)
    >>> hero.normalize(s, norm="max")
    document  word
    0         a       0.50
              b       1.00
    1         c       0.75
              d       1.00
    dtype: Sparse[float64, nan]

    See Also
    --------
    Representation Series link TODO add link to tutorial

    `Norm on Wikipedia <https://en.wikipedia.org/wiki/Norm_(mathematics)>`_

    """
    is_valid_representation = (
        isinstance(s.index, pd.MultiIndex) and s.index.nlevels == 2
    )

    if not is_valid_representation:
        raise TypeError(
            "The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex. The given Pandas Series does not appears to have MultiIndex"
        )
    # TODO after merging representation: use _check_is_valid_representation instead

    # pd.api.types.is_sparse is deprecated; checking the dtype is the
    # equivalent, supported test.
    if not isinstance(s.dtype, pd.SparseDtype):
        s = s.astype("Sparse")
    s_coo_matrix = s.sparse.to_coo()[0]

    result = sklearn_normalize(s_coo_matrix, norm=norm)  # Can handle sparse input.

    result_coo = coo_matrix(result)
    s_result = pd.Series.sparse.from_coo(result_coo)
    s_result.index = s.index

    return s_result
def standardize_input(input_matrix):
    """L1-normalize ``input_matrix`` if it holds 32- or 64-bit floats.

    Input of any other dtype is returned unchanged (the same object).
    """
    # ``np.float`` was only an alias of the builtin ``float`` (i.e. float64)
    # and has been removed from NumPy; ``np.double`` is float64 as well, so
    # these two dtypes cover everything the original check accepted.
    if input_matrix.dtype in (np.float32, np.float64):
        return sklearn_normalize(input_matrix, norm="l1")
    return input_matrix