Exemple #1
0
 def test_type_coercion_at_construction(self):
     """A frame-level ``dtype`` must match per-column typed series (GH 15682)."""
     raw_columns = {'a': [1, 0, 0], 'b': [0, 1, 0], 'c': [0, 0, 1]}

     # Frame built from plain lists, coerced through the ``dtype`` argument.
     coerced = pd.SparseDataFrame(raw_columns, dtype='uint8',
                                  default_fill_value=0)

     # Reference frame whose columns are already uint8 sparse series.
     typed_columns = {}
     for label, values in raw_columns.items():
         typed_columns[label] = pd.SparseSeries(values, dtype='uint8')
     reference = pd.SparseDataFrame(typed_columns, default_fill_value=0)

     tm.assert_sp_frame_equal(coerced, reference)
def test_is_sparse(check_scipy):
    """Check ``com.is_sparse`` on pandas sparse containers and other inputs.

    ``check_scipy`` gates the final scipy assertion.
    """
    sparse_array = pd.SparseArray([1, 2, 3])
    assert com.is_sparse(sparse_array)
    sparse_series = pd.SparseSeries([1, 2, 3])
    assert com.is_sparse(sparse_series)

    dense_array = np.array([1, 2, 3])
    assert not com.is_sparse(dense_array)

    if not check_scipy:
        return

    # A scipy sparse matrix is expected to be rejected as well.
    import scipy.sparse
    scipy_matrix = scipy.sparse.bsr_matrix([1, 2, 3])
    assert not com.is_sparse(scipy_matrix)
Exemple #3
0
    def test_get(self):
        """``get`` by position and by label, with and without ``fill_value``."""
        by_position = pd.SparseSeries([1, np.nan, np.nan, 3, np.nan])
        self.assertEqual(by_position.get(0), 1)
        assert np.isnan(by_position.get(1))
        # A position past the end yields None.
        assert by_position.get(5) is None

        # Same labelled data, first without a fill_value and then with an
        # explicit fill_value=0; the lookups must agree in both cases.
        for fill_kwargs in ({}, {'fill_value': 0}):
            by_label = pd.SparseSeries([1, np.nan, 0, 3, 0],
                                       index=list('ABCDE'), **fill_kwargs)
            self.assertEqual(by_label.get('A'), 1)
            assert np.isnan(by_label.get('B'))
            self.assertEqual(by_label.get('C'), 0)
            assert by_label.get('XX') is None
Exemple #4
0
    def test_concat_sparse_dense(self, kind):
        """Concatenate a SparseSeries with dense Series in both orders.

        Covers a sparse input built without ``fill_value`` and one built
        with ``fill_value=0``.
        """
        # use first input's fill_value
        sparse_values = np.array([1, 2, np.nan, np.nan, 0, np.nan])
        dense_values = np.array([3, np.nan, 4, 0, 0])

        def sparse_backed(series, **array_kwargs):
            # XXX: changed from SparseSeries to Series[sparse]
            return pd.Series(
                pd.SparseArray(series, kind=kind, **array_kwargs),
                index=series.index,
                name=series.name,
            )

        # --- sparse input without an explicit fill_value ---
        sparse = pd.SparseSeries(sparse_values, name='x', kind=kind)
        dense = pd.Series(dense_values, name='y')

        actual = pd.concat([sparse, dense])
        wanted = pd.SparseSeries(
            pd.concat([pd.Series(sparse_values), dense]), kind=kind)
        tm.assert_sp_series_equal(actual, wanted)

        actual = pd.concat([dense, sparse, dense])
        wanted = sparse_backed(
            pd.concat([dense, pd.Series(sparse_values), dense]))
        tm.assert_series_equal(actual, wanted)

        # --- sparse input with fill_value=0 ---
        sparse = pd.SparseSeries(sparse_values, name='x', kind=kind,
                                 fill_value=0)
        dense = pd.Series(dense_values, name='y')

        actual = pd.concat([sparse, dense])
        wanted = sparse_backed(
            pd.concat([pd.Series(sparse_values), dense]), fill_value=0)
        tm.assert_series_equal(actual, wanted)

        actual = pd.concat([dense, sparse, dense])
        wanted = sparse_backed(
            pd.concat([dense, pd.Series(sparse_values), dense]),
            fill_value=0)
        tm.assert_series_equal(actual, wanted)
Exemple #5
0
def test_is_sparse():
    """Check ``com.is_sparse`` on pandas sparse containers and other inputs."""
    pandas_array = pd.SparseArray([1, 2, 3])
    assert com.is_sparse(pandas_array)
    pandas_series = pd.SparseSeries([1, 2, 3])
    assert com.is_sparse(pandas_series)

    plain_ndarray = np.array([1, 2, 3])
    assert not com.is_sparse(plain_ndarray)

    # The skip below can only trigger once every assertion above has
    # passed and scipy turns out not to be installed.
    scipy_sparse = pytest.importorskip("scipy.sparse")
    bsr = scipy_sparse.bsr_matrix([1, 2, 3])
    assert not com.is_sparse(bsr)
Exemple #6
0
    def test_get(self):
        """``get`` returns stored values, NaN for gaps and None for misses."""

        def check_labelled(series):
            # Shared expectations for the label-indexed series below.
            assert series.get("A") == 1
            assert np.isnan(series.get("B"))
            assert series.get("C") == 0
            assert series.get("XX") is None

        unlabelled = pd.SparseSeries([1, np.nan, np.nan, 3, np.nan])
        assert unlabelled.get(0) == 1
        assert np.isnan(unlabelled.get(1))
        assert unlabelled.get(5) is None

        check_labelled(
            pd.SparseSeries([1, np.nan, 0, 3, 0], index=list("ABCDE")))

        check_labelled(
            pd.SparseSeries([1, np.nan, 0, 3, 0],
                            index=list("ABCDE"),
                            fill_value=0))
Exemple #7
0
 def test_dataframe_dummies_subset(self, df, sparse):
     """Dummy-encode only column 'A', using the 'from_A' prefix."""
     dummy_cols = ['from_A_a', 'from_A_b']

     expected = DataFrame(
         {
             'B': ['b', 'b', 'c'],
             'C': [1, 2, 3],
             'from_A_a': [1, 0, 1],
             'from_A_b': [0, 1, 0],
         },
         dtype=np.uint8)
     # Column 'C' is taken from the input frame rather than the literal above.
     expected[['C']] = df[['C']]
     if sparse:
         expected[dummy_cols] = expected[dummy_cols].apply(
             lambda column: pd.SparseSeries(column))

     result = get_dummies(df, prefix=['from_A'], columns=['A'],
                          sparse=sparse)
     assert_frame_equal(result, expected)
Exemple #8
0
    def spoolTestData(self, testData=None):
        """Spool a test set and attach its one-hot encoded columns.

        Parameters
        ----------
        testData : optional
            Data to spool; it is indexed with the 'currency' and 'bank'
            keys. When None, ``self.X_test`` is used instead.

        Returns
        -------
        tuple
            ``(unsupervised, Y)`` as produced by ``self.spoolData``, with one
            ``pd.SparseSeries`` column (fill_value=0) added to
            ``unsupervised`` per feature name of ``self.oheC`` and
            ``self.oheB``.
        """
        X = testData if testData is not None else self.X_test

        # The first value returned by spoolData is not used here.
        _, unsupervised, Y = self.spoolData(X, self.daily_fingerprint_count)

        # Currency columns are attached first, then bank columns.
        for encoder, field in ((self.oheC, 'currency'), (self.oheB, 'bank')):
            encoded = encoder.transform(np.array(X[field]).reshape(-1, 1))
            for i, col in enumerate(encoder.get_feature_names()):
                unsupervised[col] = pd.SparseSeries(
                    encoded[:, i].toarray().ravel(), fill_value=0)

        return unsupervised, Y
Exemple #9
0
    def test_concat_different_fill(self):
        """Concatenate sparse series whose fill values differ.

        Each concat is expected to emit a PerformanceWarning; the expected
        series is built with the fill settings of the first input.
        """
        first_values = np.array([1, 2, np.nan, np.nan, 0, np.nan])
        second_values = np.array([3, np.nan, 4, 0, 0])

        for kind in ['integer', 'block']:
            default_fill = pd.SparseSeries(first_values, name='x', kind=kind)
            zero_fill = pd.SparseSeries(second_values, name='y', kind=kind,
                                        fill_value=0)

            # default-fill series first
            with tm.assert_produces_warning(PerformanceWarning):
                res = pd.concat([default_fill, zero_fill])

            dense_concat = pd.concat(
                [pd.Series(first_values), pd.Series(second_values)])
            tm.assert_sp_series_equal(
                res, pd.SparseSeries(dense_concat, kind=kind))

            # zero-fill series first
            with tm.assert_produces_warning(PerformanceWarning):
                res = pd.concat([zero_fill, default_fill])

            dense_concat = pd.concat(
                [pd.Series(second_values), pd.Series(first_values)])
            tm.assert_sp_series_equal(
                res, pd.SparseSeries(dense_concat, kind=kind, fill_value=0))
Exemple #10
0
    def test_concat_sparse_dense(self):
        """Concatenate sparse and dense series for every kind / fill setting."""
        # use first input's fill_value
        val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan])
        val2 = np.array([3, np.nan, 4, 0, 0])

        for kind in ['integer', 'block']:
            # First pass: no explicit fill_value; second pass: fill_value=0.
            for fill_kwargs in ({}, {'fill_value': 0}):
                sparse = pd.SparseSeries(val1, name='x', kind=kind,
                                         **fill_kwargs)
                dense = pd.Series(val2, name='y')

                # sparse first
                res = pd.concat([sparse, dense])
                dense_concat = pd.concat([pd.Series(val1), dense])
                exp = pd.SparseSeries(dense_concat, kind=kind, **fill_kwargs)
                tm.assert_sp_series_equal(res, exp)

                # sparse sandwiched between dense inputs
                res = pd.concat([dense, sparse, dense])
                dense_concat = pd.concat([dense, pd.Series(val1), dense])
                exp = pd.SparseSeries(dense_concat, kind=kind, **fill_kwargs)
                tm.assert_sp_series_equal(res, exp)
    def test_concat_different_fill(self):
        """Concatenate sparse series with different fill values, both orders.

        A PerformanceWarning is required for each concat; other warnings
        are tolerated (``raise_on_extra_warnings=False``).
        """
        val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan])
        val2 = np.array([3, np.nan, 4, 0, 0])

        for kind in ["integer", "block"]:
            sparse1 = pd.SparseSeries(val1, name="x", kind=kind)
            sparse2 = pd.SparseSeries(val2, name="y", kind=kind, fill_value=0)

            # (sparse inputs, matching raw values, kwargs of the expectation)
            scenarios = [
                ([sparse1, sparse2], [val1, val2], {}),
                ([sparse2, sparse1], [val2, val1], {"fill_value": 0}),
            ]
            for sparse_inputs, raw_inputs, expected_kwargs in scenarios:
                with tm.assert_produces_warning(
                        PerformanceWarning, raise_on_extra_warnings=False):
                    res = pd.concat(sparse_inputs)

                exp = pd.concat([pd.Series(raw) for raw in raw_inputs])
                exp = pd.SparseSeries(exp, kind=kind, **expected_kwargs)
                tm.assert_sp_series_equal(res, exp)
Exemple #12
0
 def transform(self, X, y=None, **transform_params):
     """Vectorise the link keywords of every value in ``X``.

     Each value is cleaned with ``DataCleaner().clean_email`` and reduced to
     keywords with ``Links().get_list_keywords``; the keyword lists then go
     through ``self.vectorizer`` and ``self.tfidf_transformer``. ``y`` and
     ``transform_params`` are accepted but not used.

     Returns a ``pd.SparseDataFrame`` with one row per value of ``X`` and one
     column per vectorizer feature name, prefixed with "Lnk.".
     """
     keyword_lists = [
         Links().get_list_keywords(DataCleaner().clean_email(value))
         for _, value in X.iteritems()
     ]
     term_matrix = self.vectorizer.transform(keyword_lists)
     weighted = self.tfidf_transformer.transform(term_matrix)

     sparse_rows = [pd.SparseSeries(weighted[i].toarray().ravel())
                    for i in np.arange(weighted.shape[0])]
     column_names = ["Lnk."+freq
                     for freq in self.vectorizer.get_feature_names()]
     return pd.SparseDataFrame(data=sparse_rows, columns=column_names)
Exemple #13
0
    def tests_indexing_with_sparse(self):
        """Index sparse arrays and series with sparse boolean masks (GH 13985)."""
        iloc_msg = ("iLocation based boolean indexing cannot use an "
                    "indexable as a mask")

        for kind in ['integer', 'block']:
            for fill in [True, False, np.nan]:
                arr = pd.SparseArray([1, 2, 3], kind=kind)
                mask = pd.SparseArray([True, False, True],
                                      fill_value=fill,
                                      dtype=bool)

                tm.assert_sp_array_equal(
                    pd.SparseArray([1, 3], kind=kind),
                    arr[mask],
                )

                s = pd.SparseSeries(arr,
                                    index=['a', 'b', 'c'],
                                    dtype=np.float64)
                exp = pd.SparseSeries(
                    [1, 3],
                    index=['a', 'c'],
                    dtype=SparseDtype(np.float64, s.fill_value),
                    kind=kind)

                # A SparseArray mask is accepted by [], .loc and .iloc.
                for accessor in (s, s.loc, s.iloc):
                    tm.assert_sp_series_equal(accessor[mask], exp)

                # A labelled SparseSeries mask is accepted by [] and .loc ...
                mask = pd.SparseSeries(mask, index=['a', 'b', 'c'])
                for accessor in (s, s.loc):
                    tm.assert_sp_series_equal(accessor[mask], exp)

                # ... while .iloc must raise for it.
                with tm.assert_raises_regex(ValueError, iloc_msg):
                    s.iloc[mask]
Exemple #14
0
 def test_dataframe_dummies_drop_first_with_categorical(
         self, df, sparse, dtype):
     """``drop_first=True`` on a frame that also has a categorical column.

     The ``dtype`` argument is accepted but not used in the body.
     """
     df['cat'] = pd.Categorical(['x', 'y', 'y'])
     result = get_dummies(df, drop_first=True, sparse=sparse)

     dummy_cols = ['A_b', 'B_c', 'cat_y']
     expected = DataFrame({'C': [1, 2, 3],
                           'A_b': [0, 1, 0],
                           'B_c': [0, 0, 1],
                           'cat_y': [0, 1, 1]})
     expected[dummy_cols] = expected[dummy_cols].astype(np.uint8)
     # Fix the column order: passthrough column first, then the dummies.
     expected = expected[['C'] + dummy_cols]
     if sparse:
         for name in dummy_cols:
             expected[name] = pd.SparseSeries(expected[name])
     assert_frame_equal(result, expected)
Exemple #15
0
def get_sparse_mx(df, fields, count):
    """Pivot ``df`` into a sparse frame of ``count`` values.

    ``fields[0]`` supplies the row labels and ``fields[1]`` the column
    labels (both sorted unique values); ``count`` names the value column.
    Returns a ``pd.SparseDataFrame`` with ``default_fill_value=0``.
    """
    row_field = fields[0]
    col_field = fields[1]

    row_labels = sorted(df[row_field].unique())
    col_labels = sorted(df[col_field].unique())

    # Categorical codes give each label its position in the sorted lists.
    row_dtype = pd.api.types.CategoricalDtype(categories=row_labels)
    row_codes = df[row_field].astype(row_dtype).cat.codes
    col_dtype = pd.api.types.CategoricalDtype(categories=col_labels)
    col_codes = df[col_field].astype(col_dtype).cat.codes

    values = df[count].tolist()

    matrix = sp.csr_matrix((values, (row_codes, col_codes)),
                           shape=(len(row_labels), len(col_labels)))

    sparse_rows = [
        pd.SparseSeries(matrix[i].toarray().ravel(), fill_value=0)
        for i in np.arange(matrix.shape[0])
    ]
    return pd.SparseDataFrame(sparse_rows, index=row_labels,
                              columns=col_labels, default_fill_value=0)
Exemple #16
0
    def to_pandas(self, sample_field=None, feature_field=None, sparse=None):
        '''Export the abundance data as a pandas dataframe.

        Rows are samples and columns are features. The row and column labels
        come from the metadata indexes unless a metadata column is named.

        Parameters
        ----------
        sample_field : str or None, optional
            sample_metadata column to use for the index.
            None (default) uses the sample_metadata index.
        feature_field : str or None, optional
            feature_metadata column to use for the column names.
            None (default) uses the feature_metadata index.
        sparse : bool or None, optional
            None (default) follows the current ``self.sparse`` setting.
            True or False is first assigned to ``self.sparse`` and then
            selects a sparse or a standard dataframe respectively.

        Returns
        -------
        pandas.DataFrame or pandas.SparseDataFrame
        '''
        sample_md = self.sample_metadata
        ind = sample_md.index if sample_field is None else sample_md[sample_field]
        feature_md = self.feature_metadata
        cols = feature_md.index if feature_field is None else feature_md[feature_field]

        # NOTE(review): this assignment outlives the call - confirm intended.
        if sparse is not None:
            self.sparse = sparse

        if not self.sparse:
            return pd.DataFrame(self.data, index=ind, columns=cols, copy=True)

        # Densify one row at a time and wrap each as a sparse series.
        sparse_rows = [
            pd.SparseSeries(self.data[i, :].toarray().ravel(), fill_value=0)
            for i in np.arange(self.data.shape[0])
        ]
        return pd.SparseDataFrame(sparse_rows, index=ind, columns=cols)
Exemple #17
0
 def test_dataframe_dummies_drop_first_with_categorical(
         self, df, sparse, dtype):
     """Check ``get_dummies(drop_first=True)`` with an added categorical column.

     ``dtype`` is part of the signature but is not read by the body.
     """
     df["cat"] = pd.Categorical(["x", "y", "y"])

     encoded_names = ["A_b", "B_c", "cat_y"]
     column_order = ["C", "A_b", "B_c", "cat_y"]

     expected = DataFrame({
         "C": [1, 2, 3],
         "A_b": [0, 1, 0],
         "B_c": [0, 0, 1],
         "cat_y": [0, 1, 1]
     })
     expected[encoded_names] = expected[encoded_names].astype(np.uint8)
     expected = expected[column_order]
     if sparse:
         # Only the encoded columns are expected to be sparse.
         for encoded in encoded_names:
             expected[encoded] = pd.SparseSeries(expected[encoded])

     result = get_dummies(df, drop_first=True, sparse=sparse)
     assert_frame_equal(result, expected)
Exemple #18
0
def sparse_df(connection, sample_ids, feature_ids):
    """
    Takes a connection and lists of sample and feature ids and returns a sparse
    dataframe.

    The dense frame returned by ``df`` is converted to a CSR matrix and the
    result is rebuilt from that matrix one row at a time.

    :param connection: forwarded unchanged to ``df``
    :param sample_ids: forwarded unchanged to ``df``
    :param feature_ids: forwarded unchanged to ``df``
    :return: ``pd.SparseDataFrame`` with one row per row of the dense frame
    """
    dense = df(connection, sample_ids, feature_ids)
    csr = dense.to_sparse(fill_value=0.0).to_coo().tocsr()
    # Rows come from the CSR matrix: ``dense[i]`` would select a DataFrame
    # column, which has no ``toarray``.
    rows = [
        pd.SparseSeries(csr[i].toarray().ravel())
        for i in np.arange(csr.shape[0])
    ]
    # NOTE(review): the index/column labels of ``dense`` are not carried
    # over to the result - confirm callers do not need them.
    return pd.SparseDataFrame(rows)
Exemple #19
0
    def test_sparse_int(self):
        """repr of an integer SparseSeries, full and truncated (GH 13110)."""
        s = pd.SparseSeries([0, 1, 0, 0, 1, 0], fill_value=False)

        # Suffix inserted into the array(...) parts of the expected repr.
        array_dtype = '' if use_32bit_repr else ', dtype=int32'

        full_repr = ("0    0\n1    1\n2    0\n3    0\n4    1\n"
                     "5    0\ndtype: int64\nBlockIndex\n"
                     "Block locations: array([1, 4]{0})\n"
                     "Block lengths: array([1, 1]{0})".format(array_dtype))
        self.assertEqual(repr(s), full_repr)

        truncated_repr = ("0    0\n    ..\n5    0\n"
                          "dtype: int64\nBlockIndex\n"
                          "Block locations: array([1, 4]{0})\n"
                          "Block lengths: array([1, 1]{0})".format(array_dtype))
        with option_context("display.max_rows", 3):
            self.assertEqual(repr(s), truncated_repr)
Exemple #20
0
def make_graph_and_calculate_centrality():
    # Build a graph, run random walks over its adjacency matrix and return
    # the dictionary produced by centrality_compare().
    #
    # NOTE(review): this is Python 2 code (print statements, raw_input).
    # It reads show_graph, number_of_walks, length_of_walks, window_size and
    # use_new_version, none of which are defined in this function -
    # presumably module-level settings; confirm.

    # make graph
    graph = graph_maker()

    # The graph is always drawn; the window is only shown when show_graph
    # is truthy.
    nx.draw_networkx(graph)
    if show_graph:
        plt.show()

    # get the adjacency matrix
    adj_matrix = nx.adjacency_matrix(graph)

    # Convert the adjacency matrix to a dense DataFrame by building one
    # SparseSeries per matrix row and densifying the resulting frame.
    sparse_df = pd.SparseDataFrame(
        [pd.SparseSeries(adj_matrix[x].toarray().ravel()) for x in np.arange(adj_matrix.shape[0])])
    df = sparse_df.to_dense()

    # Rename the rows to 'Node<label>'. Iterating the frame yields its
    # column labels, so row names are derived from the column labels.
    df.index = ['Node' + str(x) for x in df]
    adj_matrix = df

    # make the random walks
    random_walks, value_counts, separated_string = make_random_walks(adj_matrix, number_of_walks, length_of_walks)
    print "done with random walks"
    # Calculate the number of iteration needed
    # NOTE(review): num_of_iteration is only referenced by the
    # commented-out code below.
    num_of_iteration = calculate_num_of_iteration(number_of_walks, length_of_walks, len(df.index))

    # after_parse =  wevi_parser(random_walks, window_size)
    # print "num of iter is {}".format(num_of_iteration)
    # centrality_vector = wevi_automate(after_parse, num_of_iteration)
    # centrality_compare(graph, centrality_vector)

    if use_new_version:
        # Stringify the testing_gensim() result and drop its first and last
        # characters.
        input1 = str(testing_gensim(separated_string))[1:-1]
    else:
        # Manual path: print the wevi input and block until the user pastes
        # the wevi output on stdin.
        print "The input to wevi:"
        print wevi_parser(random_walks, window_size)
        print "Please paste here the results from wevi"
        input1 = raw_input()

    compare_dict = centrality_compare(graph, input1, value_counts)
    return compare_dict
Exemple #21
0
 def test_dataframe_dummies_subset(self, df, sparse):
     """``get_dummies`` restricted to column "A" with prefix "from_A"."""
     result = get_dummies(df, prefix=["from_A"], columns=["A"],
                          sparse=sparse)

     expected_data = {
         "B": ["b", "b", "c"],
         "C": [1, 2, 3],
         "from_A_a": [1, 0, 1],
         "from_A_b": [0, 1, 0],
     }
     expected = DataFrame(expected_data, dtype=np.uint8)
     # "C" is overwritten with the column of the input frame.
     expected[["C"]] = df[["C"]]

     if sparse:
         encoded = ["from_A_a", "from_A_b"]
         expected[encoded] = expected[encoded].apply(
             lambda col: pd.SparseSeries(col))

     assert_frame_equal(result, expected)
Exemple #22
0
def test_is_extension_type(check_scipy):
    """Check ``com.is_extension_type`` on plain, pandas and scipy inputs.

    ``check_scipy`` gates the final scipy assertion.
    """
    rejected = [
        [1, 2, 3],
        np.array([1, 2, 3]),
        pd.DatetimeIndex([1, 2, 3]),
    ]
    for candidate in rejected:
        assert not com.is_extension_type(candidate)

    cat = pd.Categorical([1, 2, 3])
    accepted = [
        cat,
        pd.Series(cat),
        pd.SparseArray([1, 2, 3]),
        pd.SparseSeries([1, 2, 3]),
        pd.DatetimeIndex([1, 2, 3], tz="US/Eastern"),
    ]
    for candidate in accepted:
        assert com.is_extension_type(candidate)

    # An empty Series with a tz-aware dtype is accepted too.
    tz_dtype = DatetimeTZDtype("ns", tz="US/Eastern")
    empty_tz_series = pd.Series([], dtype=tz_dtype)
    assert com.is_extension_type(empty_tz_series)

    if not check_scipy:
        return

    import scipy.sparse
    assert not com.is_extension_type(scipy.sparse.bsr_matrix([1, 2, 3]))
Exemple #23
0
    def test_sparse_bool(self):
        """repr of a boolean SparseSeries, full and truncated (GH 13110)."""
        s = pd.SparseSeries([True, False, False, True, False, False],
                            fill_value=False)

        # Suffix inserted into the array(...) parts of the expected repr.
        array_dtype = '' if use_32bit_repr else ', dtype=int32'

        full_repr = ("0     True\n1    False\n2    False\n"
                     "3     True\n4    False\n5    False\n"
                     "dtype: Sparse[bool, False]\nBlockIndex\n"
                     "Block locations: array([0, 3]{0})\n"
                     "Block lengths: array([1, 1]{0})".format(array_dtype))
        assert repr(s) == full_repr

        truncated_repr = ("0     True\n     ...  \n5    False\n"
                          "Length: 6, dtype: Sparse[bool, False]\nBlockIndex\n"
                          "Block locations: array([0, 3]{0})\n"
                          "Block lengths: array([1, 1]{0})".format(array_dtype))
        with option_context("display.max_rows", 3):
            assert repr(s) == truncated_repr
Exemple #24
0
    def test_sparse_int(self):
        """repr of an integer SparseSeries with a Sparse dtype (GH 13110)."""
        s = pd.SparseSeries([0, 1, 0, 0, 1, 0], fill_value=False)

        # Suffix inserted into the array(...) parts of the expected repr.
        array_dtype = "" if use_32bit_repr else ", dtype=int32"

        expected_full = ("0    0\n1    1\n2    0\n3    0\n4    1\n"
                         "5    0\ndtype: Sparse[int64, False]\nBlockIndex\n"
                         "Block locations: array([1, 4]{0})\n"
                         "Block lengths: array([1, 1]{0})".format(array_dtype))
        assert repr(s) == expected_full

        expected_short = ("0    0\n    ..\n5    0\n"
                          "dtype: Sparse[int64, False]\nBlockIndex\n"
                          "Block locations: array([1, 4]{0})\n"
                          "Block lengths: array([1, 1]{0})".format(array_dtype))
        # Truncate to three rows and switch the dimensions display off.
        display_options = ("display.max_rows", 3,
                           "display.show_dimensions", False)
        with option_context(*display_options):
            assert repr(s) == expected_short
Exemple #25
0
def test_is_extension_type():
    """Check ``com.is_extension_type`` on plain, pandas and scipy inputs."""
    # Plain containers and a tz-naive DatetimeIndex are rejected.
    for plain in ([1, 2, 3],
                  np.array([1, 2, 3]),
                  pd.DatetimeIndex([1, 2, 3])):
        assert not com.is_extension_type(plain)

    # Categorical, sparse and tz-aware containers are accepted.
    categorical = pd.Categorical([1, 2, 3])
    for extension in (categorical,
                      pd.Series(categorical),
                      pd.SparseArray([1, 2, 3]),
                      pd.SparseSeries([1, 2, 3]),
                      pd.DatetimeIndex([1, 2, 3], tz="US/Eastern")):
        assert com.is_extension_type(extension)

    tz_series = pd.Series([], dtype=DatetimeTZDtype("ns", tz="US/Eastern"))
    assert com.is_extension_type(tz_series)

    # The skip below can only trigger once every assertion above has
    # passed and scipy turns out not to be installed.
    scipy_sparse = pytest.importorskip("scipy.sparse")
    assert not com.is_extension_type(scipy_sparse.bsr_matrix([1, 2, 3]))
Exemple #26
0
    def sparsity(self):
        """Load the March raw data and build a practitioner-by-ISBN sparse frame.

        Reads 'data-files/raw_data_march.csv', drops a fixed set of columns
        by position, and places each 'Qty' value at the (Practitioner, ISBN)
        position of a CSR matrix. The density of the resulting frame is
        printed.

        Returns
        -------
        tuple
            ``(dfs, data)``: the ``pd.SparseDataFrame`` (rows indexed by
            practitioner, columns by ISBN, ``default_fill_value=0``) and the
            trimmed raw frame.
        """
        data = pd.read_csv('data-files/raw_data_march.csv')
        data.drop(data.columns[[0, 1, 2, 3, 4, 7, 12, 13, 14, 15, 16, 17, 18, 19]], axis=1, inplace=True)

        user = list(data['Practitioner'].unique())
        isbn = list(data['ISBN'].unique())

        d = data['Qty'].tolist()
        # ``astype('category', categories=...)`` is deprecated and was later
        # removed from pandas; an explicit CategoricalDtype yields the same
        # codes (position of each value in ``user`` / ``isbn``).
        row = data.Practitioner.astype(
            pd.api.types.CategoricalDtype(categories=user)).cat.codes
        col = data.ISBN.astype(
            pd.api.types.CategoricalDtype(categories=isbn)).cat.codes
        sparse_matrix = sps.csr_matrix((d, (row, col)), shape=(len(user), len(isbn)))

        dfs = pd.SparseDataFrame(
            [pd.SparseSeries(sparse_matrix[i].toarray().ravel(), fill_value=0)
             for i in np.arange(sparse_matrix.shape[0])],
            index=user, columns=isbn, default_fill_value=0)

        # Report how densely the frame is populated.
        print (dfs.density)
        return dfs, data
Exemple #27
0
    def test_constructor_preserve_attr(self):
        """dtype and fill_value survive every construction route (GH 13866)."""

        def check_preserved(obj):
            assert obj.dtype == np.int64
            assert obj.fill_value == 0

        arr = pd.SparseArray([1, 0, 3, 0], dtype=np.int64, fill_value=0)
        check_preserved(arr)

        # frame from a dict holding the array
        check_preserved(pd.SparseDataFrame({'x': arr})['x'])

        s = pd.SparseSeries(arr, name='x')
        check_preserved(s)

        # frame directly from the named series
        check_preserved(pd.SparseDataFrame(s)['x'])

        # frame from a dict holding the series
        check_preserved(pd.SparseDataFrame({'x': s})['x'])
Exemple #28
0
def tfidf(site_dict):
    """Return the 10 highest TF-IDF terms for each site.

    ``site_dict`` maps a site to its text. The result is a long-format frame
    with the columns 'site', 'term' and 'tfidf'.
    """
    vectorizer = TfidfVectorizer()
    weights = vectorizer.fit_transform(site_dict.values())

    # One row per site; labels are assigned after construction.
    table = pd.DataFrame([
        pd.SparseSeries(weights[i].toarray().ravel())
        for i in np.arange(weights.shape[0])
    ])
    table.columns = vectorizer.get_feature_names()
    table.index = site_dict.keys()

    # Reshape to one (site, term, tfidf) record per row.
    new_names = {'level_0': 'site', 'level_1': 'term', 0: 'tfidf'}
    long_form = table.stack().reset_index().rename(columns=new_names)

    ranked = long_form.sort_values(by=['site', 'tfidf'], ascending=False)
    return ranked.groupby('site').head(10)
Exemple #29
0
    def test_dataframe_dummies_prefix_dict(self, sparse):
        """``get_dummies`` with the prefixes given as a column-to-prefix dict."""
        df = DataFrame({'C': [1, 2, 3],
                        'A': ['a', 'b', 'a'],
                        'B': ['b', 'b', 'c']})
        result = get_dummies(df, prefix={'A': 'from_A', 'B': 'from_B'},
                             sparse=sparse)

        dummy_cols = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c']
        expected = DataFrame({'C': [1, 2, 3],
                              'from_A_a': [1, 0, 1],
                              'from_A_b': [0, 1, 0],
                              'from_B_b': [1, 1, 0],
                              'from_B_c': [0, 0, 1]})
        expected[dummy_cols] = expected[dummy_cols].astype(np.uint8)
        if sparse:
            sparsified = expected[dummy_cols].apply(
                lambda column: pd.SparseSeries(column)
            )
            expected[dummy_cols] = sparsified

        assert_frame_equal(result, expected)
Exemple #30
0
    def test_constructor_preserve_attr(self):
        """Sparse constructors keep the source dtype and fill_value (GH 13866)."""

        def assert_attrs_kept(sparse_obj):
            self.assertEqual(sparse_obj.dtype, np.int64)
            self.assertEqual(sparse_obj.fill_value, 0)

        arr = pd.SparseArray([1, 0, 3, 0], dtype=np.int64, fill_value=0)
        assert_attrs_kept(arr)

        from_array_dict = pd.SparseDataFrame({'x': arr})
        assert_attrs_kept(from_array_dict['x'])

        s = pd.SparseSeries(arr, name='x')
        assert_attrs_kept(s)

        from_series = pd.SparseDataFrame(s)
        assert_attrs_kept(from_series['x'])

        from_series_dict = pd.SparseDataFrame({'x': s})
        assert_attrs_kept(from_series_dict['x'])