def test_type_coercion_at_construction(self):
    # GH 15682
    result = pd.SparseDataFrame(
        {'a': [1, 0, 0], 'b': [0, 1, 0], 'c': [0, 0, 1]},
        dtype='uint8', default_fill_value=0)
    expected = pd.SparseDataFrame(
        {'a': pd.SparseSeries([1, 0, 0], dtype='uint8'),
         'b': pd.SparseSeries([0, 1, 0], dtype='uint8'),
         'c': pd.SparseSeries([0, 0, 1], dtype='uint8')},
        default_fill_value=0)
    tm.assert_sp_frame_equal(result, expected)
def test_is_sparse(check_scipy):
    assert com.is_sparse(pd.SparseArray([1, 2, 3]))
    assert com.is_sparse(pd.SparseSeries([1, 2, 3]))

    assert not com.is_sparse(np.array([1, 2, 3]))

    if check_scipy:
        import scipy.sparse
        assert not com.is_sparse(scipy.sparse.bsr_matrix([1, 2, 3]))
def test_get(self):
    s = pd.SparseSeries([1, np.nan, np.nan, 3, np.nan])
    self.assertEqual(s.get(0), 1)
    assert np.isnan(s.get(1))
    assert s.get(5) is None

    s = pd.SparseSeries([1, np.nan, 0, 3, 0], index=list('ABCDE'))
    self.assertEqual(s.get('A'), 1)
    assert np.isnan(s.get('B'))
    self.assertEqual(s.get('C'), 0)
    assert s.get('XX') is None

    s = pd.SparseSeries([1, np.nan, 0, 3, 0], index=list('ABCDE'),
                        fill_value=0)
    self.assertEqual(s.get('A'), 1)
    assert np.isnan(s.get('B'))
    self.assertEqual(s.get('C'), 0)
    assert s.get('XX') is None
def test_concat_sparse_dense(self, kind):
    # use first input's fill_value
    val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan])
    val2 = np.array([3, np.nan, 4, 0, 0])

    sparse = pd.SparseSeries(val1, name='x', kind=kind)
    dense = pd.Series(val2, name='y')

    res = pd.concat([sparse, dense])
    exp = pd.SparseSeries(pd.concat([pd.Series(val1), dense]), kind=kind)
    tm.assert_sp_series_equal(res, exp)

    res = pd.concat([dense, sparse, dense])
    exp = pd.concat([dense, pd.Series(val1), dense])
    # XXX: changed from SparseSeries to Series[sparse]
    exp = pd.Series(
        pd.SparseArray(exp, kind=kind),
        index=exp.index,
        name=exp.name,
    )
    tm.assert_series_equal(res, exp)

    sparse = pd.SparseSeries(val1, name='x', kind=kind, fill_value=0)
    dense = pd.Series(val2, name='y')

    res = pd.concat([sparse, dense])
    # XXX: changed from SparseSeries to Series[sparse]
    exp = pd.concat([pd.Series(val1), dense])
    exp = pd.Series(
        pd.SparseArray(exp, kind=kind, fill_value=0),
        index=exp.index,
        name=exp.name,
    )
    tm.assert_series_equal(res, exp)

    res = pd.concat([dense, sparse, dense])
    exp = pd.concat([dense, pd.Series(val1), dense])
    # XXX: changed from SparseSeries to Series[sparse]
    exp = pd.Series(
        pd.SparseArray(exp, kind=kind, fill_value=0),
        index=exp.index,
        name=exp.name,
    )
    tm.assert_series_equal(res, exp)
def test_is_sparse():
    assert com.is_sparse(pd.SparseArray([1, 2, 3]))
    assert com.is_sparse(pd.SparseSeries([1, 2, 3]))

    assert not com.is_sparse(np.array([1, 2, 3]))

    # This test will only skip if the previous assertions
    # pass AND scipy is not installed.
    sparse = pytest.importorskip("scipy.sparse")
    assert not com.is_sparse(sparse.bsr_matrix([1, 2, 3]))
def test_get(self):
    s = pd.SparseSeries([1, np.nan, np.nan, 3, np.nan])
    assert s.get(0) == 1
    assert np.isnan(s.get(1))
    assert s.get(5) is None

    s = pd.SparseSeries([1, np.nan, 0, 3, 0], index=list("ABCDE"))
    assert s.get("A") == 1
    assert np.isnan(s.get("B"))
    assert s.get("C") == 0
    assert s.get("XX") is None

    s = pd.SparseSeries([1, np.nan, 0, 3, 0], index=list("ABCDE"),
                        fill_value=0)
    assert s.get("A") == 1
    assert np.isnan(s.get("B"))
    assert s.get("C") == 0
    assert s.get("XX") is None
def test_dataframe_dummies_subset(self, df, sparse):
    result = get_dummies(df, prefix=['from_A'], columns=['A'],
                         sparse=sparse)
    expected = DataFrame({'B': ['b', 'b', 'c'],
                          'C': [1, 2, 3],
                          'from_A_a': [1, 0, 1],
                          'from_A_b': [0, 1, 0]}, dtype=np.uint8)
    expected[['C']] = df[['C']]
    if sparse:
        cols = ['from_A_a', 'from_A_b']
        expected[cols] = expected[cols].apply(
            lambda x: pd.SparseSeries(x))
    assert_frame_equal(result, expected)
def spoolTestData(self, testData=None):
    if testData is not None:
        X = testData
    else:
        X = self.X_test
    daily_fingerprint_count, unsupervised, Y = self.spoolData(
        X, self.daily_fingerprint_count)

    # one-hot encode the currency column into sparse indicator columns
    currencies = self.oheC.transform(
        np.array(X['currency']).reshape(-1, 1))
    for i, col in enumerate(self.oheC.get_feature_names()):
        unsupervised[col] = pd.SparseSeries(
            currencies[:, i].toarray().ravel(), fill_value=0)

    # same for the bank column
    bankEncoded = self.oheB.transform(np.array(X['bank']).reshape(-1, 1))
    for i, col in enumerate(self.oheB.get_feature_names()):
        unsupervised[col] = pd.SparseSeries(
            bankEncoded[:, i].toarray().ravel(), fill_value=0)
    return unsupervised, Y
def test_concat_different_fill(self):
    val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan])
    val2 = np.array([3, np.nan, 4, 0, 0])

    for kind in ['integer', 'block']:
        sparse1 = pd.SparseSeries(val1, name='x', kind=kind)
        sparse2 = pd.SparseSeries(val2, name='y', kind=kind, fill_value=0)

        with tm.assert_produces_warning(PerformanceWarning):
            res = pd.concat([sparse1, sparse2])
        exp = pd.concat([pd.Series(val1), pd.Series(val2)])
        exp = pd.SparseSeries(exp, kind=kind)
        tm.assert_sp_series_equal(res, exp)

        with tm.assert_produces_warning(PerformanceWarning):
            res = pd.concat([sparse2, sparse1])
        exp = pd.concat([pd.Series(val2), pd.Series(val1)])
        exp = pd.SparseSeries(exp, kind=kind, fill_value=0)
        tm.assert_sp_series_equal(res, exp)
def test_concat_sparse_dense(self):
    # use first input's fill_value
    val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan])
    val2 = np.array([3, np.nan, 4, 0, 0])

    for kind in ['integer', 'block']:
        sparse = pd.SparseSeries(val1, name='x', kind=kind)
        dense = pd.Series(val2, name='y')

        res = pd.concat([sparse, dense])
        exp = pd.concat([pd.Series(val1), dense])
        exp = pd.SparseSeries(exp, kind=kind)
        tm.assert_sp_series_equal(res, exp)

        res = pd.concat([dense, sparse, dense])
        exp = pd.concat([dense, pd.Series(val1), dense])
        exp = pd.SparseSeries(exp, kind=kind)
        tm.assert_sp_series_equal(res, exp)

        sparse = pd.SparseSeries(val1, name='x', kind=kind, fill_value=0)
        dense = pd.Series(val2, name='y')

        res = pd.concat([sparse, dense])
        exp = pd.concat([pd.Series(val1), dense])
        exp = pd.SparseSeries(exp, kind=kind, fill_value=0)
        tm.assert_sp_series_equal(res, exp)

        res = pd.concat([dense, sparse, dense])
        exp = pd.concat([dense, pd.Series(val1), dense])
        exp = pd.SparseSeries(exp, kind=kind, fill_value=0)
        tm.assert_sp_series_equal(res, exp)
def test_concat_different_fill(self):
    val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan])
    val2 = np.array([3, np.nan, 4, 0, 0])

    for kind in ["integer", "block"]:
        sparse1 = pd.SparseSeries(val1, name="x", kind=kind)
        sparse2 = pd.SparseSeries(val2, name="y", kind=kind, fill_value=0)

        with tm.assert_produces_warning(PerformanceWarning,
                                        raise_on_extra_warnings=False):
            res = pd.concat([sparse1, sparse2])
        exp = pd.concat([pd.Series(val1), pd.Series(val2)])
        exp = pd.SparseSeries(exp, kind=kind)
        tm.assert_sp_series_equal(res, exp)

        with tm.assert_produces_warning(PerformanceWarning,
                                        raise_on_extra_warnings=False):
            res = pd.concat([sparse2, sparse1])
        exp = pd.concat([pd.Series(val2), pd.Series(val1)])
        exp = pd.SparseSeries(exp, kind=kind, fill_value=0)
        tm.assert_sp_series_equal(res, exp)
def transform(self, X, y=None, **transform_params):
    # clean each email and extract its link keywords
    links = []
    for index, row in X.iteritems():
        text = DataCleaner().clean_email(row)
        links.append(Links().get_list_keywords(text))

    # vectorize the keywords and apply the fitted tf-idf weighting
    dtm = self.vectorizer.transform(links)
    data = self.tfidf_transformer.transform(dtm)
    df = pd.SparseDataFrame(
        data=[pd.SparseSeries(data[i].toarray().ravel())
              for i in np.arange(data.shape[0])],
        columns=["Lnk." + freq
                 for freq in self.vectorizer.get_feature_names()])
    # df = DataFrame(data=data.todense(), columns=list(X.columns.values))
    return df
def tests_indexing_with_sparse(self):
    # GH 13985
    for kind in ['integer', 'block']:
        for fill in [True, False, np.nan]:
            arr = pd.SparseArray([1, 2, 3], kind=kind)
            indexer = pd.SparseArray([True, False, True],
                                     fill_value=fill, dtype=bool)

            tm.assert_sp_array_equal(pd.SparseArray([1, 3], kind=kind),
                                     arr[indexer])

            s = pd.SparseSeries(arr, index=['a', 'b', 'c'],
                                dtype=np.float64)
            exp = pd.SparseSeries(
                [1, 3], index=['a', 'c'],
                dtype=SparseDtype(np.float64, s.fill_value),
                kind=kind)
            tm.assert_sp_series_equal(s[indexer], exp)
            tm.assert_sp_series_equal(s.loc[indexer], exp)
            tm.assert_sp_series_equal(s.iloc[indexer], exp)

            indexer = pd.SparseSeries(indexer, index=['a', 'b', 'c'])
            tm.assert_sp_series_equal(s[indexer], exp)
            tm.assert_sp_series_equal(s.loc[indexer], exp)

            msg = ("iLocation based boolean indexing cannot use an "
                   "indexable as a mask")
            with tm.assert_raises_regex(ValueError, msg):
                s.iloc[indexer]
def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse,
                                                       dtype):
    df['cat'] = pd.Categorical(['x', 'y', 'y'])
    result = get_dummies(df, drop_first=True, sparse=sparse)
    expected = DataFrame({'C': [1, 2, 3],
                          'A_b': [0, 1, 0],
                          'B_c': [0, 0, 1],
                          'cat_y': [0, 1, 1]})
    cols = ['A_b', 'B_c', 'cat_y']
    expected[cols] = expected[cols].astype(np.uint8)
    expected = expected[['C', 'A_b', 'B_c', 'cat_y']]
    if sparse:
        for col in cols:
            expected[col] = pd.SparseSeries(expected[col])
    assert_frame_equal(result, expected)
def get_sparse_mx(df, fields, count):
    row_ = list(sorted(df[fields[0]].unique()))
    col_ = list(sorted(df[fields[1]].unique()))

    row = df[fields[0]].astype(
        pd.api.types.CategoricalDtype(categories=row_)).cat.codes
    col = df[fields[1]].astype(
        pd.api.types.CategoricalDtype(categories=col_)).cat.codes
    data = df[count].tolist()

    sparse_matrix = sp.csr_matrix((data, (row, col)),
                                  shape=(len(row_), len(col_)))
    df = pd.SparseDataFrame(
        [pd.SparseSeries(sparse_matrix[i].toarray().ravel(), fill_value=0)
         for i in np.arange(sparse_matrix.shape[0])],
        index=row_, columns=col_, default_fill_value=0)
    return df
def to_pandas(self, sample_field=None, feature_field=None, sparse=None):
    '''Get a pandas dataframe of the abundances.

    Samples are rows, features are columns. You can specify the metadata
    fields used for the index (default is the sample_metadata index) and
    for the column labels (default is the feature_metadata index).

    Parameters
    ----------
    sample_field : str or None, optional
        Name of the sample_metadata column to use for the index.
        None (default) uses the sample_metadata index.
    feature_field : str or None, optional
        Name of the feature_metadata column to use for the column names.
        None (default) uses the feature_metadata index.
    sparse : bool or None, optional
        None (default) to choose sparsity based on the underlying
        Experiment sparsity, True to force a sparse pandas.DataFrame,
        False to force a standard pandas.DataFrame.

    Returns
    -------
    pandas.DataFrame or pandas.SparseDataFrame
    '''
    if sample_field is None:
        ind = self.sample_metadata.index
    else:
        ind = self.sample_metadata[sample_field]
    if feature_field is None:
        cols = self.feature_metadata.index
    else:
        cols = self.feature_metadata[feature_field]

    if sparse is not None:
        self.sparse = sparse

    if self.sparse:
        # create list of sparse rows
        sr = [pd.SparseSeries(self.data[i, :].toarray().ravel(),
                              fill_value=0)
              for i in np.arange(self.data.shape[0])]
        df = pd.SparseDataFrame(sr, index=ind, columns=cols)
    else:
        df = pd.DataFrame(self.data, index=ind, columns=cols, copy=True)
    return df
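# --- Hedged usage sketch (not part of the class above) ---
# A minimal, self-contained example of the sparse branch in to_pandas:
# converting a scipy CSR matrix to a SparseDataFrame row by row. It
# assumes old pandas (< 1.0), where pd.SparseSeries/pd.SparseDataFrame
# still exist; the labels below are made up for illustration.
import numpy as np
import pandas as pd
import scipy.sparse

data = scipy.sparse.csr_matrix(np.array([[0, 1, 0], [2, 0, 0]]))
ind = ['s1', 's2']         # stand-in for the sample_metadata index
cols = ['f1', 'f2', 'f3']  # stand-in for the feature_metadata index
sr = [pd.SparseSeries(data[i, :].toarray().ravel(), fill_value=0)
      for i in np.arange(data.shape[0])]
df = pd.SparseDataFrame(sr, index=ind, columns=cols)
print(df.density)  # fraction of explicitly stored (non-fill) values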
def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse,
                                                       dtype):
    df["cat"] = pd.Categorical(["x", "y", "y"])
    result = get_dummies(df, drop_first=True, sparse=sparse)
    expected = DataFrame({
        "C": [1, 2, 3],
        "A_b": [0, 1, 0],
        "B_c": [0, 0, 1],
        "cat_y": [0, 1, 1]
    })
    cols = ["A_b", "B_c", "cat_y"]
    expected[cols] = expected[cols].astype(np.uint8)
    expected = expected[["C", "A_b", "B_c", "cat_y"]]
    if sparse:
        for col in cols:
            expected[col] = pd.SparseSeries(expected[col])
    assert_frame_equal(result, expected)
def sparse_df(connection, sample_ids, feature_ids):
    """
    Takes a connection and lists of sample and feature ids and returns a
    sparse dataframe.

    :param connection: database connection used to fetch the dense frame
    :param sample_ids: list of sample ids
    :param feature_ids: list of feature ids
    :return: pd.SparseDataFrame
    """
    dense = df(connection, sample_ids, feature_ids)
    sparse = dense.to_sparse(fill_value=0.0)
    csr = sparse.to_coo().tocsr()
    # rebuild row by row from the CSR matrix (the original iterated over
    # the dense frame here, which has no .toarray())
    sp = pd.SparseDataFrame([
        pd.SparseSeries(csr[i].toarray().ravel())
        for i in np.arange(csr.shape[0])
    ])
    return sp
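# --- Hedged usage sketch ---
# The dense -> sparse -> COO/CSR round trip used in sparse_df above,
# shown without the project-specific df(connection, ...) call. Assumes
# old pandas (< 1.0), where DataFrame.to_sparse and
# SparseDataFrame.to_coo are still available.
import numpy as np
import pandas as pd

dense = pd.DataFrame([[0.0, 1.5, 0.0], [0.0, 0.0, 2.0]])
sparse = dense.to_sparse(fill_value=0.0)
csr = sparse.to_coo().tocsr()  # scipy.sparse.csr_matrix
rebuilt = pd.SparseDataFrame([
    pd.SparseSeries(csr[i].toarray().ravel())
    for i in np.arange(csr.shape[0])
])
print(rebuilt.shape)  # (2, 3)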
def test_sparse_int(self):
    # GH 13110
    s = pd.SparseSeries([0, 1, 0, 0, 1, 0], fill_value=False)

    result = repr(s)
    dtype = '' if use_32bit_repr else ', dtype=int32'
    exp = ("0 0\n1 1\n2 0\n3 0\n4 1\n"
           "5 0\ndtype: int64\nBlockIndex\n"
           "Block locations: array([1, 4]{0})\n"
           "Block lengths: array([1, 1]{0})".format(dtype))
    self.assertEqual(result, exp)

    with option_context("display.max_rows", 3):
        result = repr(s)
        exp = ("0 0\n ..\n5 0\n"
               "dtype: int64\nBlockIndex\n"
               "Block locations: array([1, 4]{0})\n"
               "Block lengths: array([1, 1]{0})".format(dtype))
        self.assertEqual(result, exp)
def make_graph_and_calculate_centrality():
    # make graph
    graph = graph_maker()
    # draw the graph if you want to
    nx.draw_networkx(graph)
    if show_graph:
        plt.show()

    # get the adjacency matrix and turn it from sparse to regular
    adj_matrix = nx.adjacency_matrix(graph)
    sparse_df = pd.SparseDataFrame(
        [pd.SparseSeries(adj_matrix[x].toarray().ravel())
         for x in np.arange(adj_matrix.shape[0])])
    df = sparse_df.to_dense()
    # change the name of the rows
    df.index = ['Node' + str(x) for x in df]
    adj_matrix = df

    # make the random walks
    random_walks, value_counts, separated_string = make_random_walks(
        adj_matrix, number_of_walks, length_of_walks)
    print("done with random walks")

    # calculate the number of iterations needed
    num_of_iteration = calculate_num_of_iteration(
        number_of_walks, length_of_walks, len(df.index))
    # after_parse = wevi_parser(random_walks, window_size)
    # print("num of iter is {}".format(num_of_iteration))
    # centrality_vector = wevi_automate(after_parse, num_of_iteration)
    # centrality_compare(graph, centrality_vector)

    if use_new_version:
        input1 = str(testing_gensim(separated_string))[1:-1]
    else:
        print("The input to wevi:")
        print(wevi_parser(random_walks, window_size))
        print("Please paste here the results from wevi")
        input1 = input()
    compare_dict = centrality_compare(graph, input1, value_counts)
    return compare_dict
def test_dataframe_dummies_subset(self, df, sparse):
    result = get_dummies(df, prefix=["from_A"], columns=["A"],
                         sparse=sparse)
    expected = DataFrame(
        {
            "B": ["b", "b", "c"],
            "C": [1, 2, 3],
            "from_A_a": [1, 0, 1],
            "from_A_b": [0, 1, 0],
        },
        dtype=np.uint8,
    )
    expected[["C"]] = df[["C"]]
    if sparse:
        cols = ["from_A_a", "from_A_b"]
        expected[cols] = expected[cols].apply(lambda x: pd.SparseSeries(x))
    assert_frame_equal(result, expected)
def test_is_extension_type(check_scipy):
    assert not com.is_extension_type([1, 2, 3])
    assert not com.is_extension_type(np.array([1, 2, 3]))
    assert not com.is_extension_type(pd.DatetimeIndex([1, 2, 3]))

    cat = pd.Categorical([1, 2, 3])
    assert com.is_extension_type(cat)
    assert com.is_extension_type(pd.Series(cat))
    assert com.is_extension_type(pd.SparseArray([1, 2, 3]))
    assert com.is_extension_type(pd.SparseSeries([1, 2, 3]))
    assert com.is_extension_type(pd.DatetimeIndex([1, 2, 3],
                                                  tz="US/Eastern"))

    dtype = DatetimeTZDtype("ns", tz="US/Eastern")
    s = pd.Series([], dtype=dtype)
    assert com.is_extension_type(s)

    if check_scipy:
        import scipy.sparse
        assert not com.is_extension_type(
            scipy.sparse.bsr_matrix([1, 2, 3]))
def test_sparse_bool(self):
    # GH 13110
    s = pd.SparseSeries([True, False, False, True, False, False],
                        fill_value=False)

    result = repr(s)
    dtype = '' if use_32bit_repr else ', dtype=int32'
    exp = ("0 True\n1 False\n2 False\n"
           "3 True\n4 False\n5 False\n"
           "dtype: Sparse[bool, False]\nBlockIndex\n"
           "Block locations: array([0, 3]{0})\n"
           "Block lengths: array([1, 1]{0})".format(dtype))
    assert result == exp

    with option_context("display.max_rows", 3):
        result = repr(s)
        exp = ("0 True\n ... \n5 False\n"
               "Length: 6, dtype: Sparse[bool, False]\nBlockIndex\n"
               "Block locations: array([0, 3]{0})\n"
               "Block lengths: array([1, 1]{0})".format(dtype))
        assert result == exp
def test_sparse_int(self):
    # GH 13110
    s = pd.SparseSeries([0, 1, 0, 0, 1, 0], fill_value=False)

    result = repr(s)
    dtype = "" if use_32bit_repr else ", dtype=int32"
    exp = ("0 0\n1 1\n2 0\n3 0\n4 1\n"
           "5 0\ndtype: Sparse[int64, False]\nBlockIndex\n"
           "Block locations: array([1, 4]{0})\n"
           "Block lengths: array([1, 1]{0})".format(dtype))
    assert result == exp

    with option_context("display.max_rows", 3,
                        "display.show_dimensions", False):
        result = repr(s)
        exp = ("0 0\n ..\n5 0\n"
               "dtype: Sparse[int64, False]\nBlockIndex\n"
               "Block locations: array([1, 4]{0})\n"
               "Block lengths: array([1, 1]{0})".format(dtype))
        assert result == exp
def test_is_extension_type():
    assert not com.is_extension_type([1, 2, 3])
    assert not com.is_extension_type(np.array([1, 2, 3]))
    assert not com.is_extension_type(pd.DatetimeIndex([1, 2, 3]))

    cat = pd.Categorical([1, 2, 3])
    assert com.is_extension_type(cat)
    assert com.is_extension_type(pd.Series(cat))
    assert com.is_extension_type(pd.SparseArray([1, 2, 3]))
    assert com.is_extension_type(pd.SparseSeries([1, 2, 3]))
    assert com.is_extension_type(pd.DatetimeIndex([1, 2, 3],
                                                  tz="US/Eastern"))

    dtype = DatetimeTZDtype("ns", tz="US/Eastern")
    s = pd.Series([], dtype=dtype)
    assert com.is_extension_type(s)

    # This test will only skip if the previous assertions
    # pass AND scipy is not installed.
    sparse = pytest.importorskip("scipy.sparse")
    assert not com.is_extension_type(sparse.bsr_matrix([1, 2, 3]))
def sparsity(self):
    data = pd.read_csv('data-files/raw_data_march.csv')
    data.drop(data.columns[[0, 1, 2, 3, 4, 7, 12, 13, 14, 15, 16,
                            17, 18, 19]], axis=1, inplace=True)
    # df = data.pivot_table(index=['Practitioner'], columns=['ISBN'],
    #                       values=['Qty'])

    user = list(data['Practitioner'].unique())
    isbn = list(data['ISBN'].unique())
    d = data['Qty'].tolist()

    row = data.Practitioner.astype(
        pd.api.types.CategoricalDtype(categories=user)).cat.codes
    col = data.ISBN.astype(
        pd.api.types.CategoricalDtype(categories=isbn)).cat.codes
    sparse_matrix = sps.csr_matrix((d, (row, col)),
                                   shape=(len(user), len(isbn)))
    dfs = pd.SparseDataFrame(
        [pd.SparseSeries(sparse_matrix[i].toarray().ravel(), fill_value=0)
         for i in np.arange(sparse_matrix.shape[0])],
        index=user, columns=isbn, default_fill_value=0)

    # calculate sparsity here
    print(dfs.density)
    return dfs, data
def test_constructor_preserve_attr(self):
    # GH 13866
    arr = pd.SparseArray([1, 0, 3, 0], dtype=np.int64, fill_value=0)
    assert arr.dtype == np.int64
    assert arr.fill_value == 0

    df = pd.SparseDataFrame({'x': arr})
    assert df['x'].dtype == np.int64
    assert df['x'].fill_value == 0

    s = pd.SparseSeries(arr, name='x')
    assert s.dtype == np.int64
    assert s.fill_value == 0

    df = pd.SparseDataFrame(s)
    assert df['x'].dtype == np.int64
    assert df['x'].fill_value == 0

    df = pd.SparseDataFrame({'x': s})
    assert df['x'].dtype == np.int64
    assert df['x'].fill_value == 0
def tfidf(site_dict):
    """ Find 10 words with highest TF-IDF for each site """
    tfidf = TfidfVectorizer()
    tfs = tfidf.fit_transform(site_dict.values())
    tfidf_data = pd.DataFrame([
        pd.SparseSeries(tfs[i].toarray().ravel())
        for i in np.arange(tfs.shape[0])
    ])
    tfidf_data.columns = tfidf.get_feature_names()
    tfidf_data.index = list(site_dict.keys())

    # long format: one row per (site, term) pair, keep top 10 per site
    tfidf_data = tfidf_data.stack().reset_index()
    tfidf_data = tfidf_data.rename(columns={'level_0': 'site',
                                            'level_1': 'term',
                                            0: 'tfidf'})
    tfidf_data = tfidf_data.sort_values(
        by=['site', 'tfidf'], ascending=False).groupby('site').head(10)
    return tfidf_data
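# --- Hedged usage example for tfidf above ---
# A toy site -> text mapping; assumes scikit-learn's TfidfVectorizer is
# imported where tfidf is defined, plus old pandas with pd.SparseSeries.
# The site names and texts are made up for illustration.
sites = {
    'example.com': 'sparse matrices store mostly zeros efficiently',
    'another.org': 'tfidf weighs rare terms more heavily than common ones',
}
top_terms = tfidf(sites)
print(top_terms.head())  # columns: site, term, tfidf (top 10 per site)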
def test_dataframe_dummies_prefix_dict(self, sparse):
    prefixes = {'A': 'from_A', 'B': 'from_B'}
    df = DataFrame({'C': [1, 2, 3],
                    'A': ['a', 'b', 'a'],
                    'B': ['b', 'b', 'c']})
    result = get_dummies(df, prefix=prefixes, sparse=sparse)

    expected = DataFrame({'C': [1, 2, 3],
                          'from_A_a': [1, 0, 1],
                          'from_A_b': [0, 1, 0],
                          'from_B_b': [1, 1, 0],
                          'from_B_c': [0, 0, 1]})

    columns = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c']
    expected[columns] = expected[columns].astype(np.uint8)
    if sparse:
        expected[columns] = expected[columns].apply(
            lambda x: pd.SparseSeries(x)
        )
    assert_frame_equal(result, expected)
def test_constructor_preserve_attr(self):
    # GH 13866
    arr = pd.SparseArray([1, 0, 3, 0], dtype=np.int64, fill_value=0)
    self.assertEqual(arr.dtype, np.int64)
    self.assertEqual(arr.fill_value, 0)

    df = pd.SparseDataFrame({'x': arr})
    self.assertEqual(df['x'].dtype, np.int64)
    self.assertEqual(df['x'].fill_value, 0)

    s = pd.SparseSeries(arr, name='x')
    self.assertEqual(s.dtype, np.int64)
    self.assertEqual(s.fill_value, 0)

    df = pd.SparseDataFrame(s)
    self.assertEqual(df['x'].dtype, np.int64)
    self.assertEqual(df['x'].fill_value, 0)

    df = pd.SparseDataFrame({'x': s})
    self.assertEqual(df['x'].dtype, np.int64)
    self.assertEqual(df['x'].fill_value, 0)