def test_column_dups2(self): # drop buggy GH 6240 df = DataFrame({'A': np.random.randn(5), 'B': np.random.randn(5), 'C': np.random.randn(5), 'D': ['a', 'b', 'c', 'd', 'e']}) expected = df.take([0, 1, 1], axis=1) df2 = df.take([2, 0, 1, 2, 1], axis=1) result = df2.drop('C', axis=1) assert_frame_equal(result, expected) # dropna df = DataFrame({'A': np.random.randn(5), 'B': np.random.randn(5), 'C': np.random.randn(5), 'D': ['a', 'b', 'c', 'd', 'e']}) df.iloc[2, [0, 1, 2]] = np.nan df.iloc[0, 0] = np.nan df.iloc[1, 1] = np.nan df.iloc[:, 3] = np.nan expected = df.dropna(subset=['A', 'B', 'C'], how='all') expected.columns = ['A', 'A', 'B', 'C'] df.columns = ['A', 'A', 'B', 'C'] result = df.dropna(subset=['A', 'C'], how='all') assert_frame_equal(result, expected)
def test_sort_index_multicolumn(self):
    """Compare multi-column sorting against orderings built with ``np.lexsort``.

    Each scenario also asserts that the legacy ``sort_index(by=...)``
    spelling emits a ``FutureWarning``.
    """
    import random

    first_key = np.arange(5).repeat(20)
    second_key = np.tile(np.arange(5), 20)
    random.shuffle(first_key)
    random.shuffle(second_key)
    frame = DataFrame({'A': first_key, 'B': second_key,
                       'C': np.random.randn(100)})

    # use .sort_values #9816
    with tm.assert_produces_warning(FutureWarning):
        frame.sort_index(by=['A', 'B'])
    got = frame.sort_values(by=['A', 'B'])
    # lexsort treats its *last* key as the primary one
    order = np.lexsort((frame['B'], frame['A']))
    assert_frame_equal(got, frame.take(order))

    # use .sort_values #9816
    with tm.assert_produces_warning(FutureWarning):
        frame.sort_index(by=['A', 'B'], ascending=False)
    got = frame.sort_values(by=['A', 'B'], ascending=False)
    rank_b = frame['B'].rank(ascending=False)
    rank_a = frame['A'].rank(ascending=False)
    order = np.lexsort((rank_b, rank_a))
    assert_frame_equal(got, frame.take(order))

    # use .sort_values #9816
    with tm.assert_produces_warning(FutureWarning):
        frame.sort_index(by=['B', 'A'])
    got = frame.sort_values(by=['B', 'A'])
    order = np.lexsort((frame['A'], frame['B']))
    assert_frame_equal(got, frame.take(order))
def sample01():
    """Print a 5x4 frame, a random row permutation of it, and a 3-row sample.

    Output goes to stdout; nothing is returned. ``print`` is called with a
    single argument each time so the function behaves the same on Python 2
    and Python 3.
    """
    df = DataFrame(np.arange(5 * 4).reshape(5, 4))
    sampler = np.random.permutation(5)
    print(sampler)
    print(df)
    # take() selects rows by position, so this prints the rows reordered
    print(df.take(sampler))
    # first three entries of a fresh permutation -> 3 distinct rows
    print(df.take(np.random.permutation(len(df))[:3]))
def slide_16():
    """Demonstrate row permutation and sampling with ``take``.

    Prints a 5x4 frame, a random row permutation of it, a 3-row sample,
    and ten draws (indices from ``np.random.randint``, so repeats are
    possible) from a small array. Output goes to stdout; nothing is
    returned. ``print`` is called with one argument each time so the
    function behaves the same on Python 2 and Python 3.
    """
    df = DataFrame(np.arange(5 * 4).reshape(5, 4))
    sampler = np.random.permutation(5)
    print(sampler)
    print(df)
    print(df.take(sampler))
    # first three entries of a fresh permutation -> 3 distinct rows
    print(df.take(np.random.permutation(len(df))[:3]))

    bag = np.array([5, 7, -1, 6, 4])
    # random positions into ``bag``; the same position may occur repeatedly
    sampler = np.random.randint(0, len(bag), size=10)
    print(sampler)
    draws = bag.take(sampler)
    print(draws)
def test_apply_chunk_view():
    """Slicing each group inside ``apply`` yields the first two rows per key."""
    # Low level tinkering could be unsafe, make sure not
    def first_two(chunk):
        return chunk[:2]

    frame = DataFrame(
        {
            "key": [1, 1, 1, 2, 2, 2, 3, 3, 3],
            "value": range(9),
        }
    )
    per_key = frame.groupby("key", group_keys=False)
    got = per_key.apply(first_two)

    # positions of the first two rows of each of the three groups
    wanted = frame.take([0, 1, 3, 4, 6, 7])
    tm.assert_frame_equal(got, wanted)
def test_sort_index_different_sortorder(self):
    """Sort with a per-key direction (A ascending, B descending).

    Covers the column-based sort, the MultiIndex sort and the Series sort;
    all three are compared against one ``np.lexsort`` ordering.
    """
    shuffle = np.random.permutation(100)
    A = np.arange(20).repeat(5).take(shuffle)
    B = np.tile(np.arange(5), 20).take(shuffle)
    df = DataFrame({'A': A, 'B': B, 'C': np.random.randn(100)})

    # use .sort_values #9816
    with tm.assert_produces_warning(FutureWarning):
        df.sort_index(by=['A', 'B'], ascending=[1, 0])
    by_columns = df.sort_values(by=['A', 'B'], ascending=[1, 0])

    # "max - B" flips B so that an ascending lexsort yields B descending
    flipped_b = df.B.max() - df.B
    ex_indexer = np.lexsort((flipped_b, df.A))
    assert_frame_equal(by_columns, df.take(ex_indexer))

    # test with multiindex, too
    idf = df.set_index(['A', 'B'])
    wanted = idf.take(ex_indexer)
    assert_frame_equal(idf.sort_index(ascending=[1, 0]), wanted)

    # also, Series!
    assert_series_equal(idf['C'].sort_index(ascending=[1, 0]), wanted['C'])
def get_negatives(interactome: pd.DataFrame, positives: set, pname,
                  num: int = 0, bivalent=False) -> set:
    """Pick probable negative nodes from an interaction table.

    :param interactome: dataframe of interaction data; only its first two
        columns are used (they are handed to ``make_edges``)
    :param positives: set of positives; removed from the candidate nodes
    :param pname: label paired with every sampled node in the result
    :param num: number of negatives to randomly generate; ``0`` means
        ``50 * len(positives)``
    :param bivalent: when true, return every non-positive node unsampled
    :returns: the set of non-positive nodes when ``bivalent`` is true,
        otherwise a set of ``(node, pname)`` tuples holding at most ``num``
        randomly chosen non-positive nodes
    """
    if num == 0:
        num = len(positives) * 50

    edges = make_edges(interactome.take([0, 1], axis=1))
    edges = {tuple(e) for e in edges}

    # every endpoint of every edge is a node
    nodes = {endpoint for e in edges for endpoint in (e[0], e[1])}

    # Guard the ratio: an empty interactome used to raise ZeroDivisionError.
    positive_ratio = len(positives) / len(nodes) if nodes else 0.0
    print('%d total nodes (%.3f are positives)' % (len(nodes), positive_ratio))

    nodes = nodes - positives
    print('%d total' % (len(nodes)))

    if bivalent:
        return nodes

    # random.sample raises ValueError when k exceeds the population, which
    # the default of 50 * len(positives) easily does; cap k instead.
    sample_size = min(num, len(nodes))
    return {(a, pname) for a in random.sample(list(nodes), k=sample_size)}
def test_loc_getitem_frame(self):
    """Label lookups through ``.loc`` on a frame indexed by ``pd.cut`` bins."""
    df = DataFrame({"A": range(10)})
    df["B"] = pd.cut(df.A, 5)
    binned = df.set_index("B")

    # scalar label
    tm.assert_frame_equal(binned.loc[4], binned.iloc[4:6])

    with pytest.raises(KeyError, match="10"):
        binned.loc[10]

    # single list-like
    tm.assert_frame_equal(binned.loc[[4]], binned.iloc[4:6])

    # non-unique
    got = binned.loc[[4, 5]]
    wanted = binned.take([4, 5, 4, 5])
    tm.assert_frame_equal(got, wanted)

    with pytest.raises(KeyError, match="^$"):
        binned.loc[[10]]

    # partial missing
    with pytest.raises(KeyError, match="^$"):
        binned.loc[[10, 4]]
def test_loc_getitem_frame(self):
    """Label lookups through ``.loc`` on a frame indexed by ``pd.cut`` bins."""
    source = DataFrame({'A': range(10)})
    bins = pd.cut(source.A, 5)
    source['B'] = bins
    frame = source.set_index('B')

    # scalar label
    wanted = frame.iloc[4:6]
    tm.assert_frame_equal(frame.loc[4], wanted)

    with pytest.raises(KeyError):
        frame.loc[10]

    # single list-like
    wanted = frame.iloc[4:6]
    tm.assert_frame_equal(frame.loc[[4]], wanted)

    # non-unique
    wanted = frame.take([4, 5, 4, 5])
    tm.assert_frame_equal(frame.loc[[4, 5]], wanted)

    with pytest.raises(KeyError):
        frame.loc[[10]]

    # partial missing
    with pytest.raises(KeyError):
        frame.loc[[10, 4]]
def test_apply_chunk_view():
    """Slicing each group inside ``apply`` yields the first two rows per key."""
    # Low level tinkering could be unsafe, make sure not
    def first_two(chunk):
        return chunk[:2]

    columns = {'key': [1, 1, 1, 2, 2, 2, 3, 3, 3],
               'value': compat.lrange(9)}
    frame = DataFrame(columns)

    got = frame.groupby('key', group_keys=False).apply(first_two)
    # positions of the first two rows of each of the three groups
    wanted = frame.take([0, 1, 3, 4, 6, 7])
    tm.assert_frame_equal(got, wanted)
def test_apply_chunk_view():
    """Slicing each group inside ``apply`` yields the first two rows per key."""
    # Low level tinkering could be unsafe, make sure not
    frame = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3],
                       'value': compat.lrange(9)})
    grouped = frame.groupby('key', group_keys=False)

    # return view
    got = grouped.apply(lambda x: x[:2])

    keep_positions = [0, 1, 3, 4, 6, 7]
    tm.assert_frame_equal(got, frame.take(keep_positions))
def test_apply_chunk_view(group_keys): # Low level tinkering could be unsafe, make sure not df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) result = df.groupby("key", group_keys=group_keys).apply(lambda x: x.iloc[:2]) expected = df.take([0, 1, 3, 4, 6, 7]) if group_keys: expected.index = MultiIndex.from_arrays( [[1, 1, 2, 2, 3, 3], expected.index], names=["key", None]) tm.assert_frame_equal(result, expected)
def test_sort_values_multicolumn(self): A = np.arange(5).repeat(20) B = np.tile(np.arange(5), 20) random.shuffle(A) random.shuffle(B) frame = DataFrame({"A": A, "B": B, "C": np.random.randn(100)}) result = frame.sort_values(by=["A", "B"]) indexer = np.lexsort((frame["B"], frame["A"])) expected = frame.take(indexer) tm.assert_frame_equal(result, expected) result = frame.sort_values(by=["A", "B"], ascending=False) indexer = np.lexsort((frame["B"].rank(ascending=False), frame["A"].rank(ascending=False))) expected = frame.take(indexer) tm.assert_frame_equal(result, expected) result = frame.sort_values(by=["B", "A"]) indexer = np.lexsort((frame["A"], frame["B"])) expected = frame.take(indexer) tm.assert_frame_equal(result, expected)
def test_column_dups2(self): # drop buggy GH 6240 df = DataFrame( { "A": np.random.randn(5), "B": np.random.randn(5), "C": np.random.randn(5), "D": ["a", "b", "c", "d", "e"], } ) expected = df.take([0, 1, 1], axis=1) df2 = df.take([2, 0, 1, 2, 1], axis=1) result = df2.drop("C", axis=1) tm.assert_frame_equal(result, expected) # dropna df = DataFrame( { "A": np.random.randn(5), "B": np.random.randn(5), "C": np.random.randn(5), "D": ["a", "b", "c", "d", "e"], } ) df.iloc[2, [0, 1, 2]] = np.nan df.iloc[0, 0] = np.nan df.iloc[1, 1] = np.nan df.iloc[:, 3] = np.nan expected = df.dropna(subset=["A", "B", "C"], how="all") expected.columns = ["A", "A", "B", "C"] df.columns = ["A", "A", "B", "C"] result = df.dropna(subset=["A", "C"], how="all") tm.assert_frame_equal(result, expected)
def test_sort_values_stable_multicolumn_sort(self, expected_idx_non_na, ascending, na_position): # GH#38426 Clarify sort_values with mult. columns / labels is stable df = DataFrame({ "A": [1, 2, np.nan, 1, 1, 1, 6, 8, 4, 8, 8, np.nan, np.nan, 8, 8], "B": [9, np.nan, 5, 2, 2, 2, 5, 4, 5, 3, 4, np.nan, np.nan, 4, 4], }) # All rows with NaN in col "B" only have unique values in "A", therefore, # only the rows with NaNs in "A" have to be treated individually: expected_idx = ([11, 12, 2] + expected_idx_non_na if na_position == "first" else expected_idx_non_na + [2, 11, 12]) expected = df.take(expected_idx) sorted_df = df.sort_values(["A", "B"], ascending=ascending, na_position=na_position) tm.assert_frame_equal(sorted_df, expected)
def test_groupby_datetime_categorical(self):
    """Group by an ordered categorical whose categories are dates."""
    # GH9049: ensure backward compatibility
    levels = pd.date_range('2014-01-01', periods=4)
    codes = np.random.randint(0, 4, size=100)
    cats = Categorical.from_codes(codes, levels, ordered=True)
    data = DataFrame(np.random.randn(100, 4))

    # mean per category vs. mean per plain date value
    got = data.groupby(cats).mean()
    wanted = data.groupby(np.asarray(cats)).mean().reindex(levels)
    date_labels = wanted.index
    wanted.index = CategoricalIndex(date_labels, categories=date_labels,
                                    ordered=True)
    assert_frame_equal(got, wanted)

    # describe() vs. the same call on data pre-sorted by category code
    desc_result = data.groupby(cats).describe()
    by_code = cats.codes.argsort()
    ord_labels = cats.take_nd(by_code)
    wanted = data.take(by_code).groupby(ord_labels).describe()
    assert_frame_equal(desc_result, wanted)
    tm.assert_index_equal(desc_result.index, wanted.index)
    tm.assert_index_equal(desc_result.index.get_level_values(0),
                          wanted.index.get_level_values(0))

    def stacked_level(position):
        # index level of the stacked describe() output
        return desc_result.stack().index.get_level_values(position)

    # GH 10460
    expc = Categorical.from_codes(np.arange(4).repeat(8), levels,
                                  ordered=True)
    self.assert_index_equal(stacked_level(0), CategoricalIndex(expc))

    stat_names = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
    self.assert_index_equal(stacked_level(1), Index(stat_names * 4))
def test_groupby_categorical(self): levels = ['foo', 'bar', 'baz', 'qux'] codes = np.random.randint(0, 4, size=100) cats = Categorical.from_codes(codes, levels, ordered=True) data = DataFrame(np.random.randn(100, 4)) result = data.groupby(cats).mean() expected = data.groupby(np.asarray(cats)).mean() exp_idx = CategoricalIndex(levels, categories=cats.categories, ordered=True) expected = expected.reindex(exp_idx) assert_frame_equal(result, expected) grouped = data.groupby(cats) desc_result = grouped.describe() idx = cats.codes.argsort() ord_labels = np.asarray(cats).take(idx) ord_data = data.take(idx) exp_cats = Categorical(ord_labels, ordered=True, categories=['foo', 'bar', 'baz', 'qux']) expected = ord_data.groupby(exp_cats, sort=False).describe() assert_frame_equal(desc_result, expected) # GH 10460 expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True) exp = CategoricalIndex(expc) tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp) exp = Index( ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'] * 4) tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp)
def test_groupby_datetime_categorical(self):
    """Group by an ordered categorical whose categories are dates."""
    # GH9049: ensure backward compatibility
    levels = pd.date_range('2014-01-01', periods=4)
    codes = np.random.randint(0, 4, size=100)
    cats = Categorical.from_codes(codes, levels, ordered=True)
    data = DataFrame(np.random.randn(100, 4))

    # mean per category vs. mean per plain date value
    by_category = data.groupby(cats).mean()
    by_value = data.groupby(np.asarray(cats)).mean()
    by_value = by_value.reindex(levels)
    by_value.index = CategoricalIndex(by_value.index,
                                      categories=by_value.index,
                                      ordered=True)
    assert_frame_equal(by_category, by_value)

    # describe() vs. the same call on data pre-sorted by category code
    desc_result = data.groupby(cats).describe()
    sorter = cats.codes.argsort()
    sorted_labels = cats.take_nd(sorter)
    sorted_data = data.take(sorter)
    wanted = sorted_data.groupby(sorted_labels).describe()
    assert_frame_equal(desc_result, wanted)
    tm.assert_index_equal(desc_result.index, wanted.index)
    tm.assert_index_equal(desc_result.index.get_level_values(0),
                          wanted.index.get_level_values(0))

    # GH 10460
    expc = Categorical.from_codes(np.arange(4).repeat(8), levels,
                                  ordered=True)
    outer = desc_result.stack().index.get_level_values(0)
    tm.assert_index_equal(outer, CategoricalIndex(expc))

    stat_names = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
    inner = desc_result.stack().index.get_level_values(1)
    tm.assert_index_equal(inner, Index(stat_names * 4))
def test_groupby_categorical(self):
    """Group random data by an ordered string categorical."""
    levels = ['foo', 'bar', 'baz', 'qux']
    codes = np.random.randint(0, 4, size=100)
    cats = Categorical.from_codes(codes, levels, ordered=True)
    data = DataFrame(np.random.randn(100, 4))

    # mean per category vs. mean per plain string value
    by_category = data.groupby(cats).mean()
    by_value = data.groupby(np.asarray(cats)).mean()
    exp_idx = CategoricalIndex(levels, categories=cats.categories,
                               ordered=True)
    by_value = by_value.reindex(exp_idx)
    assert_frame_equal(by_category, by_value)

    # describe() vs. the same call on data pre-sorted by category code
    desc_result = data.groupby(cats).describe()
    sorter = cats.codes.argsort()
    sorted_labels = np.asarray(cats).take(sorter)
    sorted_data = data.take(sorter)
    exp_cats = Categorical(sorted_labels, ordered=True,
                           categories=['foo', 'bar', 'baz', 'qux'])
    wanted = sorted_data.groupby(exp_cats, sort=False).describe()
    assert_frame_equal(desc_result, wanted)

    # GH 10460
    expc = Categorical.from_codes(np.arange(4).repeat(8), levels,
                                  ordered=True)
    outer = desc_result.stack().index.get_level_values(0)
    self.assert_index_equal(outer, CategoricalIndex(expc))

    stat_names = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
    inner = desc_result.stack().index.get_level_values(1)
    self.assert_index_equal(inner, Index(stat_names * 4))
def get_negatives(interactome: pd.DataFrame, positives: set, pname,
                  num: int = 0, bivalent=False) -> set:
    """Pick probable negative edges from an interaction table.

    :param interactome: dataframe of interaction data; only its first two
        columns are used (they are handed to ``make_edges``)
    :param positives: set of positives; subtracted from the candidate edges
    :param pname: label added to every sampled edge in the result
    :param num: number of negatives to randomly generate; ``0`` means
        ``50 * len(positives)``
    :param bivalent: when true, return every non-positive edge unsampled
    :returns: the non-positive edges when ``bivalent`` is true, otherwise a
        set of ``frozenset((a, b, pname))`` built from at most ``num``
        randomly chosen non-positive edges
    """
    if num == 0:
        num = len(positives) * 50

    # NOTE(review): the set difference assumes make_edges returns a set of
    # hashable two-element edges - confirm against make_edges.
    edges = make_edges(interactome.take([0, 1], axis=1))
    edges = edges - positives

    if bivalent:
        return edges

    # random.sample raises ValueError when k exceeds the population, which
    # the default of 50 * len(positives) easily does; cap k instead.
    sample_size = min(num, len(edges))
    samp = set(random.sample(list(edges), k=sample_size))
    psamp = {frozenset((a, b, pname)) for (a, b) in samp}
    ## Nnode negs should be determined here too. - AR
    return psamp
def normalize_synop_data(
        all_synop_data: pd.DataFrame,
        synop_data_indices: [int],
        features,
        length_of_sequence,
        normalization_type: NormalizationType = NormalizationType.STANDARD):
    """Normalize the ``features`` columns of ``all_synop_data`` in place.

    Statistics come from ``normalize`` applied to the rows covered by the
    sequences (each entry of ``synop_data_indices`` plus the following
    ``length_of_sequence - 1`` positions). They are then applied to every
    row of ``all_synop_data``.

    Returns the mutated frame and the two statistics returned by
    ``normalize`` (named ``mean_or_min`` and ``std_or_max`` here).
    """
    # Bear in mind that synop_data_indices are indices of FIRST synop in the
    # sequence. Not all synop data exist in synop_data_indices because of
    # that fact.
    covered_rows = {
        first + offset
        for first in synop_data_indices
        for offset in range(0, length_of_sequence)
    }
    relevant_rows = all_synop_data.take(list(covered_rows))

    _, mean_or_min, std_or_max = normalize(relevant_rows[features].values,
                                           normalization_type)

    shifted = all_synop_data[features].values - mean_or_min
    if normalization_type == NormalizationType.STANDARD:
        # (x - first statistic) / second statistic
        scale = std_or_max
    else:
        # (x - first statistic) / (second statistic - first statistic)
        scale = std_or_max - mean_or_min

    all_synop_data[features] = shifted / scale
    return all_synop_data, mean_or_min, std_or_max
def test_sort_index_different_sortorder(self): A = np.arange(20).repeat(5) B = np.tile(np.arange(5), 20) indexer = np.random.permutation(100) A = A.take(indexer) B = B.take(indexer) df = DataFrame({"A": A, "B": B, "C": np.random.randn(100)}) ex_indexer = np.lexsort((df.B.max() - df.B, df.A)) expected = df.take(ex_indexer) # test with multiindex, too idf = df.set_index(["A", "B"]) result = idf.sort_index(ascending=[1, 0]) expected = idf.take(ex_indexer) tm.assert_frame_equal(result, expected) # also, Series! result = idf["C"].sort_index(ascending=[1, 0]) tm.assert_series_equal(result, expected["C"])
def test_datetime():
    """Group by an ordered categorical whose categories are dates."""
    # GH9049: ensure backward compatibility
    levels = pd.date_range("2014-01-01", periods=4)
    codes = np.random.randint(0, 4, size=100)
    cats = Categorical.from_codes(codes, levels, ordered=True)
    data = DataFrame(np.random.randn(100, 4))

    # mean per category vs. mean per plain date value
    got = data.groupby(cats, observed=False).mean()
    wanted = data.groupby(np.asarray(cats), observed=False).mean()
    wanted = wanted.reindex(levels)
    date_labels = wanted.index
    wanted.index = CategoricalIndex(date_labels, categories=date_labels,
                                    ordered=True)
    tm.assert_frame_equal(got, wanted)

    # describe() vs. the same call on data pre-sorted by category code
    desc_result = data.groupby(cats, observed=False).describe()
    sorter = cats.codes.argsort()
    sorted_labels = cats.take(sorter)
    sorted_data = data.take(sorter)
    wanted = sorted_data.groupby(sorted_labels, observed=False).describe()
    tm.assert_frame_equal(desc_result, wanted)
    tm.assert_index_equal(desc_result.index, wanted.index)
    tm.assert_index_equal(desc_result.index.get_level_values(0),
                          wanted.index.get_level_values(0))

    def stacked_level(position):
        # index level of the stacked describe() output
        return desc_result.stack().index.get_level_values(position)

    # GH 10460
    expc = Categorical.from_codes(np.arange(4).repeat(8), levels,
                                  ordered=True)
    tm.assert_index_equal(stacked_level(0), CategoricalIndex(expc))

    stat_names = ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]
    tm.assert_index_equal(stacked_level(1), Index(stat_names * 4))
# Outlier handling on ``df`` (defined earlier in the script).
col = df[0]
col.head()
col[np.abs(col) > 3]
# rows holding at least one value with |value| > 3; ``axis`` is passed by
# keyword because pandas >= 2.0 no longer accepts it positionally
df[(np.abs(df) > 3).any(axis=1)]
df[np.abs(df) > 3] = np.sign(df) * 3  # cap outliers in the data at 3
df.describe()

##### Permutations
df = DataFrame(np.arange(16).reshape(4, 4))
df
blender = np.random.permutation(4)  # without replacement
blender
df.take(blender)  # permute the rows
box = np.array([1, 2, 3])
# with replacement
shaker = np.random.randint(0, len(box), size=10)
shaker
box.take(shaker)

##### Groupby on DFs
df = DataFrame({'k1': ['X', 'X', 'Y', 'Y', 'Z'],
                'k2': ['alpha', 'beta', 'alpha', 'beta', 'alpha'],
                'dataset1': np.random.randn(5),
                'dataset2': np.random.randn(5)})
df
# group the 'dataset1' column by the values of 'k1'
group1 = df['dataset1'].groupby(df['k1'])
group1
def test_basic():
    """Groupby on categorical keys: aggregation, transform, filter, apply
    and describe, including categories that have no rows."""
    cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
                       categories=["a", "b", "c", "d"], ordered=True)
    data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})

    # 'd' is a declared category with no rows; the expected mean for it is NaN
    exp_index = CategoricalIndex(list('abcd'), name='b', ordered=True)
    expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index)
    result = data.groupby("b", observed=False).mean()
    tm.assert_frame_equal(result, expected)

    cat1 = Categorical(["a", "a", "b", "b"],
                       categories=["a", "b", "z"], ordered=True)
    cat2 = Categorical(["c", "d", "c", "d"],
                       categories=["c", "d", "y"], ordered=True)
    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})

    # single grouper
    # 'z' has no rows; the expected sum for it is 0
    gb = df.groupby("A", observed=False)
    exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True)
    expected = DataFrame({'values': Series([3, 7, 0], index=exp_idx)})
    result = gb.sum()
    tm.assert_frame_equal(result, expected)

    # GH 8623
    x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'],
                   [1, 'John P. Doe']],
                  columns=['person_id', 'person_name'])
    x['person_name'] = Categorical(x.person_name)

    # identity transform must hand back the non-grouping column unchanged
    g = x.groupby(['person_id'], observed=False)
    result = g.transform(lambda x: x)
    tm.assert_frame_equal(result, x[['person_name']])

    result = x.drop_duplicates('person_name')
    expected = x.iloc[[0, 1]]
    tm.assert_frame_equal(result, expected)

    def f(x):
        # first row of the group after removing duplicated names
        return x.drop_duplicates('person_name').iloc[0]

    result = g.apply(f)
    expected = x.iloc[[0, 1]].copy()
    expected.index = Index([1, 2], name='person_id')
    expected['person_name'] = expected['person_name'].astype('object')
    tm.assert_frame_equal(result, expected)

    # GH 9921
    # Monotonic
    df = DataFrame({"a": [5, 15, 25]})
    c = pd.cut(df.a, bins=[0, 10, 20, 30, 40])

    # each value falls in a different bin, so every transform below is
    # expected to return the input unchanged
    result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df['a'])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df['a'])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(sum),
        df[['a']])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(lambda xs: np.max(xs)),
        df[['a']])

    # Filter
    tm.assert_series_equal(
        df.a.groupby(c, observed=False).filter(np.all),
        df['a'])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).filter(np.all),
        df)

    # Non-monotonic
    df = DataFrame({"a": [5, 15, 25, -5]})
    c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40])

    result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df['a'])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df['a'])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(sum),
        df[['a']])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df[['a']])

    # GH 9603
    df = DataFrame({'a': [1, 0, 0, 0]})
    c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list('abcd')))
    result = df.groupby(c, observed=False).apply(len)

    # expected group sizes: 1 for the first label, 0 for the other three
    exp_index = CategoricalIndex(
        c.values.categories, ordered=c.values.ordered)
    expected = Series([1, 0, 0, 0], index=exp_index)
    expected.index.name = 'a'
    tm.assert_series_equal(result, expected)

    # more basic
    levels = ['foo', 'bar', 'baz', 'qux']
    codes = np.random.randint(0, 4, size=100)

    cats = Categorical.from_codes(codes, levels, ordered=True)

    data = DataFrame(np.random.randn(100, 4))

    # mean per category vs. mean per plain string value, reindexed into
    # category order
    result = data.groupby(cats, observed=False).mean()

    expected = data.groupby(np.asarray(cats), observed=False).mean()
    exp_idx = CategoricalIndex(levels,
                               categories=cats.categories,
                               ordered=True)
    expected = expected.reindex(exp_idx)

    assert_frame_equal(result, expected)

    grouped = data.groupby(cats, observed=False)
    desc_result = grouped.describe()

    # reference: same describe() on the data pre-sorted by category code
    idx = cats.codes.argsort()
    ord_labels = np.asarray(cats).take(idx)
    ord_data = data.take(idx)

    exp_cats = Categorical(ord_labels, ordered=True,
                           categories=['foo', 'bar', 'baz', 'qux'])
    expected = ord_data.groupby(
        exp_cats, sort=False, observed=False).describe()
    assert_frame_equal(desc_result, expected)

    # GH 10460
    # expected outer level of the stacked result: each of the 4 category
    # codes repeated 8 times
    expc = Categorical.from_codes(np.arange(4).repeat(8),
                                  levels, ordered=True)
    exp = CategoricalIndex(expc)
    tm.assert_index_equal((desc_result.stack().index
                           .get_level_values(0)), exp)
    # expected inner level: the 8 statistic names repeated 4 times
    exp = Index(['count', 'mean', 'std', 'min', '25%', '50%',
                 '75%', 'max'] * 4)
    tm.assert_index_equal((desc_result.stack().index
                           .get_level_values(1)), exp)
def test_iloc_getitem_with_duplicates2(self): # GH#2259 df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=[1, 1, 2]) result = df.iloc[:, [0]] expected = df.take([0], axis=1) tm.assert_frame_equal(result, expected)
def test_basic():
    """Groupby on categorical keys: aggregation, transform, filter, apply
    and describe, including categories that have no rows."""
    cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
                       categories=["a", "b", "c", "d"], ordered=True)
    data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})

    # 'd' is a declared category with no rows; the expected mean for it is NaN
    exp_index = CategoricalIndex(list('abcd'), name='b', ordered=True)
    expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index)
    result = data.groupby("b", observed=False).mean()
    tm.assert_frame_equal(result, expected)

    cat1 = Categorical(["a", "a", "b", "b"],
                       categories=["a", "b", "z"], ordered=True)
    cat2 = Categorical(["c", "d", "c", "d"],
                       categories=["c", "d", "y"], ordered=True)
    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})

    # single grouper
    # 'z' has no rows; the expected sum for it is 0
    gb = df.groupby("A", observed=False)
    exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True)
    expected = DataFrame({'values': Series([3, 7, 0], index=exp_idx)})
    result = gb.sum()
    tm.assert_frame_equal(result, expected)

    # GH 8623
    x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'],
                   [1, 'John P. Doe']],
                  columns=['person_id', 'person_name'])
    x['person_name'] = Categorical(x.person_name)

    # identity transform must hand back the non-grouping column unchanged
    g = x.groupby(['person_id'], observed=False)
    result = g.transform(lambda x: x)
    tm.assert_frame_equal(result, x[['person_name']])

    result = x.drop_duplicates('person_name')
    expected = x.iloc[[0, 1]]
    tm.assert_frame_equal(result, expected)

    def f(x):
        # first row of the group after removing duplicated names
        return x.drop_duplicates('person_name').iloc[0]

    result = g.apply(f)
    expected = x.iloc[[0, 1]].copy()
    expected.index = Index([1, 2], name='person_id')
    expected['person_name'] = expected['person_name'].astype('object')
    tm.assert_frame_equal(result, expected)

    # GH 9921
    # Monotonic
    df = DataFrame({"a": [5, 15, 25]})
    c = pd.cut(df.a, bins=[0, 10, 20, 30, 40])

    # each value falls in a different bin, so every transform below is
    # expected to return the input unchanged
    result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df['a'])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df['a'])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(sum),
        df[['a']])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(lambda xs: np.max(xs)),
        df[['a']])

    # Filter
    tm.assert_series_equal(
        df.a.groupby(c, observed=False).filter(np.all),
        df['a'])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).filter(np.all),
        df)

    # Non-monotonic
    df = DataFrame({"a": [5, 15, 25, -5]})
    c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40])

    result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df['a'])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df['a'])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(sum),
        df[['a']])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df[['a']])

    # GH 9603
    df = DataFrame({'a': [1, 0, 0, 0]})
    c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list('abcd')))
    result = df.groupby(c, observed=False).apply(len)

    # expected group sizes: 1 for the first label, 0 for the other three
    exp_index = CategoricalIndex(
        c.values.categories, ordered=c.values.ordered)
    expected = Series([1, 0, 0, 0], index=exp_index)
    expected.index.name = 'a'
    tm.assert_series_equal(result, expected)

    # more basic
    levels = ['foo', 'bar', 'baz', 'qux']
    codes = np.random.randint(0, 4, size=100)

    cats = Categorical.from_codes(codes, levels, ordered=True)

    data = DataFrame(np.random.randn(100, 4))

    # mean per category vs. mean per plain string value, reindexed into
    # category order
    result = data.groupby(cats, observed=False).mean()

    expected = data.groupby(np.asarray(cats), observed=False).mean()
    exp_idx = CategoricalIndex(levels,
                               categories=cats.categories,
                               ordered=True)
    expected = expected.reindex(exp_idx)

    assert_frame_equal(result, expected)

    grouped = data.groupby(cats, observed=False)
    desc_result = grouped.describe()

    # reference: same describe() on the data pre-sorted by category code
    idx = cats.codes.argsort()
    ord_labels = np.asarray(cats).take(idx)
    ord_data = data.take(idx)

    exp_cats = Categorical(ord_labels, ordered=True,
                           categories=['foo', 'bar', 'baz', 'qux'])
    expected = ord_data.groupby(
        exp_cats, sort=False, observed=False).describe()
    assert_frame_equal(desc_result, expected)

    # GH 10460
    # expected outer level of the stacked result: each of the 4 category
    # codes repeated 8 times
    expc = Categorical.from_codes(np.arange(4).repeat(8),
                                  levels, ordered=True)
    exp = CategoricalIndex(expc)
    tm.assert_index_equal((desc_result.stack().index
                           .get_level_values(0)), exp)
    # expected inner level: the 8 statistic names repeated 4 times
    exp = Index(['count', 'mean', 'std', 'min', '25%', '50%',
                 '75%', 'max'] * 4)
    tm.assert_index_equal((desc_result.stack().index
                           .get_level_values(1)), exp)
df3[df7] # In[ ]: # In[6]: df3 # In[8]: per = np.random.permutation(6) per # In[12]: df3.take(per) # In[ ]: # In[13]: df4 = DataFrame({ 'item': ['Apple', 'Banana', 'Orenge', 'Banana', 'Orenge', 'Apple'], 'price': [4, 3, 3, 2.5, 4, 2], 'color': ['red', 'yellow', 'orenge', 'yellow', 'green', 'green'] }) df4 # In[17]: df4.groupby('item')
# ``cats`` is defined earlier in the script; count how often each value occurs.
print(cats)
agg = pd.value_counts(cats)
print(agg)

# detecting and filtering outliers
np.random.seed(12345)
data = DataFrame(np.random.randn(100, 4))
# keep the rows where any value has |value| > 3
dat = data[(np.abs(data) > 3).any(axis=1)]
print(dat)

# permutation and Random sampling
df = DataFrame(np.arange(20).reshape(5, 4))
sampler = np.random.permutation(5)
# reorder the rows by position according to the permutation
df = df.take(sampler)
print(df)

# Computing Indicator
df = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'], 'data1': range(6)})
# one indicator column per distinct key, joined with the data1 column
indicator = pd.get_dummies(df['key']).join(df['data1'])
print(indicator)

# string object methods
string = 'string is good. .....We use it to do lot of things.'
# drop every '.' and split on whitespace
s = string.replace('.', '').split()
print(s)

# Visualization
# Figures
# rows of ``data`` (defined earlier) holding at least one value with
# |value| > 3; ``axis`` is passed by keyword because pandas >= 2.0 no
# longer accepts it positionally
data[(np.abs(data) > 3).any(axis=1)]
# 2
# cap every value to the interval [-3, 3]
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()

### Permutation and random sampling
# 1
df = DataFrame(np.arange(5 * 4).reshape((5, 4)))
sampler = np.random.permutation(5)
sampler
df
df.take(sampler)
# 2
# first three entries of a fresh permutation -> 3 distinct rows
df.take(np.random.permutation(len(df))[:3])
# 3
bag = np.array([5, 7, -1, 6, 4])
# random positions into ``bag``; the same position may occur repeatedly
sampler = np.random.randint(0, len(bag), size=10)
sampler
draws = bag.take(sampler)
draws

### Computing indicator / dummy variables
# 1
import numpy as np
from pandas import DataFrame

dframe = DataFrame(np.arange(4 * 4).reshape((4, 4)))
blender = np.array([0, 3, 2, 1])
dframe.take(blender)  # reorder the rows according to blender
dframe.take(blender, axis=1)  # reorder the columns

box = np.array(['A', 'B', 'C'])
# draw 10 random integers in 0..2 (below the number of elements in box)
shaker = np.random.randint(0, len(box), size=10)
print(shaker)
# use those random integers to pick elements (0 picks box[0])
hand_grabs = box.take(shaker)
print(hand_grabs)
print(box.take(0))
# which values in col have an absolute value greater than 3
col[np.abs(col) > 3]

# rows with such a value in any column; ``axis`` is passed by keyword
# because pandas >= 2.0 no longer accepts it positionally
dframe[(np.abs(dframe) > 3).any(axis=1)]

# anywhere in dframe where abs() > 3 set that equal to sign of that value
# and multiply it by 3
# used to cap outliers
dframe[np.abs(dframe) > 3] = np.sign(dframe) * 3
dframe.describe()

""" Permutations """

# randomly reorder in a series or dataframe
df = DataFrame(np.arange(16).reshape(4, 4))

# create a random permutation
blender = np.random.permutation(4)

# get a permutation of the rows
df.take(blender)

# box with 3 marbles
box = np.array([1, 2, 3])

# permutation with replacement, size is how many times to pick from the box
shaker = np.random.randint(0, len(box), size=10)

# simulates taking a marble with replacement
hand_grabs = box.take(shaker)
# pivot long to wide format # check help(pd.DataFrame.pivot) and help(pd.DataFrame.pivot_table) # transform # help(pd.Series.map) # replace values data = Series([1.,-999.,2.,-999.,-1000.,3.]) data.replace(-999, np.nan) data.replace([-999,-1000], np.nan) data.replace([-999,-1000], [np.nan,0]) # data.replace({-999:np.nan,-1000:0}) # permutation and random sampling df = DataFrame(np.arange(20).reshape(5,4)) sampler = np.random.permutation(5) df.take(sampler) df.take(sampler[:3]) df.take(np.random.permutation(len(df))[:3]) # sample without replacement (slow but works) sampler = np.random.randint(0, len(df), size=10) df.take(sampler) # sample with replacement # get dummy variables for categorical variable df = DataFrame({'key':['b','b','a','c','a','b'], 'value':range(6)}) dummies = pd.get_dummies(df['key'], prefix='key') df_with_dummy = df[['value']].join(dummies) # type(df['value']) and type(df[['value']]) # string manipulation ''' startwith, endwith, split, strip, count index, find, rfind, replace, join,
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# ``print`` is called with a single argument everywhere so the script runs
# unchanged on Python 2 and Python 3.

# Create a permutation of dataframe
dframe = DataFrame(np.arange(16).reshape(4, 4))
print(dframe)

# blender creates a random permutation of 0 1 2 3
blender = np.random.permutation(4)
print(blender)

# From the dataframe, take index (rows) in the order that is in blender
print(dframe.take(blender))

# Permutation with replacement
box = np.array([1, 2, 3])
# randint picks positions 0 .. len(box) - 1, with replacement, 10 times
shaker = np.random.randint(0, len(box), size=10)
print(shaker)

# take box values in the order of shaker
hand_grab = box.take(shaker)
##檢測和過濾極端值(outlier) np.random.seed(12345) df15 = DataFrame(np.random.randn(1000, 4)) df15.describe() col = df15[3] col[np.abs(col) > 3] #找出絕對值大於3的數 df15[(np.abs(df15) > 3).any(1)] #找出有任一有大於3的rows,any(1) => axis=1 df15[np.abs(df15) > 3] = np.sign(df15) * 3 #np.sign是返回-1和1組成的array df15.describe() #能看出來max=3 / min=-3 ##排列和隨機抽樣 df16 = DataFrame(np.arange(5 * 4).reshape((5, 4))) sampler = np.random.permutation(len(df16.index)) sampler #產生一個隨機排列的array df16 df16.take(sampler) #對index重新排列 df16.take(sampler[:3]) #可以在此選擇子集 ##計算指標與Dummy df17 = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'], 'data1': range(6)}) pd.get_dummies(df17['key']) dummies = pd.get_dummies(df17['key'], prefix='key') df17_dum = df17[['data1']].join(dummies) df17_dum ###建立指標的方式,但是對於大規模的數據庫時,會變得很慢,需要改良 mnames = ['movie_id', 'tittle', 'genres'] movies = pd.read_table( '/Users/changyueh/Desktop/CodePractice/Data_Analysis/Chapt2/ml-1m/movies.dat', sep='::', header=None,
# Walkthrough: reorder the rows of a DataFrame with a random permutation,
# then draw array values with replacement. Each stage is printed, with a rule
# of 50 "=" characters between the DataFrame stages.
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

dframe = DataFrame(np.arange(16).reshape((4, 4)))
blender = np.random.permutation(4)

# Show the permutation, then the untouched frame, each followed by a rule.
for shown in (blender, dframe):
    print(shown)
    print("=" * 50)

# Use blender as the positional row order; the result is a new frame and
# dframe itself is left as it was.
shuffled = dframe.iloc[blender]
print(shuffled)
print("=" * 50)

# Permutation with replacement: randint draws 10 positions from
# [0, len(box)), so positions can repeat.
box = np.array([1, 2, 3])
shaker = np.random.randint(0, len(box), size=10)
print(shaker)

# Values of box at the drawn positions.
hand_grabs = box[shaker]
print(hand_grabs)
# Notebook export: the "# In[n]:" markers are cell boundaries and the bare
# expressions are cell outputs. `data`, `np` and `DataFrame` come from earlier
# in the file, outside this fragment.

# Cap outliers: every entry with |x| > 3 is replaced by sign(x) * 3.
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()


# In[142]:

# 5x4 frame holding 0..19, and a random ordering of its five row positions.
df = DataFrame(np.arange(5 * 4).reshape(5, 4))
sampler = np.random.permutation(5)
sampler


# In[143]:

df


# In[144]:

# Rows of df in the positional order given by sampler.
df.take(sampler)


# In[145]:

# Three rows picked via the first three entries of a fresh permutation,
# so no row can appear twice.
df.take(np.random.permutation(len(df))[:3])


# In[146]:

# Sampling with replacement: 10 positions drawn from [0, len(bag)).
bag = np.array([5, 7, -1, 6, 4])
sampler = np.random.randint(0, len(bag), size=10)
sampler


# In[148]:

# Values of bag at the drawn positions.
draws = bag.take(sampler)
draws
# `data`, `fs`, `pprint`, `np`, `pd`, `Series` and `DataFrame` come from
# earlier in the file, outside this fragment.
# NOTE(review): fs() is called between printed outputs and presumably prints
# a separator - confirm against its definition.

# Cap outliers: every entry with |x| > 3 is replaced by sign(x) * 3.
data[np.abs(data) > 3] = np.sign(data) * 3
print(data.describe())
fs()
print(np.sign(data).head())

# 7.2.7 Permutation and random sampling
fs()
df = DataFrame(np.arange(5 * 4).reshape((5, 4)))
sampler = np.random.permutation(5)
pprint(sampler)
fs()
print(df)
fs()
# Rows of df in the positional order given by sampler.
print(df.take(sampler))
fs()
# Three randomly chosen rows.
print(df.sample(n=3))
fs()
# Ten draws from five values; replace=True allows repeats.
choices = Series([5, 7, -1, 6, 4])
draws = choices.sample(n=10, replace=True)
print(draws)

# 7.2.8 Computing indicator / dummy variables
fs()
df = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'], 'data1': range(6)})
print(df)
print(pd.get_dummies(df['key']))
# Outlier capping, row permutation / random sampling, and dummy variables.
# `np`, `pd` and `DataFrame` are imported elsewhere in the file.

# 1000 x 4 frame of standard-normal draws.
data = DataFrame(np.random.randn(1000, 4))
# Summary statistics (mean, max, quartiles, ...).
data.describe()

# Values in column 3 whose absolute value exceeds 3.
col = data[3]
col[np.abs(col) > 3]

# Rows containing at least one value whose absolute value exceeds 3.
# The axis is passed by keyword: pandas 2.0 made the arguments of any()
# keyword-only, so the positional form .any(1) raises TypeError there.
data[(np.abs(data) > 3).any(axis=1)]

# Clamp every value to the range [-3, 3].
data[np.abs(data) > 3] = np.sign(data) * 3

# Permutation and random sampling.
df = DataFrame(np.arange(5 * 4).reshape(5, 4))
sampler = np.random.permutation(5)
# Reorder the rows of df to follow the order in sampler.
df.take(sampler)
# First three rows of the reordered frame.
df.take(sampler)[:3]
# Ten random integers drawn from 0..4 (upper bound 5 is exclusive).
sampler0 = np.random.randint(0, 5, size=10)

# Dummy variables.
df = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'], 'data1': range(6)})
# Indicator matrix with one column per distinct key.
pd.get_dummies(df['key'])
# Same, with a prefix added to every column name.
dummies = pd.get_dummies(df['key'], prefix='key')
# Join the data1 column with the indicator columns.
df[['data1']].join(dummies)

# String manipulation
# Randomly reorder (permute) the rows of a DataFrame, then sample from an
# array WITH replacement. Bare expressions are notebook-style echoes of the
# value on that line.
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# 4x4 frame holding 0..15.
dframe = DataFrame(np.arange(16).reshape((4, 4)))

# A random permutation, i.e. a reordering, of the positions 0..3.
blender = np.random.permutation(4)
blender

# Rows of dframe in the positional order given by blender.
dframe.iloc[blender]

# Sampling WITH replacement: start from a box of three marbles.
box = np.array([1, 2, 3])
box

# randint(low, high, size) returns `size` integers from [low, high); here
# they serve as positions into box, and positions may repeat.
shaker = np.random.randint(0, len(box), size=5)
shaker

# Grab from the box: the marbles at the drawn positions.
hand_picks = box[shaker]
hand_picks
# Binning years into decades, looking for outliers in random data, and
# shuffling DataFrame rows. Bare expressions are notebook-style echoes.
# `np`, `pd` and `DataFrame` are imported elsewhere in the file.

years = [1990, 1991, 1992, 2008, 2012, 2015, 1987, 1969, 2013, 2008, 1999]

# Group the years into decades.
decade_bins = [1960, 1970, 1980, 1990, 2000, 2010, 2020]
decade_cat = pd.cut(years, decade_bins)
decade_cat.shape
decade_cat.categories
# Number of years per decade. Series.value_counts is used because the
# top-level pd.value_counts is deprecated since pandas 2.1.
pd.Series(decade_cat).value_counts()

# Fixed seed so the random draws below are reproducible.
np.random.seed(12345)
dframe = DataFrame(np.random.randn(1000, 4))
dframe.head()
dframe.tail()
dframe.describe()

# Values in column 0 whose absolute value exceeds 3.
col = dframe[0]
col.head()
col[np.abs(col) > 3]
np.abs(-3.33)

# Rows containing at least one value whose absolute value exceeds 3.
# The axis is passed by keyword: pandas 2.0 made the arguments of any()
# keyword-only, so the positional form .any(1) raises TypeError there.
dframe[(np.abs(dframe) > 3).any(axis=1)]
np.sign(dframe)

# Shuffle the rows of a small 4x4 frame with a random permutation.
dframe = DataFrame(np.arange(4 * 4).reshape((4, 4)))
blender = np.random.permutation(4)
blender
dframe
dframe.take(blender)
# Binning with cut/qcut, permutation and random sampling, dummy variables,
# and basic string manipulation. Bare expressions are notebook-style echoes.
# `cats`, `np`, `pd` and `DataFrame` come from earlier in the file, outside
# this fragment.

cats.categories
# Series.value_counts is used because the top-level pd.value_counts is
# deprecated since pandas 2.1.
pd.Series(cats).value_counts()

data = np.random.rand(20)
pd.cut(data, 4, precision=2)  # an integer here is the number of bins

data = np.random.randn(1000)  # normally distributed
cats = pd.qcut(data, 4)  # cut into quartiles
cats
pd.Series(cats).value_counts()
# Explicit quantile edges instead of a bin count.
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

# Permutation and random sampling
df = DataFrame(np.arange(5 * 4).reshape((5, 4)))
# Rows of df in a random order.
df.take(np.random.permutation(5))
# Three rows picked via the first three entries of a fresh permutation.
df.take(np.random.permutation(len(df))[:3])

# Computing indicator / dummy variables
df = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'], 'data1': range(6)})
pd.get_dummies(df['key'])

# String manipulation ========================================================
val = 'a,b, guido'
val.split(',')
# Split on commas and trim the whitespace around each piece.
pieces = [x.strip() for x in val.split(',')]
pieces
'::'.join(pieces)
'guido' in val
val.index(',')