Example #1
    def test_column_dups2(self):

        # drop buggy GH 6240
        df = DataFrame({'A': np.random.randn(5),
                        'B': np.random.randn(5),
                        'C': np.random.randn(5),
                        'D': ['a', 'b', 'c', 'd', 'e']})

        expected = df.take([0, 1, 1], axis=1)
        df2 = df.take([2, 0, 1, 2, 1], axis=1)
        result = df2.drop('C', axis=1)
        assert_frame_equal(result, expected)

        # dropna
        df = DataFrame({'A': np.random.randn(5),
                        'B': np.random.randn(5),
                        'C': np.random.randn(5),
                        'D': ['a', 'b', 'c', 'd', 'e']})
        df.iloc[2, [0, 1, 2]] = np.nan
        df.iloc[0, 0] = np.nan
        df.iloc[1, 1] = np.nan
        df.iloc[:, 3] = np.nan
        expected = df.dropna(subset=['A', 'B', 'C'], how='all')
        expected.columns = ['A', 'A', 'B', 'C']

        df.columns = ['A', 'A', 'B', 'C']

        result = df.dropna(subset=['A', 'C'], how='all')
        assert_frame_equal(result, expected)
Example #2
    def test_sort_index_multicolumn(self):
        import random
        A = np.arange(5).repeat(20)
        B = np.tile(np.arange(5), 20)
        random.shuffle(A)
        random.shuffle(B)
        frame = DataFrame({'A': A, 'B': B,
                           'C': np.random.randn(100)})

        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            frame.sort_index(by=['A', 'B'])
        result = frame.sort_values(by=['A', 'B'])
        indexer = np.lexsort((frame['B'], frame['A']))
        expected = frame.take(indexer)
        assert_frame_equal(result, expected)

        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            frame.sort_index(by=['A', 'B'], ascending=False)
        result = frame.sort_values(by=['A', 'B'], ascending=False)
        indexer = np.lexsort((frame['B'].rank(ascending=False),
                              frame['A'].rank(ascending=False)))
        expected = frame.take(indexer)
        assert_frame_equal(result, expected)

        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            frame.sort_index(by=['B', 'A'])
        result = frame.sort_values(by=['B', 'A'])
        indexer = np.lexsort((frame['A'], frame['B']))
        expected = frame.take(indexer)
        assert_frame_equal(result, expected)
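
The np.lexsort-plus-take pattern used above to build the expected frame is worth isolating: np.lexsort sorts by the last key passed first, so (frame['B'], frame['A']) orders primarily by A, then B. A minimal self-contained sketch (not part of the test suite):

import numpy as np
import pandas as pd

frame = pd.DataFrame({'A': [2, 1, 2, 1], 'B': [0, 1, 1, 0]})
# np.lexsort sorts by the LAST key first, so this orders by A, then B
indexer = np.lexsort((frame['B'], frame['A']))
assert frame.take(indexer).equals(frame.sort_values(by=['A', 'B']))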
Example #3
def sample01():
    df = DataFrame(np.arange(5 * 4).reshape(5, 4))
    sampler = np.random.permutation(5)
    print(sampler)
    print(df)
    print(df.take(sampler))
    print(df.take(np.random.permutation(len(df))[:3]))
Example #4
def slide_16():
    df = DataFrame(np.arange(5 * 4).reshape(5, 4))
    sampler = np.random.permutation(5)
    print(sampler)
    print(df)
    print(df.take(sampler))

    print(df.take(np.random.permutation(len(df))[:3]))

    bag = np.array([5, 7, -1, 6, 4])
    sampler = np.random.randint(0, len(bag), size=10)
    print(sampler)
    draws = bag.take(sampler)
    print(draws)
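
As a side note (not part of the original snippet), modern pandas can express the same sampling directly via DataFrame.sample instead of take plus a random indexer:

import numpy as np
import pandas as pd

df = pd.DataFrame(np.arange(5 * 4).reshape(5, 4))
print(df.sample(n=3))                 # sample rows without replacement
print(df.sample(n=10, replace=True))  # sample rows with replacement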
Example #5
def test_apply_chunk_view():
    # Low level tinkering could be unsafe, make sure not
    df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)})

    result = df.groupby("key", group_keys=False).apply(lambda x: x[:2])
    expected = df.take([0, 1, 3, 4, 6, 7])
    tm.assert_frame_equal(result, expected)
Example #6
    def test_sort_index_different_sortorder(self):
        A = np.arange(20).repeat(5)
        B = np.tile(np.arange(5), 20)

        indexer = np.random.permutation(100)
        A = A.take(indexer)
        B = B.take(indexer)

        df = DataFrame({'A': A, 'B': B,
                        'C': np.random.randn(100)})

        # use .sort_values #9816
        with tm.assert_produces_warning(FutureWarning):
            df.sort_index(by=['A', 'B'], ascending=[1, 0])
        result = df.sort_values(by=['A', 'B'], ascending=[1, 0])

        ex_indexer = np.lexsort((df.B.max() - df.B, df.A))
        expected = df.take(ex_indexer)
        assert_frame_equal(result, expected)

        # test with multiindex, too
        idf = df.set_index(['A', 'B'])

        result = idf.sort_index(ascending=[1, 0])
        expected = idf.take(ex_indexer)
        assert_frame_equal(result, expected)

        # also, Series!
        result = idf['C'].sort_index(ascending=[1, 0])
        assert_series_equal(result, expected['C'])
Example #8
def get_negatives(interactome: pd.DataFrame,
                  positives: set,
                  pname,
                  num: int = 0,
                  bivalent=False) -> set:
    """
    :interactome dataframe of interaction data
    :num         number of negatives to randomly generate
    :positives   set of positives
    :returns     k probable negatives
    """
    if num == 0:
        num = len(positives) * 50
    edges = make_edges(interactome.take([0, 1], axis=1))
    edges = {tuple(e) for e in edges}
    #nodes = set()
    #for e in edges:
    #        nodes.update(set(e))
    nodes = set([e[0] for e in edges]).union(set([e[1] for e in edges]))
    print('%d total nodes (%.3f are positives)' %
          (len(nodes), len(positives) / len(nodes)))
    nodes = nodes - positives
    print('%d total' % (len(nodes)))
    if bivalent:
        return nodes

    samp = set(random.sample(list(nodes), k=num))
    psamp = {(a, pname) for a in samp}
    return psamp
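
A hypothetical usage sketch for get_negatives above; make_edges is the author's helper, stubbed here under the assumption that the interactome's first two columns are (source, target) pairs:

import random
import pandas as pd

# assumed stand-in for the author's make_edges helper
def make_edges(df):
    return [tuple(row) for row in df.itertuples(index=False)]

interactome = pd.DataFrame({'src': list('abcd'), 'dst': list('bcde')})
positives = {'a'}
negatives = get_negatives(interactome, positives, 'p1', num=2)
print(negatives)  # e.g. {('c', 'p1'), ('e', 'p1')}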
Example #9
    def test_loc_getitem_frame(self):

        df = DataFrame({"A": range(10)})
        s = pd.cut(df.A, 5)
        df["B"] = s
        df = df.set_index("B")

        result = df.loc[4]
        expected = df.iloc[4:6]
        tm.assert_frame_equal(result, expected)

        with pytest.raises(KeyError, match="10"):
            df.loc[10]

        # single list-like
        result = df.loc[[4]]
        expected = df.iloc[4:6]
        tm.assert_frame_equal(result, expected)

        # non-unique
        result = df.loc[[4, 5]]
        expected = df.take([4, 5, 4, 5])
        tm.assert_frame_equal(result, expected)

        with pytest.raises(KeyError, match="^$"):
            df.loc[[10]]

        # partial missing
        with pytest.raises(KeyError, match="^$"):
            df.loc[[10, 4]]
Example #10
    def test_loc_getitem_frame(self):

        df = DataFrame({'A': range(10)})
        s = pd.cut(df.A, 5)
        df['B'] = s
        df = df.set_index('B')

        result = df.loc[4]
        expected = df.iloc[4:6]
        tm.assert_frame_equal(result, expected)

        with pytest.raises(KeyError):
            df.loc[10]

        # single list-like
        result = df.loc[[4]]
        expected = df.iloc[4:6]
        tm.assert_frame_equal(result, expected)

        # non-unique
        result = df.loc[[4, 5]]
        expected = df.take([4, 5, 4, 5])
        tm.assert_frame_equal(result, expected)

        with pytest.raises(KeyError):
            df.loc[[10]]

        # partial missing
        with pytest.raises(KeyError):
            df.loc[[10, 4]]
Example #11
def test_apply_chunk_view():
    # Low level tinkering could be unsafe, make sure not
    df = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3],
                    'value': compat.lrange(9)})

    result = df.groupby('key', group_keys=False).apply(lambda x: x[:2])
    expected = df.take([0, 1, 3, 4, 6, 7])
    tm.assert_frame_equal(result, expected)
Example #12
def test_apply_chunk_view():
    # Low level tinkering could be unsafe, make sure not
    df = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3],
                    'value': compat.lrange(9)})

    # return view
    f = lambda x: x[:2]

    result = df.groupby('key', group_keys=False).apply(f)
    expected = df.take([0, 1, 3, 4, 6, 7])
    tm.assert_frame_equal(result, expected)
Example #13
def test_apply_chunk_view(group_keys):
    # Low level tinkering could be unsafe, make sure not
    df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)})

    result = df.groupby("key",
                        group_keys=group_keys).apply(lambda x: x.iloc[:2])
    expected = df.take([0, 1, 3, 4, 6, 7])
    if group_keys:
        expected.index = MultiIndex.from_arrays(
            [[1, 1, 2, 2, 3, 3], expected.index], names=["key", None])

    tm.assert_frame_equal(result, expected)
Example #14
    def test_sort_values_multicolumn(self):
        A = np.arange(5).repeat(20)
        B = np.tile(np.arange(5), 20)
        random.shuffle(A)
        random.shuffle(B)
        frame = DataFrame({"A": A, "B": B, "C": np.random.randn(100)})

        result = frame.sort_values(by=["A", "B"])
        indexer = np.lexsort((frame["B"], frame["A"]))
        expected = frame.take(indexer)
        tm.assert_frame_equal(result, expected)

        result = frame.sort_values(by=["A", "B"], ascending=False)
        indexer = np.lexsort((frame["B"].rank(ascending=False),
                              frame["A"].rank(ascending=False)))
        expected = frame.take(indexer)
        tm.assert_frame_equal(result, expected)

        result = frame.sort_values(by=["B", "A"])
        indexer = np.lexsort((frame["A"], frame["B"]))
        expected = frame.take(indexer)
        tm.assert_frame_equal(result, expected)
Example #15
    def test_column_dups2(self):

        # drop buggy GH 6240
        df = DataFrame(
            {
                "A": np.random.randn(5),
                "B": np.random.randn(5),
                "C": np.random.randn(5),
                "D": ["a", "b", "c", "d", "e"],
            }
        )

        expected = df.take([0, 1, 1], axis=1)
        df2 = df.take([2, 0, 1, 2, 1], axis=1)
        result = df2.drop("C", axis=1)
        tm.assert_frame_equal(result, expected)

        # dropna
        df = DataFrame(
            {
                "A": np.random.randn(5),
                "B": np.random.randn(5),
                "C": np.random.randn(5),
                "D": ["a", "b", "c", "d", "e"],
            }
        )
        df.iloc[2, [0, 1, 2]] = np.nan
        df.iloc[0, 0] = np.nan
        df.iloc[1, 1] = np.nan
        df.iloc[:, 3] = np.nan
        expected = df.dropna(subset=["A", "B", "C"], how="all")
        expected.columns = ["A", "A", "B", "C"]

        df.columns = ["A", "A", "B", "C"]

        result = df.dropna(subset=["A", "C"], how="all")
        tm.assert_frame_equal(result, expected)
Example #16
    def test_sort_values_stable_multicolumn_sort(self, expected_idx_non_na,
                                                 ascending, na_position):
        # GH#38426 Clarify sort_values with mult. columns / labels is stable
        df = DataFrame({
            "A": [1, 2, np.nan, 1, 1, 1, 6, 8, 4, 8, 8, np.nan, np.nan, 8, 8],
            "B": [9, np.nan, 5, 2, 2, 2, 5, 4, 5, 3, 4, np.nan, np.nan, 4, 4],
        })
        # All rows with NaN in col "B" only have unique values in "A", therefore,
        # only the rows with NaNs in "A" have to be treated individually:
        expected_idx = ([11, 12, 2] + expected_idx_non_na if na_position
                        == "first" else expected_idx_non_na + [2, 11, 12])
        expected = df.take(expected_idx)
        sorted_df = df.sort_values(["A", "B"],
                                   ascending=ascending,
                                   na_position=na_position)
        tm.assert_frame_equal(sorted_df, expected)
Example #17
    def test_groupby_datetime_categorical(self):
        # GH9049: ensure backward compatibility
        levels = pd.date_range('2014-01-01', periods=4)
        codes = np.random.randint(0, 4, size=100)

        cats = Categorical.from_codes(codes, levels, ordered=True)

        data = DataFrame(np.random.randn(100, 4))
        result = data.groupby(cats).mean()

        expected = data.groupby(np.asarray(cats)).mean()
        expected = expected.reindex(levels)
        expected.index = CategoricalIndex(expected.index,
                                          categories=expected.index,
                                          ordered=True)

        assert_frame_equal(result, expected)

        grouped = data.groupby(cats)
        desc_result = grouped.describe()

        idx = cats.codes.argsort()
        ord_labels = cats.take_nd(idx)
        ord_data = data.take(idx)
        expected = ord_data.groupby(ord_labels).describe()
        assert_frame_equal(desc_result, expected)
        tm.assert_index_equal(desc_result.index, expected.index)
        tm.assert_index_equal(
            desc_result.index.get_level_values(0),
            expected.index.get_level_values(0))

        # GH 10460
        expc = Categorical.from_codes(
            np.arange(4).repeat(8), levels, ordered=True)
        exp = CategoricalIndex(expc)
        self.assert_index_equal((desc_result.stack()
                                            .index
                                            .get_level_values(0)), exp)
        exp = Index(['count', 'mean', 'std', 'min', '25%', '50%',
                     '75%', 'max'] * 4)
        self.assert_index_equal((desc_result.stack()
                                            .index
                                            .get_level_values(1)), exp)
Example #18
    def test_groupby_categorical(self):
        levels = ['foo', 'bar', 'baz', 'qux']
        codes = np.random.randint(0, 4, size=100)

        cats = Categorical.from_codes(codes, levels, ordered=True)

        data = DataFrame(np.random.randn(100, 4))

        result = data.groupby(cats).mean()

        expected = data.groupby(np.asarray(cats)).mean()
        exp_idx = CategoricalIndex(levels,
                                   categories=cats.categories,
                                   ordered=True)
        expected = expected.reindex(exp_idx)

        assert_frame_equal(result, expected)

        grouped = data.groupby(cats)
        desc_result = grouped.describe()

        idx = cats.codes.argsort()
        ord_labels = np.asarray(cats).take(idx)
        ord_data = data.take(idx)

        exp_cats = Categorical(ord_labels,
                               ordered=True,
                               categories=['foo', 'bar', 'baz', 'qux'])
        expected = ord_data.groupby(exp_cats, sort=False).describe()
        assert_frame_equal(desc_result, expected)

        # GH 10460
        expc = Categorical.from_codes(np.arange(4).repeat(8),
                                      levels,
                                      ordered=True)
        exp = CategoricalIndex(expc)
        tm.assert_index_equal((desc_result.stack().index.get_level_values(0)),
                              exp)
        exp = Index(
            ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'] * 4)
        tm.assert_index_equal((desc_result.stack().index.get_level_values(1)),
                              exp)
Example #19
    def test_groupby_datetime_categorical(self):
        # GH9049: ensure backward compatibility
        levels = pd.date_range('2014-01-01', periods=4)
        codes = np.random.randint(0, 4, size=100)

        cats = Categorical.from_codes(codes, levels, ordered=True)

        data = DataFrame(np.random.randn(100, 4))
        result = data.groupby(cats).mean()

        expected = data.groupby(np.asarray(cats)).mean()
        expected = expected.reindex(levels)
        expected.index = CategoricalIndex(expected.index,
                                          categories=expected.index,
                                          ordered=True)

        assert_frame_equal(result, expected)

        grouped = data.groupby(cats)
        desc_result = grouped.describe()

        idx = cats.codes.argsort()
        ord_labels = cats.take_nd(idx)
        ord_data = data.take(idx)
        expected = ord_data.groupby(ord_labels).describe()
        assert_frame_equal(desc_result, expected)
        tm.assert_index_equal(desc_result.index, expected.index)
        tm.assert_index_equal(desc_result.index.get_level_values(0),
                              expected.index.get_level_values(0))

        # GH 10460
        expc = Categorical.from_codes(np.arange(4).repeat(8),
                                      levels,
                                      ordered=True)
        exp = CategoricalIndex(expc)
        tm.assert_index_equal((desc_result.stack().index.get_level_values(0)),
                              exp)
        exp = Index(
            ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'] * 4)
        tm.assert_index_equal((desc_result.stack().index.get_level_values(1)),
                              exp)
Example #20
    def test_groupby_categorical(self):
        levels = ['foo', 'bar', 'baz', 'qux']
        codes = np.random.randint(0, 4, size=100)

        cats = Categorical.from_codes(codes, levels, ordered=True)

        data = DataFrame(np.random.randn(100, 4))

        result = data.groupby(cats).mean()

        expected = data.groupby(np.asarray(cats)).mean()
        exp_idx = CategoricalIndex(levels, categories=cats.categories,
                                   ordered=True)
        expected = expected.reindex(exp_idx)

        assert_frame_equal(result, expected)

        grouped = data.groupby(cats)
        desc_result = grouped.describe()

        idx = cats.codes.argsort()
        ord_labels = np.asarray(cats).take(idx)
        ord_data = data.take(idx)

        exp_cats = Categorical(ord_labels, ordered=True,
                               categories=['foo', 'bar', 'baz', 'qux'])
        expected = ord_data.groupby(exp_cats, sort=False).describe()
        assert_frame_equal(desc_result, expected)

        # GH 10460
        expc = Categorical.from_codes(np.arange(4).repeat(8),
                                      levels, ordered=True)
        exp = CategoricalIndex(expc)
        self.assert_index_equal((desc_result.stack()
                                            .index
                                            .get_level_values(0)), exp)
        exp = Index(['count', 'mean', 'std', 'min', '25%', '50%',
                     '75%', 'max'] * 4)
        self.assert_index_equal((desc_result.stack()
                                            .index
                                            .get_level_values(1)), exp)
Example #21
def get_negatives(interactome: pd.DataFrame,
                  positives: set,
                  pname,
                  num: int = 0,
                  bivalent=False) -> set:
    """
    :interactome dataframe of interaction data
    :num         number of negatives to randomly generate
    :positives   set of positives
    :returns     k probable negatives
    """
    if num == 0:
        num = len(positives) * 50
    edges = make_edges(interactome.take([0, 1], axis=1))
    edges = edges - positives
    if bivalent:
        return edges
    samp = set(random.sample(list(edges), k=num))
    psamp = {frozenset((a, b, pname)) for (a, b) in samp}
    ## Nnode negs should be determined here too. - AR
    return psamp
Example #22
def normalize_synop_data(
        all_synop_data: pd.DataFrame,
        synop_data_indices: list[int],
        features,
        length_of_sequence,
        normalization_type: NormalizationType = NormalizationType.STANDARD):
    # Bear in mind that synop_data_indices holds the indices of the FIRST synop
    # observation in each sequence, so not every synop row appears in it.
    all_indices = set([
        item for sublist in
        [[index + frame for frame in range(0, length_of_sequence)]
         for index in synop_data_indices] for item in sublist
    ])
    all_relevant_labels = all_synop_data.take(list(all_indices))
    _, mean_or_min, std_or_max = normalize(
        all_relevant_labels[features].values, normalization_type)
    if normalization_type == NormalizationType.STANDARD:
        all_synop_data[features] = (all_synop_data[features].values -
                                    mean_or_min) / std_or_max
        return all_synop_data, mean_or_min, std_or_max
    else:
        all_synop_data[features] = (all_synop_data[features].values -
                                    mean_or_min) / (std_or_max - mean_or_min)
        return all_synop_data, mean_or_min, std_or_max
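
A small illustration of the index expansion performed above: each start index is expanded into a full window of length_of_sequence rows before the relevant rows are gathered with take. Assuming starts [0, 3] and a window of 3:

synop_data_indices = [0, 3]
length_of_sequence = 3
all_indices = {index + frame
               for index in synop_data_indices
               for frame in range(length_of_sequence)}
print(sorted(all_indices))  # [0, 1, 2, 3, 4, 5]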
Example #23
    def test_sort_index_different_sortorder(self):
        A = np.arange(20).repeat(5)
        B = np.tile(np.arange(5), 20)

        indexer = np.random.permutation(100)
        A = A.take(indexer)
        B = B.take(indexer)

        df = DataFrame({"A": A, "B": B, "C": np.random.randn(100)})

        ex_indexer = np.lexsort((df.B.max() - df.B, df.A))
        expected = df.take(ex_indexer)

        # test with multiindex, too
        idf = df.set_index(["A", "B"])

        result = idf.sort_index(ascending=[1, 0])
        expected = idf.take(ex_indexer)
        tm.assert_frame_equal(result, expected)

        # also, Series!
        result = idf["C"].sort_index(ascending=[1, 0])
        tm.assert_series_equal(result, expected["C"])
Example #24
def test_datetime():
    # GH9049: ensure backward compatibility
    levels = pd.date_range("2014-01-01", periods=4)
    codes = np.random.randint(0, 4, size=100)

    cats = Categorical.from_codes(codes, levels, ordered=True)

    data = DataFrame(np.random.randn(100, 4))
    result = data.groupby(cats, observed=False).mean()

    expected = data.groupby(np.asarray(cats), observed=False).mean()
    expected = expected.reindex(levels)
    expected.index = CategoricalIndex(expected.index,
                                      categories=expected.index,
                                      ordered=True)

    tm.assert_frame_equal(result, expected)

    grouped = data.groupby(cats, observed=False)
    desc_result = grouped.describe()

    idx = cats.codes.argsort()
    ord_labels = cats.take(idx)
    ord_data = data.take(idx)
    expected = ord_data.groupby(ord_labels, observed=False).describe()
    tm.assert_frame_equal(desc_result, expected)
    tm.assert_index_equal(desc_result.index, expected.index)
    tm.assert_index_equal(desc_result.index.get_level_values(0),
                          expected.index.get_level_values(0))

    # GH 10460
    expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True)
    exp = CategoricalIndex(expc)
    tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp)
    exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] *
                4)
    tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp)
Example #25
col=df[0]
col.head()
col[np.abs(col)>3]

df[(np.abs(df)>3).any(1)]

df[np.abs(df)>3] = np.sign(df)*3 # cap outliers in the data at 3
df.describe()

##### Permutations
df = DataFrame(np.arange(16).reshape(4,4))
df
blender = np.random.permutation(4) #without replacement
blender
df.take(blender)  # permute the rows

box = np.array([1,2,3]) #with replacement 
shaker=np.random.randint(0,len(box),size=10)
shaker
box.take(shaker)


##### Groupby on DFs
df = DataFrame({'k1':['X','X','Y','Y','Z'],
                    'k2':['alpha','beta','alpha','beta','alpha'],
                    'dataset1':np.random.randn(5),
                    'dataset2':np.random.randn(5)})
df
group1 = df['dataset1'].groupby(df['k1'])
group1
Example #26
def test_basic():

    cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
                       categories=["a", "b", "c", "d"],
                       ordered=True)
    data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})

    exp_index = CategoricalIndex(list('abcd'), name='b', ordered=True)
    expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index)
    result = data.groupby("b", observed=False).mean()
    tm.assert_frame_equal(result, expected)

    cat1 = Categorical(["a", "a", "b", "b"],
                       categories=["a", "b", "z"],
                       ordered=True)
    cat2 = Categorical(["c", "d", "c", "d"],
                       categories=["c", "d", "y"],
                       ordered=True)
    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})

    # single grouper
    gb = df.groupby("A", observed=False)
    exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True)
    expected = DataFrame({'values': Series([3, 7, 0], index=exp_idx)})
    result = gb.sum()
    tm.assert_frame_equal(result, expected)

    # GH 8623
    x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'], [1, 'John P. Doe']],
                  columns=['person_id', 'person_name'])
    x['person_name'] = Categorical(x.person_name)

    g = x.groupby(['person_id'], observed=False)
    result = g.transform(lambda x: x)
    tm.assert_frame_equal(result, x[['person_name']])

    result = x.drop_duplicates('person_name')
    expected = x.iloc[[0, 1]]
    tm.assert_frame_equal(result, expected)

    def f(x):
        return x.drop_duplicates('person_name').iloc[0]

    result = g.apply(f)
    expected = x.iloc[[0, 1]].copy()
    expected.index = Index([1, 2], name='person_id')
    expected['person_name'] = expected['person_name'].astype('object')
    tm.assert_frame_equal(result, expected)

    # GH 9921
    # Monotonic
    df = DataFrame({"a": [5, 15, 25]})
    c = pd.cut(df.a, bins=[0, 10, 20, 30, 40])

    result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df['a'])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df['a'])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(sum), df[['a']])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(lambda xs: np.max(xs)),
        df[['a']])

    # Filter
    tm.assert_series_equal(
        df.a.groupby(c, observed=False).filter(np.all), df['a'])
    tm.assert_frame_equal(df.groupby(c, observed=False).filter(np.all), df)

    # Non-monotonic
    df = DataFrame({"a": [5, 15, 25, -5]})
    c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40])

    result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df['a'])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df['a'])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(sum), df[['a']])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df[['a']])

    # GH 9603
    df = DataFrame({'a': [1, 0, 0, 0]})
    c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list('abcd')))
    result = df.groupby(c, observed=False).apply(len)

    exp_index = CategoricalIndex(c.values.categories, ordered=c.values.ordered)
    expected = Series([1, 0, 0, 0], index=exp_index)
    expected.index.name = 'a'
    tm.assert_series_equal(result, expected)

    # more basic
    levels = ['foo', 'bar', 'baz', 'qux']
    codes = np.random.randint(0, 4, size=100)

    cats = Categorical.from_codes(codes, levels, ordered=True)

    data = DataFrame(np.random.randn(100, 4))

    result = data.groupby(cats, observed=False).mean()

    expected = data.groupby(np.asarray(cats), observed=False).mean()
    exp_idx = CategoricalIndex(levels,
                               categories=cats.categories,
                               ordered=True)
    expected = expected.reindex(exp_idx)

    assert_frame_equal(result, expected)

    grouped = data.groupby(cats, observed=False)
    desc_result = grouped.describe()

    idx = cats.codes.argsort()
    ord_labels = np.asarray(cats).take(idx)
    ord_data = data.take(idx)

    exp_cats = Categorical(ord_labels,
                           ordered=True,
                           categories=['foo', 'bar', 'baz', 'qux'])
    expected = ord_data.groupby(exp_cats, sort=False,
                                observed=False).describe()
    assert_frame_equal(desc_result, expected)

    # GH 10460
    expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True)
    exp = CategoricalIndex(expc)
    tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp)
    exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'] *
                4)
    tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp)
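
For context (a sketch, not part of the test): the observed=False flag used throughout keeps unobserved categories as empty groups in the result, while observed=True drops them:

import pandas as pd

df = pd.DataFrame({'x': [1, 2]})
c = pd.Categorical(['a', 'a'], categories=['a', 'b'])
print(df.groupby(c, observed=False).sum())  # index ['a', 'b'], with 'b' summing to 0
print(df.groupby(c, observed=True).sum())   # index ['a'] only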
Example #27
    def test_iloc_getitem_with_duplicates2(self):
        # GH#2259
        df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=[1, 1, 2])
        result = df.iloc[:, [0]]
        expected = df.take([0], axis=1)
        tm.assert_frame_equal(result, expected)
Example #28
def test_basic():

    cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
                       categories=["a", "b", "c", "d"], ordered=True)
    data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})

    exp_index = CategoricalIndex(list('abcd'), name='b', ordered=True)
    expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index)
    result = data.groupby("b", observed=False).mean()
    tm.assert_frame_equal(result, expected)

    cat1 = Categorical(["a", "a", "b", "b"],
                       categories=["a", "b", "z"], ordered=True)
    cat2 = Categorical(["c", "d", "c", "d"],
                       categories=["c", "d", "y"], ordered=True)
    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})

    # single grouper
    gb = df.groupby("A", observed=False)
    exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True)
    expected = DataFrame({'values': Series([3, 7, 0], index=exp_idx)})
    result = gb.sum()
    tm.assert_frame_equal(result, expected)

    # GH 8623
    x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'],
                   [1, 'John P. Doe']],
                  columns=['person_id', 'person_name'])
    x['person_name'] = Categorical(x.person_name)

    g = x.groupby(['person_id'], observed=False)
    result = g.transform(lambda x: x)
    tm.assert_frame_equal(result, x[['person_name']])

    result = x.drop_duplicates('person_name')
    expected = x.iloc[[0, 1]]
    tm.assert_frame_equal(result, expected)

    def f(x):
        return x.drop_duplicates('person_name').iloc[0]

    result = g.apply(f)
    expected = x.iloc[[0, 1]].copy()
    expected.index = Index([1, 2], name='person_id')
    expected['person_name'] = expected['person_name'].astype('object')
    tm.assert_frame_equal(result, expected)

    # GH 9921
    # Monotonic
    df = DataFrame({"a": [5, 15, 25]})
    c = pd.cut(df.a, bins=[0, 10, 20, 30, 40])

    result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df['a'])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df['a'])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(sum),
        df[['a']])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(lambda xs: np.max(xs)),
        df[['a']])

    # Filter
    tm.assert_series_equal(
        df.a.groupby(c, observed=False).filter(np.all),
        df['a'])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).filter(np.all),
        df)

    # Non-monotonic
    df = DataFrame({"a": [5, 15, 25, -5]})
    c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40])

    result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df['a'])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df['a'])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(sum),
        df[['a']])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df[['a']])

    # GH 9603
    df = DataFrame({'a': [1, 0, 0, 0]})
    c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list('abcd')))
    result = df.groupby(c, observed=False).apply(len)

    exp_index = CategoricalIndex(
        c.values.categories, ordered=c.values.ordered)
    expected = Series([1, 0, 0, 0], index=exp_index)
    expected.index.name = 'a'
    tm.assert_series_equal(result, expected)

    # more basic
    levels = ['foo', 'bar', 'baz', 'qux']
    codes = np.random.randint(0, 4, size=100)

    cats = Categorical.from_codes(codes, levels, ordered=True)

    data = DataFrame(np.random.randn(100, 4))

    result = data.groupby(cats, observed=False).mean()

    expected = data.groupby(np.asarray(cats), observed=False).mean()
    exp_idx = CategoricalIndex(levels, categories=cats.categories,
                               ordered=True)
    expected = expected.reindex(exp_idx)

    assert_frame_equal(result, expected)

    grouped = data.groupby(cats, observed=False)
    desc_result = grouped.describe()

    idx = cats.codes.argsort()
    ord_labels = np.asarray(cats).take(idx)
    ord_data = data.take(idx)

    exp_cats = Categorical(ord_labels, ordered=True,
                           categories=['foo', 'bar', 'baz', 'qux'])
    expected = ord_data.groupby(
        exp_cats, sort=False, observed=False).describe()
    assert_frame_equal(desc_result, expected)

    # GH 10460
    expc = Categorical.from_codes(np.arange(4).repeat(8),
                                  levels, ordered=True)
    exp = CategoricalIndex(expc)
    tm.assert_index_equal((desc_result.stack().index
                           .get_level_values(0)), exp)
    exp = Index(['count', 'mean', 'std', 'min', '25%', '50%',
                 '75%', 'max'] * 4)
    tm.assert_index_equal((desc_result.stack().index
                           .get_level_values(1)), exp)
Example #29
df3[df7]

# In[6]:

df3

# In[8]:

per = np.random.permutation(6)
per

# In[12]:

df3.take(per)

# In[13]:

df4 = DataFrame({
    'item': ['Apple', 'Banana', 'Orange', 'Banana', 'Orange', 'Apple'],
    'price': [4, 3, 3, 2.5, 4, 2],
    'color': ['red', 'yellow', 'orange', 'yellow', 'green', 'green']
})
df4

# In[17]:

df4.groupby('item')
Example #30
print(cats)

agg = pd.value_counts(cats)
print(agg)

# detecting and filtering outliers
np.random.seed(12345)
data = DataFrame(np.random.randn(100, 4))

dat = data[(np.abs(data) > 3).any(axis=1)]
print(dat)

# permutation and Random sampling
df = DataFrame(np.arange(20).reshape(5, 4))
sampler = np.random.permutation(5)
df = df.take(sampler)

print(df)

# Computing Indicator
df = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'], 'data1': range(6)})
indicator = pd.get_dummies(df['key']).join(df['data1'])
print(indicator)

# string object methods
string = 'string is good. .....We use it to do lot of things.'
s = string.replace('.', '').split()
print(s)

# Visualization
# Figures
Example #31
data[(np.abs(data) > 3).any(1)]

#2
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()


### Permutations and random sampling
#1
df = DataFrame(np.arange(5 * 4).reshape((5, 4)))
sampler = np.random.permutation(5)
sampler
df

df.take(sampler)

#2
df.take(np.random.permutation(len(df))[:3])

#3
bag = np.array([5, 7, -1, 6, 4])
sampler = np.random.randint(0, len(bag), size=10)
sampler

draws = bag.take(sampler)
draws


### Computing indicators and dummy variables
#1
Example #32
import numpy as np
from pandas import DataFrame
dframe = DataFrame(np.arange(4 * 4).reshape((4, 4)))
blender = np.array([0, 3, 2, 1])
dframe.take(blender)  # reorder the rows according to blender
dframe.take(blender, axis=1)  # reorder the columns
box = np.array(['A', 'B', 'C'])
shaker = np.random.randint(0, len(box), size=10)  # draw 10 random indices in 0..2 (up to the number of elements in box)
print(shaker)
hand_grabs = box.take(shaker)  # use the random indices 0..2 to pick elements (0 picks box[0])
print(hand_grabs)
print(box.take(0))
Example #33
# which values in col are greater than 3
col[np.abs(col) > 3]
# in any column
dframe[(np.abs(dframe) > 3).any(1)]

# anywhere in dframe where abs() > 3, set the value equal to its sign
# multiplied by 3
# used to cap outliers
dframe[np.abs(dframe) > 3] = np.sign(dframe) * 3
dframe.describe()
"""
Permutations
"""
# randomly reorder in a series or dataframe

df = DataFrame(np.arange(16).reshape(4, 4))
# created a random permutation
blender = np.random.permutation(4)

# get a permutation of the rows
df.take(blender)

# box with 3 marbles
box = np.array([1, 2, 3])
# permutation with replacement; size is how many times to pick from the box
shaker = np.random.randint(0, len(box), size=10)

# simulates taking a marble with replacement
hand_grabs = box.take(shaker)
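
As an aside (not from the original snippet), np.random.choice expresses the same draw-with-replacement in a single call:

import numpy as np

box = np.array([1, 2, 3])
draws = np.random.choice(box, size=10, replace=True)  # same idea as box.take(shaker)
print(draws)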
Example #34
# pivot long to wide format
# check help(pd.DataFrame.pivot) and help(pd.DataFrame.pivot_table)

# transform 
# help(pd.Series.map) 

# replace values
data = Series([1.,-999.,2.,-999.,-1000.,3.])
data.replace(-999, np.nan)
data.replace([-999,-1000], np.nan)
data.replace([-999,-1000], [np.nan,0]) # data.replace({-999:np.nan,-1000:0})

# permutation and random sampling
df = DataFrame(np.arange(20).reshape(5,4))
sampler = np.random.permutation(5)
df.take(sampler)
df.take(sampler[:3])
df.take(np.random.permutation(len(df))[:3]) # sample without replacement (slow but works)
sampler = np.random.randint(0, len(df), size=10)
df.take(sampler) # sample with replacement

# get dummy variables for categorical variable
df = DataFrame({'key':['b','b','a','c','a','b'],
                'value':range(6)})
dummies = pd.get_dummies(df['key'], prefix='key')
df_with_dummy = df[['value']].join(dummies) # type(df['value']) and type(df[['value']])

# string manipulation
'''
startswith, endswith, split, strip, count,
index, find, rfind, replace, join,
'''
Example #35
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# Create a permutation of a dataframe
dframe = DataFrame(np.arange(16).reshape(4, 4))
print(dframe)
blender = np.random.permutation(4)  # blender is a random permutation of 0 1 2 3
print(blender)
# From the dataframe, take rows in the order given by blender
print(dframe.take(blender))

# Permutation with replacement
box = np.array([1, 2, 3])
shaker = np.random.randint(
    0, len(box),
    size=10)  # randint picks from 0 to len(box) with replacement 10 times
print(shaker)
hand_grab = box.take(shaker)  # take box values in the order of shaker
Example #36
## Detecting and filtering outliers
np.random.seed(12345)
df15 = DataFrame(np.random.randn(1000, 4))
df15.describe()
col = df15[3]
col[np.abs(col) > 3]  # find values whose absolute value exceeds 3
df15[(np.abs(df15) > 3).any(1)]  # find rows with any value > 3; any(1) => axis=1
df15[np.abs(df15) > 3] = np.sign(df15) * 3  # np.sign returns an array of -1s and 1s
df15.describe()  # now shows max=3 / min=-3

## Permutations and random sampling
df16 = DataFrame(np.arange(5 * 4).reshape((5, 4)))
sampler = np.random.permutation(len(df16.index))
sampler  # produces a randomly permuted array
df16
df16.take(sampler)  # reorder the rows by index
df16.take(sampler[:3])  # a subset can be selected here

## Computing indicators and dummies
df17 = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'], 'data1': range(6)})
pd.get_dummies(df17['key'])
dummies = pd.get_dummies(df17['key'], prefix='key')
df17_dum = df17[['data1']].join(dummies)
df17_dum

### A way to build indicators, but it gets slow on large datasets and needs improvement
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table(
    '/Users/changyueh/Desktop/CodePractice/Data_Analysis/Chapt2/ml-1m/movies.dat',
    sep='::',
    header=None)
Example #37
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

dframe = DataFrame(np.arange(16).reshape(4, 4))

blender = np.random.permutation(4)

print(blender)
print("=" * 50)

print(dframe)
print("=" * 50)

# take the dataframe and use blender to put the rows in that order,
# i.e. do a permutation using blender as the index order
dframe.take(blender)
print(dframe.take(blender))
print("=" * 50)

# permutation with replacement
box = np.array([1, 2, 3])

# can use randint to do a permutation with replacement
shaker = np.random.randint(0, len(box), size=10)
print(shaker)

hand_grabs = box.take(shaker)

print(hand_grabs)
Example #38
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()

# In[142]:

df = DataFrame(np.arange(5 * 4).reshape(5, 4))
sampler = np.random.permutation(5)
sampler

# In[143]:

df

# In[144]:

df.take(sampler)

# In[145]:

df.take(np.random.permutation(len(df))[:3])

# In[146]:

bag = np.array([5, 7, -1, 6, 4])
sampler = np.random.randint(0, len(bag), size=10)
sampler

# In[148]:

draws = bag.take(sampler)
draws
Example #39
data[np.abs(data) > 3] = np.sign(data) * 3
print(data.describe())

fs()
print(np.sign(data).head())

# 7.2.7 Permutation and random sampling
fs()
df = DataFrame(np.arange(5 * 4).reshape((5, 4)))
sampler = np.random.permutation(5)
pprint(sampler)

fs()
print(df)
fs()
print(df.take(sampler))

fs()
print(df.sample(n=3))

fs()
choices = Series([5, 7, -1, 6, 4])
draws = choices.sample(n=10, replace=True)
print(draws)

# 7.2.8 Computing indicators / dummy variables
fs()
df = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'], 'data1': range(6)})
print(df)
print(pd.get_dummies(df['key']))
Example #40
data = DataFrame(np.random.randn(1000,4))
# summarize the data: mean, max, quartiles, etc.
data.describe()
# find values in each column whose absolute value exceeds 3
col = data[3]
col[np.abs(col)>3]
# find all rows containing a value with absolute value over 3
data[(np.abs(data)>3).any(1)]
# cap values to the range -3 to 3
data[np.abs(data)>3] = np.sign(data) * 3

# permutation and random sampling
df = DataFrame(np.arange(5*4).reshape(5,4))
sampler = np.random.permutation(5)
# reorder df's rows according to sampler
df.take(sampler)
# take the first 3 rows
df.take(sampler)[:3]
# get 10 random integers in the range 0..4
sampler0 = np.random.randint(0,5,size=10)

# dummy variables
df = DataFrame({'key':['b','b','a','c','a','b'],'data1':range(6)})
# get the dummy matrix
pd.get_dummies(df['key'])
# add a prefix to each column name
dummies = pd.get_dummies(df['key'],prefix='key')
# join with the data1 column
df[['data1']].join(dummies)

# string operations
Example #41
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# We can randomly reorder (permute) a Series, or the rows in a DataFrame
dframe = DataFrame(np.arange(16).reshape(4,4))
# create a random permutation
blender = np.random.permutation(4)  # permutation means reorder
blender
dframe.take(blender)  # reorder (permute) dframe's rows with the permutation blender
# Now what if we want permutations WITH replacement
# make a box with 3 marbles
box = np.array([1,2,3])
box
# randint takes low and high integers and returns an array of the requested size
# Now let's create a random permutation WITH replacement using randint
shaker = np.random.randint(0, len(box), size=5)  # we will use shaker as indices into box, with take, below
shaker
# Now let's grab from the box
# both examples use the same logic
hand_picks = box.take(shaker)  # take the elements of box at the shaker indices
hand_picks  # these are the marbles
Example #42
years = [1990, 1991, 1992, 2008, 2012, 2015, 1987, 1969, 2013, 2008, 1999]
# let's group these into 10-year bins
decade_bins = [1960, 1970, 1980, 1990, 2000, 2010, 2020]

decade_cat = pd.cut(years, decade_bins)
decade_cat.shape

decade_cat.categories
pd.value_counts(decade_cat)

np.random.seed(12345)
dframe = DataFrame(np.random.randn(1000, 4))
dframe.head()
dframe.tail()

dframe.describe()
col = dframe[0]
col.head()
col[np.abs(col) > 3]
np.abs(-3.33)
dframe[(np.abs(dframe) > 3).any(1)]
np.sign(dframe)

dframe = DataFrame(np.arange(4 * 4).reshape((4, 4)))
blender = np.random.permutation(4)
blender

dframe
dframe.take(blender)
Example #43
cats.categories
pd.value_counts(cats)

data = np.random.rand(20)
pd.cut(data, 4, precision=2)  # pass in the number of bins

data = np.random.randn(1000) # Normally distributed
cats = pd.qcut(data, 4) # Cut into quartiles
cats
pd.value_counts(cats)
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

# Permutation and random sampling
df = DataFrame(np.arange(5 * 4).reshape((5, 4)))

df.take(np.random.permutation(5))
df.take(np.random.permutation(len(df))[:3])

# Computing indicator / dummy variables
df = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                'data1': range(6)})
pd.get_dummies(df['key'])

# String manipulation==========================================================
val = 'a,b,  guido'
val.split(',')
pieces = [x.strip() for x in val.split(',')]
pieces
'::'.join(pieces)
'guido' in val
val.index(',')