Example #1
0
    def test_sort_values(self):
        # Exercises DataFrame.sort_values: single- and multi-column sorts,
        # per-key ascending flags, row-wise (axis=1) sorts, and the errors
        # raised for a bad axis or a mismatched ascending length.
        frame = DataFrame([[1, 1, 2], [3, 1, 0], [4, 5, 6]],
                          index=[1, 2, 3], columns=list('ABC'))

        # by column (axis=0)
        sorted_df = frame.sort_values(by='A')
        indexer = frame['A'].argsort().values
        # NOTE(review): .ix is deprecated (removed in pandas 1.0); kept as-is.
        expected = frame.ix[frame.index[indexer]]
        assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by='A', ascending=False)
        indexer = indexer[::-1]
        expected = frame.ix[frame.index[indexer]]
        assert_frame_equal(sorted_df, expected)

        # repeating the same sort must give the same result
        sorted_df = frame.sort_values(by='A', ascending=False)
        assert_frame_equal(sorted_df, expected)

        # GH4839: single-element list forms of by/ascending
        sorted_df = frame.sort_values(by=['A'], ascending=[False])
        assert_frame_equal(sorted_df, expected)

        # multiple bys
        sorted_df = frame.sort_values(by=['B', 'C'])
        expected = frame.loc[[2, 1, 3]]
        assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by=['B', 'C'], ascending=False)
        assert_frame_equal(sorted_df, expected[::-1])

        # ties on 'B' mean this mixed-direction sort matches the above
        sorted_df = frame.sort_values(by=['B', 'A'], ascending=[True, False])
        assert_frame_equal(sorted_df, expected)

        # a DataFrame only has axes 0 and 1
        self.assertRaises(ValueError, lambda: frame.sort_values(
            by=['A', 'B'], axis=2, inplace=True))

        # by row (axis=1): GH 10806
        sorted_df = frame.sort_values(by=3, axis=1)
        expected = frame
        assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by=3, axis=1, ascending=False)
        expected = frame.reindex(columns=['C', 'B', 'A'])
        assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by=[1, 2], axis='columns')
        expected = frame.reindex(columns=['B', 'A', 'C'])
        assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by=[1, 3], axis=1,
                                      ascending=[True, False])
        assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by=[1, 3], axis=1, ascending=False)
        expected = frame.reindex(columns=['C', 'B', 'A'])
        assert_frame_equal(sorted_df, expected)

        # ascending must supply exactly one flag per sort key
        msg = r'Length of ascending \(5\) != length of by \(2\)'
        with assertRaisesRegexp(ValueError, msg):
            frame.sort_values(by=['A', 'B'], axis=0, ascending=[True] * 5)
def prepareBreakagebreakageSummary(breakageData, stlSalesSamePeriod, kcSalesSamePeriod, reportYear, lastYear):
    '''
    Aggregate cleaned breakage records into a per-warehouse / reason-code
    summary table ready for reporting.

    Parameters
    ----------
    breakageData : DataFrame
        Must contain 'Warehouse', 'ReasonCode', 'Year', 'Breakage|Dollars'
        and 'Breakage|Cases' columns.
    stlSalesSamePeriod, kcSalesSamePeriod : numeric
        Same-period sales totals for Saint Louis / Kansas City, the
        denominators of the breakage-as-%-of-sales column.
    reportYear, lastYear : int
        The year being reported on and the comparison year.

    Returns
    -------
    DataFrame indexed by (Warehouse, ReasonCode) with year columns ordered
    for presentation.
    '''
    aggFuncs = {'Breakage|Dollars': np.sum,
                'Breakage|Cases': np.sum}
    groupCols = ['Warehouse', 'ReasonCode', 'Year']
    breakageSummary = DataFrame(breakageData.groupby(groupCols).agg(aggFuncs).reset_index(drop=False))
    breakageSummary = pd.DataFrame(breakageSummary.pivot_table(
        values=['Breakage|Cases', 'Breakage|Dollars'],
        index=['Warehouse', 'ReasonCode'], columns=['Year']))
    # Flatten the (measure, year) column MultiIndex into 'measure|year'.
    breakageSummary.columns = ['%s%s' % (a, '|%s' % b if b else '')
                               for a, b in breakageSummary.columns]
    breakageSummary.sort_index(inplace=True, ascending=False)

    # Breakage dollars as a fraction of same-period sales per warehouse.
    # BUG FIX: the numerator column was hard-coded to '...|2016'; use the
    # reportYear parameter so the function works for any reporting year.
    breakageSummary['Breakage|% Sales'] = breakageSummary.index.get_level_values(0)
    breakageSummary['Breakage|% Sales'] = breakageSummary['Breakage|% Sales'].map(
        {'Kansas City': kcSalesSamePeriod, 'Saint Louis': stlSalesSamePeriod})
    breakageSummary['Breakage|% Sales'] = np.divide(
        breakageSummary['Breakage|Dollars|' + str(reportYear)],
        breakageSummary['Breakage|% Sales'])

    def yoy_delta(now, then):
        # Year-over-year fractional change: (now - then) / then.
        return np.divide(np.subtract(now, then), then)

    breakageSummary['Breakage|Dollars|% Change'] = round(
        yoy_delta(breakageSummary['Breakage|Dollars|' + str(reportYear)],
                  breakageSummary['Breakage|Dollars|' + str(lastYear)]), 4)
    breakageSummary['Breakage|Cases|% Change'] = round(
        yoy_delta(breakageSummary['Breakage|Cases|' + str(reportYear)],
                  breakageSummary['Breakage|Cases|' + str(lastYear)]), 4)
    # Fix the column order for presentation.
    breakageSummary = breakageSummary.reindex(columns=[
        'Breakage|Dollars|' + str(lastYear),
        'Breakage|Dollars|' + str(reportYear),
        'Breakage|Dollars|% Change', 'Breakage|% Sales',
        'Breakage|Cases|' + str(lastYear),
        'Breakage|Cases|' + str(reportYear),
        'Breakage|Cases|% Change'])
    # Fix the reason-code order for presentation.
    breakageSummary = breakageSummary.reindex(
        index=['Warehouse Breakage', 'Cross-Dock Breakage', 'Driver Breakage',
               'Supplier Breakage', 'Sales Breakage & Unsaleables'],
        level='ReasonCode')

    return breakageSummary
Example #3
0
    def test_na_values_keep_default(self):
        # read_csv NA handling: the interaction of ``na_values`` with
        # ``keep_default_na``, including the gh-4318 case below.
        data = """\
One,Two,Three
a,1,one
b,2,two
,3,three
d,4,nan
e,5,five
nan,6,
g,7,seven
"""
        # defaults: empty fields and 'nan' both parse as NaN
        df = self.read_csv(StringIO(data))
        xp = DataFrame({'One': ['a', 'b', np.nan, 'd', 'e', np.nan, 'g'],
                        'Two': [1, 2, 3, 4, 5, 6, 7],
                        'Three': ['one', 'two', 'three', np.nan, 'five',
                                  np.nan, 'seven']})
        tm.assert_frame_equal(xp.reindex(columns=df.columns), df)

        # keep_default_na=False with empty per-column lists: nothing is NaN;
        # empty fields and 'nan' come through as literal strings
        df = self.read_csv(StringIO(data), na_values={'One': [], 'Three': []},
                           keep_default_na=False)
        xp = DataFrame({'One': ['a', 'b', '', 'd', 'e', 'nan', 'g'],
                        'Two': [1, 2, 3, 4, 5, 6, 7],
                        'Three': ['one', 'two', 'three', 'nan', 'five',
                                  '', 'seven']})
        tm.assert_frame_equal(xp.reindex(columns=df.columns), df)

        # keep_default_na=False with a global list: only 'a' becomes NaN
        df = self.read_csv(
            StringIO(data), na_values=['a'], keep_default_na=False)
        xp = DataFrame({'One': [np.nan, 'b', '', 'd', 'e', 'nan', 'g'],
                        'Two': [1, 2, 3, 4, 5, 6, 7],
                        'Three': ['one', 'two', 'three', 'nan', 'five', '',
                                  'seven']})
        tm.assert_frame_equal(xp.reindex(columns=df.columns), df)

        # per-column na_values on top of the (kept) defaults
        df = self.read_csv(StringIO(data), na_values={'One': [], 'Three': []})
        xp = DataFrame({'One': ['a', 'b', np.nan, 'd', 'e', np.nan, 'g'],
                        'Two': [1, 2, 3, 4, 5, 6, 7],
                        'Three': ['one', 'two', 'three', np.nan, 'five',
                                  np.nan, 'seven']})
        tm.assert_frame_equal(xp.reindex(columns=df.columns), df)

        # see gh-4318: passing na_values=None and
        # keep_default_na=False yields 'None' as a na_value
        data = """\
One,Two,Three
a,1,None
b,2,None
,3,None
d,4,nan
e,5,five
nan,6,
g,7,seven
"""
        df = self.read_csv(
            StringIO(data), keep_default_na=False)
        xp = DataFrame({'One': ['a', 'b', '', 'd', 'e', 'nan', 'g'],
                        'Two': [1, 2, 3, 4, 5, 6, 7],
                        'Three': ['None', 'two', 'None', 'nan', 'five', '',
                                  'seven']})
        tm.assert_frame_equal(xp.reindex(columns=df.columns), df)
Example #4
0
class Reindex(object):
    """ASV-style benchmark fixtures for DataFrame/Series reindexing."""

    def setup(self):
        # Datetime-indexed frame with one object column, plus every other
        # timestamp as the target for the date-reindex benchmark.
        minute_index = date_range(start='1/1/1970', periods=10000, freq='1min')
        self.df = DataFrame(np.random.rand(10000, 10),
                            index=minute_index, columns=range(10))
        self.df['foo'] = 'bar'
        self.rng_subset = Index(minute_index[::2])
        # Wide numeric frame for the column-reindex benchmark.
        self.df2 = DataFrame(data=np.random.rand(10000, 30),
                             index=range(10000), columns=range(30))
        # MultiIndexed Series: N outer labels, each repeated K times.
        N, K = 5000, 200
        outer = tm.makeStringIndex(N).values.repeat(K)
        inner = np.tile(tm.makeStringIndex(K).values, N)
        self.s = Series(np.random.randn(N * K),
                        index=MultiIndex.from_arrays([outer, inner]))
        self.s_subset = self.s[::2]

    def time_reindex_dates(self):
        self.df.reindex(self.rng_subset)

    def time_reindex_columns(self):
        self.df2.reindex(columns=self.df.columns[1:5])

    def time_reindex_multiindex(self):
        self.s.reindex(self.s_subset.index)
Example #5
0
    def test_reindex_api_equivalence(self):
        # https://github.com/pandas-dev/pandas/issues/12392
        # labels/axis must be interchangeable with index/columns
        df = DataFrame([[1, 2, 3], [3, 4, 5], [5, 6, 7]],
                       index=['a', 'b', 'c'],
                       columns=['d', 'e', 'f'])

        # every spelling of a row reindex gives the same frame
        by_index = df.reindex(['b', 'a'])
        for other in [df.reindex(index=['b', 'a']),
                      df.reindex(labels=['b', 'a']),
                      df.reindex(labels=['b', 'a'], axis=0),
                      df.reindex(['b', 'a'], axis=0)]:
            tm.assert_frame_equal(by_index, other)

        # every spelling of a column reindex gives the same frame
        by_columns = df.reindex(columns=['e', 'd'])
        for other in [df.reindex(['e', 'd'], axis=1),
                      df.reindex(labels=['e', 'd'], axis=1)]:
            tm.assert_frame_equal(by_columns, other)

        # passing both axes positionally is deprecated but still works
        with tm.assert_produces_warning(FutureWarning) as m:
            both_positional = df.reindex(['b', 'a'], ['e', 'd'])
        assert 'reindex' in str(m[0].message)
        for other in [df.reindex(columns=['e', 'd'], index=['b', 'a']),
                      df.reindex(labels=['b', 'a'], axis=0).reindex(
                          labels=['e', 'd'], axis=1)]:
            tm.assert_frame_equal(both_positional, other)
Example #6
0
    def test_join_multiindex(self):
        # outer-joining two MultiIndexed frames aligns on the union of the
        # indexes and preserves the level names
        left_index = MultiIndex.from_arrays([['a', 'a', 'a', 'b', 'b', 'b'],
                                             [1, 2, 3, 1, 2, 3]],
                                            names=['first', 'second'])
        right_index = MultiIndex.from_arrays([['b', 'b', 'b', 'c', 'c', 'c'],
                                              [1, 2, 3, 1, 2, 3]],
                                             names=['first', 'second'])

        left = DataFrame(data=np.random.randn(6), index=left_index,
                         columns=['var X'])
        right = DataFrame(data=np.random.randn(6), index=right_index,
                          columns=['var Y'])

        # both frames sorted by the outer level
        left = left.sort_index(level=0)
        right = right.sort_index(level=0)

        joined = left.join(right, how='outer')
        union = Index(left_index.values).union(Index(right_index.values))
        expected = left.reindex(union).join(right.reindex(union))
        expected.index.names = left_index.names
        assert_frame_equal(joined, expected)
        assert joined.index.names == left_index.names

        # both frames sorted by the inner level
        left = left.sort_index(level=1)
        right = right.sort_index(level=1)

        joined = left.join(right, how='outer').sort_index(level=0)
        union = Index(left_index.values).union(Index(right_index.values))
        expected = left.reindex(union).join(right.reindex(union))
        expected.index.names = left_index.names

        assert_frame_equal(joined, expected)
        assert joined.index.names == left_index.names
def strategy_statistics(strategy_name):
    """Print a back-test summary table for one strategy.

    Looks up all stored quant results for ``strategy_name``, runs
    ``back_test_success`` for every "300"-prefixed trading date, and prints
    the per-date statistics as a wide, transposed DataFrame.

    Fix: the Python-2-only ``print`` statements were converted to
    ``print()`` calls (a SyntaxError under Python 3).
    """
    all_qr = QR.objects(strategy_name=strategy_name)
    if not all_qr:
        # No stored results for this name: nothing to report.
        print("Wrong Strategy Name!")
        return

    trading_date = SDT.objects(stock_number__startswith="300").distinct("date")
    trading_date.sort()
    bt_result = {}
    for d in trading_date:
        bt_result[str(d.date())] = back_test_success(strategy_name, d)

    frame = DataFrame(bt_result)
    # Widen the console output temporarily so the table is not truncated.
    pd.set_option("display.width", 200)
    pd.set_option("display.max_rows", 400)
    print(frame.reindex(
        [
            "count",
            "one_back_test",
            "one_yield_expectation",
            "three_back_test",
            "three_yield_expectation",
            "five_back_test",
            "five_yield_expectation",
        ]
    ).T)
    # Restore pandas' automatic display sizing.
    pd.set_option("display.width", None)
    pd.set_option("display.max_rows", None)
Example #8
0
def pd_dataframe5():
    """Demonstrate DataFrame.reindex on rows and columns.

    Fixes vs. the original:
    * Python-2 ``print`` statements converted to ``print()`` calls.
    * The combined ``reindex(index=..., method='ffill', columns=...)`` call
      is split in two: modern pandas applies ``method`` to *both* axes and
      raises on the non-monotonic column labels, while the intent was to
      forward-fill rows only.
    * The removed ``.ix`` indexer is replaced with ``.loc``.
    """
    frame = DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'],
                      columns=['Ohio', 'Texas', 'California'])
    print(frame)
    # Row reindex: the new label 'b' appears as an all-NaN row.
    frame2 = frame.reindex(['a', 'b', 'c', 'd'])
    print(frame2)
    # Column reindex: 'Utah' appears as an all-NaN column.
    stats = ['Texas', 'Utah', 'California']
    print(frame.reindex(columns=stats))
    # Forward-fill the missing row first, then select the new column set.
    frame = frame.reindex(index=['a', 'b', 'c', 'd'], method='ffill')
    frame = frame.reindex(columns=stats)
    print(frame)
    print(frame.loc[['a', 'b', 'c', 'd'], stats])
    def test_reindex_boolean(self):
        # Reindexing a boolean frame introduces missing values, so the
        # result must be upcast to object dtype.
        frame = DataFrame(np.ones((10, 2), dtype=bool),
                          index=np.arange(0, 20, 2),
                          columns=[0, 2])

        result = frame.reindex(np.arange(10))
        self.assertEqual(result.values.dtype, np.object_)
        self.assertTrue(isnull(result[0][1]))

        result = frame.reindex(columns=lrange(3))
        self.assertEqual(result.values.dtype, np.object_)
        self.assertTrue(isnull(result[1]).all())
Example #10
0
    def test_reindex_api_equivalence(self):
        # equivalence of the labels/axis and index/columns API's
        df = DataFrame([[1, 2, 3], [3, 4, 5], [5, 6, 7]],
                       index=['a', 'b', 'c'],
                       columns=['d', 'e', 'f'])

        # all spellings of a row reindex agree
        expected = df.reindex(['b', 'a'])
        alternatives = [df.reindex(index=['b', 'a']),
                        df.reindex(labels=['b', 'a']),
                        df.reindex(labels=['b', 'a'], axis=0),
                        df.reindex(['b', 'a'], axis=0)]
        for alt in alternatives:
            tm.assert_frame_equal(expected, alt)

        # all spellings of a column reindex agree
        expected = df.reindex(columns=['e', 'd'])
        alternatives = [df.reindex(['e', 'd'], axis=1),
                        df.reindex(labels=['e', 'd'], axis=1)]
        for alt in alternatives:
            tm.assert_frame_equal(expected, alt)

        # both axes at once: keyword order and chaining are equivalent
        expected = df.reindex(index=['b', 'a'], columns=['e', 'd'])
        alternatives = [df.reindex(columns=['e', 'd'], index=['b', 'a']),
                        df.reindex(labels=['b', 'a'], axis=0).reindex(
                            labels=['e', 'd'], axis=1)]
        for alt in alternatives:
            tm.assert_frame_equal(expected, alt)
Example #11
0
    def test_reindex_boolean(self):
        # introducing NaN via reindex upcasts a boolean frame to object
        bool_frame = DataFrame(np.ones((10, 2), dtype=bool),
                               index=np.arange(0, 20, 2),
                               columns=[0, 2])

        result = bool_frame.reindex(np.arange(10))
        assert result.values.dtype == np.object_
        assert isna(result[0][1])

        result = bool_frame.reindex(columns=lrange(3))
        assert result.values.dtype == np.object_
        assert isna(result[1]).all()
Example #12
0
class Reindex(object):
    """Benchmarks for axis-wise reindexing, including dtype upcasting."""

    goal_time = 0.2

    def setup(self):
        N = 10**3
        self.df = DataFrame(np.random.randn(N * 10, N))
        self.idx = np.arange(4 * N, 7 * N)

        # Each column of df2 randomly picks one of four dtypes so that
        # reindexing has to handle a mixed-dtype frame (upcast path).
        def random_column():
            candidates = {0: np.random.randint(0, 2, N).astype(np.bool_),
                          1: np.random.randint(0, N, N).astype(np.int16),
                          2: np.random.randint(0, N, N).astype(np.int32),
                          3: np.random.randint(0, N, N).astype(np.int64)}
            return candidates[np.random.randint(0, 4)]

        self.df2 = DataFrame({c: random_column() for c in range(N)})

    def time_reindex_axis0(self):
        self.df.reindex(self.idx)

    def time_reindex_axis1(self):
        self.df.reindex(columns=self.idx)

    def time_reindex_both_axes(self):
        self.df.reindex(index=self.idx, columns=self.idx)

    def time_reindex_both_axes_ix(self):
        # NOTE(review): .ix is deprecated/removed in modern pandas
        self.df.ix[self.idx, self.idx]

    def time_reindex_upcast(self):
        self.df2.reindex(np.random.permutation(range(1200)))
    def test_reindex_multi(self):
        # reindexing both axes in one call matches chaining two reindexes
        def check(df, size):
            result = df.reindex(lrange(size), lrange(size))
            expected = df.reindex(lrange(size)).reindex(columns=lrange(size))
            assert_frame_equal(result, expected)

        check(DataFrame(np.random.randn(3, 3)), 4)
        check(DataFrame(np.random.randint(0, 10, (3, 3))), 4)
        check(DataFrame(np.random.randint(0, 10, (3, 3))), 2)

        # complex-valued frame, keyword form of the call
        df = DataFrame(np.random.randn(5, 3) + 1j, columns=['a', 'b', 'c'])

        result = df.reindex(index=[0, 1], columns=['a', 'b'])
        expected = df.reindex([0, 1]).reindex(columns=['a', 'b'])

        assert_frame_equal(result, expected)
    def test_reindex_axes(self):
        # GH 3317: reindexing both axes at once must keep the index freq
        dates = [datetime(2012, 1, 1), datetime(2012, 1, 2),
                 datetime(2012, 1, 3)]
        df = DataFrame(np.ones((3, 3)), index=dates, columns=["a", "b", "c"])
        time_freq = date_range("2012-01-01", "2012-01-03", freq="d")
        some_cols = ["a", "b"]

        index_freq = df.reindex(index=time_freq).index.freq
        both_freq = df.reindex(index=time_freq, columns=some_cols).index.freq
        seq_freq = (df.reindex(index=time_freq)
                      .reindex(columns=some_cols).index.freq)
        self.assertEqual(index_freq, both_freq)
        self.assertEqual(index_freq, seq_freq)
    def test_reindex_name_remains(self):
        # the name of the reindexing target becomes the new axis name
        values = Series(random.rand(10))
        df = DataFrame(values, index=np.arange(len(values)))
        named = Series(np.arange(10), name='iname')

        df = df.reindex(named)
        self.assertEqual(df.index.name, 'iname')

        # an explicitly named Index works the same way
        df = df.reindex(Index(np.arange(10), name='tmpname'))
        self.assertEqual(df.index.name, 'tmpname')

        # and likewise for the columns axis
        values = Series(random.rand(10))
        df = DataFrame(values.T, index=np.arange(len(values)))
        named = Series(np.arange(10), name='iname')
        df = df.reindex(columns=named)
        self.assertEqual(df.columns.name, 'iname')
    def test_reindex_with_nans(self):
        # reindexing on a float index containing NaN selects by label
        df = DataFrame([[1, 2], [3, 4], [np.nan, np.nan], [7, 8], [9, 10]],
                       columns=['a', 'b'],
                       index=[100.0, 101.0, np.nan, 102.0, 103.0])

        for labels, positions in [([101.0, 102.0, 103.0], [1, 3, 4]),
                                  ([103.0], [4]),
                                  ([101.0], [1])]:
            result = df.reindex(index=labels)
            expected = df.iloc[positions]
            assert_frame_equal(result, expected)
    def test_reindex_axes(self):
        # GH 3317, reindexing by both axes loses freq of the index
        df = DataFrame(np.ones((3, 3)),
                       index=[datetime(2012, 1, day) for day in (1, 2, 3)],
                       columns=['a', 'b', 'c'])
        time_freq = date_range('2012-01-01', '2012-01-03', freq='d')
        some_cols = ['a', 'b']

        # freq after an index-only, a both-axes, and a chained reindex
        freq_after_index = df.reindex(index=time_freq).index.freq
        freq_after_both = df.reindex(index=time_freq,
                                     columns=some_cols).index.freq
        freq_after_chained = df.reindex(index=time_freq).reindex(
            columns=some_cols).index.freq
        self.assertEqual(freq_after_index, freq_after_both)
        self.assertEqual(freq_after_index, freq_after_chained)
Example #18
0
    def test_dti_set_index_reindex(self):
        # GH 6631: set_index/reindex with tz-aware DatetimeIndexes
        df = DataFrame(np.random.random(6))
        eastern = date_range('2011/01/01', periods=6, freq='M',
                             tz='US/Eastern')
        tokyo = date_range('2013', periods=6, freq='A', tz='Asia/Tokyo')

        df = df.set_index(eastern)
        tm.assert_index_equal(df.index, eastern)
        df = df.reindex(tokyo)
        tm.assert_index_equal(df.index, tokyo)

        # GH 11314: tz-aware hourly index keeps its freq through set_index
        hours = date_range(datetime(2015, 10, 1), datetime(2015, 10, 1, 23),
                           freq='H', tz='US/Eastern')
        df = DataFrame(np.random.randn(24, 1), columns=['a'], index=hours)
        next_day = date_range(datetime(2015, 10, 2),
                              datetime(2015, 10, 2, 23),
                              freq='H', tz='US/Eastern')

        # TODO: unused?
        result = df.set_index(next_day)  # noqa

        assert next_day.freq == hours.freq
Example #19
0
def pickle_from_db(event_list, fname, verbose=False):
    """Dump up to 50 tweets per event from the database into pickle files.

    For each event in ``event_list``, tweets are read from
    ``client[insert_db][event]``, assembled into a DataFrame, shuffled, and
    pickled to ``dicts/<event>_<fname>`` next to this package.

    Fixes vs. the original:
    * Python-2 ``print`` statements converted to ``print()`` calls.
    * The pickle file is opened in binary mode inside a ``with`` block:
      text mode corrupts pickle data on Python 3, and the handle was
      previously left open if ``pickle.dump`` raised.
    * Rows are accumulated in plain lists and the DataFrame is built once,
      instead of the quadratic (and since-removed) ``DataFrame.append``
      in a loop.
    """
    for event in event_list:
        if verbose:
            print("processing data from %s" % (event))
        # One list per output column, filled tweet by tweet.
        rows = {"text": [], "event": [], "features": [],
                "unique_id": [], "raw_text": []}
        count = 0
        for tweet in client[insert_db][event].find():
            if verbose and count % 1000 == 0 and count != 0:
                print("processed %s tweets" % count)
            if tweet["text"]:
                rows["text"].append(tweet["text"])
                rows["event"].append(event)
                rows["features"].append(json.dumps(tweet["features"]))
                rows["unique_id"].append(tweet["unique_id"])
                rows["raw_text"].append(tweet["raw_text"])
                count += 1
                if count == 50:
                    break
        result = DataFrame(rows)
        # Shuffle the rows so downstream train/test splits are unbiased.
        result = result.reindex(numpy.random.permutation(result.index))

        fpath = os.path.join(os.path.dirname(__file__), os.pardir,
                             "dicts/") + event + "_" + fname
        with open(fpath, "wb") as f:
            pickle.dump(result, f)
        if verbose:
            print(result)
            print("dumped %s tweets" % len(result))
Example #20
0
    def test_include_na(self, sparse, dtype):
        if sparse:
            pytest.xfail(reason='nan in index is problematic (GH 16894)')

        values = ['a', 'b', np.nan]

        # by default the NaN entry is simply dropped
        res = get_dummies(values, sparse=sparse, dtype=dtype)
        exp = DataFrame({'a': [1, 0, 0],
                         'b': [0, 1, 0]},
                        dtype=self.effective_dtype(dtype))
        assert_frame_equal(res, exp)

        # dummy_na=True adds a NaN-labelled indicator column
        # (sparse frames cannot have NaN-labelled columns, see GH 8822)
        res_na = get_dummies(values, dummy_na=True, sparse=sparse, dtype=dtype)
        exp_na = DataFrame({nan: [0, 0, 1],
                            'a': [1, 0, 0],
                            'b': [0, 1, 0]},
                           dtype=self.effective_dtype(dtype))
        exp_na = exp_na.reindex(['a', 'b', nan], axis=1)
        # hack (NaN handling in assert_index_equal)
        exp_na.columns = res_na.columns
        assert_frame_equal(res_na, exp_na)

        # a lone NaN input with dummy_na=True yields one all-ones column
        res_just_na = get_dummies([nan], dummy_na=True,
                                  sparse=sparse, dtype=dtype)
        exp_just_na = DataFrame(Series(1, index=[0]), columns=[nan],
                                dtype=self.effective_dtype(dtype))
        tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)
Example #21
0
def test_preserve_categorical_dtype():
    # GH 13743, GH 13854: groupby mean must keep categorical dtypes intact
    df = DataFrame({'A': [1, 2, 1, 1, 2],
                    'B': [10, 16, 22, 28, 34],
                    'C1': Categorical(list("abaab"),
                                      categories=list("bac"),
                                      ordered=False),
                    'C2': Categorical(list("abaab"),
                                      categories=list("bac"),
                                      ordered=True)})

    # expected result when grouping on either categorical column; the
    # unobserved category 'c' shows up as a NaN row
    exp_full = DataFrame({'A': [2.0, 1.0, np.nan],
                          'B': [25.0, 20.0, np.nan],
                          'C1': Categorical(list("bac"),
                                            categories=list("bac"),
                                            ordered=False),
                          'C2': Categorical(list("bac"),
                                            categories=list("bac"),
                                            ordered=True)})

    for grouper in ['C1', 'C2']:
        flat = df.groupby(by=grouper, as_index=False, observed=False).mean()
        via_index = df.groupby(
            by=grouper, as_index=True, observed=False).mean().reset_index()
        expected = exp_full.reindex(columns=flat.columns)
        tm.assert_frame_equal(flat, expected)
        tm.assert_frame_equal(via_index, expected)
Example #22
0
def plot_scores(scores, title, x_label, classifier_names):
    """Draw a bar plot of one performance measure per classifier.

    Parameters
    ----------
    scores : dict
        Maps classifier name to its score.

    title : str
        Title of the plot.

    x_label : str
        Label for the x-axis.

    classifier_names : array
        Classifier names in the order the bars should appear.
    """

    # one-row frame with the bars ordered as requested
    frame = DataFrame(scores, index=[x_label])
    frame = frame.reindex(columns=classifier_names)

    def format_as_percent_plot(x, pos):
        # e.g. 0.87 -> "87%"
        return "{:.0f}%".format(x * 100)

    fig, ax = plt.subplots(figsize=(9, 5))
    frame.plot(ax=ax, kind="bar", title=title, fontsize=12)
    ax.legend(bbox_to_anchor=(1.5, 0.6))
    ax.set_xticklabels([], rotation=0)
    ax.get_yaxis().set_major_formatter(FuncFormatter(format_as_percent_plot))

    plt.show()
def main():
    """Walk through basic Series/DataFrame construction and reindexing.

    Fixes vs. the original:
    * Python-2 ``print`` statements converted to ``print()`` calls.
    * ``reindex(columns=..., method='ffill')`` dropped the ``method``
      argument: modern pandas tries to forward-fill along the
      non-monotonic column labels and raises, and filling never applied
      to the columns axis in the first place.
    * Comments translated to English.
    """
    # A Series behaves like a fixed-length, ordered dict.
    s1 = Series([1, 2, 3.0, 'abc'])
    print(s1)
    print()
    s2 = Series(data=[1, 3, 5, 7], index=['a', 'b', 'x', 'y'])
    print(s2)
    print(s2.index)
    print(s2.values)
    s2.name = 'a_series'
    s2.index.name = 'the_index'
    print(s2)
    ser = Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
    # reindex: conform to a new index; new labels appear as NaN unless a
    # fill_value is given
    a = ['a', 'b', 'c', 'd', 'e']
    ser_1 = ser.reindex(a)
    print(ser_1)
    ser_2 = ser.reindex(a, fill_value=0)
    print(ser_2)
    print()
    # A DataFrame is a tabular structure: an ordered set of columns that
    # share one index, where each column may have its own dtype (unlike an
    # ndarray, which has a single dtype). Think of it as a collection of
    # Series sharing one index.
    data = {'state': ['Ohino', 'Ohino', 'Ohino', 'Nevada', 'Nevada'],
            'year': [2000, 2001, 2002, 2001, 2002],
            'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
    df = DataFrame(data)
    print(df)
    df = DataFrame(data, index=['one', 'two', 'three', 'four', 'five'],
                   columns=['year', 'state', 'pop', 'debt'])
    print(df)
    print(df.index)
    print(df.columns)
    print(type(df['debt']))
    state = ['Texas', 'Utha', 'California']
    df1 = df.reindex(columns=state)
    print(df1)
    print()
    def test_reindex_dups(self):

        # GH 4746: reindexing on a duplicated index gives a clear error
        values = np.random.randn(10)
        df = DataFrame(values, index=[1, 2, 3, 4, 5, 1, 2, 3, 4, 5])

        # directly assigning a fresh index is fine
        relabelled = df.copy()
        relabelled.index = list(range(len(df)))
        expected = DataFrame(values, index=list(range(len(df))))
        assert_frame_equal(relabelled, expected)

        # but reindexing from the duplicated labels must raise
        msg = "cannot reindex from a duplicate axis"
        with pytest.raises(ValueError, match=msg):
            df.reindex(index=list(range(len(df))))
def readDatasetIntoDataFrame():
    """Read SpamHamDataset.txt into a shuffled (label, text) DataFrame.

    Each input line starts with a label token ('spam' or 'ham') followed by
    the message text. The label is encoded as 1 for spam and 0 otherwise.

    Fixes vs. the original:
    * The input file is opened with ``with`` so it is always closed.
    * The label flag is computed once per line; previously it was
      (re)assigned inside the token loop, so a line with no tokens after
      the label silently reused the previous line's value.
    * The message text is assembled with ``str.join`` instead of quadratic
      ``+`` concatenation, and ``len(tokens)`` replaces
      ``tokens.__len__()``.
    """
    # New DataFrame with two columns.
    df = DataFrame(columns=('label', 'text'))

    with open("SpamHamDataset.txt", "r") as f:
        for count, line in enumerate(f):
            tokens = line.split()
            flag = tokens[0]  # the first word of each row is the label
            sig = 1 if flag == 'spam' else 0
            # Everything after the label is the message body; the original
            # kept a trailing space after the last token, so keep it too.
            text = "".join(token + " " for token in tokens[1:])
            df.loc[count] = [sig, text]

    # Shuffle the rows so train/test splits are not order-biased.
    df = df.reindex(random.permutation(df.index))

    return df
def viz_dist_mat(df, new_index, show_img=True):
    '''
    Re-order a triangular data frame.

    The input frame is symmetrized, reindexed on both axes to
    ``new_index``, and then masked back to the cells that were non-zero in
    the original frame.  Optionally displays the result as an image.
    '''
    from pandas import DataFrame

    # symmetrize so the reordering does not depend on which triangle
    # holds the data
    full_matrix = df.values.T + df.values

    full_df = DataFrame(full_matrix, index=df.index, columns=df.columns)

    # apply the new ordering to both axes
    reordered = full_df.reindex(index=new_index, columns=new_index)

    # Now restore only the upper triangle

    result = DataFrame(reordered.values * (df.values != 0.0),
                       index=new_index,
                       columns=new_index)

    if show_img:
        import matplotlib.pyplot as p

        p.imshow(result.values, interpolation='nearest', cmap='binary')
        cbar = p.colorbar()
        cbar.set_label('Distance', fontsize=20)
        p.show()
    return result
Example #27
0
    def test_include_na(self, sparse, dtype):
        # NaN entries are dropped unless dummy_na is requested
        data = ['a', 'b', np.nan]
        res = get_dummies(data, sparse=sparse, dtype=dtype)
        exp = DataFrame({'a': [1, 0, 0],
                         'b': [0, 1, 0]},
                        dtype=self.effective_dtype(dtype))
        if sparse:
            exp = exp.apply(pd.SparseArray, fill_value=0.0)
        assert_frame_equal(res, exp)

        # dummy_na=True adds an indicator column labelled NaN
        # (sparse frames do not allow NaN-labelled columns, see GH 8822)
        res_na = get_dummies(data, dummy_na=True, sparse=sparse, dtype=dtype)
        exp_na = DataFrame({nan: [0, 0, 1],
                            'a': [1, 0, 0],
                            'b': [0, 1, 0]},
                           dtype=self.effective_dtype(dtype))
        exp_na = exp_na.reindex(['a', 'b', nan], axis=1)
        # hack (NaN handling in assert_index_equal)
        exp_na.columns = res_na.columns
        if sparse:
            exp_na = exp_na.apply(pd.SparseArray, fill_value=0.0)
        assert_frame_equal(res_na, exp_na)

        # input of only NaN: dummy_na yields one all-ones column
        res_just_na = get_dummies([nan], dummy_na=True,
                                  sparse=sparse, dtype=dtype)
        exp_just_na = DataFrame(Series(1, index=[0]), columns=[nan],
                                dtype=self.effective_dtype(dtype))
        tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)
Example #28
0
    def test_unexpected_keyword(self):  # GH8597
        # misspelled / unsupported keywords must raise TypeError rather
        # than be silently swallowed
        df = DataFrame(np.random.randn(5, 2), columns=['jim', 'joe'])
        cat = pd.Categorical([0, 0, 2, 2, 3, np.nan])
        series = df['joe'].copy()
        series[2] = np.nan

        with pytest.raises(TypeError, match='unexpected keyword'):
            df.drop('joe', axis=1, in_place=True)

        with pytest.raises(TypeError, match='unexpected keyword'):
            df.reindex([1, 0], inplace=True)

        with pytest.raises(TypeError, match='unexpected keyword'):
            cat.fillna(0, inplace=True)

        with pytest.raises(TypeError, match='unexpected keyword'):
            series.fillna(0, in_place=True)
Example #29
0
class LevelAlign(object):
    """Benchmarks for level-wise align/reindex against a large MultiIndex."""

    def setup(self):
        # Three-level index with 10 * 100 * 100 = 100,000 rows; the codes
        # enumerate every (level0, level1, level2) combination exactly once.
        self.index = MultiIndex(
            levels=[np.arange(10), np.arange(100), np.arange(100)],
            codes=[np.arange(10).repeat(10000),
                   np.tile(np.arange(100).repeat(100), 10),
                   np.tile(np.tile(np.arange(100), 100), 10)])
        # Large frame on the full MultiIndex.
        self.df = DataFrame(np.random.randn(len(self.index), 4),
                            index=self.index)
        # Small frame indexed by only the middle level's values.
        self.df_level = DataFrame(np.random.randn(100, 4),
                                  index=self.index.levels[1])

    def time_align_level(self):
        # Align the level-indexed frame against level 1 of the big frame.
        self.df.align(self.df_level, level=1, copy=False)

    def time_reindex_level(self):
        # Broadcast the level-indexed frame up to the full MultiIndex.
        self.df_level.reindex(self.index, level=1)
Example #30
0
    def testWithXEffects(self):
        # x_effects=['x1'] turns x1 into dummy columns in the design matrix
        result = ols(y=self.panel_y2, x=self.panel_x2, x_effects=['x1'])

        assert_almost_equal(result._y.values.flat, [1, 4, 5])

        design = result._x
        expected = DataFrame([[0., 0., 14., 1.],
                              [0, 1, 17, 1],
                              [1, 0, 48, 1]],
                             columns=['x1_30', 'x1_9', 'x2', 'intercept'],
                             index=design.index, dtype=float)
        assert_frame_equal(design, expected.reindex(columns=design.columns))
Example #31
0
# Demonstration of reindexing a Series and a DataFrame.

from pandas import Series, DataFrame

# A Series whose labels were supplied out of order.
obj = Series([100, 200, 300, 400, 500], index=['d', 'a', 'b', 'e', 'c'])
print(obj)

# Reindexing the Series: each value follows its label into the new order.
obj = obj.reindex(['a', 'b', 'c', 'd', 'e'])
print(obj)

# ----------------------------------------------------------
data = {
    'Name': ['John', 'Kevin', 'Sam'],
    'Age': [32, 42, 54],
    'Salary': [300, 400, 500]
}
frame = DataFrame(data)
print(frame)

# Reindexing the rows of the DataFrame (reorders whole rows).

frame = frame.reindex([0, 2, 1])
print(frame)

# Reindexing the columns of the DataFrame (reorders whole columns).

fields = ['Age', 'Name', 'Salary']
frame = frame.reindex(columns=fields)
print(frame)
Example #32
0
    def test_sort_values_nan(self):
        # GH#3917
        df = DataFrame({
            "A": [1, 2, np.nan, 1, 6, 8, 4],
            "B": [9, np.nan, 5, 2, 5, 4, 5]
        })

        # sort one column only
        expected = DataFrame(
            {
                "A": [np.nan, 1, 1, 2, 4, 6, 8],
                "B": [5, 9, 2, np.nan, 5, 5, 4]
            },
            index=[2, 0, 3, 1, 6, 4, 5],
        )
        sorted_df = df.sort_values(["A"], na_position="first")
        tm.assert_frame_equal(sorted_df, expected)

        expected = DataFrame(
            {
                "A": [np.nan, 8, 6, 4, 2, 1, 1],
                "B": [5, 4, 5, 5, np.nan, 9, 2]
            },
            index=[2, 5, 4, 6, 1, 0, 3],
        )
        sorted_df = df.sort_values(["A"], na_position="first", ascending=False)
        tm.assert_frame_equal(sorted_df, expected)

        expected = df.reindex(columns=["B", "A"])
        sorted_df = df.sort_values(by=1, axis=1, na_position="first")
        tm.assert_frame_equal(sorted_df, expected)

        # na_position='last', order
        expected = DataFrame(
            {
                "A": [1, 1, 2, 4, 6, 8, np.nan],
                "B": [2, 9, np.nan, 5, 5, 4, 5]
            },
            index=[3, 0, 1, 6, 4, 5, 2],
        )
        sorted_df = df.sort_values(["A", "B"])
        tm.assert_frame_equal(sorted_df, expected)

        # na_position='first', order
        expected = DataFrame(
            {
                "A": [np.nan, 1, 1, 2, 4, 6, 8],
                "B": [5, 2, 9, np.nan, 5, 5, 4]
            },
            index=[2, 3, 0, 1, 6, 4, 5],
        )
        sorted_df = df.sort_values(["A", "B"], na_position="first")
        tm.assert_frame_equal(sorted_df, expected)

        # na_position='first', not order
        expected = DataFrame(
            {
                "A": [np.nan, 1, 1, 2, 4, 6, 8],
                "B": [5, 9, 2, np.nan, 5, 5, 4]
            },
            index=[2, 0, 3, 1, 6, 4, 5],
        )
        sorted_df = df.sort_values(["A", "B"],
                                   ascending=[1, 0],
                                   na_position="first")
        tm.assert_frame_equal(sorted_df, expected)

        # na_position='last', not order
        expected = DataFrame(
            {
                "A": [8, 6, 4, 2, 1, 1, np.nan],
                "B": [4, 5, 5, np.nan, 2, 9, 5]
            },
            index=[5, 4, 6, 1, 3, 0, 2],
        )
        sorted_df = df.sort_values(["A", "B"],
                                   ascending=[0, 1],
                                   na_position="last")
        tm.assert_frame_equal(sorted_df, expected)
Example #33
0
 def test_reindex_single_named_indexer(self):
     # https://github.com/pandas-dev/pandas/issues/12392
     df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3]})
     result = df.reindex([0, 1], columns=["A"])
     expected = DataFrame({"A": [1, 2]})
     tm.assert_frame_equal(result, expected)
Example #34
0
    def test_sort_values(self):
        frame = DataFrame([[1, 1, 2], [3, 1, 0], [4, 5, 6]],
                          index=[1, 2, 3],
                          columns=list("ABC"))

        # by column (axis=0)
        sorted_df = frame.sort_values(by="A")
        indexer = frame["A"].argsort().values
        expected = frame.loc[frame.index[indexer]]
        tm.assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by="A", ascending=False)
        indexer = indexer[::-1]
        expected = frame.loc[frame.index[indexer]]
        tm.assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by="A", ascending=False)
        tm.assert_frame_equal(sorted_df, expected)

        # GH4839
        sorted_df = frame.sort_values(by=["A"], ascending=[False])
        tm.assert_frame_equal(sorted_df, expected)

        # multiple bys
        sorted_df = frame.sort_values(by=["B", "C"])
        expected = frame.loc[[2, 1, 3]]
        tm.assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by=["B", "C"], ascending=False)
        tm.assert_frame_equal(sorted_df, expected[::-1])

        sorted_df = frame.sort_values(by=["B", "A"], ascending=[True, False])
        tm.assert_frame_equal(sorted_df, expected)

        msg = "No axis named 2 for object type DataFrame"
        with pytest.raises(ValueError, match=msg):
            frame.sort_values(by=["A", "B"], axis=2, inplace=True)

        # by row (axis=1): GH#10806
        sorted_df = frame.sort_values(by=3, axis=1)
        expected = frame
        tm.assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by=3, axis=1, ascending=False)
        expected = frame.reindex(columns=["C", "B", "A"])
        tm.assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by=[1, 2], axis="columns")
        expected = frame.reindex(columns=["B", "A", "C"])
        tm.assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by=[1, 3],
                                      axis=1,
                                      ascending=[True, False])
        tm.assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by=[1, 3], axis=1, ascending=False)
        expected = frame.reindex(columns=["C", "B", "A"])
        tm.assert_frame_equal(sorted_df, expected)

        msg = r"Length of ascending \(5\) != length of by \(2\)"
        with pytest.raises(ValueError, match=msg):
            frame.sort_values(by=["A", "B"], axis=0, ascending=[True] * 5)
Example #35
0
    def test_reindex_with_categoricalindex(self):
        frame = DataFrame(
            {"A": np.arange(3, dtype="int64")},
            index=CategoricalIndex(list("abc"), dtype=CDT(list("cabe")), name="B"),
        )

        # reindexing with a plain list converts to a regular index
        res = frame.reindex(["a", "b", "e"])
        exp = DataFrame(
            {"A": [0, 1, np.nan], "B": Series(list("abe"))}
        ).set_index("B")
        tm.assert_frame_equal(res, exp, check_index_type=True)

        res = frame.reindex(["a", "b"])
        exp = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B")
        tm.assert_frame_equal(res, exp, check_index_type=True)

        res = frame.reindex(["e"])
        exp = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B")
        tm.assert_frame_equal(res, exp, check_index_type=True)

        res = frame.reindex(["d"])
        exp = DataFrame({"A": [np.nan], "B": Series(["d"])}).set_index("B")
        tm.assert_frame_equal(res, exp, check_index_type=True)

        # reindexing with a Categorical keeps a categorical result
        cat_values = list("cabe")

        res = frame.reindex(Categorical(["a", "e"], categories=cat_values))
        exp = DataFrame(
            {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cat_values))}
        ).set_index("B")
        tm.assert_frame_equal(res, exp, check_index_type=True)

        res = frame.reindex(Categorical(["a"], categories=cat_values))
        exp = DataFrame(
            {"A": [0], "B": Series(list("a")).astype(CDT(cat_values))}
        ).set_index("B")
        tm.assert_frame_equal(res, exp, check_index_type=True)

        res = frame.reindex(["a", "b", "e"])
        exp = DataFrame(
            {"A": [0, 1, np.nan], "B": Series(list("abe"))}
        ).set_index("B")
        tm.assert_frame_equal(res, exp, check_index_type=True)

        res = frame.reindex(["a", "b"])
        exp = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B")
        tm.assert_frame_equal(res, exp, check_index_type=True)

        res = frame.reindex(["e"])
        exp = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B")
        tm.assert_frame_equal(res, exp, check_index_type=True)

        # the flavour of the categorical indexer is preserved (ordered)
        res = frame.reindex(
            Categorical(["a", "e"], categories=cat_values, ordered=True)
        )
        exp = DataFrame(
            {
                "A": [0, np.nan],
                "B": Series(list("ae")).astype(CDT(cat_values, ordered=True)),
            }
        ).set_index("B")
        tm.assert_frame_equal(res, exp, check_index_type=True)

        res = frame.reindex(Categorical(["a", "d"], categories=["a", "d"]))
        exp = DataFrame(
            {"A": [0, np.nan], "B": Series(list("ad")).astype(CDT(["a", "d"]))}
        ).set_index("B")
        tm.assert_frame_equal(res, exp, check_index_type=True)

        frame_dup = DataFrame(
            {"A": np.arange(6, dtype="int64")},
            index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cabe")), name="B"),
        )
        # duplicate indexers are rejected
        msg = "cannot reindex from a duplicate axis"
        with pytest.raises(ValueError, match=msg):
            frame_dup.reindex(["a", "b"])

        # method/level/limit are not implemented for CategoricalIndex
        msg = r"argument {} is not implemented for CategoricalIndex\.reindex"
        with pytest.raises(NotImplementedError, match=msg.format("method")):
            frame.reindex(["a"], method="ffill")
        with pytest.raises(NotImplementedError, match=msg.format("level")):
            frame.reindex(["a"], level=1)
        with pytest.raises(NotImplementedError, match=msg.format("limit")):
            frame.reindex(["a"], limit=2)
Example #36
0
class Scores:
    """Score structure mapping (segment, track, label) triples to values.

    Backed by a DataFrame whose index is (segment fields + track name) and
    whose columns are the labels.

    Parameters
    ----------
    uri : str, optional

    modality : str, optional

    Returns
    -------
    scores : `Scores`

    Examples
    --------

        >>> s = Scores(uri='video', modality='speaker')
        >>> s[Segment(0,1), 's1', 'A'] = 0.1
        >>> s[Segment(0,1), 's1', 'B'] = 0.2
        >>> s[Segment(0,1), 's1', 'C'] = 0.3
        >>> s[Segment(0,1), 's2', 'A'] = 0.4
        >>> s[Segment(0,1), 's2', 'B'] = 0.3
        >>> s[Segment(0,1), 's2', 'C'] = 0.2
        >>> s[Segment(2,3), 's1', 'A'] = 0.2
        >>> s[Segment(2,3), 's1', 'B'] = 0.1
        >>> s[Segment(2,3), 's1', 'C'] = 0.3

    """
    @classmethod
    def from_df(cls,
                df: DataFrame,
                uri: Optional[str] = None,
                modality: Optional[str] = None,
                aggfunc: Callable = np.mean):
        """Build a `Scores` instance from a long-format DataFrame.

        Parameters
        ----------
        df : DataFrame
            Must contain the following columns:
            'segment', 'track', 'label' and 'value'
        uri : str, optional
            Resource identifier
        modality : str, optional
            Modality
        aggfunc : func
            Value aggregation function in case of duplicate (segment, track,
            label) tuples

        Returns
        -------
        scores : `Scores`
        """
        # pivot to one row per (segment, track) and one column per label
        dataframe = pivot_table(df,
                                values=PYANNOTE_SCORE,
                                index=[PYANNOTE_SEGMENT, PYANNOTE_TRACK],
                                columns=PYANNOTE_LABEL,
                                aggfunc=aggfunc)

        # rebuild the track structure from the pivoted index
        annotation = Annotation(uri=uri, modality=modality)
        for index, _ in dataframe.iterrows():
            segment = Segment(*index[0])
            track = index[1]
            annotation[segment, track] = ''

        labels = dataframe.columns

        return cls(uri=uri,
                   modality=modality,
                   annotation=annotation,
                   labels=labels,
                   values=dataframe.values)

    def __init__(self,
                 uri: Optional[str] = None,
                 modality: Optional[str] = None,
                 annotation: Optional[Annotation] = None,
                 labels: Iterable[Hashable] = None,
                 values: Optional[np.ndarray] = None,
                 dtype=None):  # TODO maybe this should get removed

        # index names: one per Segment field, plus the track name
        names = [
            PYANNOTE_SEGMENT + '_' + field.name for field in fields(Segment)
        ] + [PYANNOTE_TRACK]

        if annotation:
            annotation = annotation.copy()
            index = Index([s + (t, ) for s, t in annotation.itertracks()],
                          name=names)

        else:
            annotation = Annotation(uri=uri, modality=modality)
            # BUG FIX: the MultiIndex `labels` keyword was renamed `codes`
            # and removed in pandas >= 1.0.
            index = MultiIndex(levels=[list() for name in names],
                               codes=[list() for name in names],
                               names=names)

        self.annotation_ = annotation
        columns = None if labels is None else list(labels)
        data = None if values is None else np.array(values)
        # BUG FIX: np.float was removed from NumPy (1.24+); the builtin
        # `float` is the documented replacement.
        dtype = float if values is None else values.dtype

        self.dataframe_ = DataFrame(data=data,
                                    dtype=dtype,
                                    index=index,
                                    columns=columns)

        # lazily re-sorted by _reindexIfNeeded()
        self.hasChanged_ = True

        self.modality = modality
        self.uri = uri

    def copy(self) -> 'Scores':
        """Return a deep copy of this instance."""
        self._reindexIfNeeded()
        copied = self.__class__(uri=self.uri, modality=self.modality)
        copied.dataframe_ = self.dataframe_.copy()
        copied.annotation_ = self.annotation_.copy()
        copied.hasChanged_ = self.hasChanged_
        return copied

    # del scores[segment]
    # del scores[segment, :]
    # del scores[segment, track]
    def __delitem__(self, key: Key):
        """Remove a whole segment, or a single (segment, track) pair."""

        if isinstance(key, Segment):
            segment = key
            self.dataframe_.drop(tuple(segment), axis=0, inplace=True)
            del self.annotation_[segment]
            self.hasChanged_ = True

        elif isinstance(key, tuple) and len(key) == 2:
            segment, track = key
            self.dataframe_.drop(tuple(segment) + (track, ),
                                 axis=0,
                                 inplace=True)
            del self.annotation_[segment, track]
            self.hasChanged_ = True

        else:
            raise KeyError('')

    # value = scores[segment, track, label]
    def __getitem__(self, key):
        """Get the score for (segment, track, label).

        A 2-tuple key (segment, label) is shorthand for track '_'.
        """
        if len(key) == 2:
            key = (key[0], '_', key[1])

        segment, track, label = key
        return self.dataframe_.at[tuple(segment) + (track, ), label]

    # scores[segment, track, label] = value
    # scores[segment, label] ==== scores[segment, '_', label]
    def __setitem__(self, key, value):
        """Set the score for (segment, track, label).

        A 2-tuple key (segment, label) is shorthand for track '_'.
        Empty segments are silently ignored.
        """
        if len(key) == 2:
            key = (key[0], '_', key[1])

        segment, track, label = key

        # do not add empty track
        if not segment:
            return

        self.dataframe_.at[tuple(segment) + (track, ), label] = value
        self.annotation_[segment, track] = label
        self.hasChanged_ = True

    def __len__(self):
        """Number of annotated segments"""
        return len(self.annotation_)

    def __nonzero__(self):
        # Python 2 compatibility shim
        return self.__bool__()

    def __bool__(self):
        """False if annotation is empty"""
        return True if self.annotation_ else False

    def __contains__(self, included):
        """Check if segments are annotated

        Parameters
        ----------
        included : `Segment` or `Timeline`

        Returns
        -------
        contains : bool
            True if every segment in `included` is annotated, False otherwise.
        """
        return included in self.annotation_

    def __iter__(self):
        """Iterate over sorted segments"""
        return iter(self.annotation_.get_timeline(copy=False))

    def __reversed__(self):
        """Reverse iterate over sorted segments"""
        return reversed(self.annotation_.get_timeline(copy=False))

    def itersegments(self):
        """Iterate over sorted segments (alias for iter(self))."""
        return iter(self)

    def tracks(self, segment: Segment):
        """Set of tracks for query segment

        Parameters
        ----------
        segment : `Segment`
            Query segment

        Returns
        -------
        tracks : set
            Set of tracks for query segment
        """
        return self.annotation_.get_tracks(segment)

    def has_track(self, segment: Segment, track):
        """Check whether a given track exists

        Parameters
        ----------
        segment : `Segment`
            Query segment
        track :
            Query track

        Returns
        -------
        exists : bool
            True if track exists for segment
        """
        return self.annotation_.has_track(segment, track)

    def get_track_by_name(self, track: TrackName) -> List[Tuple[Segment]]:
        """Get all tracks with given name

        Parameters
        ----------
        track : any valid track name
            Requested name track

        Returns
        -------
        tracks : list
            List of (segment, track) tuples
        """
        # WARNING: this doesn't call a valid class
        return self.annotation_.get_track_by_name(track)

    def new_track(self,
                  segment: Segment,
                  candidate: Optional[TrackName] = None,
                  prefix: Optional[str] = None):
        """Track name generator

        Parameters
        ----------
        segment : Segment
        candidate : any valid track name
        prefix : str, optional

        Returns
        -------
        track : str
            New track name
        """
        # BUG FIX: the caller's `candidate` and `prefix` arguments were
        # previously ignored (literal Nones were forwarded instead).
        return self.annotation_.new_track(segment,
                                          candidate=candidate,
                                          prefix=prefix)

    def itertracks(self):
        """Iterate over annotation as (segment, track) tuple"""
        return self.annotation_.itertracks()

    def itervalues(self):
        """Iterate over scores as (segment, track, label, value) tuple"""

        # make sure segment/track pairs are sorted
        self._reindexIfNeeded()

        labels = self.labels()

        # yield one (segment, track, label) tuple per loop,
        # skipping missing (NaN) scores
        for index, columns in self.dataframe_.iterrows():
            segment = Segment(*index[:-1])
            track = index[-1]
            for label in labels:
                value = columns[label]
                if not np.isnan(value):
                    yield segment, track, label, value

    def get_track_scores(self, segment: Segment, track):
        """Get all scores for a given track.

        Parameters
        ----------
        segment : Segment
        track : hashable
            segment, track must be a valid track

        Returns
        -------
        scores : dict
            {label: score} dictionary
        """
        return dict(self.dataframe_.xs(tuple(segment) + (track, )))

    def labels(self) -> List[Label]:
        """List of labels

        Returns
        -------
        labels : list
            Sorted list of existing labels

        Remarks
        -------
            Labels are sorted based on their string representation.
        """
        return sorted(self.dataframe_.columns, key=str)

    def _reindexIfNeeded(self):
        """Re-sort the DataFrame index to match the annotation (lazy)."""

        if not self.hasChanged_:
            return

        names = [
            PYANNOTE_SEGMENT + '_' + field.name for field in fields(Segment)
        ] + [PYANNOTE_TRACK]

        new_index = Index(
            [astuple(s) + (t, ) for s, t in self.annotation_.itertracks()],
            name=names)

        self.dataframe_ = self.dataframe_.reindex(new_index)

        self.hasChanged_ = False

        return

    def rename_tracks(self, generator: LabelGenerator = 'int'):
        """Rename tracks"""

        self._reindexIfNeeded()
        retracked = self.copy()

        annotation = self.annotation_.rename_tracks(generator=generator)
        retracked.annotation_ = annotation

        names = [
            PYANNOTE_SEGMENT + '_' + field.name for field in fields(Segment)
        ] + [PYANNOTE_TRACK]
        new_index = Index(
            [astuple(s) + (t, ) for s, t in annotation.itertracks()],
            name=names)
        retracked.dataframe_.index = new_index

        return retracked

    def apply(self, func: Callable, axis=0):
        """Apply `func` along the given DataFrame axis; returns a new Scores."""

        applied = self.copy()
        applied.dataframe_ = self.dataframe_.apply(func, axis=axis)
        applied.hasChanged_ = True

        return applied

    def rank(self, ascending: bool = False):
        """

        Parameters
        ----------
        ascending : boolean, default False
            False for ranks by high (0) to low (N-1)

        Returns
        -------
        rank : `Scores`

        """

        # pandas ranks start at 1; shift to 0-based ranks
        ranked = self.copy()
        ranked.dataframe_ = -1 + self.dataframe_.rank(axis=1,
                                                      ascending=ascending)
        ranked.hasChanged_ = True
        return ranked

    def nbest(self, n: int, ascending: bool = False):
        """

        Parameters
        ----------
        n : int
            Size of n-best list
        ascending : boolean, default False
            False for ranks by high (0) to low (N-1)

        Returns
        -------
        nbest : `Scores`
            New scores where only n-best are kept.

        """

        filtered = self.copy()
        ranked_ = -1 + self.dataframe_.rank(axis=1, ascending=ascending)
        # BUG FIX: np.NaN was removed in NumPy 2.0; np.nan is the
        # canonical spelling.
        filtered.dataframe_ = filtered.dataframe_.where(ranked_ < n,
                                                        other=np.nan)
        filtered.hasChanged_ = True
        return filtered

    def subset(self, labels: Set[Label], invert: bool = False):
        """Scores subset

        Extract scores subset based on labels

        Parameters
        ----------
        labels : set
            Set of labels
        invert : bool, optional
            If invert is True, extract all but requested `labels`

        Returns
        -------
        subset : `Scores`
            Scores subset.
        """

        self._reindexIfNeeded()

        if not isinstance(labels, set):
            raise TypeError('labels must be provided as a set of labels.')

        if invert:
            labels = set(self.labels()) - labels
        else:
            labels = labels & set(self.labels())

        # NOTE(review): iterating a set gives no guaranteed column order
        # in the resulting subset -- confirm callers do not rely on it.
        subset = Scores(uri=self.uri, modality=self.modality)
        subset.annotation_ = self.annotation_
        subset.dataframe_ = self.dataframe_[list(labels)]

        return subset

    def to_annotation(self,
                      threshold: float = -np.inf,
                      posterior: bool = False):
        """

        Parameters
        ----------
        threshold : float, optional
            Each track is annotated with the label with the highest score.
            Yet, if the latter is smaller than `threshold`, label is replaced
            with an `Unknown` instance.
        posterior : bool, optional
            If True, scores are posterior probabilities in open-set
            identification. If top model posterior is higher than unknown
            posterior, it is selected. Otherwise, label is replaced with an
            `Unknown` instance.
        """

        if not self:
            return Annotation(uri=self.uri, modality=self.modality)

        best = self.nbest(1, ascending=False)
        large_enough = best.copy()

        if posterior:
            # probability mass not assigned to any known label
            unknown_posterior = 1. - self.dataframe_.sum(axis=1)

            large_enough.dataframe_ = (((best.dataframe_.T > unknown_posterior)
                                        & (best.dataframe_.T > threshold)).T)

        else:

            large_enough.dataframe_ = ((best.dataframe_.T > threshold).T)

        # BUG FIX: np.NaN -> np.nan (np.NaN removed in NumPy 2.0)
        large_enough.dataframe_.where(best.dataframe_.notnull(),
                                      inplace=True,
                                      other=np.nan)

        annotation = Annotation(uri=self.uri, modality=self.modality)
        for segment, track, label, value in large_enough.itervalues():
            label = label if value else Unknown()
            annotation[segment, track] = label

        return annotation

    def map(self, func: Callable):
        """Apply function to all values"""

        # NOTE(review): DataFrame.applymap is deprecated in pandas >= 2.1
        # (renamed DataFrame.map) -- kept for older-pandas compatibility.
        mapped = self.copy()
        mapped.dataframe_ = self.dataframe_.applymap(func)
        mapped.hasChanged_ = True
        return mapped

    def crop(self, focus: Support, mode: str = 'strict') -> Support:
        """Crop on focus

        Parameters
        ----------
        focus : `Segment` or `Timeline`

        mode : {'strict', 'loose', 'intersection'}
            In 'strict' mode, only segments fully included in focus coverage
            are kept. In 'loose' mode, any intersecting segment is kept
            unchanged. In 'intersection' mode, only intersecting segments are
            kept and replaced by their actual intersection with the focus.

        Returns
        -------
        cropped : same type as caller
            Cropped version of the caller containing only tracks matching
            the provided focus and mode.

        Remarks
        -------
        In 'intersection' mode, the best is done to keep the track names
        unchanged. However, in some cases where two original segments are
        cropped into the same resulting segments, conflicting track names are
        modified to make sure no track is lost.

        """

        if isinstance(focus, Segment):
            return self.crop(Timeline([focus], uri=self.uri), mode=mode)

        self._reindexIfNeeded()
        cropped = self.copy()

        if mode in ['strict', 'loose']:

            new_annotation = self.annotation_.crop(focus, mode=mode)
            keep = [
                new_annotation.has_track(segment, track)
                for segment, track in self.itertracks()
            ]
            cropped.dataframe_ = self.dataframe_[keep]
            cropped.annotation_ = new_annotation
            cropped.hasChanged_ = True

            return cropped

        elif mode in ['intersection']:

            raise NotImplementedError('')

            # # two original segments might be cropped into the same resulting
            # # segment -- therefore, we keep track of the mapping
            # intersection, mapping = timeline.crop(coverage,
            #                                       mode=mode, mapping=True)
            #
            # # create new empty annotation
            # A = self.__class__(uri=self.uri, modality=self.modality)
            #
            # for cropped in intersection:
            #     for original in mapping[cropped]:
            #         for track in self.tracks(original):
            #             # try to use original track name (candidate)
            #             # if it already exists, create a brand new one
            #             new_track = A.new_track(cropped, candidate=track)
            #             # copy each value, column by column
            #             for label in self.dataframe_.columns:
            #                 value = self.dataframe_.get_value((original, track),
            #                                            label)
            #                 A.dataframe_ = A.dataframe_.set_value((cropped, new_track),
            #                                         label, value)
            #
            # return A

    def __str__(self):
        """Human-friendly representation"""
        if self:
            self._reindexIfNeeded()
            return str(self.dataframe_)
        else:
            return ""

    def _repr_png_(self):
        """IPython rich display hook (PNG rendering)."""
        from .notebook import repr_scores
        return repr_scores(self)
Example #37
0
# Demonstrates Series/DataFrame reindexing with forward-fill.

ser4 = Series(['USA', 'Mexico', 'Canada'], index=[0, 5, 10])
ranger = range(15)
# method='ffill' propagates each value forward onto the new labels.
ser5 = ser4.reindex(ranger, method='ffill')
print("\n Series 4")
print(ser4)
print(ser5)

# reindex dataframe (row 'C' is missing, so it becomes all-NaN below)
dframe = DataFrame(randn(25).reshape(5, 5),
                   index=['A', 'B', 'D', 'E', 'F'],
                   columns=['c1', 'c2', 'c3', 'c4', 'c5'])
# BUG FIX: the original `print(print(...))` printed the header and then a
# spurious "None" (print returns None); a single print is intended.
print("\n DFrame ")
print(dframe)

print("\n DFrame2 ")
dframe2 = dframe.reindex(['A', 'B', 'C', 'D', 'E', 'F'])
print(dframe2)

# reindex columns (no overlap with the old names -> all-NaN frame)
new_columns = ['col1', 'col2', 'col3', 'col4', 'col5', 'col6']
dframe3 = dframe2.reindex(columns=new_columns)
print("\n DFrame3 ")
print(dframe3)

# reindex never modifies in place; the original frame is unchanged
print("\n DFrame ")
print(dframe)

# NOTE: the old `.ix` indexer was removed from pandas; use
# .loc / .reindex to combine row and column selection instead.
Example #38
0
 def _apply_to_df(self, el: pd.DataFrame, *, axis, fill_value=None):
     """Reindex `el` along `axis` onto `self.new_idx`.

     Entries absent from `el` are filled with `fill_value`
     (NaN when `fill_value` is None).
     """
     if fill_value is None:
         # BUG FIX: np.NaN was removed in NumPy 2.0; np.nan is the
         # canonical spelling and works on all NumPy versions.
         fill_value = np.nan
     return el.reindex(self.new_idx, axis=axis, fill_value=fill_value)
Example #39
0
# NOTE(review): REPL-style transcript; `dup_labels` is defined earlier in
# the file. Index.append returns a NEW index -- the result of the next
# line is discarded, it does not mutate dup_labels.
dup_labels.append(pd.Index(['add']))
dup_labels
dup_labels.is_unique

# reindex a Series: values follow the new label order; labels missing
# from the original index ('e') become NaN
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3
obj3.reindex(range(6), method='ffill') #forward fill the values
frame = DataFrame(np.arange(9).reshape(3,3),
                  index = ['a', 'c', 'd'],
                  columns = ['Ohio', 'Texas', 'California'])
frame
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)
# NOTE(review): .loc with labels missing from the axis ('b', 'Utah')
# raises KeyError in pandas >= 1.0 -- these two lines only work on the
# older pandas this transcript was written for; use .reindex instead.
frame.loc[['a', 'b', 'c', 'd'], states]
frame.loc[:,states]

# drop returns a new object with the given labels removed
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
obj
new_obj = obj.drop('c')
new_obj
obj.drop(['d', 'c'])

data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
Example #40
0
## Index objects
frame2.index.name='index'
frame2.columns.name='property'
# Index objects are immutable so they can be shared safely.
# BUG FIX: the next line raised "TypeError: Index does not support
# mutable operations" and aborted the script; kept only as a disabled
# demonstration of that immutability.
# frame2.index[2]='t'
'gpa' in frame2.columns
'fou' in frame2.index


# Essential functionality
## Reindexing
obj2
obj2.reindex(['a','c','d','t']) # returns a new object; obj2 is unchanged
obj2.reindex(['a','b','c','d','t'],fill_value=0) # fill missing values; index must be monotonic increasing or decreasing

frame2
frame2.reindex(columns=['name','gpa','year','university'])

frame2.drop(['one','two'],axis=0) # drop rows: axis=0 because row labels live on axis 0

arr3.shape
np.mean(arr3,axis=1)  # row-wise mean

frame3=DataFrame(arr3)
frame3.mean(axis=1)  # row-wise mean

frame2['one':'four']
frame2['gpa']
# BUG FIX: np.float was removed from NumPy (1.24+); the builtin `float`
# is the documented replacement.
frame2[frame2['gpa'].astype(float)>3] # convert dtype, then filter by condition

# Arithmetic and data alignment
Example #41
0
    def test_reindex_with_multi_index(self):
        # https://github.com/pandas-dev/pandas/issues/29896
        # tests for reindexing a multi-indexed DataFrame with a new MultiIndex
        #
        # confirms that we can reindex a multi-indexed DataFrame with a new
        # MultiIndex object correctly when using no filling, backfilling, and
        # padding
        #
        # The DataFrame, `df`, used in this test is:
        #       c
        #  a b
        # -1 0  A
        #    1  B
        #    2  C
        #    3  D
        #    4  E
        #    5  F
        #    6  G
        #  0 0  A
        #    1  B
        #    2  C
        #    3  D
        #    4  E
        #    5  F
        #    6  G
        #  1 0  A
        #    1  B
        #    2  C
        #    3  D
        #    4  E
        #    5  F
        #    6  G
        #
        # and the other MultiIndex, `new_multi_index`, is:
        # 0: 0 0.5
        # 1:   2.0
        # 2:   5.0
        # 3:   5.8
        df = DataFrame(
            {
                "a": [-1] * 7 + [0] * 7 + [1] * 7,
                "b": list(range(7)) * 3,
                "c": ["A", "B", "C", "D", "E", "F", "G"] * 3,
            }
        ).set_index(["a", "b"])
        new_index = [0.5, 2.0, 5.0, 5.8]
        new_multi_index = MultiIndex.from_product([[0], new_index], names=["a", "b"])

        # reindexing w/o a `method` value
        reindexed = df.reindex(new_multi_index)
        expected = DataFrame(
            {"a": [0] * 4, "b": new_index, "c": [np.nan, "C", "F", np.nan]}
        ).set_index(["a", "b"])
        tm.assert_frame_equal(expected, reindexed)

        # reindexing with backfilling
        expected = DataFrame(
            {"a": [0] * 4, "b": new_index, "c": ["B", "C", "F", "G"]}
        ).set_index(["a", "b"])
        reindexed_with_backfilling = df.reindex(new_multi_index, method="bfill")
        tm.assert_frame_equal(expected, reindexed_with_backfilling)

        reindexed_with_backfilling = df.reindex(new_multi_index, method="backfill")
        tm.assert_frame_equal(expected, reindexed_with_backfilling)

        # reindexing with padding
        expected = DataFrame(
            {"a": [0] * 4, "b": new_index, "c": ["A", "C", "F", "F"]}
        ).set_index(["a", "b"])
        reindexed_with_padding = df.reindex(new_multi_index, method="pad")
        tm.assert_frame_equal(expected, reindexed_with_padding)

        reindexed_with_padding = df.reindex(new_multi_index, method="ffill")
        tm.assert_frame_equal(expected, reindexed_with_padding)
Example #42
0
# Forward-fill values when reindexing onto the denser range(6) index
# (obj3 is defined earlier in the file with index [0, 2, 4]).
obj3.reindex(range(6),method='ffill')


# In[86]:

# 3x3 frame with a gap in the row labels at 'b'.
frame=DataFrame(np.arange(9).reshape((3,3)),index=['a','c','d'],columns=['Beijing','Shanghai','Tongling'])


# In[87]:

frame


# In[88]:

# Row label 'b' does not exist in `frame`, so frame2 gets an all-NaN row.
frame2=frame.reindex(['a','b','c','d'])


# In[89]:

frame2


# In[90]:

states=['Tongling','Shenzheng','Beijing']


# In[91]:

# Reindex the columns; 'Shenzheng' is new, so that column is all-NaN.
frame.reindex(columns=states)
Example #43
0
                index=['Ohio', 'Texas', 'Colorado'])
# Second operand; df1 (constructed just above this block) has different row
# labels, so arithmetic aligns on the union of labels.
df2 = DataFrame(np.arange(12).reshape((4, 3)),
                columns=list('bde'),
                index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(df1)
print(df2)
# Addition aligns on both index and columns; non-overlapping cells are NaN.
print(df1 + df2)
print()

print("## Fill values:")
df1 = DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))
df2 = DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))
print(df1)
print(df2)
print(df1.add(df2, fill_value=0))
print(df1.reindex(columns=df2.columns, fill_value=0))
print()

print("## Operate between Series and DataFrame:")
arr = np.arange(12.).reshape((3, 4))
print(arr)
print(arr[0])
print(arr - arr[0])
frame = DataFrame(np.arange(12).reshape((4, 3)),
                  columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.ix[0]
print("frame:")
print(frame)
print("series:")
print(series)
Example #44
0
class TestJoin:
    """Tests for DataFrame.join / pd.merge across join types.

    Fixtures built in setup_method:
      * self.df / self.df2 — key1/key2 frames for merge-based joins.
      * self.target / self.source — frames for label-based ``join(on=...)``.

    NOTE(review): relies on module-level helpers ``get_test_data``,
    ``_check_join`` and ``_join_by_hand`` and on ``tm.getMixedTypeDict`` /
    ``tm.makeCustomIndex``, which were removed from pandas._testing in
    pandas 2.x — confirm the targeted pandas version.
    """

    def setup_method(self, method):
        # aggregate multiple columns
        self.df = DataFrame({
            "key1": get_test_data(),
            "key2": get_test_data(),
            "data1": np.random.randn(N),
            "data2": np.random.randn(N),
        })

        # exclude a couple keys for fun
        self.df = self.df[self.df["key2"] > 1]

        self.df2 = DataFrame({
            "key1":
            get_test_data(n=N // 5),
            "key2":
            get_test_data(ngroups=NGROUPS // 2, n=N // 5),
            "value":
            np.random.randn(N // 5),
        })

        index, data = tm.getMixedTypeDict()
        self.target = DataFrame(data, index=index)

        # Join on string value
        self.source = DataFrame({
            "MergedA": data["A"],
            "MergedD": data["D"]
        },
                                index=data["C"])

    def test_left_outer_join(self):
        # Merge on one key and on all shared keys; verify left-join semantics.
        joined_key2 = merge(self.df, self.df2, on="key2")
        _check_join(self.df, self.df2, joined_key2, ["key2"], how="left")

        joined_both = merge(self.df, self.df2)
        _check_join(self.df,
                    self.df2,
                    joined_both, ["key1", "key2"],
                    how="left")

    def test_right_outer_join(self):
        joined_key2 = merge(self.df, self.df2, on="key2", how="right")
        _check_join(self.df, self.df2, joined_key2, ["key2"], how="right")

        joined_both = merge(self.df, self.df2, how="right")
        _check_join(self.df,
                    self.df2,
                    joined_both, ["key1", "key2"],
                    how="right")

    def test_full_outer_join(self):
        joined_key2 = merge(self.df, self.df2, on="key2", how="outer")
        _check_join(self.df, self.df2, joined_key2, ["key2"], how="outer")

        joined_both = merge(self.df, self.df2, how="outer")
        _check_join(self.df,
                    self.df2,
                    joined_both, ["key1", "key2"],
                    how="outer")

    def test_inner_join(self):
        joined_key2 = merge(self.df, self.df2, on="key2", how="inner")
        _check_join(self.df, self.df2, joined_key2, ["key2"], how="inner")

        joined_both = merge(self.df, self.df2, how="inner")
        _check_join(self.df,
                    self.df2,
                    joined_both, ["key1", "key2"],
                    how="inner")

    def test_handle_overlap(self):
        # Overlapping non-key column names get the provided suffixes.
        joined = merge(self.df, self.df2, on="key2", suffixes=(".foo", ".bar"))

        assert "key1.foo" in joined
        assert "key1.bar" in joined

    def test_handle_overlap_arbitrary_key(self):
        # Suffixes apply even when the overlapping columns are join keys
        # on only one side.
        joined = merge(
            self.df,
            self.df2,
            left_on="key2",
            right_on="key1",
            suffixes=(".foo", ".bar"),
        )
        assert "key1.foo" in joined
        assert "key2.bar" in joined

    def test_join_on(self):
        target = self.target
        source = self.source

        # join(on="C") looks up source's index by target's "C" column.
        merged = target.join(source, on="C")
        tm.assert_series_equal(merged["MergedA"],
                               target["A"],
                               check_names=False)
        tm.assert_series_equal(merged["MergedD"],
                               target["D"],
                               check_names=False)

        # join with duplicates (fix regression from DataFrame/Matrix merge)
        df = DataFrame({"key": ["a", "a", "b", "b", "c"]})
        df2 = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"])
        joined = df.join(df2, on="key")
        expected = DataFrame({
            "key": ["a", "a", "b", "b", "c"],
            "value": [0, 0, 1, 1, 2]
        })
        tm.assert_frame_equal(joined, expected)

        # Test when some are missing
        df_a = DataFrame([[1], [2], [3]],
                         index=["a", "b", "c"],
                         columns=["one"])
        df_b = DataFrame([["foo"], ["bar"]], index=[1, 2], columns=["two"])
        df_c = DataFrame([[1], [2]], index=[1, 2], columns=["three"])
        joined = df_a.join(df_b, on="one")
        joined = joined.join(df_c, on="one")
        assert np.isnan(joined["two"]["c"])
        assert np.isnan(joined["three"]["c"])

        # merge column not present
        with pytest.raises(KeyError, match="^'E'$"):
            target.join(source, on="E")

        # overlap: joining float keys against an object-typed index fails.
        source_copy = source.copy()
        source_copy["A"] = 0
        msg = ("You are trying to merge on float64 and object columns. If "
               "you wish to proceed you should use pd.concat")
        with pytest.raises(ValueError, match=msg):
            target.join(source_copy, on="A")

    def test_join_on_fails_with_different_right_index(self):
        # A single left_on key cannot match a 2-level right index.
        df = DataFrame({
            "a": np.random.choice(["m", "f"], size=3),
            "b": np.random.randn(3)
        })
        df2 = DataFrame(
            {
                "a": np.random.choice(["m", "f"], size=10),
                "b": np.random.randn(10)
            },
            index=tm.makeCustomIndex(10, 2),
        )
        msg = r'len\(left_on\) must equal the number of levels in the index of "right"'
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, left_on="a", right_index=True)

    def test_join_on_fails_with_different_left_index(self):
        # Mirror case: single right_on key vs. 2-level left index.
        df = DataFrame(
            {
                "a": np.random.choice(["m", "f"], size=3),
                "b": np.random.randn(3)
            },
            index=tm.makeCustomIndex(3, 2),
        )
        df2 = DataFrame({
            "a": np.random.choice(["m", "f"], size=10),
            "b": np.random.randn(10)
        })
        msg = r'len\(right_on\) must equal the number of levels in the index of "left"'
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, right_on="b", left_index=True)

    def test_join_on_fails_with_different_column_counts(self):
        # left_on and right_on must name the same number of columns.
        df = DataFrame({
            "a": np.random.choice(["m", "f"], size=3),
            "b": np.random.randn(3)
        })
        df2 = DataFrame(
            {
                "a": np.random.choice(["m", "f"], size=10),
                "b": np.random.randn(10)
            },
            index=tm.makeCustomIndex(10, 2),
        )
        msg = r"len\(right_on\) must equal len\(left_on\)"
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, right_on="a", left_on=["a", "b"])

    @pytest.mark.parametrize("wrong_type", [2, "str", None, np.array([0, 1])])
    def test_join_on_fails_with_wrong_object_type(self, wrong_type):
        # GH12081 - original issue

        # GH21220 - merging of Series and DataFrame is now allowed
        # Edited test to remove the Series object from test parameters

        df = DataFrame({"a": [1, 1]})
        msg = ("Can only merge Series or DataFrame objects, "
               f"a {type(wrong_type)} was passed")
        with pytest.raises(TypeError, match=msg):
            merge(wrong_type, df, left_on="a", right_on="a")
        with pytest.raises(TypeError, match=msg):
            merge(df, wrong_type, left_on="a", right_on="a")

    def test_join_on_pass_vector(self):
        # Passing the key column itself (as a Series) should behave like
        # joining on the column name, minus the column.
        expected = self.target.join(self.source, on="C")
        del expected["C"]

        join_col = self.target.pop("C")
        result = self.target.join(self.source, on=join_col)
        tm.assert_frame_equal(result, expected)

    def test_join_with_len0(self):
        # nothing to merge: joining an empty right frame keeps columns,
        # fills all-NaN on the left join, and yields 0 rows on inner.
        merged = self.target.join(self.source.reindex([]), on="C")
        for col in self.source:
            assert col in merged
            assert merged[col].isna().all()

        merged2 = self.target.join(self.source.reindex([]),
                                   on="C",
                                   how="inner")
        tm.assert_index_equal(merged2.columns, merged.columns)
        assert len(merged2) == 0

    def test_join_on_inner(self):
        df = DataFrame({"key": ["a", "a", "d", "b", "b", "c"]})
        df2 = DataFrame({"value": [0, 1]}, index=["a", "b"])

        joined = df.join(df2, on="key", how="inner")

        # Inner join == left join restricted to rows whose key matched.
        expected = df.join(df2, on="key")
        expected = expected[expected["value"].notna()]
        tm.assert_series_equal(joined["key"], expected["key"])
        tm.assert_series_equal(joined["value"],
                               expected["value"],
                               check_dtype=False)
        tm.assert_index_equal(joined.index, expected.index)

    def test_join_on_singlekey_list(self):
        df = DataFrame({"key": ["a", "a", "b", "b", "c"]})
        df2 = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"])

        # corner cases: on=["key"] must equal on="key"
        joined = df.join(df2, on=["key"])
        expected = df.join(df2, on="key")

        tm.assert_frame_equal(joined, expected)

    def test_join_on_series(self):
        # Joining a Series behaves like joining a one-column DataFrame.
        result = self.target.join(self.source["MergedA"], on="C")
        expected = self.target.join(self.source[["MergedA"]], on="C")
        tm.assert_frame_equal(result, expected)

    def test_join_on_series_buglet(self):
        # GH #638
        df = DataFrame({"a": [1, 1]})
        ds = Series([2], index=[1], name="b")
        result = df.join(ds, on="a")
        expected = DataFrame({"a": [1, 1], "b": [2, 2]}, index=df.index)
        tm.assert_frame_equal(result, expected)

    def test_join_index_mixed(self, join_type):
        # no overlapping blocks
        df1 = DataFrame(index=np.arange(10))
        df1["bool"] = True
        df1["string"] = "foo"

        df2 = DataFrame(index=np.arange(5, 15))
        df2["int"] = 1
        df2["float"] = 1.0

        # Compare both join directions against the hand-built reference.
        joined = df1.join(df2, how=join_type)
        expected = _join_by_hand(df1, df2, how=join_type)
        tm.assert_frame_equal(joined, expected)

        joined = df2.join(df1, how=join_type)
        expected = _join_by_hand(df2, df1, how=join_type)
        tm.assert_frame_equal(joined, expected)

    def test_join_index_mixed_overlap(self):
        df1 = DataFrame(
            {
                "A": 1.0,
                "B": 2,
                "C": "foo",
                "D": True
            },
            index=np.arange(10),
            columns=["A", "B", "C", "D"],
        )
        assert df1["B"].dtype == np.int64
        assert df1["D"].dtype == np.bool_

        df2 = DataFrame(
            {
                "A": 1.0,
                "B": 2,
                "C": "foo",
                "D": True
            },
            index=np.arange(0, 10, 2),
            columns=["A", "B", "C", "D"],
        )

        # overlap: every column name clashes, so both suffixes apply.
        joined = df1.join(df2, lsuffix="_one", rsuffix="_two")
        expected_columns = [
            "A_one",
            "B_one",
            "C_one",
            "D_one",
            "A_two",
            "B_two",
            "C_two",
            "D_two",
        ]
        df1.columns = expected_columns[:4]
        df2.columns = expected_columns[4:]
        expected = _join_by_hand(df1, df2)
        tm.assert_frame_equal(joined, expected)

    def test_join_empty_bug(self):
        # generated an exception in 0.4.3
        x = DataFrame()
        x.join(DataFrame([3], index=[0], columns=["A"]), how="outer")

    def test_join_unconsolidated(self):
        # GH #331
        a = DataFrame(np.random.randn(30, 2), columns=["a", "b"])
        c = Series(np.random.randn(30))
        a["c"] = c
        d = DataFrame(np.random.randn(30, 1), columns=["q"])

        # it works!
        a.join(d)
        d.join(a)

    def test_join_multiindex(self):
        index1 = MultiIndex.from_arrays(
            [["a", "a", "a", "b", "b", "b"], [1, 2, 3, 1, 2, 3]],
            names=["first", "second"],
        )

        index2 = MultiIndex.from_arrays(
            [["b", "b", "b", "c", "c", "c"], [1, 2, 3, 1, 2, 3]],
            names=["first", "second"],
        )

        df1 = DataFrame(data=np.random.randn(6),
                        index=index1,
                        columns=["var X"])
        df2 = DataFrame(data=np.random.randn(6),
                        index=index2,
                        columns=["var Y"])

        df1 = df1.sort_index(level=0)
        df2 = df2.sort_index(level=0)

        # Outer join equals reindexing both frames onto the union of indexes.
        joined = df1.join(df2, how="outer")
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names
        tm.assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

        # Same check with frames sorted by the second index level.
        df1 = df1.sort_index(level=1)
        df2 = df2.sort_index(level=1)

        joined = df1.join(df2, how="outer").sort_index(level=0)
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names

        tm.assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

    def test_join_inner_multiindex(self):
        key1 = [
            "bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux",
            "snap"
        ]
        key2 = [
            "two",
            "one",
            "three",
            "one",
            "two",
            "one",
            "two",
            "two",
            "three",
            "one",
        ]

        data = np.random.randn(len(key1))
        data = DataFrame({"key1": key1, "key2": key2, "data": data})

        index = MultiIndex(
            levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
            codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
            names=["first", "second"],
        )
        to_join = DataFrame(np.random.randn(10, 3),
                            index=index,
                            columns=["j_one", "j_two", "j_three"])

        # join(on=[...]) against a MultiIndex must match the equivalent merge.
        joined = data.join(to_join, on=["key1", "key2"], how="inner")
        expected = merge(
            data,
            to_join.reset_index(),
            left_on=["key1", "key2"],
            right_on=["first", "second"],
            how="inner",
            sort=False,
        )

        expected2 = merge(
            to_join,
            data,
            right_on=["key1", "key2"],
            left_index=True,
            how="inner",
            sort=False,
        )
        tm.assert_frame_equal(joined, expected2.reindex_like(joined))

        expected2 = merge(
            to_join,
            data,
            right_on=["key1", "key2"],
            left_index=True,
            how="inner",
            sort=False,
        )

        expected = expected.drop(["first", "second"], axis=1)
        expected.index = joined.index

        # BUG FIX: Index.is_monotonic was deprecated (and later removed) in
        # favor of the long-available is_monotonic_increasing.
        assert joined.index.is_monotonic_increasing
        tm.assert_frame_equal(joined, expected)

        # _assert_same_contents(expected, expected2.loc[:, expected.columns])

    def test_join_hierarchical_mixed(self):
        # GH 2024
        df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=["a", "b", "c"])
        new_df = df.groupby(["a"]).agg({"b": [np.mean, np.sum]})
        other_df = DataFrame([(1, 2, 3), (7, 10, 6)], columns=["a", "b", "d"])
        other_df.set_index("a", inplace=True)
        # GH 9455, 12219
        with tm.assert_produces_warning(UserWarning):
            result = merge(new_df, other_df, left_index=True, right_index=True)
        assert ("b", "mean") in result
        assert "b" in result

    def test_join_float64_float32(self):
        # Joining must preserve each column's original dtype.
        a = DataFrame(np.random.randn(10, 2),
                      columns=["a", "b"],
                      dtype=np.float64)
        b = DataFrame(np.random.randn(10, 1), columns=["c"], dtype=np.float32)
        joined = a.join(b)
        assert joined.dtypes["a"] == "float64"
        assert joined.dtypes["b"] == "float64"
        assert joined.dtypes["c"] == "float32"

        a = np.random.randint(0, 5, 100).astype("int64")
        b = np.random.random(100).astype("float64")
        c = np.random.random(100).astype("float32")
        df = DataFrame({"a": a, "b": b, "c": c})
        xpdf = DataFrame({"a": a, "b": b, "c": c})
        s = DataFrame(np.random.random(5).astype("float32"), columns=["md"])
        rs = df.merge(s, left_on="a", right_index=True)
        assert rs.dtypes["a"] == "int64"
        assert rs.dtypes["b"] == "float64"
        assert rs.dtypes["c"] == "float32"
        assert rs.dtypes["md"] == "float32"

        xp = xpdf.merge(s, left_on="a", right_index=True)
        tm.assert_frame_equal(rs, xp)

    def test_join_many_non_unique_index(self):
        df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
        df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
        df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])

        # Multi-frame join on non-unique index must match chained merges.
        result = idf1.join([idf2, idf3], how="outer")

        df_partially_merged = merge(df1, df2, on=["a", "b"], how="outer")
        expected = merge(df_partially_merged, df3, on=["a", "b"], how="outer")

        result = result.reset_index()
        expected = expected[result.columns]
        expected["a"] = expected.a.astype("int64")
        expected["b"] = expected.b.astype("int64")
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
        df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
        df3 = DataFrame({
            "a": [1, 1, 1],
            "b": [1, 1, 2],
            "e": [1000, 2000, 3000]
        })
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])
        result = idf1.join([idf2, idf3], how="inner")

        df_partially_merged = merge(df1, df2, on=["a", "b"], how="inner")
        expected = merge(df_partially_merged, df3, on=["a", "b"], how="inner")

        result = result.reset_index()

        tm.assert_frame_equal(result, expected.loc[:, result.columns])

        # GH 11519
        df = DataFrame({
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C":
            np.random.randn(8),
            "D":
            np.random.randn(8),
        })
        s = Series(np.repeat(np.arange(8), 2),
                   index=np.repeat(np.arange(8), 2),
                   name="TEST")
        # With every key present on both sides, all join types coincide.
        inner = df.join(s, how="inner")
        outer = df.join(s, how="outer")
        left = df.join(s, how="left")
        right = df.join(s, how="right")
        tm.assert_frame_equal(inner, outer)
        tm.assert_frame_equal(inner, left)
        tm.assert_frame_equal(inner, right)

    def test_join_sort(self):
        left = DataFrame({
            "key": ["foo", "bar", "baz", "foo"],
            "value": [1, 2, 3, 4]
        })
        right = DataFrame({"value2": ["a", "b", "c"]},
                          index=["bar", "baz", "foo"])

        # sort=True orders the result by the join key, keeping original
        # positions as the (now shuffled) integer index.
        joined = left.join(right, on="key", sort=True)
        expected = DataFrame(
            {
                "key": ["bar", "baz", "foo", "foo"],
                "value": [2, 3, 1, 4],
                "value2": ["a", "b", "c", "c"],
            },
            index=[1, 2, 0, 3],
        )
        tm.assert_frame_equal(joined, expected)

        # smoke test
        joined = left.join(right, on="key", sort=False)
        tm.assert_index_equal(joined.index, Index(range(4)), exact=True)

    def test_join_mixed_non_unique_index(self):
        # GH 12814, unorderable types in py3 with a non-unique index
        df1 = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 3, "a"])
        df2 = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 3, 3, 4])
        result = df1.join(df2)
        expected = DataFrame(
            {
                "a": [1, 2, 3, 3, 4],
                "b": [5, np.nan, 6, 7, np.nan]
            },
            index=[1, 2, 3, 3, "a"],
        )
        tm.assert_frame_equal(result, expected)

        df3 = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 2, "a"])
        df4 = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 2, 3, 4])
        result = df3.join(df4)
        expected = DataFrame({
            "a": [1, 2, 3, 4],
            "b": [5, 6, 6, np.nan]
        },
                             index=[1, 2, 2, "a"])
        tm.assert_frame_equal(result, expected)

    def test_join_non_unique_period_index(self):
        # GH #16871
        index = pd.period_range("2016-01-01", periods=16, freq="M")
        df = DataFrame(list(range(len(index))), index=index, columns=["pnum"])
        df2 = concat([df, df])
        result = df.join(df2, how="inner", rsuffix="_df2")
        expected = DataFrame(
            np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2),
            columns=["pnum", "pnum_df2"],
            index=df2.sort_index().index,
        )
        tm.assert_frame_equal(result, expected)

    def test_mixed_type_join_with_suffix(self):
        # GH #916
        df = DataFrame(np.random.randn(20, 6),
                       columns=["a", "b", "c", "d", "e", "f"])
        df.insert(0, "id", 0)
        df.insert(5, "dt", "foo")

        grouped = df.groupby("id")
        mn = grouped.mean()
        cn = grouped.count()

        # it works!
        mn.join(cn, rsuffix="_right")

    def test_join_many(self):
        df = DataFrame(np.random.randn(10, 6), columns=list("abcdef"))
        df_list = [df[["a", "b"]], df[["c", "d"]], df[["e", "f"]]]

        # Joining disjoint column slices reassembles the original frame.
        joined = df_list[0].join(df_list[1:])
        tm.assert_frame_equal(joined, df)

        df_list = [
            df[["a", "b"]][:-2], df[["c", "d"]][2:], df[["e", "f"]][1:9]
        ]

        def _check_diff_index(df_list, result, exp_index):
            # Reference: reindex every piece onto exp_index and join those.
            reindexed = [x.reindex(exp_index) for x in df_list]
            expected = reindexed[0].join(reindexed[1:])
            tm.assert_frame_equal(result, expected)

        # different join types
        joined = df_list[0].join(df_list[1:], how="outer")
        _check_diff_index(df_list, joined, df.index)

        joined = df_list[0].join(df_list[1:])
        _check_diff_index(df_list, joined, df_list[0].index)

        joined = df_list[0].join(df_list[1:], how="inner")
        _check_diff_index(df_list, joined, df.index[2:8])

        msg = "Joining multiple DataFrames only supported for joining on index"
        with pytest.raises(ValueError, match=msg):
            df_list[0].join(df_list[1:], on="a")

    def test_join_many_mixed(self):
        df = DataFrame(np.random.randn(8, 4), columns=["A", "B", "C", "D"])
        df["key"] = ["foo", "bar"] * 4
        df1 = df.loc[:, ["A", "B"]]
        df2 = df.loc[:, ["C", "D"]]
        df3 = df.loc[:, ["key"]]

        result = df1.join([df2, df3])
        tm.assert_frame_equal(result, df)

    def test_join_dups(self):

        # joining dups
        df = concat(
            [
                DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"
                                                           ]),
                DataFrame(np.random.randint(0, 10, size=20).reshape(10, 2),
                          columns=["A", "C"]),
            ],
            axis=1,
        )

        expected = concat([df, df], axis=1)
        result = df.join(df, rsuffix="_2")
        result.columns = expected.columns
        tm.assert_frame_equal(result, expected)

        # GH 4975, invalid join on dups
        w = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        x = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        y = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        z = DataFrame(np.random.randn(4, 2), columns=["x", "y"])

        dta = x.merge(y, left_index=True,
                      right_index=True).merge(z,
                                              left_index=True,
                                              right_index=True,
                                              how="outer")
        dta = dta.merge(w, left_index=True, right_index=True)
        expected = concat([x, y, z, w], axis=1)
        expected.columns = [
            "x_x", "y_x", "x_y", "y_y", "x_x", "y_x", "x_y", "y_y"
        ]
        tm.assert_frame_equal(dta, expected)

    def test_join_multi_to_multi(self, join_type):
        # GH 20475
        leftindex = MultiIndex.from_product(
            [list("abc"), list("xy"), [1, 2]], names=["abc", "xy", "num"])
        left = DataFrame({"v1": range(12)}, index=leftindex)

        rightindex = MultiIndex.from_product(
            [list("abc"), list("xy")], names=["abc", "xy"])
        right = DataFrame({"v2": [100 * i for i in range(1, 7)]},
                          index=rightindex)

        result = left.join(right, on=["abc", "xy"], how=join_type)
        expected = (left.reset_index().merge(right.reset_index(),
                                             on=["abc", "xy"],
                                             how=join_type).set_index(
                                                 ["abc", "xy", "num"]))
        tm.assert_frame_equal(expected, result)

        # Joining on fewer/more keys than the other side's index levels fails.
        msg = r'len\(left_on\) must equal the number of levels in the index of "right"'
        with pytest.raises(ValueError, match=msg):
            left.join(right, on="xy", how=join_type)

        with pytest.raises(ValueError, match=msg):
            right.join(left, on=["abc", "xy"], how=join_type)

    def test_join_on_tz_aware_datetimeindex(self):
        # GH 23931, 26335
        df1 = DataFrame({
            "date":
            pd.date_range(start="2018-01-01", periods=5, tz="America/Chicago"),
            "vals":
            list("abcde"),
        })

        df2 = DataFrame({
            "date":
            pd.date_range(start="2018-01-03", periods=5, tz="America/Chicago"),
            "vals_2":
            list("tuvwx"),
        })
        result = df1.join(df2.set_index("date"), on="date")
        expected = df1.copy()
        expected["vals_2"] = Series([np.nan] * 2 + list("tuv"), dtype=object)
        tm.assert_frame_equal(result, expected)

    def test_join_datetime_string(self):
        # GH 5647
        dfa = DataFrame(
            [
                ["2012-08-02", "L", 10],
                ["2012-08-02", "J", 15],
                ["2013-04-06", "L", 20],
                ["2013-04-06", "J", 25],
            ],
            columns=["x", "y", "a"],
        )
        dfa["x"] = pd.to_datetime(dfa["x"])
        dfb = DataFrame(
            [["2012-08-02", "J", 1], ["2013-04-06", "L", 2]],
            columns=["x", "y", "z"],
            index=[2, 4],
        )
        dfb["x"] = pd.to_datetime(dfb["x"])
        result = dfb.join(dfa.set_index(["x", "y"]), on=["x", "y"])
        expected = DataFrame(
            [
                [Timestamp("2012-08-02 00:00:00"), "J", 1, 15],
                [Timestamp("2013-04-06 00:00:00"), "L", 2, 20],
            ],
            index=[2, 4],
            columns=["x", "y", "z", "a"],
        )
        tm.assert_frame_equal(result, expected)
Example #45
0
    def test_reindex_axis_style_raises(self):
        # https://github.com/pandas-dev/pandas/issues/12392
        """Mixing positional labels with axis=/index=/columns= must raise."""
        frame = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

        # Each case is (positional args, keyword args, expected error text);
        # the order mirrors the original one-call-per-context checks.
        cases = [
            (([0, 1], ["A"]), {"axis": 1}, "Cannot specify both 'axis'"),
            (([0, 1], ["A"]), {"axis": "index"}, "Cannot specify both 'axis'"),
            ((), {"index": [0, 1], "axis": "index"}, "Cannot specify both 'axis'"),
            ((), {"index": [0, 1], "axis": "columns"}, "Cannot specify both 'axis'"),
            ((), {"columns": [0, 1], "axis": "columns"}, "Cannot specify both 'axis'"),
            ((), {"index": [0, 1], "columns": [0, 1], "axis": "columns"},
             "Cannot specify both 'axis'"),
            # All three label sets given positionally.
            (([0, 1], [0], ["A"]), {}, "Cannot specify all"),
            # Mixing styles.
            ((), {"index": [0, 1], "axis": "index"}, "Cannot specify both 'axis'"),
            ((), {"index": [0, 1], "axis": "columns"}, "Cannot specify both 'axis'"),
            # Duplicates: `labels` is also the first positional parameter.
            (([0, 1],), {"labels": [0, 1]}, "multiple values"),
        ]
        for args, kwargs, match in cases:
            with pytest.raises(TypeError, match=match):
                frame.reindex(*args, **kwargs)
Example #46
0
# Reindex onto a longer index, filling brand-new labels with 0 instead of
# NaN (serie_1 is defined earlier in the file).
serie_2 = serie_1.reindex(["a", "b", "c", "d", "e", "f", "g", "h"],
                          fill_value=0)
print(serie_2)

serie_3 = Series(["Santa Catarina", "Santo André", "Santo Antônio"],
                 index=[0, 5, 8])
print(serie_3)

index_range = range(15)

# Interpolation options when reindexing onto a denser index: carry values
# forward (ffill), backward (bfill), or take the nearest existing label.
serie_4 = serie_3.reindex(index_range, method="ffill")
print(serie_4)

serie_4 = serie_3.reindex(index_range, method="bfill")
print(serie_4)

serie_4 = serie_3.reindex(index_range, method="nearest")
print(serie_4)

data_frame = DataFrame(np.random.randn(25).reshape((5, 5)),
                       index=["a", "b", "d", "e", "f"],
                       columns=["col_1", "col_2", "col_3", "col_4", "col_5"])
print(data_frame)

# Column reindexing with a constant fill for the newly introduced "col_6".
data_frame_2 = data_frame.reindex(
    columns=["col_1", "col_2", "col_3", "col_4", "col_5", "col_6"],
    fill_value=5)
print(data_frame_2)
print()
Example #47
0
def test_observed(observed):
    """Check groupby(..., observed=...) with categorical groupers.

    When ``observed=False`` the result index is expanded to the full
    cartesian product of the categories (via the
    ``cartesian_product_for_groupers`` helper defined elsewhere in this
    file); when ``observed=True`` only combinations present in the data
    appear.
    """
    # multiple groupers, don't re-expand the output space
    # of the grouper
    # gh-14942 (implement)
    # gh-10132 (back-compat)
    # gh-8138 (back-compat)
    # gh-8869

    cat1 = Categorical(["a", "a", "b", "b"],
                       categories=["a", "b", "z"],
                       ordered=True)
    cat2 = Categorical(["c", "d", "c", "d"],
                       categories=["c", "d", "y"],
                       ordered=True)
    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
    df['C'] = ['foo', 'bar'] * 2

    # multiple groupers with a non-cat
    gb = df.groupby(['A', 'B', 'C'], observed=observed)
    exp_index = pd.MultiIndex.from_arrays([cat1, cat2, ['foo', 'bar'] * 2],
                                          names=['A', 'B', 'C'])
    expected = DataFrame({
        'values': Series([1, 2, 3, 4], index=exp_index)
    }).sort_index()
    result = gb.sum()
    if not observed:
        expected = cartesian_product_for_groupers(expected,
                                                  [cat1, cat2, ['foo', 'bar']],
                                                  list('ABC'))

    tm.assert_frame_equal(result, expected)

    gb = df.groupby(['A', 'B'], observed=observed)
    exp_index = pd.MultiIndex.from_arrays([cat1, cat2], names=['A', 'B'])
    expected = DataFrame({'values': [1, 2, 3, 4]}, index=exp_index)
    result = gb.sum()
    if not observed:
        expected = cartesian_product_for_groupers(expected, [cat1, cat2],
                                                  list('AB'))

    tm.assert_frame_equal(result, expected)

    # https://github.com/pandas-dev/pandas/issues/8138
    d = {
        'cat':
        pd.Categorical(["a", "b", "a", "b"],
                       categories=["a", "b", "c"],
                       ordered=True),
        'ints': [1, 1, 2, 2],
        'val': [10, 20, 30, 40]
    }
    df = pd.DataFrame(d)

    # Grouping on a single column
    groups_single_key = df.groupby("cat", observed=observed)
    result = groups_single_key.mean()

    exp_index = pd.CategoricalIndex(list('ab'),
                                    name="cat",
                                    categories=list('abc'),
                                    ordered=True)
    expected = DataFrame({
        "ints": [1.5, 1.5],
        "val": [20., 30]
    },
                         index=exp_index)
    if not observed:
        # unobserved category 'c' is reintroduced with NaN rows
        index = pd.CategoricalIndex(list('abc'),
                                    name="cat",
                                    categories=list('abc'),
                                    ordered=True)
        expected = expected.reindex(index)

    tm.assert_frame_equal(result, expected)

    # Grouping on two columns
    groups_double_key = df.groupby(["cat", "ints"], observed=observed)
    result = groups_double_key.agg('mean')
    expected = DataFrame({
        "val": [10, 30, 20, 40],
        "cat":
        pd.Categorical(['a', 'a', 'b', 'b'],
                       categories=['a', 'b', 'c'],
                       ordered=True),
        "ints": [1, 2, 1, 2]
    }).set_index(["cat", "ints"])
    if not observed:
        expected = cartesian_product_for_groupers(expected,
                                                  [df.cat.values, [1, 2]],
                                                  ['cat', 'ints'])

    tm.assert_frame_equal(result, expected)

    # GH 10132
    for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]:
        c, i = key
        result = groups_double_key.get_group(key)
        expected = df[(df.cat == c) & (df.ints == i)]
        assert_frame_equal(result, expected)

    # gh-8869
    # with as_index
    d = {
        'foo': [10, 8, 4, 8, 4, 1, 1],
        'bar': [10, 20, 30, 40, 50, 60, 70],
        'baz': ['d', 'c', 'e', 'a', 'a', 'd', 'c']
    }
    df = pd.DataFrame(d)
    cat = pd.cut(df['foo'], np.linspace(0, 10, 3))
    df['range'] = cat
    groups = df.groupby(['range', 'baz'], as_index=False, observed=observed)
    result = groups.agg('mean')

    groups2 = df.groupby(['range', 'baz'], as_index=True, observed=observed)
    expected = groups2.agg('mean').reset_index()
    tm.assert_frame_equal(result, expected)
Example #48
0
class TestJoin(object):
    def setup_method(self, method):
        """Build the shared join/merge fixtures.

        Uses module-level helpers ``get_test_data`` and the constants
        ``N``/``NGROUPS`` defined elsewhere in this file.
        """
        # aggregate multiple columns
        self.df = DataFrame({
            'key1': get_test_data(),
            'key2': get_test_data(),
            'data1': np.random.randn(N),
            'data2': np.random.randn(N)
        })

        # exclude a couple keys for fun
        self.df = self.df[self.df['key2'] > 1]

        # smaller frame with fewer rows and groups, for non-trivial joins
        self.df2 = DataFrame({
            'key1':
            get_test_data(n=N // 5),
            'key2':
            get_test_data(ngroups=NGROUPS // 2, n=N // 5),
            'value':
            np.random.randn(N // 5)
        })

        index, data = tm.getMixedTypeDict()
        self.target = DataFrame(data, index=index)

        # Join on string value
        self.source = DataFrame({
            'MergedA': data['A'],
            'MergedD': data['D']
        },
                                index=data['C'])

    def test_cython_left_outer_join(self):
        """Exercise the low-level ``libjoin.left_outer_join`` directly.

        The expected take-indexers (``exp_li``/``exp_ri``) were worked out
        by hand for these inputs; -1 marks unmatched left rows.
        """
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
        max_group = 5

        ls, rs = libjoin.left_outer_join(left, right, max_group)

        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        exp_li = a_(
            [0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 10])
        exp_ri = a_(
            [0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5, -1, -1])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_cython_right_outer_join(self):
        """Right outer join via ``left_outer_join`` with swapped operands.

        There is no dedicated right-join kernel; swapping left/right and
        the returned indexer pair is the implementation trick under test.
        """
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
        max_group = 5

        rs, ls = libjoin.left_outer_join(right, left, max_group)

        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        #            0        1        1        1
        exp_li = a_([
            0,
            1,
            2,
            3,
            4,
            5,
            3,
            4,
            5,
            3,
            4,
            5,
            #            2        2        4
            6,
            7,
            8,
            6,
            7,
            8,
            -1
        ])
        exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_cython_inner_join(self):
        """Exercise ``libjoin.inner_join``: unmatched rows on either side
        (e.g. the 4s in ``right``) must be dropped from the indexers."""
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64)
        max_group = 5

        ls, rs = libjoin.inner_join(left, right, max_group)

        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8])
        exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_left_outer_join(self):
        """Merge on one key and on both keys; rows are validated by the
        module-level ``_check_join`` helper with ``how='left'``."""
        left, right = self.df, self.df2

        result_single = merge(left, right, on='key2')
        _check_join(left, right, result_single, ['key2'], how='left')

        result_double = merge(left, right)
        _check_join(left, right, result_double, ['key1', 'key2'], how='left')

    def test_right_outer_join(self):
        """Right join on one key and on both keys, validated row-by-row
        by the module-level ``_check_join`` helper."""
        left, right = self.df, self.df2

        result_single = merge(left, right, on='key2', how='right')
        _check_join(left, right, result_single, ['key2'], how='right')

        result_double = merge(left, right, how='right')
        _check_join(left, right, result_double, ['key1', 'key2'], how='right')

    def test_full_outer_join(self):
        """Outer join on one key and on both keys, validated row-by-row
        by the module-level ``_check_join`` helper."""
        left, right = self.df, self.df2

        result_single = merge(left, right, on='key2', how='outer')
        _check_join(left, right, result_single, ['key2'], how='outer')

        result_double = merge(left, right, how='outer')
        _check_join(left, right, result_double, ['key1', 'key2'], how='outer')

    def test_inner_join(self):
        """Inner join on one key and on both keys, validated row-by-row
        by the module-level ``_check_join`` helper."""
        left, right = self.df, self.df2

        result_single = merge(left, right, on='key2', how='inner')
        _check_join(left, right, result_single, ['key2'], how='inner')

        result_double = merge(left, right, how='inner')
        _check_join(left, right, result_double, ['key1', 'key2'], how='inner')

    def test_handle_overlap(self):
        """Columns that exist on both sides get the supplied suffixes."""
        result = merge(self.df, self.df2, on='key2',
                       suffixes=['.foo', '.bar'])

        assert 'key1.foo' in result
        assert 'key1.bar' in result

    def test_handle_overlap_arbitrary_key(self):
        """Suffixes are also applied when joining on differently named
        key columns on each side."""
        result = merge(self.df, self.df2, left_on='key2', right_on='key1',
                       suffixes=['.foo', '.bar'])

        assert 'key1.foo' in result
        assert 'key2.bar' in result

    def test_join_on(self):
        """Joining on a column: values, duplicates, missing keys, and the
        error cases (unknown join column, overlapping column names)."""
        target = self.target
        source = self.source

        merged = target.join(source, on='C')
        tm.assert_series_equal(merged['MergedA'],
                               target['A'],
                               check_names=False)
        tm.assert_series_equal(merged['MergedD'],
                               target['D'],
                               check_names=False)

        # join with duplicates (fix regression from DataFrame/Matrix merge)
        df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])
        joined = df.join(df2, on='key')
        expected = DataFrame({
            'key': ['a', 'a', 'b', 'b', 'c'],
            'value': [0, 0, 1, 1, 2]
        })
        assert_frame_equal(joined, expected)

        # Test when some are missing
        df_a = DataFrame([[1], [2], [3]],
                         index=['a', 'b', 'c'],
                         columns=['one'])
        df_b = DataFrame([['foo'], ['bar']], index=[1, 2], columns=['two'])
        df_c = DataFrame([[1], [2]], index=[1, 2], columns=['three'])
        joined = df_a.join(df_b, on='one')
        joined = joined.join(df_c, on='one')
        # row 'c' has key 3, absent from df_b/df_c -> NaN
        assert np.isnan(joined['two']['c'])
        assert np.isnan(joined['three']['c'])

        # merge column not present
        pytest.raises(KeyError, target.join, source, on='E')

        # overlap
        source_copy = source.copy()
        source_copy['A'] = 0
        pytest.raises(ValueError, target.join, source_copy, on='A')

    def test_join_on_fails_with_different_right_index(self):
        with pytest.raises(ValueError):
            df = DataFrame({
                'a': np.random.choice(['m', 'f'], size=3),
                'b': np.random.randn(3)
            })
            df2 = DataFrame(
                {
                    'a': np.random.choice(['m', 'f'], size=10),
                    'b': np.random.randn(10)
                },
                index=tm.makeCustomIndex(10, 2))
            merge(df, df2, left_on='a', right_index=True)

    def test_join_on_fails_with_different_left_index(self):
        with pytest.raises(ValueError):
            df = DataFrame(
                {
                    'a': np.random.choice(['m', 'f'], size=3),
                    'b': np.random.randn(3)
                },
                index=tm.makeCustomIndex(10, 2))
            df2 = DataFrame({
                'a': np.random.choice(['m', 'f'], size=10),
                'b': np.random.randn(10)
            })
            merge(df, df2, right_on='b', left_index=True)

    def test_join_on_fails_with_different_column_counts(self):
        with pytest.raises(ValueError):
            df = DataFrame({
                'a': np.random.choice(['m', 'f'], size=3),
                'b': np.random.randn(3)
            })
            df2 = DataFrame(
                {
                    'a': np.random.choice(['m', 'f'], size=10),
                    'b': np.random.randn(10)
                },
                index=tm.makeCustomIndex(10, 2))
            merge(df, df2, right_on='a', left_on=['a', 'b'])

    @pytest.mark.parametrize("wrong_type", [2, 'str', None, np.array([0, 1])])
    def test_join_on_fails_with_wrong_object_type(self, wrong_type):
        """Merging a non-DataFrame raises TypeError naming the bad type."""
        # GH12081 - original issue

        # GH21220 - merging of Series and DataFrame is now allowed
        # Edited test to remove the Series object from test parameters

        df = DataFrame({'a': [1, 1]})
        # the error message must mention the offending object's type,
        # whichever side of the merge it appears on
        with pytest.raises(TypeError, match=str(type(wrong_type))):
            merge(wrong_type, df, left_on='a', right_on='a')
        with pytest.raises(TypeError, match=str(type(wrong_type))):
            merge(df, wrong_type, left_on='a', right_on='a')

    def test_join_on_pass_vector(self):
        """Passing a Series as ``on`` behaves like joining on that column."""
        expected = self.target.join(self.source, on='C')
        del expected['C']

        key = self.target.pop('C')
        result = self.target.join(self.source, on=key)
        assert_frame_equal(result, expected)

    def test_join_with_len0(self):
        """Joining against an emptied frame keeps its columns, all-NaN;
        an inner join additionally yields zero rows."""
        empty = self.source.reindex([])

        merged = self.target.join(empty, on='C')
        for col in self.source:
            assert col in merged
            assert merged[col].isna().all()

        merged_inner = self.target.join(empty, on='C', how='inner')
        tm.assert_index_equal(merged_inner.columns, merged.columns)
        assert len(merged_inner) == 0

    def test_join_on_inner(self):
        df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1]}, index=['a', 'b'])

        joined = df.join(df2, on='key', how='inner')

        expected = df.join(df2, on='key')
        expected = expected[expected['value'].notna()]
        tm.assert_series_equal(joined['key'],
                               expected['key'],
                               check_dtype=False)
        tm.assert_series_equal(joined['value'],
                               expected['value'],
                               check_dtype=False)
        tm.assert_index_equal(joined.index, expected.index)

    def test_join_on_singlekey_list(self):
        df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])

        # corner cases
        joined = df.join(df2, on=['key'])
        expected = df.join(df2, on='key')

        assert_frame_equal(joined, expected)

    def test_join_on_series(self):
        """Joining on a single Series matches joining on the one-column
        frame that contains it."""
        as_series = self.source['MergedA']
        as_frame = self.source[['MergedA']]
        result = self.target.join(as_series, on='C')
        expected = self.target.join(as_frame, on='C')
        assert_frame_equal(result, expected)

    def test_join_on_series_buglet(self):
        # GH #638
        df = DataFrame({'a': [1, 1]})
        ds = Series([2], index=[1], name='b')
        result = df.join(ds, on='a')
        expected = DataFrame({'a': [1, 1], 'b': [2, 2]}, index=df.index)
        tm.assert_frame_equal(result, expected)

    def test_join_index_mixed(self, join_type):
        """Index joins between frames whose dtype blocks don't overlap,
        checked in both directions against the ``_join_by_hand`` helper."""
        left = DataFrame(index=np.arange(10))
        left['bool'] = True
        left['string'] = 'foo'

        right = DataFrame(index=np.arange(5, 15))
        right['int'] = 1
        right['float'] = 1.

        for a, b in ((left, right), (right, left)):
            result = a.join(b, how=join_type)
            expected = _join_by_hand(a, b, how=join_type)
            assert_frame_equal(result, expected)

    def test_join_index_mixed_overlap(self):
        """Fully overlapping column names are disambiguated by
        ``lsuffix``/``rsuffix`` and match a hand-built join."""
        df1 = DataFrame({
            'A': 1.,
            'B': 2,
            'C': 'foo',
            'D': True
        },
                        index=np.arange(10),
                        columns=['A', 'B', 'C', 'D'])
        # sanity-check dtype inference from the scalar values
        assert df1['B'].dtype == np.int64
        assert df1['D'].dtype == np.bool_

        df2 = DataFrame({
            'A': 1.,
            'B': 2,
            'C': 'foo',
            'D': True
        },
                        index=np.arange(0, 10, 2),
                        columns=['A', 'B', 'C', 'D'])

        # overlap
        joined = df1.join(df2, lsuffix='_one', rsuffix='_two')
        expected_columns = [
            'A_one', 'B_one', 'C_one', 'D_one', 'A_two', 'B_two', 'C_two',
            'D_two'
        ]
        # rename the inputs to the suffixed names and join by hand
        df1.columns = expected_columns[:4]
        df2.columns = expected_columns[4:]
        expected = _join_by_hand(df1, df2)
        assert_frame_equal(joined, expected)

    def test_join_empty_bug(self):
        # generated an exception in 0.4.3
        x = DataFrame()
        x.join(DataFrame([3], index=[0], columns=['A']), how='outer')

    def test_join_unconsolidated(self):
        """GH #331: joining works on block-unconsolidated frames."""
        left = DataFrame(randn(30, 2), columns=['a', 'b'])
        # column insertion leaves the frame unconsolidated
        left['c'] = Series(randn(30))
        right = DataFrame(randn(30, 1), columns=['q'])

        # both directions must simply not raise
        left.join(right)
        right.join(left)

    def test_join_multiindex(self):
        """Outer join of frames with partially overlapping MultiIndexes,
        under both level-0 and level-1 sort orders."""
        index1 = MultiIndex.from_arrays(
            [['a', 'a', 'a', 'b', 'b', 'b'], [1, 2, 3, 1, 2, 3]],
            names=['first', 'second'])

        index2 = MultiIndex.from_arrays(
            [['b', 'b', 'b', 'c', 'c', 'c'], [1, 2, 3, 1, 2, 3]],
            names=['first', 'second'])

        df1 = DataFrame(data=np.random.randn(6),
                        index=index1,
                        columns=['var X'])
        df2 = DataFrame(data=np.random.randn(6),
                        index=index2,
                        columns=['var Y'])

        df1 = df1.sort_index(level=0)
        df2 = df2.sort_index(level=0)

        joined = df1.join(df2, how='outer')
        # expected: reindex both onto the union of the tuple-valued indexes
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names
        assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

        # repeat with the frames sorted by the second level
        df1 = df1.sort_index(level=1)
        df2 = df2.sort_index(level=1)

        joined = df1.join(df2, how='outer').sort_index(level=0)
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names

        assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

    def test_join_inner_multiindex(self):
        """Inner join of key columns onto a MultiIndex, cross-checked
        against equivalent ``merge`` calls."""
        key1 = [
            'bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', 'qux',
            'snap'
        ]
        key2 = [
            'two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', 'three',
            'one'
        ]

        data = np.random.randn(len(key1))
        data = DataFrame({'key1': key1, 'key2': key2, 'data': data})

        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                  [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        to_join = DataFrame(np.random.randn(10, 3),
                            index=index,
                            columns=['j_one', 'j_two', 'j_three'])

        joined = data.join(to_join, on=['key1', 'key2'], how='inner')
        # same operation expressed as a column-on-column merge
        expected = merge(data,
                         to_join.reset_index(),
                         left_on=['key1', 'key2'],
                         right_on=['first', 'second'],
                         how='inner',
                         sort=False)

        expected2 = merge(to_join,
                          data,
                          right_on=['key1', 'key2'],
                          left_index=True,
                          how='inner',
                          sort=False)
        assert_frame_equal(joined, expected2.reindex_like(joined))

        expected2 = merge(to_join,
                          data,
                          right_on=['key1', 'key2'],
                          left_index=True,
                          how='inner',
                          sort=False)

        expected = expected.drop(['first', 'second'], axis=1)
        expected.index = joined.index

        assert joined.index.is_monotonic
        assert_frame_equal(joined, expected)

        # _assert_same_contents(expected, expected2.loc[:, expected.columns])

    def test_join_hierarchical_mixed(self):
        """GH 2024: merge a frame with hierarchical (MultiIndex) columns
        against one with flat columns."""
        df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c'])
        # agg with a list of funcs produces MultiIndex columns on new_df
        new_df = df.groupby(['a']).agg({'b': [np.mean, np.sum]})
        other_df = DataFrame([(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd'])
        other_df.set_index('a', inplace=True)
        # GH 9455, 12219: merging mixed column levels warns
        with tm.assert_produces_warning(UserWarning):
            result = merge(new_df, other_df, left_index=True, right_index=True)
        assert ('b', 'mean') in result
        assert 'b' in result

    def test_join_float64_float32(self):
        """Joins and merges must preserve each column's original dtype
        (no silent float32 -> float64 upcast)."""
        a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64)
        b = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32)
        joined = a.join(b)
        assert joined.dtypes['a'] == 'float64'
        assert joined.dtypes['b'] == 'float64'
        assert joined.dtypes['c'] == 'float32'

        a = np.random.randint(0, 5, 100).astype('int64')
        b = np.random.random(100).astype('float64')
        c = np.random.random(100).astype('float32')
        df = DataFrame({'a': a, 'b': b, 'c': c})
        xpdf = DataFrame({'a': a, 'b': b, 'c': c})
        s = DataFrame(np.random.random(5).astype('float32'), columns=['md'])
        rs = df.merge(s, left_on='a', right_index=True)
        assert rs.dtypes['a'] == 'int64'
        assert rs.dtypes['b'] == 'float64'
        assert rs.dtypes['c'] == 'float32'
        assert rs.dtypes['md'] == 'float32'

        xp = xpdf.merge(s, left_on='a', right_index=True)
        assert_frame_equal(rs, xp)

    def test_join_many_non_unique_index(self):
        """Multi-frame ``join`` on non-unique MultiIndexes must agree with
        chained two-frame ``merge`` calls (outer and inner)."""
        df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
        df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
        df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])

        result = idf1.join([idf2, idf3], how='outer')

        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer')
        expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer')

        result = result.reset_index()
        expected = expected[result.columns]
        # outer merge upcasts the keys; restore int64 for comparison
        expected['a'] = expected.a.astype('int64')
        expected['b'] = expected.b.astype('int64')
        assert_frame_equal(result, expected)

        df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
        df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
        df3 = DataFrame({
            "a": [1, 1, 1],
            "b": [1, 1, 2],
            "e": [1000, 2000, 3000]
        })
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])
        result = idf1.join([idf2, idf3], how='inner')

        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner')
        expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner')

        result = result.reset_index()

        assert_frame_equal(result, expected.loc[:, result.columns])

        # GH 11519
        df = DataFrame({
            'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
            'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
            'C':
            np.random.randn(8),
            'D':
            np.random.randn(8)
        })
        s = Series(np.repeat(np.arange(8), 2),
                   index=np.repeat(np.arange(8), 2),
                   name='TEST')
        # every key matches, so all join types give the same rows
        inner = df.join(s, how='inner')
        outer = df.join(s, how='outer')
        left = df.join(s, how='left')
        right = df.join(s, how='right')
        assert_frame_equal(inner, outer)
        assert_frame_equal(inner, left)
        assert_frame_equal(inner, right)

    def test_join_sort(self):
        """``sort=True`` orders the result by the join key; ``sort=False``
        keeps the left frame's original row order."""
        left = DataFrame({
            'key': ['foo', 'bar', 'baz', 'foo'],
            'value': [1, 2, 3, 4]
        })
        right = DataFrame({'value2': ['a', 'b', 'c']},
                          index=['bar', 'baz', 'foo'])

        joined = left.join(right, on='key', sort=True)
        expected = DataFrame(
            {
                'key': ['bar', 'baz', 'foo', 'foo'],
                'value': [2, 3, 1, 4],
                'value2': ['a', 'b', 'c', 'c']
            },
            index=[1, 2, 0, 3])
        assert_frame_equal(joined, expected)

        # smoke test
        joined = left.join(right, on='key', sort=False)
        tm.assert_index_equal(joined.index, pd.Index(lrange(4)))

    def test_join_mixed_non_unique_index(self):
        # GH 12814, unorderable types in py3 with a non-unique index
        df1 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 3, 'a'])
        df2 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 3, 3, 4])
        result = df1.join(df2)
        expected = DataFrame(
            {
                'a': [1, 2, 3, 3, 4],
                'b': [5, np.nan, 6, 7, np.nan]
            },
            index=[1, 2, 3, 3, 'a'])
        tm.assert_frame_equal(result, expected)

        df3 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 2, 'a'])
        df4 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 2, 3, 4])
        result = df3.join(df4)
        expected = DataFrame({
            'a': [1, 2, 3, 4],
            'b': [5, 6, 6, np.nan]
        },
                             index=[1, 2, 2, 'a'])
        tm.assert_frame_equal(result, expected)

    def test_join_non_unique_period_index(self):
        """GH #16871: self-join against a duplicated PeriodIndex."""
        index = pd.period_range('2016-01-01', periods=16, freq='M')
        df = DataFrame([i for i in range(len(index))],
                       index=index,
                       columns=['pnum'])
        # duplicate every period label
        df2 = concat([df, df])
        result = df.join(df2, how='inner', rsuffix='_df2')
        # each of the 16 values appears twice, in both columns
        expected = DataFrame(np.tile(
            np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2),
                             columns=['pnum', 'pnum_df2'],
                             index=df2.sort_index().index)
        tm.assert_frame_equal(result, expected)

    def test_mixed_type_join_with_suffix(self):
        """GH #916: joining aggregates of a mixed-dtype frame with a
        suffix must not raise."""
        df = DataFrame(np.random.randn(20, 6),
                       columns=['a', 'b', 'c', 'd', 'e', 'f'])
        df.insert(0, 'id', 0)
        # inject a string column to make the frame mixed-dtype
        df.insert(5, 'dt', 'foo')

        grouped = df.groupby('id')
        mn = grouped.mean()
        cn = grouped.count()

        # it works!
        mn.join(cn, rsuffix='_right')

    def test_join_many(self):
        """Joining a list of frames: full round-trip, differing indexes
        under each join type, and the unsupported ``on=`` error."""
        df = DataFrame(np.random.randn(10, 6), columns=list('abcdef'))
        df_list = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]]

        # column slices reassemble the original frame
        joined = df_list[0].join(df_list[1:])
        tm.assert_frame_equal(joined, df)

        # overlapping but different row subsets
        df_list = [
            df[['a', 'b']][:-2], df[['c', 'd']][2:], df[['e', 'f']][1:9]
        ]

        def _check_diff_index(df_list, result, exp_index):
            # expected result: reindex all pieces onto exp_index, then join
            reindexed = [x.reindex(exp_index) for x in df_list]
            expected = reindexed[0].join(reindexed[1:])
            tm.assert_frame_equal(result, expected)

        # different join types
        joined = df_list[0].join(df_list[1:], how='outer')
        _check_diff_index(df_list, joined, df.index)

        joined = df_list[0].join(df_list[1:])
        _check_diff_index(df_list, joined, df_list[0].index)

        joined = df_list[0].join(df_list[1:], how='inner')
        _check_diff_index(df_list, joined, df.index[2:8])

        # ``on=`` is not supported when joining a list of frames
        pytest.raises(ValueError, df_list[0].join, df_list[1:], on='a')

    def test_join_many_mixed(self):
        df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
        df['key'] = ['foo', 'bar'] * 4
        df1 = df.loc[:, ['A', 'B']]
        df2 = df.loc[:, ['C', 'D']]
        df3 = df.loc[:, ['key']]

        result = df1.join([df2, df3])
        assert_frame_equal(result, df)

    def test_join_dups(self):
        """Joining and merging with duplicate column names (GH 4975)."""
        # joining dups: 'A' appears twice on the left and once on the right
        df = concat([
            DataFrame(np.random.randn(10, 4), columns=['A', 'A', 'B', 'B']),
            DataFrame(np.random.randint(0, 10, size=20).reshape(10, 2),
                      columns=['A', 'C'])
        ],
                    axis=1)

        expected = concat([df, df], axis=1)
        result = df.join(df, rsuffix='_2')
        result.columns = expected.columns
        assert_frame_equal(result, expected)

        # GH 4975, invalid join on dups
        w = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        x = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        y = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        z = DataFrame(np.random.randn(4, 2), columns=["x", "y"])

        # chained merges repeatedly suffix the overlapping column names
        dta = x.merge(y, left_index=True,
                      right_index=True).merge(z,
                                              left_index=True,
                                              right_index=True,
                                              how="outer")
        dta = dta.merge(w, left_index=True, right_index=True)
        expected = concat([x, y, z, w], axis=1)
        expected.columns = [
            'x_x', 'y_x', 'x_y', 'y_y', 'x_x', 'y_x', 'x_y', 'y_y'
        ]
        assert_frame_equal(dta, expected)

    def test_panel_join(self):
        """Panel.join under each join type (Panel is deprecated, hence
        the warning suppression)."""
        with catch_warnings(record=True):
            panel = tm.makePanel()
            tm.add_nans(panel)

            # overlapping item/major/minor slices
            p1 = panel.iloc[:2, :10, :3]
            p2 = panel.iloc[2:, 5:, 2:]

            # left join
            result = p1.join(p2)
            expected = p1.copy()
            expected['ItemC'] = p2['ItemC']
            tm.assert_panel_equal(result, expected)

            # right join
            result = p1.join(p2, how='right')
            expected = p2.copy()
            expected['ItemA'] = p1['ItemA']
            expected['ItemB'] = p1['ItemB']
            expected = expected.reindex(items=['ItemA', 'ItemB', 'ItemC'])
            tm.assert_panel_equal(result, expected)

            # inner join
            result = p1.join(p2, how='inner')
            expected = panel.iloc[:, 5:10, 2:3]
            tm.assert_panel_equal(result, expected)

            # outer join
            result = p1.join(p2, how='outer')
            expected = p1.reindex(major=panel.major_axis,
                                  minor=panel.minor_axis)
            expected = expected.join(
                p2.reindex(major=panel.major_axis, minor=panel.minor_axis))
            tm.assert_panel_equal(result, expected)

    def test_panel_join_overlap(self):
        """Panel.join with overlapping items gets lsuffix/rsuffix applied
        to the shared item labels."""
        with catch_warnings(record=True):
            panel = tm.makePanel()
            tm.add_nans(panel)

            p1 = panel.loc[['ItemA', 'ItemB', 'ItemC']]
            p2 = panel.loc[['ItemB', 'ItemC']]

            # Expected index is
            #
            # ItemA, ItemB_p1, ItemC_p1, ItemB_p2, ItemC_p2
            joined = p1.join(p2, lsuffix='_p1', rsuffix='_p2')
            p1_suf = p1.loc[['ItemB', 'ItemC']].add_suffix('_p1')
            p2_suf = p2.loc[['ItemB', 'ItemC']].add_suffix('_p2')
            no_overlap = panel.loc[['ItemA']]
            expected = no_overlap.join(p1_suf.join(p2_suf))
            tm.assert_panel_equal(joined, expected)

    def test_panel_join_many(self):
        """Join a list of panels in one call; invalid options must raise."""
        with catch_warnings(record=True):
            # temporarily widen the frames tm.makePanel builds, then restore
            tm.K = 10
            panel = tm.makePanel()
            tm.K = 4

            panels = [panel.iloc[:2], panel.iloc[2:6], panel.iloc[6:]]

            # joining disjoint item slices reassembles the original panel
            joined = panels[0].join(panels[1:])
            tm.assert_panel_equal(joined, panel)

            # overlapping axis slices for the inner/outer checks below
            panels = [
                panel.iloc[:2, :-5], panel.iloc[2:6, 2:], panel.iloc[6:, 5:-7]
            ]

            data_dict = {}
            for p in panels:
                data_dict.update(p.iteritems())

            joined = panels[0].join(panels[1:], how='inner')
            expected = pd.Panel.from_dict(data_dict, intersect=True)
            tm.assert_panel_equal(joined, expected)

            joined = panels[0].join(panels[1:], how='outer')
            expected = pd.Panel.from_dict(data_dict, intersect=False)
            tm.assert_panel_equal(joined, expected)

            # edge cases: suffixes are rejected for list joins, ...
            pytest.raises(ValueError,
                          panels[0].join,
                          panels[1:],
                          how='outer',
                          lsuffix='foo',
                          rsuffix='bar')
            # ... and so is how='right'
            pytest.raises(ValueError, panels[0].join, panels[1:], how='right')

    def test_join_multi_to_multi(self, join_type):
        """Join on a shared MultiIndex prefix; mismatched levels raise.

        GH 20475: ``left`` carries an extra 'num' level that must survive
        the join; joining on an incomplete level set is a ValueError.
        """
        idx_left = MultiIndex.from_product(
            [list('abc'), list('xy'), [1, 2]], names=['abc', 'xy', 'num'])
        idx_right = MultiIndex.from_product(
            [list('abc'), list('xy')], names=['abc', 'xy'])

        left = DataFrame({'v1': range(12)}, index=idx_left)
        right = DataFrame({'v2': list(range(100, 700, 100))}, index=idx_right)

        result = left.join(right, on=['abc', 'xy'], how=join_type)

        # The join must agree with the reset_index/merge/set_index round-trip.
        expected = left.reset_index().merge(
            right.reset_index(), on=['abc', 'xy'], how=join_type)
        expected = expected.set_index(['abc', 'xy', 'num'])
        assert_frame_equal(expected, result)

        # Joining on a single level when two are shared is invalid ...
        with pytest.raises(ValueError):
            left.join(right, on='xy', how=join_type)

        # ... as is naming levels the index side cannot fully match.
        with pytest.raises(ValueError):
            right.join(left, on=['abc', 'xy'], how=join_type)
Example #49
0
    def test_getitem_setitem_float_labels(self):
        """Label-based slicing/assignment on float-valued indexes.

        On a float index, .loc slices by label (inclusive of both
        endpoints), while .iloc with float bounds must raise TypeError.
        """
        index = Index([1.5, 2, 3, 4, 5])
        df = DataFrame(np.random.randn(5, 5), index=index)

        # .loc slicing is label-based and includes both endpoints
        result = df.loc[1.5:4]
        expected = df.reindex([1.5, 2, 3, 4])
        tm.assert_frame_equal(result, expected)
        assert len(result) == 4

        result = df.loc[4:5]
        expected = df.reindex([4, 5])  # reindex with int
        tm.assert_frame_equal(result, expected, check_index_type=False)
        assert len(result) == 2

        result = df.loc[4:5]
        expected = df.reindex([4.0, 5.0])  # reindex with float
        tm.assert_frame_equal(result, expected)
        assert len(result) == 2

        # loc_float changes this to work properly
        result = df.loc[1:2]
        expected = df.iloc[0:2]
        tm.assert_frame_equal(result, expected)

        # assignment through a .loc label slice hits the same rows
        df.loc[1:2] = 0
        result = df[1:2]
        assert (result == 0).all().all()

        # #2727
        index = Index([1.0, 2.5, 3.5, 4.5, 5.0])
        df = DataFrame(np.random.randn(5, 5), index=index)

        # positional slicing only via iloc!
        msg = ("cannot do positional indexing on Float64Index with "
               r"these indexers \[1.0\] of type float")
        with pytest.raises(TypeError, match=msg):
            df.iloc[1.0:5]

        result = df.iloc[4:5]
        expected = df.reindex([5.0])
        tm.assert_frame_equal(result, expected)
        assert len(result) == 1

        cp = df.copy()

        # float bounds are rejected for iloc assignment too
        with pytest.raises(TypeError, match=_slice_msg):
            cp.iloc[1.0:5] = 0

        with pytest.raises(TypeError, match=msg):
            result = cp.iloc[1.0:5] == 0

        # NOTE(review): the raise above means `result` still holds the
        # earlier df.iloc[4:5] frame, so this re-checks that frame.
        assert result.values.all()
        assert (cp.iloc[0:1] == df.iloc[0:1]).values.all()

        # iloc assignment with int bounds works and leaves other rows alone
        cp = df.copy()
        cp.iloc[4:5] = 0
        assert (cp.iloc[4:5] == 0).values.all()
        assert (cp.iloc[0:4] == df.iloc[0:4]).values.all()

        # float slicing
        result = df.loc[1.0:5]
        expected = df
        tm.assert_frame_equal(result, expected)
        assert len(result) == 5

        # a start label between existing labels selects from the next row on
        result = df.loc[1.1:5]
        expected = df.reindex([2.5, 3.5, 4.5, 5.0])
        tm.assert_frame_equal(result, expected)
        assert len(result) == 4

        result = df.loc[4.51:5]
        expected = df.reindex([5.0])
        tm.assert_frame_equal(result, expected)
        assert len(result) == 1

        result = df.loc[1.0:5.0]
        expected = df.reindex([1.0, 2.5, 3.5, 4.5, 5.0])
        tm.assert_frame_equal(result, expected)
        assert len(result) == 5

        # float .loc slice assignment covers the same inclusive range
        cp = df.copy()
        cp.loc[1.0:5.0] = 0
        result = cp.loc[1.0:5.0]
        assert (result == 0).values.all()
Example #50
0
def test_basic():
    """Core groupby behavior with Categorical groupers.

    Covers NaN rows for unobserved categories, sum with an ordered
    categorical grouper, transform/filter identity round-trips, cut-based
    groupers (monotonic and non-monotonic), apply over empty groups, and
    describe() output (GH 8623, 9921, 9603, 10460).
    """

    cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
                       categories=["a", "b", "c", "d"],
                       ordered=True)
    data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})

    # unobserved category 'd' shows up as NaN in the aggregated result
    exp_index = CategoricalIndex(list('abcd'), name='b', ordered=True)
    expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index)
    result = data.groupby("b", observed=False).mean()
    tm.assert_frame_equal(result, expected)

    cat1 = Categorical(["a", "a", "b", "b"],
                       categories=["a", "b", "z"],
                       ordered=True)
    cat2 = Categorical(["c", "d", "c", "d"],
                       categories=["c", "d", "y"],
                       ordered=True)
    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})

    # single grouper: unobserved 'z' aggregates to 0 under sum
    gb = df.groupby("A", observed=False)
    exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True)
    expected = DataFrame({'values': Series([3, 7, 0], index=exp_idx)})
    result = gb.sum()
    tm.assert_frame_equal(result, expected)

    # GH 8623: identity transform over a categorical column
    x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'], [1, 'John P. Doe']],
                  columns=['person_id', 'person_name'])
    x['person_name'] = Categorical(x.person_name)

    g = x.groupby(['person_id'], observed=False)
    result = g.transform(lambda x: x)
    tm.assert_frame_equal(result, x[['person_name']])

    result = x.drop_duplicates('person_name')
    expected = x.iloc[[0, 1]]
    tm.assert_frame_equal(result, expected)

    def f(x):
        # first unique row per group
        return x.drop_duplicates('person_name').iloc[0]

    result = g.apply(f)
    expected = x.iloc[[0, 1]].copy()
    expected.index = Index([1, 2], name='person_id')
    expected['person_name'] = expected['person_name'].astype('object')
    tm.assert_frame_equal(result, expected)

    # GH 9921
    # Monotonic
    df = DataFrame({"a": [5, 15, 25]})
    c = pd.cut(df.a, bins=[0, 10, 20, 30, 40])

    # each row is its own group, so transform(sum) is the identity
    result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df['a'])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df['a'])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(sum), df[['a']])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(lambda xs: np.max(xs)),
        df[['a']])

    # Filter
    tm.assert_series_equal(
        df.a.groupby(c, observed=False).filter(np.all), df['a'])
    tm.assert_frame_equal(df.groupby(c, observed=False).filter(np.all), df)

    # Non-monotonic
    df = DataFrame({"a": [5, 15, 25, -5]})
    c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40])

    result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df['a'])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df['a'])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(sum), df[['a']])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df[['a']])

    # GH 9603: apply(len) reports 0 for the empty bins
    df = DataFrame({'a': [1, 0, 0, 0]})
    c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list('abcd')))
    result = df.groupby(c, observed=False).apply(len)

    exp_index = CategoricalIndex(c.values.categories, ordered=c.values.ordered)
    expected = Series([1, 0, 0, 0], index=exp_index)
    expected.index.name = 'a'
    tm.assert_series_equal(result, expected)

    # more basic
    levels = ['foo', 'bar', 'baz', 'qux']
    codes = np.random.randint(0, 4, size=100)

    cats = Categorical.from_codes(codes, levels, ordered=True)

    data = DataFrame(np.random.randn(100, 4))

    result = data.groupby(cats, observed=False).mean()

    # grouping by the raw ndarray then reindexing must match
    expected = data.groupby(np.asarray(cats), observed=False).mean()
    exp_idx = CategoricalIndex(levels,
                               categories=cats.categories,
                               ordered=True)
    expected = expected.reindex(exp_idx)

    assert_frame_equal(result, expected)

    grouped = data.groupby(cats, observed=False)
    desc_result = grouped.describe()

    # describe() should be order-insensitive w.r.t. the grouper ordering
    idx = cats.codes.argsort()
    ord_labels = np.asarray(cats).take(idx)
    ord_data = data.take(idx)

    exp_cats = Categorical(ord_labels,
                           ordered=True,
                           categories=['foo', 'bar', 'baz', 'qux'])
    expected = ord_data.groupby(exp_cats, sort=False,
                                observed=False).describe()
    assert_frame_equal(desc_result, expected)

    # GH 10460: the stacked describe index keeps the categorical levels
    expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True)
    exp = CategoricalIndex(expc)
    tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp)
    exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'] *
                4)
    tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp)
Example #51
0
# When reindex is given fewer labels than the original index,
# the missing rows are simply dropped.
print(s1)
print(s1.reindex(index=['A', 'B']))

# drop removes the row with the given index label
print(s1)
print(s1.drop('A'))
print("{{{{{{{{{{{{{{{{{}}}}}}}}}}}")

# reindex on a DataFrame
# change the DataFrame's index and columns:
df1 = DataFrame(np.random.rand(25).reshape([5, 5]),
                index=['A', 'B', 'D', 'E', 'F'],
                columns=['c1', 'c2', 'c3', 'c4', 'c5'])
print(df1)
print(df1.reindex(index=['A', 'B', 'C', 'D', 'E', 'F']))
print(df1.reindex(columns=['c1', 'c2', 'c3', 'c4', 'c5', 'c6']))

# change the DataFrame's index and columns at the same time:
print(
    df1.reindex(index=['A', 'B', 'C', 'D', 'E', 'F'],
                columns=['c1', 'c2', 'c3', 'c4', 'c5', 'c6']))

# reindex with fewer labels than the original index drops rows:
print(df1)
print(df1.reindex(index=['A', 'B']))

# drop on a DataFrame: axis=0 removes a row, axis=1 removes a column
print(df1)
print(df1.drop('A', axis=0))
print(df1.drop('c1', axis=1))
Example #52
0
def test_observed(observed):
    """Groupby with categorical keys honors the ``observed`` flag.

    With observed=False the result index is expanded over the cartesian
    product of all categories; with observed=True only combinations that
    actually occur in the data appear.
    """
    # multiple groupers, don't re-expand the output space
    # of the grouper
    # gh-14942 (implement)
    # gh-10132 (back-compat)
    # gh-8138 (back-compat)
    # gh-8869

    cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True)
    cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True)
    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
    df["C"] = ["foo", "bar"] * 2

    # multiple groupers with a non-cat
    gb = df.groupby(["A", "B", "C"], observed=observed)
    exp_index = MultiIndex.from_arrays(
        [cat1, cat2, ["foo", "bar"] * 2], names=["A", "B", "C"]
    )
    expected = DataFrame({"values": Series([1, 2, 3, 4], index=exp_index)}).sort_index()
    result = gb.sum()
    if not observed:
        # expand over the full category product for the categorical levels
        expected = cartesian_product_for_groupers(
            expected, [cat1, cat2, ["foo", "bar"]], list("ABC")
        )

    tm.assert_frame_equal(result, expected)

    gb = df.groupby(["A", "B"], observed=observed)
    exp_index = MultiIndex.from_arrays([cat1, cat2], names=["A", "B"])
    expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index)
    result = gb.sum()
    if not observed:
        expected = cartesian_product_for_groupers(expected, [cat1, cat2], list("AB"))

    tm.assert_frame_equal(result, expected)

    # https://github.com/pandas-dev/pandas/issues/8138
    d = {
        "cat": Categorical(
            ["a", "b", "a", "b"], categories=["a", "b", "c"], ordered=True
        ),
        "ints": [1, 1, 2, 2],
        "val": [10, 20, 30, 40],
    }
    df = DataFrame(d)

    # Grouping on a single column
    groups_single_key = df.groupby("cat", observed=observed)
    result = groups_single_key.mean()

    exp_index = CategoricalIndex(
        list("ab"), name="cat", categories=list("abc"), ordered=True
    )
    expected = DataFrame({"ints": [1.5, 1.5], "val": [20.0, 30]}, index=exp_index)
    if not observed:
        # unobserved category 'c' is reintroduced as a NaN row
        index = CategoricalIndex(
            list("abc"), name="cat", categories=list("abc"), ordered=True
        )
        expected = expected.reindex(index)

    tm.assert_frame_equal(result, expected)

    # Grouping on two columns
    groups_double_key = df.groupby(["cat", "ints"], observed=observed)
    result = groups_double_key.agg("mean")
    expected = DataFrame(
        {
            "val": [10, 30, 20, 40],
            "cat": Categorical(
                ["a", "a", "b", "b"], categories=["a", "b", "c"], ordered=True
            ),
            "ints": [1, 2, 1, 2],
        }
    ).set_index(["cat", "ints"])
    if not observed:
        expected = cartesian_product_for_groupers(
            expected, [df.cat.values, [1, 2]], ["cat", "ints"]
        )

    tm.assert_frame_equal(result, expected)

    # GH 10132: get_group works regardless of the observed flag
    for key in [("a", 1), ("b", 2), ("b", 1), ("a", 2)]:
        c, i = key
        result = groups_double_key.get_group(key)
        expected = df[(df.cat == c) & (df.ints == i)]
        tm.assert_frame_equal(result, expected)

    # gh-8869
    # with as_index
    d = {
        "foo": [10, 8, 4, 8, 4, 1, 1],
        "bar": [10, 20, 30, 40, 50, 60, 70],
        "baz": ["d", "c", "e", "a", "a", "d", "c"],
    }
    df = DataFrame(d)
    cat = pd.cut(df["foo"], np.linspace(0, 10, 3))
    df["range"] = cat
    groups = df.groupby(["range", "baz"], as_index=False, observed=observed)
    result = groups.agg("mean")

    # as_index=False must match as_index=True followed by reset_index
    groups2 = df.groupby(["range", "baz"], as_index=True, observed=observed)
    expected = groups2.agg("mean").reset_index()
    tm.assert_frame_equal(result, expected)
Example #53
0
def construct_datasets(Origin, min_year=2008, subsample=None):
    """Pull flight records for *Origin* from MySQL and build train/test sets.

    Queries the ``flights`` table for flights departing *Origin* from
    *min_year* onward, derives time-of-day/day-of-year features, drops
    cancellations, shuffles, and returns a (train, test) 2/3-1/3 split.

    Parameters:
        Origin -- airport code used to filter the flights table
        min_year -- earliest Year to include in the query
        subsample -- if not None, keep every `subsample`-th row

    Returns:
        (data_train, data_test) tuple of DataFrames.
    """
    import MySQLdb

    ## print '   Connecting to database...'
    ## time0 = time.time()
    # NOTE(review): credentials are hardcoded in source; move them to a
    # config file or environment variables.
    db = MySQLdb.connect(host="localhost",
                         user="******",
                         passwd="z2yv52K*hJ<otclN",
                         db="DelayMeNot",
                         local_infile=1)
    ## print '       That took %.1f seconds' % (time.time() - time0)

    cur = db.cursor()

    ### flights origination from Origin
    print '   Querying database...'
    time0 = time.time()
    ## if min_year == 2008:
    ##     cur.execute("SELECT Year, Month, DayofMonth, DayOfWeek, CRSDepTime, CRSArrTime, UniqueCarrier, CRSElapsedTime, ArrDelay, Dest, Distance FROM flights_2008_2013 WHERE Origin = '%s';" % (Origin))
    ## else:
    ##     cur.execute("SELECT Year, Month, DayofMonth, DayOfWeek, CRSDepTime, CRSArrTime, UniqueCarrier, CRSElapsedTime, ArrDelay, Dest, Distance FROM flights WHERE Origin = '%s' AND Year >= %d;" % (Origin, min_year))

    ## print "SELECT Year, Month, DayofMonth, DayOfWeek, CRSDepTime, CRSArrTime, UniqueCarrier, CRSElapsedTime, ArrDelay, Dest, Distance FROM flights WHERE Origin = '%s' AND Year >= %d;" % (Origin, min_year)
    # NOTE(review): SQL is built with % interpolation; acceptable only if
    # Origin is trusted -- use parameterized queries for untrusted input.
    cur.execute(
        "SELECT Year, Month, DayofMonth, DayOfWeek, CRSDepTime, CRSArrTime, UniqueCarrier, CRSElapsedTime, ArrDelay, Dest, Distance FROM flights WHERE Origin = '%s' AND Year >= %d;"
        % (Origin, min_year))

    print '      That took %.1f seconds.' % (time.time() - time0)

    rows = cur.fetchall()

    ### Convert to DataFrame
    print '   Converting to DataFrame...'
    time0 = time.time()

    df = DataFrame(list(rows),
                   columns=[
                       'Year', 'Month', 'DayOfMonth', 'DayOfWeek',
                       'ScheduledDepartureTime', 'ScheduledArrivalTime',
                       'Carrier', 'ScheduledElapsedTime', 'ArrivalDelay',
                       'Destination', 'Distance'
                   ])
    del rows

    ### drop columns without delays (cancellations)
    df = df.dropna()

    ### Create some auxiliary columns

    # day-of-year from the (Year, Month, DayOfMonth) triple
    df['DayOfYear'] = df.apply(lambda x: datetime.datetime(
        x['Year'], x['Month'], x['DayOfMonth']).timetuple().tm_yday,
                               axis=1)
    df['Week'] = df['DayOfYear'] / 7 + 1
    # HHMM-encoded times converted to fractional hours
    df['ScheduledDepartureHour'] = df['ScheduledDepartureTime'] / 100 + df[
        'ScheduledDepartureTime'] % 100 / 60.0
    df['ScheduledArrivalHour'] = df['ScheduledArrivalTime'] / 100 + df[
        'ScheduledArrivalTime'] % 100 / 60.0

    df = df.drop(['ScheduledDepartureTime', 'ScheduledArrivalTime'], axis=1)

    ### Add DaysFromNearestHoliday column
    df = distance_to_holiday(df)

    ## df.head()
    print '      That took %.1f seconds.' % (time.time() - time0)

    ### subsample by a factor of 10
    if subsample is not None:
        print '   Subsampling (x%d) DataFrame...' % subsample
        time0 = time.time()
        # NOTE(review): .ix is deprecated in modern pandas
        df = df.ix[::subsample]
        print '   That took %.1f seconds.' % (time.time() - time0)
    ## print len(df)

    ## ### Normalize columns  (*** this isn't necessary for RandomForest ***)
    ## def normalize(var):
    ##     return (var - var.min()).astype(float) / (var.max() - var.min())
    ## df['Year'] = normalize(df['Year'])
    ## df['Month'] = normalize(df['Month'])
    ## df['DayOfMonth'] = normalize(df['DayOfMonth'])
    ## df['DayOfWeek'] = normalize(df['DayOfWeek'])
    ## df['DayOfYear'] = normalize(df['DayOfYear'])
    ## df['Week'] = normalize(df['Week'])
    ## df['ScheduledDepartureHour'] = normalize(df['ScheduledDepartureHour'])
    ## df['ScheduledArrivalHour'] = normalize(df['ScheduledArrivalHour'])
    ## df['ScheduledElapsedTime'] = normalize(df['ScheduledElapsedTime'])
    ## df['Distance'] = normalize(df['Distance'])
    ## df['DaysFromNearestHoliday'] = normalize(df['DaysFromNearestHoliday'])

    ## print df.head()

    ##### Dummification should happen after unpickling, since including all the dummified columns makes the pickles huge!

    ## ### "Dummify" the categorical 'Carrier' and 'Destination' columns,
    ## ### and add the dummies to the table, but drop the first dummy
    ## ### column to avoid "dummy variable trap".

    ## dummies = pd.get_dummies(df['Carrier'],prefix='Carrier')
    ## ## ## print dummies.columns
    ## df = df.join(dummies.ix[:,1:])

    ## dummies = pd.get_dummies(df['Destination'],prefix='Destination')
    ## df = df.join(dummies.ix[:,1:])

    ## ### Drop dummified columns
    ## df = df.drop(['Carrier','Destination'],axis=1)

    ## print len(df.columns)
    ## print df.head()

    ### Shuffle and create separate train and test datasets
    print '   Separating into training and testing dataset...'
    time0 = time.time()
    # shuffle rows by reindexing with a random permutation of the index
    df = df.reindex(np.random.permutation(df.index))
    Nrow = len(df)
    Ntrain = int(2.0 / 3.0 * Nrow)
    Ntest = Nrow - Ntrain
    data_train = df[:Ntrain]
    data_test = df[Ntrain:]
    del df
    print '       That took %.1f seconds.' % (time.time() - time0)

    ### Close up the cursor and database
    cur.close()
    db.close()

    return (data_train, data_test)
Example #54
0
    def test_getitem_boolean(self, mixed_float_frame, mixed_int_frame,
                             datetime_frame):
        """Boolean-mask selection on frames.

        Covers ndarray masks, object-dtype masks, Series masks (including
        the reindex-with-warning path), length-mismatch errors, and the
        NaN-filling semantics of ``df[df > 0]``.
        """
        # boolean indexing
        d = datetime_frame.index[10]
        indexer = datetime_frame.index > d
        indexer_obj = indexer.astype(object)

        subindex = datetime_frame.index[indexer]
        subframe = datetime_frame[indexer]

        tm.assert_index_equal(subindex, subframe.index)
        # a mask shorter than the frame is rejected
        with pytest.raises(ValueError, match="Item wrong length"):
            datetime_frame[indexer[:-1]]

        # object-dtype mask behaves like the bool mask
        subframe_obj = datetime_frame[indexer_obj]
        tm.assert_frame_equal(subframe_obj, subframe)

        # a non-boolean DataFrame mask is rejected
        with pytest.raises(ValueError, match="Boolean array expected"):
            datetime_frame[datetime_frame]

        # test that Series work
        indexer_obj = Series(indexer_obj, datetime_frame.index)

        subframe_obj = datetime_frame[indexer_obj]
        tm.assert_frame_equal(subframe_obj, subframe)

        # test that Series indexers reindex
        # we are producing a warning that since the passed boolean
        # key is not the same as the given index, we will reindex
        # not sure this is really necessary
        with tm.assert_produces_warning(UserWarning):
            indexer_obj = indexer_obj.reindex(datetime_frame.index[::-1])
            subframe_obj = datetime_frame[indexer_obj]
            tm.assert_frame_equal(subframe_obj, subframe)

        # test df[df > 0]
        for df in [
                datetime_frame,
                mixed_float_frame,
                mixed_int_frame,
        ]:

            data = df._get_numeric_data()
            bif = df[df > 0]
            # expected: numeric cells <= 0 become NaN, others kept
            bifw = DataFrame(
                {
                    c: np.where(data[c] > 0, data[c], np.nan)
                    for c in data.columns
                },
                index=data.index,
                columns=data.columns,
            )

            # add back other columns to compare
            for c in df.columns:
                if c not in bifw:
                    bifw[c] = df[c]
            bifw = bifw.reindex(columns=df.columns)

            tm.assert_frame_equal(bif, bifw, check_dtype=False)
            # dtypes may be upcast by the NaN fill; otherwise preserved
            for c in df.columns:
                if bif[c].dtype != bifw[c].dtype:
                    assert bif[c].dtype == df[c].dtype
Example #55
0
nfl_frame.head(3)
nfl_frame.ix[3]  # NOTE: .ix is deprecated; prefer .loc / .iloc

# Add new columns to an existing data frame
# BUG FIX: np.arrange does not exist -- the function is np.arange
nfl_frame['Stadium'] = np.arange(5)
# assigning a Series aligns on index; unmatched rows become NaN
stadiums = Series(["Levi's Stadium", "AT&T Stadium"], index=[4, 0])
nfl_frame['Stadium'] = stadiums
del nfl_frame['Stadium']

# Create data frames from dictionaries
data = {'City': ['SF', 'LA', 'NYC'], 'Population': [837000, 388000, 840000]}
city_frame = DataFrame(data)

# Reindex
from numpy.random import randn
ser1 = Series([1, 2, 3, 4], index=['A', 'B', 'C', 'D'])
my_index = ser1.index
ser2 = ser1.reindex(['A', 'B', 'C', 'D', 'E', 'F'])
# fill_value supplies a default for labels that were missing
ser2.reindex(['A', 'B', 'C', 'D', 'E', 'F', 'G'], fill_value=0)
ser3 = Series(['USA', 'Mexico', 'Canada'], index=[0, 5, 10])
# method='ffill' forward-fills values for the new intermediate labels
ser3.reindex(range(15), method='ffill')
dframe = DataFrame(randn(25).reshape((5, 5)),
                   index=['A', 'B', 'D', 'E', 'F'],
                   columns=['col1', 'col2', 'col3', 'col4', 'col5'])
new_columns = ['col1', 'col2', 'col3', 'col4', 'col5', 'col6']
dframe2 = dframe.reindex(
    ['A', 'B', 'C', 'D', 'E',
     'F'])  #### This line and the line below are equivalent to the last line
dframe2.reindex(columns=new_columns)  ####
dframe.ix[['A', 'B', 'C', 'D', 'E', 'F'], new_columns]
Example #56
0
class TestCategoricalIndex:
    def setup_method(self, method):
        """Build four frames indexed by a categorical column "B".

        df/df2 use string categories ("cab" / "cabe"); df3/df4 use integer
        categories [3, 2, 1], ordered and unordered respectively.
        """
        values = np.arange(6, dtype="int64")

        def build(labels, dtype):
            # one "A" integer column, indexed by categorical "B"
            return DataFrame({
                "A": values,
                "B": Series(labels).astype(dtype),
            }).set_index("B")

        self.df = build(list("aabbca"), CDT(list("cab")))
        self.df2 = build(list("aabbca"), CDT(list("cabe")))
        self.df3 = build([1, 1, 2, 1, 3, 2], CDT([3, 2, 1], ordered=True))
        self.df4 = build([1, 1, 2, 1, 3, 2], CDT([3, 2, 1], ordered=False))

    def test_loc_scalar(self):
        """.loc with a scalar category label: select, assign, and errors."""
        result = self.df.loc["a"]
        expected = DataFrame({
            "A": [0, 1, 5],
            "B": (Series(list("aaa")).astype(CDT(list("cab"))))
        }).set_index("B")
        tm.assert_frame_equal(result, expected)

        # assignment to an existing category hits all matching rows
        df = self.df.copy()
        df.loc["a"] = 20
        expected = DataFrame({
            "A": [20, 20, 2, 3, 4, 20],
            "B": (Series(list("aabbca")).astype(CDT(list("cab")))),
        }).set_index("B")
        tm.assert_frame_equal(df, expected)

        # value not in the categories
        with pytest.raises(KeyError, match=r"^'d'$"):
            df.loc["d"]

        msg = "cannot append a non-category item to a CategoricalIndex"
        with pytest.raises(TypeError, match=msg):
            df.loc["d"] = 10

        msg = ("cannot insert an item into a CategoricalIndex that is not"
               " already an existing category")
        with pytest.raises(TypeError, match=msg):
            df.loc["d", "A"] = 10
        with pytest.raises(TypeError, match=msg):
            df.loc["d", "C"] = 10

        # integer labels are rejected on a string CategoricalIndex
        msg = (
            r"cannot do label indexing on <class 'pandas\.core\.indexes\.category"
            r"\.CategoricalIndex'> with these indexers \[1\] of <class 'int'>")
        with pytest.raises(TypeError, match=msg):
            df.loc[1]

    def test_getitem_scalar(self):
        """Indexing a Series by a scalar categorical label returns the value."""
        cat_index = Categorical([Timestamp("12-31-1999"),
                                 Timestamp("12-31-2000")])
        ser = Series([1, 2], index=cat_index)

        # looking up the first category label equals positional access
        assert ser[cat_index[0]] == ser.iloc[0]

    def test_slicing_directly(self):
        """Scalar and slice indexing directly on a Categorical."""
        cat = Categorical(["a", "b", "c", "d", "a", "b", "c"])

        # a scalar position returns the category value itself
        assert cat[3] == "d"

        # a slice returns a Categorical that keeps the full category set
        piece = cat[3:5]
        expected = Categorical(["d", "a"], categories=["a", "b", "c", "d"])
        tm.assert_numpy_array_equal(piece._codes, expected._codes)
        tm.assert_index_equal(piece.categories, expected.categories)

    def test_slicing(self):
        """Slicing Series/DataFrames holding categorical data.

        Checks reversal of a categorical Series and iloc/loc row and range
        access on a frame with a cut-derived Interval column.
        """
        cat = Series(Categorical([1, 2, 3, 4]))
        # renamed from `reversed`, which shadowed the builtin
        rev = cat[::-1]
        exp = np.array([4, 3, 2, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(rev.__array__(), exp)

        df = DataFrame({"value": (np.arange(100) + 1).astype("int64")})
        df["D"] = pd.cut(df.value, bins=[0, 25, 50, 75, 100])

        # single positional row keeps the Interval value in column "D"
        expected = Series([11, Interval(0, 25)], index=["value", "D"], name=10)
        result = df.iloc[10]
        tm.assert_series_equal(result, expected)

        # positional range slice
        expected = DataFrame(
            {"value": np.arange(11, 21).astype("int64")},
            index=np.arange(10, 20).astype("int64"),
        )
        expected["D"] = pd.cut(expected.value, bins=[0, 25, 50, 75, 100])
        result = df.iloc[10:20]
        tm.assert_frame_equal(result, expected)

        # single label-based row
        expected = Series([9, Interval(0, 25)], index=["value", "D"], name=8)
        result = df.loc[8]
        tm.assert_series_equal(result, expected)

    def test_slicing_and_getting_ops(self):
        """Matrix of iloc/loc/iat/at/fancy indexing on a categorical column."""

        # systematically test the slicing operations:
        #  for all slicing ops:
        #   - returning a dataframe
        #   - returning a column
        #   - returning a row
        #   - returning a single value

        cats = Categorical(["a", "c", "b", "c", "c", "c", "c"],
                           categories=["a", "b", "c"])
        idx = Index(["h", "i", "j", "k", "l", "m", "n"])
        values = [1, 2, 3, 4, 5, 6, 7]
        df = DataFrame({"cats": cats, "values": values}, index=idx)

        # the expected values
        cats2 = Categorical(["b", "c"], categories=["a", "b", "c"])
        idx2 = Index(["j", "k"])
        values2 = [3, 4]

        # 2:4,: | "j":"k",:
        exp_df = DataFrame({"cats": cats2, "values": values2}, index=idx2)

        # :,"cats" | :,0
        exp_col = Series(cats, index=idx, name="cats")

        # "j",: | 2,:
        exp_row = Series(["b", 3],
                         index=["cats", "values"],
                         dtype="object",
                         name="j")

        # "j","cats | 2,0
        exp_val = "b"

        # iloc
        # frame
        res_df = df.iloc[2:4, :]
        tm.assert_frame_equal(res_df, exp_df)
        assert is_categorical_dtype(res_df["cats"])

        # row: a mixed row comes back as object dtype, category as str
        res_row = df.iloc[2, :]
        tm.assert_series_equal(res_row, exp_row)
        assert isinstance(res_row["cats"], str)

        # col: a full column keeps the categorical dtype
        res_col = df.iloc[:, 0]
        tm.assert_series_equal(res_col, exp_col)
        assert is_categorical_dtype(res_col)

        # single value
        res_val = df.iloc[2, 0]
        assert res_val == exp_val

        # loc
        # frame
        res_df = df.loc["j":"k", :]
        tm.assert_frame_equal(res_df, exp_df)
        assert is_categorical_dtype(res_df["cats"])

        # row
        res_row = df.loc["j", :]
        tm.assert_series_equal(res_row, exp_row)
        assert isinstance(res_row["cats"], str)

        # col
        res_col = df.loc[:, "cats"]
        tm.assert_series_equal(res_col, exp_col)
        assert is_categorical_dtype(res_col)

        # single value
        res_val = df.loc["j", "cats"]
        assert res_val == exp_val

        # ix (exercised through the equivalent loc calls)
        # frame
        # res_df = df.loc["j":"k",[0,1]] # doesn't work?
        res_df = df.loc["j":"k", :]
        tm.assert_frame_equal(res_df, exp_df)
        assert is_categorical_dtype(res_df["cats"])

        # row
        res_row = df.loc["j", :]
        tm.assert_series_equal(res_row, exp_row)
        assert isinstance(res_row["cats"], str)

        # col
        res_col = df.loc[:, "cats"]
        tm.assert_series_equal(res_col, exp_col)
        assert is_categorical_dtype(res_col)

        # single value
        res_val = df.loc["j", df.columns[0]]
        assert res_val == exp_val

        # iat
        res_val = df.iat[2, 0]
        assert res_val == exp_val

        # at
        res_val = df.at["j", "cats"]
        assert res_val == exp_val

        # fancy indexing: boolean masks on either column select row "j"
        exp_fancy = df.iloc[[2]]

        res_fancy = df[df["cats"] == "b"]
        tm.assert_frame_equal(res_fancy, exp_fancy)
        res_fancy = df[df["values"] == 3]
        tm.assert_frame_equal(res_fancy, exp_fancy)

        # get_value
        res_val = df.at["j", "cats"]
        assert res_val == exp_val

        # i : int, slice, or sequence of integers
        res_row = df.iloc[2]
        tm.assert_series_equal(res_row, exp_row)
        assert isinstance(res_row["cats"], str)

        res_df = df.iloc[slice(2, 4)]
        tm.assert_frame_equal(res_df, exp_df)
        assert is_categorical_dtype(res_df["cats"])

        res_df = df.iloc[[2, 3]]
        tm.assert_frame_equal(res_df, exp_df)
        assert is_categorical_dtype(res_df["cats"])

        res_col = df.iloc[:, 0]
        tm.assert_series_equal(res_col, exp_col)
        assert is_categorical_dtype(res_col)

        # selecting all columns reproduces the whole frame
        res_df = df.iloc[:, slice(0, 2)]
        tm.assert_frame_equal(res_df, df)
        assert is_categorical_dtype(res_df["cats"])

        res_df = df.iloc[:, [0, 1]]
        tm.assert_frame_equal(res_df, df)
        assert is_categorical_dtype(res_df["cats"])

    def test_slicing_doc_examples(self):
        """Slicing examples from the categorical docs (GH 7918)."""

        # GH 7918
        cats = Categorical(["a", "b", "b", "b", "c", "c", "c"],
                           categories=["a", "b", "c"])
        idx = Index(["h", "i", "j", "k", "l", "m", "n"])
        values = [1, 2, 2, 2, 3, 4, 5]
        df = DataFrame({"cats": cats, "values": values}, index=idx)

        # positional slice keeps the categorical dtype and full category set
        result = df.iloc[2:4, :]
        expected = DataFrame(
            {
                "cats": Categorical(["b", "b"], categories=["a", "b", "c"]),
                "values": [2, 2],
            },
            index=["j", "k"],
        )
        tm.assert_frame_equal(result, expected)

        result = df.iloc[2:4, :].dtypes
        expected = Series(["category", "int64"], ["cats", "values"])
        tm.assert_series_equal(result, expected)

        # label slice of a single column returns a categorical Series
        result = df.loc["h":"j", "cats"]
        expected = Series(
            Categorical(["a", "b", "b"], categories=["a", "b", "c"]),
            index=["h", "i", "j"],
            name="cats",
        )
        tm.assert_series_equal(result, expected)

        # column selection by an Index slice returns a one-column frame
        result = df.loc["h":"j", df.columns[0:1]]
        expected = DataFrame(
            {"cats": Categorical(["a", "b", "b"], categories=["a", "b", "c"])},
            index=["h", "i", "j"],
        )
        tm.assert_frame_equal(result, expected)

    def test_getitem_category_type(self):
        """iloc on a categorical Series keeps the full CategoricalDtype (GH 14580)."""
        ser = Series([1, 2, 3]).astype("category")
        full_dtype = CategoricalDtype([1, 2, 3])

        # (indexer, values the indexer should select); covers a slice,
        # an integer list, and a boolean array
        cases = [
            (slice(0, 2), [1, 2]),
            ([0, 1], [1, 2]),
            ([True, False, False], [1]),
        ]
        for indexer, values in cases:
            exp = Series(values).astype(full_dtype)
            tm.assert_series_equal(ser.iloc[indexer], exp)

    def test_loc_listlike(self):
        """.loc with a list of labels on a categorical index.

        Uses the class fixtures ``self.df`` / ``self.df2``; the expected
        index below shows their categories are ``list("cabe")`` — confirm
        against the fixture definitions.
        """

        # list of labels
        result = self.df.loc[["c", "a"]]
        expected = self.df.iloc[[4, 0, 1, 5]]
        tm.assert_frame_equal(result, expected, check_index_type=True)

        # 'e' is an unused category: the selected row is filled with NaN
        result = self.df2.loc[["a", "b", "e"]]
        exp_index = CategoricalIndex(list("aaabbe"),
                                     categories=list("cabe"),
                                     name="B")
        expected = DataFrame({"A": [0, 1, 5, 2, 3, np.nan]}, index=exp_index)
        tm.assert_frame_equal(result, expected, check_index_type=True)

        # element in the categories but not in the values
        with pytest.raises(KeyError, match=r"^'e'$"):
            self.df2.loc["e"]

        # assign is ok
        df = self.df2.copy()
        df.loc["e"] = 20
        result = df.loc[["a", "b", "e"]]
        exp_index = CategoricalIndex(list("aaabbe"),
                                     categories=list("cabe"),
                                     name="B")
        expected = DataFrame({"A": [0, 1, 5, 2, 3, 20]}, index=exp_index)
        tm.assert_frame_equal(result, expected)

        # an unmodified copy still yields NaN for the unused category
        df = self.df2.copy()
        result = df.loc[["a", "b", "e"]]
        exp_index = CategoricalIndex(list("aaabbe"),
                                     categories=list("cabe"),
                                     name="B")
        expected = DataFrame({"A": [0, 1, 5, 2, 3, np.nan]}, index=exp_index)
        tm.assert_frame_equal(result, expected, check_index_type=True)

        # not all labels in the categories
        with pytest.raises(
                KeyError,
                match="'a list-indexer must only include values that are in the"
                " categories'",
        ):
            self.df2.loc[["a", "d"]]

    def test_loc_listlike_dtypes(self):
        """.loc with list-like keys preserves the CategoricalIndex dtype.

        GH 11586: covers three index shapes — unique codes, duplicated
        codes, and an index carrying unused categories.
        """
        # GH 11586

        # unique categories and codes
        index = CategoricalIndex(["a", "b", "c"])
        df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=index)

        # unique slice
        res = df.loc[["a", "b"]]
        exp_index = CategoricalIndex(["a", "b"], categories=index.categories)
        exp = DataFrame({"A": [1, 2], "B": [4, 5]}, index=exp_index)
        tm.assert_frame_equal(res, exp, check_index_type=True)

        # duplicated slice
        res = df.loc[["a", "a", "b"]]

        exp_index = CategoricalIndex(["a", "a", "b"],
                                     categories=index.categories)
        exp = DataFrame({"A": [1, 1, 2], "B": [4, 4, 5]}, index=exp_index)
        tm.assert_frame_equal(res, exp, check_index_type=True)

        # a label outside the categories raises KeyError
        msg = "a list-indexer must only include values that are in the categories"
        with pytest.raises(KeyError, match=msg):
            df.loc[["a", "x"]]

        # duplicated categories and codes
        index = CategoricalIndex(["a", "b", "a"])
        df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=index)

        # unique slice; 'a' matches two rows, so it expands
        res = df.loc[["a", "b"]]
        exp = DataFrame({
            "A": [1, 3, 2],
            "B": [4, 6, 5]
        },
                        index=CategoricalIndex(["a", "a", "b"]))
        tm.assert_frame_equal(res, exp, check_index_type=True)

        # duplicated slice
        res = df.loc[["a", "a", "b"]]
        exp = DataFrame(
            {
                "A": [1, 3, 1, 3, 2],
                "B": [4, 6, 4, 6, 5]
            },
            index=CategoricalIndex(["a", "a", "a", "a", "b"]),
        )
        tm.assert_frame_equal(res, exp, check_index_type=True)

        msg = "a list-indexer must only include values that are in the categories"
        with pytest.raises(KeyError, match=msg):
            df.loc[["a", "x"]]

        # contains unused category
        index = CategoricalIndex(["a", "b", "a", "c"],
                                 categories=list("abcde"))
        df = DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]}, index=index)

        res = df.loc[["a", "b"]]
        exp = DataFrame(
            {
                "A": [1, 3, 2],
                "B": [5, 7, 6]
            },
            index=CategoricalIndex(["a", "a", "b"], categories=list("abcde")),
        )
        tm.assert_frame_equal(res, exp, check_index_type=True)

        # unused category 'e' selects a NaN row but keeps the dtype
        res = df.loc[["a", "e"]]
        exp = DataFrame(
            {
                "A": [1, 3, np.nan],
                "B": [5, 7, np.nan]
            },
            index=CategoricalIndex(["a", "a", "e"], categories=list("abcde")),
        )
        tm.assert_frame_equal(res, exp, check_index_type=True)

        # duplicated slice
        res = df.loc[["a", "a", "b"]]
        exp = DataFrame(
            {
                "A": [1, 3, 1, 3, 2],
                "B": [5, 7, 5, 7, 6]
            },
            index=CategoricalIndex(["a", "a", "a", "a", "b"],
                                   categories=list("abcde")),
        )
        tm.assert_frame_equal(res, exp, check_index_type=True)

        msg = "a list-indexer must only include values that are in the categories"
        with pytest.raises(KeyError, match=msg):
            df.loc[["a", "x"]]

    def test_get_indexer_array(self):
        """get_indexer maps an object ndarray of Timestamps positionally."""
        stamps = [
            Timestamp("1999-12-31 00:00:00"),
            Timestamp("2000-12-31 00:00:00"),
        ]
        ci = CategoricalIndex(stamps,
                              categories=stamps,
                              ordered=False,
                              dtype="category")

        # target is the same values, as a plain object-dtype array
        target = np.array(stamps, dtype=object)
        res = ci.get_indexer(target)
        tm.assert_numpy_array_equal(res, np.array([0, 1], dtype="intp"))

    def test_get_indexer_same_categories_same_order(self):
        """Identical category order: codes map straight through."""
        ci = CategoricalIndex(["a", "b"], categories=["a", "b"])
        target = CategoricalIndex(["b", "b"], categories=["a", "b"])

        res = ci.get_indexer(target)
        tm.assert_numpy_array_equal(res, np.array([1, 1], dtype="intp"))

    def test_get_indexer_same_categories_different_order(self):
        """Same categories in a different order still match by value.

        https://github.com/pandas-dev/pandas/issues/19551
        """
        ci = CategoricalIndex(["a", "b"], categories=["a", "b"])
        target = CategoricalIndex(["b", "b"], categories=["b", "a"])

        res = ci.get_indexer(target)
        tm.assert_numpy_array_equal(res, np.array([1, 1], dtype="intp"))

    def test_getitem_with_listlike(self):
        """Selecting all categorical columns round-trips a dummies frame (GH 16115)."""
        cats = Categorical([Timestamp("12-31-1999"), Timestamp("12-31-2000")])
        dummies = pd.get_dummies(cats)

        exp = DataFrame([[1, 0], [0, 1]],
                        dtype="uint8",
                        index=[0, 1],
                        columns=cats)
        # column selection via a list of the (categorical) column labels
        res = dummies[list(dummies.columns)]
        tm.assert_frame_equal(res, exp)

    def test_setitem_listlike(self):
        """Setting through an int64 ndarray indexer coerces it properly (GH 9469)."""
        np.random.seed(1)
        cat = Categorical(np.random.randint(0, 5, size=150000).astype(
            np.int8)).add_categories([-1000])

        where = np.array([100000]).astype(np.int64)
        cat[where] = -1000

        # -1000 was appended as the sixth category, so the stored code is 5
        tm.assert_numpy_array_equal(cat.codes[where],
                                    np.array([5], dtype="int8"))

    def test_ix_categorical_index(self):
        """Label-based .loc on frames with categorical index/columns.

        GH 12531: each selection must match the same selection on the
        plain frame, re-wrapped with the expected categorical axes.
        """
        # GH 12531
        df = DataFrame(np.random.randn(3, 3),
                       index=list("ABC"),
                       columns=list("XYZ"))
        cdf = df.copy()
        cdf.index = CategoricalIndex(df.index)
        cdf.columns = CategoricalIndex(df.columns)

        # scalar row label -> Series with categorical column labels
        expect = Series(df.loc["A", :], index=cdf.columns, name="A")
        tm.assert_series_equal(cdf.loc["A", :], expect)

        # scalar column label -> Series with categorical index
        expect = Series(df.loc[:, "X"], index=cdf.index, name="X")
        tm.assert_series_equal(cdf.loc[:, "X"], expect)

        # list of labels keeps the full category set in the result axis
        exp_index = CategoricalIndex(list("AB"), categories=["A", "B", "C"])
        expect = DataFrame(df.loc[["A", "B"], :],
                           columns=cdf.columns,
                           index=exp_index)
        tm.assert_frame_equal(cdf.loc[["A", "B"], :], expect)

        exp_columns = CategoricalIndex(list("XY"), categories=["X", "Y", "Z"])
        expect = DataFrame(df.loc[:, ["X", "Y"]],
                           index=cdf.index,
                           columns=exp_columns)
        tm.assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect)

        # non-unique
        df = DataFrame(np.random.randn(3, 3),
                       index=list("ABA"),
                       columns=list("XYX"))
        cdf = df.copy()
        cdf.index = CategoricalIndex(df.index)
        cdf.columns = CategoricalIndex(df.columns)

        # a scalar label now matches two rows -> DataFrame result
        exp_index = CategoricalIndex(list("AA"), categories=["A", "B"])
        expect = DataFrame(df.loc["A", :],
                           columns=cdf.columns,
                           index=exp_index)
        tm.assert_frame_equal(cdf.loc["A", :], expect)

        exp_columns = CategoricalIndex(list("XX"), categories=["X", "Y"])
        expect = DataFrame(df.loc[:, "X"],
                           index=cdf.index,
                           columns=exp_columns)
        tm.assert_frame_equal(cdf.loc[:, "X"], expect)

        # list selection with duplicates in the axis
        expect = DataFrame(
            df.loc[["A", "B"], :],
            columns=cdf.columns,
            index=CategoricalIndex(list("AAB")),
        )
        tm.assert_frame_equal(cdf.loc[["A", "B"], :], expect)

        expect = DataFrame(
            df.loc[:, ["X", "Y"]],
            index=cdf.index,
            columns=CategoricalIndex(list("XXY")),
        )
        tm.assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect)

    def test_read_only_source(self):
        """Indexing a frame backed by a read-only ndarray (GH 10043).

        Every indexer must give the same result as on an identical
        writable frame.
        """
        writable = DataFrame(np.eye(10))

        frozen_arr = np.eye(10)
        frozen_arr.setflags(write=False)
        frozen = DataFrame(frozen_arr)

        # list, single-element list, and slice indexers, through both
        # positional and label-based access
        for indexer in ([1, 2, 3], [1], slice(1, 3)):
            tm.assert_frame_equal(writable.iloc[indexer],
                                  frozen.iloc[indexer])
            tm.assert_frame_equal(writable.loc[indexer],
                                  frozen.loc[indexer])

        # scalar row selection returns a Series
        tm.assert_series_equal(writable.iloc[1], frozen.iloc[1])
        tm.assert_series_equal(writable.loc[1], frozen.loc[1])

    def test_reindexing(self):
        """Reindexing a frame whose index is categorical.

        Plain-list targets convert the result to a regular index;
        Categorical targets return an index of the same categorical
        (and ordered-ness) that was passed in.  ``self.df2`` is a class
        fixture with a duplicated index.
        """
        df = DataFrame({
            "A": np.arange(3, dtype="int64"),
            "B": Series(list("abc")).astype(CDT(list("cabe"))),
        }).set_index("B")

        # reindexing
        # convert to a regular index
        result = df.reindex(["a", "b", "e"])
        expected = DataFrame({
            "A": [0, 1, np.nan],
            "B": Series(list("abe"))
        }).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        result = df.reindex(["a", "b"])
        expected = DataFrame({
            "A": [0, 1],
            "B": Series(list("ab"))
        }).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        # 'e' is a category with no occurrences -> NaN row
        result = df.reindex(["e"])
        expected = DataFrame({
            "A": [np.nan],
            "B": Series(["e"])
        }).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        # 'd' is not even a category -> still a NaN row on a plain index
        result = df.reindex(["d"])
        expected = DataFrame({
            "A": [np.nan],
            "B": Series(["d"])
        }).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        # since we are actually reindexing with a Categorical
        # then return a Categorical
        cats = list("cabe")

        result = df.reindex(Categorical(["a", "e"], categories=cats))
        expected = DataFrame({
            "A": [0, np.nan],
            "B": Series(list("ae")).astype(CDT(cats))
        }).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        result = df.reindex(Categorical(["a"], categories=cats))
        expected = DataFrame({
            "A": [0],
            "B": Series(list("a")).astype(CDT(cats))
        }).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        result = df.reindex(["a", "b", "e"])
        expected = DataFrame({
            "A": [0, 1, np.nan],
            "B": Series(list("abe"))
        }).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        result = df.reindex(["a", "b"])
        expected = DataFrame({
            "A": [0, 1],
            "B": Series(list("ab"))
        }).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        result = df.reindex(["e"])
        expected = DataFrame({
            "A": [np.nan],
            "B": Series(["e"])
        }).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        # give back the type of categorical that we received
        result = df.reindex(
            Categorical(["a", "e"], categories=cats, ordered=True))
        expected = DataFrame({
            "A": [0, np.nan],
            "B":
            Series(list("ae")).astype(CDT(cats, ordered=True))
        }).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        result = df.reindex(Categorical(["a", "d"], categories=["a", "d"]))
        expected = DataFrame({
            "A": [0, np.nan],
            "B": Series(list("ad")).astype(CDT(["a", "d"]))
        }).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        # passed duplicate indexers are not allowed
        msg = "cannot reindex from a duplicate axis"
        with pytest.raises(ValueError, match=msg):
            self.df2.reindex(["a", "b"])

        # args NotImplemented ATM
        msg = r"argument {} is not implemented for CategoricalIndex\.reindex"
        with pytest.raises(NotImplementedError, match=msg.format("method")):
            df.reindex(["a"], method="ffill")
        with pytest.raises(NotImplementedError, match=msg.format("level")):
            df.reindex(["a"], level=1)
        with pytest.raises(NotImplementedError, match=msg.format("limit")):
            df.reindex(["a"], limit=2)

    def test_loc_slice(self):
        """Integer slicing on a CategoricalIndex is not implemented (GH9748)."""
        klass_repr = str(CategoricalIndex)
        kind_repr = str(int)
        msg = ("cannot do slice indexing on {klass} with these "
               r"indexers \[1\] of {kind}").format(klass=klass_repr,
                                                   kind=kind_repr)

        # .loc with integer bounds must raise rather than guess positions
        with pytest.raises(TypeError, match=msg):
            self.df.loc[1:5]

    def test_loc_and_at_with_categorical_index(self):
        """Scalar .loc and .at lookups on a categorical index (GH 20629)."""
        cat_idx = pd.CategoricalIndex(["A", "B", "C"])

        ser = Series([1, 2, 3], index=cat_idx)
        for accessor in (ser.loc, ser.at):
            assert accessor["A"] == 1

        frame = DataFrame([[1, 2], [3, 4], [5, 6]], index=cat_idx)
        for accessor in (frame.loc, frame.at):
            assert accessor["B", 1] == 4

    def test_boolean_selection(self):
        """Boolean masks from comparing a CategoricalIndex.

        ``self.df3`` has an ordered categorical index and ``self.df4`` an
        unordered one (see the commented index layouts below; confirm
        against the class fixtures).
        """

        df3 = self.df3
        df4 = self.df4

        # equality comparison is allowed for both ordered and unordered
        result = df3[df3.index == "a"]
        expected = df3.iloc[[]]
        tm.assert_frame_equal(result, expected)

        result = df4[df4.index == "a"]
        expected = df4.iloc[[]]
        tm.assert_frame_equal(result, expected)

        result = df3[df3.index == 1]
        expected = df3.iloc[[0, 1, 3]]
        tm.assert_frame_equal(result, expected)

        result = df4[df4.index == 1]
        expected = df4.iloc[[0, 1, 3]]
        tm.assert_frame_equal(result, expected)

        # since we have an ordered categorical

        # CategoricalIndex([1, 1, 2, 1, 3, 2],
        #         categories=[3, 2, 1],
        #         ordered=True,
        #         name='B')
        # NOTE: ordering is by category position (3 < 2 < 1), not value
        result = df3[df3.index < 2]
        expected = df3.iloc[[4]]
        tm.assert_frame_equal(result, expected)

        result = df3[df3.index > 1]
        expected = df3.iloc[[]]
        tm.assert_frame_equal(result, expected)

        # unordered
        # cannot be compared

        # CategoricalIndex([1, 1, 2, 1, 3, 2],
        #         categories=[3, 2, 1],
        #         ordered=False,
        #         name='B')
        msg = "Unordered Categoricals can only compare equality or not"
        with pytest.raises(TypeError, match=msg):
            df4[df4.index < 2]
        with pytest.raises(TypeError, match=msg):
            df4[df4.index > 1]

    def test_indexing_with_category(self):
        """Frame-wise == gives the same answer before and after casting to category.

        https://github.com/pandas-dev/pandas/issues/12564
        """
        frame = DataFrame({"A": ["foo", "bar", "baz"]})
        exp = DataFrame({"A": [True, False, False]})

        # object dtype
        tm.assert_frame_equal(frame[["A"]] == "foo", exp)

        # categorical dtype: identical comparison result
        frame["A"] = frame["A"].astype("category")
        tm.assert_frame_equal(frame[["A"]] == "foo", exp)

    def test_map_with_dict_or_series(self):
        """CategoricalIndex.map with a Series mapper and a dict mapper agree."""
        src = ["a", "B", 1, "a"]
        dst = ["one", 2, 3.0, "one"]
        cur_index = pd.CategoricalIndex(src, name="XXX")
        expected = pd.CategoricalIndex(dst,
                                       name="XXX",
                                       categories=[3.0, 2, "one"])

        # Series mapper (last pair dropped: "a" -> "one" already present)
        ser_mapper = pd.Series(dst[:-1], index=src[:-1])
        # Order of categories in output can be different
        tm.assert_index_equal(expected, cur_index.map(ser_mapper))

        # dict mapper built from the same pairs
        dict_mapper = dict(zip(src[:-1], dst[:-1]))
        # Order of categories in output can be different
        tm.assert_index_equal(expected, cur_index.map(dict_mapper))

    @pytest.mark.parametrize(
        "idx_values",
        [
            # python types
            [1, 2, 3],
            [-1, -2, -3],
            [1.5, 2.5, 3.5],
            [-1.5, -2.5, -3.5],
            # numpy int/uint
            *[
                np.array([1, 2, 3], dtype=dtype)
                for dtype in conftest.ALL_INT_DTYPES
            ],
            # numpy floats
            *[
                np.array([1.5, 2.5, 3.5], dtype=dtyp)
                for dtyp in conftest.FLOAT_DTYPES
            ],
            # numpy object
            np.array([1, "b", 3.5], dtype=object),
            # pandas scalars
            [Interval(1, 4), Interval(4, 6),
             Interval(6, 9)],
            [
                Timestamp(2019, 1, 1),
                Timestamp(2019, 2, 1),
                Timestamp(2019, 3, 1)
            ],
            [Timedelta(1, "d"),
             Timedelta(2, "d"),
             Timedelta(3, "D")],
            # pandas Integer arrays
            *[
                pd.array([1, 2, 3], dtype=dtype)
                for dtype in conftest.ALL_EA_INT_DTYPES
            ],
            # other pandas arrays
            pd.IntervalIndex.from_breaks([1, 4, 6, 9]).array,
            pd.date_range("2019-01-01", periods=3).array,
            pd.timedelta_range(start="1d", periods=3).array,
        ],
    )
    def test_loc_with_non_string_categories(self, idx_values, ordered_fixture):
        """.loc selection and assignment on non-string categorical indexes.

        GH-17569: exercised for every value type in the parametrize list;
        ``ordered_fixture`` presumably toggles CategoricalIndex ordered-ness
        (fixture defined in conftest — confirm there).
        """
        # GH-17569
        cat_idx = CategoricalIndex(idx_values, ordered=ordered_fixture)
        df = DataFrame({"A": ["foo", "bar", "baz"]}, index=cat_idx)

        # scalar selection
        result = df.loc[idx_values[0]]
        expected = Series(["foo"], index=["A"], name=idx_values[0])
        tm.assert_series_equal(result, expected)

        # list selection
        result = df.loc[idx_values[:2]]
        expected = DataFrame(["foo", "bar"], index=cat_idx[:2], columns=["A"])
        tm.assert_frame_equal(result, expected)

        # scalar assignment
        result = df.copy()
        result.loc[idx_values[0]] = "qux"
        expected = DataFrame({"A": ["qux", "bar", "baz"]}, index=cat_idx)
        tm.assert_frame_equal(result, expected)

        # list assignment
        result = df.copy()
        result.loc[idx_values[:2], "A"] = ["qux", "qux2"]
        expected = DataFrame({"A": ["qux", "qux2", "baz"]}, index=cat_idx)
        tm.assert_frame_equal(result, expected)
Example #57
0
def supervised_classifier(input_SOURCES, test_directory):
    """Train a bag-of-words page classifier and label a test directory.

    Trains a Multinomial Naive Bayes model (for per-page 'perfect'
    probabilities) and a LinearSVC (as a second opinion) on the labelled
    directories in *input_SOURCES*, then classifies every text file found
    under ``<Address(1) first line>/<test_directory>``.

    Parameters
    ----------
    input_SOURCES : iterable of (path, classification)
        Training directories and the class label applied to every file
        inside each one.
    test_directory : str
        Directory (relative to the project root returned by ``Address``)
        containing the pages to classify.

    Returns
    -------
    list of [page_name, label]
        One entry per test page; at most one page is labelled 'perfect'
        (only when the SVC agrees), all others 'bad'.
    """
    NEWLINE = '\n'
    SKIP_FILES = {'cmds'}

    def read_files(path):
        # Yield (file_path, contents) for every readable file under *path*.
        # os.walk already descends into subdirectories, so no explicit
        # recursion is needed (the original recursive call built a
        # generator that was never consumed and therefore did nothing).
        for root, dir_names, file_names in os.walk(path):
            for file_name in file_names:
                if file_name in SKIP_FILES:
                    continue
                file_path = os.path.join(root, file_name)
                if not os.path.isfile(file_path):
                    continue
                # past_header starts True, so every line is kept; the
                # NEWLINE branch is retained for parity with the original
                # header-skipping variant (past_header, lines = False, []).
                past_header, lines = True, []
                with open(file_path, errors='ignore') as f:
                    for line in f:
                        if past_header:
                            lines.append(line)
                        elif line == NEWLINE:
                            past_header = True
                yield file_path, NEWLINE.join(lines)

    def build_data_test_frame(path):
        # Unlabelled frame (one row per file) used for prediction.
        rows = []
        index = []
        for file_name, text in read_files(path):
            rows.append({'text': text})
            index.append(file_name)
        return DataFrame(rows, index=index)

    def build_data_frame(path, classification):
        # Labelled frame for training: every file gets *classification*.
        rows = []
        index = []
        for file_name, text in read_files(path):
            rows.append({'text': text, 'class': classification})
            index.append(file_name)
        return DataFrame(rows, index=index)

    # --- Training -----------------------------------------------------
    # Address(1) is a project helper returning newline-separated paths;
    # the first entry is the root for the test set.
    Path_extracted = Address(1).split("\n")
    Path_extracted1 = Path_extracted[0]

    data = DataFrame({'text': [], 'class': []})
    for path, classification in input_SOURCES:
        data = data.append(build_data_frame(path, classification), sort=True)
    # Shuffle so runs of a single class do not bias the fit.
    data = data.reindex(numpy.random.permutation(data.index))

    # Bag-of-words counts (swap in TfidfVectorizer(use_idf=True) to use
    # TF-IDF features instead).
    count_vectorizer = CountVectorizer(stop_words=None)
    counts = count_vectorizer.fit_transform(data['text'].values)

    # Multinomial Naive Bayes supplies class probabilities per page.
    classifier = MultinomialNB()
    targets = data['class'].values
    classifier.fit(counts, targets)

    # Column index of the 'perfect' class in predict_proba's output
    # (last match wins, matching the original loop; defaults to 0).
    perfect = 0
    for i, label in enumerate(classifier.classes_):
        if label == 'perfect':
            perfect = i

    # LinearSVC acts as a second opinion on the best candidate page.
    clf = LinearSVC()
    clf.fit(counts, targets)

    # --- Testing ------------------------------------------------------
    SOURCEStest = [os.path.join(Path_extracted1,
                                test_directory)]  # test-set page directory
    data_test = DataFrame({'text': []})
    for path in SOURCEStest:
        data_test = data_test.append(build_data_test_frame(path))
    examples = data_test['text'].values
    example_counts = count_vectorizer.transform(examples)

    # Naive Bayes MAP decision and SVC labels for every page.
    predictions = classifier.predict(example_counts)
    pass_list = clf.predict(example_counts)

    # Pick the page with the highest 'perfect' probability; accept it
    # only when the SVC agrees, otherwise everything is marked 'bad'.
    perfect_prob = [
        pred_prob[perfect]
        for pred_prob in classifier.predict_proba(example_counts)
    ]
    perfect_idx = perfect_prob.index(max(perfect_prob))
    if pass_list[perfect_idx] == 'perfect':
        predictions[perfect_idx] = 'perfect'
        for i in range(len(predictions)):
            if i != perfect_idx:
                predictions[i] = 'bad'
    else:
        for i in range(len(predictions)):
            predictions[i] = 'bad'

    # Pair each page (extension stripped) with its final label.
    page_classification_result = []
    for path in SOURCEStest:
        for page, label in zip(os.listdir(path), predictions):
            page_name = page.split(".txt")[0]
            page_classification_result.append([page_name, label])
    return page_classification_result
Example #58
0
ser2

# fill_value supplies a default for labels that were not present
ser3 = ser2.reindex(['A', 'B', 'C', 'D', 'E', 'F', 'G'], fill_value=0)
ser3

ser4 = Series(['USA', 'Mexico', 'Canada'], index=[0, 5, 10])
ser4
ser4.reindex(range(15), method='ffill')  # reindexing series, forward-fill gaps

# DFs
from numpy.random import randn
df1 = DataFrame(randn(25).reshape((5, 5)),
                index=['A', 'B', 'D', 'E', 'F'],  # missed C
                columns=['col1', 'col2', 'col3', 'col4', 'col5'])
df1
df2 = df1.reindex(['A', 'B', 'C', 'D', 'E', 'F'])  # reindexing rows of a df
df2

new_columns = ['col1', 'col2', 'col3', 'col4', 'col5', 'col6']
df2.reindex(columns=new_columns)  # reindexing columns of a df

# .ix has been removed from pandas; reindex rows and columns in one call
# instead (.loc would raise KeyError for the missing 'C' label, while
# reindex fills it with NaN like .ix used to)
df1
df1.reindex(index=['A', 'B', 'C', 'D', 'E', 'F'], columns=new_columns)

### Dropping entries
ser1 = Series(np.arange(3), index=['a', 'b', 'c'])
ser1
ser1.drop('b')

df1 = DataFrame(np.arange(9).reshape([3, 3]), index=['SF', 'LA', 'NYC'], columns=['pop', 'size', 'year'])
Example #59
0
    def test_dups_fancy_indexing(self):
        """Label-based fancy indexing when the axis contains duplicate
        labels and/or labels missing from the index.

        NOTE(review): this test targets legacy pandas behavior — it uses
        `pandas.util.testing` (removed in 2.0), `.ix` (removed in 1.0),
        and relies on `.loc` with missing labels returning NaN-filled rows
        (a KeyError in pandas >= 1.0). Confirm the pinned pandas version
        before reusing.
        """

        # GH 3455
        from pandas.util.testing import makeCustomDataframe as mkdf
        df = mkdf(10, 3)
        df.columns = ['a', 'a', 'b']
        # Selecting ['b', 'a'] returns 'b' once plus BOTH duplicate 'a'
        # columns, in selection order.
        result = df[['b', 'a']].columns
        expected = Index(['b', 'a', 'a'])
        tm.assert_index_equal(result, expected)

        # across dtypes
        df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']],
                       columns=list('aaaaaaa'))
        # head()/str() exercised only to smoke-test repr with all-dup columns
        df.head()
        str(df)
        result = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']])
        result.columns = list('aaaaaaa')

        # TODO(wesm): unused?
        df_v = df.iloc[:, 4]  # noqa
        res_v = result.iloc[:, 4]  # noqa

        tm.assert_frame_equal(df, result)

        # GH 3561, dups not in selected order
        df = DataFrame(
            {
                'test': [5, 7, 9, 11],
                'test1': [4., 5, 6, 7],
                'other': list('abcd')
            },
            index=['A', 'A', 'B', 'C'])
        rows = ['C', 'B']
        expected = DataFrame(
            {
                'test': [11, 9],
                'test1': [7., 6],
                'other': ['d', 'c']
            },
            index=rows)
        result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        # same selection via an Index object must behave identically
        result = df.loc[Index(rows)]
        tm.assert_frame_equal(result, expected)

        # 'E' is absent from the index: legacy .loc fills its row with NaN
        rows = ['C', 'B', 'E']
        expected = DataFrame(
            {
                'test': [11, 9, np.nan],
                'test1': [7., 6, np.nan],
                'other': ['d', 'c', np.nan]
            },
            index=rows)

        result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        # see GH5553, make sure we use the right indexer
        rows = ['F', 'G', 'H', 'C', 'B', 'E']
        expected = DataFrame(
            {
                'test': [np.nan, np.nan, np.nan, 11, 9, np.nan],
                'test1': [np.nan, np.nan, np.nan, 7., 6, np.nan],
                'other': [np.nan, np.nan, np.nan, 'd', 'c', np.nan]
            },
            index=rows)
        result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        # inconsistent returns for unique/duplicate indices when values are
        # missing
        df = DataFrame(np.random.randn(4, 3), index=list('ABCD'))
        expected = df.reindex(['E'])

        # duplicate-index frame via deprecated .ix must match reindex on
        # a unique-index frame (warnings suppressed for the .ix deprecation)
        dfnu = DataFrame(np.random.randn(5, 3), index=list('AABCD'))
        with catch_warnings(record=True):
            result = dfnu.ix[['E']]
        tm.assert_frame_equal(result, expected)

        # ToDo: check_index_type can be True after GH 11497

        # GH 4619; duplicate indexer with missing label
        df = DataFrame({"A": [0, 1, 2]})
        result = df.loc[[0, 8, 0]]
        expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0])
        tm.assert_frame_equal(result, expected, check_index_type=False)

        # same, but with object (string) dtype
        df = DataFrame({"A": list('abc')})
        result = df.loc[[0, 8, 0]]
        expected = DataFrame({"A": ['a', np.nan, 'a']}, index=[0, 8, 0])
        tm.assert_frame_equal(result, expected, check_index_type=False)

        # non unique with non unique selector
        df = DataFrame({'test': [5, 7, 9, 11]}, index=['A', 'A', 'B', 'C'])
        expected = DataFrame({'test': [5, 7, 5, 7, np.nan]},
                             index=['A', 'A', 'A', 'A', 'E'])
        result = df.loc[['A', 'A', 'E']]
        tm.assert_frame_equal(result, expected)

        # GH 5835
        # dups on index and missing values
        df = DataFrame(np.random.randn(5, 5),
                       columns=['A', 'B', 'B', 'B', 'A'])

        # expected: all dup 'A'/'B' columns plus an all-NaN 'C' column
        expected = pd.concat([
            df.loc[:, ['A', 'B']],
            DataFrame(np.nan, columns=['C'], index=df.index)
        ],
                             axis=1)
        result = df.loc[:, ['A', 'B', 'C']]
        tm.assert_frame_equal(result, expected)

        # GH 6504, multi-axis indexing
        df = DataFrame(np.random.randn(9, 2),
                       index=[1, 1, 1, 2, 2, 2, 3, 3, 3],
                       columns=['a', 'b'])

        # row selection on duplicate labels picks all matching rows in order
        expected = df.iloc[0:6]
        result = df.loc[[1, 2]]
        tm.assert_frame_equal(result, expected)

        # full column selection is a no-op
        expected = df
        result = df.loc[:, ['a', 'b']]
        tm.assert_frame_equal(result, expected)

        # both axes at once
        expected = df.iloc[0:6, :]
        result = df.loc[[1, 2], ['a', 'b']]
        tm.assert_frame_equal(result, expected)
Example #60
0
    rows = []
    index = []
    for file_name, text in read_files(path):
        rows.append({'text': text, 'class': classification})
        index.append(file_name)

    data_frame = DataFrame(rows, index=index)
    return data_frame

# read the corpus data
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# appending inside the loop is quadratic — collect the per-source frames
# and concatenate once instead.
import pandas

data = DataFrame({'text': [], 'class': []})
frames = [build_data_frame(path, classification)
          for path, classification in SOURCES]
data = pandas.concat([data] + frames)

# randomize corpus data (shuffle rows by permuting the index labels)
data = data.reindex(numpy.random.permutation(data.index))

# create the data transformation and classification pipeline
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html
# http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html
pipeline = Pipeline([
    ('vect',    CountVectorizer(stop_words='english', lowercase=True)),
    ('tfidf',   TfidfTransformer(use_idf=True, smooth_idf=False)),
    ('clf',     MultinomialNB(alpha=1.0, fit_prior=True))
])

# do k-fold cross-validation
# https://en.wikipedia.org/wiki/Cross-validation_(statistics)
# http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.KFold.html
# NOTE(review): KFold(n=..., n_folds=...) is the pre-0.18
# sklearn.cross_validation API; modern scikit-learn uses
# KFold(n_splits=6).split(data). Left unchanged to match the file's
# sklearn version — confirm before upgrading.
k_fold = KFold(n=len(data), n_folds=6)