Example #1
0
    def test_write_infer(self, ext, get_random_path):
        """Round-trip: to_pickle infers compression from the extension,
        and manually decompressing the file recovers the original frame."""
        base = get_random_path
        compressed_path = base + ext
        raw_path = base + ".raw"

        # map the extension back to its compression codec (None if unknown)
        compression = next(
            (name for name, candidate
             in self._compression_to_extension.items()
             if candidate == ext),
            None,
        )

        with tm.ensure_clean(compressed_path) as p1, \
                tm.ensure_clean(raw_path) as p2:
            df = tm.makeDataFrame()

            # write compressed; method inferred from the file extension
            df.to_pickle(p1)

            # decompress the pickle into a raw file
            with tm.decompress_file(p1, compression=compression) as f:
                with open(p2, "wb") as fh:
                    fh.write(f.read())

            # reading the raw payload must reproduce the original frame
            df2 = pd.read_pickle(p2, compression=None)
            tm.assert_frame_equal(df, df2)
Example #2
0
    def test_excel_cell_error_na(self):
        """An Excel cell error should be read back as NaN."""
        _skip_if_no_xlrd()

        book = ExcelFile(os.path.join(self.dirpath, 'test3.xls'))
        parsed = book.parse('Sheet1')
        tm.assert_frame_equal(parsed, DataFrame([[np.nan]], columns=['Test']))
Example #3
0
 def test_query_single_element_booleans(self, parser, engine):
     columns = 'bid', 'bidsize', 'ask', 'asksize'
     data = np.random.randint(2, size=(1, len(columns))).astype(bool)
     df = DataFrame(data, columns=columns)
     res = df.query('bid & ask', engine=engine, parser=parser)
     expected = df[df.bid & df.ask]
     assert_frame_equal(res, expected)
Example #4
0
 def test_groupby_to_series_to_frame_2(self):
     """groupby_to_series_to_frame(use_apply=False) matches plain groupby.apply."""
     df = pd.DataFrame({'a': [6, 2, 2], 'b': [4, 5, 6]})
     labels = ['g1', 'g1', 'g2']
     expected = df.groupby(labels).apply(frame_to_series)
     actual = pandas_easy.groupby_to_series_to_frame(
         df, frame_to_series, 1, use_apply=False, by=labels)
     assert_frame_equal(actual, expected)
Example #5
0
    def test_excel_stop_iterator(self):
        """A small sheet parses completely without a premature StopIteration."""
        _skip_if_no_xlrd()

        book = ExcelFile(os.path.join(self.dirpath, 'test2.xls'))
        parsed = book.parse('Sheet1')
        expected = DataFrame([['aaaa', 'bbbbb']], columns=['Test', 'Test1'])
        tm.assert_frame_equal(parsed, expected)
Example #6
0
    def test_nested_scope(self):
        """Variable resolution in query/eval: locals resolve in pd.eval, but
        bare or '@'-prefixed frame names inside df.query raise."""
        from pandas.core.computation.ops import UndefinedVariableError
        engine = self.engine
        parser = self.parser
        # smoke test
        x = 1  # noqa
        result = pd.eval('x + 1', engine=engine, parser=parser)
        assert result == 2

        df = DataFrame(np.random.randn(5, 3))
        df2 = DataFrame(np.random.randn(5, 3))

        # don't have the pandas parser
        with pytest.raises(SyntaxError):
            df.query('(@df>0) & (@df2>0)', engine=engine, parser=parser)

        # bare frame names are not defined inside the query scope
        with pytest.raises(UndefinedVariableError):
            df.query('(df>0) & (df2>0)', engine=engine, parser=parser)

        # pd.eval, by contrast, sees the enclosing locals
        expected = df[(df > 0) & (df2 > 0)]
        result = pd.eval('df[(df > 0) & (df2 > 0)]', engine=engine,
                         parser=parser)
        assert_frame_equal(expected, result)

        # deeper nesting: frame indexed by a frame expression
        expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]
        result = pd.eval('df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]',
                         engine=engine, parser=parser)
        assert_frame_equal(expected, result)
Example #7
0
 def test_query_with_nested_special_character(self, parser, engine):
     """A quoted '&' inside a string literal is data, not a boolean operator."""
     skip_if_no_pandas_parser(parser)
     frame = DataFrame({'a': ['a', 'b', 'test & test'],
                        'b': [1, 2, 3]})
     queried = frame.query('a == "test & test"', parser=parser, engine=engine)
     masked = frame[frame.a == 'test & test']
     assert_frame_equal(queried, masked)
Example #8
0
    def test_sort_index_different_sortorder(self):
        """Per-column ascending flags: sort_values / sort_index with
        ascending=[1, 0] must match a manual lexsort (GH 9816)."""
        A = np.arange(20).repeat(5)
        B = np.tile(np.arange(5), 20)

        indexer = np.random.permutation(100)
        A = A.take(indexer)
        B = B.take(indexer)

        df = DataFrame({'A': A, 'B': B,
                        'C': np.random.randn(100)})

        # use .sort_values #9816
        # the legacy sort_index(by=...) spelling must warn
        with tm.assert_produces_warning(FutureWarning):
            df.sort_index(by=['A', 'B'], ascending=[1, 0])
        result = df.sort_values(by=['A', 'B'], ascending=[1, 0])

        # emulate "A ascending, B descending" by negating B in the lexsort
        ex_indexer = np.lexsort((df.B.max() - df.B, df.A))
        expected = df.take(ex_indexer)
        assert_frame_equal(result, expected)

        # test with multiindex, too
        idf = df.set_index(['A', 'B'])

        result = idf.sort_index(ascending=[1, 0])
        expected = idf.take(ex_indexer)
        assert_frame_equal(result, expected)

        # also, Series!
        result = idf['C'].sort_index(ascending=[1, 0])
        assert_series_equal(result, expected['C'])
Example #9
0
    def test_query_python(self):
        """query/eval with engine='python' reproduce the fixture results."""
        frame = self.df
        assert_frame_equal(frame.query('A>0', engine='python'),
                           self.expected1)
        assert_series_equal(frame.eval('A+1', engine='python'),
                            self.expected2, check_names=False)
Example #10
0
    def test_groupby_groups_datetimeindex(self):
        """groupby over a DatetimeIndex: .groups keys and string-keyed
        get_group lookups (GH#1430, GH#11442)."""
        # GH#1430
        periods = 1000
        ind = pd.date_range(start='2012/1/1', freq='5min', periods=periods)
        df = DataFrame({'high': np.arange(periods),
                        'low': np.arange(periods)}, index=ind)
        grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))

        # it works!
        groups = grouped.groups
        # grouping by a datetime-returning callable yields datetime keys
        assert isinstance(list(groups.keys())[0], datetime)

        # GH#11442
        index = pd.date_range('2015/01/01', periods=5, name='date')
        df = pd.DataFrame({'A': [5, 6, 7, 8, 9],
                           'B': [1, 2, 3, 4, 5]}, index=index)
        result = df.groupby(level='date').groups
        dates = ['2015-01-05', '2015-01-04', '2015-01-03',
                 '2015-01-02', '2015-01-01']
        expected = {pd.Timestamp(date): pd.DatetimeIndex([date], name='date')
                    for date in dates}
        tm.assert_dict_equal(result, expected)

        # get_group accepts a date string and returns the single-row frame
        grouped = df.groupby(level='date')
        for date in dates:
            result = grouped.get_group(date)
            data = [[df.loc[date, 'A'], df.loc[date, 'B']]]
            expected_index = pd.DatetimeIndex([date], name='date')
            expected = pd.DataFrame(data,
                                    columns=list('AB'),
                                    index=expected_index)
            tm.assert_frame_equal(result, expected)
Example #11
0
 def test_stable_descending_sort(self):
     # GH #6399
     df = DataFrame([[2, 'first'], [2, 'second'], [1, 'a'], [1, 'b']],
                    columns=['sort_col', 'order'])
     sorted_df = df.sort_values(by='sort_col', kind='mergesort',
                                ascending=False)
     assert_frame_equal(df, sorted_df)
Example #12
0
        def run_tests(df, rhs, right):
            """Assign ``rhs`` into the same region of ``df`` via .loc, .iloc
            and legacy .ix, using label, positional and slice indexers; each
            variant must produce ``right``.
            """
            # label, index, slice
            r, i, s = list('bcd'), [1, 2, 3], slice(1, 4)
            c, j, l = ['joe', 'jolie'], [1, 2], slice(1, 3)

            left = df.copy()
            left.loc[r, c] = rhs
            tm.assert_frame_equal(left, right)

            left = df.copy()
            left.iloc[i, j] = rhs
            tm.assert_frame_equal(left, right)

            # .ix is deprecated; capture its warnings so they don't escape
            left = df.copy()
            with catch_warnings(record=True):
                left.ix[s, l] = rhs
            tm.assert_frame_equal(left, right)

            left = df.copy()
            with catch_warnings(record=True):
                left.ix[i, j] = rhs
            tm.assert_frame_equal(left, right)

            left = df.copy()
            with catch_warnings(record=True):
                left.ix[r, c] = rhs
            tm.assert_frame_equal(left, right)
Example #13
0
    def test_coercion_with_loc(self):
        """Setting a cell to None via .loc coerces dtype per the fixtures."""
        for start_data, expected_result, in self.EXPECTED_SINGLE_ROW_RESULTS:
            frame = DataFrame({'foo': start_data})
            frame.loc[0, ['foo']] = None
            tm.assert_frame_equal(frame, DataFrame({'foo': expected_result}))
Example #14
0
    def test_pivot_datetime_tz(self):
        """pivot_table with tz-aware datetimes as index and column keys
        keeps the timezone information in the result."""
        dates1 = ['2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00',
                  '2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00']
        dates2 = ['2013-01-01 15:00:00', '2013-01-01 15:00:00', '2013-01-01 15:00:00',
                  '2013-02-01 15:00:00', '2013-02-01 15:00:00', '2013-02-01 15:00:00']
        df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'],
                        'dt1': dates1, 'dt2': dates2,
                        'value1': range(6), 'value2': [1, 2] * 3})
        # localize the two datetime columns to different time zones
        df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d, tz='US/Pacific'))
        df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d, tz='Asia/Tokyo'))

        # case 1: tz-aware index, 'label' columns, single value column
        exp_idx = pd.DatetimeIndex(['2011-07-19 07:00:00', '2011-07-19 08:00:00',
                                    '2011-07-19 09:00:00'], tz='US/Pacific', name='dt1')
        exp_col1 = Index(['value1', 'value1'])
        exp_col2 = Index(['a', 'b'], name='label')
        exp_col = MultiIndex.from_arrays([exp_col1, exp_col2])
        expected = DataFrame([[0, 3], [1, 4], [2, 5]],
                             index=exp_idx, columns=exp_col)
        result = pivot_table(df, index=['dt1'], columns=['label'], values=['value1'])
        tm.assert_frame_equal(result, expected)


        # case 2: tz-aware columns with two aggfuncs (sum, mean); the
        # column MultiIndex is (aggfunc, value column, dt2 timestamp)
        exp_col1 = Index(['sum', 'sum', 'sum', 'sum', 'mean', 'mean', 'mean', 'mean'])
        exp_col2 = Index(['value1', 'value1', 'value2', 'value2'] * 2)
        exp_col3 = pd.DatetimeIndex(['2013-01-01 15:00:00', '2013-02-01 15:00:00'] * 4,
                                    tz='Asia/Tokyo', name='dt2')
        exp_col = MultiIndex.from_arrays([exp_col1, exp_col2, exp_col3])
        expected = DataFrame(np.array([[0, 3, 1, 2, 0, 3, 1, 2], [1, 4, 2, 1, 1, 4, 2, 1],
                              [2, 5, 1, 2, 2, 5, 1, 2]]), index=exp_idx, columns=exp_col)

        result = pivot_table(df, index=['dt1'], columns=['dt2'], values=['value1', 'value2'],
                             aggfunc=[np.sum, np.mean])
        tm.assert_frame_equal(result, expected)
Example #15
0
 def test_float_index_to_mixed(self):
     """Adding a string-keyed column to float-keyed columns keeps the data."""
     frame = DataFrame({0.0: np.random.rand(10), 1.0: np.random.rand(10)})
     frame['a'] = 10
     expected = DataFrame({0.0: frame[0.0],
                           1.0: frame[1.0],
                           'a': [10] * 10})
     tm.assert_frame_equal(expected, frame)
Example #16
0
 def test_parse_date_float(self, data, expected, parse_dates):
     """Float columns that fail date parsing keep full precision (gh-2697)."""
     parsed = self.read_csv(StringIO(data), parse_dates=parse_dates)
     # data must be untouched, i.e. float precision unchanged
     tm.assert_frame_equal(parsed, expected)
Example #17
0
    def test_ancom_3class_anova(self):
        """ancom with a 3-level metadata column (ANOVA significance test)
        writes the expected W statistics and rejection flags to ancom.tsv."""
        # 7 features (rows) x 6 samples (columns), transposed to samples x features
        t = pd.DataFrame([[9, 9, 19, 19, 29, 29],
                          [10, 11, 20, 20, 29, 28],
                          [9, 10, 9, 9, 10, 9],
                          [9, 10, 9, 9, 9, 8],
                          [9, 10, 9, 9, 9, 9],
                          [9, 10, 9, 9, 9, 10],
                          [9, 12, 9, 9, 9, 11]],
                         index=['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7'],
                         columns=['S1', 'S2', 'S3', 'S4', 'S5', 'S6']).T
        # three groups ('0', '1', '2') of two samples each
        c = qiime2.CategoricalMetadataColumn(
                pd.Series(['0', '0', '1', '1', '2', '2'], name='n',
                          index=pd.Index(['S1', 'S2', 'S3',
                                          'S4', 'S5', 'S6'], name='id'))
        )
        # t+1: presumably shifts counts to strictly positive values for the
        # log-ratio transform — TODO confirm against ancom's requirements
        ancom(output_dir=self.temp_dir.name, table=t+1, metadata=c)

        res = pd.read_csv(os.path.join(self.temp_dir.name, 'ancom.tsv'),
                          index_col=0, sep='\t')
        # only O1/O2 differ strongly between groups, so only they reject
        exp = pd.DataFrame(
            {'W': np.array([5, 5, 3, 3, 2, 2, 2]),
             'Reject null hypothesis': np.array([True, True, False, False,
                                                 False, False, False],
                                                dtype=bool)},
            index=['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7'],)
        pdt.assert_frame_equal(res, exp)
Example #18
0
    def test_yy_format_with_yearfirst(self):
        """Two-digit-year date+time columns combine into a single parsed
        'date_time' index, selected by name or by position (gh-217)."""
        data = """date,time,B,C
090131,0010,1,2
090228,1020,3,4
090331,0830,5,6
"""

        # See gh-217
        import dateutil
        # NOTE(review): this skips when dateutil is *at least* 2.5.0, yet the
        # message talks about "< 2.5.0" — confirm which direction is intended.
        if dateutil.__version__ >= LooseVersion('2.5.0'):
            pytest.skip("testing yearfirst=True not-support"
                        "on datetutil < 2.5.0 this works but"
                        "is wrong")

        # combine columns selected by name
        rs = self.read_csv(StringIO(data), index_col=0,
                           parse_dates=[['date', 'time']])
        idx = DatetimeIndex([datetime(2009, 1, 31, 0, 10, 0),
                             datetime(2009, 2, 28, 10, 20, 0),
                             datetime(2009, 3, 31, 8, 30, 0)],
                            dtype=object, name='date_time')
        xp = DataFrame({'B': [1, 3, 5], 'C': [2, 4, 6]}, idx)
        tm.assert_frame_equal(rs, xp)

        # same thing, columns selected by position
        rs = self.read_csv(StringIO(data), index_col=0,
                           parse_dates=[[0, 1]])
        idx = DatetimeIndex([datetime(2009, 1, 31, 0, 10, 0),
                             datetime(2009, 2, 28, 10, 20, 0),
                             datetime(2009, 3, 31, 8, 30, 0)],
                            dtype=object, name='date_time')
        xp = DataFrame({'B': [1, 3, 5], 'C': [2, 4, 6]}, idx)
        tm.assert_frame_equal(rs, xp)
Example #19
0
    def test_dateparser_resolution_if_not_ns(self):
        """A date_parser returning second-resolution datetime64 values still
        builds a correct MultiIndex (GH 10245)."""
        data = """\
date,time,prn,rxstatus
2013-11-03,19:00:00,126,00E80000
2013-11-03,19:00:00,23,00E80000
2013-11-03,19:00:00,13,00E80000
"""

        def date_parser(date, time):
            # deliberately produce datetime64[s] (not the default [ns])
            datetime = np_array_datetime64_compat(
                date + 'T' + time + 'Z', dtype='datetime64[s]')
            return datetime

        df = self.read_csv(StringIO(data), date_parser=date_parser,
                           parse_dates={'datetime': ['date', 'time']},
                           index_col=['datetime', 'prn'])

        datetimes = np_array_datetime64_compat(['2013-11-03T19:00:00Z'] * 3,
                                               dtype='datetime64[s]')
        df_correct = DataFrame(data={'rxstatus': ['00E80000'] * 3},
                               index=MultiIndex.from_tuples(
                                   [(datetimes[0], 126),
                                    (datetimes[1], 23),
                                    (datetimes[2], 13)],
                               names=['datetime', 'prn']))
        tm.assert_frame_equal(df, df_correct)
    def test_reindex(self):
        """DataFrame.reindex: value preservation, NaN fill for new labels,
        empty/non-contiguous/list-index and copy=False corner cases."""
        newFrame = self.frame.reindex(self.ts1.index)

        # labels present in the original keep their values (or stay NaN);
        # labels new to the frame are filled with NaN
        for col in newFrame.columns:
            for idx, val in compat.iteritems(newFrame[col]):
                if idx in self.frame.index:
                    if np.isnan(val):
                        self.assertTrue(np.isnan(self.frame[col][idx]))
                    else:
                        self.assertEqual(val, self.frame[col][idx])
                else:
                    self.assertTrue(np.isnan(val))

        # every column shares the reindexed frame's index
        for col, series in compat.iteritems(newFrame):
            self.assertTrue(tm.equalContents(series.index, newFrame.index))
        emptyFrame = self.frame.reindex(Index([]))
        self.assertEqual(len(emptyFrame.index), 0)

        # Cython code should be unit-tested directly
        nonContigFrame = self.frame.reindex(self.ts1.index[::2])

        for col in nonContigFrame.columns:
            for idx, val in compat.iteritems(nonContigFrame[col]):
                if idx in self.frame.index:
                    if np.isnan(val):
                        self.assertTrue(np.isnan(self.frame[col][idx]))
                    else:
                        self.assertEqual(val, self.frame[col][idx])
                else:
                    self.assertTrue(np.isnan(val))

        for col, series in compat.iteritems(nonContigFrame):
            self.assertTrue(tm.equalContents(series.index,
                                             nonContigFrame.index))

        # corner cases

        # Same index, copies values but not index if copy=False
        newFrame = self.frame.reindex(self.frame.index, copy=False)
        self.assertIs(newFrame.index, self.frame.index)

        # length zero
        newFrame = self.frame.reindex([])
        self.assertTrue(newFrame.empty)
        self.assertEqual(len(newFrame.columns), len(self.frame.columns))

        # length zero with columns reindexed with non-empty index
        newFrame = self.frame.reindex([])
        newFrame = newFrame.reindex(self.frame.index)
        self.assertEqual(len(newFrame.index), len(self.frame.index))
        self.assertEqual(len(newFrame.columns), len(self.frame.columns))

        # pass non-Index
        newFrame = self.frame.reindex(list(self.ts1.index))
        self.assert_index_equal(newFrame.index, self.ts1.index)

        # copy with no axes
        result = self.frame.reindex()
        assert_frame_equal(result, self.frame)
        self.assertFalse(result is self.frame)
Example #21
0
def test_isnull():
    """isnull: scalar semantics, and elementwise application across Series,
    DataFrame, Panel and Panel4D containers.

    NOTE(review): Panel/Panel4D and the tm.make* panel helpers are from an
    old pandas version (removed in modern releases) — this block only runs
    against that vintage.
    """
    # scalars: None/NaN are null, infinities are not
    assert not isnull(1.)
    assert isnull(None)
    assert isnull(np.NaN)
    assert not isnull(np.inf)
    assert not isnull(-np.inf)

    # series
    for s in [tm.makeFloatSeries(),tm.makeStringSeries(),
              tm.makeObjectSeries(),tm.makeTimeSeries(),tm.makePeriodSeries()]:
        assert(isinstance(isnull(s), Series))

    # frame
    for df in [tm.makeTimeDataFrame(),tm.makePeriodFrame(),tm.makeMixedDataFrame()]:
        result = isnull(df)
        expected = df.apply(isnull)
        tm.assert_frame_equal(result, expected)

    # panel
    for p in [ tm.makePanel(), tm.makePeriodPanel(), tm.add_nans(tm.makePanel()) ]:
        result = isnull(p)
        expected = p.apply(isnull)
        tm.assert_panel_equal(result, expected)

    # panel 4d
    for p in [ tm.makePanel4D(), tm.add_nans_panel4d(tm.makePanel4D()) ]:
        result = isnull(p)
        expected = p.apply(isnull)
        tm.assert_panel4d_equal(result, expected)
Example #22
0
    def test_loc_setitem_frame_multiples(self):
        """Assigning a reindexed frame slice through .loc sets several rows."""
        # object / int64 columns
        frame = DataFrame({'A': ['foo', 'bar', 'baz'],
                           'B': Series(range(3), dtype=np.int64)})
        source = frame.loc[1:2]
        source.index = frame.index[0:2]
        frame.loc[0:1] = source
        expected = DataFrame({'A': ['bar', 'baz', 'baz'],
                              'B': Series([1, 2, 2], dtype=np.int64)})
        tm.assert_frame_equal(frame, expected)

        # same pattern with a datetime64 (M8) column on the rhs
        frame = DataFrame({'date': date_range('2000-01-01', '2000-01-5'),
                           'val': Series(range(5), dtype=np.int64)})
        expected = DataFrame({'date': [Timestamp('20000101'),
                                       Timestamp('20000102'),
                                       Timestamp('20000101'),
                                       Timestamp('20000102'),
                                       Timestamp('20000103')],
                              'val': Series([0, 1, 0, 1, 2],
                                            dtype=np.int64)})
        source = frame.loc[0:2]
        source.index = frame.index[2:5]
        frame.loc[2:4] = source
        tm.assert_frame_equal(frame, expected)
Example #23
0
    def test_grouper_multilevel_freq(self):
        """pd.Grouper(level=..., freq=...) works on a MultiIndex (GH 7885)."""
        from datetime import date, timedelta

        start = date.today() - timedelta(days=14)
        dates = date_range(start, date.today())
        midx = pd.MultiIndex.from_product(
            [dates, dates], names=['foo', 'bar'])
        df = pd.DataFrame(np.random.randint(0, 100, 225), index=midx)

        # baseline: group by key after resetting the index
        expected = df.reset_index().groupby([pd.Grouper(
            key='foo', freq='W'), pd.Grouper(key='bar', freq='W')]).sum()
        # reset_index changes the columns dtype to object; restore int64
        expected.columns = pd.Index([0], dtype='int64')

        # group by string level names
        result = df.groupby([pd.Grouper(level='foo', freq='W'), pd.Grouper(
            level='bar', freq='W')]).sum()
        assert_frame_equal(result, expected)

        # group by integer level positions
        result = df.groupby([pd.Grouper(level=0, freq='W'), pd.Grouper(
            level=1, freq='W')]).sum()
        assert_frame_equal(result, expected)
Example #24
0
    def test_parse_ragged_csv(self):
        """Rows with differing field counts parse as if short rows were
        padded with empty fields; extra declared columns become NaN."""
        data = """1,2,3
1,2,3,4
1,2,3,4,5
1,2
1,2,3,4"""

        # the same table with every row explicitly padded to 5 fields
        nice_data = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""
        result = self.read_csv(StringIO(data), header=None,
                               names=['a', 'b', 'c', 'd', 'e'])

        expected = self.read_csv(StringIO(nice_data), header=None,
                                 names=['a', 'b', 'c', 'd', 'e'])

        tm.assert_frame_equal(result, expected)

        # too many columns, cause segfault if not careful
        data = "1,2\n3,4,5"

        result = self.read_csv(StringIO(data), header=None,
                               names=lrange(50))
        expected = self.read_csv(StringIO(data), header=None,
                                 names=lrange(3)).reindex(columns=lrange(50))

        tm.assert_frame_equal(result, expected)
Example #25
0
 def test_empty_header_read(count):
     s = StringIO(',' * count)
     expected = DataFrame(columns=[
         'Unnamed: {i}'.format(i=i)
         for i in range(count + 1)])
     df = self.read_csv(s)
     tm.assert_frame_equal(df, expected)
Example #26
0
    def test_empty_with_mangled_column_pass_dtype_by_indexes(self):
        """Positional dtypes apply to mangled duplicate columns ('one.1')
        of a header-only csv."""
        data = 'one,one'
        parsed = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'})

        expected = DataFrame({'one': np.empty(0, dtype='u1'),
                              'one.1': np.empty(0, dtype='f')})
        tm.assert_frame_equal(parsed, expected, check_index_type=False)
Example #27
0
    def test_custom_lineterminator(self):
        """'~' as lineterminator parses like newline-separated input."""
        data = 'a,b,c~1,2,3~4,5,6'

        expected = self.read_csv(StringIO(data.replace('~', '\n')))
        result = self.read_csv(StringIO(data), lineterminator='~')

        tm.assert_frame_equal(result, expected)
Example #28
0
    def test_groupby_categorical_index_and_columns(self, observed):
        """groupby on a CategoricalIndex level, over columns (axis=1) and
        rows (axis=0), honoring the ``observed`` flag (GH18432)."""
        columns = ['A', 'B', 'A', 'B']
        categories = ['B', 'A']
        data = np.ones((5, 4), int)
        cat_columns = CategoricalIndex(columns,
                                       categories=categories,
                                       ordered=True)
        df = DataFrame(data=data, columns=cat_columns)
        result = df.groupby(axis=1, level=0, observed=observed).sum()
        # two 'A' and two 'B' columns of ones collapse to columns of twos
        expected_data = 2 * np.ones((5, 2), int)

        if observed:
            # if we are not-observed we undergo a reindex
            # so need to adjust the output as our expected sets us up
            # to be non-observed
            expected_columns = CategoricalIndex(['A', 'B'],
                                                categories=categories,
                                                ordered=True)
        else:
            expected_columns = CategoricalIndex(categories,
                                                categories=categories,
                                                ordered=True)
        expected = DataFrame(data=expected_data, columns=expected_columns)
        assert_frame_equal(result, expected)

        # test transposed version
        df = DataFrame(data.T, index=cat_columns)
        result = df.groupby(axis=0, level=0, observed=observed).sum()
        expected = DataFrame(data=expected_data.T, index=expected_columns)
        assert_frame_equal(result, expected)
Example #29
0
    def test_empty_pass_dtype(self):
        """Named dtype mapping applies to a header-only (zero-row) csv."""
        data = 'one,two'
        parsed = self.read_csv(StringIO(data), dtype={'one': 'u1'})

        expected = DataFrame({'one': np.empty(0, dtype='u1'),
                              'two': np.empty(0, dtype=np.object)})
        tm.assert_frame_equal(parsed, expected, check_index_type=False)
    def test_drop_multiindex_not_lexsorted(self):
        """drop() on a non-lexsorted MultiIndex matches the lexsorted result,
        emitting a PerformanceWarning (GH 11640)."""
        # GH 11640

        # define the lexsorted version
        lexsorted_mi = MultiIndex.from_tuples(
            [('a', ''), ('b1', 'c1'), ('b2', 'c2')], names=['b', 'c'])
        lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
        self.assertTrue(lexsorted_df.columns.is_lexsorted())

        # define the non-lexsorted version (built via pivot_table, which
        # leaves the column MultiIndex unsorted)
        not_lexsorted_df = DataFrame(columns=['a', 'b', 'c', 'd'],
                                     data=[[1, 'b1', 'c1', 3],
                                           [1, 'b2', 'c2', 4]])
        not_lexsorted_df = not_lexsorted_df.pivot_table(
            index='a', columns=['b', 'c'], values='d')
        not_lexsorted_df = not_lexsorted_df.reset_index()
        self.assertFalse(not_lexsorted_df.columns.is_lexsorted())

        # compare the results
        tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)

        # dropping from the unsorted frame warns but gives the same answer
        expected = lexsorted_df.drop('a', axis=1)
        with tm.assert_produces_warning(PerformanceWarning):
            result = not_lexsorted_df.drop('a', axis=1)

        tm.assert_frame_equal(result, expected)
Example #31
0
def test_apply_filter_query(test_df):
    """A list of query filters is applied as a conjunction."""
    result = util.apply_filter_query(test_df, ['col1 < 3', 'col2 > 6'])
    expected = pd.DataFrame({'col1': [2], 'col2': [7]}, index=['c'])
    pdt.assert_frame_equal(result, expected)
# Build a frame with a parsed DatetimeIndex, then check that the private
# DataFrame._get_numeric_data() selects the same columns as the public
# select_dtypes([np.number]) API.
data = r"""date,numeric_col_1,non-numeric_col,numeric_col_2
2018-06-10,0,NaN,NaN
2018-06-11,1,NaN,1
2018-06-12,NaN,a,2
2018-06-13,3,b,NaN
2018-06-14,NaN,x,3
2018-06-15,NaN,c,5
2018-06-16,6,d,7
"""

# NOTE(review): pd.compat.StringIO and pd.datetime are legacy aliases that
# were removed in pandas 1.x — this script targets an older pandas.
df = pd.read_csv(
    pd.compat.StringIO(data),
    index_col=['date'],
    parse_dates=True,
    date_parser=lambda x: pd.datetime.strptime(x, '%Y-%m-%d')
)
assert isinstance(df.index, pd.core.indexes.datetimes.DatetimeIndex)
print(df)

df_copy = df.copy()

# use private method
df = df._get_numeric_data()
print(df)

# use select_dtypes
df_copy = df_copy.select_dtypes([np.number])
print(df_copy)

# both routes must select exactly the numeric columns
assert_frame_equal(df, df_copy)
Example #33
0
def test_slicing_datetimes():
    """.loc slicing on a DatetimeIndex, with unique and duplicate labels,
    using datetime objects and date strings (GH 7523)."""
    # GH 7523

    # unique
    df = DataFrame(np.arange(4., dtype='float64'),
                   index=[datetime(2001, 1, i, 10, 00) for i in [1, 2, 3, 4]])
    # open-ended and bounded slices covering the whole index are identity
    result = df.loc[datetime(2001, 1, 1, 10):]
    assert_frame_equal(result, df)
    result = df.loc[:datetime(2001, 1, 4, 10)]
    assert_frame_equal(result, df)
    result = df.loc[datetime(2001, 1, 1, 10):datetime(2001, 1, 4, 10)]
    assert_frame_equal(result, df)

    # a start past the first label drops it; string slices behave the same
    result = df.loc[datetime(2001, 1, 1, 11):]
    expected = df.iloc[1:]
    assert_frame_equal(result, expected)
    result = df.loc['20010101 11':]
    assert_frame_equal(result, expected)

    # duplicates
    df = pd.DataFrame(
        np.arange(5., dtype='float64'),
        index=[datetime(2001, 1, i, 10, 00) for i in [1, 2, 2, 3, 4]])

    result = df.loc[datetime(2001, 1, 1, 10):]
    assert_frame_equal(result, df)
    result = df.loc[:datetime(2001, 1, 4, 10)]
    assert_frame_equal(result, df)
    result = df.loc[datetime(2001, 1, 1, 10):datetime(2001, 1, 4, 10)]
    assert_frame_equal(result, df)

    result = df.loc[datetime(2001, 1, 1, 11):]
    expected = df.iloc[1:]
    assert_frame_equal(result, expected)
    result = df.loc['20010101 11':]
    assert_frame_equal(result, expected)
Example #34
0
def test_filter_table(choosers, rates):
    """filter_table applies a rate row's filters, skipping ignored columns."""
    result = util.filter_table(choosers,
                               rates.iloc[1],
                               ignore={'probability_of_relocating'})
    pdt.assert_frame_equal(result, choosers.iloc[[2]])
Example #35
0
def test_apply_filter_query_no_filter(test_df):
    """An empty filter list returns the table unchanged."""
    result = util.apply_filter_query(test_df, [])
    pdt.assert_frame_equal(result, test_df)
Example #36
0
def test_apply_filter_query_or(test_df):
    """A single 'or' expression keeps rows matching either side."""
    result = util.apply_filter_query(test_df, ['col1 < 1 or col2 > 8'])
    expected = pd.DataFrame({'col1': [0, 4], 'col2': [5, 9]}, index=['a', 'e'])
    pdt.assert_frame_equal(result, expected)
Example #37
0
def test_apply_filter_query_empty(test_df):
    """Two mutually exclusive filters should leave no rows behind."""
    conflicting = ['col1 < 1', 'col2 > 8']
    result = util.apply_filter_query(test_df, conflicting)
    empty = pd.DataFrame({'col1': [], 'col2': []}, index=[])
    pdt.assert_frame_equal(result, empty)
 def test_value_array_record_prefix(self):
     # GH 21536
     result = json_normalize({'A': [1, 2]}, 'A', record_prefix='Prefix.')
     expected = DataFrame([[1], [2]], columns=['Prefix.0'])
     tm.assert_frame_equal(result, expected)
Example #39
0
 def test_catalog(self, dummy_context, dummy_dataframe):
     """Round-trip a frame through the context catalog after checking layers."""
     catalog = dummy_context.catalog
     assert catalog.layers == {"raw": {"boats"}}
     catalog.save("cars", dummy_dataframe)
     round_tripped = catalog.load("cars")
     assert_frame_equal(round_tripped, dummy_dataframe)
 def test_empty_array(self):
     result = json_normalize([])
     expected = DataFrame()
     tm.assert_frame_equal(result, expected)
Example #41
0
    def test_int64_overflow_issues(self):
        """Stress-test merge() on key spaces large enough that the internal
        group index could overflow int64 (GH #2690); also covers the dtype
        breakage for outer/right joins noted in GH #9092.
        """

        # #2690, combinatorial explosion
        df1 = DataFrame(np.random.randn(1000, 7), columns=list("ABCDEF") + ["G1"])
        df2 = DataFrame(np.random.randn(1000, 7), columns=list("ABCDEF") + ["G2"])

        # it works!
        result = merge(df1, df2, how="outer")
        assert len(result) == 2000

        # ~1M rows over a +/-1024 range per key column
        low, high, n = -1 << 10, 1 << 10, 1 << 20
        left = DataFrame(np.random.randint(low, high, (n, 7)), columns=list("ABCDEFG"))
        left["left"] = left.sum(axis=1)

        # one-2-one match
        i = np.random.permutation(len(left))
        right = left.iloc[i].copy()
        right.columns = right.columns[:-1].tolist() + ["right"]
        right.index = np.arange(len(right))
        right["right"] *= -1

        out = merge(left, right, how="outer")
        assert len(out) == len(left)
        assert_series_equal(out["left"], -out["right"], check_names=False)
        result = out.iloc[:, :-2].sum(axis=1)
        assert_series_equal(out["left"], result, check_names=False)
        assert result.name is None

        out.sort_values(out.columns.tolist(), inplace=True)
        out.index = np.arange(len(out))
        for how in ["left", "right", "outer", "inner"]:
            assert_frame_equal(out, merge(left, right, how=how, sort=True))

        # check that left merge w/ sort=False maintains left frame order
        out = merge(left, right, how="left", sort=False)
        assert_frame_equal(left, out[left.columns.tolist()])

        out = merge(right, left, how="left", sort=False)
        assert_frame_equal(right, out[right.columns.tolist()])

        # one-2-many/none match
        n = 1 << 11
        left = DataFrame(
            np.random.randint(low, high, (n, 7)).astype("int64"),
            columns=list("ABCDEFG"),
        )

        # confirm that this is checking what it is supposed to check
        shape = left.apply(Series.nunique).values
        assert is_int64_overflow_possible(shape)

        # add duplicates to left frame
        left = concat([left, left], ignore_index=True)

        right = DataFrame(
            np.random.randint(low, high, (n // 2, 7)).astype("int64"),
            columns=list("ABCDEFG"),
        )

        # add duplicates & overlap with left to the right frame
        i = np.random.choice(len(left), n)
        right = concat([right, right, left.iloc[i]], ignore_index=True)

        left["left"] = np.random.randn(len(left))
        right["right"] = np.random.randn(len(right))

        # shuffle left & right frames
        i = np.random.permutation(len(left))
        left = left.iloc[i].copy()
        left.index = np.arange(len(left))

        i = np.random.permutation(len(right))
        right = right.iloc[i].copy()
        right.index = np.arange(len(right))

        # manually compute outer merge
        ldict, rdict = defaultdict(list), defaultdict(list)

        for idx, row in left.set_index(list("ABCDEFG")).iterrows():
            ldict[idx].append(row["left"])

        for idx, row in right.set_index(list("ABCDEFG")).iterrows():
            rdict[idx].append(row["right"])

        vals = []
        for k, lval in ldict.items():
            rval = rdict.get(k, [np.nan])
            for lv, rv in product(lval, rval):
                vals.append(k + tuple([lv, rv]))

        for k, rval in rdict.items():
            if k not in ldict:
                for rv in rval:
                    vals.append(k + tuple([np.nan, rv]))

        def align(df):
            # canonical row ordering so frames can be compared row-for-row
            df = df.sort_values(df.columns.tolist())
            df.index = np.arange(len(df))
            return df

        def verify_order(df):
            # sorted merge output must already be in stable-sorted key order
            kcols = list("ABCDEFG")
            assert_frame_equal(
                df[kcols].copy(), df[kcols].sort_values(kcols, kind="mergesort")
            )

        out = DataFrame(vals, columns=list("ABCDEFG") + ["left", "right"])
        out = align(out)

        # per join type, the rows of the manual outer merge that should survive
        jmask = {
            "left": out["left"].notna(),
            "right": out["right"].notna(),
            "inner": out["left"].notna() & out["right"].notna(),
            "outer": np.ones(len(out), dtype="bool"),
        }

        for how in "left", "right", "outer", "inner":
            mask = jmask[how]
            frame = align(out[mask].copy())
            assert mask.all() ^ mask.any() or how == "outer"

            for sort in [False, True]:
                res = merge(left, right, how=how, sort=sort)
                if sort:
                    verify_order(res)

                # as in GH9092 dtypes break with outer/right join
                assert_frame_equal(
                    frame, align(res), check_dtype=how not in ("right", "outer")
                )
Example #42
0
 def test_io(self, dummy_context, dummy_dataframe):
     """Saving then loading through context.io should round-trip the frame."""
     io_layer = dummy_context.io
     io_layer.save("cars", dummy_dataframe)
     assert_frame_equal(io_layer.load("cars"), dummy_dataframe)
Example #43
0
 def test_frame_from_json_precise_float(self):
     df = DataFrame([[4.56, 4.56, 4.56], [4.56, 4.56, 4.56]])
     result = read_json(df.to_json(), precise_float=True)
     assert_frame_equal(result, df)
Example #44
0
 def verify_order(df):
     """Assert that the key columns of *df* are already in stable-sorted order."""
     key_cols = list("ABCDEFG")
     keys = df[key_cols]
     assert_frame_equal(keys.copy(), keys.sort_values(key_cols, kind="mergesort"))
Example #45
0
 def test_frame_empty(self):
     """An empty two-column frame should survive a JSON round trip."""
     df = DataFrame(columns=['jim', 'joe'])
     self.assertFalse(df._is_mixed_type)
     round_tripped = read_json(df.to_json(), dtype=dict(df.dtypes))
     assert_frame_equal(round_tripped, df)
Example #46
0
 def test_default_handler(self):
     value = object()
     frame = DataFrame({'a': ['a', value]})
     expected = frame.applymap(str)
     result = pd.read_json(frame.to_json(default_handler=str))
     assert_frame_equal(expected, result)
Example #47
0
 def check(result, expected):
     """Smoke-check that *result* renders and exposes dtypes, then compare frames."""
     assert str(result) is not None
     assert result.dtypes is not None
     tm.assert_frame_equal(result, expected)
Example #48
0
 def test_frame_empty_mixedtype(self):
     """An empty frame with mixed dtypes should survive a JSON round trip."""
     df = DataFrame(columns=['jim', 'joe'])
     df['joe'] = df['joe'].astype('i8')
     self.assertTrue(df._is_mixed_type)
     round_tripped = read_json(df.to_json(), dtype=dict(df.dtypes))
     assert_frame_equal(round_tripped, df)
Example #49
0
    def test_iloc_getitem_frame(self):
        """Compare iloc positional indexing against the equivalent lookups via
        the (deprecated) label-based .ix accessor, on a frame whose index and
        columns are even integers (so position p maps to label 2*p).
        """
        df = DataFrame(np.random.randn(10, 4), index=lrange(0, 20, 2),
                       columns=lrange(0, 8, 2))

        # single row: position 2 corresponds to index label 4
        result = df.iloc[2]
        with catch_warnings(record=True):
            filterwarnings("ignore", "\\n.ix", DeprecationWarning)
            exp = df.ix[4]
        tm.assert_series_equal(result, exp)

        # scalar lookup
        result = df.iloc[2, 2]
        with catch_warnings(record=True):
            filterwarnings("ignore", "\\n.ix", DeprecationWarning)
            exp = df.ix[4, 4]
        assert result == exp

        # slice
        result = df.iloc[4:8]
        with catch_warnings(record=True):
            filterwarnings("ignore", "\\n.ix", DeprecationWarning)
            expected = df.ix[8:14]
        tm.assert_frame_equal(result, expected)

        result = df.iloc[:, 2:3]
        with catch_warnings(record=True):
            filterwarnings("ignore", "\\n.ix", DeprecationWarning)
            expected = df.ix[:, 4:5]
        tm.assert_frame_equal(result, expected)

        # list of integers
        result = df.iloc[[0, 1, 3]]
        with catch_warnings(record=True):
            filterwarnings("ignore", "\\n.ix", DeprecationWarning)
            expected = df.ix[[0, 2, 6]]
        tm.assert_frame_equal(result, expected)

        result = df.iloc[[0, 1, 3], [0, 1]]
        with catch_warnings(record=True):
            filterwarnings("ignore", "\\n.ix", DeprecationWarning)
            expected = df.ix[[0, 2, 6], [0, 2]]
        tm.assert_frame_equal(result, expected)

        # neg indices
        result = df.iloc[[-1, 1, 3], [-1, 1]]
        with catch_warnings(record=True):
            filterwarnings("ignore", "\\n.ix", DeprecationWarning)
            expected = df.ix[[18, 2, 6], [6, 2]]
        tm.assert_frame_equal(result, expected)

        # dups indices
        result = df.iloc[[-1, -1, 1, 3], [-1, 1]]
        with catch_warnings(record=True):
            filterwarnings("ignore", "\\n.ix", DeprecationWarning)
            expected = df.ix[[18, 18, 2, 6], [6, 2]]
        tm.assert_frame_equal(result, expected)

        # with index-like
        s = Series(index=lrange(1, 5))
        result = df.iloc[s.index]
        with catch_warnings(record=True):
            filterwarnings("ignore", "\\n.ix", DeprecationWarning)
            expected = df.ix[[2, 4, 6, 8]]
        tm.assert_frame_equal(result, expected)
Example #50
0
 def _check(df):
     result = read_json(df.to_json(orient='split'),
                        orient='split',
                        convert_dates=['x'])
     assert_frame_equal(result, df)
Example #51
0
    def test_iloc_exceeds_bounds(self):
        """iloc must raise IndexError for out-of-bounds list/scalar indexers,
        but silently truncate out-of-bounds *slices* (GH 6296, GH 10779).
        """

        # GH6296
        # iloc should allow indexers that exceed the bounds
        df = DataFrame(np.random.random_sample((20, 5)), columns=list('ABCDE'))

        # lists of positions should raise IndexError!
        msg = 'positional indexers are out-of-bounds'
        with pytest.raises(IndexError, match=msg):
            df.iloc[:, [0, 1, 2, 3, 4, 5]]
        pytest.raises(IndexError, lambda: df.iloc[[1, 30]])
        pytest.raises(IndexError, lambda: df.iloc[[1, -30]])
        pytest.raises(IndexError, lambda: df.iloc[[100]])

        s = df['A']
        pytest.raises(IndexError, lambda: s.iloc[[100]])
        pytest.raises(IndexError, lambda: s.iloc[[-100]])

        # still raise on a single indexer
        msg = 'single positional indexer is out-of-bounds'
        with pytest.raises(IndexError, match=msg):
            df.iloc[30]
        pytest.raises(IndexError, lambda: df.iloc[-30])

        # GH10779
        # single positive/negative indexer exceeding Series bounds should raise
        # an IndexError
        with pytest.raises(IndexError, match=msg):
            s.iloc[30]
        pytest.raises(IndexError, lambda: s.iloc[-30])

        # slices are ok
        result = df.iloc[:, 4:10]  # 0 < start < len < stop
        expected = df.iloc[:, 4:]
        tm.assert_frame_equal(result, expected)

        result = df.iloc[:, -4:-10]  # stop < 0 < start < len
        expected = df.iloc[:, :0]
        tm.assert_frame_equal(result, expected)

        result = df.iloc[:, 10:4:-1]  # 0 < stop < len < start (down)
        expected = df.iloc[:, :4:-1]
        tm.assert_frame_equal(result, expected)

        result = df.iloc[:, 4:-10:-1]  # stop < 0 < start < len (down)
        expected = df.iloc[:, 4::-1]
        tm.assert_frame_equal(result, expected)

        result = df.iloc[:, -10:4]  # start < 0 < stop < len
        expected = df.iloc[:, :4]
        tm.assert_frame_equal(result, expected)

        result = df.iloc[:, 10:4]  # 0 < stop < len < start
        expected = df.iloc[:, :0]
        tm.assert_frame_equal(result, expected)

        result = df.iloc[:, -10:-11:-1]  # stop < start < 0 < len (down)
        expected = df.iloc[:, :0]
        tm.assert_frame_equal(result, expected)

        result = df.iloc[:, 10:11]  # 0 < len < start < stop
        expected = df.iloc[:, :0]
        tm.assert_frame_equal(result, expected)

        # slice bounds exceeding is ok
        result = s.iloc[18:30]
        expected = s.iloc[18:]
        tm.assert_series_equal(result, expected)

        result = s.iloc[30:]
        expected = s.iloc[:0]
        tm.assert_series_equal(result, expected)

        result = s.iloc[30::-1]
        expected = s.iloc[::-1]
        tm.assert_series_equal(result, expected)

        # doc example
        def check(result, expected):
            # smoke the repr and dtypes before comparing
            str(result)
            result.dtypes
            tm.assert_frame_equal(result, expected)

        dfl = DataFrame(np.random.randn(5, 2), columns=list('AB'))
        check(dfl.iloc[:, 2:3], DataFrame(index=dfl.index))
        check(dfl.iloc[:, 1:3], dfl.iloc[:, [1]])
        check(dfl.iloc[4:6], dfl.iloc[[4]])

        pytest.raises(IndexError, lambda: dfl.iloc[[4, 5, 6]])
        pytest.raises(IndexError, lambda: dfl.iloc[:, 4])
Example #52
0
    def test_iloc_mask(self):
        """Boolean-mask handling in iloc: Series masks must raise, plain
        ndarray masks are accepted (GH 3631). The `expected` table maps
        (index choice, accessor) to either a bin() result string or the
        error message the lookup should produce.
        """

        # GH 3631, iloc with a mask (of a series) should raise
        df = DataFrame(lrange(5), list('ABCDE'), columns=['a'])
        mask = (df.a % 2 == 0)
        pytest.raises(ValueError, df.iloc.__getitem__, tuple([mask]))
        mask.index = lrange(len(mask))
        pytest.raises(NotImplementedError, df.iloc.__getitem__,
                      tuple([mask]))

        # ndarray ok
        result = df.iloc[np.array([True] * len(mask), dtype=bool)]
        tm.assert_frame_equal(result, df)

        # the possibilities
        locs = np.arange(4)
        nums = 2 ** locs
        reps = lmap(bin, nums)
        df = DataFrame({'locs': locs, 'nums': nums}, reps)

        # expected outcome per (mask index source, accessor): either bin() of
        # the summed 'nums' values, or the error text the lookup should raise
        expected = {
            (None, ''): '0b1100',
            (None, '.loc'): '0b1100',
            (None, '.iloc'): '0b1100',
            ('index', ''): '0b11',
            ('index', '.loc'): '0b11',
            ('index', '.iloc'): ('iLocation based boolean indexing '
                                 'cannot use an indexable as a mask'),
            ('locs', ''): 'Unalignable boolean Series provided as indexer '
                          '(index of the boolean Series and of the indexed '
                          'object do not match',
            ('locs', '.loc'): 'Unalignable boolean Series provided as indexer '
                              '(index of the boolean Series and of the '
                              'indexed object do not match',
            ('locs', '.iloc'): ('iLocation based boolean indexing on an '
                                'integer type is not available'),
        }

        # UserWarnings from reindex of a boolean mask
        with catch_warnings(record=True):
            simplefilter("ignore", UserWarning)
            result = dict()
            for idx in [None, 'index', 'locs']:
                mask = (df.nums > 2).values
                if idx:
                    # rebuild the mask as a Series over a *reversed* index
                    mask = Series(mask, list(reversed(getattr(df, idx))))
                for method in ['', '.loc', '.iloc']:
                    try:
                        if method:
                            accessor = getattr(df, method[1:])
                        else:
                            accessor = df
                        ans = str(bin(accessor[mask]['nums'].sum()))
                    except Exception as e:
                        # error text itself is the comparison value
                        ans = str(e)

                    key = tuple([idx, method])
                    r = expected.get(key)
                    if r != ans:
                        raise AssertionError(
                            "[%s] does not match [%s], received [%s]"
                            % (key, ans, r))
Example #53
0
def test_float_precision_round_trip_with_text(c_parser_only):
    # see gh-15140
    parser = c_parser_only
    df = parser.read_csv(StringIO("a"), header=None,
                         float_precision="round_trip")
    tm.assert_frame_equal(df, DataFrame({0: ["a"]}))
Example #54
0
    def test_iloc_getitem_slice_dups(self):

        df1 = DataFrame(np.random.randn(10, 4), columns=['A', 'A', 'B', 'B'])
        df2 = DataFrame(np.random.randint(0, 10, size=20).reshape(10, 2),
                        columns=['A', 'C'])

        # axis=1
        df = concat([df1, df2], axis=1)
        tm.assert_frame_equal(df.iloc[:, :4], df1)
        tm.assert_frame_equal(df.iloc[:, 4:], df2)

        df = concat([df2, df1], axis=1)
        tm.assert_frame_equal(df.iloc[:, :2], df2)
        tm.assert_frame_equal(df.iloc[:, 2:], df1)

        exp = concat([df2, df1.iloc[:, [0]]], axis=1)
        tm.assert_frame_equal(df.iloc[:, 0:3], exp)

        # axis=0
        df = concat([df, df], axis=0)
        tm.assert_frame_equal(df.iloc[0:10, :2], df2)
        tm.assert_frame_equal(df.iloc[0:10, 2:], df1)
        tm.assert_frame_equal(df.iloc[10:, :2], df2)
        tm.assert_frame_equal(df.iloc[10:, 2:], df1)
    def test_sanity_data_frame(self, mock_post):
        """Convert a full KustoResponse into a pandas.DataFrame and compare it
        column-by-column against a hand-built expected frame covering every
        Kusto scalar type (numbers, guids, bools, dates, timespans, dynamics).
        """
        # NOTE(review): mock_post presumably patches the HTTP layer so
        # execute_query returns a canned fixture response -- confirm in the
        # test-class setup.

        from pandas import DataFrame, Series
        from pandas.util.testing import assert_frame_equal

        client = KustoClient("https://somecluster.kusto.windows.net")
        data_frame = dataframe_from_result_table(
            client.execute_query("PythonTest", "Deft").primary_results[0])
        self.assertEqual(len(data_frame.columns), 19)
        # expected values per column; integer columns arrive as floats because
        # the fixture's leading null forces a nullable (float) representation
        expected_dict = {
            "rownumber":
            Series([None, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]),
            "rowguid":
            Series(
                [
                    "",
                    "00000000-0000-0000-0001-020304050607",
                    "00000001-0000-0000-0001-020304050607",
                    "00000002-0000-0000-0001-020304050607",
                    "00000003-0000-0000-0001-020304050607",
                    "00000004-0000-0000-0001-020304050607",
                    "00000005-0000-0000-0001-020304050607",
                    "00000006-0000-0000-0001-020304050607",
                    "00000007-0000-0000-0001-020304050607",
                    "00000008-0000-0000-0001-020304050607",
                    "00000009-0000-0000-0001-020304050607",
                ],
                dtype=object,
            ),
            "xdouble":
            Series([
                None, 0.0, 1.0001, 2.0002, 3.0003, 4.0004, 5.0005, 6.0006,
                7.0007, 8.0008, 9.0009
            ]),
            "xfloat":
            Series([
                None, 0.0, 1.01, 2.02, 3.03, 4.04, 5.05, 6.06, 7.07, 8.08, 9.09
            ]),
            "xbool":
            Series([
                None, False, True, False, True, False, True, False, True,
                False, True
            ],
                   dtype=bool),
            "xint16":
            Series([None, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]),
            "xint32":
            Series([None, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]),
            "xint64":
            Series([None, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]),
            "xuint8":
            Series([None, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]),
            "xuint16":
            Series([None, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]),
            "xuint32":
            Series([None, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]),
            "xuint64":
            Series([None, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]),
            "xdate":
            Series([
                pandas.to_datetime(None),
                pandas.to_datetime("2014-01-01T01:01:01.0000000Z"),
                pandas.to_datetime("2015-01-01T01:01:01.0000001Z"),
                pandas.to_datetime("2016-01-01T01:01:01.0000002Z"),
                pandas.to_datetime("2017-01-01T01:01:01.0000003Z"),
                pandas.to_datetime("2018-01-01T01:01:01.0000004Z"),
                pandas.to_datetime("2019-01-01T01:01:01.0000005Z"),
                pandas.to_datetime("2020-01-01T01:01:01.0000006Z"),
                pandas.to_datetime("2021-01-01T01:01:01.0000007Z"),
                pandas.to_datetime("2022-01-01T01:01:01.0000008Z"),
                pandas.to_datetime("2023-01-01T01:01:01.0000009Z"),
            ]),
            "xsmalltext":
            Series([
                "", "Zero", "One", "Two", "Three", "Four", "Five", "Six",
                "Seven", "Eight", "Nine"
            ],
                   dtype=object),
            "xtext":
            Series([
                "", "Zero", "One", "Two", "Three", "Four", "Five", "Six",
                "Seven", "Eight", "Nine"
            ],
                   dtype=object),
            "xnumberAsText":
            Series(["", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"],
                   dtype=object),
            "xtime":
            Series(
                [
                    "NaT",
                    0,
                    "1 days 00:00:01.0010001",
                    "-3 days 23:59:57.9979998",
                    "3 days 00:00:03.0030003",
                    "-5 days 23:59:55.9959996",
                    "5 days 00:00:05.0050005",
                    "-7 days 23:59:53.9939994",
                    "7 days 00:00:07.0070007",
                    "-9 days 23:59:51.9919992",
                    "9 days 00:00:09.0090009",
                ],
                dtype="timedelta64[ns]",
            ),
            "xtextWithNulls":
            Series(["", "", "", "", "", "", "", "", "", "", ""], dtype=object),
            "xdynamicWithNulls":
            Series(
                [
                    text_type(""),
                    text_type(""),
                    {
                        "rowId": 1,
                        "arr": [0, 1]
                    },
                    {
                        "rowId": 2,
                        "arr": [0, 2]
                    },
                    {
                        "rowId": 3,
                        "arr": [0, 3]
                    },
                    {
                        "rowId": 4,
                        "arr": [0, 4]
                    },
                    {
                        "rowId": 5,
                        "arr": [0, 5]
                    },
                    {
                        "rowId": 6,
                        "arr": [0, 6]
                    },
                    {
                        "rowId": 7,
                        "arr": [0, 7]
                    },
                    {
                        "rowId": 8,
                        "arr": [0, 8]
                    },
                    {
                        "rowId": 9,
                        "arr": [0, 9]
                    },
                ],
                dtype=object,
            ),
        }

        # column order must match the fixture's result-table schema exactly
        columns = [
            "rownumber",
            "rowguid",
            "xdouble",
            "xfloat",
            "xbool",
            "xint16",
            "xint32",
            "xint64",
            "xuint8",
            "xuint16",
            "xuint32",
            "xuint64",
            "xdate",
            "xsmalltext",
            "xtext",
            "xnumberAsText",
            "xtime",
            "xtextWithNulls",
            "xdynamicWithNulls",
        ]
        expected_data_frame = DataFrame(expected_dict,
                                        columns=columns,
                                        copy=True)
        assert_frame_equal(data_frame, expected_data_frame)
Example #56
0
def test_weightscomp_pyparser_prepare_atomic(atomic, expected):
    """The prepared atomic-weights frame must match the expected frame
    (axis names are irrelevant here, hence check_names=False)."""
    assert_frame_equal(atomic, expected, check_names=False)
 def test_to_frame(self):
     """Series.to_frame on a subclassed Series must yield the subclassed frame."""
     series = tm.SubclassedSeries([1, 2, 3, 4], index=list('abcd'), name='xxx')
     frame = series.to_frame()
     expected = tm.SubclassedDataFrame({'xxx': [1, 2, 3, 4]}, index=list('abcd'))
     tm.assert_frame_equal(frame, expected)
     assert isinstance(frame, tm.SubclassedDataFrame)
Example #58
0
def test_parse_trim_buffers(c_parser_only):
    """Stress the C parser's stream-buffer reallocation path (gh-13703) and
    the utf-8 string-conversion branch (gh-5291); failure modes are segfaults
    or silently corrupted chunks rather than clean assertion errors.
    """
    # This test is part of a bugfix for gh-13703. It attempts to
    # to stress the system memory allocator, to cause it to move the
    # stream buffer and either let the OS reclaim the region, or let
    # other memory requests of parser otherwise modify the contents
    # of memory space, where it was formally located.
    # This test is designed to cause a `segfault` with unpatched
    # `tokenizer.c`. Sometimes the test fails on `segfault`, other
    # times it fails due to memory corruption, which causes the
    # loaded DataFrame to differ from the expected one.

    parser = c_parser_only

    # Generate a large mixed-type CSV file on-the-fly (one record is
    # approx 1.5KiB).
    record_ = \
        """9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,Z""" \
        """ZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,""" \
        """ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,9""" \
        """99,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,""" \
        """9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9.""" \
        """99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999.""" \
        """99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZ""" \
        """ZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZ""" \
        """ZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,Z""" \
        """ZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,""" \
        """9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,""" \
        """999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,""" \
        """,,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999""" \
        """,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9.""" \
        """999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,""" \
        """,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-Z""" \
        """ZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ""" \
        """,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99""" \
        """,,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-""" \
        """9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9""" \
        """.99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,""" \
        """,,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9.""" \
        """99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZ""" \
        """ZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ""" \
        """-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZ""" \
        """ZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ""" \
        """,9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99""" \
        """,99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9""" \
        """.99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,"""

    # Set the number of lines so that a call to `parser_trim_buffers`
    # is triggered: after a couple of full chunks are consumed a
    # relatively small 'residual' chunk would cause reallocation
    # within the parser.
    chunksize, n_lines = 128, 2 * 128 + 15
    csv_data = "\n".join([record_] * n_lines) + "\n"

    # We will use StringIO to load the CSV from this text buffer.
    # pd.read_csv() will iterate over the file in chunks and will
    # finally read a residual chunk of really small size.

    # Generate the expected output: manually create the dataframe
    # by splitting by comma and repeating the `n_lines` times.
    row = tuple(val_ if val_ else np.nan
                for val_ in record_.split(","))
    expected = DataFrame([row for _ in range(n_lines)],
                         dtype=object, columns=None, index=None)

    # Iterate over the CSV file in chunks of `chunksize` lines
    chunks_ = parser.read_csv(StringIO(csv_data), header=None,
                              dtype=object, chunksize=chunksize)
    result = concat(chunks_, axis=0, ignore_index=True)

    # Check for data corruption if there was no segfault
    tm.assert_frame_equal(result, expected)

    # This extra test was added to replicate the fault in gh-5291.
    # Force 'utf-8' encoding, so that `_string_convert` would take
    # a different execution branch.
    chunks_ = parser.read_csv(StringIO(csv_data), header=None,
                              dtype=object, chunksize=chunksize,
                              encoding="utf_8")
    result = concat(chunks_, axis=0, ignore_index=True)
    tm.assert_frame_equal(result, expected)
Example #59
0
 def test_createDataFrame_with_schema(self):
     """createDataFrame with an explicit schema must keep that schema and
     round-trip the pandas data through Arrow unchanged."""
     pdf = self.create_pandas_data_frame()
     df = self.spark.createDataFrame(pdf, schema=self.schema)
     # assertEqual replaces the long-deprecated assertEquals alias
     self.assertEqual(self.schema, df.schema)
     pdf_arrow = df.toPandas()
     assert_frame_equal(pdf_arrow, pdf)
Example #60
0
 def test_dataframe_dummies_unicode(self, get_dummies_kwargs, expected):
     """get_dummies must not mangle unicode characters in column names."""
     # GH22084 pd.get_dummies incorrectly encodes unicode characters
     # in dataframe column names
     result = get_dummies(**get_dummies_kwargs)
     assert_frame_equal(result, expected)