Example #1
0
def test_isnull():
    """Check ``isnull`` on scalars, Series, DataFrames, Panels and Panel4Ds.

    ``None`` and NaN must be reported as null; a plain float and both
    infinities must not.  For the containers, the result is compared with
    what ``.apply(isnull)`` produces (Series are only type-checked).
    """
    # scalars
    assert not isnull(1.)
    assert isnull(None)
    assert isnull(np.NaN)
    assert not isnull(np.inf)
    assert not isnull(-np.inf)

    # series: the result must itself be a Series
    series_cases = [
        tm.makeFloatSeries(),
        tm.makeStringSeries(),
        tm.makeObjectSeries(),
        tm.makeTimeSeries(),
        tm.makePeriodSeries(),
    ]
    for ser in series_cases:
        assert isinstance(isnull(ser), Series)

    # frame
    frame_cases = [
        tm.makeTimeDataFrame(),
        tm.makePeriodFrame(),
        tm.makeMixedDataFrame(),
    ]
    for frame in frame_cases:
        tm.assert_frame_equal(isnull(frame), frame.apply(isnull))

    # panel
    panel_cases = [
        tm.makePanel(),
        tm.makePeriodPanel(),
        tm.add_nans(tm.makePanel()),
    ]
    for panel in panel_cases:
        tm.assert_panel_equal(isnull(panel), panel.apply(isnull))

    # panel 4d
    panel4d_cases = [
        tm.makePanel4D(),
        tm.add_nans_panel4d(tm.makePanel4D()),
    ]
    for panel4d in panel4d_cases:
        tm.assert_panel4d_equal(isnull(panel4d), panel4d.apply(isnull))
Example #2
0
    def test_hash_pandas_object(self):
        """Run both hashing checks on a spread of Series, Index and DataFrame objects.

        Every object is passed to ``self.check_equal`` and then to
        ``self.check_not_equal_with_index``.
        """
        multi_index = pd.MultiIndex.from_tuples(
            [('a', 1), ('a', 2), ('b', 1)])

        hash_inputs = [
            Series([1, 2, 3]),
            Series([1.0, 1.5, 3.2]),
            Series([1.0, 1.5, np.nan]),
            Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
            Series(['a', 'b', 'c']),
            Series(['a', np.nan, 'c']),
            Series([True, False, True]),
            Index([1, 2, 3]),
            Index([True, False, True]),
            DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}),
            tm.makeMissingDataframe(),
            tm.makeMixedDataFrame(),
            tm.makeTimeDataFrame(),
            tm.makeTimeSeries(),
            tm.makeTimedeltaIndex(),
            Series([1, 2, 3], index=multi_index),
        ]

        for candidate in hash_inputs:
            self.check_equal(candidate)
            self.check_not_equal_with_index(candidate)
Example #3
0
    def test_isna_isnull(self, isna_f):
        """Check the null-detection function ``isna_f`` on scalars and containers.

        Parameters
        ----------
        isna_f : callable
            The function under test; it is called on scalars, Series,
            DataFrames and Panels.  (Presumably injected by a fixture -- the
            provider is not visible from this test.)
        """
        assert not isna_f(1.)
        assert isna_f(None)
        assert isna_f(np.NaN)
        # Route the builtin NaN through isna_f as well: a bare
        # ``assert float('nan')`` always passes (NaN is truthy) and would
        # test nothing.
        assert isna_f(float('nan'))
        assert not isna_f(np.inf)
        assert not isna_f(-np.inf)

        # series: the result must itself be a Series
        for s in [tm.makeFloatSeries(), tm.makeStringSeries(),
                  tm.makeObjectSeries(), tm.makeTimeSeries(),
                  tm.makePeriodSeries()]:
            assert isinstance(isna_f(s), Series)

        # frame: the result must match applying isna_f through DataFrame.apply
        for df in [tm.makeTimeDataFrame(), tm.makePeriodFrame(),
                   tm.makeMixedDataFrame()]:
            result = isna_f(df)
            expected = df.apply(isna_f)
            tm.assert_frame_equal(result, expected)

        # panel: FutureWarnings are ignored while Panels are built and checked
        with catch_warnings(record=True):
            simplefilter("ignore", FutureWarning)
            for p in [tm.makePanel(), tm.makePeriodPanel(),
                      tm.add_nans(tm.makePanel())]:
                result = isna_f(p)
                expected = p.apply(isna_f)
                tm.assert_panel_equal(result, expected)
Example #4
0
    def test_upload_data_if_table_exists_replace(self):
        """Upload a frame, replace it with a different frame, and check the row count.

        The test is currently disabled: it raises ``nose.SkipTest``
        unconditionally, so nothing after that statement executes.
        """

        # Unconditional skip -- everything below is unreachable until this
        # line is removed.
        raise nose.SkipTest("buggy test")

        destination_table = DESTINATION_TABLE + "4"

        test_size = 10
        df = make_mixed_dataframe_v2(test_size)
        df_different_schema = tm.makeMixedDataFrame()

        # Initialize table with sample data
        gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000,
                   private_key=_get_private_key_path())

        # Test the if_exists parameter with the value 'replace'.
        gbq.to_gbq(df_different_schema, destination_table,
                   _get_project_id(), if_exists='replace',
                   private_key=_get_private_key_path())

        sleep(30)  # <- Curses Google!!!

        # After the replace only the second frame's rows should remain; the
        # test expects 5 of them.
        result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}"
                              .format(destination_table),
                              project_id=_get_project_id(),
                              private_key=_get_private_key_path())
        self.assertEqual(result['NUM_ROWS'][0], 5)
Example #5
0
    def test_hash_pandas_object(self):
        """Run both hashing checks on a wide range of pandas objects.

        The inputs include empty containers, period and tz-aware data, and
        MultiIndexes.  Each one is passed to ``self.check_equal`` and then to
        ``self.check_not_equal_with_index``.
        """
        hash_inputs = [
            Series([1, 2, 3]),
            Series([1.0, 1.5, 3.2]),
            Series([1.0, 1.5, np.nan]),
            Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
            Series(['a', 'b', 'c']),
            Series(['a', np.nan, 'c']),
            Series(['a', None, 'c']),
            Series([True, False, True]),
            Series(),
            Index([1, 2, 3]),
            Index([True, False, True]),
            DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}),
            DataFrame(),
            tm.makeMissingDataframe(),
            tm.makeMixedDataFrame(),
            tm.makeTimeDataFrame(),
            tm.makeTimeSeries(),
            tm.makeTimedeltaIndex(),
            tm.makePeriodIndex(),
            Series(tm.makePeriodIndex()),
            Series(pd.date_range('20130101', periods=3, tz='US/Eastern')),
            MultiIndex.from_product([
                range(5),
                ['foo', 'bar', 'baz'],
                pd.date_range('20130101', periods=2),
            ]),
            MultiIndex.from_product([
                pd.CategoricalIndex(list('aabc')),
                range(3),
            ]),
        ]

        for candidate in hash_inputs:
            self.check_equal(candidate)
            self.check_not_equal_with_index(candidate)
Example #6
0
    def test_upload_data_if_table_exists_append(self):
        """Append to an existing table, then confirm a mismatched schema is rejected.

        The same frame is uploaded twice (the second time with
        ``if_exists='append'``), so the table is expected to hold
        ``test_size * 2`` rows.  Appending a frame built by
        ``tm.makeMixedDataFrame`` must raise ``gbq.InvalidSchema``.
        """
        destination_table = DESTINATION_TABLE + "3"

        test_size = 10
        df = make_mixed_dataframe_v2(test_size)
        df_different_schema = tm.makeMixedDataFrame()

        # Seed the table with sample data.
        gbq.to_gbq(
            df, destination_table, _get_project_id(),
            chunksize=10000, private_key=_get_private_key_path())

        # Upload the same frame again, this time appending.
        gbq.to_gbq(
            df, destination_table, _get_project_id(),
            if_exists='append', private_key=_get_private_key_path())

        sleep(30)  # <- Curses Google!!!

        count_query = "SELECT COUNT(*) as NUM_ROWS FROM {0}".format(
            destination_table)
        row_counts = gbq.read_gbq(
            count_query,
            project_id=_get_project_id(),
            private_key=_get_private_key_path())
        self.assertEqual(row_counts['NUM_ROWS'][0], test_size * 2)

        # A frame with a different schema must not be appendable.
        with tm.assertRaises(gbq.InvalidSchema):
            gbq.to_gbq(
                df_different_schema, destination_table, _get_project_id(),
                if_exists='append', private_key=_get_private_key_path())
Example #7
0
    def test_isnull(self):
        """Check ``isnull`` on scalars and on each pandas container type.

        ``None`` and NaN (both ``np.NaN`` and the builtin ``float('nan')``)
        must be reported as null; a plain float and both infinities must not.
        For Series the result must be a Series; for DataFrame, Panel and
        Panel4D it must equal the result of ``.apply(isnull)``.
        """
        self.assertFalse(isnull(1.))
        self.assertTrue(isnull(None))
        self.assertTrue(isnull(np.NaN))
        # Pass the builtin NaN through isnull: asserting on float('nan')
        # directly always succeeds (NaN is truthy) and tests nothing.
        self.assertTrue(isnull(float('nan')))
        self.assertFalse(isnull(np.inf))
        self.assertFalse(isnull(-np.inf))

        # series
        for s in [tm.makeFloatSeries(), tm.makeStringSeries(),
                  tm.makeObjectSeries(), tm.makeTimeSeries(),
                  tm.makePeriodSeries()]:
            assert isinstance(isnull(s), Series)

        # frame
        for df in [tm.makeTimeDataFrame(), tm.makePeriodFrame(),
                   tm.makeMixedDataFrame()]:
            result = isnull(df)
            expected = df.apply(isnull)
            tm.assert_frame_equal(result, expected)

        # panel: warnings raised inside the block are recorded, not shown
        with catch_warnings(record=True):
            for p in [tm.makePanel(), tm.makePeriodPanel(),
                      tm.add_nans(tm.makePanel())]:
                result = isnull(p)
                expected = p.apply(isnull)
                tm.assert_panel_equal(result, expected)

        # panel 4d: same warning handling as for panels
        with catch_warnings(record=True):
            for p in [tm.makePanel4D(), tm.add_nans_panel4d(tm.makePanel4D())]:
                result = isnull(p)
                expected = p.apply(isnull)
                tm.assert_panel4d_equal(result, expected)
Example #8
0
def test_csv_to_s3_into():
    """Round-trip a mixed DataFrame through a local CSV and an S3 bucket via ``into``."""
    expected = tm.makeMixedDataFrame()
    with tmpfile('.csv') as local_path, s3_bucket('.csv') as bucket:
        expected.to_csv(local_path, index=False)
        remote_csv = into(bucket, CSV(local_path))
        actual = into(pd.DataFrame, remote_csv)
    # Compared after both context managers have exited.
    tm.assert_frame_equal(expected, actual)
Example #9
0
def test_csv_to_s3_append():
    """Append a local CSV to an S3 resource and read it back as a DataFrame."""
    expected = tm.makeMixedDataFrame()
    with tmpfile(".csv") as local_path, s3_bucket(".csv") as bucket:
        remote = resource(bucket)
        expected.to_csv(local_path, index=False)
        append(remote, CSV(local_path))
        actual = into(pd.DataFrame, remote)
    # Compared after both context managers have exited.
    tm.assert_frame_equal(expected, actual)
Example #10
0
    def test_generate_bq_schema(self):
        """``gbq.generate_bq_schema`` must map the mixed frame's columns A-D to the expected types."""
        df = tm.makeMixedDataFrame()
        schema = gbq.generate_bq_schema(df)

        # (column name, expected BigQuery type), in column order
        column_types = [
            ('A', 'FLOAT'),
            ('B', 'FLOAT'),
            ('C', 'STRING'),
            ('D', 'TIMESTAMP'),
        ]
        test_schema = {
            'fields': [{'name': column, 'type': bq_type}
                       for column, bq_type in column_types]
        }

        self.assertEqual(schema, test_schema)
Example #11
0
def test_s3_encrypted_multipart_upload(s3_encryption_bucket):
    """Upload a CSV with ``encrypt_key=True, multipart=True`` and read it back.

    ``s3_encryption_bucket`` is interpolated into the ``s3://`` URI as the
    bucket name.
    """
    connection = boto.connect_s3()

    expected = tm.makeMixedDataFrame()
    with tmpfile('.csv') as local_path:
        expected.to_csv(local_path, index=False)
        remote_uri = 's3://{bucket}/{fn}'.format(
            bucket=s3_encryption_bucket,
            fn=os.path.basename(local_path))
        odo(local_path, remote_uri, s3=connection,
            encrypt_key=True, multipart=True)
        actual = odo(remote_uri, pd.DataFrame, s3=connection)

    tm.assert_frame_equal(expected, actual)
Example #12
0
def test_head_compute():
    """The repr of ``head(n)`` must show exactly ``n`` rows and no ellipsis.

    The mixed frame is loaded into the ``sqlite:///:memory:::t`` target and
    wrapped in ``Data`` before ``head`` is taken.
    """
    data = tm.makeMixedDataFrame()
    t = symbol('t', discover(data))
    db = into('sqlite:///:memory:::t', data, dshape=t.dshape)
    n = 2

    text = repr(Data(db).head(n))
    assert '...' not in text

    # Drop the first line (the header); what remains should be the n rows.
    body_lines = text.split('\n')[1:]
    assert len(body_lines) == n
Example #13
0
    def test_generate_schema(self):
        """``gbq._generate_bq_schema`` must map the mixed frame's columns A-D to the expected types."""
        df = tm.makeMixedDataFrame()
        schema = gbq._generate_bq_schema(df)

        # (column name, expected BigQuery type), in column order
        column_types = (
            ("A", "FLOAT"),
            ("B", "FLOAT"),
            ("C", "STRING"),
            ("D", "TIMESTAMP"),
        )
        test_schema = {
            "fields": [
                {"name": column, "type": bq_type}
                for column, bq_type in column_types
            ]
        }

        self.assertEqual(schema, test_schema)
Example #14
0
    def test_upload_data_if_table_exists_replace(self):
        """Replace an existing table with a different frame and check the row count.

        After the ``if_exists='replace'`` upload the table is expected to
        hold 5 rows.
        """
        table_name = 'new_test4'
        # Fully qualified table id, shared by both uploads and the query.
        table_id = "pydata_pandas_bq_testing." + table_name

        test_size = 10
        df = make_mixed_dataframe_v2(test_size)
        df_different_schema = tm.makeMixedDataFrame()

        # Seed the table with sample data.
        gbq.to_gbq(df, table_id, PROJECT_ID, chunksize=10000)

        # Overwrite it with the differently-shaped frame.
        gbq.to_gbq(df_different_schema, table_id, PROJECT_ID,
                   if_exists='replace')

        sleep(60)  # <- Curses Google!!!

        count_query = "SELECT COUNT(*) as NUM_ROWS FROM " + table_id
        row_counts = gbq.read_gbq(count_query, project_id=PROJECT_ID)
        self.assertEqual(row_counts['NUM_ROWS'][0], 5)
Example #15
0
    def test_upload_data_if_table_exists_replace(self):
        """Replace an existing table with a different frame and check the row count.

        After the ``if_exists='replace'`` upload the table is expected to
        hold 5 rows.
        """
        destination_table = DESTINATION_TABLE + "4"

        test_size = 10
        df = make_mixed_dataframe_v2(test_size)
        df_different_schema = tm.makeMixedDataFrame()

        # Seed the table with sample data.
        gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000)

        # Overwrite it with the differently-shaped frame.
        gbq.to_gbq(df_different_schema, destination_table, PROJECT_ID,
                   if_exists='replace')

        sleep(60)  # <- Curses Google!!!

        count_query = "SELECT COUNT(*) as NUM_ROWS FROM {0}".format(
            destination_table)
        row_counts = gbq.read_gbq(count_query, project_id=PROJECT_ID)
        self.assertEqual(row_counts['NUM_ROWS'][0], 5)
Example #16
0
    def test_isnull(self):
        """Check ``isnull`` on scalars and on each pandas container type.

        ``None`` and NaN (both ``np.NaN`` and the builtin ``float('nan')``)
        must be reported as null; a plain float and both infinities must not.
        For Series the result must be a Series; for DataFrame, Panel and
        Panel4D it must equal the result of ``.apply(isnull)``.
        """
        self.assertFalse(isnull(1.))
        self.assertTrue(isnull(None))
        self.assertTrue(isnull(np.NaN))
        # Pass the builtin NaN through isnull: asserting on float('nan')
        # directly always succeeds (NaN is truthy) and tests nothing.
        self.assertTrue(isnull(float('nan')))
        self.assertFalse(isnull(np.inf))
        self.assertFalse(isnull(-np.inf))

        # series
        for s in [
                tm.makeFloatSeries(),
                tm.makeStringSeries(),
                tm.makeObjectSeries(),
                tm.makeTimeSeries(),
                tm.makePeriodSeries()
        ]:
            self.assertIsInstance(isnull(s), Series)

        # frame
        for df in [
                tm.makeTimeDataFrame(),
                tm.makePeriodFrame(),
                tm.makeMixedDataFrame()
        ]:
            result = isnull(df)
            expected = df.apply(isnull)
            tm.assert_frame_equal(result, expected)

        # panel
        for p in [
                tm.makePanel(),
                tm.makePeriodPanel(),
                tm.add_nans(tm.makePanel())
        ]:
            result = isnull(p)
            expected = p.apply(isnull)
            tm.assert_panel_equal(result, expected)

        # panel 4d: the block as a whole is required to emit a FutureWarning
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            for p in [tm.makePanel4D(), tm.add_nans_panel4d(tm.makePanel4D())]:
                result = isnull(p)
                expected = p.apply(isnull)
                tm.assert_panel4d_equal(result, expected)
Example #17
0
def test_dataframe():
    """Convert a mixed frame whose first row was set to NaN and check the rows.

    ``rows[1]`` must carry NaN / NaT values, and ``rows[2:]`` must match the
    four remaining data rows exactly.
    """
    import numpy
    from pandas import Timestamp
    from pandas.util import testing
    from pandas.tslib import NaTType

    from ..dataframe import dataframe_to_rows

    frame = testing.makeMixedDataFrame()
    frame.iloc[0] = numpy.nan

    rows = tuple(dataframe_to_rows(frame))

    # The row that was overwritten with NaN.
    blanked = rows[1]
    assert isnan(blanked[1])
    assert type(blanked[-1]) == NaTType

    expected_tail = (
        [1, 1.0, 1.0, 'foo2', Timestamp('2009-01-02 00:00:00')],
        [2, 2.0, 0.0, 'foo3', Timestamp('2009-01-05 00:00:00')],
        [3, 3.0, 1.0, 'foo4', Timestamp('2009-01-06 00:00:00')],
        [4, 4.0, 0.0, 'foo5', Timestamp('2009-01-07 00:00:00')],
    )
    assert rows[2:] == expected_tail
Example #18
0
def test_isnull():
    """Check ``isnull`` on scalars, Series, DataFrames, Panels and Panel4Ds.

    ``None`` and NaN must be reported as null; a plain float and both
    infinities must not.  For the containers, the result is compared with
    what ``.apply(isnull)`` produces (Series are only type-checked).
    """
    # scalars
    assert not isnull(1.)
    assert isnull(None)
    assert isnull(np.NaN)
    assert not isnull(np.inf)
    assert not isnull(-np.inf)

    # series: the result must itself be a Series
    series_cases = [tm.makeFloatSeries(), tm.makeStringSeries(),
                    tm.makeObjectSeries(), tm.makeTimeSeries(),
                    tm.makePeriodSeries()]
    for ser in series_cases:
        assert isinstance(isnull(ser), Series)

    # frame
    frame_cases = [tm.makeTimeDataFrame(), tm.makePeriodFrame(),
                   tm.makeMixedDataFrame()]
    for frame in frame_cases:
        tm.assert_frame_equal(isnull(frame), frame.apply(isnull))

    # panel
    panel_cases = [tm.makePanel(), tm.makePeriodPanel(),
                   tm.add_nans(tm.makePanel())]
    for panel in panel_cases:
        tm.assert_panel_equal(isnull(panel), panel.apply(isnull))

    # panel 4d
    panel4d_cases = [tm.makePanel4D(), tm.add_nans_panel4d(tm.makePanel4D())]
    for panel4d in panel4d_cases:
        tm.assert_panel4d_equal(isnull(panel4d), panel4d.apply(isnull))
Example #19
0
    def test_isna_isnull(self, isna_f):
        """Check the null-detection function ``isna_f`` on scalars, Series and frames.

        Parameters
        ----------
        isna_f : callable
            The function under test.  (Presumably injected by a fixture --
            the provider is not visible from this test.)
        """
        assert not isna_f(1.)
        assert isna_f(None)
        assert isna_f(np.NaN)
        # Route the builtin NaN through isna_f as well: a bare
        # ``assert float('nan')`` always passes (NaN is truthy) and would
        # test nothing.
        assert isna_f(float('nan'))
        assert not isna_f(np.inf)
        assert not isna_f(-np.inf)

        # series: the result must itself be a Series
        for s in [tm.makeFloatSeries(), tm.makeStringSeries(),
                  tm.makeObjectSeries(), tm.makeTimeSeries(),
                  tm.makePeriodSeries()]:
            assert isinstance(isna_f(s), Series)

        # frame: the result must match applying isna_f through DataFrame.apply
        for df in [tm.makeTimeDataFrame(), tm.makePeriodFrame(),
                   tm.makeMixedDataFrame()]:
            result = isna_f(df)
            expected = df.apply(isna_f)
            tm.assert_frame_equal(result, expected)
Example #20
0
    def test_hash_pandas_object(self):
        """Run both hashing checks on a spread of Series, Index and DataFrame objects.

        Every object is passed to ``self.check_equal`` and then to
        ``self.check_not_equal_with_index``.
        """
        hash_inputs = [
            Series([1, 2, 3]),
            Series([1.0, 1.5, 3.2]),
            Series([1.0, 1.5, np.nan]),
            Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
            Series(['a', 'b', 'c']),
            Series(['a', np.nan, 'c']),
            Series(['a', None, 'c']),
            Series([True, False, True]),
            Index([1, 2, 3]),
            Index([True, False, True]),
            DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}),
            tm.makeMissingDataframe(),
            tm.makeMixedDataFrame(),
            tm.makeTimeDataFrame(),
            tm.makeTimeSeries(),
            tm.makeTimedeltaIndex(),
        ]

        for candidate in hash_inputs:
            self.check_equal(candidate)
            self.check_not_equal_with_index(candidate)
Example #21
0
    def test_upload_data_if_table_exists_replace(self):
        """Replace an existing table with a different frame and check the row count.

        After the ``if_exists='replace'`` upload the table is expected to
        hold 5 rows.
        """
        destination_table = DESTINATION_TABLE + "4"

        test_size = 10
        df = make_mixed_dataframe_v2(test_size)
        df_different_schema = tm.makeMixedDataFrame()

        # Seed the table with sample data.
        gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000)

        # Overwrite it with the differently-shaped frame.
        gbq.to_gbq(
            df_different_schema, destination_table, PROJECT_ID,
            if_exists='replace')

        sleep(30)  # <- Curses Google!!!

        count_query = "SELECT COUNT(*) as NUM_ROWS FROM {0}".format(
            destination_table)
        row_counts = gbq.read_gbq(count_query, project_id=PROJECT_ID)
        self.assertEqual(row_counts['NUM_ROWS'][0], 5)
Example #22
0
    def test_isna_isnull(self, isna_f):
        """Check the null-detection function ``isna_f`` on scalars, Series and frames.

        Parameters
        ----------
        isna_f : callable
            The function under test.  (Presumably injected by a fixture --
            the provider is not visible from this test.)
        """
        assert not isna_f(1.)
        assert isna_f(None)
        assert isna_f(np.NaN)
        # Route the builtin NaN through isna_f as well: a bare
        # ``assert float('nan')`` always passes (NaN is truthy) and would
        # test nothing.
        assert isna_f(float('nan'))
        assert not isna_f(np.inf)
        assert not isna_f(-np.inf)

        # series: the result must itself be a Series
        for s in [tm.makeFloatSeries(), tm.makeStringSeries(),
                  tm.makeObjectSeries(), tm.makeTimeSeries(),
                  tm.makePeriodSeries()]:
            assert isinstance(isna_f(s), Series)

        # frame: the result must match applying isna_f through DataFrame.apply
        for df in [tm.makeTimeDataFrame(), tm.makePeriodFrame(),
                   tm.makeMixedDataFrame()]:
            result = isna_f(df)
            expected = df.apply(isna_f)
            tm.assert_frame_equal(result, expected)
Example #23
0
    def test_upload_data_if_table_exists_append(self):
        """Append to an existing table, then confirm a mismatched schema is rejected.

        The same frame is uploaded twice (the second time with
        ``if_exists='append'``), so the table is expected to hold
        ``test_size * 2`` rows.  Appending a frame built by
        ``tm.makeMixedDataFrame`` must raise ``gbq.InvalidSchema``.
        """
        destination_table = DESTINATION_TABLE + "3"

        test_size = 10
        df = make_mixed_dataframe_v2(test_size)
        df_different_schema = tm.makeMixedDataFrame()

        # Seed the table with sample data.
        gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000)

        # Upload the same frame again, this time appending.
        gbq.to_gbq(df, destination_table, PROJECT_ID, if_exists='append')

        sleep(60)  # <- Curses Google!!!

        count_query = "SELECT COUNT(*) as NUM_ROWS FROM {0}".format(
            destination_table)
        row_counts = gbq.read_gbq(count_query, project_id=PROJECT_ID)
        self.assertEqual(row_counts['NUM_ROWS'][0], test_size * 2)

        # A frame with a different schema must not be appendable.
        with tm.assertRaises(gbq.InvalidSchema):
            gbq.to_gbq(
                df_different_schema, destination_table, PROJECT_ID,
                if_exists='append')
Example #24
0
    def test_upload_data_if_table_exists_append(self):
        """Append to an existing table, then confirm a mismatched schema is rejected.

        The same frame is uploaded twice (the second time with
        ``if_exists='append'``), so the table is expected to hold
        ``test_size * 2`` rows.  Appending a frame built by
        ``tm.makeMixedDataFrame`` must raise ``gbq.InvalidSchema``.
        """
        table_name = 'new_test3'
        # Fully qualified table id, shared by every upload and the query.
        table_id = "pydata_pandas_bq_testing." + table_name

        test_size = 10
        df = make_mixed_dataframe_v2(test_size)
        df_different_schema = tm.makeMixedDataFrame()

        # Seed the table with sample data.
        gbq.to_gbq(df, table_id, PROJECT_ID, chunksize=10000)

        # Upload the same frame again, this time appending.
        gbq.to_gbq(df, table_id, PROJECT_ID, if_exists='append')

        sleep(60)  # <- Curses Google!!!

        count_query = "SELECT COUNT(*) as NUM_ROWS FROM " + table_id
        row_counts = gbq.read_gbq(count_query, project_id=PROJECT_ID)
        self.assertEqual(row_counts['NUM_ROWS'][0], test_size * 2)

        # A frame with a different schema must not be appendable.
        with tm.assertRaises(gbq.InvalidSchema):
            gbq.to_gbq(df_different_schema, table_id, PROJECT_ID,
                       if_exists='append')
Example #25
0
    def test_generate_schema(self):
        """``gbq._generate_bq_schema`` must map the mixed frame's columns A-D to the expected types."""
        df = tm.makeMixedDataFrame()
        schema = gbq._generate_bq_schema(df)

        # (column name, expected BigQuery type), in column order
        column_types = [
            ('A', 'FLOAT'),
            ('B', 'FLOAT'),
            ('C', 'STRING'),
            ('D', 'TIMESTAMP'),
        ]
        test_schema = {
            'fields': [{'name': column, 'type': bq_type}
                       for column, bq_type in column_types]
        }

        self.assertEqual(schema, test_schema)
Example #26
0
    def test_isna_isnull(self, isna_f):
        """Check the null-detection function ``isna_f`` on scalars and containers.

        Parameters
        ----------
        isna_f : callable
            The function under test; it is called on scalars, Series,
            DataFrames and Panels.  (Presumably injected by a fixture -- the
            provider is not visible from this test.)
        """
        assert not isna_f(1.)
        assert isna_f(None)
        assert isna_f(np.NaN)
        # Route the builtin NaN through isna_f as well: a bare
        # ``assert float('nan')`` always passes (NaN is truthy) and would
        # test nothing.
        assert isna_f(float('nan'))
        assert not isna_f(np.inf)
        assert not isna_f(-np.inf)

        # series: the result must itself be a Series
        for s in [
                tm.makeFloatSeries(),
                tm.makeStringSeries(),
                tm.makeObjectSeries(),
                tm.makeTimeSeries(),
                tm.makePeriodSeries()
        ]:
            assert isinstance(isna_f(s), Series)

        # frame: the result must match applying isna_f through DataFrame.apply
        for df in [
                tm.makeTimeDataFrame(),
                tm.makePeriodFrame(),
                tm.makeMixedDataFrame()
        ]:
            result = isna_f(df)
            expected = df.apply(isna_f)
            tm.assert_frame_equal(result, expected)

        # panel: FutureWarnings are ignored while Panels are built and checked
        with catch_warnings(record=True):
            simplefilter("ignore", FutureWarning)
            for p in [
                    tm.makePanel(),
                    tm.makePeriodPanel(),
                    tm.add_nans(tm.makePanel())
            ]:
                result = isna_f(p)
                expected = p.apply(isna_f)
                tm.assert_panel_equal(result, expected)
Example #27
0
def test_tabulator_stream_dataframe(document, comm):
    """Stream a two-row DataFrame into a Tabulator and check the model's source data.

    After streaming, ``table.value`` must hold 7 rows and every column in
    ``model.source.data`` must equal the expected array.
    """
    table = Tabulator(makeMixedDataFrame())
    model = table.get_root(document, comm)

    new_rows = pd.DataFrame({
        'A': [5, 6],
        'B': [1, 0],
        'C': ['foo6', 'foo7'],
        'D': [dt.datetime(2009, 1, 8), dt.datetime(2009, 1, 9)],
    })
    table.stream(new_rows)

    assert len(table.value) == 7

    expected_dates = np.array(
        [
            '2009-01-01T00:00:00.000000000',
            '2009-01-02T00:00:00.000000000',
            '2009-01-05T00:00:00.000000000',
            '2009-01-06T00:00:00.000000000',
            '2009-01-07T00:00:00.000000000',
            '2009-01-08T00:00:00.000000000',
            '2009-01-09T00:00:00.000000000',
        ],
        dtype='datetime64[ns]')
    expected = {
        'index': np.array([0, 1, 2, 3, 4, 5, 6]),
        'A': np.array([0, 1, 2, 3, 4, 5, 6]),
        'B': np.array([0, 1, 0, 1, 0, 1, 0]),
        'C': np.array(
            ['foo1', 'foo2', 'foo3', 'foo4', 'foo5', 'foo6', 'foo7']),
        'D': expected_dates,
    }
    for column, actual in model.source.data.items():
        np.testing.assert_array_equal(actual, expected[column])
    def test_hash_pandas_object(self):
        """Run both hashing checks on a wide range of pandas objects.

        The inputs include empty containers, period and tz-aware data, and
        MultiIndexes.  Each one is passed to ``self.check_equal`` and then to
        ``self.check_not_equal_with_index``.
        """
        hash_inputs = [
            Series([1, 2, 3]),
            Series([1.0, 1.5, 3.2]),
            Series([1.0, 1.5, np.nan]),
            Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
            Series(['a', 'b', 'c']),
            Series(['a', np.nan, 'c']),
            Series(['a', None, 'c']),
            Series([True, False, True]),
            Series(),
            Index([1, 2, 3]),
            Index([True, False, True]),
            DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}),
            DataFrame(),
            tm.makeMissingDataframe(),
            tm.makeMixedDataFrame(),
            tm.makeTimeDataFrame(),
            tm.makeTimeSeries(),
            tm.makeTimedeltaIndex(),
            tm.makePeriodIndex(),
            Series(tm.makePeriodIndex()),
            Series(pd.date_range('20130101', periods=3, tz='US/Eastern')),
            MultiIndex.from_product(
                [range(5),
                 ['foo', 'bar', 'baz'],
                 pd.date_range('20130101', periods=2)]),
            MultiIndex.from_product(
                [pd.CategoricalIndex(list('aabc')), range(3)]),
        ]

        for candidate in hash_inputs:
            self.check_equal(candidate)
            self.check_not_equal_with_index(candidate)
Example #29
0
def test_tabulator_stream_df_rollover(document, comm):
    """Stream one row with ``rollover=5`` and check the model's source data.

    After streaming, ``table.value`` must hold 5 rows and every column in
    ``model.source.data`` must equal the expected array.
    """
    table = Tabulator(makeMixedDataFrame())
    model = table.get_root(document, comm)

    # Build a one-row frame by turning a Series into a column and transposing.
    new_record = pd.Series({
        'A': 5,
        'B': 1,
        'C': 'foo6',
        'D': dt.datetime(2009, 1, 8)
    })
    stream_value = new_record.to_frame().T

    table.stream(stream_value, rollover=5)

    assert len(table.value) == 5

    expected_dates = np.array(
        [
            '2009-01-02T00:00:00.000000000',
            '2009-01-05T00:00:00.000000000',
            '2009-01-06T00:00:00.000000000',
            '2009-01-07T00:00:00.000000000',
            '2009-01-08T00:00:00.000000000',
        ],
        dtype='datetime64[ns]')
    expected = {
        'index': np.array([1, 2, 3, 4, 5]),
        'A': np.array([1, 2, 3, 4, 5]),
        'B': np.array([1, 0, 1, 0, 1]),
        'C': np.array(['foo2', 'foo3', 'foo4', 'foo5', 'foo6']),
        'D': expected_dates,
    }
    for column, actual in model.source.data.items():
        np.testing.assert_array_equal(actual, expected[column])
Example #30
0
def test_tabulator_function_filter(document, comm):
    """Filter a Tabulator with a widget-bound function and check the source data.

    The filter keeps rows whose ``C`` column contains the widget's value;
    the source data is checked for 'foo3' and again after the widget's value
    is changed to 'foo1'.
    """
    table = Tabulator(makeMixedDataFrame())
    model = table.get_root(document, comm)

    widget = TextInput(value='foo3')

    def filter_c(df, value):
        matches = df.C.str.contains(value)
        return df[matches]

    def assert_source_matches(expected):
        # Every column currently in the model's source must match.
        for column, actual in model.source.data.items():
            np.testing.assert_array_equal(actual, expected[column])

    table.add_filter(bind(filter_c, value=widget), 'C')

    assert_source_matches({
        'index': np.array([2]),
        'A': np.array([2]),
        'B': np.array([0]),
        'C': np.array(['foo3']),
        'D': np.array(['2009-01-05T00:00:00.000000000'],
                      dtype='datetime64[ns]'),
    })

    widget.value = 'foo1'

    assert_source_matches({
        'index': np.array([0]),
        'A': np.array([0]),
        'B': np.array([0]),
        'C': np.array(['foo1']),
        'D': np.array(['2009-01-01T00:00:00.000000000'],
                      dtype='datetime64[ns]'),
    })
Example #31
0
def test_tabulator_constant_tuple_filter(document, comm):
    """Add the constant tuple filter ``(2, 3)`` on column ``A`` and check the source data."""
    table = Tabulator(makeMixedDataFrame())
    model = table.get_root(document, comm)

    table.add_filter((2, 3), 'A')

    expected_dates = np.array(
        ['2009-01-05T00:00:00.000000000', '2009-01-06T00:00:00.000000000'],
        dtype='datetime64[ns]')
    expected = {
        'index': np.array([2, 3]),
        'A': np.array([2, 3]),
        'B': np.array([0, 1]),
        'C': np.array(['foo3', 'foo4']),
        'D': expected_dates,
    }
    for column, actual in model.source.data.items():
        np.testing.assert_array_equal(actual, expected[column])
Example #32
0
def test_equal_dataframes_compare():
    """Comparing a frame with itself (keyed on ``A``) must leave ``diff`` as ``None``."""
    frame = makeMixedDataFrame()
    comparison = Compare(left=frame, right=frame, key_columns='A')
    assert comparison.diff is None
Example #33
0
    pd.Series([1., 2., 3.]),
    pd.Series([1., 2., 3.], name='foo'),
    pd.Series([1., 2., 3.], name='foo',
              index=[4, 5, 6]),
    pd.Series([1., 2., 3.], name='foo',
              index=pd.Index([4, 5, 6], name='bar')),
    pd.DataFrame({'x': ['a', 'b', 'c']}),
    pd.DataFrame({'x': [b'a', b'b', b'c']}),
    pd.DataFrame({'x': pd.Categorical(['a', 'b', 'a'], ordered=True)}),
    pd.DataFrame({'x': pd.Categorical(['a', 'b', 'a'], ordered=False)}),
    tm.makeCategoricalIndex(),
    tm.makeCustomDataframe(5, 3),
    tm.makeDataFrame(),
    tm.makeDateIndex(),
    tm.makeMissingDataframe(),
    tm.makeMixedDataFrame(),
    tm.makeObjectSeries(),
    tm.makePeriodFrame(),
    tm.makeRangeIndex(),
    tm.makeTimeDataFrame(),
    tm.makeTimeSeries(),
    tm.makeUnicodeIndex(),
]


@pytest.mark.parametrize('df', dfs)
def test_dumps_serialize_numpy(df):
    """Serialize ``df``, decompress the frames when the header asks for it, and deserialize.

    NOTE(review): the deserialized object is not compared with ``df`` in the
    visible code -- confirm whether an assertion is missing.
    """
    header, frames = serialize(df)
    frames = decompress(header, frames) if 'compression' in header else frames
    restored = deserialize(header, frames)
Example #34
0
		# create a dataframe
		PRICEDOMSIZE=  5  # domain size of prices
		SIZEDOMSIZE= 100
		def createTable(N):
			"""Return an N-row DataFrame of random integers in columns pA, pB, sA and sB.

			pA/pB are drawn with upper bound PRICEDOMSIZE and sA/sB with
			upper bound SIZEDOMSIZE; the lower bound is 0 in both cases.
			"""
			# Draw in the same order as the columns appear in the frame.
			price_a = np.random.randint(0, PRICEDOMSIZE, N)
			price_b = np.random.randint(0, PRICEDOMSIZE, N)
			size_a = np.random.randint(0, SIZEDOMSIZE, N)
			size_b = np.random.randint(0, SIZEDOMSIZE, N)
			columns = {'pA': price_a, 'pB': price_b, 'sA': size_a, 'sB': size_b}
			return pd.DataFrame(columns)
		createTable(5)

		# quickly create a dataframe for testing
		import pandas.util.testing as tm
		tm.N, tm.K = 5,3
		tm.makeDataFrame(), tm.makeMixedDataFrame(), tm.makeTimeDataFrame(freq="W")


		lst = [40, 10, 20, 30]
		names = ['AAA','Adfsdf','dfwef','fwefw']
		temp_df=pd.DataFrame(lst)
		temp_df=pd.DataFrame(list(zip(names,lst)),columns=["Name","Age"])
		
		# Create blank dataframe: could be useful if we want to append data row by row to a Dataframe.
		# In that case it’s better to have predefined columns
		blank_df=pd.DataFrame(columns=["Name","Age"])


		# Create rows for values separated by commas in a cell
			d = {"Team":["FC Barcelona", "FC Real Madrid"], 
				"Players":["Ter Stegen, Semedo, Piqué, Lenglet, Alba, Rakitic, De Jong, Sergi Roberto, Messi, Suárez, Griezmann",
Example #35
0
 def test_describe(self):
     """Smoke test: ``describe()`` must run on a float, a mixed and a time-indexed frame."""
     float_frame = tm.makeDataFrame()
     desc = float_frame.describe()

     mixed_frame = tm.makeMixedDataFrame()
     desc = mixed_frame.describe()

     time_frame = tm.makeTimeDataFrame()
     desc = time_frame.describe()
Example #36
0
    pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [4.0, 5.0, 6.0]}),
    pd.DataFrame({"x": [1.0, 2.0, 3.0]}, index=pd.Index([4, 5, 6], name="bar")),
    pd.Series([1.0, 2.0, 3.0]),
    pd.Series([1.0, 2.0, 3.0], name="foo"),
    pd.Series([1.0, 2.0, 3.0], name="foo", index=[4, 5, 6]),
    pd.Series([1.0, 2.0, 3.0], name="foo", index=pd.Index([4, 5, 6], name="bar")),
    pd.DataFrame({"x": ["a", "b", "c"]}),
    pd.DataFrame({"x": [b"a", b"b", b"c"]}),
    pd.DataFrame({"x": pd.Categorical(["a", "b", "a"], ordered=True)}),
    pd.DataFrame({"x": pd.Categorical(["a", "b", "a"], ordered=False)}),
    tm.makeCategoricalIndex(),
    tm.makeCustomDataframe(5, 3),
    tm.makeDataFrame(),
    tm.makeDateIndex(),
    tm.makeMissingDataframe(),
    tm.makeMixedDataFrame(),
    tm.makeObjectSeries(),
    tm.makePeriodFrame(),
    tm.makeRangeIndex(),
    tm.makeTimeDataFrame(),
    tm.makeTimeSeries(),
    tm.makeUnicodeIndex(),
]


@pytest.mark.parametrize("df", dfs)
def test_dumps_serialize_numpy(df):
    """Serialize ``df``, decompress the frames when the header asks for it, and deserialize.

    NOTE(review): the deserialized object is not compared with ``df`` in the
    visible code -- confirm whether an assertion is missing.
    """
    header, frames = serialize(df)
    frames = decompress(header, frames) if "compression" in header else frames
    restored = deserialize(header, frames)
Example #37
0
def test_tabulator_pagination(document, comm):
    """Check the model data of a remotely paginated ``Tabulator``.

    Builds a table over ``makeMixedDataFrame()`` with ``page_size=2``, then
    verifies the model's paging attributes and the column data held by
    ``model.source`` for page 1, for page 2, and again for page 1 after the
    page size is changed to 3.
    """
    table = Tabulator(makeMixedDataFrame(), pagination='remote', page_size=2)
    model = table.get_root(document, comm)

    def assert_source_matches(expected):
        # Every column currently in the model source must match the
        # expectation registered under the same column name.
        for col, values in model.source.data.items():
            np.testing.assert_array_equal(values, expected[col])

    # Initial state: first page, two rows per page.
    assert model.max_page == 3
    assert model.page_size == 2
    assert model.page == 1

    first_page = {
        'index': np.array([0, 1]),
        'A': np.array([0, 1]),
        'B': np.array([0, 1]),
        'C': np.array(['foo1', 'foo2']),
        'D': np.array(
            ['2009-01-01T00:00:00.000000000', '2009-01-02T00:00:00.000000000'],
            dtype='datetime64[ns]'),
    }
    assert_source_matches(first_page)

    # Move to the second page.
    table.page = 2

    second_page = {
        'index': np.array([2, 3]),
        'A': np.array([2, 3]),
        'B': np.array([0., 1.]),
        'C': np.array(['foo3', 'foo4']),
        'D': np.array(
            ['2009-01-05T00:00:00.000000000', '2009-01-06T00:00:00.000000000'],
            dtype='datetime64[ns]'),
    }
    assert_source_matches(second_page)

    # Grow the page size and go back to the first page.
    table.page_size = 3
    table.page = 1

    assert model.max_page == 2

    resized_first_page = {
        'index': np.array([0, 1, 2]),
        'A': np.array([0, 1, 2]),
        'B': np.array([0, 1, 0]),
        'C': np.array(['foo1', 'foo2', 'foo3']),
        'D': np.array(
            [
                '2009-01-01T00:00:00.000000000',
                '2009-01-02T00:00:00.000000000',
                '2009-01-05T00:00:00.000000000',
            ],
            dtype='datetime64[ns]'),
    }
    assert_source_matches(resized_first_page)
Example #38
0
class TestHashing(object):
    """Tests for the pandas hashing helpers.

    Exercises ``hash_pandas_object``, ``hash_array``, ``hash_tuples``,
    ``hash_tuple`` and ``_hash_scalar`` for determinism, error handling and
    agreement with each other.
    """

    @pytest.fixture(params=[
        Series([1, 2, 3] * 3, dtype='int32'),
        Series([None, 2.5, 3.5] * 3, dtype='float32'),
        Series(['a', 'b', 'c'] * 3, dtype='category'),
        Series(['d', 'e', 'f'] * 3),
        Series([True, False, True] * 3),
        Series(pd.date_range('20130101', periods=9)),
        Series(pd.date_range('20130101', periods=9, tz='US/Eastern')),
        Series(pd.timedelta_range('2000', periods=9))
    ])
    def series(self, request):
        """Yield each of the sample Series listed in ``params`` in turn."""
        return request.param

    def test_consistency(self):
        """Pin the hashes of a known Index to hard-coded values."""
        # check that our hash doesn't change because of a mistake
        # in the actual code; this is the ground truth
        result = hash_pandas_object(Index(['foo', 'bar', 'baz']))
        expected = Series(np.array(
            [3600424527151052760, 1374399572096150070, 477881037637427054],
            dtype='uint64'),
                          index=['foo', 'bar', 'baz'])
        tm.assert_series_equal(result, expected)

    def test_hash_array(self, series):
        """Hashing the same values twice must give equal arrays."""
        a = series.values
        tm.assert_numpy_array_equal(hash_array(a), hash_array(a))

    def test_hash_array_mixed(self):
        """A mixed int/str array hashes like its str and object forms."""
        result1 = hash_array(np.array([3, 4, 'All']))
        result2 = hash_array(np.array(['3', '4', 'All']))
        result3 = hash_array(np.array([3, 4, 'All'], dtype=object))
        tm.assert_numpy_array_equal(result1, result2)
        tm.assert_numpy_array_equal(result1, result3)

    @pytest.mark.parametrize('val', [5, 'foo', pd.Timestamp('20130101')])
    def test_hash_array_errors(self, val):
        """``hash_array`` raises TypeError for scalar (non-array) input."""
        msg = 'must pass a ndarray-like'
        with pytest.raises(TypeError, match=msg):
            hash_array(val)

    def check_equal(self, obj, **kwargs):
        """Assert that hashing ``obj`` twice gives the same Series.

        The check is run once with ``kwargs`` as given and once more with
        any ``index`` keyword removed.
        """
        a = hash_pandas_object(obj, **kwargs)
        b = hash_pandas_object(obj, **kwargs)
        tm.assert_series_equal(a, b)

        kwargs.pop('index', None)
        a = hash_pandas_object(obj, **kwargs)
        b = hash_pandas_object(obj, **kwargs)
        tm.assert_series_equal(a, b)

    def check_not_equal_with_index(self, obj):
        """Assert that ``index=True`` and ``index=False`` hash differently.

        Skipped for ``Index`` objects; the inequality is only asserted
        when ``obj`` is non-empty.
        """
        # check that we are not hashing the same if
        # we include the index
        if not isinstance(obj, Index):
            a = hash_pandas_object(obj, index=True)
            b = hash_pandas_object(obj, index=False)
            if len(obj):
                assert not (a == b).all()

    def test_hash_tuples(self):
        """``hash_tuples`` agrees with hashing the equivalent MultiIndex."""
        tups = [(1, 'one'), (1, 'two'), (2, 'one')]
        result = hash_tuples(tups)
        expected = hash_pandas_object(MultiIndex.from_tuples(tups)).values
        tm.assert_numpy_array_equal(result, expected)

        # a single tuple is accepted as well
        result = hash_tuples(tups[0])
        assert result == expected[0]

    @pytest.mark.parametrize('tup', [(1, 'one'), (1, np.nan),
                                     (1.0, pd.NaT, 'A'),
                                     ('A', pd.Timestamp("2012-01-01"))])
    def test_hash_tuple(self, tup):
        """``hash_tuple`` matches the first element of ``hash_tuples``."""
        # test equivalence between hash_tuples and hash_tuple
        result = hash_tuple(tup)
        expected = hash_tuples([tup])[0]
        assert result == expected

    @pytest.mark.parametrize('val', [
        1, 1.4, 'A', b'A', u'A',
        pd.Timestamp("2012-01-01"),
        pd.Timestamp("2012-01-01", tz='Europe/Brussels'),
        datetime.datetime(2012, 1, 1),
        pd.Timestamp("2012-01-01", tz='EST').to_pydatetime(),
        pd.Timedelta('1 days'),
        datetime.timedelta(1),
        pd.Period('2012-01-01', freq='D'),
        pd.Interval(0, 1), np.nan, pd.NaT, None
    ])
    def test_hash_scalar(self, val):
        """``_hash_scalar`` matches ``hash_array`` on a 1-element array."""
        result = _hash_scalar(val)
        expected = hash_array(np.array([val], dtype=object), categorize=True)
        assert result[0] == expected[0]

    @pytest.mark.parametrize('val', [5, 'foo', pd.Timestamp('20130101')])
    def test_hash_tuples_err(self, val):
        """``hash_tuples`` raises TypeError for non list-of-tuples input."""
        msg = 'must be convertible to a list-of-tuples'
        with pytest.raises(TypeError, match=msg):
            hash_tuples(val)

    def test_multiindex_unique(self):
        """Hashes of a unique MultiIndex are themselves unique."""
        mi = MultiIndex.from_tuples([(118, 472), (236, 118), (51, 204),
                                     (102, 51)])
        assert mi.is_unique is True
        result = hash_pandas_object(mi)
        assert result.is_unique is True

    def test_multiindex_objects(self):
        """``_hashed_values`` agrees with ``hash_pandas_object``."""
        mi = MultiIndex(levels=[['b', 'd', 'a'], [1, 2, 3]],
                        labels=[[0, 1, 0, 2], [2, 0, 0, 1]],
                        names=['col1', 'col2'])
        recons = mi._sort_levels_monotonic()

        # these are equal
        assert mi.equals(recons)
        assert Index(mi.values).equals(Index(recons.values))

        # _hashed_values and hash_pandas_object(..., index=False)
        # equivalency
        expected = hash_pandas_object(mi, index=False).values
        result = mi._hashed_values
        tm.assert_numpy_array_equal(result, expected)

        expected = hash_pandas_object(recons, index=False).values
        result = recons._hashed_values
        tm.assert_numpy_array_equal(result, expected)

        expected = mi._hashed_values
        result = recons._hashed_values

        # values should match, but in different order
        tm.assert_numpy_array_equal(np.sort(result), np.sort(expected))

    @pytest.mark.parametrize('obj', [
        Series([1, 2, 3]),
        Series([1.0, 1.5, 3.2]),
        Series([1.0, 1.5, np.nan]),
        Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
        Series(['a', 'b', 'c']),
        Series(['a', np.nan, 'c']),
        Series(['a', None, 'c']),
        Series([True, False, True]),
        Series(),
        Index([1, 2, 3]),
        Index([True, False, True]),
        DataFrame({
            'x': ['a', 'b', 'c'],
            'y': [1, 2, 3]
        }),
        DataFrame(),
        tm.makeMissingDataframe(),
        tm.makeMixedDataFrame(),
        tm.makeTimeDataFrame(),
        tm.makeTimeSeries(),
        tm.makeTimedeltaIndex(),
        tm.makePeriodIndex(),
        Series(tm.makePeriodIndex()),
        Series(pd.date_range('20130101', periods=3, tz='US/Eastern')),
        MultiIndex.from_product([
            range(5), ['foo', 'bar', 'baz'],
            pd.date_range('20130101', periods=2)
        ]),
        MultiIndex.from_product([pd.CategoricalIndex(list('aabc')),
                                 range(3)])
    ])
    def test_hash_pandas_object(self, obj):
        """Run both hashing checks over a variety of pandas objects."""
        self.check_equal(obj)
        self.check_not_equal_with_index(obj)

    def test_hash_pandas_object2(self, series):
        """Run both hashing checks over the ``series`` fixture values."""
        self.check_equal(series)
        self.check_not_equal_with_index(series)

    @pytest.mark.parametrize(
        'obj',
        [Series([], dtype='float64'),
         Series([], dtype='object'),
         Index([])])
    def test_hash_pandas_empty_object(self, obj):
        """Empty objects hash deterministically."""
        # these are by-definition the same with
        # or w/o the index as the data is empty
        self.check_equal(obj)

    @pytest.mark.parametrize('s1', [
        Series(['a', 'b', 'c', 'd']),
        Series([1000, 2000, 3000, 4000]),
        Series(pd.date_range(0, periods=4))
    ])
    @pytest.mark.parametrize('categorize', [True, False])
    def test_categorical_consistency(self, s1, categorize):
        """Categoricals hash by value, regardless of category order."""
        # GH15143
        # Check that categoricals hash consistent with their values, not codes
        # This should work for categoricals of any dtype
        s2 = s1.astype('category').cat.set_categories(s1)
        s3 = s2.cat.set_categories(list(reversed(s1)))

        # These should all hash identically
        h1 = hash_pandas_object(s1, categorize=categorize)
        h2 = hash_pandas_object(s2, categorize=categorize)
        h3 = hash_pandas_object(s3, categorize=categorize)
        tm.assert_series_equal(h1, h2)
        tm.assert_series_equal(h1, h3)

    def test_categorical_with_nan_consistency(self):
        """Hashes of a small categorical appear among a larger one's."""
        c = pd.Categorical.from_codes([-1, 0, 1, 2, 3, 4],
                                      categories=pd.date_range('2012-01-01',
                                                               periods=5,
                                                               name='B'))
        expected = hash_array(c, categorize=False)
        c = pd.Categorical.from_codes([-1, 0],
                                      categories=[pd.Timestamp('2012-01-01')])
        result = hash_array(c, categorize=False)
        assert result[0] in expected
        assert result[1] in expected

    @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
    def test_pandas_errors(self):
        """``hash_pandas_object`` rejects a Timestamp and a Panel."""
        with pytest.raises(TypeError):
            hash_pandas_object(pd.Timestamp('20130101'))

        obj = tm.makePanel()

        with pytest.raises(TypeError):
            hash_pandas_object(obj)

    def test_hash_keys(self):
        """Different hash keys give different hashes for the same data."""
        # using different hash keys, should have different hashes
        # for the same data

        # this only matters for object dtypes
        obj = Series(list('abc'))
        a = hash_pandas_object(obj, hash_key='9876543210123456')
        b = hash_pandas_object(obj, hash_key='9876543210123465')
        assert (a != b).all()

    def test_invalid_key(self):
        """A hash key that is not 16 bytes long raises ValueError."""
        # this only matters for object dtypes
        msg = 'key should be a 16-byte string encoded'
        with pytest.raises(ValueError, match=msg):
            hash_pandas_object(Series(list('abc')), hash_key='foo')

    def test_alread_encoded(self):
        """Already-encoded (bytes) values hash deterministically."""
        # if already encoded then ok

        obj = Series(list('abc')).str.encode('utf8')
        self.check_equal(obj)

    def test_alternate_encoding(self):
        """Hashing with ``encoding='ascii'`` is deterministic."""
        obj = Series(list('abc'))
        self.check_equal(obj, encoding='ascii')

    @pytest.mark.parametrize('l_exp', range(8))
    @pytest.mark.parametrize('l_add', [0, 1])
    def test_same_len_hash_collisions(self, l_exp, l_add):
        """Two random strings of the same length must not collide."""
        length = 2**(l_exp + 8) + l_add
        s = tm.rands_array(length, 2)
        result = hash_array(s, 'utf8')
        assert result[0] != result[1]

    def test_hash_collisions(self):
        """Two known long strings hash to distinct, pinned values."""
        # hash collisions are bad
        # https://github.com/pandas-dev/pandas/issues/14711#issuecomment-264885726
        long_strings = [
            'Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9',  # noqa
            'Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe'  # noqa
        ]

        # these should be different!
        result1 = hash_array(np.asarray(long_strings[0:1], dtype=object),
                             'utf8')
        expected1 = np.array([14963968704024874985], dtype=np.uint64)
        tm.assert_numpy_array_equal(result1, expected1)

        result2 = hash_array(np.asarray(long_strings[1:2], dtype=object),
                             'utf8')
        expected2 = np.array([16428432627716348016], dtype=np.uint64)
        tm.assert_numpy_array_equal(result2, expected2)

        result = hash_array(np.asarray(long_strings, dtype=object), 'utf8')
        tm.assert_numpy_array_equal(
            result, np.concatenate([expected1, expected2], axis=0))
Example #39
0
 def test_describe(self):
     """Smoke-test ``describe()`` on three sample frames.

     Builds a frame with each of ``tm.makeDataFrame``,
     ``tm.makeMixedDataFrame`` and ``tm.makeTimeDataFrame`` and calls
     ``describe()`` on it. The result is not inspected; the test only
     checks that the call does not raise.
     """
     for make_frame in (tm.makeDataFrame, tm.makeMixedDataFrame,
                        tm.makeTimeDataFrame):
         make_frame().describe()
Example #40
0
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pandas.util.testing as pd_samples
from pandas.testing import assert_frame_equal, assert_series_equal

import pytest

from pandasbikeshed.plot import (robust_hist, robust_scatter, robust_info,
                                 robust_kde, robust_pairplot, corr_heatmap,
                                 dist_catplot)

# Shared sample data for the tests in this module: a frame built by
# ``makeMissingDataframe`` together with its 'A' and 'B' columns, and a
# frame built by ``makeMixedDataFrame``.
nan_df = pd_samples.makeMissingDataframe()
nan_a, nan_b = nan_df['A'], nan_df['B']
mixed_df = pd_samples.makeMixedDataFrame()


def test_robust_hist():
    """``robust_hist`` returns an Axes for a Series and for its ndarray."""
    from_series = robust_hist(nan_a)
    assert isinstance(from_series, plt.Axes)

    from_array = robust_hist(nan_a.values)
    assert isinstance(from_array, plt.Axes)


def test_robust_scatter():
    """``robust_scatter`` returns an Axes for Series and ndarray pairs."""
    from_series = robust_scatter(nan_a, nan_b)
    assert isinstance(from_series, plt.Axes)

    from_arrays = robust_scatter(nan_a.values, nan_b.values)
    assert isinstance(from_arrays, plt.Axes)


def test_robust_kde():
    """``robust_kde`` returns an Axes when given one or two Series."""
    one_variable = robust_kde(nan_a)
    assert isinstance(one_variable, plt.Axes)

    two_variables = robust_kde(nan_a, nan_b)
    assert isinstance(two_variables, plt.Axes)