Example #1
    def test_ignore_datelike_data(self):

        df = self.df.copy()
        df['date'] = pd.date_range('2010-01-01', periods=len(df), freq='d')
        result = ag.PairGrid(df).data
        expected = df.drop('date', axis=1)
        tm.assert_frame_equal(result, expected)
Example #2
    def test_make_forecasting_frame_list(self):
        df, y = dataframe_functions.make_forecasting_frame(x=range(4), kind="test", max_timeshift=1, rolling_direction=1)
        expected_df = pd.DataFrame({"id": [1, 2, 3], "kind": ["test"]*3, "value": [0., 1., 2.], "time": [0., 1., 2.]})

        expected_y = pd.Series(data=[1, 2, 3], index=[1, 2, 3], name="value")
        assert_frame_equal(df.sort_index(axis=1), expected_df.sort_index(axis=1))
        assert_series_equal(y, expected_y)
Example #3
    def test_establish_variables_from_mix(self):

        p = lm._LinearPlotter()
        p.establish_variables(self.df, x="x", y=self.df.y)
        pdt.assert_series_equal(p.x, self.df.x)
        pdt.assert_series_equal(p.y, self.df.y)
        pdt.assert_frame_equal(p.data, self.df)
Example #4
    def test_variables_from_mix(self):

        p = lm._RegressionPlotter("x", self.df.y + 1, data=self.df)

        npt.assert_array_equal(p.x, self.df.x)
        npt.assert_array_equal(p.y, self.df.y + 1)
        pdt.assert_frame_equal(p.data, self.df)
Example #5
    def test_download_insert_hist_data(self):
        async def run(loop, req, broker, insert_limit):
            engine = await aiosa.create_engine(
                user=self.db_info['user'], db=self.db_info['db'],
                host=self.db_info['host'], password=self.db_info['password'],
                loop=loop)
            # Download, Insert and Query
            dl_blk = await download_insert_hist_data(
                req, broker, engine, insert_limit)
            db_blk = await query_hist_data(
                engine, req.SecType, req.Symbol, req.DataType, req.BarSize,
                *insert_limit)
            engine.close()
            await engine.wait_closed()
            return dl_blk, db_blk

        # Execute
        self._clear_db()
        init_db(self.db_info)
        req = testdata_download_insert_hist_data['req']
        broker, login = testdata_download_insert_hist_data['broker']
        insert_limit = testdata_download_insert_hist_data['insert_limit']
        broker.connect(*login)
        loop = asyncio.get_event_loop()
        dl_blk, db_blk = loop.run_until_complete(
            run(loop, req, broker, insert_limit))
        broker_blk = broker.req_hist_data(req)[0]

        # Verify
        lim0 = insert_limit[0]
        lim1 = insert_limit[1]
        assert_frame_equal(dl_blk.df, broker_blk.df)
        assert_frame_equal(db_blk.df,
                           broker_blk.df.loc(axis=0)[:, :, :, lim0:lim1])
Example #6
    def test_get_hist_data(self):
        async def run(loop, req, blk_db, broker):
            # Populate database
            engine = await aiosa.create_engine(
                user=self.db_info['user'], db=self.db_info['db'],
                host=self.db_info['host'], password=self.db_info['password'],
                loop=loop, echo=False)
            await insert_hist_data(engine, 'Stock', blk_db)
            engine.close()
            await engine.wait_closed()
            # Get hist data
            blk_db = await get_hist_data(
                req, broker, mysql={**self.db_info, 'loop': loop})
            return blk_db

        from time import sleep
        for data in testdata_get_hist_data:
            sleep(1.5)  # Avoid IB pacing violation
            _logger.debug("\n======= get_hist_data_async: %s ======\n",
                          data['testcase'])
            self._clear_db()
            init_db(self.db_info)
            blk_db = MarketDataBlock(data['df_db'])
            broker = data['broker'][0](*data['broker'][1])
            blk_exp = MarketDataBlock(data['blk_exp.df'])
            blk_exp.tz = data['xchg_tz']
            loop = asyncio.get_event_loop()
            blk_ret = loop.run_until_complete(
                run(loop, data['req'], blk_db, broker))
            assert_frame_equal(blk_ret.df, blk_exp.df)
Example #7
def assert_frame_not_equal(df1, df2, **kwargs):
    # assert_frame_equal exists, but we need the ability to assert that frames are not equal
    try:
        assert_frame_equal(df1, df2, **kwargs)
    except AssertionError:
        # The frames differ, which is what this helper asserts.
        pass
    else:
        # Raised outside the try block so it is not swallowed by the except clause.
        raise AssertionError('DataFrames are equal.')
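
A quick usage sketch of the helper above (the frames are illustrative):

import pandas as pd

df_a = pd.DataFrame({'x': [1, 2, 3]})
df_b = pd.DataFrame({'x': [1, 2, 4]})

assert_frame_not_equal(df_a, df_b)           # passes: the last rows differ
# assert_frame_not_equal(df_a, df_a.copy())  # would raise: frames are equal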
Example #8
    def test_two_iterations_with_metadata_were_values_are_unique(self):
        # This should be identical to test_without_metadata_df_two_iterations,
        # with just the `sample-id` replaced with `pet`.
        columns = pd.MultiIndex.from_product([[1, 200], [1, 2]],
                                             names=['depth', 'iter'])
        data = pd.DataFrame(data=[[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
                            columns=columns, index=['russ', 'milo', 'pea'])

        counts = pd.DataFrame(data=[[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]],
                              columns=columns, index=['russ', 'milo', 'pea'])

        obs = _compute_summary(data, 'pet', counts=counts)

        d = [
            ['russ', 1,   1., 1.02, 1.09, 1.25, 1.5, 1.75, 1.91, 1.98, 2., 1],
            ['russ', 200, 3., 3.02, 3.09, 3.25, 3.5, 3.75, 3.91, 3.98, 4., 1],
            ['milo', 1,   1., 1.02, 1.09, 1.25, 1.5, 1.75, 1.91, 1.98, 2., 1],
            ['milo', 200, 3., 3.02, 3.09, 3.25, 3.5, 3.75, 3.91, 3.98, 4., 1],
            ['pea', 1,    1., 1.02, 1.09, 1.25, 1.5, 1.75, 1.91, 1.98, 2., 1],
            ['pea', 200,  3., 3.02, 3.09, 3.25, 3.5, 3.75, 3.91, 3.98, 4., 1],
        ]
        exp = pd.DataFrame(data=d, columns=['pet', 'depth', 'min', '2%', '9%',
                                            '25%', '50%', '75%', '91%', '98%',
                                            'max', 'count'])
        pdt.assert_frame_equal(exp, obs)
Example #9
def test_mnl_estimation(obs, alts):
    """
    Confirm that estimated params from the new interface match urbansim.urbanchoice.
    Only runs if the urbansim package has been installed.
    
    """
    try:
        from urbansim.urbanchoice.mnl import mnl_estimate
    except ImportError:
        print("Comparison of MNL estimation results skipped because urbansim is not installed")
        return

    model_expression = 'obsval + altval - 1'
    mct = MergedChoiceTable(obs, alts, 'choice')
    
    # new interface
    m = MultinomialLogit(mct, model_expression)
    r = m.fit().get_raw_results()
    
    # old interface
    dm = dmatrix(model_expression, mct.to_frame())
    chosen = np.reshape(mct.to_frame()[mct.choice_col].values, (100, 5))
    log_lik, fit = mnl_estimate(np.array(dm), chosen, numalts=5)
    
    for k,v in log_lik.items():
        assert(v == pytest.approx(r['log_likelihood'][k], 0.00001))
    
    assert_frame_equal(fit, r['fit_parameters'][['Coefficient', 'Std. Error', 'T-Score']])
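
A hedged aside: assuming pytest is the runner, pytest.importorskip gives the same guard but reports a skip instead of a silent pass when urbansim is missing:

import pytest

def test_mnl_estimation_or_skip(obs, alts):
    # Reported as skipped (with a reason) when urbansim is not installed.
    mnl = pytest.importorskip("urbansim.urbanchoice.mnl")
    # ... continue with mnl.mnl_estimate(...) as in the test above.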
Example #10
    def test_create_lineages(self):

        df_with_lins = clustering.df_add_lineages(self.df_mult_groups, 0.85)

        expected = self.df_mult_groups.reset_index(drop=True)
        expected['lineage'] = [0, 0, 1, 2, 3]

        assert_frame_equal(df_with_lins, expected)
Example #11
 def test_init(self):
     sorted_feature_names = ["is_dutch", "is_english", "value_number"]
     self.assertEqual(
         sorted(self.frame.features.keys()),
         sorted_feature_names
     )
     self.assertTrue(callable(self.frame.content))
     assert_frame_equal(self.frame.data, self.test_frame, check_like=True)
Example #12
    def test_variables_from_frame(self):

        p = lm._RegressionPlotter("x", "y", data=self.df, units="s")

        pdt.assert_series_equal(p.x, self.df.x)
        pdt.assert_series_equal(p.y, self.df.y)
        pdt.assert_series_equal(p.units, self.df.s)
        pdt.assert_frame_equal(p.data, self.df)
Example #13
    def test_munge_metadata_ids_different_order(self):
        md = qiime2.CategoricalMetadataColumn(
            pd.Series(['russ', 'milo', 'russ'], name='pet',
                      index=pd.Index(['S2', 'S1', 'S3'], name='id')))
        obs = _munge_metadata(md, self.table, 'both')

        exp_idx = pd.Index(['milo | S1', 'russ | S2', 'russ | S3'],
                           name='pet | id')
        exp = pd.DataFrame([[0, 10], [10, 12], [10, 11]], columns=['O1', 'O2'],
                           index=exp_idx)
        assert_frame_equal(exp, obs)
Example #14
    def test_munge_metadata_empty_values(self):
        md = qiime2.CategoricalMetadataColumn(
            pd.Series([None, 'russ', np.nan], name='pet',
                      index=pd.Index(['S1', 'S2', 'S3'], name='id')))
        obs = _munge_metadata(md, self.table, 'both')

        exp_idx = pd.Index(['[No Value] | S1', 'russ | S2', '[No Value] | S3'],
                           name='pet | id')
        exp = pd.DataFrame([[0, 10], [10, 12], [10, 11]], columns=['O1', 'O2'],
                           index=exp_idx)
        assert_frame_equal(exp, obs)
Example #15
    def test_munge_metadata_sort_samples(self):
        md = qiime2.CategoricalMetadataColumn(
            pd.Series(['peanut', 'milo', 'russ'], name='pet',
                      index=pd.Index(['S1', 'S2', 'S3'], name='id')))
        obs = _munge_metadata(md, self.table, 'features')

        exp_idx = pd.Index(['milo | S2', 'peanut | S1', 'russ | S3'],
                           name='pet | id')
        exp = pd.DataFrame([[10, 12], [0, 10], [10, 11]], columns=['O1', 'O2'],
                           index=exp_idx)
        assert_frame_equal(exp, obs)
Example #16
 def test_b64(self):
     """Test the binary encoding"""
     if self.should_skip:
         return self.skip('pandas is not importable')
     # array of substantial size is stored as b64
     a = np.random.rand(20, 10)
     index = ['Row' + str(i) for i in range(1, a.shape[0] + 1)]
     columns = ['Col' + str(i) for i in range(1, a.shape[1] + 1)]
     df = pd.DataFrame(a, index=index, columns=columns)
     decoded_df = self.roundtrip(df)
     assert_frame_equal(decoded_df, df)
Example #17
    def test_TableFormula_sort(self):
        fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")

        table = TableFormula()
        table["A"] = [0, 1]
        table.add_column_vector("B", [6, 7])
        table.sort(lambda row: -row["B"])
        exp = pandas.DataFrame(dict(A=[1, 0], B=[7, 6], C=[1, 0]))
        exp = exp.set_index("C")
        exp.index.rename(None, inplace=True)
        assert_frame_equal(table, exp, check_index_type=False)
Example #18
def test_run_ccc_example_output(file_name):
    '''
    Tests the script in ../../run_examples/run_ccc_example.py to
    ensure that it produces the expected results that are checked into
    the repo.
    '''
    run_example_path = os.path.join(CUR_PATH, '..', '..', 'run_examples')
    test_path = os.path.join(run_example_path, file_name + '.csv')
    test_df = pd.read_csv(test_path)
    expected_path = os.path.join(run_example_path, file_name + '_expected.csv')
    expected_df = pd.read_csv(expected_path)
    assert_frame_equal(test_df, expected_df)
Example #19
def test_get_country():
    # call the function
    df = country.get_country(interim_data, 'Chile')

    # load my previous dataset
    base = pd.read_csv(processed_data)

    # check if I am getting a dataframe
    assert isinstance(df, pd.DataFrame)
    assert isinstance(base, pd.DataFrame)

    # check that they are the same dataframes
    pdt.assert_frame_equal(df, base)
Example #20
    def test_TableFormula_add(self):
        fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")

        table = TableFormula()
        table["A"] = [0, 1]
        table.add_column_index([4, 5])
        table.add_column_vector("B", [6, 7])
        table.addc("C", lambda row: row["A"] * row["B"])
        exp = pandas.DataFrame(
            dict(A=[0, 1], B=[6, 7], C=[0, 7], __key__=[4, 5]))
        exp.set_index("__key__", inplace=True)
        exp.index.rename(None, inplace=True)
        assert_frame_equal(table, exp)
Example #21
    def test_json(self):
            ss = self.station.statistics
            ss.calc_temperature_stats()
            ss.calc_precipitation_stats()
            ss.calc_humidity_stats()
            ss.calc_radiation_stats()
            ss.calc_wind_stats()

            with tempfile.NamedTemporaryFile() as tmp:
                ss.to_json(tmp.name)
                tmp.seek(0)
                ss2 = melodist.StationStatistics.from_json(tmp.name)

            assert_series_equal(ss.temp.max_delta, ss2.temp.max_delta)
            assert_frame_equal(ss.temp.mean_course, ss2.temp.mean_course)

            assert_equal(ss.precip.months, ss2.precip.months)
            assert all([cs1 == cs2 for cs1, cs2 in zip(ss.precip.stats, ss2.precip.stats)])

            assert ss.hum.a0 == ss2.hum.a0
            assert ss.hum.a1 == ss2.hum.a1
            assert ss.hum.kr == ss2.hum.kr
            assert_series_equal(ss.hum.month_hour_precip_mean, ss2.hum.month_hour_precip_mean)

            assert_frame_equal(ss.glob.angstroem, ss2.glob.angstroem)
            assert_frame_equal(ss.glob.bristcamp, ss2.glob.bristcamp)
            assert_frame_equal(ss.glob.mean_course, ss2.glob.mean_course)

            assert ss.wind.a == ss2.wind.a
            assert ss.wind.b == ss2.wind.b
            assert ss.wind.t_shift == ss2.wind.t_shift
Example #22
    def test_adding_content_mixed(self):
        self.skipTest("Bug: GH-109")
        old = list(self.get_iterator())[-2:]

        def update(ind):
            ind.properties["value"] = int(ind.properties["value"]) * 5
            return ind

        updated = list(map(update, old))
        self.frame.load_content(
            lambda: iter(list(self.get_extra_iterator()) + updated)
        )
        self.test_frame_extra["value_number"].loc[[7, 8]] *= 5
        assert_frame_equal(self.frame.data, self.test_frame_extra, check_like=True)
Example #23
    def test_from_columns(self):
        tsn = "TEST_TIME_SERIES"

        fset = ComprehensiveFCParameters()
        self.assertRaises(TypeError, from_columns, 42)
        self.assertRaises(ValueError, from_columns, ["This is not a column name"])
        self.assertRaises(ValueError, from_columns, ["This__neither"])
        self.assertRaises(ValueError, from_columns, ["This__also__not"])

        # Aggregate functions
        feature_names = [tsn + '__sum_values', tsn + "__median", tsn + "__length", tsn + "__sample_entropy"]

        # Aggregate functions with params
        feature_names += [tsn + '__quantile__q_10', tsn + '__quantile__q_70', tsn + '__number_peaks__n_30',
                          tsn + '__value_count__value_inf', tsn + '__value_count__value_-inf',
                          tsn + '__value_count__value_nan']

        # Apply functions
        feature_names += [tsn + '__ar_coefficient__k_20__coeff_4', tsn + '__ar_coefficient__coeff_10__k_-1']

        kind_to_fc_parameters = from_columns(feature_names)

        six.assertCountEqual(self, list(kind_to_fc_parameters[tsn].keys()),
                             ["sum_values", "median", "length", "sample_entropy", "quantile", "number_peaks",
                              "ar_coefficient", "value_count"])

        self.assertEqual(kind_to_fc_parameters[tsn]["sum_values"], None)
        self.assertEqual(kind_to_fc_parameters[tsn]["ar_coefficient"],
                         [{"k": 20, "coeff": 4}, {"k": -1, "coeff": 10}])

        self.assertEqual(kind_to_fc_parameters[tsn]["value_count"],
                         [{"value": np.PINF}, {"value": np.NINF}, {"value": np.NaN}])

        # test that it passes for all functions
        fset = ComprehensiveFCParameters()
        X_org = extract_features(pd.DataFrame({"value": [1, 2, 3], "id": [1, 1, 1]}),
                                 default_fc_parameters=fset,
                                 column_id="id", column_value="value",
                                 n_jobs=0)

        inferred_fset = from_columns(X_org)

        X_new = extract_features(pd.DataFrame({"value": [1, 2, 3], "id": [1, 1, 1]}),
                                 kind_to_fc_parameters=inferred_fset,
                                 column_id="id", column_value="value",
                                 n_jobs=0)

        assert_frame_equal(X_org.sort_index(), X_new.sort_index())
Example #24
    def check_load_cached_dataset(name):
        # Test the cacheing using a temporary file.
        # With Python 3.2+, we could use the tempfile.TemporaryDirectory()
        # context manager instead of this try...finally statement
        tmpdir = tempfile.mkdtemp()
        try:
            # download and cache
            ds = load_dataset(name, cache=True, data_home=tmpdir)

            # use cached version
            ds2 = load_dataset(name, cache=True, data_home=tmpdir)
            pdt.assert_frame_equal(ds, ds2)

        finally:
            shutil.rmtree(tmpdir)
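
For reference, a sketch of the tempfile.TemporaryDirectory() variant suggested in the comment above, reusing the same load_dataset and pdt from this example:

import tempfile

def check_load_cached_dataset_ctx(name):
    # The context manager removes the directory on exit, replacing try...finally.
    with tempfile.TemporaryDirectory() as tmpdir:
        ds = load_dataset(name, cache=True, data_home=tmpdir)   # download and cache
        ds2 = load_dataset(name, cache=True, data_home=tmpdir)  # use cached version
        pdt.assert_frame_equal(ds, ds2)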
Example #25
    def test_make_forecasting_frame_pdSeries(self):

        t_index = pd.date_range('1/1/2011', periods=4, freq='H')
        df, y = dataframe_functions.make_forecasting_frame(x=pd.Series(data=range(4), index=t_index),
                                                           kind="test", max_timeshift=1, rolling_direction=1)

        expected_y = pd.Series(data=[1, 2, 3], index=pd.DatetimeIndex(["2011-01-01 01:00:00", "2011-01-01 02:00:00",
                                                                       "2011-01-01 03:00:00"]), name="value")
        expected_df = pd.DataFrame({"id": pd.DatetimeIndex(["2011-01-01 01:00:00", "2011-01-01 02:00:00",
                                                            "2011-01-01 03:00:00"]),
                                    "kind": ["test"]*3, "value": [0., 1., 2.],
                                    "time": pd.DatetimeIndex(["2011-01-01 00:00:00", "2011-01-01 01:00:00",
                                                              "2011-01-01 02:00:00"])
                                    })
        assert_frame_equal(df.sort_index(axis=1), expected_df.sort_index(axis=1))
        assert_series_equal(y, expected_y)
Example #26
 def test_market_data_block_merge(self):
     testdata = testdata_market_data_block_merge
     blk = MarketDataBlock(pd.DataFrame(testdata[0]),
                           datatype='TRADES', tz='US/Pacific')
     _logger.info('\n\nBlockTests:merge: Starting blk:\n%s', blk.df)
     for data in testdata[1:]:
         blk.update(pd.DataFrame(data[0]),
                    datatype='TRADES', tz='US/Pacific')
         blk_direct = MarketDataBlock(
             pd.DataFrame(data[1]), datatype='TRADES', tz='US/Pacific')
         _logger.debug('\n\nBlockTests:merge: blk.df\n%s', blk.df[:3])
         _logger.debug('\n\nBlockTests:merge: blk_direct.df\n%s',
                       blk_direct.df[:3])
         assert_frame_equal(blk.df, blk_direct.df)
         self.assertEqual(list(blk.df.index.names),
                          blk.__class__.data_index)
         self.assertEqual(list(blk_direct.df.index.names),
                          blk.__class__.data_index)
Example #27
 def test_dataframe_roundtrip(self):
     if self.should_skip:
         return self.skip('pandas is not importable')
     df = pd.DataFrame({
         'an_int': np.int_([1, 2, 3]),
         'a_float': np.float_([2.5, 3.5, 4.5]),
         'a_nan': np.array([np.nan] * 3),
         'a_minus_inf': np.array([-np.inf] * 3),
         'an_inf': np.array([np.inf] * 3),
         'a_str': np.str_('foo'),
         'a_unicode': np.unicode_('bar'),
         'date': np.array([np.datetime64('2014-01-01')] * 3),
         'complex': np.complex_([1 - 2j, 2 - 1.2j, 3 - 1.3j]),
         # TODO: the following dtypes are not currently supported.
         # 'object': np.object_([{'a': 'b'}]*3),
     })
     decoded_df = self.roundtrip(df)
     assert_frame_equal(decoded_df, df)
Example #28
 def test_resetting_features_no_content(self):
     features = [
         TestNumericFeaturesFrame.is_dutch
     ]
     frame = NumericFeaturesFrame(
         TestNumericFeaturesFrame.get_identifier,
         features
     )
     frame.reset(features=[
         TestNumericFeaturesFrame.value_number,
         TestNumericFeaturesFrame.is_english
     ])
     self.test_frame = self.test_frame.drop(labels="is_dutch", axis=1)
     assert_frame_equal(frame.data, self.test_frame[0:0], check_like=True)
     sorted_feature_names = ["is_english", "value_number"]
     self.assertEqual(
         sorted(frame.features.keys()),
         sorted_feature_names
     )
Example #29
    def test_two_iterations_with_metadata_were_values_are_identical(self):
        columns = pd.MultiIndex.from_product([[1, 200], [1, 2]],
                                             names=['depth', 'iter'])
        data = pd.DataFrame(data=[[3, 6, 9, 9]], columns=columns,
                            index=['milo'])

        counts = pd.DataFrame(data=[[3, 3, 3, 3]], columns=columns,
                              index=['milo'])

        obs = _compute_summary(data, 'pet', counts=counts)

        d = [
            ['milo', 1,   3., 3.06, 3.27, 3.75, 4.5,  5.25, 5.73, 5.94, 6., 3],
            ['milo', 200, 9.,   9.,   9.,   9.,  9.,    9.,   9.,   9., 9., 3],
        ]
        exp = pd.DataFrame(data=d, columns=['pet', 'depth', 'min', '2%', '9%',
                                            '25%', '50%', '75%', '91%', '98%',
                                            'max', 'count'])
        pdt.assert_frame_equal(exp, obs)
Example #30
 def test_adding_features(self):
     features = [
         TestNumericFeaturesFrame.is_dutch
     ]
     frame = NumericFeaturesFrame(
         TestNumericFeaturesFrame.get_identifier,
         features,
         self.get_iterator
     )
     frame.load_features([
         TestNumericFeaturesFrame.value_number,
         TestNumericFeaturesFrame.is_english
     ])
     assert_frame_equal(frame.data, self.test_frame, check_like=True)
     sorted_feature_names = ["is_dutch", "is_english", "value_number"]
     self.assertEqual(
         sorted(frame.features.keys()),
         sorted_feature_names
     )
Example #31
	def test_test_data(self):
		pd_testing.assert_frame_equal(self.exercises.test_data, self.test_data)
Example #32
def assert_geodataframe_equal(
    left,
    right,
    check_dtype=True,
    check_index_type="equiv",
    check_column_type="equiv",
    check_frame_type=True,
    check_like=False,
    check_less_precise=False,
    check_geom_type=False,
    check_crs=True,
):
    """
    Check that two GeoDataFrames are equal.

    Parameters
    ----------
    left, right : two GeoDataFrames
    check_dtype : bool, default True
        Whether to check the DataFrame dtype is identical.
    check_index_type, check_column_type : bool, default 'equiv'
        Check that index types are equal.
    check_frame_type : bool, default True
        Check that both are same type (*and* are GeoDataFrames). If False,
        will attempt to convert both into GeoDataFrame.
    check_like : bool, default False
        If True, ignore the order of rows and columns.
    check_less_precise : bool, default False
        If True, use geom_almost_equals; if False, use geom_equals.
    check_geom_type : bool, default False
        If True, check that all the geom types are equal.
    check_crs: bool, default True
        If `check_frame_type` is True, then also check that the
        crs matches.
    """
    try:
        # added from pandas 0.20
        from pandas.testing import assert_frame_equal, assert_index_equal
    except ImportError:
        from pandas.util.testing import assert_frame_equal, assert_index_equal

    # instance validation
    if check_frame_type:
        assert isinstance(left, GeoDataFrame)
        assert isinstance(left, type(right))

        if check_crs:
            # "no crs" can be represented as either None or {}
            if not left.crs and not right.crs:
                pass
            else:
                assert left.crs == right.crs
    else:
        if not isinstance(left, GeoDataFrame):
            left = GeoDataFrame(left)
        if not isinstance(right, GeoDataFrame):
            right = GeoDataFrame(right)

    # shape comparison
    assert left.shape == right.shape, (
        "GeoDataFrame shape mismatch, left: {lshape!r}, right: {rshape!r}.\n"
        "Left columns: {lcols!r}, right columns: {rcols!r}".format(
            lshape=left.shape,
            rshape=right.shape,
            lcols=left.columns,
            rcols=right.columns,
        ))

    if check_like:
        left, right = left.reindex_like(right), right

    # column comparison
    assert_index_equal(left.columns,
                       right.columns,
                       exact=check_column_type,
                       obj="GeoDataFrame.columns")

    # geometry comparison
    assert_geoseries_equal(
        left.geometry,
        right.geometry,
        check_dtype=check_dtype,
        check_less_precise=check_less_precise,
        check_geom_type=check_geom_type,
        check_crs=False,
    )

    # drop geometries and check remaining columns
    left2 = left.drop([left._geometry_column_name], axis=1)
    right2 = right.drop([right._geometry_column_name], axis=1)
    assert_frame_equal(
        left2,
        right2,
        check_dtype=check_dtype,
        check_index_type=check_index_type,
        check_column_type=check_column_type,
        obj="GeoDataFrame",
    )
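
A minimal usage sketch for the helper above, assuming geopandas and shapely are installed (the data is illustrative):

import geopandas
from shapely.geometry import Point

gdf1 = geopandas.GeoDataFrame(
    {'value': [1, 2]},
    geometry=[Point(0, 0), Point(1, 1)],
    crs='EPSG:4326',
)
gdf2 = gdf1.iloc[::-1]  # same content, rows reversed

assert_geodataframe_equal(gdf1, gdf1.copy())
assert_geodataframe_equal(gdf1, gdf2, check_like=True)  # ignore row order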
Example #33
def test_join_project_left_table(how, left, right, df1, df2):
    expr = left.join(right, left.key == right.key, how=how)[left, right.key3]
    result = expr.execute()
    expected = pd.merge(df1, df2, how=how,
                        on='key')[list(left.columns) + ['key3']]
    tm.assert_frame_equal(result[expected.columns], expected)
Example #34
def test_asof_join(time_left, time_right, time_df1, time_df2):
    expr = time_left.asof_join(time_right, 'time')[time_left,
                                                   time_right.other_value]
    result = expr.execute()
    expected = pd.merge_asof(time_df1, time_df2, on='time')
    tm.assert_frame_equal(result[expected.columns], expected)
Example #35
 def test_df_trans_acc_disp(self):
     pd_testing.assert_frame_equal(self.exercises.df_trans_acc_disp,
                                   self.df_trans_acc_disp)
Example #36
 def test_df_merged(self):
     pd_testing.assert_frame_equal(self.exercises.df_merged, self.df_merged)
Example #37
 def test_df(self):
     pd_testing.assert_frame_equal(self.exercises.bankData,
                                   self.bankData,
                                   check_dtype=False)
Example #38
 def test_pickle_method(self):
     filename = os.path.join(self.tempdir, "df.pkl")
     self.df.to_pickle(filename)
     unpickled = pd.read_pickle(filename)
     assert_frame_equal(self.df, unpickled)
     assert self.df.crs == unpickled.crs
Example #39
 def test_to_df_types(self, column_type, values, series):
     data = [(v, ) for v in values]
     results = QueryResults(["col"], [column_type], data)
     assert_frame_equal(results.to_df(),
                        pd.DataFrame({"col": series}),
                        check_column_type="exact")
Example #40
 def test_save_and_load(self, versioned_csv_data_set, dummy_dataframe):
     """Test that saved and reloaded data matches the original one for
     the versioned data set."""
     versioned_csv_data_set.save(dummy_dataframe)
     reloaded_df = versioned_csv_data_set.load()
     assert_frame_equal(dummy_dataframe, reloaded_df)
Example #41
#!/usr/bin/env python
# coding: utf-8

# In[58]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

get_ipython().run_line_magic('matplotlib', 'inline')

# In[123]:

movie = pd.read_csv('data/movie.csv', index_col='movie_title')
c1 = movie['title_year'] >= 2010
c2 = movie['title_year'].isnull()
criteria = c1 | c2
movie_mask = movie.mask(criteria).dropna(how='all')
movie_boolean = movie[movie['title_year'] < 2010]
from pandas.testing import assert_frame_equal
assert_frame_equal(movie_boolean, movie_mask, check_dtype=False)

# In[124]:

get_ipython().run_line_magic('timeit',
                             "movie.mask(criteria).dropna(how='all')")

Example #42
def test_gender_job():
    row = dfp.RowTransformer(columns=['Gender', 'Job'],
                             drop_values=['p', 'N/A'])
    out = row.fit_transform(df)
    assert_frame_equal(out, drop_gender_job_df)
Example #43
def test_create_or_update_move_stop_by_dist_time():
    move_df = MoveDataFrame(
        data=list_data,
        latitude=LATITUDE,
        longitude=LONGITUDE,
        datetime=DATETIME,
        traj_id=TRAJ_ID,
    )
    cols = [
        'segment_stop',
        'id',
        'lat',
        'lon',
        'datetime',
        'dist_to_prev',
        'time_to_prev',
        'speed_to_prev',
        'stop',
    ]

    stay_point_detection.create_or_update_move_stop_by_dist_time(
        move_df, dist_radius=3.5, time_radius=0.5, inplace=True)
    expected = DataFrame(
        data=[
            [
                1,
                1,
                39.984094,
                116.319236,
                Timestamp('2008-10-23 05:53:05'),
                nan,
                nan,
                nan,
                False,
            ],
            [
                2,
                1,
                39.984198,
                116.319322,
                Timestamp('2008-10-23 05:53:06'),
                nan,
                nan,
                nan,
                False,
            ],
            [
                3,
                2,
                39.984224,
                116.319402,
                Timestamp('2008-10-23 05:53:11'),
                nan,
                nan,
                nan,
                True,
            ],
            [
                3,
                2,
                39.984224,
                116.319402,
                Timestamp('2008-10-23 05:53:15'),
                0.0,
                4.0,
                0.0,
                True,
            ],
        ],
        columns=cols,
        index=[0, 1, 2, 3],
    )
    print(move_df)
    assert_frame_equal(move_df, expected)
Example #44
	def test_renamedBostonData(self):
		pd_testing.assert_frame_equal(self.exercises.renamedBostonData, self.renamedBostonData)
Example #45
 def test_df(self):
     pd_testing.assert_frame_equal(self.exercises.df, self.df)
Example #46
 def test_ndarray_input(self):
     cg = mat.ClusterGrid(self.x_norm, **self.default_kws)
     pdt.assert_frame_equal(cg.data, pd.DataFrame(self.x_norm))
     assert len(cg.fig.axes) == 4
     assert cg.ax_row_colors is None
     assert cg.ax_col_colors is None
Example #47
def test_update_depr_methods(monkeypatch):
    '''
    Test of calcfunctions.update_depr_methods
    '''
    p = Specification()
    json_str = """
        {"schema": {
            "labels": {
                "asset_name": {"type": "str"},
                "BEA_code": {"type": "str"},
                "minor_asset_group": {"type": "str"},
                "major_asset_group": {"type": "str"},
                "ADS_life": {"type": "float"},
                "GDS_life": {"type": "float"},
                "system": {"type": "str"},
                "year": {
                    "type": "int",
                    "validators": {"range": {"min": 2013, "max": 2030}}
                }
            }
        },
        "asset": {
            "title": "Tax depreciation rules for assets",
            "description": "Tax depreciation rules for assets",
            "type": "depreciation_rules",
            "value": [
                  {
                      "ADS_life": 10.0,
                      "BEA_code": "1",
                      "GDS_life": 10.0,
                      "asset_name": "Steam engines",
                      "major_asset_group": "Group1",
                      "minor_asset_group": "Group1",
                      "system": "GDS",
                      "year": 2020, "value": {"life": 10,
                                              "method": "DB 200%"}
                  },
                  {
                      "ADS_life": 10.0,
                      "BEA_code": "2",
                      "GDS_life": 10.0,
                      "asset_name": "Custom software",
                      "major_asset_group": "Group1",
                      "minor_asset_group": "Group1",
                      "system": "GDS",
                      "year": 2020, "value": {"life": 10,
                                              "method": "DB 150%"}
                  },
                  {
                      "ADS_life": 3.0,
                      "BEA_code": "3",
                      "GDS_life": 3.0,
                      "asset_name": "Other furniture",
                      "major_asset_group": "Group1",
                      "minor_asset_group": "Group1",
                      "system": "GDS",
                      "year": 2020, "value": {"life": 3,
                                              "method": "SL"}
                  },
                  {
                      "ADS_life": 15.0,
                      "BEA_code": "4",
                      "GDS_life": 15.0,
                      "asset_name": "Mining and oilfield machinery",
                      "major_asset_group": "Group1",
                      "minor_asset_group": "Group1",
                      "system": "GDS",
                      "year": 2020, "value": {"life": 15,
                                              "method": "Economic"}
                  },
                  {
                      "ADS_life": 27.5,
                      "BEA_code": "5",
                      "GDS_life": 27.5,
                      "asset_name": "Expensing",
                      "major_asset_group": "Group1",
                      "minor_asset_group": "Group1",
                      "system": "GDS",
                      "year": 2020, "value": {"life": 27.5,
                                              "method": "Expensing"}
                  },
                  {
                      "ADS_life": 27.5,
                      "BEA_code": "6",
                      "GDS_life": 27.5,
                      "asset_name": "PCs",
                      "major_asset_group": "Group1",
                      "minor_asset_group": "Group1",
                      "system": "GDS",
                      "year": 2020, "value": {"life": 27.5,
                                              "method": "DB 200%"}
                  },
                  {
                      "ADS_life": 10.0,
                      "BEA_code": "7",
                      "GDS_life": 10.0,
                      "asset_name": "Terminals",
                      "major_asset_group": "Group1",
                      "minor_asset_group": "Group1",
                      "system": "GDS",
                      "year": 2020, "value": {"life": 10,
                                              "method": "DB 150%"}
                  },
                  {
                      "ADS_life": 3.0,
                      "BEA_code": "8",
                      "GDS_life": 3.0,
                      "asset_name": "Manufacturing",
                      "major_asset_group": "Group1",
                      "minor_asset_group": "Group1",
                      "system": "GDS",
                      "year": 2020, "value": {"life": 3,
                                              "method": "SL"}
                  },
                  {
                      "ADS_life": 15.0,
                      "BEA_code": "9",
                      "GDS_life": 15.0,
                      "asset_name": "Wind and solar",
                      "major_asset_group": "Group1",
                      "minor_asset_group": "Group1",
                      "system": "GDS",
                      "year": 2020, "value": {"life": 15,
                                              "method": "Economic"}
                  },
                  {
                      "ADS_life": 7.0,
                      "BEA_code": "10",
                      "GDS_life": 7.0,
                      "asset_name": "Equipment",
                      "major_asset_group": "Group1",
                      "minor_asset_group": "Group1",
                      "system": "GDS",
                      "year": 2020, "value": {"life": 7,
                                              "method": "Expensing"}
                  }]
            }
        }
        """
    monkeypatch.setattr(DepreciationParams, "defaults", json_str)
    dp = DepreciationParams()
    asset_df = pd.DataFrame.from_dict({
        'bea_asset_code': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
    })
    expected_df = pd.DataFrame(dp.asset)
    expected_df = pd.concat([
        expected_df.drop(['value'], axis=1), expected_df['value'].apply(
            pd.Series)
    ],
                            axis=1)
    expected_df.drop(
        columns=['asset_name', 'minor_asset_group', 'major_asset_group'],
        inplace=True)
    expected_df['bea_asset_code'] = pd.Series(
        ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'],
        index=expected_df.index)
    expected_df['bonus'] = pd.Series(
        [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
        index=expected_df.index)
    expected_df['b'] = pd.Series([2, 1.5, 1, 1, 1, 2, 1.5, 1, 1, 1],
                                 index=expected_df.index)
    expected_df['Y'] = pd.Series([10, 10, 3, 15, 27.5, 27.5, 10, 3, 15, 7],
                                 index=expected_df.index)
    print('Expected df =', expected_df)
    test_df = cf.update_depr_methods(asset_df, p, dp)

    assert_frame_equal(test_df, expected_df, check_like=True)
Example #48
 def test_df_input(self):
     cg = mat.ClusterGrid(self.df_norm, **self.default_kws)
     pdt.assert_frame_equal(cg.data, self.df_norm)
Example #49
 def test_df_disp_owner(self):
     pd_testing.assert_frame_equal(self.exercises.df_disp_owner,
                                   self.df_disp_owner)
Example #50
	def test_df(self):
		pd_testing.assert_frame_equal(self.exercises.bankData, self.bankData)
Example #51
def check_cox(rossi, x, stratify_by, formula):
    if stratify_by:
        cph_py = CoxPHFitter(strata=stratify_by)
    else:
        cph_py = CoxPHFitter()

    for col in stratify_by:
        rossi[col] = rossi[col].astype('category')

    cph_py.fit(rossi, duration_col='week', event_col='arrest')
    cph_py.print_summary()
    rossi_h2o = h2o.H2OFrame(rossi)

    for col in stratify_by:
        rossi_h2o[col] = rossi_h2o[col].asfactor()

    cph_h2o = H2OCoxProportionalHazardsEstimator(stop_column="week",
                                                 stratify_by=stratify_by)
    cph_h2o.train(x=x, y="arrest", training_frame=rossi_h2o)

    assert cph_h2o.model_id != ""
    assert cph_h2o.formula() == formula, \
        "Expected formula to be '" + formula + "' but it was " + cph_h2o.formula()

    predH2O = cph_h2o.predict(test_data=rossi_h2o)
    assert len(predH2O) == len(rossi)
    metrics_h2o = cph_h2o.model_performance(rossi_h2o)
    concordance_py = concordance_for_lifelines(cph_py)
    assert abs(concordance_py - metrics_h2o.concordance()) < 0.001
    hazard_h2o = h2o.get_frame(
        cph_h2o._model_json['output']['baseline_hazard']['name'])
    hazard_h2o_as_pandas = hazard_h2o.as_data_frame(use_pandas=True)

    hazard_py = cph_py.baseline_hazard_

    for col_name in hazard_py.columns:
        hazard_py.rename(columns={col_name: str(col_name)}, inplace=True)

    hazard_py_reordered_columns = hazard_py.reset_index(drop=True).sort_index(
        axis=1)
    hazard_h2o_reordered_columns = hazard_h2o_as_pandas.drop(
        't', axis="columns").reset_index(drop=True).sort_index(axis=1)

    hazard_py_reordered_columns = fix_py_result_for_older_lifelines(
        hazard_py_reordered_columns)

    print("h2o:")
    print(hazard_h2o_as_pandas.reset_index(drop=True))

    print("lifelines:")
    print(hazard_py_reordered_columns.reset_index(drop=True))

    assert_frame_equal(hazard_py_reordered_columns,
                       hazard_h2o_reordered_columns,
                       check_dtype=False,
                       check_index_type=False,
                       check_column_type=False)

    survival_h2o = h2o.get_frame(
        cph_h2o._model_json['output']['baseline_survival']['name'])
    survival_h2o_as_pandas = survival_h2o.as_data_frame(use_pandas=True)

    survival_py = cph_py.baseline_survival_

    for col_name in survival_py.columns:
        survival_py.rename(columns={col_name: str(col_name)}, inplace=True)

    survival_py_reordered_columns = survival_py.reset_index(
        drop=True).sort_index(axis=1)
    survival_h2o_reordered_columns = survival_h2o_as_pandas.drop(
        't', axis="columns").reset_index(drop=True).sort_index(axis=1)

    survival_py_reordered_columns = fix_py_result_for_older_lifelines(
        survival_py_reordered_columns)

    print("h2o:")
    print(survival_h2o_as_pandas.reset_index(drop=True))

    print("lifelines:")
    print(survival_py_reordered_columns.reset_index(drop=True))

    assert_frame_equal(survival_py_reordered_columns,
                       survival_h2o_reordered_columns,
                       check_dtype=False,
                       check_index_type=False,
                       check_column_type=False)
Example #52
def test_prepare():
    def prep(csvs, args=[]):
        args = ac.parse_cmdline_args(args)
        return ac.prepare_bartables(ld(csvs), args)

    # one category and totals
    in1 = """
        Tag,Time
        a:x-y,00:01:00
        (unmatched time),00:02:00
        (total time),00:03:00
        """
    out1 = pd.DataFrame(
        {'Time': ['', '', '00:02:00', '00:01:00', '', '00:03:00'],
         'Type': ['text', 'text', 'bar', 'bar', 'text', 'total_bar'],
         'Frac': [None, None, 2/3, 1/3, None, 1],
         'FracAbove': [None, None, 0, 2/3, None, 0],
         'HourFrac': [None, None, 20, 20, None, 20]},
        index=pd.Index(['a', '═', '(unmatched time)', 'x-y', '', '(total time)'], name='Tag'))
    pdt.assert_frame_equal(prep([in1]), out1)

    # same, different totals
    in1_totals = """
        Tag,Time
        a:x-y,00:01:00
        (unmatched),00:02:00
        (screen),00:03:00
        """
    out1_totals = out1.set_index(
        pd.Index(['a', '═', '(unmatched)', 'x-y', '', '(screen)'], name='Tag'))
    pdt.assert_frame_equal(
        prep([in1_totals], args=["--totals-re", "^\\(screen"]),
        out1_totals)

    # same, subtags
    out1_subtags = out1.set_index(
        pd.MultiIndex.from_tuples(
            [('a', ''), ('═', ''), ('(unmatched time)', ''), ('x', 'y'), ('', ''), ('(total time)', '')],
            names=['Tag', 'SubTag']))
    pdt.assert_frame_equal(prep([in1], args=["--subtags"]), out1_subtags)

    # two categories and totals
    in2 = """
        Tag,Time
        b:z,00:01:00
        (unmatched time),00:02:00
        (total time),00:03:00
        """
    blank = pd.DataFrame(
        {'Time': [''], 'Type': ['text'], 'Frac': [None], 'FracAbove': [None], 'HourFrac': [None]},
        index=pd.Index([''], name='Tag'))
    out2 = out1.set_index(
        pd.Index(['b', '═', '(unmatched time)', 'z', '', '(total time)'], name='Tag'))
    pdt.assert_frame_equal(prep([in1, in2]), pd.concat([out1, blank, out2]))

    # three categories, subtags
    in3 = """
        Tag,Time
        c:z,00:01:00
        (unmatched time),00:02:00
        (total time),00:03:00
        """
    out2_subtags = out1.set_index(
        pd.MultiIndex.from_tuples(
            [('b', ''), ('═', ''), ('(unmatched time)', ''), ('z', ''), ('', ''), ('(total time)', '')],
            names=['Tag', 'SubTag']))
    out3_subtags = out1.set_index(
        pd.MultiIndex.from_tuples(
            [('c', ''), ('═', ''), ('(unmatched time)', ''), ('z', ''), ('', ''), ('(total time)', '')],
            names=['Tag', 'SubTag']))
    blank_subtags = blank.set_index(pd.MultiIndex.from_tuples([('', '')], names=['Tag', 'SubTag']))
    pdt.assert_frame_equal(
        prep([in1, in2, in3], args=["--subtags"]),
        pd.concat([out1_subtags, blank_subtags, out2_subtags, blank_subtags, out3_subtags]))
Example #53
def test_diff_data(test_mp):
    """diff() when Scenarios contain the same items, but different data."""
    scen_a = make_dantzig(test_mp)
    scen_b = make_dantzig(test_mp)

    # Modify `scen_a` and `scen_b`
    scen_a.check_out()
    scen_b.check_out()

    # Remove elements from "b"
    drop_args = dict(labels=["value", "unit"], axis=1)
    scen_a.remove_par("b", scen_a.par("b").iloc[0:1, :].drop(**drop_args))
    scen_b.remove_par("b", scen_b.par("b").iloc[1:2, :].drop(**drop_args))
    # Remove elements from "d"
    scen_a.remove_par(
        "d",
        scen_a.par("d").query("i == 'san-diego'").drop(**drop_args))
    # Modify values in "d"
    scen_b.add_par("d",
                   scen_b.par("d").query("i == 'seattle'").assign(value=123.4))

    # Expected results
    exp_b = pd.DataFrame(
        [
            ["chicago", 300.0, "cases", np.NaN, None, "left_only"],
            ["new-york", np.NaN, None, 325.0, "cases", "right_only"],
            ["topeka", 275.0, "cases", 275.0, "cases", "both"],
        ],
        columns="j value_a unit_a value_b unit_b _merge".split(),
    )
    exp_d = pd.DataFrame(
        [
            ["san-diego", "chicago", np.NaN, None, 1.8, "km", "right_only"],
            ["san-diego", "new-york", np.NaN, None, 2.5, "km", "right_only"],
            ["san-diego", "topeka", np.NaN, None, 1.4, "km", "right_only"],
            ["seattle", "chicago", 1.7, "km", 123.4, "km", "both"],
            ["seattle", "new-york", 2.5, "km", 123.4, "km", "both"],
            ["seattle", "topeka", 1.8, "km", 123.4, "km", "both"],
        ],
        columns="i j value_a unit_a value_b unit_b _merge".split(),
    )

    # Use the specific categorical produced by pd.merge()
    merge_cat = pd.CategoricalDtype(["left_only", "right_only", "both"])
    exp_b = exp_b.astype(dict(_merge=merge_cat))
    exp_d = exp_d.astype(dict(_merge=merge_cat))

    # Compare different scenarios without filters
    for name, df in utils.diff(scen_a, scen_b):
        if name == "b":
            pdt.assert_frame_equal(exp_b, df)
        elif name == "d":
            pdt.assert_frame_equal(exp_d, df)

    # Compare different scenarios with filters
    iterator = utils.diff(scen_a, scen_b, filters=dict(j=["chicago"]))
    for name, df in iterator:
        # Same as above, except only the filtered rows should appear
        if name == "b":
            pdt.assert_frame_equal(exp_b.iloc[0:1, :], df)
        elif name == "d":
            pdt.assert_frame_equal(
                exp_d.iloc[[0, 3], :].reset_index(drop=True), df)
Example #54
def test_stream():
    df = pd.DataFrame(data_stream)

    filename = os.path.join(testdir, "health.pdf")
    tables = camelot.read_pdf(filename, flavor="stream")
    assert_frame_equal(df, tables[0].df)
Example #55
 def test_build_base_silva_taxonomy(self):
     input_taxranks = _prep_taxranks(self.taxranks)
     obs_taxonomy = _build_base_silva_taxonomy(self.taxtree,
                                               input_taxranks,
                                               ALLOWED_RANKS,
                                               rank_propagation=True)
     obs_taxonomy.sort_index(inplace=True)
     tid = {'taxid': ['2', '11084', '42913', '42914', '42915',
                      '11089', '24228', '24229', '42916', '42917'],
            'd__': ['Archaea', 'Archaea', 'Archaea', 'Archaea', 'Archaea',
                    'Archaea', 'Archaea', 'Archaea', 'Archaea', 'Archaea'],
            'sk__': ['Archaea', 'Archaea', 'Archaea', 'Archaea', 'Archaea',
                     'Archaea', 'Archaea', 'Archaea', 'Archaea', 'Archaea'],
            'k__': ['Archaea', 'Archaea', 'Archaea', 'Archaea', 'Archaea',
                    'Archaea', 'Archaea', 'Archaea', 'Archaea', 'Archaea'],
            'ks__': ['Archaea', 'Archaea', 'Archaea', 'Archaea', 'Archaea',
                     'Archaea', 'Archaea', 'Archaea', 'Archaea', 'Archaea'],
            'sp__': ['Archaea', 'Archaea', 'Archaea', 'Archaea', 'Archaea',
                     'Archaea', 'Archaea', 'Archaea', 'Archaea', 'Archaea'],
            'p__': ['Archaea', 'Aenigmarchaeota', 'Aenigmarchaeota',
                    'Aenigmarchaeota', 'Aenigmarchaeota', 'Aenigmarchaeota',
                    'Altiarchaeota', 'Altiarchaeota', 'Altiarchaeota',
                    'Altiarchaeota'],
            'ps__': ['Archaea', 'Aenigmarchaeota', 'Aenigmarchaeota',
                     'Aenigmarchaeota', 'Aenigmarchaeota',
                     'Aenigmarchaeota', 'Altiarchaeota', 'Altiarchaeota',
                     'Altiarchaeota', 'Altiarchaeota'],
            'pi__': ['Archaea', 'Aenigmarchaeota', 'Aenigmarchaeota',
                     'Aenigmarchaeota', 'Aenigmarchaeota',
                     'Aenigmarchaeota', 'Altiarchaeota', 'Altiarchaeota',
                     'Altiarchaeota', 'Altiarchaeota'],
            'sc__': ['Archaea', 'Aenigmarchaeota', 'Aenigmarchaeota',
                     'Aenigmarchaeota', 'Aenigmarchaeota',
                     'Aenigmarchaeota', 'Altiarchaeota', 'Altiarchaeota',
                     'Altiarchaeota', 'Altiarchaeota'],
            'c__': ['Archaea', 'Aenigmarchaeota', 'Aenigmarchaeia',
                    'Aenigmarchaeia', 'Aenigmarchaeia',
                    'Deep_Sea_Euryarchaeotic_Group(DSEG)', 'Altiarchaeota',
                    'Altiarchaeia', 'Altiarchaeia', 'Altiarchaeia'],
            'cs__': ['Archaea', 'Aenigmarchaeota', 'Aenigmarchaeia',
                     'Aenigmarchaeia', 'Aenigmarchaeia',
                     'Deep_Sea_Euryarchaeotic_Group(DSEG)', 'Altiarchaeota',
                     'Altiarchaeia', 'Altiarchaeia', 'Altiarchaeia'],
            'ci__': ['Archaea', 'Aenigmarchaeota', 'Aenigmarchaeia',
                     'Aenigmarchaeia', 'Aenigmarchaeia',
                     'Deep_Sea_Euryarchaeotic_Group(DSEG)', 'Altiarchaeota',
                     'Altiarchaeia', 'Altiarchaeia', 'Altiarchaeia'],
            'so__': ['Archaea', 'Aenigmarchaeota', 'Aenigmarchaeia',
                     'Aenigmarchaeia', 'Aenigmarchaeia',
                     'Deep_Sea_Euryarchaeotic_Group(DSEG)', 'Altiarchaeota',
                     'Altiarchaeia', 'Altiarchaeia', 'Altiarchaeia'],
            'o__': ['Archaea', 'Aenigmarchaeota', 'Aenigmarchaeia',
                    'Aenigmarchaeales', 'Aenigmarchaeales',
                    'Deep_Sea_Euryarchaeotic_Group(DSEG)', 'Altiarchaeota',
                    'Altiarchaeia', 'Altiarchaeales', 'Altiarchaeales'],
            'os__': ['Archaea', 'Aenigmarchaeota', 'Aenigmarchaeia',
                     'Aenigmarchaeales', 'Aenigmarchaeales',
                     'Deep_Sea_Euryarchaeotic_Group(DSEG)', 'Altiarchaeota',
                     'Altiarchaeia', 'Altiarchaeales', 'Altiarchaeales'],
            'sf__': ['Archaea', 'Aenigmarchaeota', 'Aenigmarchaeia',
                     'Aenigmarchaeales', 'Aenigmarchaeales',
                     'Deep_Sea_Euryarchaeotic_Group(DSEG)', 'Altiarchaeota',
                     'Altiarchaeia', 'Altiarchaeales', 'Altiarchaeales'],
            'f__': ['Archaea', 'Aenigmarchaeota', 'Aenigmarchaeia',
                    'Aenigmarchaeales', 'Aenigmarchaeales',
                    'Deep_Sea_Euryarchaeotic_Group(DSEG)', 'Altiarchaeota',
                    'Altiarchaeia', 'Altiarchaeales', 'Altiarchaeaceae'],
            'fs__': ['Archaea', 'Aenigmarchaeota', 'Aenigmarchaeia',
                     'Aenigmarchaeales', 'Aenigmarchaeales',
                     'Deep_Sea_Euryarchaeotic_Group(DSEG)', 'Altiarchaeota',
                     'Altiarchaeia', 'Altiarchaeales', 'Altiarchaeaceae'],
            'g__': ['Archaea', 'Aenigmarchaeota', 'Aenigmarchaeia',
                    'Aenigmarchaeales', 'Candidatus_Aenigmarchaeum',
                    'Deep_Sea_Euryarchaeotic_Group(DSEG)', 'Altiarchaeota',
                    'Altiarchaeia', 'Altiarchaeales', 'Altiarchaeaceae']}
     exp_taxonomy = pd.DataFrame(tid)
     exp_taxonomy.set_index('taxid', inplace=True)
     exp_taxonomy.sort_index(inplace=True)
     assert_frame_equal(obs_taxonomy, exp_taxonomy)
Example #56
def test_join(how, left, right, df1, df2):
    expr = left.join(right, left.key == right.key,
                     how=how)[left, right.other_value, right.key3]
    result = expr.execute()
    expected = pd.merge(df1, df2, how=how, on='key')
    tm.assert_frame_equal(result[expected.columns], expected)
Example #57
def test_write_csv_from_data_vendor():
    """Tests downloading market data from the data vendor and dumping to CSV. Checks written CSV against what is loaded
    in memory. Also checks data is available in each 'usual' market hour.

    Note, that we use cached data from disk, as we want to download relatively large sections of data, and doing
    this externally can cause the test to run very slowly.
    """

    for data_vendor_name in data_vendor_name_list:

        # database_source = database_source_dict[data_vendor_name]
        database_populator = database_populator_dict[data_vendor_name]
        chunk_int_min = chunk_int_min_dict[data_vendor_name]

        # specifically choose dates which straddle the weekend boundary
        start_date = '27 Apr 2018'
        finish_date = '03 May 2018'
        expected_csv_files = 5  # during British Summer Time in London
        # start_date = '02 Feb 2018'; finish_date = '07 Feb 2018'; expected_csv_files = 4    # during GMT time in London
        split_size = 'daily'
        write_csv = False

        # prepare the CSV folder first
        csv_folder = os.path.join(constants.test_data_harness_folder,
                                  'csv_' + data_vendor_name + '_dump')

        # empty the CSV test harness folder
        UtilFunc().forcibly_create_empty_folder(csv_folder)

        msg, df_dict = database_populator.download_to_csv(
            start_date,
            finish_date, ['EURUSD'],
            chunk_int_min=chunk_int_min,
            split_size=split_size,
            csv_folder=csv_folder,
            return_df=True,
            write_large_csv=write_csv,
            remove_duplicates=False,
            web_proxies=web_proxies)

        df_read_direct_from_data_vendor = df_dict['EURUSD']

        # check it has data for every market hour (eg. ignoring Saturdays)
        assert util_func.check_data_frame_points_in_every_hour(
            df_read_direct_from_data_vendor, start_date, finish_date)

        if write_csv:
            # read back the CSVs dumped on disk in the test harness CSV folder
            csv_file_list = glob.glob(csv_folder + '/EURUSD*.csv')

            assert len(csv_file_list) == expected_csv_files

            df_list = []

            for c in csv_file_list:
                df = pd.read_csv(c, index_col=0)
                df.index = pd.to_datetime(df.index)
                df_list.append(df)

            # now compare the CSVs on disk versus those read directly
            df_read_from_csv = pd.concat(df_list).tz_localize(pytz.utc)

            assert_frame_equal(df_read_from_csv,
                               df_read_direct_from_data_vendor)
Example #58
 def test_coord_slice_points(self):
     assert self.df2.cx[-2:-1, -2:-1].empty
     assert_frame_equal(self.df2, self.df2.cx[:, :])
     assert_frame_equal(self.df2.loc[5:], self.df2.cx[5:, :])
     assert_frame_equal(self.df2.loc[5:], self.df2.cx[:, 5:])
     assert_frame_equal(self.df2.loc[5:], self.df2.cx[5:, 5:])
Example #59
def test_create_update_move_and_stop_by_radius():
    move_df = MoveDataFrame(
        data=list_data,
        latitude=LATITUDE,
        longitude=LONGITUDE,
        datetime=DATETIME,
        traj_id=TRAJ_ID,
    )
    cols = [
        'id',
        'lat',
        'lon',
        'datetime',
        'dist_to_prev',
        'dist_to_next',
        'dist_prev_to_next',
        'situation',
    ]

    stay_point_detection.create_update_move_and_stop_by_radius(move_df,
                                                               radius=4.0)
    expected = DataFrame(
        data=[
            [
                1,
                39.984094,
                116.319236,
                Timestamp('2008-10-23 05:53:05'),
                nan,
                13.690153134343689,
                nan,
                'nan',
            ],
            [
                1,
                39.984198,
                116.319322,
                Timestamp('2008-10-23 05:53:06'),
                13.690153134343689,
                nan,
                nan,
                'move',
            ],
            [
                2,
                39.984224,
                116.319402,
                Timestamp('2008-10-23 05:53:11'),
                nan,
                0.0,
                nan,
                'nan',
            ],
            [
                2,
                39.984224,
                116.319402,
                Timestamp('2008-10-23 05:53:15'),
                0.0,
                nan,
                nan,
                'stop',
            ],
        ],
        columns=cols,
        index=[0, 1, 2, 3],
    )
    assert_frame_equal(move_df, expected)
Example #60
 def frame_equal(a, b):
     try:
         assert_frame_equal(a, b)
     except AssertionError:
         return False
     return True
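
A short usage sketch of the wrapper above, handy when frame equality feeds a condition rather than an assertion (the data is illustrative):

import pandas as pd

candidates = [pd.DataFrame({'x': [1]}), pd.DataFrame({'x': [2]})]
target = pd.DataFrame({'x': [2]})

# Keep only the candidates that match `target` exactly.
matches = [df for df in candidates if frame_equal(df, target)]
assert len(matches) == 1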