def test_set_index_cast_datetimeindex(self):
    df = DataFrame({'A': [datetime(2000, 1, 1) + timedelta(i)
                          for i in range(1000)],
                    'B': np.random.randn(1000)})

    idf = df.set_index('A')
    assert isinstance(idf.index, pd.DatetimeIndex)

    # don't cast a DatetimeIndex WITH a tz, leave as object
    # GH 6032
    i = (pd.DatetimeIndex(
        to_datetime(['2013-1-1 13:00', '2013-1-2 14:00'], errors="raise"))
        .tz_localize('US/Pacific'))
    df = DataFrame(np.random.randn(2, 1), columns=['A'])
    expected = Series(np.array([pd.Timestamp('2013-01-01 13:00:00-0800',
                                             tz='US/Pacific'),
                                pd.Timestamp('2013-01-02 14:00:00-0800',
                                             tz='US/Pacific')],
                               dtype="object"))

    # convert index to series
    result = Series(i)
    assert_series_equal(result, expected)

    # assign to frame
    df['B'] = i
    result = df['B']
    assert_series_equal(result, expected, check_names=False)
    assert result.name == 'B'

    # keep the timezone
    result = i.to_series(keep_tz=True)
    assert_series_equal(result.reset_index(drop=True), expected)

    # convert to utc
    df['C'] = i.to_series().reset_index(drop=True)
    result = df['C']
    comp = pd.DatetimeIndex(expected.values)
    comp = comp.tz_localize(None)
    tm.assert_numpy_array_equal(result.values, comp.values)

    # list of datetimes with a tz
    df['D'] = i.to_pydatetime()
    result = df['D']
    assert_series_equal(result, expected, check_names=False)
    assert result.name == 'D'

    # GH 6785
    # set the index manually
    import pytz
    df = DataFrame(
        [{'ts': datetime(2014, 4, 1, tzinfo=pytz.utc), 'foo': 1}])
    expected = df.set_index('ts')
    df.index = df['ts']
    df.pop('ts')
    assert_frame_equal(df, expected)
def test_convert_dti_to_series(self):
    # don't cast a DatetimeIndex WITH a tz, leave as object
    # GH 6032
    idx = DatetimeIndex(to_datetime(['2013-1-1 13:00', '2013-1-2 14:00']),
                        name='B').tz_localize('US/Pacific')
    df = DataFrame(np.random.randn(2, 1), columns=['A'])

    expected = Series(np.array([Timestamp('2013-01-01 13:00:00-0800',
                                          tz='US/Pacific'),
                                Timestamp('2013-01-02 14:00:00-0800',
                                          tz='US/Pacific')],
                               dtype="object"), name='B')

    # convert index to series
    result = Series(idx)
    tm.assert_series_equal(result, expected)

    # assign to frame
    df['B'] = idx
    result = df['B']
    tm.assert_series_equal(result, expected)

    # convert to series while keeping the timezone
    result = idx.to_series(keep_tz=True, index=[0, 1])
    tm.assert_series_equal(result, expected)

    # convert to utc
    df['B'] = idx.to_series(index=[0, 1])
    result = df['B']
    comp = Series(DatetimeIndex(expected.values).tz_localize(None),
                  name='B')
    tm.assert_series_equal(result, comp)

    # list of datetimes with a tz
    df['B'] = idx.to_pydatetime()
    result = df['B']
    tm.assert_series_equal(result, expected)

    # GH 6785
    # set the index manually
    import pytz
    df = DataFrame(
        [{'ts': datetime(2014, 4, 1, tzinfo=pytz.utc), 'foo': 1}])
    expected = df.set_index('ts')
    df.index = df['ts']
    df.pop('ts')
    tm.assert_frame_equal(df, expected)
def test_pop(self):
    self.frame.columns.name = 'baz'

    self.frame.pop('A')
    self.assertNotIn('A', self.frame)

    self.frame['foo'] = 'bar'
    self.frame.pop('foo')
    self.assertNotIn('foo', self.frame)
    # TODO self.assertEqual(self.frame.columns.name, 'baz')

    # 10912
    # inplace ops cause caching issue
    a = DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C'],
                  index=['X', 'Y'])
    b = a.pop('B')
    b += 1

    # original frame
    expected = DataFrame([[1, 3], [4, 6]], columns=['A', 'C'],
                         index=['X', 'Y'])
    assert_frame_equal(a, expected)

    # result
    expected = Series([2, 5], index=['X', 'Y'], name='B') + 1
    assert_series_equal(b, expected)
def to_dataframe(self, selected_fields=None, excluded_fields=None):
    from ..services import locations

    if excluded_fields:
        qs = self.exclude(*excluded_fields)
    else:
        qs = self.exclude(*self.DEFAULT_EXCLUDED_FIELDS)
    if selected_fields:
        qs = self.only(*selected_fields)

    df = DataFrame(list(qs.as_pymongo())).convert_objects(
        convert_numeric=True)
    if df.empty:
        return df

    # add fields with no values
    fields = filter(
        lambda f: f not in df.columns,
        map(lambda field: field.name,
            [field for group in self.first().form.groups
             for field in group.fields]),
    )

    for field in fields:
        df[field] = Series(np.nan, index=df.index)

    # do cleanup of subdocument fields
    for field in self.SUBDOCUMENT_FIELDS:
        temp = df.pop(field).tolist()
        temp2 = [i if not isnull(i) else {} for i in temp]
        df = df.join(DataFrame(temp2))

    rv_map = locations.registered_voters_map()
    df["registered_voters"] = df.location.apply(lambda i: rv_map.get(i, 0))

    return df
def test_pop(self):
    self.frame.columns.name = 'baz'

    self.frame.pop('A')
    assert 'A' not in self.frame

    self.frame['foo'] = 'bar'
    self.frame.pop('foo')
    assert 'foo' not in self.frame
    # TODO assert self.frame.columns.name == 'baz'

    # gh-10912: inplace ops cause caching issue
    a = DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C'],
                  index=['X', 'Y'])
    b = a.pop('B')
    b += 1

    # original frame
    expected = DataFrame([[1, 3], [4, 6]], columns=['A', 'C'],
                         index=['X', 'Y'])
    tm.assert_frame_equal(a, expected)

    # result
    expected = Series([2, 5], index=['X', 'Y'], name='B') + 1
    tm.assert_series_equal(b, expected)
def test_pop_non_unique_cols(self): df = DataFrame({0: [0, 1], 1: [0, 1], 2: [4, 5]}) df.columns = ["a", "b", "a"] res = df.pop("a") assert type(res) == DataFrame assert len(res) == 2 assert len(df.columns) == 1 assert "b" in df.columns assert "a" not in df.columns assert len(df.index) == 2
def get_hhi(dframe):
    HHI_VALS = []
    for idx in dframe.iterrows():
        shares = [s for s in idx[1] if s > 0]
        numfirms = len(shares)
        sqr_shares = [s * s for s in shares]
        hhi_val = sum(sqr_shares)
        HHI_VALS.append({'month': idx[0], 'hhi': hhi_val})
    dframeHHI = DataFrame(HHI_VALS)
    dframeHHI.index = DatetimeIndex(dframeHHI.pop('month'))
    return dframeHHI['hhi']
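# --- Hedged usage sketch (not from the original source) ---------------------
# Illustrates the input shape get_hhi above appears to expect: a DataFrame
# indexed by month with one market-share column per firm. The firm names and
# share values below are made up for illustration only.
import pandas as pd
from pandas import DataFrame

shares = DataFrame(
    {'firm_a': [0.5, 0.4], 'firm_b': [0.3, 0.4], 'firm_c': [0.2, 0.2]},
    index=pd.to_datetime(['2020-01-01', '2020-02-01']),
)
hhi = get_hhi(shares)
# Each value is the sum of squared shares for that month,
# e.g. 0.5**2 + 0.3**2 + 0.2**2 == 0.38 for January.
print(hhi)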
def test_pop_non_unique_cols(self): df = DataFrame({0: [0, 1], 1: [0, 1], 2: [4, 5]}) df.columns = ["a", "b", "a"] res = df.pop("a") self.assertEqual(type(res), DataFrame) self.assertEqual(len(res), 2) self.assertEqual(len(df.columns), 1) self.assertTrue("b" in df.columns) self.assertFalse("a" in df.columns) self.assertEqual(len(df.index), 2)
def __init__(self, map_):
    map_ = np.atleast_1d(np.asanyarray(map_).squeeze())

    try:
        m, n = map_.shape
        s = np.repeat(np.arange(n), m)
    except ValueError:
        m, = map_.shape
        s = np.ones(m, dtype=int)

    data = {'channel': map_.ravel(), 'shank': s}
    df = DataFrame(data).sort('shank').reset_index(drop=True)
    df.index = df.pop('channel')

    super(ElectrodeMap, self).__init__(df.sort())
class TestJoin:
    def setup_method(self):
        # aggregate multiple columns
        self.df = DataFrame(
            {
                "key1": get_test_data(),
                "key2": get_test_data(),
                "data1": np.random.randn(N),
                "data2": np.random.randn(N),
            }
        )

        # exclude a couple keys for fun
        self.df = self.df[self.df["key2"] > 1]

        self.df2 = DataFrame(
            {
                "key1": get_test_data(n=N // 5),
                "key2": get_test_data(ngroups=NGROUPS // 2, n=N // 5),
                "value": np.random.randn(N // 5),
            }
        )

        index, data = tm.getMixedTypeDict()
        self.target = DataFrame(data, index=index)

        # Join on string value
        self.source = DataFrame(
            {"MergedA": data["A"], "MergedD": data["D"]}, index=data["C"]
        )

    def test_left_outer_join(self):
        joined_key2 = merge(self.df, self.df2, on="key2")
        _check_join(self.df, self.df2, joined_key2, ["key2"], how="left")

        joined_both = merge(self.df, self.df2)
        _check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="left")

    def test_right_outer_join(self):
        joined_key2 = merge(self.df, self.df2, on="key2", how="right")
        _check_join(self.df, self.df2, joined_key2, ["key2"], how="right")

        joined_both = merge(self.df, self.df2, how="right")
        _check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="right")

    def test_full_outer_join(self):
        joined_key2 = merge(self.df, self.df2, on="key2", how="outer")
        _check_join(self.df, self.df2, joined_key2, ["key2"], how="outer")

        joined_both = merge(self.df, self.df2, how="outer")
        _check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="outer")

    def test_inner_join(self):
        joined_key2 = merge(self.df, self.df2, on="key2", how="inner")
        _check_join(self.df, self.df2, joined_key2, ["key2"], how="inner")

        joined_both = merge(self.df, self.df2, how="inner")
        _check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="inner")

    def test_handle_overlap(self):
        joined = merge(self.df, self.df2, on="key2", suffixes=(".foo", ".bar"))

        assert "key1.foo" in joined
        assert "key1.bar" in joined

    def test_handle_overlap_arbitrary_key(self):
        joined = merge(
            self.df,
            self.df2,
            left_on="key2",
            right_on="key1",
            suffixes=(".foo", ".bar"),
        )
        assert "key1.foo" in joined
        assert "key2.bar" in joined

    def test_join_on(self):
        target = self.target
        source = self.source

        merged = target.join(source, on="C")
        tm.assert_series_equal(merged["MergedA"], target["A"], check_names=False)
        tm.assert_series_equal(merged["MergedD"], target["D"], check_names=False)

        # join with duplicates (fix regression from DataFrame/Matrix merge)
        df = DataFrame({"key": ["a", "a", "b", "b", "c"]})
        df2 = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"])
        joined = df.join(df2, on="key")
        expected = DataFrame(
            {"key": ["a", "a", "b", "b", "c"], "value": [0, 0, 1, 1, 2]}
        )
        tm.assert_frame_equal(joined, expected)

        # Test when some are missing
        df_a = DataFrame([[1], [2], [3]], index=["a", "b", "c"], columns=["one"])
        df_b = DataFrame([["foo"], ["bar"]], index=[1, 2], columns=["two"])
        df_c = DataFrame([[1], [2]], index=[1, 2], columns=["three"])
        joined = df_a.join(df_b, on="one")
        joined = joined.join(df_c, on="one")
        assert np.isnan(joined["two"]["c"])
        assert np.isnan(joined["three"]["c"])

        # merge column not present
        with pytest.raises(KeyError, match="^'E'$"):
            target.join(source, on="E")

        # overlap
        source_copy = source.copy()
        source_copy["A"] = 0
        msg = (
            "You are trying to merge on float64 and object columns. If "
            "you wish to proceed you should use pd.concat"
        )
        with pytest.raises(ValueError, match=msg):
            target.join(source_copy, on="A")

    def test_join_on_fails_with_different_right_index(self):
        df = DataFrame(
            {"a": np.random.choice(["m", "f"], size=3), "b": np.random.randn(3)}
        )
        df2 = DataFrame(
            {"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)},
            index=tm.makeCustomIndex(10, 2),
        )
        msg = r'len\(left_on\) must equal the number of levels in the index of "right"'
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, left_on="a", right_index=True)

    def test_join_on_fails_with_different_left_index(self):
        df = DataFrame(
            {"a": np.random.choice(["m", "f"], size=3), "b": np.random.randn(3)},
            index=tm.makeCustomIndex(3, 2),
        )
        df2 = DataFrame(
            {"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)}
        )
        msg = r'len\(right_on\) must equal the number of levels in the index of "left"'
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, right_on="b", left_index=True)

    def test_join_on_fails_with_different_column_counts(self):
        df = DataFrame(
            {"a": np.random.choice(["m", "f"], size=3), "b": np.random.randn(3)}
        )
        df2 = DataFrame(
            {"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)},
            index=tm.makeCustomIndex(10, 2),
        )
        msg = r"len\(right_on\) must equal len\(left_on\)"
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, right_on="a", left_on=["a", "b"])

    @pytest.mark.parametrize("wrong_type", [2, "str", None, np.array([0, 1])])
    def test_join_on_fails_with_wrong_object_type(self, wrong_type):
        # GH12081 - original issue
        # GH21220 - merging of Series and DataFrame is now allowed
        # Edited test to remove the Series object from test parameters
        df = DataFrame({"a": [1, 1]})
        msg = (
            "Can only merge Series or DataFrame objects, "
            f"a {type(wrong_type)} was passed"
        )
        with pytest.raises(TypeError, match=msg):
            merge(wrong_type, df, left_on="a", right_on="a")
        with pytest.raises(TypeError, match=msg):
            merge(df, wrong_type, left_on="a", right_on="a")

    def test_join_on_pass_vector(self):
        expected = self.target.join(self.source, on="C")
        del expected["C"]

        join_col = self.target.pop("C")
        result = self.target.join(self.source, on=join_col)
        tm.assert_frame_equal(result, expected)

    def test_join_with_len0(self):
        # nothing to merge
        merged = self.target.join(self.source.reindex([]), on="C")
        for col in self.source:
            assert col in merged
            assert merged[col].isna().all()

        merged2 = self.target.join(self.source.reindex([]), on="C", how="inner")
        tm.assert_index_equal(merged2.columns, merged.columns)
        assert len(merged2) == 0

    def test_join_on_inner(self):
        df = DataFrame({"key": ["a", "a", "d", "b", "b", "c"]})
        df2 = DataFrame({"value": [0, 1]}, index=["a", "b"])

        joined = df.join(df2, on="key", how="inner")

        expected = df.join(df2, on="key")
        expected = expected[expected["value"].notna()]
        tm.assert_series_equal(joined["key"], expected["key"])
        tm.assert_series_equal(joined["value"], expected["value"], check_dtype=False)
        tm.assert_index_equal(joined.index, expected.index)

    def test_join_on_singlekey_list(self):
        df = DataFrame({"key": ["a", "a", "b", "b", "c"]})
        df2 = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"])

        # corner cases
        joined = df.join(df2, on=["key"])
        expected = df.join(df2, on="key")

        tm.assert_frame_equal(joined, expected)

    def test_join_on_series(self):
        result = self.target.join(self.source["MergedA"], on="C")
        expected = self.target.join(self.source[["MergedA"]], on="C")
        tm.assert_frame_equal(result, expected)

    def test_join_on_series_buglet(self):
        # GH #638
        df = DataFrame({"a": [1, 1]})
        ds = Series([2], index=[1], name="b")
        result = df.join(ds, on="a")
        expected = DataFrame({"a": [1, 1], "b": [2, 2]}, index=df.index)
        tm.assert_frame_equal(result, expected)

    def test_join_index_mixed(self, join_type):
        # no overlapping blocks
        df1 = DataFrame(index=np.arange(10))
        df1["bool"] = True
        df1["string"] = "foo"

        df2 = DataFrame(index=np.arange(5, 15))
        df2["int"] = 1
        df2["float"] = 1.0

        joined = df1.join(df2, how=join_type)
        expected = _join_by_hand(df1, df2, how=join_type)
        tm.assert_frame_equal(joined, expected)

        joined = df2.join(df1, how=join_type)
        expected = _join_by_hand(df2, df1, how=join_type)
        tm.assert_frame_equal(joined, expected)

    def test_join_index_mixed_overlap(self):
        df1 = DataFrame(
            {"A": 1.0, "B": 2, "C": "foo", "D": True},
            index=np.arange(10),
            columns=["A", "B", "C", "D"],
        )
        assert df1["B"].dtype == np.int64
        assert df1["D"].dtype == np.bool_

        df2 = DataFrame(
            {"A": 1.0, "B": 2, "C": "foo", "D": True},
            index=np.arange(0, 10, 2),
            columns=["A", "B", "C", "D"],
        )

        # overlap
        joined = df1.join(df2, lsuffix="_one", rsuffix="_two")
        expected_columns = [
            "A_one",
            "B_one",
            "C_one",
            "D_one",
            "A_two",
            "B_two",
            "C_two",
            "D_two",
        ]
        df1.columns = expected_columns[:4]
        df2.columns = expected_columns[4:]
        expected = _join_by_hand(df1, df2)
        tm.assert_frame_equal(joined, expected)

    def test_join_empty_bug(self):
        # generated an exception in 0.4.3
        x = DataFrame()
        x.join(DataFrame([3], index=[0], columns=["A"]), how="outer")

    def test_join_unconsolidated(self):
        # GH #331
        a = DataFrame(np.random.randn(30, 2), columns=["a", "b"])
        c = Series(np.random.randn(30))
        a["c"] = c
        d = DataFrame(np.random.randn(30, 1), columns=["q"])

        # it works!
        a.join(d)
        d.join(a)

    def test_join_multiindex(self):
        index1 = MultiIndex.from_arrays(
            [["a", "a", "a", "b", "b", "b"], [1, 2, 3, 1, 2, 3]],
            names=["first", "second"],
        )

        index2 = MultiIndex.from_arrays(
            [["b", "b", "b", "c", "c", "c"], [1, 2, 3, 1, 2, 3]],
            names=["first", "second"],
        )

        df1 = DataFrame(data=np.random.randn(6), index=index1, columns=["var X"])
        df2 = DataFrame(data=np.random.randn(6), index=index2, columns=["var Y"])

        df1 = df1.sort_index(level=0)
        df2 = df2.sort_index(level=0)

        joined = df1.join(df2, how="outer")
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names
        tm.assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

        df1 = df1.sort_index(level=1)
        df2 = df2.sort_index(level=1)

        joined = df1.join(df2, how="outer").sort_index(level=0)
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names

        tm.assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

    def test_join_inner_multiindex(self, lexsorted_two_level_string_multiindex):
        key1 = ["bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap"]
        key2 = [
            "two",
            "one",
            "three",
            "one",
            "two",
            "one",
            "two",
            "two",
            "three",
            "one",
        ]

        data = np.random.randn(len(key1))
        data = DataFrame({"key1": key1, "key2": key2, "data": data})

        index = lexsorted_two_level_string_multiindex
        to_join = DataFrame(
            np.random.randn(10, 3), index=index, columns=["j_one", "j_two", "j_three"]
        )

        joined = data.join(to_join, on=["key1", "key2"], how="inner")
        expected = merge(
            data,
            to_join.reset_index(),
            left_on=["key1", "key2"],
            right_on=["first", "second"],
            how="inner",
            sort=False,
        )

        expected2 = merge(
            to_join,
            data,
            right_on=["key1", "key2"],
            left_index=True,
            how="inner",
            sort=False,
        )
        tm.assert_frame_equal(joined, expected2.reindex_like(joined))

        expected2 = merge(
            to_join,
            data,
            right_on=["key1", "key2"],
            left_index=True,
            how="inner",
            sort=False,
        )

        expected = expected.drop(["first", "second"], axis=1)
        expected.index = joined.index

        assert joined.index.is_monotonic_increasing
        tm.assert_frame_equal(joined, expected)

        # _assert_same_contents(expected, expected2.loc[:, expected.columns])

    def test_join_hierarchical_mixed(self):
        # GH 2024
        df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=["a", "b", "c"])
        new_df = df.groupby(["a"]).agg({"b": [np.mean, np.sum]})
        other_df = DataFrame([(1, 2, 3), (7, 10, 6)], columns=["a", "b", "d"])
        other_df.set_index("a", inplace=True)
        # GH 9455, 12219
        msg = "merging between different levels is deprecated"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            result = merge(new_df, other_df, left_index=True, right_index=True)
        assert ("b", "mean") in result
        assert "b" in result

    def test_join_float64_float32(self):
        a = DataFrame(np.random.randn(10, 2), columns=["a", "b"], dtype=np.float64)
        b = DataFrame(np.random.randn(10, 1), columns=["c"], dtype=np.float32)
        joined = a.join(b)
        assert joined.dtypes["a"] == "float64"
        assert joined.dtypes["b"] == "float64"
        assert joined.dtypes["c"] == "float32"

        a = np.random.randint(0, 5, 100).astype("int64")
        b = np.random.random(100).astype("float64")
        c = np.random.random(100).astype("float32")
        df = DataFrame({"a": a, "b": b, "c": c})
        xpdf = DataFrame({"a": a, "b": b, "c": c})
        s = DataFrame(np.random.random(5).astype("float32"), columns=["md"])
        rs = df.merge(s, left_on="a", right_index=True)
        assert rs.dtypes["a"] == "int64"
        assert rs.dtypes["b"] == "float64"
        assert rs.dtypes["c"] == "float32"
        assert rs.dtypes["md"] == "float32"

        xp = xpdf.merge(s, left_on="a", right_index=True)
        tm.assert_frame_equal(rs, xp)

    def test_join_many_non_unique_index(self):
        df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
        df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
        df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])

        result = idf1.join([idf2, idf3], how="outer")

        df_partially_merged = merge(df1, df2, on=["a", "b"], how="outer")
        expected = merge(df_partially_merged, df3, on=["a", "b"], how="outer")

        result = result.reset_index()
        expected = expected[result.columns]
        expected["a"] = expected.a.astype("int64")
        expected["b"] = expected.b.astype("int64")
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
        df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
        df3 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])
        result = idf1.join([idf2, idf3], how="inner")

        df_partially_merged = merge(df1, df2, on=["a", "b"], how="inner")
        expected = merge(df_partially_merged, df3, on=["a", "b"], how="inner")

        result = result.reset_index()

        tm.assert_frame_equal(result, expected.loc[:, result.columns])

        # GH 11519
        df = DataFrame(
            {
                "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
                "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
                "C": np.random.randn(8),
                "D": np.random.randn(8),
            }
        )
        s = Series(
            np.repeat(np.arange(8), 2), index=np.repeat(np.arange(8), 2), name="TEST"
        )
        inner = df.join(s, how="inner")
        outer = df.join(s, how="outer")
        left = df.join(s, how="left")
        right = df.join(s, how="right")
        tm.assert_frame_equal(inner, outer)
        tm.assert_frame_equal(inner, left)
        tm.assert_frame_equal(inner, right)

    def test_join_sort(self):
        left = DataFrame({"key": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]})
        right = DataFrame({"value2": ["a", "b", "c"]}, index=["bar", "baz", "foo"])

        joined = left.join(right, on="key", sort=True)
        expected = DataFrame(
            {
                "key": ["bar", "baz", "foo", "foo"],
                "value": [2, 3, 1, 4],
                "value2": ["a", "b", "c", "c"],
            },
            index=[1, 2, 0, 3],
        )
        tm.assert_frame_equal(joined, expected)

        # smoke test
        joined = left.join(right, on="key", sort=False)
        tm.assert_index_equal(joined.index, Index(range(4)), exact=True)

    def test_join_mixed_non_unique_index(self):
        # GH 12814, unorderable types in py3 with a non-unique index
        df1 = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 3, "a"])
        df2 = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 3, 3, 4])
        result = df1.join(df2)
        expected = DataFrame(
            {"a": [1, 2, 3, 3, 4], "b": [5, np.nan, 6, 7, np.nan]},
            index=[1, 2, 3, 3, "a"],
        )
        tm.assert_frame_equal(result, expected)

        df3 = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 2, "a"])
        df4 = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 2, 3, 4])
        result = df3.join(df4)
        expected = DataFrame(
            {"a": [1, 2, 3, 4], "b": [5, 6, 6, np.nan]}, index=[1, 2, 2, "a"]
        )
        tm.assert_frame_equal(result, expected)

    def test_join_non_unique_period_index(self):
        # GH #16871
        index = pd.period_range("2016-01-01", periods=16, freq="M")
        df = DataFrame(list(range(len(index))), index=index, columns=["pnum"])
        df2 = concat([df, df])
        result = df.join(df2, how="inner", rsuffix="_df2")
        expected = DataFrame(
            np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2),
            columns=["pnum", "pnum_df2"],
            index=df2.sort_index().index,
        )
        tm.assert_frame_equal(result, expected)

    def test_mixed_type_join_with_suffix(self):
        # GH #916
        df = DataFrame(np.random.randn(20, 6), columns=["a", "b", "c", "d", "e", "f"])
        df.insert(0, "id", 0)
        df.insert(5, "dt", "foo")

        grouped = df.groupby("id")
        mn = grouped.mean()
        cn = grouped.count()

        # it works!
        mn.join(cn, rsuffix="_right")

    def test_join_many(self):
        df = DataFrame(np.random.randn(10, 6), columns=list("abcdef"))
        df_list = [df[["a", "b"]], df[["c", "d"]], df[["e", "f"]]]

        joined = df_list[0].join(df_list[1:])
        tm.assert_frame_equal(joined, df)

        df_list = [df[["a", "b"]][:-2], df[["c", "d"]][2:], df[["e", "f"]][1:9]]

        def _check_diff_index(df_list, result, exp_index):
            reindexed = [x.reindex(exp_index) for x in df_list]
            expected = reindexed[0].join(reindexed[1:])
            tm.assert_frame_equal(result, expected)

        # different join types
        joined = df_list[0].join(df_list[1:], how="outer")
        _check_diff_index(df_list, joined, df.index)

        joined = df_list[0].join(df_list[1:])
        _check_diff_index(df_list, joined, df_list[0].index)

        joined = df_list[0].join(df_list[1:], how="inner")
        _check_diff_index(df_list, joined, df.index[2:8])

        msg = "Joining multiple DataFrames only supported for joining on index"
        with pytest.raises(ValueError, match=msg):
            df_list[0].join(df_list[1:], on="a")

    def test_join_many_mixed(self):
        df = DataFrame(np.random.randn(8, 4), columns=["A", "B", "C", "D"])
        df["key"] = ["foo", "bar"] * 4
        df1 = df.loc[:, ["A", "B"]]
        df2 = df.loc[:, ["C", "D"]]
        df3 = df.loc[:, ["key"]]

        result = df1.join([df2, df3])
        tm.assert_frame_equal(result, df)

    def test_join_dups(self):
        # joining dups
        df = concat(
            [
                DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]),
                DataFrame(
                    np.random.randint(0, 10, size=20).reshape(10, 2),
                    columns=["A", "C"],
                ),
            ],
            axis=1,
        )

        expected = concat([df, df], axis=1)
        result = df.join(df, rsuffix="_2")
        result.columns = expected.columns
        tm.assert_frame_equal(result, expected)

        # GH 4975, invalid join on dups
        w = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        x = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        y = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        z = DataFrame(np.random.randn(4, 2), columns=["x", "y"])

        dta = x.merge(y, left_index=True, right_index=True).merge(
            z, left_index=True, right_index=True, how="outer"
        )
        with tm.assert_produces_warning(FutureWarning):
            dta = dta.merge(w, left_index=True, right_index=True)
        expected = concat([x, y, z, w], axis=1)
        expected.columns = ["x_x", "y_x", "x_y", "y_y", "x_x", "y_x", "x_y", "y_y"]
        tm.assert_frame_equal(dta, expected)

    def test_join_multi_to_multi(self, join_type):
        # GH 20475
        leftindex = MultiIndex.from_product(
            [list("abc"), list("xy"), [1, 2]], names=["abc", "xy", "num"]
        )
        left = DataFrame({"v1": range(12)}, index=leftindex)

        rightindex = MultiIndex.from_product(
            [list("abc"), list("xy")], names=["abc", "xy"]
        )
        right = DataFrame({"v2": [100 * i for i in range(1, 7)]}, index=rightindex)

        result = left.join(right, on=["abc", "xy"], how=join_type)
        expected = (
            left.reset_index()
            .merge(right.reset_index(), on=["abc", "xy"], how=join_type)
            .set_index(["abc", "xy", "num"])
        )
        tm.assert_frame_equal(expected, result)

        msg = r'len\(left_on\) must equal the number of levels in the index of "right"'
        with pytest.raises(ValueError, match=msg):
            left.join(right, on="xy", how=join_type)

        with pytest.raises(ValueError, match=msg):
            right.join(left, on=["abc", "xy"], how=join_type)

    def test_join_on_tz_aware_datetimeindex(self):
        # GH 23931, 26335
        df1 = DataFrame(
            {
                "date": pd.date_range(start="2018-01-01", periods=5,
                                      tz="America/Chicago"),
                "vals": list("abcde"),
            }
        )

        df2 = DataFrame(
            {
                "date": pd.date_range(start="2018-01-03", periods=5,
                                      tz="America/Chicago"),
                "vals_2": list("tuvwx"),
            }
        )
        result = df1.join(df2.set_index("date"), on="date")
        expected = df1.copy()
        expected["vals_2"] = Series([np.nan] * 2 + list("tuv"), dtype=object)
        tm.assert_frame_equal(result, expected)

    def test_join_datetime_string(self):
        # GH 5647
        dfa = DataFrame(
            [
                ["2012-08-02", "L", 10],
                ["2012-08-02", "J", 15],
                ["2013-04-06", "L", 20],
                ["2013-04-06", "J", 25],
            ],
            columns=["x", "y", "a"],
        )
        dfa["x"] = pd.to_datetime(dfa["x"])
        dfb = DataFrame(
            [["2012-08-02", "J", 1], ["2013-04-06", "L", 2]],
            columns=["x", "y", "z"],
            index=[2, 4],
        )
        dfb["x"] = pd.to_datetime(dfb["x"])
        result = dfb.join(dfa.set_index(["x", "y"]), on=["x", "y"])
        expected = DataFrame(
            [
                [Timestamp("2012-08-02 00:00:00"), "J", 1, 15],
                [Timestamp("2013-04-06 00:00:00"), "L", 2, 20],
            ],
            index=[2, 4],
            columns=["x", "y", "z", "a"],
        )
        tm.assert_frame_equal(result, expected)
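# --- Hedged usage sketch (not part of the original test class) --------------
# Summarizes the core behaviour the TestJoin cases above exercise:
# DataFrame.join(..., on="key") is a left join of a column against the other
# frame's index, equivalent to pd.merge with left_on/right_index.
import pandas as pd

left = pd.DataFrame({"key": ["a", "b", "c", "a"], "lval": [1, 2, 3, 4]})
right = pd.DataFrame({"rval": [10, 20]}, index=["a", "b"])

joined = left.join(right, on="key")
merged = pd.merge(left, right, left_on="key", right_index=True, how="left")
pd.testing.assert_frame_equal(joined, merged)  # both leave NaN for the unmatched "c"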
class TestJoin: def setup_method(self, method): # aggregate multiple columns self.df = DataFrame({ 'key1': get_test_data(), 'key2': get_test_data(), 'data1': np.random.randn(N), 'data2': np.random.randn(N) }) # exclude a couple keys for fun self.df = self.df[self.df['key2'] > 1] self.df2 = DataFrame({ 'key1': get_test_data(n=N // 5), 'key2': get_test_data(ngroups=NGROUPS // 2, n=N // 5), 'value': np.random.randn(N // 5) }) index, data = tm.getMixedTypeDict() self.target = DataFrame(data, index=index) # Join on string value self.source = DataFrame({ 'MergedA': data['A'], 'MergedD': data['D'] }, index=data['C']) def test_cython_left_outer_join(self): left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) max_group = 5 ls, rs = libjoin.left_outer_join(left, right, max_group) exp_ls = left.argsort(kind='mergesort') exp_rs = right.argsort(kind='mergesort') exp_li = a_( [0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 10]) exp_ri = a_( [0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5, -1, -1]) exp_ls = exp_ls.take(exp_li) exp_ls[exp_li == -1] = -1 exp_rs = exp_rs.take(exp_ri) exp_rs[exp_ri == -1] = -1 tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) def test_cython_right_outer_join(self): left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) max_group = 5 rs, ls = libjoin.left_outer_join(right, left, max_group) exp_ls = left.argsort(kind='mergesort') exp_rs = right.argsort(kind='mergesort') # 0 1 1 1 exp_li = a_([ 0, 1, 2, 3, 4, 5, 3, 4, 5, 3, 4, 5, # 2 2 4 6, 7, 8, 6, 7, 8, -1 ]) exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6]) exp_ls = exp_ls.take(exp_li) exp_ls[exp_li == -1] = -1 exp_rs = exp_rs.take(exp_ri) exp_rs[exp_ri == -1] = -1 tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) def test_cython_inner_join(self): left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64) max_group = 5 ls, rs = libjoin.inner_join(left, right, max_group) exp_ls = left.argsort(kind='mergesort') exp_rs = right.argsort(kind='mergesort') exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8]) exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5]) exp_ls = exp_ls.take(exp_li) exp_ls[exp_li == -1] = -1 exp_rs = exp_rs.take(exp_ri) exp_rs[exp_ri == -1] = -1 tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) def test_left_outer_join(self): joined_key2 = merge(self.df, self.df2, on='key2') _check_join(self.df, self.df2, joined_key2, ['key2'], how='left') joined_both = merge(self.df, self.df2) _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], how='left') def test_right_outer_join(self): joined_key2 = merge(self.df, self.df2, on='key2', how='right') _check_join(self.df, self.df2, joined_key2, ['key2'], how='right') joined_both = merge(self.df, self.df2, how='right') _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], how='right') def test_full_outer_join(self): joined_key2 = merge(self.df, self.df2, on='key2', how='outer') _check_join(self.df, self.df2, joined_key2, ['key2'], how='outer') joined_both = merge(self.df, self.df2, how='outer') _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], how='outer') def test_inner_join(self): joined_key2 = merge(self.df, 
self.df2, on='key2', how='inner') _check_join(self.df, self.df2, joined_key2, ['key2'], how='inner') joined_both = merge(self.df, self.df2, how='inner') _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], how='inner') def test_handle_overlap(self): joined = merge(self.df, self.df2, on='key2', suffixes=['.foo', '.bar']) assert 'key1.foo' in joined assert 'key1.bar' in joined def test_handle_overlap_arbitrary_key(self): joined = merge(self.df, self.df2, left_on='key2', right_on='key1', suffixes=['.foo', '.bar']) assert 'key1.foo' in joined assert 'key2.bar' in joined def test_join_on(self): target = self.target source = self.source merged = target.join(source, on='C') tm.assert_series_equal(merged['MergedA'], target['A'], check_names=False) tm.assert_series_equal(merged['MergedD'], target['D'], check_names=False) # join with duplicates (fix regression from DataFrame/Matrix merge) df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c']) joined = df.join(df2, on='key') expected = DataFrame({ 'key': ['a', 'a', 'b', 'b', 'c'], 'value': [0, 0, 1, 1, 2] }) assert_frame_equal(joined, expected) # Test when some are missing df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'], columns=['one']) df_b = DataFrame([['foo'], ['bar']], index=[1, 2], columns=['two']) df_c = DataFrame([[1], [2]], index=[1, 2], columns=['three']) joined = df_a.join(df_b, on='one') joined = joined.join(df_c, on='one') assert np.isnan(joined['two']['c']) assert np.isnan(joined['three']['c']) # merge column not p resent with pytest.raises(KeyError, match="^'E'$"): target.join(source, on='E') # overlap source_copy = source.copy() source_copy['A'] = 0 msg = ("You are trying to merge on float64 and object columns. If" " you wish to proceed you should use pd.concat") with pytest.raises(ValueError, match=msg): target.join(source_copy, on='A') def test_join_on_fails_with_different_right_index(self): df = DataFrame({ 'a': np.random.choice(['m', 'f'], size=3), 'b': np.random.randn(3) }) df2 = DataFrame( { 'a': np.random.choice(['m', 'f'], size=10), 'b': np.random.randn(10) }, index=tm.makeCustomIndex(10, 2)) msg = (r'len\(left_on\) must equal the number of levels in the index' ' of "right"') with pytest.raises(ValueError, match=msg): merge(df, df2, left_on='a', right_index=True) def test_join_on_fails_with_different_left_index(self): df = DataFrame( { 'a': np.random.choice(['m', 'f'], size=3), 'b': np.random.randn(3) }, index=tm.makeCustomIndex(3, 2)) df2 = DataFrame({ 'a': np.random.choice(['m', 'f'], size=10), 'b': np.random.randn(10) }) msg = (r'len\(right_on\) must equal the number of levels in the index' ' of "left"') with pytest.raises(ValueError, match=msg): merge(df, df2, right_on='b', left_index=True) def test_join_on_fails_with_different_column_counts(self): df = DataFrame({ 'a': np.random.choice(['m', 'f'], size=3), 'b': np.random.randn(3) }) df2 = DataFrame( { 'a': np.random.choice(['m', 'f'], size=10), 'b': np.random.randn(10) }, index=tm.makeCustomIndex(10, 2)) msg = r"len\(right_on\) must equal len\(left_on\)" with pytest.raises(ValueError, match=msg): merge(df, df2, right_on='a', left_on=['a', 'b']) @pytest.mark.parametrize("wrong_type", [2, 'str', None, np.array([0, 1])]) def test_join_on_fails_with_wrong_object_type(self, wrong_type): # GH12081 - original issue # GH21220 - merging of Series and DataFrame is now allowed # Edited test to remove the Series object from test parameters df = DataFrame({'a': [1, 1]}) msg = ("Can only merge Series or DataFrame 
objects, a {} was passed". format(str(type(wrong_type)))) with pytest.raises(TypeError, match=msg): merge(wrong_type, df, left_on='a', right_on='a') with pytest.raises(TypeError, match=msg): merge(df, wrong_type, left_on='a', right_on='a') def test_join_on_pass_vector(self): expected = self.target.join(self.source, on='C') del expected['C'] join_col = self.target.pop('C') result = self.target.join(self.source, on=join_col) assert_frame_equal(result, expected) def test_join_with_len0(self): # nothing to merge merged = self.target.join(self.source.reindex([]), on='C') for col in self.source: assert col in merged assert merged[col].isna().all() merged2 = self.target.join(self.source.reindex([]), on='C', how='inner') tm.assert_index_equal(merged2.columns, merged.columns) assert len(merged2) == 0 def test_join_on_inner(self): df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']}) df2 = DataFrame({'value': [0, 1]}, index=['a', 'b']) joined = df.join(df2, on='key', how='inner') expected = df.join(df2, on='key') expected = expected[expected['value'].notna()] tm.assert_series_equal(joined['key'], expected['key'], check_dtype=False) tm.assert_series_equal(joined['value'], expected['value'], check_dtype=False) tm.assert_index_equal(joined.index, expected.index) def test_join_on_singlekey_list(self): df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c']) # corner cases joined = df.join(df2, on=['key']) expected = df.join(df2, on='key') assert_frame_equal(joined, expected) def test_join_on_series(self): result = self.target.join(self.source['MergedA'], on='C') expected = self.target.join(self.source[['MergedA']], on='C') assert_frame_equal(result, expected) def test_join_on_series_buglet(self): # GH #638 df = DataFrame({'a': [1, 1]}) ds = Series([2], index=[1], name='b') result = df.join(ds, on='a') expected = DataFrame({'a': [1, 1], 'b': [2, 2]}, index=df.index) tm.assert_frame_equal(result, expected) def test_join_index_mixed(self, join_type): # no overlapping blocks df1 = DataFrame(index=np.arange(10)) df1['bool'] = True df1['string'] = 'foo' df2 = DataFrame(index=np.arange(5, 15)) df2['int'] = 1 df2['float'] = 1. joined = df1.join(df2, how=join_type) expected = _join_by_hand(df1, df2, how=join_type) assert_frame_equal(joined, expected) joined = df2.join(df1, how=join_type) expected = _join_by_hand(df2, df1, how=join_type) assert_frame_equal(joined, expected) def test_join_index_mixed_overlap(self): df1 = DataFrame({ 'A': 1., 'B': 2, 'C': 'foo', 'D': True }, index=np.arange(10), columns=['A', 'B', 'C', 'D']) assert df1['B'].dtype == np.int64 assert df1['D'].dtype == np.bool_ df2 = DataFrame({ 'A': 1., 'B': 2, 'C': 'foo', 'D': True }, index=np.arange(0, 10, 2), columns=['A', 'B', 'C', 'D']) # overlap joined = df1.join(df2, lsuffix='_one', rsuffix='_two') expected_columns = [ 'A_one', 'B_one', 'C_one', 'D_one', 'A_two', 'B_two', 'C_two', 'D_two' ] df1.columns = expected_columns[:4] df2.columns = expected_columns[4:] expected = _join_by_hand(df1, df2) assert_frame_equal(joined, expected) def test_join_empty_bug(self): # generated an exception in 0.4.3 x = DataFrame() x.join(DataFrame([3], index=[0], columns=['A']), how='outer') def test_join_unconsolidated(self): # GH #331 a = DataFrame(randn(30, 2), columns=['a', 'b']) c = Series(randn(30)) a['c'] = c d = DataFrame(randn(30, 1), columns=['q']) # it works! 
a.join(d) d.join(a) def test_join_multiindex(self): index1 = MultiIndex.from_arrays( [['a', 'a', 'a', 'b', 'b', 'b'], [1, 2, 3, 1, 2, 3]], names=['first', 'second']) index2 = MultiIndex.from_arrays( [['b', 'b', 'b', 'c', 'c', 'c'], [1, 2, 3, 1, 2, 3]], names=['first', 'second']) df1 = DataFrame(data=np.random.randn(6), index=index1, columns=['var X']) df2 = DataFrame(data=np.random.randn(6), index=index2, columns=['var Y']) df1 = df1.sort_index(level=0) df2 = df2.sort_index(level=0) joined = df1.join(df2, how='outer') ex_index = Index(index1.values).union(Index(index2.values)) expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) expected.index.names = index1.names assert_frame_equal(joined, expected) assert joined.index.names == index1.names df1 = df1.sort_index(level=1) df2 = df2.sort_index(level=1) joined = df1.join(df2, how='outer').sort_index(level=0) ex_index = Index(index1.values).union(Index(index2.values)) expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) expected.index.names = index1.names assert_frame_equal(joined, expected) assert joined.index.names == index1.names def test_join_inner_multiindex(self): key1 = [ 'bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', 'qux', 'snap' ] key2 = [ 'two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', 'three', 'one' ] data = np.random.randn(len(key1)) data = DataFrame({'key1': key1, 'key2': key2, 'data': data}) index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['first', 'second']) to_join = DataFrame(np.random.randn(10, 3), index=index, columns=['j_one', 'j_two', 'j_three']) joined = data.join(to_join, on=['key1', 'key2'], how='inner') expected = merge(data, to_join.reset_index(), left_on=['key1', 'key2'], right_on=['first', 'second'], how='inner', sort=False) expected2 = merge(to_join, data, right_on=['key1', 'key2'], left_index=True, how='inner', sort=False) assert_frame_equal(joined, expected2.reindex_like(joined)) expected2 = merge(to_join, data, right_on=['key1', 'key2'], left_index=True, how='inner', sort=False) expected = expected.drop(['first', 'second'], axis=1) expected.index = joined.index assert joined.index.is_monotonic assert_frame_equal(joined, expected) # _assert_same_contents(expected, expected2.loc[:, expected.columns]) def test_join_hierarchical_mixed(self): # GH 2024 df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c']) new_df = df.groupby(['a']).agg({'b': [np.mean, np.sum]}) other_df = DataFrame([(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd']) other_df.set_index('a', inplace=True) # GH 9455, 12219 with tm.assert_produces_warning(UserWarning): result = merge(new_df, other_df, left_index=True, right_index=True) assert ('b', 'mean') in result assert 'b' in result def test_join_float64_float32(self): a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64) b = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32) joined = a.join(b) assert joined.dtypes['a'] == 'float64' assert joined.dtypes['b'] == 'float64' assert joined.dtypes['c'] == 'float32' a = np.random.randint(0, 5, 100).astype('int64') b = np.random.random(100).astype('float64') c = np.random.random(100).astype('float32') df = DataFrame({'a': a, 'b': b, 'c': c}) xpdf = DataFrame({'a': a, 'b': b, 'c': c}) s = DataFrame(np.random.random(5).astype('float32'), columns=['md']) rs = df.merge(s, left_on='a', right_index=True) assert rs.dtypes['a'] == 'int64' assert rs.dtypes['b'] == 'float64' assert rs.dtypes['c'] 
== 'float32' assert rs.dtypes['md'] == 'float32' xp = xpdf.merge(s, left_on='a', right_index=True) assert_frame_equal(rs, xp) def test_join_many_non_unique_index(self): df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]}) df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]}) df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]}) idf1 = df1.set_index(["a", "b"]) idf2 = df2.set_index(["a", "b"]) idf3 = df3.set_index(["a", "b"]) result = idf1.join([idf2, idf3], how='outer') df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer') expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer') result = result.reset_index() expected = expected[result.columns] expected['a'] = expected.a.astype('int64') expected['b'] = expected.b.astype('int64') assert_frame_equal(result, expected) df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]}) df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]}) df3 = DataFrame({ "a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000] }) idf1 = df1.set_index(["a", "b"]) idf2 = df2.set_index(["a", "b"]) idf3 = df3.set_index(["a", "b"]) result = idf1.join([idf2, idf3], how='inner') df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner') expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner') result = result.reset_index() assert_frame_equal(result, expected.loc[:, result.columns]) # GH 11519 df = DataFrame({ 'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], 'C': np.random.randn(8), 'D': np.random.randn(8) }) s = Series(np.repeat(np.arange(8), 2), index=np.repeat(np.arange(8), 2), name='TEST') inner = df.join(s, how='inner') outer = df.join(s, how='outer') left = df.join(s, how='left') right = df.join(s, how='right') assert_frame_equal(inner, outer) assert_frame_equal(inner, left) assert_frame_equal(inner, right) def test_join_sort(self): left = DataFrame({ 'key': ['foo', 'bar', 'baz', 'foo'], 'value': [1, 2, 3, 4] }) right = DataFrame({'value2': ['a', 'b', 'c']}, index=['bar', 'baz', 'foo']) joined = left.join(right, on='key', sort=True) expected = DataFrame( { 'key': ['bar', 'baz', 'foo', 'foo'], 'value': [2, 3, 1, 4], 'value2': ['a', 'b', 'c', 'c'] }, index=[1, 2, 0, 3]) assert_frame_equal(joined, expected) # smoke test joined = left.join(right, on='key', sort=False) tm.assert_index_equal(joined.index, pd.Index(lrange(4))) def test_join_mixed_non_unique_index(self): # GH 12814, unorderable types in py3 with a non-unique index df1 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 3, 'a']) df2 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 3, 3, 4]) result = df1.join(df2) expected = DataFrame( { 'a': [1, 2, 3, 3, 4], 'b': [5, np.nan, 6, 7, np.nan] }, index=[1, 2, 3, 3, 'a']) tm.assert_frame_equal(result, expected) df3 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 2, 'a']) df4 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 2, 3, 4]) result = df3.join(df4) expected = DataFrame({ 'a': [1, 2, 3, 4], 'b': [5, 6, 6, np.nan] }, index=[1, 2, 2, 'a']) tm.assert_frame_equal(result, expected) def test_join_non_unique_period_index(self): # GH #16871 index = pd.period_range('2016-01-01', periods=16, freq='M') df = DataFrame([i for i in range(len(index))], index=index, columns=['pnum']) df2 = concat([df, df]) result = df.join(df2, how='inner', rsuffix='_df2') expected = DataFrame(np.tile( np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2), columns=['pnum', 'pnum_df2'], index=df2.sort_index().index) 
tm.assert_frame_equal(result, expected) def test_mixed_type_join_with_suffix(self): # GH #916 df = DataFrame(np.random.randn(20, 6), columns=['a', 'b', 'c', 'd', 'e', 'f']) df.insert(0, 'id', 0) df.insert(5, 'dt', 'foo') grouped = df.groupby('id') mn = grouped.mean() cn = grouped.count() # it works! mn.join(cn, rsuffix='_right') def test_join_many(self): df = DataFrame(np.random.randn(10, 6), columns=list('abcdef')) df_list = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]] joined = df_list[0].join(df_list[1:]) tm.assert_frame_equal(joined, df) df_list = [ df[['a', 'b']][:-2], df[['c', 'd']][2:], df[['e', 'f']][1:9] ] def _check_diff_index(df_list, result, exp_index): reindexed = [x.reindex(exp_index) for x in df_list] expected = reindexed[0].join(reindexed[1:]) tm.assert_frame_equal(result, expected) # different join types joined = df_list[0].join(df_list[1:], how='outer') _check_diff_index(df_list, joined, df.index) joined = df_list[0].join(df_list[1:]) _check_diff_index(df_list, joined, df_list[0].index) joined = df_list[0].join(df_list[1:], how='inner') _check_diff_index(df_list, joined, df.index[2:8]) msg = "Joining multiple DataFrames only supported for joining on index" with pytest.raises(ValueError, match=msg): df_list[0].join(df_list[1:], on='a') def test_join_many_mixed(self): df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D']) df['key'] = ['foo', 'bar'] * 4 df1 = df.loc[:, ['A', 'B']] df2 = df.loc[:, ['C', 'D']] df3 = df.loc[:, ['key']] result = df1.join([df2, df3]) assert_frame_equal(result, df) def test_join_dups(self): # joining dups df = concat([ DataFrame(np.random.randn(10, 4), columns=['A', 'A', 'B', 'B']), DataFrame(np.random.randint(0, 10, size=20).reshape(10, 2), columns=['A', 'C']) ], axis=1) expected = concat([df, df], axis=1) result = df.join(df, rsuffix='_2') result.columns = expected.columns assert_frame_equal(result, expected) # GH 4975, invalid join on dups w = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) x = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) y = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) z = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) dta = x.merge(y, left_index=True, right_index=True).merge(z, left_index=True, right_index=True, how="outer") dta = dta.merge(w, left_index=True, right_index=True) expected = concat([x, y, z, w], axis=1) expected.columns = [ 'x_x', 'y_x', 'x_y', 'y_y', 'x_x', 'y_x', 'x_y', 'y_y' ] assert_frame_equal(dta, expected) def test_join_multi_to_multi(self, join_type): # GH 20475 leftindex = MultiIndex.from_product( [list('abc'), list('xy'), [1, 2]], names=['abc', 'xy', 'num']) left = DataFrame({'v1': range(12)}, index=leftindex) rightindex = MultiIndex.from_product( [list('abc'), list('xy')], names=['abc', 'xy']) right = DataFrame({'v2': [100 * i for i in range(1, 7)]}, index=rightindex) result = left.join(right, on=['abc', 'xy'], how=join_type) expected = (left.reset_index().merge(right.reset_index(), on=['abc', 'xy'], how=join_type).set_index( ['abc', 'xy', 'num'])) assert_frame_equal(expected, result) msg = (r'len\(left_on\) must equal the number of levels in the index' ' of "right"') with pytest.raises(ValueError, match=msg): left.join(right, on='xy', how=join_type) with pytest.raises(ValueError, match=msg): right.join(left, on=['abc', 'xy'], how=join_type) def test_join_on_tz_aware_datetimeindex(self): # GH 23931 df1 = pd.DataFrame({ 'date': pd.date_range(start='2018-01-01', periods=5, tz='America/Chicago'), 'vals': list('abcde') }) df2 = pd.DataFrame({ 'date': 
pd.date_range(start='2018-01-03', periods=5, tz='America/Chicago'), 'vals_2': list('tuvwx') }) result = df1.join(df2.set_index('date'), on='date') expected = df1.copy() expected['vals_2'] = pd.Series([np.nan] * len(expected), dtype=object) assert_frame_equal(result, expected)
def test_set_index_cast_datetimeindex(self):
    df = DataFrame({'A': [datetime(2000, 1, 1) + timedelta(i)
                          for i in range(1000)],
                    'B': np.random.randn(1000)})

    idf = df.set_index('A')
    assert isinstance(idf.index, pd.DatetimeIndex)

    # don't cast a DatetimeIndex WITH a tz, leave as object
    # GH 6032
    i = (pd.DatetimeIndex(
        to_datetime(['2013-1-1 13:00', '2013-1-2 14:00'],
                    errors="raise")).tz_localize('US/Pacific'))
    df = DataFrame(np.random.randn(2, 1), columns=['A'])
    expected = Series(np.array([
        pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),
        pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')
    ], dtype="object"))

    # convert index to series
    result = Series(i)
    assert_series_equal(result, expected)

    # assign to frame
    df['B'] = i
    result = df['B']
    assert_series_equal(result, expected, check_names=False)
    assert result.name == 'B'

    # keep the timezone
    result = i.to_series(keep_tz=True)
    assert_series_equal(result.reset_index(drop=True), expected)

    # convert to utc
    df['C'] = i.to_series().reset_index(drop=True)
    result = df['C']
    comp = pd.DatetimeIndex(expected.values).copy()
    comp.tz = None
    tm.assert_numpy_array_equal(result.values, comp.values)

    # list of datetimes with a tz
    df['D'] = i.to_pydatetime()
    result = df['D']
    assert_series_equal(result, expected, check_names=False)
    assert result.name == 'D'

    # GH 6785
    # set the index manually
    import pytz
    df = DataFrame([{'ts': datetime(2014, 4, 1, tzinfo=pytz.utc),
                     'foo': 1}])
    expected = df.set_index('ts')
    df.index = df['ts']
    df.pop('ts')
    assert_frame_equal(df, expected)

    # GH 3950
    # reset_index with single level
    for tz in ['UTC', 'Asia/Tokyo', 'US/Eastern']:
        idx = pd.date_range('1/1/2011', periods=5, freq='D', tz=tz,
                            name='idx')
        df = pd.DataFrame({'a': range(5),
                           'b': ['A', 'B', 'C', 'D', 'E']}, index=idx)

        expected = pd.DataFrame({'idx': [datetime(2011, 1, 1),
                                         datetime(2011, 1, 2),
                                         datetime(2011, 1, 3),
                                         datetime(2011, 1, 4),
                                         datetime(2011, 1, 5)],
                                 'a': range(5),
                                 'b': ['A', 'B', 'C', 'D', 'E']},
                                columns=['idx', 'a', 'b'])
        expected['idx'] = expected['idx'].apply(
            lambda d: pd.Timestamp(d, tz=tz))
        assert_frame_equal(df.reset_index(), expected)
def melt(
    frame: DataFrame,
    id_vars=None,
    value_vars=None,
    var_name=None,
    value_name="value",
    col_level=None,
    ignore_index: bool = True,
) -> DataFrame:
    # If multiindex, gather names of columns on all level for checking presence
    # of `id_vars` and `value_vars`
    if isinstance(frame.columns, MultiIndex):
        cols = [x for c in frame.columns for x in c]
    else:
        cols = list(frame.columns)

    if value_name in frame.columns:
        warnings.warn(
            "This dataframe has a column name that matches the 'value_name' column "
            "name of the resulting Dataframe. "
            "In the future this will raise an error, please set the 'value_name' "
            "parameter of DataFrame.melt to a unique name.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    if id_vars is not None:
        if not is_list_like(id_vars):
            id_vars = [id_vars]
        elif isinstance(frame.columns, MultiIndex) and not isinstance(id_vars, list):
            raise ValueError(
                "id_vars must be a list of tuples when columns are a MultiIndex"
            )
        else:
            # Check that `id_vars` are in frame
            id_vars = list(id_vars)
            missing = Index(com.flatten(id_vars)).difference(cols)
            if not missing.empty:
                raise KeyError(
                    "The following 'id_vars' are not present "
                    f"in the DataFrame: {list(missing)}"
                )
    else:
        id_vars = []

    if value_vars is not None:
        if not is_list_like(value_vars):
            value_vars = [value_vars]
        elif isinstance(frame.columns, MultiIndex) and not isinstance(value_vars, list):
            raise ValueError(
                "value_vars must be a list of tuples when columns are a MultiIndex"
            )
        else:
            value_vars = list(value_vars)
            # Check that `value_vars` are in frame
            missing = Index(com.flatten(value_vars)).difference(cols)
            if not missing.empty:
                raise KeyError(
                    "The following 'value_vars' are not present in "
                    f"the DataFrame: {list(missing)}"
                )
        if col_level is not None:
            idx = frame.columns.get_level_values(col_level).get_indexer(
                id_vars + value_vars
            )
        else:
            idx = algos.unique(frame.columns.get_indexer_for(id_vars + value_vars))
        frame = frame.iloc[:, idx]
    else:
        frame = frame.copy()

    if col_level is not None:  # allow list or other?
        # frame is a copy
        frame.columns = frame.columns.get_level_values(col_level)

    if var_name is None:
        if isinstance(frame.columns, MultiIndex):
            if len(frame.columns.names) == len(set(frame.columns.names)):
                var_name = frame.columns.names
            else:
                var_name = [f"variable_{i}" for i in range(len(frame.columns.names))]
        else:
            var_name = [
                frame.columns.name if frame.columns.name is not None else "variable"
            ]
    if isinstance(var_name, str):
        var_name = [var_name]

    N, K = frame.shape
    K -= len(id_vars)

    mdata = {}
    for col in id_vars:
        id_data = frame.pop(col)
        if is_extension_array_dtype(id_data):
            id_data = concat([id_data] * K, ignore_index=True)
        else:
            # error: Incompatible types in assignment (expression has type
            # "ndarray[Any, dtype[Any]]", variable has type "Series")
            id_data = np.tile(id_data._values, K)  # type: ignore[assignment]
        mdata[col] = id_data

    mcolumns = id_vars + var_name + [value_name]

    # error: Incompatible types in assignment (expression has type "ndarray",
    # target has type "Series")
    mdata[value_name] = frame._values.ravel("F")  # type: ignore[assignment]
    for i, col in enumerate(var_name):
        # asanyarray will keep the columns as an Index
        # error: Incompatible types in assignment (expression has type "ndarray", target
        # has type "Series")
        mdata[col] = np.asanyarray(  # type: ignore[assignment]
            frame.columns._get_level_values(i)
        ).repeat(N)

    result = frame._constructor(mdata, columns=mcolumns)

    if not ignore_index:
        result.index = tile_compat(frame.index, K)

    return result
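# --- Hedged usage sketch (not part of the original source) ------------------
# A minimal wide-to-long example of the behaviour implemented by the melt()
# function above, shown through the public DataFrame.melt API of an installed
# pandas rather than this local definition.
import pandas as pd

wide = pd.DataFrame({
    "name": ["ann", "bob"],
    "math": [90, 75],
    "physics": [85, 80],
})
long_form = wide.melt(id_vars="name", value_vars=["math", "physics"],
                      var_name="subject", value_name="score")
# long_form has one row per (name, subject) pair: columns name, subject, score
print(long_form)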
class TestJoin(object): def setup_method(self, method): # aggregate multiple columns self.df = DataFrame({ 'key1': get_test_data(), 'key2': get_test_data(), 'data1': np.random.randn(N), 'data2': np.random.randn(N) }) # exclude a couple keys for fun self.df = self.df[self.df['key2'] > 1] self.df2 = DataFrame({ 'key1': get_test_data(n=N // 5), 'key2': get_test_data(ngroups=NGROUPS // 2, n=N // 5), 'value': np.random.randn(N // 5) }) index, data = tm.getMixedTypeDict() self.target = DataFrame(data, index=index) # Join on string value self.source = DataFrame({ 'MergedA': data['A'], 'MergedD': data['D'] }, index=data['C']) def test_cython_left_outer_join(self): left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) max_group = 5 ls, rs = libjoin.left_outer_join(left, right, max_group) exp_ls = left.argsort(kind='mergesort') exp_rs = right.argsort(kind='mergesort') exp_li = a_( [0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 10]) exp_ri = a_( [0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5, -1, -1]) exp_ls = exp_ls.take(exp_li) exp_ls[exp_li == -1] = -1 exp_rs = exp_rs.take(exp_ri) exp_rs[exp_ri == -1] = -1 tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) def test_cython_right_outer_join(self): left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) max_group = 5 rs, ls = libjoin.left_outer_join(right, left, max_group) exp_ls = left.argsort(kind='mergesort') exp_rs = right.argsort(kind='mergesort') # 0 1 1 1 exp_li = a_([ 0, 1, 2, 3, 4, 5, 3, 4, 5, 3, 4, 5, # 2 2 4 6, 7, 8, 6, 7, 8, -1 ]) exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6]) exp_ls = exp_ls.take(exp_li) exp_ls[exp_li == -1] = -1 exp_rs = exp_rs.take(exp_ri) exp_rs[exp_ri == -1] = -1 tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) def test_cython_inner_join(self): left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64) max_group = 5 ls, rs = libjoin.inner_join(left, right, max_group) exp_ls = left.argsort(kind='mergesort') exp_rs = right.argsort(kind='mergesort') exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8]) exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5]) exp_ls = exp_ls.take(exp_li) exp_ls[exp_li == -1] = -1 exp_rs = exp_rs.take(exp_ri) exp_rs[exp_ri == -1] = -1 tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) def test_left_outer_join(self): joined_key2 = merge(self.df, self.df2, on='key2') _check_join(self.df, self.df2, joined_key2, ['key2'], how='left') joined_both = merge(self.df, self.df2) _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], how='left') def test_right_outer_join(self): joined_key2 = merge(self.df, self.df2, on='key2', how='right') _check_join(self.df, self.df2, joined_key2, ['key2'], how='right') joined_both = merge(self.df, self.df2, how='right') _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], how='right') def test_full_outer_join(self): joined_key2 = merge(self.df, self.df2, on='key2', how='outer') _check_join(self.df, self.df2, joined_key2, ['key2'], how='outer') joined_both = merge(self.df, self.df2, how='outer') _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], how='outer') def test_inner_join(self): joined_key2 = 
merge(self.df, self.df2, on='key2', how='inner') _check_join(self.df, self.df2, joined_key2, ['key2'], how='inner') joined_both = merge(self.df, self.df2, how='inner') _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], how='inner') def test_handle_overlap(self): joined = merge(self.df, self.df2, on='key2', suffixes=['.foo', '.bar']) assert 'key1.foo' in joined assert 'key1.bar' in joined def test_handle_overlap_arbitrary_key(self): joined = merge(self.df, self.df2, left_on='key2', right_on='key1', suffixes=['.foo', '.bar']) assert 'key1.foo' in joined assert 'key2.bar' in joined def test_join_on(self): target = self.target source = self.source merged = target.join(source, on='C') tm.assert_series_equal(merged['MergedA'], target['A'], check_names=False) tm.assert_series_equal(merged['MergedD'], target['D'], check_names=False) # join with duplicates (fix regression from DataFrame/Matrix merge) df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c']) joined = df.join(df2, on='key') expected = DataFrame({ 'key': ['a', 'a', 'b', 'b', 'c'], 'value': [0, 0, 1, 1, 2] }) assert_frame_equal(joined, expected) # Test when some are missing df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'], columns=['one']) df_b = DataFrame([['foo'], ['bar']], index=[1, 2], columns=['two']) df_c = DataFrame([[1], [2]], index=[1, 2], columns=['three']) joined = df_a.join(df_b, on='one') joined = joined.join(df_c, on='one') assert np.isnan(joined['two']['c']) assert np.isnan(joined['three']['c']) # merge column not p resent pytest.raises(KeyError, target.join, source, on='E') # overlap source_copy = source.copy() source_copy['A'] = 0 pytest.raises(ValueError, target.join, source_copy, on='A') def test_join_on_fails_with_different_right_index(self): with pytest.raises(ValueError): df = DataFrame({ 'a': np.random.choice(['m', 'f'], size=3), 'b': np.random.randn(3) }) df2 = DataFrame( { 'a': np.random.choice(['m', 'f'], size=10), 'b': np.random.randn(10) }, index=tm.makeCustomIndex(10, 2)) merge(df, df2, left_on='a', right_index=True) def test_join_on_fails_with_different_left_index(self): with pytest.raises(ValueError): df = DataFrame( { 'a': np.random.choice(['m', 'f'], size=3), 'b': np.random.randn(3) }, index=tm.makeCustomIndex(10, 2)) df2 = DataFrame({ 'a': np.random.choice(['m', 'f'], size=10), 'b': np.random.randn(10) }) merge(df, df2, right_on='b', left_index=True) def test_join_on_fails_with_different_column_counts(self): with pytest.raises(ValueError): df = DataFrame({ 'a': np.random.choice(['m', 'f'], size=3), 'b': np.random.randn(3) }) df2 = DataFrame( { 'a': np.random.choice(['m', 'f'], size=10), 'b': np.random.randn(10) }, index=tm.makeCustomIndex(10, 2)) merge(df, df2, right_on='a', left_on=['a', 'b']) def test_join_on_fails_with_wrong_object_type(self): # GH12081 wrongly_typed = [Series([0, 1]), 2, 'str', None, np.array([0, 1])] df = DataFrame({'a': [1, 1]}) for obj in wrongly_typed: with tm.assert_raises_regex(ValueError, str(type(obj))): merge(obj, df, left_on='a', right_on='a') with tm.assert_raises_regex(ValueError, str(type(obj))): merge(df, obj, left_on='a', right_on='a') def test_join_on_pass_vector(self): expected = self.target.join(self.source, on='C') del expected['C'] join_col = self.target.pop('C') result = self.target.join(self.source, on=join_col) assert_frame_equal(result, expected) def test_join_with_len0(self): # nothing to merge merged = self.target.join(self.source.reindex([]), on='C') for col in self.source: assert col 
in merged assert merged[col].isna().all() merged2 = self.target.join(self.source.reindex([]), on='C', how='inner') tm.assert_index_equal(merged2.columns, merged.columns) assert len(merged2) == 0 def test_join_on_inner(self): df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']}) df2 = DataFrame({'value': [0, 1]}, index=['a', 'b']) joined = df.join(df2, on='key', how='inner') expected = df.join(df2, on='key') expected = expected[expected['value'].notna()] tm.assert_series_equal(joined['key'], expected['key'], check_dtype=False) tm.assert_series_equal(joined['value'], expected['value'], check_dtype=False) tm.assert_index_equal(joined.index, expected.index) def test_join_on_singlekey_list(self): df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c']) # corner cases joined = df.join(df2, on=['key']) expected = df.join(df2, on='key') assert_frame_equal(joined, expected) def test_join_on_series(self): result = self.target.join(self.source['MergedA'], on='C') expected = self.target.join(self.source[['MergedA']], on='C') assert_frame_equal(result, expected) def test_join_on_series_buglet(self): # GH #638 df = DataFrame({'a': [1, 1]}) ds = Series([2], index=[1], name='b') result = df.join(ds, on='a') expected = DataFrame({'a': [1, 1], 'b': [2, 2]}, index=df.index) tm.assert_frame_equal(result, expected) def test_join_index_mixed(self, join_type): # no overlapping blocks df1 = DataFrame(index=np.arange(10)) df1['bool'] = True df1['string'] = 'foo' df2 = DataFrame(index=np.arange(5, 15)) df2['int'] = 1 df2['float'] = 1. joined = df1.join(df2, how=join_type) expected = _join_by_hand(df1, df2, how=join_type) assert_frame_equal(joined, expected) joined = df2.join(df1, how=join_type) expected = _join_by_hand(df2, df1, how=join_type) assert_frame_equal(joined, expected) def test_join_index_mixed_overlap(self): df1 = DataFrame({ 'A': 1., 'B': 2, 'C': 'foo', 'D': True }, index=np.arange(10), columns=['A', 'B', 'C', 'D']) assert df1['B'].dtype == np.int64 assert df1['D'].dtype == np.bool_ df2 = DataFrame({ 'A': 1., 'B': 2, 'C': 'foo', 'D': True }, index=np.arange(0, 10, 2), columns=['A', 'B', 'C', 'D']) # overlap joined = df1.join(df2, lsuffix='_one', rsuffix='_two') expected_columns = [ 'A_one', 'B_one', 'C_one', 'D_one', 'A_two', 'B_two', 'C_two', 'D_two' ] df1.columns = expected_columns[:4] df2.columns = expected_columns[4:] expected = _join_by_hand(df1, df2) assert_frame_equal(joined, expected) def test_join_empty_bug(self): # generated an exception in 0.4.3 x = DataFrame() x.join(DataFrame([3], index=[0], columns=['A']), how='outer') def test_join_unconsolidated(self): # GH #331 a = DataFrame(randn(30, 2), columns=['a', 'b']) c = Series(randn(30)) a['c'] = c d = DataFrame(randn(30, 1), columns=['q']) # it works! 
a.join(d) d.join(a) def test_join_multiindex(self): index1 = MultiIndex.from_arrays( [['a', 'a', 'a', 'b', 'b', 'b'], [1, 2, 3, 1, 2, 3]], names=['first', 'second']) index2 = MultiIndex.from_arrays( [['b', 'b', 'b', 'c', 'c', 'c'], [1, 2, 3, 1, 2, 3]], names=['first', 'second']) df1 = DataFrame(data=np.random.randn(6), index=index1, columns=['var X']) df2 = DataFrame(data=np.random.randn(6), index=index2, columns=['var Y']) df1 = df1.sort_index(level=0) df2 = df2.sort_index(level=0) joined = df1.join(df2, how='outer') ex_index = Index(index1.values).union(Index(index2.values)) expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) expected.index.names = index1.names assert_frame_equal(joined, expected) assert joined.index.names == index1.names df1 = df1.sort_index(level=1) df2 = df2.sort_index(level=1) joined = df1.join(df2, how='outer').sort_index(level=0) ex_index = Index(index1.values).union(Index(index2.values)) expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) expected.index.names = index1.names assert_frame_equal(joined, expected) assert joined.index.names == index1.names def test_join_inner_multiindex(self): key1 = [ 'bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', 'qux', 'snap' ] key2 = [ 'two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', 'three', 'one' ] data = np.random.randn(len(key1)) data = DataFrame({'key1': key1, 'key2': key2, 'data': data}) index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['first', 'second']) to_join = DataFrame(np.random.randn(10, 3), index=index, columns=['j_one', 'j_two', 'j_three']) joined = data.join(to_join, on=['key1', 'key2'], how='inner') expected = merge(data, to_join.reset_index(), left_on=['key1', 'key2'], right_on=['first', 'second'], how='inner', sort=False) expected2 = merge(to_join, data, right_on=['key1', 'key2'], left_index=True, how='inner', sort=False) assert_frame_equal(joined, expected2.reindex_like(joined)) expected2 = merge(to_join, data, right_on=['key1', 'key2'], left_index=True, how='inner', sort=False) expected = expected.drop(['first', 'second'], axis=1) expected.index = joined.index assert joined.index.is_monotonic assert_frame_equal(joined, expected) # _assert_same_contents(expected, expected2.loc[:, expected.columns]) def test_join_hierarchical_mixed(self): # GH 2024 df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c']) new_df = df.groupby(['a']).agg({'b': [np.mean, np.sum]}) other_df = DataFrame([(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd']) other_df.set_index('a', inplace=True) # GH 9455, 12219 with tm.assert_produces_warning(UserWarning): result = merge(new_df, other_df, left_index=True, right_index=True) assert ('b', 'mean') in result assert 'b' in result def test_join_float64_float32(self): a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64) b = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32) joined = a.join(b) assert joined.dtypes['a'] == 'float64' assert joined.dtypes['b'] == 'float64' assert joined.dtypes['c'] == 'float32' a = np.random.randint(0, 5, 100).astype('int64') b = np.random.random(100).astype('float64') c = np.random.random(100).astype('float32') df = DataFrame({'a': a, 'b': b, 'c': c}) xpdf = DataFrame({'a': a, 'b': b, 'c': c}) s = DataFrame(np.random.random(5).astype('float32'), columns=['md']) rs = df.merge(s, left_on='a', right_index=True) assert rs.dtypes['a'] == 'int64' assert rs.dtypes['b'] == 'float64' assert 
rs.dtypes['c'] == 'float32' assert rs.dtypes['md'] == 'float32' xp = xpdf.merge(s, left_on='a', right_index=True) assert_frame_equal(rs, xp) def test_join_many_non_unique_index(self): df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]}) df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]}) df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]}) idf1 = df1.set_index(["a", "b"]) idf2 = df2.set_index(["a", "b"]) idf3 = df3.set_index(["a", "b"]) result = idf1.join([idf2, idf3], how='outer') df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer') expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer') result = result.reset_index() expected = expected[result.columns] expected['a'] = expected.a.astype('int64') expected['b'] = expected.b.astype('int64') assert_frame_equal(result, expected) df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]}) df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]}) df3 = DataFrame({ "a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000] }) idf1 = df1.set_index(["a", "b"]) idf2 = df2.set_index(["a", "b"]) idf3 = df3.set_index(["a", "b"]) result = idf1.join([idf2, idf3], how='inner') df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner') expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner') result = result.reset_index() assert_frame_equal(result, expected.loc[:, result.columns]) # GH 11519 df = DataFrame({ 'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], 'C': np.random.randn(8), 'D': np.random.randn(8) }) s = Series(np.repeat(np.arange(8), 2), index=np.repeat(np.arange(8), 2), name='TEST') inner = df.join(s, how='inner') outer = df.join(s, how='outer') left = df.join(s, how='left') right = df.join(s, how='right') assert_frame_equal(inner, outer) assert_frame_equal(inner, left) assert_frame_equal(inner, right) def test_join_sort(self): left = DataFrame({ 'key': ['foo', 'bar', 'baz', 'foo'], 'value': [1, 2, 3, 4] }) right = DataFrame({'value2': ['a', 'b', 'c']}, index=['bar', 'baz', 'foo']) joined = left.join(right, on='key', sort=True) expected = DataFrame( { 'key': ['bar', 'baz', 'foo', 'foo'], 'value': [2, 3, 1, 4], 'value2': ['a', 'b', 'c', 'c'] }, index=[1, 2, 0, 3]) assert_frame_equal(joined, expected) # smoke test joined = left.join(right, on='key', sort=False) tm.assert_index_equal(joined.index, pd.Index(lrange(4))) def test_join_mixed_non_unique_index(self): # GH 12814, unorderable types in py3 with a non-unique index df1 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 3, 'a']) df2 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 3, 3, 4]) result = df1.join(df2) expected = DataFrame( { 'a': [1, 2, 3, 3, 4], 'b': [5, np.nan, 6, 7, np.nan] }, index=[1, 2, 3, 3, 'a']) tm.assert_frame_equal(result, expected) df3 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 2, 'a']) df4 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 2, 3, 4]) result = df3.join(df4) expected = DataFrame({ 'a': [1, 2, 3, 4], 'b': [5, 6, 6, np.nan] }, index=[1, 2, 2, 'a']) tm.assert_frame_equal(result, expected) def test_join_non_unique_period_index(self): # GH #16871 index = pd.period_range('2016-01-01', periods=16, freq='M') df = DataFrame([i for i in range(len(index))], index=index, columns=['pnum']) df2 = concat([df, df]) result = df.join(df2, how='inner', rsuffix='_df2') expected = DataFrame(np.tile( np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2), columns=['pnum', 'pnum_df2'], index=df2.sort_index().index) 
tm.assert_frame_equal(result, expected) def test_mixed_type_join_with_suffix(self): # GH #916 df = DataFrame(np.random.randn(20, 6), columns=['a', 'b', 'c', 'd', 'e', 'f']) df.insert(0, 'id', 0) df.insert(5, 'dt', 'foo') grouped = df.groupby('id') mn = grouped.mean() cn = grouped.count() # it works! mn.join(cn, rsuffix='_right') def test_join_many(self): df = DataFrame(np.random.randn(10, 6), columns=list('abcdef')) df_list = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]] joined = df_list[0].join(df_list[1:]) tm.assert_frame_equal(joined, df) df_list = [ df[['a', 'b']][:-2], df[['c', 'd']][2:], df[['e', 'f']][1:9] ] def _check_diff_index(df_list, result, exp_index): reindexed = [x.reindex(exp_index) for x in df_list] expected = reindexed[0].join(reindexed[1:]) tm.assert_frame_equal(result, expected) # different join types joined = df_list[0].join(df_list[1:], how='outer') _check_diff_index(df_list, joined, df.index) joined = df_list[0].join(df_list[1:]) _check_diff_index(df_list, joined, df_list[0].index) joined = df_list[0].join(df_list[1:], how='inner') _check_diff_index(df_list, joined, df.index[2:8]) pytest.raises(ValueError, df_list[0].join, df_list[1:], on='a') def test_join_many_mixed(self): df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D']) df['key'] = ['foo', 'bar'] * 4 df1 = df.loc[:, ['A', 'B']] df2 = df.loc[:, ['C', 'D']] df3 = df.loc[:, ['key']] result = df1.join([df2, df3]) assert_frame_equal(result, df) def test_join_dups(self): # joining dups df = concat([ DataFrame(np.random.randn(10, 4), columns=['A', 'A', 'B', 'B']), DataFrame(np.random.randint(0, 10, size=20).reshape(10, 2), columns=['A', 'C']) ], axis=1) expected = concat([df, df], axis=1) result = df.join(df, rsuffix='_2') result.columns = expected.columns assert_frame_equal(result, expected) # GH 4975, invalid join on dups w = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) x = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) y = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) z = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) dta = x.merge(y, left_index=True, right_index=True).merge(z, left_index=True, right_index=True, how="outer") dta = dta.merge(w, left_index=True, right_index=True) expected = concat([x, y, z, w], axis=1) expected.columns = [ 'x_x', 'y_x', 'x_y', 'y_y', 'x_x', 'y_x', 'x_y', 'y_y' ] assert_frame_equal(dta, expected) def test_panel_join(self): with catch_warnings(record=True): panel = tm.makePanel() tm.add_nans(panel) p1 = panel.iloc[:2, :10, :3] p2 = panel.iloc[2:, 5:, 2:] # left join result = p1.join(p2) expected = p1.copy() expected['ItemC'] = p2['ItemC'] tm.assert_panel_equal(result, expected) # right join result = p1.join(p2, how='right') expected = p2.copy() expected['ItemA'] = p1['ItemA'] expected['ItemB'] = p1['ItemB'] expected = expected.reindex(items=['ItemA', 'ItemB', 'ItemC']) tm.assert_panel_equal(result, expected) # inner join result = p1.join(p2, how='inner') expected = panel.iloc[:, 5:10, 2:3] tm.assert_panel_equal(result, expected) # outer join result = p1.join(p2, how='outer') expected = p1.reindex(major=panel.major_axis, minor=panel.minor_axis) expected = expected.join( p2.reindex(major=panel.major_axis, minor=panel.minor_axis)) tm.assert_panel_equal(result, expected) def test_panel_join_overlap(self): with catch_warnings(record=True): panel = tm.makePanel() tm.add_nans(panel) p1 = panel.loc[['ItemA', 'ItemB', 'ItemC']] p2 = panel.loc[['ItemB', 'ItemC']] # Expected index is # # ItemA, ItemB_p1, ItemC_p1, ItemB_p2, ItemC_p2 joined = 
p1.join(p2, lsuffix='_p1', rsuffix='_p2') p1_suf = p1.loc[['ItemB', 'ItemC']].add_suffix('_p1') p2_suf = p2.loc[['ItemB', 'ItemC']].add_suffix('_p2') no_overlap = panel.loc[['ItemA']] expected = no_overlap.join(p1_suf.join(p2_suf)) tm.assert_panel_equal(joined, expected) def test_panel_join_many(self): with catch_warnings(record=True): tm.K = 10 panel = tm.makePanel() tm.K = 4 panels = [panel.iloc[:2], panel.iloc[2:6], panel.iloc[6:]] joined = panels[0].join(panels[1:]) tm.assert_panel_equal(joined, panel) panels = [ panel.iloc[:2, :-5], panel.iloc[2:6, 2:], panel.iloc[6:, 5:-7] ] data_dict = {} for p in panels: data_dict.update(p.iteritems()) joined = panels[0].join(panels[1:], how='inner') expected = pd.Panel.from_dict(data_dict, intersect=True) tm.assert_panel_equal(joined, expected) joined = panels[0].join(panels[1:], how='outer') expected = pd.Panel.from_dict(data_dict, intersect=False) tm.assert_panel_equal(joined, expected) # edge cases pytest.raises(ValueError, panels[0].join, panels[1:], how='outer', lsuffix='foo', rsuffix='bar') pytest.raises(ValueError, panels[0].join, panels[1:], how='right')
def __pop_safe(dataframe: DataFrame, column_name: str):
    try:
        return dataframe.pop(column_name)
    except KeyError:
        print("DataFrame does not contain such a column: {}".format(column_name))
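# A minimal usage sketch of the helper above, assuming it is a module-level
# function; the frame and column names are illustrative.
import pandas as pd

frame = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
col_a = __pop_safe(frame, "a")      # returns the popped 'a' Series; frame keeps only 'b'
missing = __pop_safe(frame, "z")    # prints the warning and returns None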
def _metadata_to_categories(metadata_df: pd.DataFrame) -> list: """ Add category information to `data_df`'s column headers in the format that Clustergrammer expects: "([Category 1]: [Value 1], [Category 2]: [Value 2], ...)" """ metadata_df = metadata_df.copy() # so don't modify original CLINICAL_FIELD_PREFIX = "arbitrary_trial_specific_clinical_annotations." columns = [] for c in metadata_df.columns: # go through and check cardinality = # unique # also rename the columns to pretty things cardinality = len(metadata_df[c].unique()) if (cardinality > CLUSTERGRAMMER_MAX_CATEGORY_CARDINALITY or cardinality <= 1 or cardinality == metadata_df.shape[0]): # only want if not all the same, not too many, and not each unique to sample if c not in [ "cimac_participant_id", "cohort_name", "collection_event_name", ]: # we want to keep the above no matter what metadata_df.pop(c) continue if "(1=Yes,0=No)" in c: # these are boolean! let's treat them that way metadata_df[c] = metadata_df[c].astype(bool) if c.startswith(CLINICAL_FIELD_PREFIX): # for 10021 participants.csv: ## remove the prefix ## remove any parentheses cat = c[len(CLINICAL_FIELD_PREFIX):] if "(" in cat and ")" in cat and cat.index(")") > cat.index("("): cat = cat.split("(", 1)[0] + cat.rsplit(")", 1)[1] else: # otherwise ## break up underscores ## title case ## drop 'CIDC' / 'CIMAC' anywhere ## drop trailing 'Name' cat = c.replace("_", " ").title().replace("Cidc", "").replace("Cimac", "") if cat.endswith("Name") and not cat == "Name": cat = cat[:-4] # strip so it's pretty! if cat.strip() not in columns: columns.append(cat.strip()) else: # if it's a repeated name, pop it metadata_df.pop(c) metadata_df.columns = columns print("CG Category options:", ", ".join(columns)) # cut down to only the categories we want columns = [ c for c in [ "Participant Id", "Collection Event", "Cohort", "Treatment", "Disease progression", "RECIST clinical benefit status", ] if c in metadata_df.columns ] columns = sorted(columns, key=lambda c: len(metadata_df[c].unique())) metadata_df = metadata_df[columns] if "Disease progression" in columns: columns[columns.index("Disease progression")] = "Disease prog" if "RECIST clinical benefit status" in columns: columns[columns.index( "RECIST clinical benefit status")] = "Clin benefit" metadata_df.columns = columns # build the output str in ClusterGrammer compatible format categories = [] for idx, row in metadata_df.iterrows(): temp = [f"CIMAC Id: {idx}"] for cat, val in row.items(): temp.append(f"{cat}: {val}") categories.append(tuple(temp)) return categories
columns=['Close']) scaler = MinMaxScaler().fit(train2) train2 = DataFrame(scaler.transform(train2), index=train2.index, columns=train2.columns) train2 = pd.concat([train2.shift(i) for i in range(window_width, 0, -1)] + [train2], axis=1)[window_width:] train2.columns = [ column + '-' + str(t) for column, t in zip(train2.columns, range(window_width, 0, -1)) ] + ['Close'] x_train = train2.drop('Close', axis=1).to_numpy() y_train = train2.pop('Close').to_numpy() # x_train = scaler.fit_transform(x_train) # y_train = scaler.fit_transform(y_train.reshape(-1, 1)) x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1)) load_model = True if load_model: with open(json_file, "r") as f: json_model = f.read() model = model_from_json(json_model) model.load_weights(weights_file) else: model = Sequential() model.add(LSTM(40, input_shape=(x_train.shape[1], 1))) # model.add(Dropout(0.3))
def split_predictor(self, data: DataFrame) -> Tuple[DataFrame, Series]:
    predictions = self.tensor_predictor(data.pop("Targets").to_list())
    return data, predictions
def _get_next_state( state: pd.DataFrame, seperation: float, cohesion: float, alignment: float, visibility: float, dimensions: tp.List[str], step: float, ) -> pd.DataFrame: # Self-cross-product Boids for all (center, neighbor) pairs. state["i"] = range(len(state)) state["j"] = 0 pairs = pd.merge( left=state, right=state.add_prefix(prefix="n"), left_on="j", right_on="nj", how="outer", ) # Unpack columns. cols = [ ( f"p{i}", # Positions f"v{i}", # Velocitys f"np{i}", # Neighbor positions. f"nv{i}", # Neighbor velocitys. f"nd{i}", # Neighbor distances. ) for i in dimensions ] p, v, np, nv, nd = map(list, zip(*cols)) # For each dimension: for pi, npi, ndi in zip(p, np, nd): # Compute neighbor-to-center translations. pairs[ndi] = pairs[pi] - pairs[npi] # Compute neighbor-to-center distances. ndmag = pairs[nd].pow(2).sum(axis=1).pow(0.5) # Subset pairs to visible neighbors. pairs = pairs.loc[ndmag.le(visibility)] # For each dimension: for ndi in nd: # Transform neighbor-to-center translations to repulsions. pairs[ndi] /= ndmag.pow(2) # Compute neighbor velocity magnitudes. nvmag = pairs[nv].pow(2).sum(axis=1).pow(0.5) # For each dimension: for nvi in nv: # Transform neighbor velocities to (unit) neighbor directions. pairs[nvi] /= nvmag pairs[nvi].where(cond=nvmag.gt(0), other=0, inplace=True) # Nullify neighbors that are centers. pairs.loc[pairs["i"] == pairs["ni"], [*np, *nv, *nd]] = None # Augment repulsor behaviour. centers = pairs["t"].eq("repulsor") pairs.loc[centers, np] = None pairs.loc[centers, nv] = None pairs.loc[centers, nd] = None neighbors = pairs["nt"].eq("repulsor") pairs.loc[neighbors, np] = None pairs.loc[neighbors, nv] = None pairs.loc[neighbors, nd] *= 30 # Aggregate neighbor information per center Boid. agg_last = {col: "last" for col in ("t", *p, *v)} agg_mean = {col: "mean" for col in (*np, *nv, *nd)} agg = {**agg_last, **agg_mean} groups = pairs.groupby(by="i", as_index=False, sort=False) state = groups.agg(func=agg).drop(columns="i") # For each dimension: for pi, npi in zip(p, np): # Transform mean-neighbor positions to center-to-mean-neighbor translations. state[npi] -= state[pi] # For each dimension: for pi, vi, npi, nvi, ndi in zip(p, v, np, nv, nd): # Compute accelerations. ai = 0 ai += seperation * state.pop(ndi).where(cond=pd.notnull, other=0) ai += cohesion * state.pop(npi).where(cond=pd.notnull, other=0) ai += alignment * state.pop(nvi).where(cond=pd.notnull, other=0) # Update velocities and positions. state[vi] += ai * step**2 state[pi] += state[vi] * step return state
def create_frame_from_record( record: pd.DataFrame, col_start_dt: str, col_end_dt: str, frequency: str, col_date_nm: str, **kwargs, ) -> pd.DataFrame: """ Create a frame with a date colum ranging from the start_dt to the end_dt from a record. This function is to be used when you have a record of data from a DataFrame and you want to expand the data out over a date range. Note the DataFrame must contain a column for a start date and an end date. All columns from the original record will be duplicated the length of the date range. See example for usage. If wanting to do this for many records use the function ``footings.actuarial_tools.expand_frame_per_record``. When doing one record ``create_frame_from_record`` is faster than expand_frame_per_record. Parameters ---------- record : pd.DataFrame A single record from a DataFrame. col_start_dt : str The name of the start date column. col_end_dt : str The name of the end date column. frequency : str The frequency at which records are created. col_date_nm : str The column name to assign the date column. kwargs : See kwargs under footings.actuarial_tools.create_frame Returns ------- pandas.DataFrame A DataFrame with a date column, any passed kwargs, and columns from the record. Raises ------ ValueError If the length of record is not 1. See Also -------- footings.actuarial_tools.create_frame footings.actuarial_tools.expand_frame_per_record Examples -------- >>> import pandas as pd >>> from footings.actuarial_tools import create_frame_from_record >>> record = pd.DataFrame( >>> { >>> "POLICY": ["P1"], >>> "GENDER": ["M"], >>> "START_DATE": [pd.Timestamp("2020-01-10")], >>> "END_DATE": [pd.Timestamp("2020-05-30")] >>> } >>> ) >>> frame = create_frame_from_record( >>> record=record, >>> col_start_dt="START_DATE", >>> col_end_dt="END_DATE", >>> frequency="M", >>> col_date_nm="DATE", >>> duration_month="DURATION_MONTH" >>> ) >>> frame >>> # DATE DURATION_MONTH POLICY GENDER >>> # 0 2020-01-10 1 P1 M >>> # 1 2020-02-10 2 P1 M >>> # 2 2020-03-10 3 P1 M >>> # 3 2020-04-10 4 P1 M >>> # 4 2020-05-10 5 P1 M >>> # 5 2020-06-10 6 P1 M """ record = record.to_dict(orient="records") if len(record) != 1: msg = f"The record must be a pd.DataFrame with one row. The record pass has {len(record)} rows." raise ValueError(msg) record = record[0] start_dt = record[col_start_dt] record.pop(col_start_dt) end_dt = record[col_end_dt] record.pop(col_end_dt) return create_frame( start_dt=start_dt, end_dt=end_dt, frequency=frequency, col_date_nm=col_date_nm, **kwargs, ).assign(**record)
def remove_fea(df: pd.DataFrame, fea_list: list):
    # Drop the listed feature columns from df in place (DataFrame.pop mutates the frame).
    for fea in fea_list:
        df.pop(fea)
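# A minimal usage sketch: remove_fea mutates the frame in place and returns None;
# the frame and column names below are illustrative.
import pandas as pd

df = pd.DataFrame({"f1": [1, 2], "f2": [3, 4], "target": [0, 1]})
remove_fea(df, ["f1", "f2"])
# df now contains only the 'target' column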
class TestJoin(tm.TestCase): _multiprocess_can_split_ = True def setUp(self): # aggregate multiple columns self.df = DataFrame({'key1': get_test_data(), 'key2': get_test_data(), 'data1': np.random.randn(N), 'data2': np.random.randn(N)}) # exclude a couple keys for fun self.df = self.df[self.df['key2'] > 1] self.df2 = DataFrame({'key1': get_test_data(n=N // 5), 'key2': get_test_data(ngroups=NGROUPS // 2, n=N // 5), 'value': np.random.randn(N // 5)}) index, data = tm.getMixedTypeDict() self.target = DataFrame(data, index=index) # Join on string value self.source = DataFrame({'MergedA': data['A'], 'MergedD': data['D']}, index=data['C']) def test_cython_left_outer_join(self): left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) max_group = 5 ls, rs = _join.left_outer_join(left, right, max_group) exp_ls = left.argsort(kind='mergesort') exp_rs = right.argsort(kind='mergesort') exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 10]) exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5, -1, -1]) exp_ls = exp_ls.take(exp_li) exp_ls[exp_li == -1] = -1 exp_rs = exp_rs.take(exp_ri) exp_rs[exp_ri == -1] = -1 self.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) self.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) def test_cython_right_outer_join(self): left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) max_group = 5 rs, ls = _join.left_outer_join(right, left, max_group) exp_ls = left.argsort(kind='mergesort') exp_rs = right.argsort(kind='mergesort') # 0 1 1 1 exp_li = a_([0, 1, 2, 3, 4, 5, 3, 4, 5, 3, 4, 5, # 2 2 4 6, 7, 8, 6, 7, 8, -1]) exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6]) exp_ls = exp_ls.take(exp_li) exp_ls[exp_li == -1] = -1 exp_rs = exp_rs.take(exp_ri) exp_rs[exp_ri == -1] = -1 self.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) self.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) def test_cython_inner_join(self): left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64) max_group = 5 ls, rs = _join.inner_join(left, right, max_group) exp_ls = left.argsort(kind='mergesort') exp_rs = right.argsort(kind='mergesort') exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8]) exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5]) exp_ls = exp_ls.take(exp_li) exp_ls[exp_li == -1] = -1 exp_rs = exp_rs.take(exp_ri) exp_rs[exp_ri == -1] = -1 self.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) self.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) def test_left_outer_join(self): joined_key2 = merge(self.df, self.df2, on='key2') _check_join(self.df, self.df2, joined_key2, ['key2'], how='left') joined_both = merge(self.df, self.df2) _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], how='left') def test_right_outer_join(self): joined_key2 = merge(self.df, self.df2, on='key2', how='right') _check_join(self.df, self.df2, joined_key2, ['key2'], how='right') joined_both = merge(self.df, self.df2, how='right') _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], how='right') def test_full_outer_join(self): joined_key2 = merge(self.df, self.df2, on='key2', how='outer') _check_join(self.df, self.df2, joined_key2, ['key2'], how='outer') joined_both = merge(self.df, self.df2, how='outer') _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], how='outer') def test_inner_join(self): 
joined_key2 = merge(self.df, self.df2, on='key2', how='inner') _check_join(self.df, self.df2, joined_key2, ['key2'], how='inner') joined_both = merge(self.df, self.df2, how='inner') _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], how='inner') def test_handle_overlap(self): joined = merge(self.df, self.df2, on='key2', suffixes=['.foo', '.bar']) self.assertIn('key1.foo', joined) self.assertIn('key1.bar', joined) def test_handle_overlap_arbitrary_key(self): joined = merge(self.df, self.df2, left_on='key2', right_on='key1', suffixes=['.foo', '.bar']) self.assertIn('key1.foo', joined) self.assertIn('key2.bar', joined) def test_join_on(self): target = self.target source = self.source merged = target.join(source, on='C') self.assert_series_equal(merged['MergedA'], target['A'], check_names=False) self.assert_series_equal(merged['MergedD'], target['D'], check_names=False) # join with duplicates (fix regression from DataFrame/Matrix merge) df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c']) joined = df.join(df2, on='key') expected = DataFrame({'key': ['a', 'a', 'b', 'b', 'c'], 'value': [0, 0, 1, 1, 2]}) assert_frame_equal(joined, expected) # Test when some are missing df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'], columns=['one']) df_b = DataFrame([['foo'], ['bar']], index=[1, 2], columns=['two']) df_c = DataFrame([[1], [2]], index=[1, 2], columns=['three']) joined = df_a.join(df_b, on='one') joined = joined.join(df_c, on='one') self.assertTrue(np.isnan(joined['two']['c'])) self.assertTrue(np.isnan(joined['three']['c'])) # merge column not p resent self.assertRaises(KeyError, target.join, source, on='E') # overlap source_copy = source.copy() source_copy['A'] = 0 self.assertRaises(ValueError, target.join, source_copy, on='A') def test_join_on_fails_with_different_right_index(self): with tm.assertRaises(ValueError): df = DataFrame({'a': np.random.choice(['m', 'f'], size=3), 'b': np.random.randn(3)}) df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10), 'b': np.random.randn(10)}, index=tm.makeCustomIndex(10, 2)) merge(df, df2, left_on='a', right_index=True) def test_join_on_fails_with_different_left_index(self): with tm.assertRaises(ValueError): df = DataFrame({'a': np.random.choice(['m', 'f'], size=3), 'b': np.random.randn(3)}, index=tm.makeCustomIndex(10, 2)) df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10), 'b': np.random.randn(10)}) merge(df, df2, right_on='b', left_index=True) def test_join_on_fails_with_different_column_counts(self): with tm.assertRaises(ValueError): df = DataFrame({'a': np.random.choice(['m', 'f'], size=3), 'b': np.random.randn(3)}) df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10), 'b': np.random.randn(10)}, index=tm.makeCustomIndex(10, 2)) merge(df, df2, right_on='a', left_on=['a', 'b']) def test_join_on_fails_with_wrong_object_type(self): # GH12081 wrongly_typed = [Series([0, 1]), 2, 'str', None, np.array([0, 1])] df = DataFrame({'a': [1, 1]}) for obj in wrongly_typed: with tm.assertRaisesRegexp(ValueError, str(type(obj))): merge(obj, df, left_on='a', right_on='a') with tm.assertRaisesRegexp(ValueError, str(type(obj))): merge(df, obj, left_on='a', right_on='a') def test_join_on_pass_vector(self): expected = self.target.join(self.source, on='C') del expected['C'] join_col = self.target.pop('C') result = self.target.join(self.source, on=join_col) assert_frame_equal(result, expected) def test_join_with_len0(self): # nothing to merge merged = 
self.target.join(self.source.reindex([]), on='C') for col in self.source: self.assertIn(col, merged) self.assertTrue(merged[col].isnull().all()) merged2 = self.target.join(self.source.reindex([]), on='C', how='inner') self.assert_index_equal(merged2.columns, merged.columns) self.assertEqual(len(merged2), 0) def test_join_on_inner(self): df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']}) df2 = DataFrame({'value': [0, 1]}, index=['a', 'b']) joined = df.join(df2, on='key', how='inner') expected = df.join(df2, on='key') expected = expected[expected['value'].notnull()] self.assert_series_equal(joined['key'], expected['key'], check_dtype=False) self.assert_series_equal(joined['value'], expected['value'], check_dtype=False) self.assert_index_equal(joined.index, expected.index) def test_join_on_singlekey_list(self): df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c']) # corner cases joined = df.join(df2, on=['key']) expected = df.join(df2, on='key') assert_frame_equal(joined, expected) def test_join_on_series(self): result = self.target.join(self.source['MergedA'], on='C') expected = self.target.join(self.source[['MergedA']], on='C') assert_frame_equal(result, expected) def test_join_on_series_buglet(self): # GH #638 df = DataFrame({'a': [1, 1]}) ds = Series([2], index=[1], name='b') result = df.join(ds, on='a') expected = DataFrame({'a': [1, 1], 'b': [2, 2]}, index=df.index) tm.assert_frame_equal(result, expected) def test_join_index_mixed(self): df1 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True}, index=np.arange(10), columns=['A', 'B', 'C', 'D']) self.assertEqual(df1['B'].dtype, np.int64) self.assertEqual(df1['D'].dtype, np.bool_) df2 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True}, index=np.arange(0, 10, 2), columns=['A', 'B', 'C', 'D']) # overlap joined = df1.join(df2, lsuffix='_one', rsuffix='_two') expected_columns = ['A_one', 'B_one', 'C_one', 'D_one', 'A_two', 'B_two', 'C_two', 'D_two'] df1.columns = expected_columns[:4] df2.columns = expected_columns[4:] expected = _join_by_hand(df1, df2) assert_frame_equal(joined, expected) # no overlapping blocks df1 = DataFrame(index=np.arange(10)) df1['bool'] = True df1['string'] = 'foo' df2 = DataFrame(index=np.arange(5, 15)) df2['int'] = 1 df2['float'] = 1. for kind in ['inner', 'outer', 'left', 'right']: joined = df1.join(df2, how=kind) expected = _join_by_hand(df1, df2, how=kind) assert_frame_equal(joined, expected) joined = df2.join(df1, how=kind) expected = _join_by_hand(df2, df1, how=kind) assert_frame_equal(joined, expected) def test_join_empty_bug(self): # generated an exception in 0.4.3 x = DataFrame() x.join(DataFrame([3], index=[0], columns=['A']), how='outer') def test_join_unconsolidated(self): # GH #331 a = DataFrame(randn(30, 2), columns=['a', 'b']) c = Series(randn(30)) a['c'] = c d = DataFrame(randn(30, 1), columns=['q']) # it works! 
a.join(d) d.join(a) def test_join_multiindex(self): index1 = MultiIndex.from_arrays([['a', 'a', 'a', 'b', 'b', 'b'], [1, 2, 3, 1, 2, 3]], names=['first', 'second']) index2 = MultiIndex.from_arrays([['b', 'b', 'b', 'c', 'c', 'c'], [1, 2, 3, 1, 2, 3]], names=['first', 'second']) df1 = DataFrame(data=np.random.randn(6), index=index1, columns=['var X']) df2 = DataFrame(data=np.random.randn(6), index=index2, columns=['var Y']) df1 = df1.sortlevel(0) df2 = df2.sortlevel(0) joined = df1.join(df2, how='outer') ex_index = index1._tuple_index.union(index2._tuple_index) expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) expected.index.names = index1.names assert_frame_equal(joined, expected) self.assertEqual(joined.index.names, index1.names) df1 = df1.sortlevel(1) df2 = df2.sortlevel(1) joined = df1.join(df2, how='outer').sortlevel(0) ex_index = index1._tuple_index.union(index2._tuple_index) expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) expected.index.names = index1.names assert_frame_equal(joined, expected) self.assertEqual(joined.index.names, index1.names) def test_join_inner_multiindex(self): key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', 'qux', 'snap'] key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', 'three', 'one'] data = np.random.randn(len(key1)) data = DataFrame({'key1': key1, 'key2': key2, 'data': data}) index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['first', 'second']) to_join = DataFrame(np.random.randn(10, 3), index=index, columns=['j_one', 'j_two', 'j_three']) joined = data.join(to_join, on=['key1', 'key2'], how='inner') expected = merge(data, to_join.reset_index(), left_on=['key1', 'key2'], right_on=['first', 'second'], how='inner', sort=False) expected2 = merge(to_join, data, right_on=['key1', 'key2'], left_index=True, how='inner', sort=False) assert_frame_equal(joined, expected2.reindex_like(joined)) expected2 = merge(to_join, data, right_on=['key1', 'key2'], left_index=True, how='inner', sort=False) expected = expected.drop(['first', 'second'], axis=1) expected.index = joined.index self.assertTrue(joined.index.is_monotonic) assert_frame_equal(joined, expected) # _assert_same_contents(expected, expected2.ix[:, expected.columns]) def test_join_hierarchical_mixed(self): # GH 2024 df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c']) new_df = df.groupby(['a']).agg({'b': [np.mean, np.sum]}) other_df = DataFrame( [(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd']) other_df.set_index('a', inplace=True) # GH 9455, 12219 with tm.assert_produces_warning(UserWarning): result = merge(new_df, other_df, left_index=True, right_index=True) self.assertTrue(('b', 'mean') in result) self.assertTrue('b' in result) def test_join_float64_float32(self): a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64) b = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32) joined = a.join(b) self.assertEqual(joined.dtypes['a'], 'float64') self.assertEqual(joined.dtypes['b'], 'float64') self.assertEqual(joined.dtypes['c'], 'float32') a = np.random.randint(0, 5, 100).astype('int64') b = np.random.random(100).astype('float64') c = np.random.random(100).astype('float32') df = DataFrame({'a': a, 'b': b, 'c': c}) xpdf = DataFrame({'a': a, 'b': b, 'c': c}) s = DataFrame(np.random.random(5).astype('float32'), columns=['md']) rs = df.merge(s, left_on='a', right_index=True) self.assertEqual(rs.dtypes['a'], 'int64') 
self.assertEqual(rs.dtypes['b'], 'float64') self.assertEqual(rs.dtypes['c'], 'float32') self.assertEqual(rs.dtypes['md'], 'float32') xp = xpdf.merge(s, left_on='a', right_index=True) assert_frame_equal(rs, xp) def test_join_many_non_unique_index(self): df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]}) df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]}) df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]}) idf1 = df1.set_index(["a", "b"]) idf2 = df2.set_index(["a", "b"]) idf3 = df3.set_index(["a", "b"]) result = idf1.join([idf2, idf3], how='outer') df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer') expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer') result = result.reset_index() expected = expected[result.columns] expected['a'] = expected.a.astype('int64') expected['b'] = expected.b.astype('int64') assert_frame_equal(result, expected) df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]}) df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]}) df3 = DataFrame( {"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]}) idf1 = df1.set_index(["a", "b"]) idf2 = df2.set_index(["a", "b"]) idf3 = df3.set_index(["a", "b"]) result = idf1.join([idf2, idf3], how='inner') df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner') expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner') result = result.reset_index() assert_frame_equal(result, expected.ix[:, result.columns]) # GH 11519 df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], 'C': np.random.randn(8), 'D': np.random.randn(8)}) s = Series(np.repeat(np.arange(8), 2), index=np.repeat(np.arange(8), 2), name='TEST') inner = df.join(s, how='inner') outer = df.join(s, how='outer') left = df.join(s, how='left') right = df.join(s, how='right') assert_frame_equal(inner, outer) assert_frame_equal(inner, left) assert_frame_equal(inner, right) def test_join_sort(self): left = DataFrame({'key': ['foo', 'bar', 'baz', 'foo'], 'value': [1, 2, 3, 4]}) right = DataFrame({'value2': ['a', 'b', 'c']}, index=['bar', 'baz', 'foo']) joined = left.join(right, on='key', sort=True) expected = DataFrame({'key': ['bar', 'baz', 'foo', 'foo'], 'value': [2, 3, 1, 4], 'value2': ['a', 'b', 'c', 'c']}, index=[1, 2, 0, 3]) assert_frame_equal(joined, expected) # smoke test joined = left.join(right, on='key', sort=False) self.assert_index_equal(joined.index, pd.Index(lrange(4))) def test_join_mixed_non_unique_index(self): # GH 12814, unorderable types in py3 with a non-unique index df1 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 3, 'a']) df2 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 3, 3, 4]) result = df1.join(df2) expected = DataFrame({'a': [1, 2, 3, 3, 4], 'b': [5, np.nan, 6, 7, np.nan]}, index=[1, 2, 3, 3, 'a']) tm.assert_frame_equal(result, expected) df3 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 2, 'a']) df4 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 2, 3, 4]) result = df3.join(df4) expected = DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 6, np.nan]}, index=[1, 2, 2, 'a']) tm.assert_frame_equal(result, expected) def test_mixed_type_join_with_suffix(self): # GH #916 df = DataFrame(np.random.randn(20, 6), columns=['a', 'b', 'c', 'd', 'e', 'f']) df.insert(0, 'id', 0) df.insert(5, 'dt', 'foo') grouped = df.groupby('id') mn = grouped.mean() cn = grouped.count() # it works! 
mn.join(cn, rsuffix='_right') def test_join_many(self): df = DataFrame(np.random.randn(10, 6), columns=list('abcdef')) df_list = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]] joined = df_list[0].join(df_list[1:]) tm.assert_frame_equal(joined, df) df_list = [df[['a', 'b']][:-2], df[['c', 'd']][2:], df[['e', 'f']][1:9]] def _check_diff_index(df_list, result, exp_index): reindexed = [x.reindex(exp_index) for x in df_list] expected = reindexed[0].join(reindexed[1:]) tm.assert_frame_equal(result, expected) # different join types joined = df_list[0].join(df_list[1:], how='outer') _check_diff_index(df_list, joined, df.index) joined = df_list[0].join(df_list[1:]) _check_diff_index(df_list, joined, df_list[0].index) joined = df_list[0].join(df_list[1:], how='inner') _check_diff_index(df_list, joined, df.index[2:8]) self.assertRaises(ValueError, df_list[0].join, df_list[1:], on='a') def test_join_many_mixed(self): df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D']) df['key'] = ['foo', 'bar'] * 4 df1 = df.ix[:, ['A', 'B']] df2 = df.ix[:, ['C', 'D']] df3 = df.ix[:, ['key']] result = df1.join([df2, df3]) assert_frame_equal(result, df) def test_join_dups(self): # joining dups df = concat([DataFrame(np.random.randn(10, 4), columns=['A', 'A', 'B', 'B']), DataFrame(np.random.randint(0, 10, size=20) .reshape(10, 2), columns=['A', 'C'])], axis=1) expected = concat([df, df], axis=1) result = df.join(df, rsuffix='_2') result.columns = expected.columns assert_frame_equal(result, expected) # GH 4975, invalid join on dups w = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) x = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) y = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) z = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) dta = x.merge(y, left_index=True, right_index=True).merge( z, left_index=True, right_index=True, how="outer") dta = dta.merge(w, left_index=True, right_index=True) expected = concat([x, y, z, w], axis=1) expected.columns = ['x_x', 'y_x', 'x_y', 'y_y', 'x_x', 'y_x', 'x_y', 'y_y'] assert_frame_equal(dta, expected) def test_panel_join(self): panel = tm.makePanel() tm.add_nans(panel) p1 = panel.ix[:2, :10, :3] p2 = panel.ix[2:, 5:, 2:] # left join result = p1.join(p2) expected = p1.copy() expected['ItemC'] = p2['ItemC'] tm.assert_panel_equal(result, expected) # right join result = p1.join(p2, how='right') expected = p2.copy() expected['ItemA'] = p1['ItemA'] expected['ItemB'] = p1['ItemB'] expected = expected.reindex(items=['ItemA', 'ItemB', 'ItemC']) tm.assert_panel_equal(result, expected) # inner join result = p1.join(p2, how='inner') expected = panel.ix[:, 5:10, 2:3] tm.assert_panel_equal(result, expected) # outer join result = p1.join(p2, how='outer') expected = p1.reindex(major=panel.major_axis, minor=panel.minor_axis) expected = expected.join(p2.reindex(major=panel.major_axis, minor=panel.minor_axis)) tm.assert_panel_equal(result, expected) def test_panel_join_overlap(self): panel = tm.makePanel() tm.add_nans(panel) p1 = panel.ix[['ItemA', 'ItemB', 'ItemC']] p2 = panel.ix[['ItemB', 'ItemC']] # Expected index is # # ItemA, ItemB_p1, ItemC_p1, ItemB_p2, ItemC_p2 joined = p1.join(p2, lsuffix='_p1', rsuffix='_p2') p1_suf = p1.ix[['ItemB', 'ItemC']].add_suffix('_p1') p2_suf = p2.ix[['ItemB', 'ItemC']].add_suffix('_p2') no_overlap = panel.ix[['ItemA']] expected = no_overlap.join(p1_suf.join(p2_suf)) tm.assert_panel_equal(joined, expected) def test_panel_join_many(self): tm.K = 10 panel = tm.makePanel() tm.K = 4 panels = [panel.ix[:2], panel.ix[2:6], 
panel.ix[6:]] joined = panels[0].join(panels[1:]) tm.assert_panel_equal(joined, panel) panels = [panel.ix[:2, :-5], panel.ix[2:6, 2:], panel.ix[6:, 5:-7]] data_dict = {} for p in panels: data_dict.update(p.iteritems()) joined = panels[0].join(panels[1:], how='inner') expected = pd.Panel.from_dict(data_dict, intersect=True) tm.assert_panel_equal(joined, expected) joined = panels[0].join(panels[1:], how='outer') expected = pd.Panel.from_dict(data_dict, intersect=False) tm.assert_panel_equal(joined, expected) # edge cases self.assertRaises(ValueError, panels[0].join, panels[1:], how='outer', lsuffix='foo', rsuffix='bar') self.assertRaises(ValueError, panels[0].join, panels[1:], how='right')
def functional_connectivity(data: pd.DataFrame, metric: str = 'cov', **kwargs) -> np.ndarray:
    """Calculate functional connectivity of node timeseries in data.

    Parameters
    ----------
    data
        Pandas dataframe containing the simulation results.
    metric
        Type of connectivity measurement that should be used.
            - `cov` for covariance (uses `np.cov`)
            - `corr` for Pearson correlation (uses `np.corrcoef`)
            - `csd` for cross-spectral density (uses `mne.time_frequency.csd_array_morlet`)
            - `coh` for coherence (uses `mne.connectivity.spectral_connectivity`)
            - `cohy` for coherency (uses `mne.connectivity.spectral_connectivity`)
            - `imcoh` for imaginary coherence (uses `mne.connectivity.spectral_connectivity`)
            - `plv` for phase locking value (uses `mne.connectivity.spectral_connectivity`)
            - `ppc` for pairwise phase consistency (uses `mne.connectivity.spectral_connectivity`)
            - `pli` for phase lag index (uses `mne.connectivity.spectral_connectivity`)
            - `pli2_unbiased` for unbiased estimate of squared phase lag index
              (uses `mne.connectivity.spectral_connectivity`)
            - `wpli` for weighted phase lag index (uses `mne.connectivity.spectral_connectivity`)
            - `wpli2_debiased` for debiased weighted phase lag index
              (uses `mne.connectivity.spectral_connectivity`)
    kwargs
        Additional keyword arguments passed to the respective function used for fc calculation.

    Returns
    -------
    np.ndarray
        Pairwise functional connectivity
    """

    if 'time' in data.columns.values:
        idx = data.pop('time')
        data.index = idx

    # calculate functional connectivity
    ###################################

    if metric == 'cov':
        # covariance
        fc = np.cov(data.values.T, **kwargs)

    elif metric == 'corr':
        # Pearson correlation coefficient
        fc = np.corrcoef(data.values.T, **kwargs)

    elif metric == 'csd':
        from mne.time_frequency import csd_array_morlet
        fc = np.abs(csd_array_morlet(X=np.reshape(data.values, (1, data.shape[1], data.shape[0])),
                                     sfreq=1. / (data.index[1] - data.index[0]),
                                     ch_names=data.columns.values, **kwargs).mean().get_data())

    elif metric in ('coh', 'cohy', 'imcoh', 'plv', 'ppc', 'pli',
                    'pli2_unbiased', 'wpli', 'wpli2_debiased'):
        # phase-based connectivity/synchronization measurement
        from mne.connectivity import spectral_connectivity
        fc, _, _, _, _ = spectral_connectivity(np.reshape(data.values.T, (1, data.shape[1], data.shape[0])),
                                               method=metric,
                                               sfreq=1. / (data.index[1] - data.index[0]),
                                               **kwargs)
        fc = fc.squeeze()

    else:
        raise ValueError(f'FC metric is not supported by this function: {metric}. Check the documentation of the '
                         f'argument `metric` for valid options.')

    return fc
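# A minimal usage sketch of functional_connectivity with the 'corr' metric,
# assuming a plain time-by-node frame; the node names and values are illustrative.
import numpy as np
import pandas as pd

ts = pd.DataFrame({
    "time": np.arange(0.0, 1.0, 0.1),
    "n1": np.sin(np.arange(10)),
    "n2": np.cos(np.arange(10)),
})
fc = functional_connectivity(ts, metric="corr")
# fc is a 2x2 array of pairwise Pearson correlations; note that the 'time'
# column is popped from ts and used as its index as a side effect.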
def labeling(self, data: pd.DataFrame) -> pd.Series:
    labels = data['price_range'].apply(lambda x: 0 if x in [0, 1] else 1)
    self.classes = [0, 1]
    data.pop('price_range')
    return labels
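# A quick usage sketch; `clf` stands in for a hypothetical instance of the class
# that owns labeling, and the frame contents are illustrative. Note the side
# effect: 'price_range' is popped from the caller's frame.
import pandas as pd

data = pd.DataFrame({"price_range": [0, 1, 2, 3], "battery_power": [842, 1021, 563, 615]})
labels = clf.labeling(data)
# labels -> 0, 0, 1, 1 and data no longer has a 'price_range' column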
def explode_id_list(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    df = df.join(pd.DataFrame(df.pop(column_name).to_list()))
    return df
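# A minimal usage sketch of explode_id_list; the frame is illustrative. Each list
# element becomes a positionally named column joined back on the original index.
import pandas as pd

df = pd.DataFrame({"name": ["x", "y"], "ids": [[1, 2], [3]]})
wide = explode_id_list(df, "ids")
# wide:
#   name  0    1
# 0    x  1  2.0
# 1    y  3  NaN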
# Fill missing f2 values with the most frequent value
age_maxf = df['f2'].value_counts().index[0]
df['f2'].fillna(age_maxf, inplace=True)

# Select the rows of the Weight column whose unit is lbs
rows_with_lbs = df['Weight'].str.contains('lbs').fillna(False)

# Convert lbs to kgs (2.2 lbs = 1 kg)
for i, lbs_row in df[rows_with_lbs].iterrows():
    # Keep everything up to the last three characters, i.e. strip the 'lbs' suffix
    weight = int(float(lbs_row['Weight'][:-3]) / 2.2)
    df.at[i, 'Weight'] = '{}kgs'.format(weight)

# Fill missing Weight values with the most frequent value
weight_maxf = df['Weight'].value_counts().index[0]
df['Weight'].fillna(weight_maxf, inplace=True)

# Split the Name column at the first space; expand=True returns the parts as DataFrame columns
df[['FirstName', 'LastName']] = df['Name'].str.split(' ', n=1, expand=True)

# Drop the original Name column
df.drop(['Name'], axis=1, inplace=True)

# Move the FirstName and LastName columns to positions 1 and 2
first_name = df.pop('FirstName')
df.insert(1, 'FirstName', first_name)
last_name = df.pop('LastName')
df.insert(2, 'LastName', last_name)
print(df)
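# A minimal sketch of the pop/insert column-reordering pattern used above, on an
# illustrative three-column frame.
import pandas as pd

df = pd.DataFrame({"A": [1], "B": [2], "C": [3]})
c = df.pop("C")        # remove 'C' from the frame ...
df.insert(0, "C", c)   # ... and re-insert it as the first column
# df.columns -> ['C', 'A', 'B']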
def __get_empty_price_symbols(all_prices_df: pd.DataFrame):
    all_prices_df.index = all_prices_df.pop('symbol')
    all_prices_df = all_prices_df.mean(axis=1)
    return list(all_prices_df[all_prices_df == 0].index)
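# A minimal usage sketch, assuming the function above is a module-level helper;
# symbols and prices are illustrative. Rows whose mean price across all price
# columns is exactly zero are reported as empty.
import pandas as pd

prices = pd.DataFrame({
    "symbol": ["AAA", "BBB"],
    "day1": [0.0, 10.0],
    "day2": [0.0, 12.0],
})
__get_empty_price_symbols(prices)   # -> ['AAA']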
def __init__(self, data: pd.DataFrame, primary_id_column: str, output_column_name: str) -> None:
    self._data = data
    self._id_column = primary_id_column
    self._id_series = data.pop(self._id_column)
    self._output_column_name = output_column_name
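# A minimal usage sketch; `Wrapper` is a hypothetical name for the class whose
# __init__ is shown above, and the frame is illustrative. Note that popping the
# id column mutates the DataFrame passed in by the caller.
import pandas as pd

df = pd.DataFrame({"id": [1, 2], "x": [0.1, 0.2]})
obj = Wrapper(df, primary_id_column="id", output_column_name="score")
# df no longer contains 'id'; obj._id_series holds the popped column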
def _tidy_up_df(self, df: pd.DataFrame, dataset) -> pd.DataFrame: """ Implementation of _tidy_up_df for DWD Observations :param df: untidy DataFrame :param dataset: dataset enumeration :return: tidied DataFrame """ droppable_columns = [ # Hourly # Cloud type DwdObservationParameter.HOURLY.CLOUD_TYPE. CLOUD_TYPE_LAYER1_ABBREVIATION.value, DwdObservationParameter.HOURLY.CLOUD_TYPE. CLOUD_TYPE_LAYER2_ABBREVIATION.value, DwdObservationParameter.HOURLY.CLOUD_TYPE. CLOUD_TYPE_LAYER3_ABBREVIATION.value, DwdObservationParameter.HOURLY.CLOUD_TYPE. CLOUD_TYPE_LAYER4_ABBREVIATION.value, # Cloudiness DwdObservationParameter.HOURLY.CLOUDINESS. CLOUD_COVER_TOTAL_INDICATOR.value, # Solar DwdObservationParameter.HOURLY.SOLAR.END_OF_INTERVAL.value, DwdObservationParameter.HOURLY.SOLAR.TRUE_LOCAL_TIME.value, # Visibility DwdObservationParameter.HOURLY.VISIBILITY. VISIBILITY_RANGE_INDICATOR.value, # Weather DwdObservationParameter.HOURLY.WEATHER_PHENOMENA.WEATHER_TEXT. value, ] # Drop string columns, can't be coerced to float df = df.drop( columns=droppable_columns, errors="ignore", ) resolution = self.sr.stations.resolution if dataset == DwdObservationDataset.CLIMATE_SUMMARY: if resolution == Resolution.DAILY: quality_wind = df.pop(DwdObservationParameter.DAILY. CLIMATE_SUMMARY.QUALITY_WIND.value) quality_general = df.pop(DwdObservationParameter.DAILY. CLIMATE_SUMMARY.QUALITY_GENERAL.value) quality = pd.concat([ pd.Series(repeat(quality_wind.tolist(), 2)).explode(), pd.Series(repeat(quality_general.tolist(), 12)).explode(), ]) elif resolution in (Resolution.MONTHLY, Resolution.ANNUAL): quality_general = df.pop(DwdObservationParameter.MONTHLY. CLIMATE_SUMMARY.QUALITY_GENERAL.value) quality_precipitation = df.pop( DwdObservationParameter.MONTHLY.CLIMATE_SUMMARY. QUALITY_PRECIPITATION.value) quality = pd.concat([ pd.Series(repeat(quality_general, 9)).explode(), pd.Series(repeat(quality_precipitation, 2)).explode(), ]) elif resolution == Resolution.SUBDAILY and DwdObservationDataset.WIND_EXTREME: quality_fx_3 = df.pop("qn_8_3") quality_fx_6 = df.pop("qn_8_6") quality = pd.concat([quality_fx_3, quality_fx_6]) else: quality = df.pop(df.columns[2]) quality = pd.Series(repeat(quality, df.shape[1])).explode() possible_id_vars = ( Columns.STATION_ID.value, Columns.DATE.value, Columns.FROM_DATE.value, Columns.TO_DATE.value, ) id_vars = df.columns.intersection(possible_id_vars) df_tidy = df.melt( id_vars=id_vars, var_name=Columns.PARAMETER.value, value_name=Columns.VALUE.value, ) df_tidy[Columns.QUALITY.value] = quality.reset_index(drop=True) return df_tidy
def format_data(df: pd.DataFrame, stats: Dict, metadata: pd.DataFrame, ids: Ids) -> Tuple[List, List, List]: df = df.reset_index() df.insert(7, 'Gene ID', df.pop('TARGET')) col_types = np.fromiter(map(get_col_type, df.columns), 'U20', df.shape[1]) num_cols = is_numeric_column(col_types) num_df = df.loc[:, num_cols] p_values = col_types == 'Pvalue' edge_counts = stats['edge_counts'] edge_counts.index = edge_counts.index.tolist() total_edge_counts = stats['total'] total_edge_counts.index = total_edge_counts.index.tolist() fc_cols = col_types == 'Log2FC' induce_repress = stats['induce_repress_count'] induce_repress.index = induce_repress.index.tolist() empty_cols = df.isnull().all(axis=0) # for JSON response, can't have NaN or Inf df.loc[:, num_cols] = num_df.mask(np.isinf(num_df), np.nan) df = df.replace({np.nan: None}) data_col_len = 8 columns = list(map(list, zip_longest(*((col,) for col in df.columns[:data_col_len]), *df.columns[data_col_len:]))) none_cols = [None] * len(columns[0]) columns[-1: -1] = [none_cols.copy(), none_cols.copy(), none_cols.copy()] # avoid references prev = (None,) * 5 name = None tech = None method = None edge = None ind_rep = None for i, col in enumerate(islice(zip(*columns), data_col_len, None), data_col_len): try: if col[0:2] != prev[0:2]: name, _, uuid_ = col[0] analysis = metadata.loc[:, col[1]] name = get_name(name, analysis, ids[col[0:2]]) tech = get_tech(analysis) method = get_analysis_method(analysis) edge_count = edge_counts.at[col[:2]] total_edge_count = total_edge_counts.at[col[:2]] edge = get_edge(analysis).format(edge_count) if edge_count != total_edge_count: edge += " ({})".format(total_edge_count) try: ind_rep = "Induced-{0[induced]:} Repressed-{0[repressed]:}".format( induce_repress[(*col[:2], 'Log2FC')]) except KeyError: ind_rep = None columns[0][i] = name columns[1][i] = tech columns[2][i] = method columns[3][i] = edge columns[4][i] = ind_rep except KeyError: pass if col[-1] == 'DAP': columns[2][i] = columns[1][i] = None prev = col merged_cells = get_merge_cells(columns) # Column formatting for Handsontable column_formats = [] for i, (num, p, fc, empty, col) in enumerate(zip(num_cols, p_values, fc_cols, empty_cols, zip(*columns))): opt = {} if num: if p: opt['type'] = 'p_value' else: opt['type'] = 'numeric' if fc: opt['renderer'] = 'renderFc' elif i >= data_col_len and not p: opt['renderer'] = 'renderExp' if i >= data_col_len: opt['validator'] = 'exponential' else: opt['type'] = 'text' if col[-1] == 'EDGE': opt['className'] = 'htCenter' opt['renderer'] = 'renderEdge' if empty: opt['width'] = 1 column_formats.append(opt) return column_formats, merged_cells, columns + df.values.tolist()
class TestJoin(object): def setup_method(self, method): # aggregate multiple columns self.df = DataFrame({'key1': get_test_data(), 'key2': get_test_data(), 'data1': np.random.randn(N), 'data2': np.random.randn(N)}) # exclude a couple keys for fun self.df = self.df[self.df['key2'] > 1] self.df2 = DataFrame({'key1': get_test_data(n=N // 5), 'key2': get_test_data(ngroups=NGROUPS // 2, n=N // 5), 'value': np.random.randn(N // 5)}) index, data = tm.getMixedTypeDict() self.target = DataFrame(data, index=index) # Join on string value self.source = DataFrame({'MergedA': data['A'], 'MergedD': data['D']}, index=data['C']) def test_cython_left_outer_join(self): left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) max_group = 5 ls, rs = libjoin.left_outer_join(left, right, max_group) exp_ls = left.argsort(kind='mergesort') exp_rs = right.argsort(kind='mergesort') exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 10]) exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5, -1, -1]) exp_ls = exp_ls.take(exp_li) exp_ls[exp_li == -1] = -1 exp_rs = exp_rs.take(exp_ri) exp_rs[exp_ri == -1] = -1 tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) def test_cython_right_outer_join(self): left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) max_group = 5 rs, ls = libjoin.left_outer_join(right, left, max_group) exp_ls = left.argsort(kind='mergesort') exp_rs = right.argsort(kind='mergesort') # 0 1 1 1 exp_li = a_([0, 1, 2, 3, 4, 5, 3, 4, 5, 3, 4, 5, # 2 2 4 6, 7, 8, 6, 7, 8, -1]) exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6]) exp_ls = exp_ls.take(exp_li) exp_ls[exp_li == -1] = -1 exp_rs = exp_rs.take(exp_ri) exp_rs[exp_ri == -1] = -1 tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) def test_cython_inner_join(self): left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64) max_group = 5 ls, rs = libjoin.inner_join(left, right, max_group) exp_ls = left.argsort(kind='mergesort') exp_rs = right.argsort(kind='mergesort') exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8]) exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5]) exp_ls = exp_ls.take(exp_li) exp_ls[exp_li == -1] = -1 exp_rs = exp_rs.take(exp_ri) exp_rs[exp_ri == -1] = -1 tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) def test_left_outer_join(self): joined_key2 = merge(self.df, self.df2, on='key2') _check_join(self.df, self.df2, joined_key2, ['key2'], how='left') joined_both = merge(self.df, self.df2) _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], how='left') def test_right_outer_join(self): joined_key2 = merge(self.df, self.df2, on='key2', how='right') _check_join(self.df, self.df2, joined_key2, ['key2'], how='right') joined_both = merge(self.df, self.df2, how='right') _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], how='right') def test_full_outer_join(self): joined_key2 = merge(self.df, self.df2, on='key2', how='outer') _check_join(self.df, self.df2, joined_key2, ['key2'], how='outer') joined_both = merge(self.df, self.df2, how='outer') _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], how='outer') def test_inner_join(self): joined_key2 = merge(self.df, 
self.df2, on='key2', how='inner') _check_join(self.df, self.df2, joined_key2, ['key2'], how='inner') joined_both = merge(self.df, self.df2, how='inner') _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], how='inner') def test_handle_overlap(self): joined = merge(self.df, self.df2, on='key2', suffixes=['.foo', '.bar']) assert 'key1.foo' in joined assert 'key1.bar' in joined def test_handle_overlap_arbitrary_key(self): joined = merge(self.df, self.df2, left_on='key2', right_on='key1', suffixes=['.foo', '.bar']) assert 'key1.foo' in joined assert 'key2.bar' in joined def test_join_on(self): target = self.target source = self.source merged = target.join(source, on='C') tm.assert_series_equal(merged['MergedA'], target['A'], check_names=False) tm.assert_series_equal(merged['MergedD'], target['D'], check_names=False) # join with duplicates (fix regression from DataFrame/Matrix merge) df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c']) joined = df.join(df2, on='key') expected = DataFrame({'key': ['a', 'a', 'b', 'b', 'c'], 'value': [0, 0, 1, 1, 2]}) assert_frame_equal(joined, expected) # Test when some are missing df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'], columns=['one']) df_b = DataFrame([['foo'], ['bar']], index=[1, 2], columns=['two']) df_c = DataFrame([[1], [2]], index=[1, 2], columns=['three']) joined = df_a.join(df_b, on='one') joined = joined.join(df_c, on='one') assert np.isnan(joined['two']['c']) assert np.isnan(joined['three']['c']) # merge column not present with pytest.raises(KeyError, match="^'E'$"): target.join(source, on='E') # overlap source_copy = source.copy() source_copy['A'] = 0 msg = ("You are trying to merge on float64 and object columns. If" " you wish to proceed you should use pd.concat") with pytest.raises(ValueError, match=msg): target.join(source_copy, on='A') def test_join_on_fails_with_different_right_index(self): df = DataFrame({'a': np.random.choice(['m', 'f'], size=3), 'b': np.random.randn(3)}) df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10), 'b': np.random.randn(10)}, index=tm.makeCustomIndex(10, 2)) msg = (r'len\(left_on\) must equal the number of levels in the index' ' of "right"') with pytest.raises(ValueError, match=msg): merge(df, df2, left_on='a', right_index=True) def test_join_on_fails_with_different_left_index(self): df = DataFrame({'a': np.random.choice(['m', 'f'], size=3), 'b': np.random.randn(3)}, index=tm.makeCustomIndex(3, 2)) df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10), 'b': np.random.randn(10)}) msg = (r'len\(right_on\) must equal the number of levels in the index' ' of "left"') with pytest.raises(ValueError, match=msg): merge(df, df2, right_on='b', left_index=True) def test_join_on_fails_with_different_column_counts(self): df = DataFrame({'a': np.random.choice(['m', 'f'], size=3), 'b': np.random.randn(3)}) df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10), 'b': np.random.randn(10)}, index=tm.makeCustomIndex(10, 2)) msg = r"len\(right_on\) must equal len\(left_on\)" with pytest.raises(ValueError, match=msg): merge(df, df2, right_on='a', left_on=['a', 'b']) @pytest.mark.parametrize("wrong_type", [2, 'str', None, np.array([0, 1])]) def test_join_on_fails_with_wrong_object_type(self, wrong_type): # GH12081 - original issue # GH21220 - merging of Series and DataFrame is now allowed # Edited test to remove the Series object from test parameters df = DataFrame({'a': [1, 1]}) msg = ("Can only merge Series or DataFrame objects, a {}
was passed" .format(str(type(wrong_type)))) with pytest.raises(TypeError, match=msg): merge(wrong_type, df, left_on='a', right_on='a') with pytest.raises(TypeError, match=msg): merge(df, wrong_type, left_on='a', right_on='a') def test_join_on_pass_vector(self): expected = self.target.join(self.source, on='C') del expected['C'] join_col = self.target.pop('C') result = self.target.join(self.source, on=join_col) assert_frame_equal(result, expected) def test_join_with_len0(self): # nothing to merge merged = self.target.join(self.source.reindex([]), on='C') for col in self.source: assert col in merged assert merged[col].isna().all() merged2 = self.target.join(self.source.reindex([]), on='C', how='inner') tm.assert_index_equal(merged2.columns, merged.columns) assert len(merged2) == 0 def test_join_on_inner(self): df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']}) df2 = DataFrame({'value': [0, 1]}, index=['a', 'b']) joined = df.join(df2, on='key', how='inner') expected = df.join(df2, on='key') expected = expected[expected['value'].notna()] tm.assert_series_equal(joined['key'], expected['key'], check_dtype=False) tm.assert_series_equal(joined['value'], expected['value'], check_dtype=False) tm.assert_index_equal(joined.index, expected.index) def test_join_on_singlekey_list(self): df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c']) # corner cases joined = df.join(df2, on=['key']) expected = df.join(df2, on='key') assert_frame_equal(joined, expected) def test_join_on_series(self): result = self.target.join(self.source['MergedA'], on='C') expected = self.target.join(self.source[['MergedA']], on='C') assert_frame_equal(result, expected) def test_join_on_series_buglet(self): # GH #638 df = DataFrame({'a': [1, 1]}) ds = Series([2], index=[1], name='b') result = df.join(ds, on='a') expected = DataFrame({'a': [1, 1], 'b': [2, 2]}, index=df.index) tm.assert_frame_equal(result, expected) def test_join_index_mixed(self, join_type): # no overlapping blocks df1 = DataFrame(index=np.arange(10)) df1['bool'] = True df1['string'] = 'foo' df2 = DataFrame(index=np.arange(5, 15)) df2['int'] = 1 df2['float'] = 1. joined = df1.join(df2, how=join_type) expected = _join_by_hand(df1, df2, how=join_type) assert_frame_equal(joined, expected) joined = df2.join(df1, how=join_type) expected = _join_by_hand(df2, df1, how=join_type) assert_frame_equal(joined, expected) def test_join_index_mixed_overlap(self): df1 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True}, index=np.arange(10), columns=['A', 'B', 'C', 'D']) assert df1['B'].dtype == np.int64 assert df1['D'].dtype == np.bool_ df2 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True}, index=np.arange(0, 10, 2), columns=['A', 'B', 'C', 'D']) # overlap joined = df1.join(df2, lsuffix='_one', rsuffix='_two') expected_columns = ['A_one', 'B_one', 'C_one', 'D_one', 'A_two', 'B_two', 'C_two', 'D_two'] df1.columns = expected_columns[:4] df2.columns = expected_columns[4:] expected = _join_by_hand(df1, df2) assert_frame_equal(joined, expected) def test_join_empty_bug(self): # generated an exception in 0.4.3 x = DataFrame() x.join(DataFrame([3], index=[0], columns=['A']), how='outer') def test_join_unconsolidated(self): # GH #331 a = DataFrame(randn(30, 2), columns=['a', 'b']) c = Series(randn(30)) a['c'] = c d = DataFrame(randn(30, 1), columns=['q']) # it works! 
a.join(d) d.join(a) def test_join_multiindex(self): index1 = MultiIndex.from_arrays([['a', 'a', 'a', 'b', 'b', 'b'], [1, 2, 3, 1, 2, 3]], names=['first', 'second']) index2 = MultiIndex.from_arrays([['b', 'b', 'b', 'c', 'c', 'c'], [1, 2, 3, 1, 2, 3]], names=['first', 'second']) df1 = DataFrame(data=np.random.randn(6), index=index1, columns=['var X']) df2 = DataFrame(data=np.random.randn(6), index=index2, columns=['var Y']) df1 = df1.sort_index(level=0) df2 = df2.sort_index(level=0) joined = df1.join(df2, how='outer') ex_index = Index(index1.values).union(Index(index2.values)) expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) expected.index.names = index1.names assert_frame_equal(joined, expected) assert joined.index.names == index1.names df1 = df1.sort_index(level=1) df2 = df2.sort_index(level=1) joined = df1.join(df2, how='outer').sort_index(level=0) ex_index = Index(index1.values).union(Index(index2.values)) expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) expected.index.names = index1.names assert_frame_equal(joined, expected) assert joined.index.names == index1.names def test_join_inner_multiindex(self): key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', 'qux', 'snap'] key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', 'three', 'one'] data = np.random.randn(len(key1)) data = DataFrame({'key1': key1, 'key2': key2, 'data': data}) index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['first', 'second']) to_join = DataFrame(np.random.randn(10, 3), index=index, columns=['j_one', 'j_two', 'j_three']) joined = data.join(to_join, on=['key1', 'key2'], how='inner') expected = merge(data, to_join.reset_index(), left_on=['key1', 'key2'], right_on=['first', 'second'], how='inner', sort=False) expected2 = merge(to_join, data, right_on=['key1', 'key2'], left_index=True, how='inner', sort=False) assert_frame_equal(joined, expected2.reindex_like(joined)) expected2 = merge(to_join, data, right_on=['key1', 'key2'], left_index=True, how='inner', sort=False) expected = expected.drop(['first', 'second'], axis=1) expected.index = joined.index assert joined.index.is_monotonic assert_frame_equal(joined, expected) # _assert_same_contents(expected, expected2.loc[:, expected.columns]) def test_join_hierarchical_mixed(self): # GH 2024 df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c']) new_df = df.groupby(['a']).agg({'b': [np.mean, np.sum]}) other_df = DataFrame( [(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd']) other_df.set_index('a', inplace=True) # GH 9455, 12219 with tm.assert_produces_warning(UserWarning): result = merge(new_df, other_df, left_index=True, right_index=True) assert ('b', 'mean') in result assert 'b' in result def test_join_float64_float32(self): a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64) b = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32) joined = a.join(b) assert joined.dtypes['a'] == 'float64' assert joined.dtypes['b'] == 'float64' assert joined.dtypes['c'] == 'float32' a = np.random.randint(0, 5, 100).astype('int64') b = np.random.random(100).astype('float64') c = np.random.random(100).astype('float32') df = DataFrame({'a': a, 'b': b, 'c': c}) xpdf = DataFrame({'a': a, 'b': b, 'c': c}) s = DataFrame(np.random.random(5).astype('float32'), columns=['md']) rs = df.merge(s, left_on='a', right_index=True) assert rs.dtypes['a'] == 'int64' assert rs.dtypes['b'] == 'float64' assert rs.dtypes['c'] == 
'float32' assert rs.dtypes['md'] == 'float32' xp = xpdf.merge(s, left_on='a', right_index=True) assert_frame_equal(rs, xp) def test_join_many_non_unique_index(self): df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]}) df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]}) df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]}) idf1 = df1.set_index(["a", "b"]) idf2 = df2.set_index(["a", "b"]) idf3 = df3.set_index(["a", "b"]) result = idf1.join([idf2, idf3], how='outer') df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer') expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer') result = result.reset_index() expected = expected[result.columns] expected['a'] = expected.a.astype('int64') expected['b'] = expected.b.astype('int64') assert_frame_equal(result, expected) df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]}) df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]}) df3 = DataFrame( {"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]}) idf1 = df1.set_index(["a", "b"]) idf2 = df2.set_index(["a", "b"]) idf3 = df3.set_index(["a", "b"]) result = idf1.join([idf2, idf3], how='inner') df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner') expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner') result = result.reset_index() assert_frame_equal(result, expected.loc[:, result.columns]) # GH 11519 df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], 'C': np.random.randn(8), 'D': np.random.randn(8)}) s = Series(np.repeat(np.arange(8), 2), index=np.repeat(np.arange(8), 2), name='TEST') inner = df.join(s, how='inner') outer = df.join(s, how='outer') left = df.join(s, how='left') right = df.join(s, how='right') assert_frame_equal(inner, outer) assert_frame_equal(inner, left) assert_frame_equal(inner, right) def test_join_sort(self): left = DataFrame({'key': ['foo', 'bar', 'baz', 'foo'], 'value': [1, 2, 3, 4]}) right = DataFrame({'value2': ['a', 'b', 'c']}, index=['bar', 'baz', 'foo']) joined = left.join(right, on='key', sort=True) expected = DataFrame({'key': ['bar', 'baz', 'foo', 'foo'], 'value': [2, 3, 1, 4], 'value2': ['a', 'b', 'c', 'c']}, index=[1, 2, 0, 3]) assert_frame_equal(joined, expected) # smoke test joined = left.join(right, on='key', sort=False) tm.assert_index_equal(joined.index, pd.Index(lrange(4))) def test_join_mixed_non_unique_index(self): # GH 12814, unorderable types in py3 with a non-unique index df1 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 3, 'a']) df2 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 3, 3, 4]) result = df1.join(df2) expected = DataFrame({'a': [1, 2, 3, 3, 4], 'b': [5, np.nan, 6, 7, np.nan]}, index=[1, 2, 3, 3, 'a']) tm.assert_frame_equal(result, expected) df3 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 2, 'a']) df4 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 2, 3, 4]) result = df3.join(df4) expected = DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 6, np.nan]}, index=[1, 2, 2, 'a']) tm.assert_frame_equal(result, expected) def test_join_non_unique_period_index(self): # GH #16871 index = pd.period_range('2016-01-01', periods=16, freq='M') df = DataFrame([i for i in range(len(index))], index=index, columns=['pnum']) df2 = concat([df, df]) result = df.join(df2, how='inner', rsuffix='_df2') expected = DataFrame( np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2), columns=['pnum', 'pnum_df2'], index=df2.sort_index().index) tm.assert_frame_equal(result, 
expected) def test_mixed_type_join_with_suffix(self): # GH #916 df = DataFrame(np.random.randn(20, 6), columns=['a', 'b', 'c', 'd', 'e', 'f']) df.insert(0, 'id', 0) df.insert(5, 'dt', 'foo') grouped = df.groupby('id') mn = grouped.mean() cn = grouped.count() # it works! mn.join(cn, rsuffix='_right') def test_join_many(self): df = DataFrame(np.random.randn(10, 6), columns=list('abcdef')) df_list = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]] joined = df_list[0].join(df_list[1:]) tm.assert_frame_equal(joined, df) df_list = [df[['a', 'b']][:-2], df[['c', 'd']][2:], df[['e', 'f']][1:9]] def _check_diff_index(df_list, result, exp_index): reindexed = [x.reindex(exp_index) for x in df_list] expected = reindexed[0].join(reindexed[1:]) tm.assert_frame_equal(result, expected) # different join types joined = df_list[0].join(df_list[1:], how='outer') _check_diff_index(df_list, joined, df.index) joined = df_list[0].join(df_list[1:]) _check_diff_index(df_list, joined, df_list[0].index) joined = df_list[0].join(df_list[1:], how='inner') _check_diff_index(df_list, joined, df.index[2:8]) msg = "Joining multiple DataFrames only supported for joining on index" with pytest.raises(ValueError, match=msg): df_list[0].join(df_list[1:], on='a') def test_join_many_mixed(self): df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D']) df['key'] = ['foo', 'bar'] * 4 df1 = df.loc[:, ['A', 'B']] df2 = df.loc[:, ['C', 'D']] df3 = df.loc[:, ['key']] result = df1.join([df2, df3]) assert_frame_equal(result, df) def test_join_dups(self): # joining dups df = concat([DataFrame(np.random.randn(10, 4), columns=['A', 'A', 'B', 'B']), DataFrame(np.random.randint(0, 10, size=20) .reshape(10, 2), columns=['A', 'C'])], axis=1) expected = concat([df, df], axis=1) result = df.join(df, rsuffix='_2') result.columns = expected.columns assert_frame_equal(result, expected) # GH 4975, invalid join on dups w = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) x = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) y = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) z = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) dta = x.merge(y, left_index=True, right_index=True).merge( z, left_index=True, right_index=True, how="outer") dta = dta.merge(w, left_index=True, right_index=True) expected = concat([x, y, z, w], axis=1) expected.columns = ['x_x', 'y_x', 'x_y', 'y_y', 'x_x', 'y_x', 'x_y', 'y_y'] assert_frame_equal(dta, expected) def test_panel_join(self): with catch_warnings(record=True): panel = tm.makePanel() tm.add_nans(panel) p1 = panel.iloc[:2, :10, :3] p2 = panel.iloc[2:, 5:, 2:] # left join result = p1.join(p2) expected = p1.copy() expected['ItemC'] = p2['ItemC'] tm.assert_panel_equal(result, expected) # right join result = p1.join(p2, how='right') expected = p2.copy() expected['ItemA'] = p1['ItemA'] expected['ItemB'] = p1['ItemB'] expected = expected.reindex(items=['ItemA', 'ItemB', 'ItemC']) tm.assert_panel_equal(result, expected) # inner join result = p1.join(p2, how='inner') expected = panel.iloc[:, 5:10, 2:3] tm.assert_panel_equal(result, expected) # outer join result = p1.join(p2, how='outer') expected = p1.reindex(major=panel.major_axis, minor=panel.minor_axis) expected = expected.join(p2.reindex(major=panel.major_axis, minor=panel.minor_axis)) tm.assert_panel_equal(result, expected) def test_panel_join_overlap(self): with catch_warnings(record=True): panel = tm.makePanel() tm.add_nans(panel) p1 = panel.loc[['ItemA', 'ItemB', 'ItemC']] p2 = panel.loc[['ItemB', 'ItemC']] # Expected index is # # ItemA, 
ItemB_p1, ItemC_p1, ItemB_p2, ItemC_p2 joined = p1.join(p2, lsuffix='_p1', rsuffix='_p2') p1_suf = p1.loc[['ItemB', 'ItemC']].add_suffix('_p1') p2_suf = p2.loc[['ItemB', 'ItemC']].add_suffix('_p2') no_overlap = panel.loc[['ItemA']] expected = no_overlap.join(p1_suf.join(p2_suf)) tm.assert_panel_equal(joined, expected) def test_panel_join_many(self): with catch_warnings(record=True): tm.K = 10 panel = tm.makePanel() tm.K = 4 panels = [panel.iloc[:2], panel.iloc[2:6], panel.iloc[6:]] joined = panels[0].join(panels[1:]) tm.assert_panel_equal(joined, panel) panels = [panel.iloc[:2, :-5], panel.iloc[2:6, 2:], panel.iloc[6:, 5:-7]] data_dict = {} for p in panels: data_dict.update(p.iteritems()) joined = panels[0].join(panels[1:], how='inner') expected = pd.Panel.from_dict(data_dict, intersect=True) tm.assert_panel_equal(joined, expected) joined = panels[0].join(panels[1:], how='outer') expected = pd.Panel.from_dict(data_dict, intersect=False) tm.assert_panel_equal(joined, expected) # edge cases msg = "Suffixes not supported when passing multiple panels" with pytest.raises(ValueError, match=msg): panels[0].join(panels[1:], how='outer', lsuffix='foo', rsuffix='bar') msg = "Right join not supported with multiple panels" with pytest.raises(ValueError, match=msg): panels[0].join(panels[1:], how='right') def test_join_multi_to_multi(self, join_type): # GH 20475 leftindex = MultiIndex.from_product([list('abc'), list('xy'), [1, 2]], names=['abc', 'xy', 'num']) left = DataFrame({'v1': range(12)}, index=leftindex) rightindex = MultiIndex.from_product([list('abc'), list('xy')], names=['abc', 'xy']) right = DataFrame({'v2': [100 * i for i in range(1, 7)]}, index=rightindex) result = left.join(right, on=['abc', 'xy'], how=join_type) expected = (left.reset_index() .merge(right.reset_index(), on=['abc', 'xy'], how=join_type) .set_index(['abc', 'xy', 'num']) ) assert_frame_equal(expected, result) msg = (r'len\(left_on\) must equal the number of levels in the index' ' of "right"') with pytest.raises(ValueError, match=msg): left.join(right, on='xy', how=join_type) with pytest.raises(ValueError, match=msg): right.join(left, on=['abc', 'xy'], how=join_type)
def test_set_index_datetime(self): # GH#3950 df = DataFrame({ "label": ["a", "a", "a", "b", "b", "b"], "datetime": [ "2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00", "2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00", ], "value": range(6), }) df.index = to_datetime(df.pop("datetime"), utc=True) df.index = df.index.tz_convert("US/Pacific") expected = DatetimeIndex( [ "2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00" ], name="datetime", ) expected = expected.tz_localize("UTC").tz_convert("US/Pacific") df = df.set_index("label", append=True) tm.assert_index_equal(df.index.levels[0], expected) tm.assert_index_equal(df.index.levels[1], Index(["a", "b"], name="label")) assert df.index.names == ["datetime", "label"] df = df.swaplevel(0, 1) tm.assert_index_equal(df.index.levels[0], Index(["a", "b"], name="label")) tm.assert_index_equal(df.index.levels[1], expected) assert df.index.names == ["label", "datetime"] df = DataFrame(np.random.random(6)) idx1 = DatetimeIndex( [ "2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00", "2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00", ], tz="US/Eastern", ) idx2 = DatetimeIndex( [ "2012-04-01 09:00", "2012-04-01 09:00", "2012-04-01 09:00", "2012-04-02 09:00", "2012-04-02 09:00", "2012-04-02 09:00", ], tz="US/Eastern", ) idx3 = date_range("2011-01-01 09:00", periods=6, tz="Asia/Tokyo") idx3 = idx3._with_freq(None) df = df.set_index(idx1) df = df.set_index(idx2, append=True) df = df.set_index(idx3, append=True) expected1 = DatetimeIndex( [ "2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00" ], tz="US/Eastern", ) expected2 = DatetimeIndex(["2012-04-01 09:00", "2012-04-02 09:00"], tz="US/Eastern") tm.assert_index_equal(df.index.levels[0], expected1) tm.assert_index_equal(df.index.levels[1], expected2) tm.assert_index_equal(df.index.levels[2], idx3) # GH#7092 tm.assert_index_equal(df.index.get_level_values(0), idx1) tm.assert_index_equal(df.index.get_level_values(1), idx2) tm.assert_index_equal(df.index.get_level_values(2), idx3)
'Diagnosis'], axis=1).drop(labels=0) riskModelData['General feeling'] = riskModelData['General feeling'].astype( float) riskModelData['Age'] = riskModelData['Age'].astype(float) riskModelData['Gender'] = riskModelData['Gender'].astype(float) riskModelData['Sexuality'] = riskModelData['Sexuality'].astype(float) riskModelData['Anger'] = riskModelData['Anger'].astype(float) riskModelData['Disgust'] = riskModelData['Disgust'].astype(float) riskModelData['Fear'] = riskModelData['Fear'].astype(float) riskModelData['Joy'] = riskModelData['Joy'].astype(float) riskModelData['Sadness'] = riskModelData['Sadness'].astype(float) riskModelData['Risk Factor'] = riskModelData['Anger'] + riskModelData[ 'Disgust'] + riskModelData['Fear'] - riskModelData['Joy'] + riskModelData[ 'Sadness'] + riskModelData['General feeling'] riskModelData = riskModelData.drop( labels=['General feeling', 'Anger', 'Disgust', 'Fear', 'Joy', 'Sadness'], axis=1) riskModelOutput = riskModelData.pop('Risk Factor') pp.pprint(riskModelData) pp.pprint(riskModelOutput) predictionModel = linear_model.RidgeCV() predictionModel.fit(riskModelData, riskModelOutput) print(predictionModel.score(riskModelData, riskModelOutput)) print(predictionModel.coef_) predictedRisk = predictionModel.predict(riskModelData) pp.pprint(predictedRisk)
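# Side sketch (not part of the original script): RidgeCV already tunes its alpha by
# internal cross-validation, but the score printed above is computed on the very rows
# the model was fit on; a held-out split gives a less optimistic estimate.
from sklearn import linear_model
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    riskModelData, riskModelOutput, test_size=0.2, random_state=0)
holdout_model = linear_model.RidgeCV()
holdout_model.fit(X_train, y_train)
print(holdout_model.score(X_test, y_test))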
def platform_bill(bill, timestamp): data = DataFrame(bill).sum().to_dict() if 'timestamp' in data: data.pop('timestamp') print(timestamp, data) LyDaily.update_one({'timestamp': timestamp}, data, upsert=True)
def test_convert_dti_to_series(self): # don't cast a DatetimeIndex WITH a tz, leave as object # GH 6032 idx = DatetimeIndex(to_datetime(["2013-1-1 13:00", "2013-1-2 14:00"]), name="B").tz_localize("US/Pacific") df = DataFrame(np.random.randn(2, 1), columns=["A"]) expected = Series( np.array( [ Timestamp("2013-01-01 13:00:00-0800", tz="US/Pacific"), Timestamp("2013-01-02 14:00:00-0800", tz="US/Pacific"), ], dtype="object", ), name="B", ) # convert index to series result = Series(idx) tm.assert_series_equal(result, expected) # assign to frame df["B"] = idx result = df["B"] tm.assert_series_equal(result, expected) # convert to series while keeping the timezone msg = "stop passing 'keep_tz'" with tm.assert_produces_warning(FutureWarning) as m: result = idx.to_series(keep_tz=True, index=[0, 1]) tm.assert_series_equal(result, expected) assert msg in str(m[0].message) # convert to utc with tm.assert_produces_warning(FutureWarning) as m: df["B"] = idx.to_series(keep_tz=False, index=[0, 1]) result = df["B"] comp = Series(DatetimeIndex(expected.values).tz_localize(None), name="B") tm.assert_series_equal(result, comp) msg = "do 'idx.tz_convert(None)' before calling" assert msg in str(m[0].message) result = idx.to_series(index=[0, 1]) tm.assert_series_equal(result, expected) with tm.assert_produces_warning(FutureWarning) as m: result = idx.to_series(keep_tz=False, index=[0, 1]) tm.assert_series_equal(result, expected.dt.tz_convert(None)) msg = "do 'idx.tz_convert(None)' before calling" assert msg in str(m[0].message) # list of datetimes with a tz df["B"] = idx.to_pydatetime() result = df["B"] tm.assert_series_equal(result, expected) # GH 6785 # set the index manually import pytz df = DataFrame([{ "ts": datetime(2014, 4, 1, tzinfo=pytz.utc), "foo": 1 }]) expected = df.set_index("ts") df.index = df["ts"] df.pop("ts") tm.assert_frame_equal(df, expected)
def pop_target(df: pd.DataFrame, params: FeatureParams) -> pd.Series: target = df.pop(params.target_col) return target
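# Hedged usage sketch for pop_target above; FeatureParams is assumed here to be a
# small dataclass exposing at least target_col (the real class may carry more fields).
from dataclasses import dataclass
import pandas as pd

@dataclass
class FeatureParams:
    target_col: str

features = pd.DataFrame({"x": [1.0, 2.0, 3.0], "target": [0, 1, 0]})
target = pop_target(features, FeatureParams(target_col="target"))
# pop() removes the column in place: `features` now only holds "x", `target` is the popped Series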
def preprocess(df: pd.DataFrame) -> tuple: with timeit("Preprocessing", log): df = df.copy() df["neighborhood"] = df["neighborhood"].map( df.groupby("neighborhood")["total_fee"].median() ) y = df.pop("total_fee") X = df[ [ "neighborhood", "url", "usable_area", "floors", "type_unit", "bedrooms", "bathrooms", "suites", "parking_spaces", "amenities", "address_lat", "address_lon", "estacao", "distance", "created_date", "updated_date", ] ].set_index("url") # # Numeric columns # numeric_columns = [ "usable_area", "floors", "bedrooms", "bathrooms", "suites", "parking_spaces", "address_lat", "address_lon", "distance", ] X[numeric_columns] = X[numeric_columns].astype(float).fillna(-999) # # Turn creation and update dates into features # X["qtd_days_created"] = ( ( (datetime.now() - pd.to_datetime(X["created_date"])) / np.timedelta64(1, "D") ) .round() .fillna(-1) .astype(int) ) X["qtd_days_updated"] = ( ( (datetime.now() - pd.to_datetime(X["updated_date"])) / np.timedelta64(1, "D") ) .round() .fillna(-1) .astype(int) ) # # Transform non-numeric columns # # Type Unit valid_type_unit = ["APARTMENT", "HOME", "CONDOMINIUM", "PENTHOUSE", "FLAT"] X.loc[~X.type_unit.isin(valid_type_unit), "type_unit"] = "OTHERS" dummies_type_units = onehot( X.type_unit, valid_type_unit + ["OTHERS"], "type_unit" ) X[dummies_type_units.columns] = dummies_type_units.values # Train/metro station valid_stations: estacoes_validas = [ "Estação_Jardim_Oceânico", "Estação_Uruguai", "Estação_Botafogo/Coca-Cola", "Estação_Afonso_Pena", "Estação_Saens_Peña", "Estação_Flamengo", "Estação_São_Francisco_Xavier_(Metrô_Rio)", "Estação_Madureira", ] # estacoes = X.loc[X.estacao != "", "estacao"].value_counts(normalize=True) # estacoes_validas = list(estacoes[estacoes > 0.05].index) X.loc[~X.estacao.isin(estacoes_validas), "estacao"] = "OTHERS" dummies_estacoes = onehot(X.estacao, estacoes_validas + ["OTHERS"], "estação") X[dummies_estacoes.columns] = dummies_estacoes.values # Amenities valid_amenities = [ "ELEVATOR", "POOL", "BARBECUE_GRILL", "PARTY_HALL", "PLAYGROUND", "GATED_COMMUNITY", "BALCONY", "INTERCOM", "KITCHEN_CABINETS", "GYM", "SAUNA", "FURNISHED", "SPORTS_COURT", ] amenities = X.amenities.explode() amenities[~amenities.isin(valid_amenities)] = "OTHERS" dummies_amenities = ( onehot(amenities, valid_amenities, "amenity").groupby(level=0).max() ) X[dummies_amenities.columns] = dummies_amenities.values # Drop columns X = X.drop( columns=[ "type_unit", "estacao", "created_date", "updated_date", "amenities", ] ) log.debug(f"shape: {X.shape}, columns: {X.columns.tolist()}") return X, y
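# The preprocess() snippet above relies on an onehot(series, categories, prefix)
# helper that is not shown. A minimal sketch of what such a helper could look like,
# assuming it returns a 0/1 DataFrame aligned on the input index (an assumption,
# not the project's actual implementation):
import pandas as pd

def onehot(series: pd.Series, categories: list, prefix: str) -> pd.DataFrame:
    # one indicator column per known category, index preserved so the result can be
    # assigned back onto X or aggregated with groupby(level=0) for exploded series
    return pd.DataFrame(
        {f"{prefix}_{cat}": (series == cat).astype(int) for cat in categories},
        index=series.index,
    )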
def analytic_signal(data: pd.DataFrame, fmin: float, fmax: float, nodes: List[str] = None, **kwargs) -> pd.DataFrame: """Calculates analytic signal from simulation results, using the hilbert transform. Parameters ---------- data Simulation results. fmin Lower bound frequency for bandpass filter that will be applied to the data. fmax Upper bound frequency for bandpass filter that will be applied to the data. nodes List of node names for which to calculate the analytic signal. kwargs Additional keyword arguments that will be passed to the `mne.Raw.filter` method. Returns ------- pd.DataFrame Dataframe containing the fields `time`, `node`, `amplitude` and `phase`. """ if 'time' in data.columns.values: idx = data.pop('time') data.index = idx if nodes: if type(nodes[0]) is str: data = data.loc[:, nodes] else: data = data.iloc[:, nodes] # create mne raw data object from pyrates.utility.mne_wrapper import mne_from_dataframe raw = mne_from_dataframe(data) # bandpass filter the raw data raw.filter(l_freq=fmin, h_freq=fmax, **kwargs) # apply hilbert transform raw.apply_hilbert() # get phase of analytic signal def get_angle(x): return np.angle(x) + np.pi raw_phase = raw.copy() raw_phase.apply_function(get_angle) raw_phase.apply_function(np.real, dtype=np.float32) raw_phase.apply_function(np.unwrap) # get amplitude of analytic signal raw_amplitude = raw.copy() raw_amplitude.apply_function(np.abs) raw_amplitude.apply_function(np.real, dtype=np.float32) # combine phase and amplitude into dataframe time = data.index data_phase = raw_phase.to_data_frame(scalings={'eeg': 1.}) data_phase['time'] = time data_amp = raw_amplitude.to_data_frame(scalings={'eeg': 1.}) data_amp['time'] = time data = pd.melt(data_phase, id_vars=['time'], var_name='node', value_name='phase') data_tmp = pd.melt(data_amp, id_vars=['time'], var_name='node', value_name='amplitude') data['amplitude'] = data_tmp['amplitude'] return data
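# Roughly equivalent idea without the mne dependency (a sketch, not pyrates code):
# band-pass filter each node, then take scipy.signal.hilbert to get instantaneous
# amplitude and phase. `fs` (the sampling rate in Hz) is an assumed extra argument.
import numpy as np
import pandas as pd
from scipy.signal import butter, sosfiltfilt, hilbert

def analytic_signal_scipy(data: pd.DataFrame, fmin: float, fmax: float, fs: float) -> pd.DataFrame:
    # 4th-order Butterworth band-pass between fmin and fmax
    sos = butter(4, [fmin, fmax], btype="bandpass", fs=fs, output="sos")
    frames = []
    for node in data.columns:
        filtered = sosfiltfilt(sos, data[node].to_numpy())
        analytic = hilbert(filtered)
        frames.append(pd.DataFrame({
            "time": data.index,
            "node": node,
            "amplitude": np.abs(analytic),
            "phase": np.unwrap(np.angle(analytic) + np.pi),
        }))
    return pd.concat(frames, ignore_index=True)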
def test_set_index_cast_datetimeindex(self): df = DataFrame({'A': [datetime(2000, 1, 1) + timedelta(i) for i in range(1000)], 'B': np.random.randn(1000)}) idf = df.set_index('A') assert isinstance(idf.index, pd.DatetimeIndex) # don't cast a DatetimeIndex WITH a tz, leave as object # GH 6032 i = (pd.DatetimeIndex( to_datetime(['2013-1-1 13:00', '2013-1-2 14:00'], errors="raise")) .tz_localize('US/Pacific')) df = DataFrame(np.random.randn(2, 1), columns=['A']) expected = Series(np.array([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'), pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')], dtype="object")) # convert index to series result = Series(i) assert_series_equal(result, expected) # assign to frame df['B'] = i result = df['B'] assert_series_equal(result, expected, check_names=False) assert result.name == 'B' # keep the timezone result = i.to_series(keep_tz=True) assert_series_equal(result.reset_index(drop=True), expected) # convert to utc df['C'] = i.to_series().reset_index(drop=True) result = df['C'] comp = pd.DatetimeIndex(expected.values).copy() comp.tz = None tm.assert_numpy_array_equal(result.values, comp.values) # list of datetimes with a tz df['D'] = i.to_pydatetime() result = df['D'] assert_series_equal(result, expected, check_names=False) assert result.name == 'D' # GH 6785 # set the index manually import pytz df = DataFrame( [{'ts': datetime(2014, 4, 1, tzinfo=pytz.utc), 'foo': 1}]) expected = df.set_index('ts') df.index = df['ts'] df.pop('ts') assert_frame_equal(df, expected) # GH 3950 # reset_index with single level for tz in ['UTC', 'Asia/Tokyo', 'US/Eastern']: idx = pd.date_range('1/1/2011', periods=5, freq='D', tz=tz, name='idx') df = pd.DataFrame( {'a': range(5), 'b': ['A', 'B', 'C', 'D', 'E']}, index=idx) expected = pd.DataFrame({'idx': [datetime(2011, 1, 1), datetime(2011, 1, 2), datetime(2011, 1, 3), datetime(2011, 1, 4), datetime(2011, 1, 5)], 'a': range(5), 'b': ['A', 'B', 'C', 'D', 'E']}, columns=['idx', 'a', 'b']) expected['idx'] = expected['idx'].apply( lambda d: pd.Timestamp(d, tz=tz)) assert_frame_equal(df.reset_index(), expected)
target.append(10) elif c == 'watertower': target.append(11) elif c == 'library': target.append(12) elif c == 'sportcenter': target.append(13) # Classes: ['outdoorligths', 'trafficlights', 'kindergarten', 'playgroundwithbuilding', 'nursinghome', 'school', 'healthcenter', 'officebuilding', 'indoorswimmingpool', 'hospital', 'firestation', 'watertower', 'library', 'sportcenter'] # Total number of classes: 14 # We remove the column with class names and add the column with class numbers df['TARGET'] = target df.pop('CLASS') ######################################### ## Building the model ## ######################################### model = keras.Sequential([ keras.layers.Dense(25, activation=tf.nn.relu), keras.layers.Dense(64, activation=tf.nn.relu), # We could generalize the number of nodes of the output layer # according to the number of classes our data contains. keras.layers.Dense(14, activation=tf.nn.softmax) ]) model.compile(
def get_labels(dataset: pd.DataFrame) -> pd.Series: return dataset.pop('MPG')
print(tableViews(df[0:40])) #SaveInFile(df[0:40], "b-techs.ma-10-2014.txt") # drop a few columns that we will not use. del df['%V']; del df['%i']; del df['%l']; del df['%{User-Agent}i'] # rename the columns that we display. df = df.rename(columns={'%>s': 'Status', '%b':'b', '%h':'IP', '%r':'Request', '%t': 'Time'}) #print df.head() # this line converts the Time column to datetime df.index = pd.to_datetime(df.pop('Time')) # convert the Status column to int df['Status'] = df['Status'].astype('int') df['b'][93] #df['b'] = df['b'].replace('-', 'NaN', regex=True).astype('float') #df['b'] = df['b']/1048576. #print df['b'] #df['b'][93] def dash2nan(x): if x == '-': x = np.nan else: x = float(x) * 10**(-3) # convert bytes to kilobytes (multiply by 10**-3)