Example #1
0
    def test_set_index_cast_datetimeindex(self):
        """set_index on datetime values yields a DatetimeIndex; a tz-aware
        DatetimeIndex must survive conversion to Series / frame columns as
        object dtype instead of being coerced (GH 6032, GH 6785)."""
        df = DataFrame({'A': [datetime(2000, 1, 1) + timedelta(i)
                              for i in range(1000)],
                        'B': np.random.randn(1000)})

        idf = df.set_index('A')
        assert isinstance(idf.index, pd.DatetimeIndex)

        # don't cast a DatetimeIndex WITH a tz, leave as object
        # GH 6032
        i = (pd.DatetimeIndex(
            to_datetime(['2013-1-1 13:00',
                         '2013-1-2 14:00'], errors="raise"))
             .tz_localize('US/Pacific'))
        df = DataFrame(np.random.randn(2, 1), columns=['A'])

        # object dtype keeps the tz-aware Timestamps intact
        expected = Series(np.array([pd.Timestamp('2013-01-01 13:00:00-0800',
                                                 tz='US/Pacific'),
                                    pd.Timestamp('2013-01-02 14:00:00-0800',
                                                 tz='US/Pacific')],
                                   dtype="object"))

        # convert index to series
        result = Series(i)
        assert_series_equal(result, expected)

        # assign to frame
        df['B'] = i
        result = df['B']
        assert_series_equal(result, expected, check_names=False)
        assert result.name == 'B'

        # keep the timezone
        # NOTE(review): to_series(keep_tz=...) was removed in modern pandas;
        # this block targets an older API -- confirm the pinned version.
        result = i.to_series(keep_tz=True)
        assert_series_equal(result.reset_index(drop=True), expected)

        # convert to utc
        df['C'] = i.to_series().reset_index(drop=True)
        result = df['C']
        comp = pd.DatetimeIndex(expected.values)
        comp = comp.tz_localize(None)
        tm.assert_numpy_array_equal(result.values, comp.values)

        # list of datetimes with a tz
        df['D'] = i.to_pydatetime()
        result = df['D']
        assert_series_equal(result, expected, check_names=False)
        assert result.name == 'D'

        # GH 6785
        # set the index manually
        import pytz
        df = DataFrame(
            [{'ts': datetime(2014, 4, 1, tzinfo=pytz.utc), 'foo': 1}])
        expected = df.set_index('ts')
        df.index = df['ts']
        df.pop('ts')
        assert_frame_equal(df, expected)
Example #2
0
    def test_convert_dti_to_series(self):
        """A tz-aware DatetimeIndex keeps its timezone when converted to a
        Series or assigned as a DataFrame column (GH 6032, GH 6785)."""
        # don't cast a DatetimeIndex WITH a tz, leave as object
        # GH 6032
        idx = DatetimeIndex(to_datetime(['2013-1-1 13:00',
                                         '2013-1-2 14:00']),
                            name='B').tz_localize('US/Pacific')
        df = DataFrame(np.random.randn(2, 1), columns=['A'])

        # object-dtype expectation preserves the tz-aware Timestamps
        expected = Series(np.array([Timestamp('2013-01-01 13:00:00-0800',
                                              tz='US/Pacific'),
                                    Timestamp('2013-01-02 14:00:00-0800',
                                              tz='US/Pacific')],
                                   dtype="object"), name='B')

        # convert index to series
        result = Series(idx)
        tm.assert_series_equal(result, expected)

        # assign to frame
        df['B'] = idx
        result = df['B']
        tm.assert_series_equal(result, expected)

        # convert to series while keeping the timezone
        # NOTE(review): keep_tz was deprecated/removed in later pandas --
        # confirm the version this suite is pinned to.
        result = idx.to_series(keep_tz=True, index=[0, 1])
        tm.assert_series_equal(result, expected)

        # convert to utc
        df['B'] = idx.to_series(index=[0, 1])
        result = df['B']
        comp = Series(DatetimeIndex(expected.values).tz_localize(None),
                      name='B')
        tm.assert_series_equal(result, comp)

        # list of datetimes with a tz
        df['B'] = idx.to_pydatetime()
        result = df['B']
        tm.assert_series_equal(result, expected)

        # GH 6785
        # set the index manually
        import pytz
        df = DataFrame(
            [{'ts': datetime(2014, 4, 1, tzinfo=pytz.utc), 'foo': 1}])
        expected = df.set_index('ts')
        df.index = df['ts']
        df.pop('ts')
        tm.assert_frame_equal(df, expected)
Example #3
0
    def test_pop(self):
        """Popping columns removes them; the popped Series is independent
        of the source frame (GH 10912)."""
        self.frame.columns.name = 'baz'

        self.frame.pop('A')
        self.assertNotIn('A', self.frame)

        self.frame['foo'] = 'bar'
        self.frame.pop('foo')
        self.assertNotIn('foo', self.frame)
        # TODO self.assertEqual(self.frame.columns.name, 'baz')

        # 10912: in-place ops on the popped column must not leak back
        # into the original frame through the block cache
        frame = DataFrame([[1, 2, 3], [4, 5, 6]],
                          columns=['A', 'B', 'C'], index=['X', 'Y'])
        popped = frame.pop('B')
        popped += 1

        # original frame is untouched
        assert_frame_equal(
            frame,
            DataFrame([[1, 3], [4, 6]], columns=['A', 'C'], index=['X', 'Y']))

        # popped column reflects the in-place add
        assert_series_equal(
            popped, Series([2, 5], index=['X', 'Y'], name='B') + 1)
Example #4
0
    def to_dataframe(self, selected_fields=None, excluded_fields=None):
        """Export this queryset as a pandas DataFrame.

        Parameters
        ----------
        selected_fields : iterable of str, optional
            Restrict the query to only these fields.
        excluded_fields : iterable of str, optional
            Fields to drop from the query; defaults to
            ``self.DEFAULT_EXCLUDED_FIELDS`` when not given.

        Returns
        -------
        DataFrame
            One row per document, with declared-but-empty form fields added
            as all-NaN columns, subdocument fields flattened into top-level
            columns, and a derived ``registered_voters`` column.
        """
        from ..services import locations

        if excluded_fields:
            qs = self.exclude(*excluded_fields)
        else:
            qs = self.exclude(*self.DEFAULT_EXCLUDED_FIELDS)
        if selected_fields:
            # NOTE(review): this rebinds qs from self, discarding the
            # exclude() applied above -- confirm that is intended.
            qs = self.only(*selected_fields)

        # NOTE(review): DataFrame.convert_objects was removed in pandas
        # >= 0.23; this code requires an old pandas pin -- confirm.
        df = DataFrame(list(qs.as_pymongo())).convert_objects(convert_numeric=True)
        if df.empty:
            return df

        # add fields with no values
        # (fields declared on the first document's form but missing from
        # the exported columns)
        fields = filter(
            lambda f: f not in df.columns,
            map(lambda field: field.name, [field for group in self.first().form.groups for field in group.fields]),
        )

        for field in fields:
            df[field] = Series(np.nan, index=df.index)

        # do cleanup of subdocument fields
        # each subdocument column becomes one column per subdocument key;
        # missing subdocuments are replaced by empty dicts so join() works
        for field in self.SUBDOCUMENT_FIELDS:
            temp = df.pop(field).tolist()
            temp2 = [i if not isnull(i) else {} for i in temp]
            df = df.join(DataFrame(temp2))

        rv_map = locations.registered_voters_map()

        # presumably df.location holds ids keyed in rv_map -- verify caller
        df["registered_voters"] = df.location.apply(lambda i: rv_map.get(i, 0))

        return df
    def test_pop(self):
        """pop removes the column and returns it as an independent Series."""
        self.frame.columns.name = 'baz'

        self.frame.pop('A')
        assert 'A' not in self.frame

        self.frame['foo'] = 'bar'
        self.frame.pop('foo')
        assert 'foo' not in self.frame
        # TODO assert self.frame.columns.name == 'baz'

        # gh-10912: in-place ops on the popped series must not be cached
        # back into the source frame
        frame = DataFrame([[1, 2, 3], [4, 5, 6]],
                          columns=['A', 'B', 'C'], index=['X', 'Y'])
        popped = frame.pop('B')
        popped += 1

        # source frame unchanged
        tm.assert_frame_equal(
            frame,
            DataFrame([[1, 3], [4, 6]], columns=['A', 'C'], index=['X', 'Y']))

        # popped series carries the in-place increment
        tm.assert_series_equal(
            popped, Series([2, 5], index=['X', 'Y'], name='B') + 1)
    def test_pop_non_unique_cols(self):
        df = DataFrame({0: [0, 1], 1: [0, 1], 2: [4, 5]})
        df.columns = ["a", "b", "a"]

        res = df.pop("a")
        assert type(res) == DataFrame
        assert len(res) == 2
        assert len(df.columns) == 1
        assert "b" in df.columns
        assert "a" not in df.columns
        assert len(df.index) == 2
Example #7
0
def get_hhi(dframe):
    """Compute the Herfindahl-Hirschman Index (HHI) for each row.

    Each row of *dframe* is treated as a set of market shares; the HHI of
    a row is the sum of squares of its strictly positive entries.  Values
    that are not ``> 0`` (including NaN) are ignored, matching the original
    per-row filter.

    Parameters
    ----------
    dframe : DataFrame
        Index values parseable by ``DatetimeIndex`` (months); columns hold
        per-firm shares.

    Returns
    -------
    Series
        HHI per row, named ``'hhi'``, indexed by a DatetimeIndex named
        ``'month'``.
    """
    # Mask out non-positive shares, then square and sum row-wise in
    # vectorized pandas ops instead of the original Python-level loop
    # (which also carried an unused `numfirms` local).
    positive = dframe.where(dframe > 0)
    hhi = positive.pow(2).sum(axis=1)
    hhi.index = DatetimeIndex(hhi.index, name='month')
    hhi.name = 'hhi'
    return hhi
Example #8
0
    def test_pop_non_unique_cols(self):
        df = DataFrame({0: [0, 1], 1: [0, 1], 2: [4, 5]})
        df.columns = ["a", "b", "a"]

        res = df.pop("a")
        self.assertEqual(type(res), DataFrame)
        self.assertEqual(len(res), 2)
        self.assertEqual(len(df.columns), 1)
        self.assertTrue("b" in df.columns)
        self.assertFalse("a" in df.columns)
        self.assertEqual(len(df.index), 2)
Example #9
0
    def __init__(self, map_):
        """Build the electrode map from a 1-D or 2-D channel array.

        Parameters
        ----------
        map_ : array_like
            2-D: one column per shank, one channel per row.
            1-D: a single shank.
        """
        map_ = np.atleast_1d(np.asanyarray(map_).squeeze())

        try:
            # 2-D input: label column j's channels with shank j.
            # NOTE(review): ravel() below flattens in C (row-major) order
            # while these labels assume column-major pairing -- confirm
            # the intended channel/shank correspondence.
            m, n = map_.shape
            s = np.repeat(np.arange(n), m)
        except ValueError:
            # 1-D input: everything on shank 1.
            m, = map_.shape
            s = np.ones(m, dtype=int)

        data = {'channel': map_.ravel(), 'shank': s}
        # DataFrame.sort() was removed from pandas (>= 0.20); use the
        # modern sort_values / sort_index equivalents. sort('shank')
        # sorted by the column; bare sort() sorted by the index.
        df = DataFrame(data).sort_values('shank').reset_index(drop=True)
        df.index = df.pop('channel')

        super(ElectrodeMap, self).__init__(df.sort_index())
Example #10
0
class TestJoin:
    def setup_method(self):
        """Build the join/merge fixtures shared by the TestJoin tests.

        Relies on module-level helpers (``get_test_data``, ``N``,
        ``NGROUPS``) defined elsewhere in this file.
        """
        # aggregate multiple columns
        self.df = DataFrame({
            "key1": get_test_data(),
            "key2": get_test_data(),
            "data1": np.random.randn(N),
            "data2": np.random.randn(N),
        })

        # exclude a couple keys for fun
        self.df = self.df[self.df["key2"] > 1]

        # smaller frame with fewer key groups for many-to-one joins
        self.df2 = DataFrame({
            "key1":
            get_test_data(n=N // 5),
            "key2":
            get_test_data(ngroups=NGROUPS // 2, n=N // 5),
            "value":
            np.random.randn(N // 5),
        })

        # NOTE(review): tm.getMixedTypeDict was removed from newer pandas
        # test utilities -- confirm the pinned pandas version.
        index, data = tm.getMixedTypeDict()
        self.target = DataFrame(data, index=index)

        # Join on string value
        self.source = DataFrame({
            "MergedA": data["A"],
            "MergedD": data["D"]
        },
                                index=data["C"])

    def test_left_outer_join(self):
        """Left-validate merges on one key and on both shared keys."""
        result_single = merge(self.df, self.df2, on="key2")
        _check_join(self.df, self.df2, result_single, ["key2"], how="left")

        result_multi = merge(self.df, self.df2)
        _check_join(
            self.df, self.df2, result_multi, ["key1", "key2"], how="left")

    def test_right_outer_join(self):
        """Right joins on a single key and on both shared keys."""
        result_single = merge(self.df, self.df2, on="key2", how="right")
        _check_join(self.df, self.df2, result_single, ["key2"], how="right")

        result_multi = merge(self.df, self.df2, how="right")
        _check_join(
            self.df, self.df2, result_multi, ["key1", "key2"], how="right")

    def test_full_outer_join(self):
        """Outer joins on a single key and on both shared keys."""
        result_single = merge(self.df, self.df2, on="key2", how="outer")
        _check_join(self.df, self.df2, result_single, ["key2"], how="outer")

        result_multi = merge(self.df, self.df2, how="outer")
        _check_join(
            self.df, self.df2, result_multi, ["key1", "key2"], how="outer")

    def test_inner_join(self):
        """Inner joins on a single key and on both shared keys."""
        result_single = merge(self.df, self.df2, on="key2", how="inner")
        _check_join(self.df, self.df2, result_single, ["key2"], how="inner")

        result_multi = merge(self.df, self.df2, how="inner")
        _check_join(
            self.df, self.df2, result_multi, ["key1", "key2"], how="inner")

    def test_handle_overlap(self):
        """Overlapping non-key columns receive the supplied suffixes."""
        merged = merge(self.df, self.df2, on="key2",
                       suffixes=(".foo", ".bar"))

        assert "key1.foo" in merged
        assert "key1.bar" in merged

    def test_handle_overlap_arbitrary_key(self):
        """Suffixes also apply when joining on different key columns."""
        merged = merge(
            self.df,
            self.df2,
            left_on="key2",
            right_on="key1",
            suffixes=(".foo", ".bar"),
        )
        assert "key1.foo" in merged
        assert "key2.bar" in merged

    def test_join_on(self):
        target = self.target
        source = self.source

        merged = target.join(source, on="C")
        tm.assert_series_equal(merged["MergedA"],
                               target["A"],
                               check_names=False)
        tm.assert_series_equal(merged["MergedD"],
                               target["D"],
                               check_names=False)

        # join with duplicates (fix regression from DataFrame/Matrix merge)
        df = DataFrame({"key": ["a", "a", "b", "b", "c"]})
        df2 = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"])
        joined = df.join(df2, on="key")
        expected = DataFrame({
            "key": ["a", "a", "b", "b", "c"],
            "value": [0, 0, 1, 1, 2]
        })
        tm.assert_frame_equal(joined, expected)

        # Test when some are missing
        df_a = DataFrame([[1], [2], [3]],
                         index=["a", "b", "c"],
                         columns=["one"])
        df_b = DataFrame([["foo"], ["bar"]], index=[1, 2], columns=["two"])
        df_c = DataFrame([[1], [2]], index=[1, 2], columns=["three"])
        joined = df_a.join(df_b, on="one")
        joined = joined.join(df_c, on="one")
        assert np.isnan(joined["two"]["c"])
        assert np.isnan(joined["three"]["c"])

        # merge column not p resent
        with pytest.raises(KeyError, match="^'E'$"):
            target.join(source, on="E")

        # overlap
        source_copy = source.copy()
        source_copy["A"] = 0
        msg = ("You are trying to merge on float64 and object columns. If "
               "you wish to proceed you should use pd.concat")
        with pytest.raises(ValueError, match=msg):
            target.join(source_copy, on="A")

    def test_join_on_fails_with_different_right_index(self):
        df = DataFrame({
            "a": np.random.choice(["m", "f"], size=3),
            "b": np.random.randn(3)
        })
        df2 = DataFrame(
            {
                "a": np.random.choice(["m", "f"], size=10),
                "b": np.random.randn(10)
            },
            index=tm.makeCustomIndex(10, 2),
        )
        msg = r'len\(left_on\) must equal the number of levels in the index of "right"'
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, left_on="a", right_index=True)

    def test_join_on_fails_with_different_left_index(self):
        df = DataFrame(
            {
                "a": np.random.choice(["m", "f"], size=3),
                "b": np.random.randn(3)
            },
            index=tm.makeCustomIndex(3, 2),
        )
        df2 = DataFrame({
            "a": np.random.choice(["m", "f"], size=10),
            "b": np.random.randn(10)
        })
        msg = r'len\(right_on\) must equal the number of levels in the index of "left"'
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, right_on="b", left_index=True)

    def test_join_on_fails_with_different_column_counts(self):
        df = DataFrame({
            "a": np.random.choice(["m", "f"], size=3),
            "b": np.random.randn(3)
        })
        df2 = DataFrame(
            {
                "a": np.random.choice(["m", "f"], size=10),
                "b": np.random.randn(10)
            },
            index=tm.makeCustomIndex(10, 2),
        )
        msg = r"len\(right_on\) must equal len\(left_on\)"
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, right_on="a", left_on=["a", "b"])

    @pytest.mark.parametrize("wrong_type", [2, "str", None, np.array([0, 1])])
    def test_join_on_fails_with_wrong_object_type(self, wrong_type):
        # GH12081 - original issue

        # GH21220 - merging of Series and DataFrame is now allowed
        # Edited test to remove the Series object from test parameters

        df = DataFrame({"a": [1, 1]})
        msg = ("Can only merge Series or DataFrame objects, "
               f"a {type(wrong_type)} was passed")
        with pytest.raises(TypeError, match=msg):
            merge(wrong_type, df, left_on="a", right_on="a")
        with pytest.raises(TypeError, match=msg):
            merge(df, wrong_type, left_on="a", right_on="a")

    def test_join_on_pass_vector(self):
        expected = self.target.join(self.source, on="C")
        del expected["C"]

        join_col = self.target.pop("C")
        result = self.target.join(self.source, on=join_col)
        tm.assert_frame_equal(result, expected)

    def test_join_with_len0(self):
        # nothing to merge
        merged = self.target.join(self.source.reindex([]), on="C")
        for col in self.source:
            assert col in merged
            assert merged[col].isna().all()

        merged2 = self.target.join(self.source.reindex([]),
                                   on="C",
                                   how="inner")
        tm.assert_index_equal(merged2.columns, merged.columns)
        assert len(merged2) == 0

    def test_join_on_inner(self):
        df = DataFrame({"key": ["a", "a", "d", "b", "b", "c"]})
        df2 = DataFrame({"value": [0, 1]}, index=["a", "b"])

        joined = df.join(df2, on="key", how="inner")

        expected = df.join(df2, on="key")
        expected = expected[expected["value"].notna()]
        tm.assert_series_equal(joined["key"], expected["key"])
        tm.assert_series_equal(joined["value"],
                               expected["value"],
                               check_dtype=False)
        tm.assert_index_equal(joined.index, expected.index)

    def test_join_on_singlekey_list(self):
        df = DataFrame({"key": ["a", "a", "b", "b", "c"]})
        df2 = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"])

        # corner cases
        joined = df.join(df2, on=["key"])
        expected = df.join(df2, on="key")

        tm.assert_frame_equal(joined, expected)

    def test_join_on_series(self):
        result = self.target.join(self.source["MergedA"], on="C")
        expected = self.target.join(self.source[["MergedA"]], on="C")
        tm.assert_frame_equal(result, expected)

    def test_join_on_series_buglet(self):
        """Joining a column against a named Series works (GH #638)."""
        left = DataFrame({"a": [1, 1]})
        right = Series([2], index=[1], name="b")
        result = left.join(right, on="a")
        expected = DataFrame({"a": [1, 1], "b": [2, 2]}, index=left.index)
        tm.assert_frame_equal(result, expected)

    def test_join_index_mixed(self, join_type):
        # no overlapping blocks
        df1 = DataFrame(index=np.arange(10))
        df1["bool"] = True
        df1["string"] = "foo"

        df2 = DataFrame(index=np.arange(5, 15))
        df2["int"] = 1
        df2["float"] = 1.0

        joined = df1.join(df2, how=join_type)
        expected = _join_by_hand(df1, df2, how=join_type)
        tm.assert_frame_equal(joined, expected)

        joined = df2.join(df1, how=join_type)
        expected = _join_by_hand(df2, df1, how=join_type)
        tm.assert_frame_equal(joined, expected)

    def test_join_index_mixed_overlap(self):
        df1 = DataFrame(
            {
                "A": 1.0,
                "B": 2,
                "C": "foo",
                "D": True
            },
            index=np.arange(10),
            columns=["A", "B", "C", "D"],
        )
        assert df1["B"].dtype == np.int64
        assert df1["D"].dtype == np.bool_

        df2 = DataFrame(
            {
                "A": 1.0,
                "B": 2,
                "C": "foo",
                "D": True
            },
            index=np.arange(0, 10, 2),
            columns=["A", "B", "C", "D"],
        )

        # overlap
        joined = df1.join(df2, lsuffix="_one", rsuffix="_two")
        expected_columns = [
            "A_one",
            "B_one",
            "C_one",
            "D_one",
            "A_two",
            "B_two",
            "C_two",
            "D_two",
        ]
        df1.columns = expected_columns[:4]
        df2.columns = expected_columns[4:]
        expected = _join_by_hand(df1, df2)
        tm.assert_frame_equal(joined, expected)

    def test_join_empty_bug(self):
        """Outer-joining onto an empty frame must not raise (0.4.3 bug)."""
        empty = DataFrame()
        empty.join(DataFrame([3], index=[0], columns=["A"]), how="outer")

    def test_join_unconsolidated(self):
        """Joining frames with unconsolidated blocks works (GH #331)."""
        left = DataFrame(np.random.randn(30, 2), columns=["a", "b"])
        extra = Series(np.random.randn(30))
        left["c"] = extra  # adds a new block, leaving `left` unconsolidated
        right = DataFrame(np.random.randn(30, 1), columns=["q"])

        # both directions must simply not raise
        left.join(right)
        right.join(left)

    def test_join_multiindex(self):
        index1 = MultiIndex.from_arrays(
            [["a", "a", "a", "b", "b", "b"], [1, 2, 3, 1, 2, 3]],
            names=["first", "second"],
        )

        index2 = MultiIndex.from_arrays(
            [["b", "b", "b", "c", "c", "c"], [1, 2, 3, 1, 2, 3]],
            names=["first", "second"],
        )

        df1 = DataFrame(data=np.random.randn(6),
                        index=index1,
                        columns=["var X"])
        df2 = DataFrame(data=np.random.randn(6),
                        index=index2,
                        columns=["var Y"])

        df1 = df1.sort_index(level=0)
        df2 = df2.sort_index(level=0)

        joined = df1.join(df2, how="outer")
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names
        tm.assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

        df1 = df1.sort_index(level=1)
        df2 = df2.sort_index(level=1)

        joined = df1.join(df2, how="outer").sort_index(level=0)
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names

        tm.assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

    def test_join_inner_multiindex(self,
                                   lexsorted_two_level_string_multiindex):
        key1 = [
            "bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux",
            "snap"
        ]
        key2 = [
            "two",
            "one",
            "three",
            "one",
            "two",
            "one",
            "two",
            "two",
            "three",
            "one",
        ]

        data = np.random.randn(len(key1))
        data = DataFrame({"key1": key1, "key2": key2, "data": data})

        index = lexsorted_two_level_string_multiindex
        to_join = DataFrame(np.random.randn(10, 3),
                            index=index,
                            columns=["j_one", "j_two", "j_three"])

        joined = data.join(to_join, on=["key1", "key2"], how="inner")
        expected = merge(
            data,
            to_join.reset_index(),
            left_on=["key1", "key2"],
            right_on=["first", "second"],
            how="inner",
            sort=False,
        )

        expected2 = merge(
            to_join,
            data,
            right_on=["key1", "key2"],
            left_index=True,
            how="inner",
            sort=False,
        )
        tm.assert_frame_equal(joined, expected2.reindex_like(joined))

        expected2 = merge(
            to_join,
            data,
            right_on=["key1", "key2"],
            left_index=True,
            how="inner",
            sort=False,
        )

        expected = expected.drop(["first", "second"], axis=1)
        expected.index = joined.index

        assert joined.index.is_monotonic_increasing
        tm.assert_frame_equal(joined, expected)

        # _assert_same_contents(expected, expected2.loc[:, expected.columns])

    def test_join_hierarchical_mixed(self):
        # GH 2024
        df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=["a", "b", "c"])
        new_df = df.groupby(["a"]).agg({"b": [np.mean, np.sum]})
        other_df = DataFrame([(1, 2, 3), (7, 10, 6)], columns=["a", "b", "d"])
        other_df.set_index("a", inplace=True)
        # GH 9455, 12219
        msg = "merging between different levels is deprecated"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            result = merge(new_df, other_df, left_index=True, right_index=True)
        assert ("b", "mean") in result
        assert "b" in result

    def test_join_float64_float32(self):

        a = DataFrame(np.random.randn(10, 2),
                      columns=["a", "b"],
                      dtype=np.float64)
        b = DataFrame(np.random.randn(10, 1), columns=["c"], dtype=np.float32)
        joined = a.join(b)
        assert joined.dtypes["a"] == "float64"
        assert joined.dtypes["b"] == "float64"
        assert joined.dtypes["c"] == "float32"

        a = np.random.randint(0, 5, 100).astype("int64")
        b = np.random.random(100).astype("float64")
        c = np.random.random(100).astype("float32")
        df = DataFrame({"a": a, "b": b, "c": c})
        xpdf = DataFrame({"a": a, "b": b, "c": c})
        s = DataFrame(np.random.random(5).astype("float32"), columns=["md"])
        rs = df.merge(s, left_on="a", right_index=True)
        assert rs.dtypes["a"] == "int64"
        assert rs.dtypes["b"] == "float64"
        assert rs.dtypes["c"] == "float32"
        assert rs.dtypes["md"] == "float32"

        xp = xpdf.merge(s, left_on="a", right_index=True)
        tm.assert_frame_equal(rs, xp)

    def test_join_many_non_unique_index(self):
        df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
        df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
        df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])

        result = idf1.join([idf2, idf3], how="outer")

        df_partially_merged = merge(df1, df2, on=["a", "b"], how="outer")
        expected = merge(df_partially_merged, df3, on=["a", "b"], how="outer")

        result = result.reset_index()
        expected = expected[result.columns]
        expected["a"] = expected.a.astype("int64")
        expected["b"] = expected.b.astype("int64")
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
        df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
        df3 = DataFrame({
            "a": [1, 1, 1],
            "b": [1, 1, 2],
            "e": [1000, 2000, 3000]
        })
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])
        result = idf1.join([idf2, idf3], how="inner")

        df_partially_merged = merge(df1, df2, on=["a", "b"], how="inner")
        expected = merge(df_partially_merged, df3, on=["a", "b"], how="inner")

        result = result.reset_index()

        tm.assert_frame_equal(result, expected.loc[:, result.columns])

        # GH 11519
        df = DataFrame({
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C":
            np.random.randn(8),
            "D":
            np.random.randn(8),
        })
        s = Series(np.repeat(np.arange(8), 2),
                   index=np.repeat(np.arange(8), 2),
                   name="TEST")
        inner = df.join(s, how="inner")
        outer = df.join(s, how="outer")
        left = df.join(s, how="left")
        right = df.join(s, how="right")
        tm.assert_frame_equal(inner, outer)
        tm.assert_frame_equal(inner, left)
        tm.assert_frame_equal(inner, right)

    def test_join_sort(self):
        left = DataFrame({
            "key": ["foo", "bar", "baz", "foo"],
            "value": [1, 2, 3, 4]
        })
        right = DataFrame({"value2": ["a", "b", "c"]},
                          index=["bar", "baz", "foo"])

        joined = left.join(right, on="key", sort=True)
        expected = DataFrame(
            {
                "key": ["bar", "baz", "foo", "foo"],
                "value": [2, 3, 1, 4],
                "value2": ["a", "b", "c", "c"],
            },
            index=[1, 2, 0, 3],
        )
        tm.assert_frame_equal(joined, expected)

        # smoke test
        joined = left.join(right, on="key", sort=False)
        tm.assert_index_equal(joined.index, Index(range(4)), exact=True)

    def test_join_mixed_non_unique_index(self):
        # GH 12814, unorderable types in py3 with a non-unique index
        df1 = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 3, "a"])
        df2 = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 3, 3, 4])
        result = df1.join(df2)
        expected = DataFrame(
            {
                "a": [1, 2, 3, 3, 4],
                "b": [5, np.nan, 6, 7, np.nan]
            },
            index=[1, 2, 3, 3, "a"],
        )
        tm.assert_frame_equal(result, expected)

        df3 = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 2, "a"])
        df4 = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 2, 3, 4])
        result = df3.join(df4)
        expected = DataFrame({
            "a": [1, 2, 3, 4],
            "b": [5, 6, 6, np.nan]
        },
                             index=[1, 2, 2, "a"])
        tm.assert_frame_equal(result, expected)

    def test_join_non_unique_period_index(self):
        # GH #16871
        index = pd.period_range("2016-01-01", periods=16, freq="M")
        df = DataFrame(list(range(len(index))), index=index, columns=["pnum"])
        df2 = concat([df, df])
        result = df.join(df2, how="inner", rsuffix="_df2")
        expected = DataFrame(
            np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2),
            columns=["pnum", "pnum_df2"],
            index=df2.sort_index().index,
        )
        tm.assert_frame_equal(result, expected)

    def test_mixed_type_join_with_suffix(self):
        # GH #916
        df = DataFrame(np.random.randn(20, 6),
                       columns=["a", "b", "c", "d", "e", "f"])
        df.insert(0, "id", 0)
        df.insert(5, "dt", "foo")

        grouped = df.groupby("id")
        mn = grouped.mean()
        cn = grouped.count()

        # it works!
        mn.join(cn, rsuffix="_right")

    def test_join_many(self):
        df = DataFrame(np.random.randn(10, 6), columns=list("abcdef"))
        df_list = [df[["a", "b"]], df[["c", "d"]], df[["e", "f"]]]

        joined = df_list[0].join(df_list[1:])
        tm.assert_frame_equal(joined, df)

        df_list = [
            df[["a", "b"]][:-2], df[["c", "d"]][2:], df[["e", "f"]][1:9]
        ]

        def _check_diff_index(df_list, result, exp_index):
            reindexed = [x.reindex(exp_index) for x in df_list]
            expected = reindexed[0].join(reindexed[1:])
            tm.assert_frame_equal(result, expected)

        # different join types
        joined = df_list[0].join(df_list[1:], how="outer")
        _check_diff_index(df_list, joined, df.index)

        joined = df_list[0].join(df_list[1:])
        _check_diff_index(df_list, joined, df_list[0].index)

        joined = df_list[0].join(df_list[1:], how="inner")
        _check_diff_index(df_list, joined, df.index[2:8])

        msg = "Joining multiple DataFrames only supported for joining on index"
        with pytest.raises(ValueError, match=msg):
            df_list[0].join(df_list[1:], on="a")

    def test_join_many_mixed(self):
        """Joining a list of column slices reassembles the original frame."""
        df = DataFrame(np.random.randn(8, 4), columns=["A", "B", "C", "D"])
        df["key"] = ["foo", "bar"] * 4
        pieces = [df.loc[:, cols]
                  for cols in (["A", "B"], ["C", "D"], ["key"])]

        result = pieces[0].join(pieces[1:])
        tm.assert_frame_equal(result, df)

    def test_join_dups(self):

        # joining dups
        df = concat(
            [
                DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"
                                                           ]),
                DataFrame(np.random.randint(0, 10, size=20).reshape(10, 2),
                          columns=["A", "C"]),
            ],
            axis=1,
        )

        expected = concat([df, df], axis=1)
        result = df.join(df, rsuffix="_2")
        result.columns = expected.columns
        tm.assert_frame_equal(result, expected)

        # GH 4975, invalid join on dups
        w = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        x = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        y = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        z = DataFrame(np.random.randn(4, 2), columns=["x", "y"])

        dta = x.merge(y, left_index=True,
                      right_index=True).merge(z,
                                              left_index=True,
                                              right_index=True,
                                              how="outer")
        with tm.assert_produces_warning(FutureWarning):
            dta = dta.merge(w, left_index=True, right_index=True)
        expected = concat([x, y, z, w], axis=1)
        expected.columns = [
            "x_x", "y_x", "x_y", "y_y", "x_x", "y_x", "x_y", "y_y"
        ]
        tm.assert_frame_equal(dta, expected)

    def test_join_multi_to_multi(self, join_type):
        # GH 20475
        leftindex = MultiIndex.from_product(
            [list("abc"), list("xy"), [1, 2]], names=["abc", "xy", "num"])
        left = DataFrame({"v1": range(12)}, index=leftindex)

        rightindex = MultiIndex.from_product(
            [list("abc"), list("xy")], names=["abc", "xy"])
        right = DataFrame({"v2": [100 * i for i in range(1, 7)]},
                          index=rightindex)

        result = left.join(right, on=["abc", "xy"], how=join_type)
        expected = (left.reset_index().merge(right.reset_index(),
                                             on=["abc", "xy"],
                                             how=join_type).set_index(
                                                 ["abc", "xy", "num"]))
        tm.assert_frame_equal(expected, result)

        msg = r'len\(left_on\) must equal the number of levels in the index of "right"'
        with pytest.raises(ValueError, match=msg):
            left.join(right, on="xy", how=join_type)

        with pytest.raises(ValueError, match=msg):
            right.join(left, on=["abc", "xy"], how=join_type)

    def test_join_on_tz_aware_datetimeindex(self):
        # GH 23931, 26335
        df1 = DataFrame({
            "date":
            pd.date_range(start="2018-01-01", periods=5, tz="America/Chicago"),
            "vals":
            list("abcde"),
        })

        df2 = DataFrame({
            "date":
            pd.date_range(start="2018-01-03", periods=5, tz="America/Chicago"),
            "vals_2":
            list("tuvwx"),
        })
        result = df1.join(df2.set_index("date"), on="date")
        expected = df1.copy()
        expected["vals_2"] = Series([np.nan] * 2 + list("tuv"), dtype=object)
        tm.assert_frame_equal(result, expected)

    def test_join_datetime_string(self):
        # GH 5647
        dfa = DataFrame(
            [
                ["2012-08-02", "L", 10],
                ["2012-08-02", "J", 15],
                ["2013-04-06", "L", 20],
                ["2013-04-06", "J", 25],
            ],
            columns=["x", "y", "a"],
        )
        dfa["x"] = pd.to_datetime(dfa["x"])
        dfb = DataFrame(
            [["2012-08-02", "J", 1], ["2013-04-06", "L", 2]],
            columns=["x", "y", "z"],
            index=[2, 4],
        )
        dfb["x"] = pd.to_datetime(dfb["x"])
        result = dfb.join(dfa.set_index(["x", "y"]), on=["x", "y"])
        expected = DataFrame(
            [
                [Timestamp("2012-08-02 00:00:00"), "J", 1, 15],
                [Timestamp("2013-04-06 00:00:00"), "L", 2, 20],
            ],
            index=[2, 4],
            columns=["x", "y", "z", "a"],
        )
        tm.assert_frame_equal(result, expected)
Example #11
0
class TestJoin:
    def setup_method(self, method):
        # Fixture frames shared by the join tests below.
        # ``df``/``df2``: overlapping integer key columns of different sizes
        # and group counts, for the merge-based tests.
        self.df = DataFrame(
            {
                'key1': get_test_data(),
                'key2': get_test_data(),
                'data1': np.random.randn(N),
                'data2': np.random.randn(N),
            }
        )

        # exclude a couple keys for fun
        self.df = self.df[self.df['key2'] > 1]

        self.df2 = DataFrame(
            {
                'key1': get_test_data(n=N // 5),
                'key2': get_test_data(ngroups=NGROUPS // 2, n=N // 5),
                'value': np.random.randn(N // 5),
            }
        )

        index, data = tm.getMixedTypeDict()
        self.target = DataFrame(data, index=index)

        # ``target``/``source``: target's string column 'C' maps into
        # source's index, for the join-on-column tests.
        self.source = DataFrame(
            {'MergedA': data['A'], 'MergedD': data['D']}, index=data['C']
        )

    def test_cython_left_outer_join(self):
        """Drive the low-level Cython left-outer-join indexer directly.

        ``libjoin.left_outer_join`` returns positional indexers (ls, rs)
        into the left/right key arrays; the expected tables below are
        hand-computed from the stable sort order, with -1 marking left rows
        that have no match on the right.
        """
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
        max_group = 5

        ls, rs = libjoin.left_outer_join(left, right, max_group)

        # mergesort is stable, so argsort reproduces the order in which the
        # join algorithm visits equal keys
        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        # positions into the sorted arrays; key 3 exists only on the left,
        # hence the trailing -1 entries in exp_ri
        exp_li = a_(
            [0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 10])
        exp_ri = a_(
            [0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5, -1, -1])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        # check_dtype=False: the indexer dtype presumably varies by
        # platform/implementation — only the positions matter here
        tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_cython_right_outer_join(self):
        """Right outer join expressed via ``left_outer_join`` with the
        argument order swapped; -1 marks right keys (here 4) that have no
        match on the left. Expected tables are hand-computed as in
        ``test_cython_left_outer_join``.
        """
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
        max_group = 5

        rs, ls = libjoin.left_outer_join(right, left, max_group)

        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        #            0        1        1        1
        exp_li = a_([
            0,
            1,
            2,
            3,
            4,
            5,
            3,
            4,
            5,
            3,
            4,
            5,
            #            2        2        4
            6,
            7,
            8,
            6,
            7,
            8,
            -1
        ])
        exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_cython_inner_join(self):
        """Inner join via the raw Cython indexer: keys present on only one
        side (3 on the left, 4 on the right) drop out of the result, so no
        -1 entries appear in the expected tables.
        """
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64)
        max_group = 5

        ls, rs = libjoin.inner_join(left, right, max_group)

        # stable sort order determines how ties are enumerated
        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8])
        exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_left_outer_join(self):
        # Left join on the single shared key.
        on_key2 = merge(self.df, self.df2, on='key2')
        _check_join(self.df, self.df2, on_key2, ['key2'], how='left')

        # Left join on every common column.
        on_both = merge(self.df, self.df2)
        _check_join(self.df, self.df2, on_both, ['key1', 'key2'], how='left')

    def test_right_outer_join(self):
        # Right join on the single shared key.
        on_key2 = merge(self.df, self.df2, on='key2', how='right')
        _check_join(self.df, self.df2, on_key2, ['key2'], how='right')

        # Right join on every common column.
        on_both = merge(self.df, self.df2, how='right')
        _check_join(self.df, self.df2, on_both, ['key1', 'key2'], how='right')

    def test_full_outer_join(self):
        # Full outer join on the single shared key.
        on_key2 = merge(self.df, self.df2, on='key2', how='outer')
        _check_join(self.df, self.df2, on_key2, ['key2'], how='outer')

        # Full outer join on every common column.
        on_both = merge(self.df, self.df2, how='outer')
        _check_join(self.df, self.df2, on_both, ['key1', 'key2'], how='outer')

    def test_inner_join(self):
        # Inner join on the single shared key.
        on_key2 = merge(self.df, self.df2, on='key2', how='inner')
        _check_join(self.df, self.df2, on_key2, ['key2'], how='inner')

        # Inner join on every common column.
        on_both = merge(self.df, self.df2, how='inner')
        _check_join(self.df, self.df2, on_both, ['key1', 'key2'], how='inner')

    def test_handle_overlap(self):
        # Overlapping non-key columns receive the caller-supplied suffixes.
        merged = merge(self.df, self.df2, on='key2', suffixes=['.foo', '.bar'])

        assert 'key1.foo' in merged
        assert 'key1.bar' in merged

    def test_handle_overlap_arbitrary_key(self):
        # Suffixes also apply when the join keys themselves collide with
        # columns on the other side.
        merged = merge(
            self.df,
            self.df2,
            left_on='key2',
            right_on='key1',
            suffixes=['.foo', '.bar'],
        )
        assert 'key1.foo' in merged
        assert 'key2.bar' in merged

    def test_join_on(self):
        """DataFrame.join(..., on=column): happy path, duplicate keys,
        partially missing keys, a non-existent join column (KeyError), and
        incompatible key dtypes (ValueError)."""
        target = self.target
        source = self.source

        merged = target.join(source, on='C')
        tm.assert_series_equal(merged['MergedA'],
                               target['A'],
                               check_names=False)
        tm.assert_series_equal(merged['MergedD'],
                               target['D'],
                               check_names=False)

        # join with duplicates (fix regression from DataFrame/Matrix merge)
        df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])
        joined = df.join(df2, on='key')
        expected = DataFrame({
            'key': ['a', 'a', 'b', 'b', 'c'],
            'value': [0, 0, 1, 1, 2]
        })
        assert_frame_equal(joined, expected)

        # Test when some are missing
        df_a = DataFrame([[1], [2], [3]],
                         index=['a', 'b', 'c'],
                         columns=['one'])
        df_b = DataFrame([['foo'], ['bar']], index=[1, 2], columns=['two'])
        df_c = DataFrame([[1], [2]], index=[1, 2], columns=['three'])
        joined = df_a.join(df_b, on='one')
        joined = joined.join(df_c, on='one')
        assert np.isnan(joined['two']['c'])
        assert np.isnan(joined['three']['c'])

        # merge column not present
        with pytest.raises(KeyError, match="^'E'$"):
            target.join(source, on='E')

        # overlap: float keys joined against a string index must raise
        source_copy = source.copy()
        source_copy['A'] = 0
        msg = ("You are trying to merge on float64 and object columns. If"
               " you wish to proceed you should use pd.concat")
        with pytest.raises(ValueError, match=msg):
            target.join(source_copy, on='A')

    def test_join_on_fails_with_different_right_index(self):
        """A single ``left_on`` key against a two-level right index must
        raise: key count has to match the right index's level count."""
        df = DataFrame({
            'a': np.random.choice(['m', 'f'], size=3),
            'b': np.random.randn(3)
        })
        df2 = DataFrame(
            {
                'a': np.random.choice(['m', 'f'], size=10),
                'b': np.random.randn(10)
            },
            index=tm.makeCustomIndex(10, 2))
        msg = (r'len\(left_on\) must equal the number of levels in the index'
               ' of "right"')
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, left_on='a', right_index=True)

    def test_join_on_fails_with_different_left_index(self):
        """Mirror case: a single ``right_on`` key against a two-level left
        index must raise."""
        df = DataFrame(
            {
                'a': np.random.choice(['m', 'f'], size=3),
                'b': np.random.randn(3)
            },
            index=tm.makeCustomIndex(3, 2))
        df2 = DataFrame({
            'a': np.random.choice(['m', 'f'], size=10),
            'b': np.random.randn(10)
        })
        msg = (r'len\(right_on\) must equal the number of levels in the index'
               ' of "left"')
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, right_on='b', left_index=True)

    def test_join_on_fails_with_different_column_counts(self):
        """``left_on`` and ``right_on`` key lists of different lengths must
        raise."""
        df = DataFrame({
            'a': np.random.choice(['m', 'f'], size=3),
            'b': np.random.randn(3)
        })
        df2 = DataFrame(
            {
                'a': np.random.choice(['m', 'f'], size=10),
                'b': np.random.randn(10)
            },
            index=tm.makeCustomIndex(10, 2))
        msg = r"len\(right_on\) must equal len\(left_on\)"
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, right_on='a', left_on=['a', 'b'])

    @pytest.mark.parametrize("wrong_type", [2, 'str', None, np.array([0, 1])])
    def test_join_on_fails_with_wrong_object_type(self, wrong_type):
        """Merging anything other than a Series/DataFrame raises TypeError,
        in either argument position."""
        # GH12081 - original issue

        # GH21220 - merging of Series and DataFrame is now allowed
        # Edited test to remove the Series object from test parameters

        df = DataFrame({'a': [1, 1]})
        msg = ("Can only merge Series or DataFrame objects, a {} was passed".
               format(str(type(wrong_type))))
        with pytest.raises(TypeError, match=msg):
            merge(wrong_type, df, left_on='a', right_on='a')
        with pytest.raises(TypeError, match=msg):
            merge(df, wrong_type, left_on='a', right_on='a')

    def test_join_on_pass_vector(self):
        expected = self.target.join(self.source, on='C')
        del expected['C']

        join_col = self.target.pop('C')
        result = self.target.join(self.source, on=join_col)
        assert_frame_equal(result, expected)

    def test_join_with_len0(self):
        # nothing to merge
        merged = self.target.join(self.source.reindex([]), on='C')
        for col in self.source:
            assert col in merged
            assert merged[col].isna().all()

        merged2 = self.target.join(self.source.reindex([]),
                                   on='C',
                                   how='inner')
        tm.assert_index_equal(merged2.columns, merged.columns)
        assert len(merged2) == 0

    def test_join_on_inner(self):
        df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1]}, index=['a', 'b'])

        joined = df.join(df2, on='key', how='inner')

        expected = df.join(df2, on='key')
        expected = expected[expected['value'].notna()]
        tm.assert_series_equal(joined['key'],
                               expected['key'],
                               check_dtype=False)
        tm.assert_series_equal(joined['value'],
                               expected['value'],
                               check_dtype=False)
        tm.assert_index_equal(joined.index, expected.index)

    def test_join_on_singlekey_list(self):
        df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])

        # corner cases
        joined = df.join(df2, on=['key'])
        expected = df.join(df2, on='key')

        assert_frame_equal(joined, expected)

    def test_join_on_series(self):
        result = self.target.join(self.source['MergedA'], on='C')
        expected = self.target.join(self.source[['MergedA']], on='C')
        assert_frame_equal(result, expected)

    def test_join_on_series_buglet(self):
        # GH #638
        df = DataFrame({'a': [1, 1]})
        ds = Series([2], index=[1], name='b')
        result = df.join(ds, on='a')
        expected = DataFrame({'a': [1, 1], 'b': [2, 2]}, index=df.index)
        tm.assert_frame_equal(result, expected)

    def test_join_index_mixed(self, join_type):
        # Frames with no overlapping blocks (bool/object vs int/float),
        # joined both ways round.
        left = DataFrame(index=np.arange(10))
        left['bool'] = True
        left['string'] = 'foo'

        right = DataFrame(index=np.arange(5, 15))
        right['int'] = 1
        right['float'] = 1.

        for a, b in ((left, right), (right, left)):
            result = a.join(b, how=join_type)
            expected = _join_by_hand(a, b, how=join_type)
            assert_frame_equal(result, expected)

    def test_join_index_mixed_overlap(self):
        left = DataFrame(
            {'A': 1., 'B': 2, 'C': 'foo', 'D': True},
            index=np.arange(10),
            columns=['A', 'B', 'C', 'D'],
        )
        assert left['B'].dtype == np.int64
        assert left['D'].dtype == np.bool_

        right = DataFrame(
            {'A': 1., 'B': 2, 'C': 'foo', 'D': True},
            index=np.arange(0, 10, 2),
            columns=['A', 'B', 'C', 'D'],
        )

        # Every column name collides, so every one gets a suffix.
        joined = left.join(right, lsuffix='_one', rsuffix='_two')
        expected_columns = [
            'A_one', 'B_one', 'C_one', 'D_one',
            'A_two', 'B_two', 'C_two', 'D_two',
        ]
        left.columns = expected_columns[:4]
        right.columns = expected_columns[4:]
        expected = _join_by_hand(left, right)
        assert_frame_equal(joined, expected)

    def test_join_empty_bug(self):
        # generated an exception in 0.4.3
        x = DataFrame()
        x.join(DataFrame([3], index=[0], columns=['A']), how='outer')

    def test_join_unconsolidated(self):
        # GH #331
        a = DataFrame(randn(30, 2), columns=['a', 'b'])
        c = Series(randn(30))
        a['c'] = c
        d = DataFrame(randn(30, 1), columns=['q'])

        # it works!
        a.join(d)
        d.join(a)

    def test_join_multiindex(self):
        """Outer join of two MultiIndexed frames with partially overlapping
        keys, checked against an explicit reindex-to-the-union construction;
        the index names must survive regardless of which level the inputs
        were sorted on."""
        index1 = MultiIndex.from_arrays(
            [['a', 'a', 'a', 'b', 'b', 'b'], [1, 2, 3, 1, 2, 3]],
            names=['first', 'second'])

        index2 = MultiIndex.from_arrays(
            [['b', 'b', 'b', 'c', 'c', 'c'], [1, 2, 3, 1, 2, 3]],
            names=['first', 'second'])

        df1 = DataFrame(data=np.random.randn(6),
                        index=index1,
                        columns=['var X'])
        df2 = DataFrame(data=np.random.randn(6),
                        index=index2,
                        columns=['var Y'])

        # sorted on the first level
        df1 = df1.sort_index(level=0)
        df2 = df2.sort_index(level=0)

        joined = df1.join(df2, how='outer')
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names
        assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

        # same join, inputs sorted on the second level this time
        df1 = df1.sort_index(level=1)
        df2 = df2.sort_index(level=1)

        joined = df1.join(df2, how='outer').sort_index(level=0)
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names

        assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

    def test_join_inner_multiindex(self):
        key1 = [
            'bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', 'qux',
            'snap'
        ]
        key2 = [
            'two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', 'three',
            'one'
        ]

        data = np.random.randn(len(key1))
        data = DataFrame({'key1': key1, 'key2': key2, 'data': data})

        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                  [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        to_join = DataFrame(np.random.randn(10, 3),
                            index=index,
                            columns=['j_one', 'j_two', 'j_three'])

        joined = data.join(to_join, on=['key1', 'key2'], how='inner')
        expected = merge(data,
                         to_join.reset_index(),
                         left_on=['key1', 'key2'],
                         right_on=['first', 'second'],
                         how='inner',
                         sort=False)

        expected2 = merge(to_join,
                          data,
                          right_on=['key1', 'key2'],
                          left_index=True,
                          how='inner',
                          sort=False)
        assert_frame_equal(joined, expected2.reindex_like(joined))

        expected2 = merge(to_join,
                          data,
                          right_on=['key1', 'key2'],
                          left_index=True,
                          how='inner',
                          sort=False)

        expected = expected.drop(['first', 'second'], axis=1)
        expected.index = joined.index

        assert joined.index.is_monotonic
        assert_frame_equal(joined, expected)

        # _assert_same_contents(expected, expected2.loc[:, expected.columns])

    def test_join_hierarchical_mixed(self):
        """GH 2024: merging a frame with MultiIndex columns (from an agg)
        against one with flat columns keeps both column shapes; the mixed
        levels are expected to emit a UserWarning."""
        # GH 2024
        df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c'])
        new_df = df.groupby(['a']).agg({'b': [np.mean, np.sum]})
        other_df = DataFrame([(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd'])
        other_df.set_index('a', inplace=True)
        # GH 9455, 12219
        with tm.assert_produces_warning(UserWarning):
            result = merge(new_df, other_df, left_index=True, right_index=True)
        assert ('b', 'mean') in result
        assert 'b' in result

    def test_join_float64_float32(self):

        a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64)
        b = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32)
        joined = a.join(b)
        assert joined.dtypes['a'] == 'float64'
        assert joined.dtypes['b'] == 'float64'
        assert joined.dtypes['c'] == 'float32'

        a = np.random.randint(0, 5, 100).astype('int64')
        b = np.random.random(100).astype('float64')
        c = np.random.random(100).astype('float32')
        df = DataFrame({'a': a, 'b': b, 'c': c})
        xpdf = DataFrame({'a': a, 'b': b, 'c': c})
        s = DataFrame(np.random.random(5).astype('float32'), columns=['md'])
        rs = df.merge(s, left_on='a', right_index=True)
        assert rs.dtypes['a'] == 'int64'
        assert rs.dtypes['b'] == 'float64'
        assert rs.dtypes['c'] == 'float32'
        assert rs.dtypes['md'] == 'float32'

        xp = xpdf.merge(s, left_on='a', right_index=True)
        assert_frame_equal(rs, xp)

    def test_join_many_non_unique_index(self):
        """Joining a list of frames on a shared, non-unique MultiIndex must
        match the equivalent chain of column merges, for both outer and
        inner joins; GH 11519 covers the Series-with-duplicate-index case.
        """
        df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
        df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
        df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])

        result = idf1.join([idf2, idf3], how='outer')

        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer')
        expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer')

        result = result.reset_index()
        expected = expected[result.columns]
        # the outer merge path upcasts the keys; realign dtypes for compare
        expected['a'] = expected.a.astype('int64')
        expected['b'] = expected.b.astype('int64')
        assert_frame_equal(result, expected)

        # same comparison for an inner join with more duplication
        df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
        df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
        df3 = DataFrame({
            "a": [1, 1, 1],
            "b": [1, 1, 2],
            "e": [1000, 2000, 3000]
        })
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])
        result = idf1.join([idf2, idf3], how='inner')

        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner')
        expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner')

        result = result.reset_index()

        assert_frame_equal(result, expected.loc[:, result.columns])

        # GH 11519
        df = DataFrame({
            'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
            'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
            'C':
            np.random.randn(8),
            'D':
            np.random.randn(8)
        })
        s = Series(np.repeat(np.arange(8), 2),
                   index=np.repeat(np.arange(8), 2),
                   name='TEST')
        inner = df.join(s, how='inner')
        outer = df.join(s, how='outer')
        left = df.join(s, how='left')
        right = df.join(s, how='right')
        # indexes fully overlap, so all four join types agree
        assert_frame_equal(inner, outer)
        assert_frame_equal(inner, left)
        assert_frame_equal(inner, right)

    def test_join_sort(self):
        left = DataFrame({
            'key': ['foo', 'bar', 'baz', 'foo'],
            'value': [1, 2, 3, 4]
        })
        right = DataFrame({'value2': ['a', 'b', 'c']},
                          index=['bar', 'baz', 'foo'])

        # sort=True orders the result by the join key...
        result = left.join(right, on='key', sort=True)
        expected = DataFrame(
            {
                'key': ['bar', 'baz', 'foo', 'foo'],
                'value': [2, 3, 1, 4],
                'value2': ['a', 'b', 'c', 'c']
            },
            index=[1, 2, 0, 3])
        assert_frame_equal(result, expected)

        # ...while sort=False keeps the original row order (smoke test)
        result = left.join(right, on='key', sort=False)
        tm.assert_index_equal(result.index, pd.Index(lrange(4)))

    def test_join_mixed_non_unique_index(self):
        # GH 12814, unorderable types in py3 with a non-unique index
        df1 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 3, 'a'])
        df2 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 3, 3, 4])
        result = df1.join(df2)
        expected = DataFrame(
            {
                'a': [1, 2, 3, 3, 4],
                'b': [5, np.nan, 6, 7, np.nan]
            },
            index=[1, 2, 3, 3, 'a'])
        tm.assert_frame_equal(result, expected)

        df3 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 2, 'a'])
        df4 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 2, 3, 4])
        result = df3.join(df4)
        expected = DataFrame({
            'a': [1, 2, 3, 4],
            'b': [5, 6, 6, np.nan]
        },
                             index=[1, 2, 2, 'a'])
        tm.assert_frame_equal(result, expected)

    def test_join_non_unique_period_index(self):
        # GH #16871
        index = pd.period_range('2016-01-01', periods=16, freq='M')
        df = DataFrame([i for i in range(len(index))],
                       index=index,
                       columns=['pnum'])
        df2 = concat([df, df])
        result = df.join(df2, how='inner', rsuffix='_df2')
        expected = DataFrame(np.tile(
            np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2),
                             columns=['pnum', 'pnum_df2'],
                             index=df2.sort_index().index)
        tm.assert_frame_equal(result, expected)

    def test_mixed_type_join_with_suffix(self):
        # GH #916: rsuffix on a frame holding both numeric and object blocks
        df = DataFrame(np.random.randn(20, 6),
                       columns=['a', 'b', 'c', 'd', 'e', 'f'])
        df.insert(0, 'id', 0)
        df.insert(5, 'dt', 'foo')

        grouped = df.groupby('id')
        means = grouped.mean()
        counts = grouped.count()

        # it works!
        means.join(counts, rsuffix='_right')

    def test_join_many(self):
        df = DataFrame(np.random.randn(10, 6), columns=list('abcdef'))
        df_list = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]]

        joined = df_list[0].join(df_list[1:])
        tm.assert_frame_equal(joined, df)

        df_list = [
            df[['a', 'b']][:-2], df[['c', 'd']][2:], df[['e', 'f']][1:9]
        ]

        def _check_diff_index(df_list, result, exp_index):
            reindexed = [x.reindex(exp_index) for x in df_list]
            expected = reindexed[0].join(reindexed[1:])
            tm.assert_frame_equal(result, expected)

        # different join types
        joined = df_list[0].join(df_list[1:], how='outer')
        _check_diff_index(df_list, joined, df.index)

        joined = df_list[0].join(df_list[1:])
        _check_diff_index(df_list, joined, df_list[0].index)

        joined = df_list[0].join(df_list[1:], how='inner')
        _check_diff_index(df_list, joined, df.index[2:8])

        msg = "Joining multiple DataFrames only supported for joining on index"
        with pytest.raises(ValueError, match=msg):
            df_list[0].join(df_list[1:], on='a')

    def test_join_many_mixed(self):
        df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
        df['key'] = ['foo', 'bar'] * 4
        df1 = df.loc[:, ['A', 'B']]
        df2 = df.loc[:, ['C', 'D']]
        df3 = df.loc[:, ['key']]

        result = df1.join([df2, df3])
        assert_frame_equal(result, df)

    def test_join_dups(self):
        """Self-join of a frame with duplicate column labels, and GH 4975:
        chained merges whose suffixes end up producing duplicate column
        names in the final result."""

        # joining dups
        df = concat([
            DataFrame(np.random.randn(10, 4), columns=['A', 'A', 'B', 'B']),
            DataFrame(np.random.randint(0, 10, size=20).reshape(10, 2),
                      columns=['A', 'C'])
        ],
                    axis=1)

        expected = concat([df, df], axis=1)
        result = df.join(df, rsuffix='_2')
        # suffixed names differ from the plain concat; compare values only
        result.columns = expected.columns
        assert_frame_equal(result, expected)

        # GH 4975, invalid join on dups
        w = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        x = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        y = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        z = DataFrame(np.random.randn(4, 2), columns=["x", "y"])

        dta = x.merge(y, left_index=True,
                      right_index=True).merge(z,
                                              left_index=True,
                                              right_index=True,
                                              how="outer")
        dta = dta.merge(w, left_index=True, right_index=True)
        expected = concat([x, y, z, w], axis=1)
        # hand-assigned names, with intentional duplicates
        expected.columns = [
            'x_x', 'y_x', 'x_y', 'y_y', 'x_x', 'y_x', 'x_y', 'y_y'
        ]
        assert_frame_equal(dta, expected)

    def test_join_multi_to_multi(self, join_type):
        """GH 20475: join a 3-level-indexed frame onto a 2-level-indexed one
        via ``on``; joining on a subset of the right index's levels must
        raise."""
        leftindex = MultiIndex.from_product(
            [list('abc'), list('xy'), [1, 2]], names=['abc', 'xy', 'num'])
        left = DataFrame({'v1': range(12)}, index=leftindex)

        rightindex = MultiIndex.from_product(
            [list('abc'), list('xy')], names=['abc', 'xy'])
        right = DataFrame({'v2': [100 * i for i in range(1, 7)]},
                          index=rightindex)

        result = left.join(right, on=['abc', 'xy'], how=join_type)
        # equivalent merge on the reset columns, re-indexed afterwards
        expected = (left.reset_index().merge(right.reset_index(),
                                             on=['abc', 'xy'],
                                             how=join_type).set_index(
                                                 ['abc', 'xy', 'num']))
        assert_frame_equal(expected, result)

        msg = (r'len\(left_on\) must equal the number of levels in the index'
               ' of "right"')
        with pytest.raises(ValueError, match=msg):
            left.join(right, on='xy', how=join_type)

        with pytest.raises(ValueError, match=msg):
            right.join(left, on=['abc', 'xy'], how=join_type)

    def test_join_on_tz_aware_datetimeindex(self):
        # GH 23931
        df1 = pd.DataFrame({
            'date':
            pd.date_range(start='2018-01-01', periods=5, tz='America/Chicago'),
            'vals':
            list('abcde')
        })

        df2 = pd.DataFrame({
            'date':
            pd.date_range(start='2018-01-03', periods=5, tz='America/Chicago'),
            'vals_2':
            list('tuvwx')
        })
        result = df1.join(df2.set_index('date'), on='date')
        expected = df1.copy()
        expected['vals_2'] = pd.Series([np.nan] * len(expected), dtype=object)
        assert_frame_equal(result, expected)
    def test_set_index_cast_datetimeindex(self):
        """set_index with datetimes; tz-aware values must round-trip as
        objects without losing their timezone (GH 6032, 6785, 3950).

        Fix: ``DatetimeIndex.tz`` is a read-only property, so the timezone
        is stripped with ``tz_localize(None)`` instead of assigning to the
        attribute.
        """
        df = DataFrame({
            'A': [datetime(2000, 1, 1) + timedelta(i) for i in range(1000)],
            'B':
            np.random.randn(1000)
        })

        idf = df.set_index('A')
        assert isinstance(idf.index, pd.DatetimeIndex)

        # don't cast a DatetimeIndex WITH a tz, leave as object
        # GH 6032
        i = (pd.DatetimeIndex(
            to_datetime(['2013-1-1 13:00', '2013-1-2 14:00'],
                        errors="raise")).tz_localize('US/Pacific'))
        df = DataFrame(np.random.randn(2, 1), columns=['A'])

        expected = Series(
            np.array([
                pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),
                pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')
            ],
                     dtype="object"))

        # convert index to series
        result = Series(i)
        assert_series_equal(result, expected)

        # assign to frame
        df['B'] = i
        result = df['B']
        assert_series_equal(result, expected, check_names=False)
        assert result.name == 'B'

        # keep the timezone
        result = i.to_series(keep_tz=True)
        assert_series_equal(result.reset_index(drop=True), expected)

        # convert to utc
        df['C'] = i.to_series().reset_index(drop=True)
        result = df['C']
        # tz is read-only on DatetimeIndex; strip it via tz_localize(None)
        comp = pd.DatetimeIndex(expected.values)
        comp = comp.tz_localize(None)
        tm.assert_numpy_array_equal(result.values, comp.values)

        # list of datetimes with a tz
        df['D'] = i.to_pydatetime()
        result = df['D']
        assert_series_equal(result, expected, check_names=False)
        assert result.name == 'D'

        # GH 6785
        # set the index manually
        import pytz
        df = DataFrame([{
            'ts': datetime(2014, 4, 1, tzinfo=pytz.utc),
            'foo': 1
        }])
        expected = df.set_index('ts')
        df.index = df['ts']
        df.pop('ts')
        assert_frame_equal(df, expected)

        # GH 3950
        # reset_index with single level
        for tz in ['UTC', 'Asia/Tokyo', 'US/Eastern']:
            idx = pd.date_range('1/1/2011',
                                periods=5,
                                freq='D',
                                tz=tz,
                                name='idx')
            df = pd.DataFrame({
                'a': range(5),
                'b': ['A', 'B', 'C', 'D', 'E']
            },
                              index=idx)

            expected = pd.DataFrame(
                {
                    'idx': [
                        datetime(2011, 1, 1),
                        datetime(2011, 1, 2),
                        datetime(2011, 1, 3),
                        datetime(2011, 1, 4),
                        datetime(2011, 1, 5)
                    ],
                    'a':
                    range(5),
                    'b': ['A', 'B', 'C', 'D', 'E']
                },
                columns=['idx', 'a', 'b'])
            expected['idx'] = expected['idx'].apply(
                lambda d: pd.Timestamp(d, tz=tz))
            assert_frame_equal(df.reset_index(), expected)
Example #13
0
def melt(
    frame: DataFrame,
    id_vars=None,
    value_vars=None,
    var_name=None,
    value_name="value",
    col_level=None,
    ignore_index: bool = True,
) -> DataFrame:
    """
    Unpivot ``frame`` from wide format to long format.

    Keeps ``id_vars`` as identifier columns and melts the remaining
    (or the explicitly listed ``value_vars``) columns into two columns:
    one holding the former column labels (``var_name``) and one holding
    the values (``value_name``).

    Parameters
    ----------
    frame : DataFrame
        Input frame to unpivot.
    id_vars : scalar, list of scalars, or list of tuples, optional
        Column(s) to use as identifier variables.  Must be a list of
        tuples when ``frame.columns`` is a MultiIndex.
    value_vars : scalar, list of scalars, or list of tuples, optional
        Column(s) to unpivot.  Defaults to all columns not in ``id_vars``.
    var_name : scalar or list, optional
        Name(s) for the "variable" column(s).  Defaults to the columns'
        ``name``/``names`` or ``"variable"``.
    value_name : scalar, default "value"
        Name for the "value" column.
    col_level : int or str, optional
        For MultiIndex columns, the level used for melting.
    ignore_index : bool, default True
        If False, the original index is tiled to match the result length.

    Returns
    -------
    DataFrame
        The unpivoted frame, built via ``frame._constructor`` so
        subclasses round-trip.

    Raises
    ------
    ValueError
        If ``id_vars``/``value_vars`` are not lists of tuples while the
        columns are a MultiIndex.
    KeyError
        If any of ``id_vars``/``value_vars`` are missing from ``frame``.
    """
    # If multiindex, gather names of columns on all level for checking presence
    # of `id_vars` and `value_vars`
    if isinstance(frame.columns, MultiIndex):
        cols = [x for c in frame.columns for x in c]
    else:
        cols = list(frame.columns)

    # A pre-existing column named `value_name` would collide with the output
    # column; currently a warning, slated to become an error.
    if value_name in frame.columns:
        warnings.warn(
            "This dataframe has a column name that matches the 'value_name' column "
            "name of the resulting Dataframe. "
            "In the future this will raise an error, please set the 'value_name' "
            "parameter of DataFrame.melt to a unique name.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    if id_vars is not None:
        if not is_list_like(id_vars):
            id_vars = [id_vars]
        elif isinstance(frame.columns, MultiIndex) and not isinstance(id_vars, list):
            raise ValueError(
                "id_vars must be a list of tuples when columns are a MultiIndex"
            )
        else:
            # Check that `id_vars` are in frame
            id_vars = list(id_vars)
            missing = Index(com.flatten(id_vars)).difference(cols)
            if not missing.empty:
                raise KeyError(
                    "The following 'id_vars' are not present "
                    f"in the DataFrame: {list(missing)}"
                )
    else:
        id_vars = []

    if value_vars is not None:
        if not is_list_like(value_vars):
            value_vars = [value_vars]
        elif isinstance(frame.columns, MultiIndex) and not isinstance(value_vars, list):
            raise ValueError(
                "value_vars must be a list of tuples when columns are a MultiIndex"
            )
        else:
            value_vars = list(value_vars)
            # Check that `value_vars` are in frame
            missing = Index(com.flatten(value_vars)).difference(cols)
            if not missing.empty:
                raise KeyError(
                    "The following 'value_vars' are not present in "
                    f"the DataFrame: {list(missing)}"
                )
        # Restrict `frame` to the requested columns, preserving id_vars first.
        if col_level is not None:
            idx = frame.columns.get_level_values(col_level).get_indexer(
                id_vars + value_vars
            )
        else:
            idx = algos.unique(frame.columns.get_indexer_for(id_vars + value_vars))
        frame = frame.iloc[:, idx]
    else:
        # Copy so the pops below never mutate the caller's frame.
        frame = frame.copy()

    if col_level is not None:  # allow list or other?
        # frame is a copy
        frame.columns = frame.columns.get_level_values(col_level)

    # Derive default variable-column name(s) from the column index metadata.
    if var_name is None:
        if isinstance(frame.columns, MultiIndex):
            if len(frame.columns.names) == len(set(frame.columns.names)):
                var_name = frame.columns.names
            else:
                # duplicate level names: fall back to positional names
                var_name = [f"variable_{i}" for i in range(len(frame.columns.names))]
        else:
            var_name = [
                frame.columns.name if frame.columns.name is not None else "variable"
            ]
    if isinstance(var_name, str):
        var_name = [var_name]

    # N rows, K melted columns (total columns minus the id columns).
    N, K = frame.shape
    K -= len(id_vars)

    mdata = {}
    for col in id_vars:
        # pop mutates `frame`, leaving only the value columns behind
        id_data = frame.pop(col)
        if is_extension_array_dtype(id_data):
            # np.tile would densify EAs; concat keeps the extension dtype
            id_data = concat([id_data] * K, ignore_index=True)
        else:
            # error: Incompatible types in assignment (expression has type
            # "ndarray[Any, dtype[Any]]", variable has type "Series")
            id_data = np.tile(id_data._values, K)  # type: ignore[assignment]
        mdata[col] = id_data

    mcolumns = id_vars + var_name + [value_name]

    # Column-major ravel stacks the value columns one after another,
    # matching the tiled id columns above.
    # error: Incompatible types in assignment (expression has type "ndarray",
    # target has type "Series")
    mdata[value_name] = frame._values.ravel("F")  # type: ignore[assignment]
    for i, col in enumerate(var_name):
        # asanyarray will keep the columns as an Index

        # error: Incompatible types in assignment (expression has type "ndarray", target
        # has type "Series")
        mdata[col] = np.asanyarray(  # type: ignore[assignment]
            frame.columns._get_level_values(i)
        ).repeat(N)

    result = frame._constructor(mdata, columns=mcolumns)

    if not ignore_index:
        # Tile the original index so each repeated block keeps its labels.
        result.index = tile_compat(frame.index, K)

    return result
class TestJoin(object):
    """
    Tests for DataFrame/Panel join and merge behavior.

    ``setup_method`` builds two overlapping key'd frames (``df``/``df2``)
    plus a join-on-column fixture (``target``/``source``); individual
    tests exercise the cython join kernels, the high-level ``merge``
    wrappers, and ``DataFrame.join`` edge cases.  Relies on module-level
    helpers (``get_test_data``, ``N``, ``NGROUPS``, ``a_``, ``libjoin``,
    ``_check_join``, ``_join_by_hand``) defined elsewhere in the file.
    """

    def setup_method(self, method):
        # aggregate multiple columns
        self.df = DataFrame({
            'key1': get_test_data(),
            'key2': get_test_data(),
            'data1': np.random.randn(N),
            'data2': np.random.randn(N)
        })

        # exclude a couple keys for fun
        self.df = self.df[self.df['key2'] > 1]

        self.df2 = DataFrame({
            'key1':
            get_test_data(n=N // 5),
            'key2':
            get_test_data(ngroups=NGROUPS // 2, n=N // 5),
            'value':
            np.random.randn(N // 5)
        })

        index, data = tm.getMixedTypeDict()
        self.target = DataFrame(data, index=index)

        # Join on string value
        self.source = DataFrame({
            'MergedA': data['A'],
            'MergedD': data['D']
        },
                                index=data['C'])

    def test_cython_left_outer_join(self):
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
        max_group = 5

        ls, rs = libjoin.left_outer_join(left, right, max_group)

        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        # expected indexers into the sorted arrays; -1 marks a missing match
        exp_li = a_(
            [0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 10])
        exp_ri = a_(
            [0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5, -1, -1])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_cython_right_outer_join(self):
        # right outer join is left_outer_join with the operands swapped
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
        max_group = 5

        rs, ls = libjoin.left_outer_join(right, left, max_group)

        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        #            0        1        1        1
        exp_li = a_([
            0,
            1,
            2,
            3,
            4,
            5,
            3,
            4,
            5,
            3,
            4,
            5,
            #            2        2        4
            6,
            7,
            8,
            6,
            7,
            8,
            -1
        ])
        exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_cython_inner_join(self):
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64)
        max_group = 5

        ls, rs = libjoin.inner_join(left, right, max_group)

        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8])
        exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_left_outer_join(self):
        joined_key2 = merge(self.df, self.df2, on='key2')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='left')

        joined_both = merge(self.df, self.df2)
        _check_join(self.df,
                    self.df2,
                    joined_both, ['key1', 'key2'],
                    how='left')

    def test_right_outer_join(self):
        joined_key2 = merge(self.df, self.df2, on='key2', how='right')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='right')

        joined_both = merge(self.df, self.df2, how='right')
        _check_join(self.df,
                    self.df2,
                    joined_both, ['key1', 'key2'],
                    how='right')

    def test_full_outer_join(self):
        joined_key2 = merge(self.df, self.df2, on='key2', how='outer')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='outer')

        joined_both = merge(self.df, self.df2, how='outer')
        _check_join(self.df,
                    self.df2,
                    joined_both, ['key1', 'key2'],
                    how='outer')

    def test_inner_join(self):
        joined_key2 = merge(self.df, self.df2, on='key2', how='inner')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='inner')

        joined_both = merge(self.df, self.df2, how='inner')
        _check_join(self.df,
                    self.df2,
                    joined_both, ['key1', 'key2'],
                    how='inner')

    def test_handle_overlap(self):
        # overlapping non-key columns get the provided suffixes
        joined = merge(self.df, self.df2, on='key2', suffixes=['.foo', '.bar'])

        assert 'key1.foo' in joined
        assert 'key1.bar' in joined

    def test_handle_overlap_arbitrary_key(self):
        joined = merge(self.df,
                       self.df2,
                       left_on='key2',
                       right_on='key1',
                       suffixes=['.foo', '.bar'])
        assert 'key1.foo' in joined
        assert 'key2.bar' in joined

    def test_join_on(self):
        target = self.target
        source = self.source

        merged = target.join(source, on='C')
        tm.assert_series_equal(merged['MergedA'],
                               target['A'],
                               check_names=False)
        tm.assert_series_equal(merged['MergedD'],
                               target['D'],
                               check_names=False)

        # join with duplicates (fix regression from DataFrame/Matrix merge)
        df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])
        joined = df.join(df2, on='key')
        expected = DataFrame({
            'key': ['a', 'a', 'b', 'b', 'c'],
            'value': [0, 0, 1, 1, 2]
        })
        assert_frame_equal(joined, expected)

        # Test when some are missing
        df_a = DataFrame([[1], [2], [3]],
                         index=['a', 'b', 'c'],
                         columns=['one'])
        df_b = DataFrame([['foo'], ['bar']], index=[1, 2], columns=['two'])
        df_c = DataFrame([[1], [2]], index=[1, 2], columns=['three'])
        joined = df_a.join(df_b, on='one')
        joined = joined.join(df_c, on='one')
        assert np.isnan(joined['two']['c'])
        assert np.isnan(joined['three']['c'])

        # merge column not present
        pytest.raises(KeyError, target.join, source, on='E')

        # overlap
        source_copy = source.copy()
        source_copy['A'] = 0
        pytest.raises(ValueError, target.join, source_copy, on='A')

    def test_join_on_fails_with_different_right_index(self):
        with pytest.raises(ValueError):
            df = DataFrame({
                'a': np.random.choice(['m', 'f'], size=3),
                'b': np.random.randn(3)
            })
            df2 = DataFrame(
                {
                    'a': np.random.choice(['m', 'f'], size=10),
                    'b': np.random.randn(10)
                },
                index=tm.makeCustomIndex(10, 2))
            merge(df, df2, left_on='a', right_index=True)

    def test_join_on_fails_with_different_left_index(self):
        with pytest.raises(ValueError):
            df = DataFrame(
                {
                    'a': np.random.choice(['m', 'f'], size=3),
                    'b': np.random.randn(3)
                },
                index=tm.makeCustomIndex(10, 2))
            df2 = DataFrame({
                'a': np.random.choice(['m', 'f'], size=10),
                'b': np.random.randn(10)
            })
            merge(df, df2, right_on='b', left_index=True)

    def test_join_on_fails_with_different_column_counts(self):
        with pytest.raises(ValueError):
            df = DataFrame({
                'a': np.random.choice(['m', 'f'], size=3),
                'b': np.random.randn(3)
            })
            df2 = DataFrame(
                {
                    'a': np.random.choice(['m', 'f'], size=10),
                    'b': np.random.randn(10)
                },
                index=tm.makeCustomIndex(10, 2))
            merge(df, df2, right_on='a', left_on=['a', 'b'])

    def test_join_on_fails_with_wrong_object_type(self):
        # GH12081
        wrongly_typed = [Series([0, 1]), 2, 'str', None, np.array([0, 1])]
        df = DataFrame({'a': [1, 1]})

        for obj in wrongly_typed:
            with tm.assert_raises_regex(ValueError, str(type(obj))):
                merge(obj, df, left_on='a', right_on='a')
            with tm.assert_raises_regex(ValueError, str(type(obj))):
                merge(df, obj, left_on='a', right_on='a')

    def test_join_on_pass_vector(self):
        # `on` may be a Series instead of a column label
        expected = self.target.join(self.source, on='C')
        del expected['C']

        join_col = self.target.pop('C')
        result = self.target.join(self.source, on=join_col)
        assert_frame_equal(result, expected)

    def test_join_with_len0(self):
        # nothing to merge
        merged = self.target.join(self.source.reindex([]), on='C')
        for col in self.source:
            assert col in merged
            assert merged[col].isna().all()

        merged2 = self.target.join(self.source.reindex([]),
                                   on='C',
                                   how='inner')
        tm.assert_index_equal(merged2.columns, merged.columns)
        assert len(merged2) == 0

    def test_join_on_inner(self):
        df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1]}, index=['a', 'b'])

        joined = df.join(df2, on='key', how='inner')

        expected = df.join(df2, on='key')
        expected = expected[expected['value'].notna()]
        tm.assert_series_equal(joined['key'],
                               expected['key'],
                               check_dtype=False)
        tm.assert_series_equal(joined['value'],
                               expected['value'],
                               check_dtype=False)
        tm.assert_index_equal(joined.index, expected.index)

    def test_join_on_singlekey_list(self):
        df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])

        # corner cases
        joined = df.join(df2, on=['key'])
        expected = df.join(df2, on='key')

        assert_frame_equal(joined, expected)

    def test_join_on_series(self):
        result = self.target.join(self.source['MergedA'], on='C')
        expected = self.target.join(self.source[['MergedA']], on='C')
        assert_frame_equal(result, expected)

    def test_join_on_series_buglet(self):
        # GH #638
        df = DataFrame({'a': [1, 1]})
        ds = Series([2], index=[1], name='b')
        result = df.join(ds, on='a')
        expected = DataFrame({'a': [1, 1], 'b': [2, 2]}, index=df.index)
        tm.assert_frame_equal(result, expected)

    def test_join_index_mixed(self, join_type):
        # no overlapping blocks
        df1 = DataFrame(index=np.arange(10))
        df1['bool'] = True
        df1['string'] = 'foo'

        df2 = DataFrame(index=np.arange(5, 15))
        df2['int'] = 1
        df2['float'] = 1.

        joined = df1.join(df2, how=join_type)
        expected = _join_by_hand(df1, df2, how=join_type)
        assert_frame_equal(joined, expected)

        joined = df2.join(df1, how=join_type)
        expected = _join_by_hand(df2, df1, how=join_type)
        assert_frame_equal(joined, expected)

    def test_join_index_mixed_overlap(self):
        df1 = DataFrame({
            'A': 1.,
            'B': 2,
            'C': 'foo',
            'D': True
        },
                        index=np.arange(10),
                        columns=['A', 'B', 'C', 'D'])
        assert df1['B'].dtype == np.int64
        assert df1['D'].dtype == np.bool_

        df2 = DataFrame({
            'A': 1.,
            'B': 2,
            'C': 'foo',
            'D': True
        },
                        index=np.arange(0, 10, 2),
                        columns=['A', 'B', 'C', 'D'])

        # overlap
        joined = df1.join(df2, lsuffix='_one', rsuffix='_two')
        expected_columns = [
            'A_one', 'B_one', 'C_one', 'D_one', 'A_two', 'B_two', 'C_two',
            'D_two'
        ]
        df1.columns = expected_columns[:4]
        df2.columns = expected_columns[4:]
        expected = _join_by_hand(df1, df2)
        assert_frame_equal(joined, expected)

    def test_join_empty_bug(self):
        # generated an exception in 0.4.3
        x = DataFrame()
        x.join(DataFrame([3], index=[0], columns=['A']), how='outer')

    def test_join_unconsolidated(self):
        # GH #331
        a = DataFrame(randn(30, 2), columns=['a', 'b'])
        c = Series(randn(30))
        a['c'] = c
        d = DataFrame(randn(30, 1), columns=['q'])

        # it works!
        a.join(d)
        d.join(a)

    def test_join_multiindex(self):
        index1 = MultiIndex.from_arrays(
            [['a', 'a', 'a', 'b', 'b', 'b'], [1, 2, 3, 1, 2, 3]],
            names=['first', 'second'])

        index2 = MultiIndex.from_arrays(
            [['b', 'b', 'b', 'c', 'c', 'c'], [1, 2, 3, 1, 2, 3]],
            names=['first', 'second'])

        df1 = DataFrame(data=np.random.randn(6),
                        index=index1,
                        columns=['var X'])
        df2 = DataFrame(data=np.random.randn(6),
                        index=index2,
                        columns=['var Y'])

        df1 = df1.sort_index(level=0)
        df2 = df2.sort_index(level=0)

        joined = df1.join(df2, how='outer')
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names
        assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

        # repeat with the frames sorted by the second level
        df1 = df1.sort_index(level=1)
        df2 = df2.sort_index(level=1)

        joined = df1.join(df2, how='outer').sort_index(level=0)
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names

        assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

    def test_join_inner_multiindex(self):
        key1 = [
            'bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', 'qux',
            'snap'
        ]
        key2 = [
            'two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', 'three',
            'one'
        ]

        data = np.random.randn(len(key1))
        data = DataFrame({'key1': key1, 'key2': key2, 'data': data})

        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        to_join = DataFrame(np.random.randn(10, 3),
                            index=index,
                            columns=['j_one', 'j_two', 'j_three'])

        joined = data.join(to_join, on=['key1', 'key2'], how='inner')
        expected = merge(data,
                         to_join.reset_index(),
                         left_on=['key1', 'key2'],
                         right_on=['first', 'second'],
                         how='inner',
                         sort=False)

        expected2 = merge(to_join,
                          data,
                          right_on=['key1', 'key2'],
                          left_index=True,
                          how='inner',
                          sort=False)
        assert_frame_equal(joined, expected2.reindex_like(joined))

        expected2 = merge(to_join,
                          data,
                          right_on=['key1', 'key2'],
                          left_index=True,
                          how='inner',
                          sort=False)

        expected = expected.drop(['first', 'second'], axis=1)
        expected.index = joined.index

        assert joined.index.is_monotonic
        assert_frame_equal(joined, expected)

        # _assert_same_contents(expected, expected2.loc[:, expected.columns])

    def test_join_hierarchical_mixed(self):
        # GH 2024
        df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c'])
        new_df = df.groupby(['a']).agg({'b': [np.mean, np.sum]})
        other_df = DataFrame([(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd'])
        other_df.set_index('a', inplace=True)
        # GH 9455, 12219
        with tm.assert_produces_warning(UserWarning):
            result = merge(new_df, other_df, left_index=True, right_index=True)
        assert ('b', 'mean') in result
        assert 'b' in result

    def test_join_float64_float32(self):
        # dtypes must survive the join/merge untouched

        a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64)
        b = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32)
        joined = a.join(b)
        assert joined.dtypes['a'] == 'float64'
        assert joined.dtypes['b'] == 'float64'
        assert joined.dtypes['c'] == 'float32'

        a = np.random.randint(0, 5, 100).astype('int64')
        b = np.random.random(100).astype('float64')
        c = np.random.random(100).astype('float32')
        df = DataFrame({'a': a, 'b': b, 'c': c})
        xpdf = DataFrame({'a': a, 'b': b, 'c': c})
        s = DataFrame(np.random.random(5).astype('float32'), columns=['md'])
        rs = df.merge(s, left_on='a', right_index=True)
        assert rs.dtypes['a'] == 'int64'
        assert rs.dtypes['b'] == 'float64'
        assert rs.dtypes['c'] == 'float32'
        assert rs.dtypes['md'] == 'float32'

        xp = xpdf.merge(s, left_on='a', right_index=True)
        assert_frame_equal(rs, xp)

    def test_join_many_non_unique_index(self):
        df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
        df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
        df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])

        result = idf1.join([idf2, idf3], how='outer')

        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer')
        expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer')

        result = result.reset_index()
        expected = expected[result.columns]
        expected['a'] = expected.a.astype('int64')
        expected['b'] = expected.b.astype('int64')
        assert_frame_equal(result, expected)

        df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
        df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
        df3 = DataFrame({
            "a": [1, 1, 1],
            "b": [1, 1, 2],
            "e": [1000, 2000, 3000]
        })
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])
        result = idf1.join([idf2, idf3], how='inner')

        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner')
        expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner')

        result = result.reset_index()

        assert_frame_equal(result, expected.loc[:, result.columns])

        # GH 11519
        df = DataFrame({
            'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
            'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
            'C':
            np.random.randn(8),
            'D':
            np.random.randn(8)
        })
        s = Series(np.repeat(np.arange(8), 2),
                   index=np.repeat(np.arange(8), 2),
                   name='TEST')
        inner = df.join(s, how='inner')
        outer = df.join(s, how='outer')
        left = df.join(s, how='left')
        right = df.join(s, how='right')
        assert_frame_equal(inner, outer)
        assert_frame_equal(inner, left)
        assert_frame_equal(inner, right)

    def test_join_sort(self):
        left = DataFrame({
            'key': ['foo', 'bar', 'baz', 'foo'],
            'value': [1, 2, 3, 4]
        })
        right = DataFrame({'value2': ['a', 'b', 'c']},
                          index=['bar', 'baz', 'foo'])

        joined = left.join(right, on='key', sort=True)
        expected = DataFrame(
            {
                'key': ['bar', 'baz', 'foo', 'foo'],
                'value': [2, 3, 1, 4],
                'value2': ['a', 'b', 'c', 'c']
            },
            index=[1, 2, 0, 3])
        assert_frame_equal(joined, expected)

        # smoke test
        joined = left.join(right, on='key', sort=False)
        tm.assert_index_equal(joined.index, pd.Index(lrange(4)))

    def test_join_mixed_non_unique_index(self):
        # GH 12814, unorderable types in py3 with a non-unique index
        df1 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 3, 'a'])
        df2 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 3, 3, 4])
        result = df1.join(df2)
        expected = DataFrame(
            {
                'a': [1, 2, 3, 3, 4],
                'b': [5, np.nan, 6, 7, np.nan]
            },
            index=[1, 2, 3, 3, 'a'])
        tm.assert_frame_equal(result, expected)

        df3 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 2, 'a'])
        df4 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 2, 3, 4])
        result = df3.join(df4)
        expected = DataFrame({
            'a': [1, 2, 3, 4],
            'b': [5, 6, 6, np.nan]
        },
                             index=[1, 2, 2, 'a'])
        tm.assert_frame_equal(result, expected)

    def test_join_non_unique_period_index(self):
        # GH #16871
        index = pd.period_range('2016-01-01', periods=16, freq='M')
        df = DataFrame([i for i in range(len(index))],
                       index=index,
                       columns=['pnum'])
        df2 = concat([df, df])
        result = df.join(df2, how='inner', rsuffix='_df2')
        expected = DataFrame(np.tile(
            np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2),
                             columns=['pnum', 'pnum_df2'],
                             index=df2.sort_index().index)
        tm.assert_frame_equal(result, expected)

    def test_mixed_type_join_with_suffix(self):
        # GH #916
        df = DataFrame(np.random.randn(20, 6),
                       columns=['a', 'b', 'c', 'd', 'e', 'f'])
        df.insert(0, 'id', 0)
        df.insert(5, 'dt', 'foo')

        grouped = df.groupby('id')
        mn = grouped.mean()
        cn = grouped.count()

        # it works!
        mn.join(cn, rsuffix='_right')

    def test_join_many(self):
        df = DataFrame(np.random.randn(10, 6), columns=list('abcdef'))
        df_list = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]]

        joined = df_list[0].join(df_list[1:])
        tm.assert_frame_equal(joined, df)

        df_list = [
            df[['a', 'b']][:-2], df[['c', 'd']][2:], df[['e', 'f']][1:9]
        ]

        def _check_diff_index(df_list, result, exp_index):
            # expected result is built by reindexing every piece by hand
            reindexed = [x.reindex(exp_index) for x in df_list]
            expected = reindexed[0].join(reindexed[1:])
            tm.assert_frame_equal(result, expected)

        # different join types
        joined = df_list[0].join(df_list[1:], how='outer')
        _check_diff_index(df_list, joined, df.index)

        joined = df_list[0].join(df_list[1:])
        _check_diff_index(df_list, joined, df_list[0].index)

        joined = df_list[0].join(df_list[1:], how='inner')
        _check_diff_index(df_list, joined, df.index[2:8])

        pytest.raises(ValueError, df_list[0].join, df_list[1:], on='a')

    def test_join_many_mixed(self):
        df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
        df['key'] = ['foo', 'bar'] * 4
        df1 = df.loc[:, ['A', 'B']]
        df2 = df.loc[:, ['C', 'D']]
        df3 = df.loc[:, ['key']]

        result = df1.join([df2, df3])
        assert_frame_equal(result, df)

    def test_join_dups(self):

        # joining dups
        df = concat([
            DataFrame(np.random.randn(10, 4), columns=['A', 'A', 'B', 'B']),
            DataFrame(np.random.randint(0, 10, size=20).reshape(10, 2),
                      columns=['A', 'C'])
        ],
                    axis=1)

        expected = concat([df, df], axis=1)
        result = df.join(df, rsuffix='_2')
        result.columns = expected.columns
        assert_frame_equal(result, expected)

        # GH 4975, invalid join on dups
        w = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        x = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        y = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        z = DataFrame(np.random.randn(4, 2), columns=["x", "y"])

        dta = x.merge(y, left_index=True,
                      right_index=True).merge(z,
                                              left_index=True,
                                              right_index=True,
                                              how="outer")
        dta = dta.merge(w, left_index=True, right_index=True)
        expected = concat([x, y, z, w], axis=1)
        expected.columns = [
            'x_x', 'y_x', 'x_y', 'y_y', 'x_x', 'y_x', 'x_y', 'y_y'
        ]
        assert_frame_equal(dta, expected)

    def test_panel_join(self):
        # Panel is deprecated, hence the warning suppression
        with catch_warnings(record=True):
            panel = tm.makePanel()
            tm.add_nans(panel)

            p1 = panel.iloc[:2, :10, :3]
            p2 = panel.iloc[2:, 5:, 2:]

            # left join
            result = p1.join(p2)
            expected = p1.copy()
            expected['ItemC'] = p2['ItemC']
            tm.assert_panel_equal(result, expected)

            # right join
            result = p1.join(p2, how='right')
            expected = p2.copy()
            expected['ItemA'] = p1['ItemA']
            expected['ItemB'] = p1['ItemB']
            expected = expected.reindex(items=['ItemA', 'ItemB', 'ItemC'])
            tm.assert_panel_equal(result, expected)

            # inner join
            result = p1.join(p2, how='inner')
            expected = panel.iloc[:, 5:10, 2:3]
            tm.assert_panel_equal(result, expected)

            # outer join
            result = p1.join(p2, how='outer')
            expected = p1.reindex(major=panel.major_axis,
                                  minor=panel.minor_axis)
            expected = expected.join(
                p2.reindex(major=panel.major_axis, minor=panel.minor_axis))
            tm.assert_panel_equal(result, expected)

    def test_panel_join_overlap(self):
        with catch_warnings(record=True):
            panel = tm.makePanel()
            tm.add_nans(panel)

            p1 = panel.loc[['ItemA', 'ItemB', 'ItemC']]
            p2 = panel.loc[['ItemB', 'ItemC']]

            # Expected index is
            #
            # ItemA, ItemB_p1, ItemC_p1, ItemB_p2, ItemC_p2
            joined = p1.join(p2, lsuffix='_p1', rsuffix='_p2')
            p1_suf = p1.loc[['ItemB', 'ItemC']].add_suffix('_p1')
            p2_suf = p2.loc[['ItemB', 'ItemC']].add_suffix('_p2')
            no_overlap = panel.loc[['ItemA']]
            expected = no_overlap.join(p1_suf.join(p2_suf))
            tm.assert_panel_equal(joined, expected)

    def test_panel_join_many(self):
        with catch_warnings(record=True):
            # temporarily bump the fixture size, then restore it
            tm.K = 10
            panel = tm.makePanel()
            tm.K = 4

            panels = [panel.iloc[:2], panel.iloc[2:6], panel.iloc[6:]]

            joined = panels[0].join(panels[1:])
            tm.assert_panel_equal(joined, panel)

            panels = [
                panel.iloc[:2, :-5], panel.iloc[2:6, 2:], panel.iloc[6:, 5:-7]
            ]

            data_dict = {}
            for p in panels:
                data_dict.update(p.iteritems())

            joined = panels[0].join(panels[1:], how='inner')
            expected = pd.Panel.from_dict(data_dict, intersect=True)
            tm.assert_panel_equal(joined, expected)

            joined = panels[0].join(panels[1:], how='outer')
            expected = pd.Panel.from_dict(data_dict, intersect=False)
            tm.assert_panel_equal(joined, expected)

            # edge cases
            pytest.raises(ValueError,
                          panels[0].join,
                          panels[1:],
                          how='outer',
                          lsuffix='foo',
                          rsuffix='bar')
            pytest.raises(ValueError, panels[0].join, panels[1:], how='right')
Example #15
0
def __pop_safe(dataframe: DataFrame, column_name: str):
    """Remove and return *column_name* from *dataframe*.

    Returns the popped column as a Series, or None (after printing a
    warning) when the column does not exist.
    """
    try:
        return dataframe.pop(column_name)
    except KeyError:
        # Only a missing column is expected here; a bare except would
        # silently swallow unrelated errors.
        print("DataFrame does not contain such a column: {}".format(column_name))
Example #16
0
def _metadata_to_categories(metadata_df: pd.DataFrame) -> list:
    """
    Build per-sample category tuples in the format that Clustergrammer expects:
        "([Category 1]: [Value 1], [Category 2]: [Value 2], ...)"

    Columns are filtered by cardinality (dropping constant, near-unique, or
    overly-granular columns unless whitelisted), renamed to human-readable
    category labels, and reduced to a fixed set of categories of interest.
    Returns a list of tuples, one per row, each starting with "CIMAC Id: <idx>".
    """
    metadata_df = metadata_df.copy()  # so don't modify original

    CLINICAL_FIELD_PREFIX = "arbitrary_trial_specific_clinical_annotations."
    columns = []
    for c in metadata_df.columns:
        # go through and check cardinality = # unique
        # also rename the columns to pretty things
        cardinality = len(metadata_df[c].unique())
        if (cardinality > CLUSTERGRAMMER_MAX_CATEGORY_CARDINALITY
                or cardinality <= 1 or cardinality == metadata_df.shape[0]):
            # only want if not all the same, not too many, and not each unique to sample

            if c not in [
                    "cimac_participant_id",
                    "cohort_name",
                    "collection_event_name",
            ]:
                # we want to keep the above no matter what
                metadata_df.pop(c)
                continue

        if "(1=Yes,0=No)" in c:
            # these are boolean! let's treat them that way
            metadata_df[c] = metadata_df[c].astype(bool)

        if c.startswith(CLINICAL_FIELD_PREFIX):
            # for 10021 participants.csv:
            ## remove the prefix
            ## remove any parentheses

            cat = c[len(CLINICAL_FIELD_PREFIX):]
            if "(" in cat and ")" in cat and cat.index(")") > cat.index("("):
                # keep text before the first "(" and after the last ")"
                cat = cat.split("(", 1)[0] + cat.rsplit(")", 1)[1]
        else:
            # otherwise
            ## break up underscores
            ## title case
            ## drop 'CIDC' / 'CIMAC' anywhere
            ## drop trailing 'Name'
            cat = c.replace("_", " ").title().replace("Cidc",
                                                      "").replace("Cimac", "")
            if cat.endswith("Name") and not cat == "Name":
                cat = cat[:-4]

        # strip so it's pretty!
        if cat.strip() not in columns:
            columns.append(cat.strip())
        else:
            # if it's a repeated name, pop it
            metadata_df.pop(c)

    # safe: `columns` was built in the same order as the surviving columns
    metadata_df.columns = columns
    print("CG Category options:", ", ".join(columns))

    # cut down to only the categories we want
    columns = [
        c for c in [
            "Participant Id",
            "Collection Event",
            "Cohort",
            "Treatment",
            "Disease progression",
            "RECIST clinical benefit status",
        ] if c in metadata_df.columns
    ]
    # order categories from fewest to most distinct values
    columns = sorted(columns, key=lambda c: len(metadata_df[c].unique()))
    metadata_df = metadata_df[columns]

    # shorten two long labels for display
    if "Disease progression" in columns:
        columns[columns.index("Disease progression")] = "Disease prog"
    if "RECIST clinical benefit status" in columns:
        columns[columns.index(
            "RECIST clinical benefit status")] = "Clin benefit"
    metadata_df.columns = columns

    # build the output str in ClusterGrammer compatible format
    categories = []
    for idx, row in metadata_df.iterrows():
        temp = [f"CIMAC Id: {idx}"]

        for cat, val in row.items():
            temp.append(f"{cat}: {val}")

        categories.append(tuple(temp))

    return categories
                   columns=['Close'])
scaler = MinMaxScaler().fit(train2)
# Rescale the series into the scaler's output range before windowing.
train2 = DataFrame(scaler.transform(train2),
                   index=train2.index,
                   columns=train2.columns)

# Build a sliding window: each row gets the previous `window_width` scaled
# values (via shift) followed by the current value; the first
# `window_width` rows are dropped because their lags are NaN.
train2 = pd.concat([train2.shift(i)
                    for i in range(window_width, 0, -1)] + [train2],
                   axis=1)[window_width:]
# Rename lag columns "<name>-<lag>", keeping 'Close' as the target column.
train2.columns = [
    column + '-' + str(t)
    for column, t in zip(train2.columns, range(window_width, 0, -1))
] + ['Close']

x_train = train2.drop('Close', axis=1).to_numpy()
y_train = train2.pop('Close').to_numpy()
# x_train = scaler.fit_transform(x_train)
# y_train = scaler.fit_transform(y_train.reshape(-1, 1))

# LSTM layers expect 3-D input: (samples, timesteps, features).
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))

load_model = True
if load_model:
    # Restore a previously saved model: architecture from JSON, then weights.
    with open(json_file, "r") as f:
        json_model = f.read()
    model = model_from_json(json_model)
    model.load_weights(weights_file)
else:
    # Build a fresh single-layer LSTM from scratch.
    model = Sequential()
    model.add(LSTM(40, input_shape=(x_train.shape[1], 1)))
    # model.add(Dropout(0.3))
Example #18
0
 def split_predictor(self, data: DataFrame) -> Tuple[DataFrame, Series]:
     """Detach the 'Targets' column from *data* (in place) and run it
     through the predictor; return the remaining frame and predictions."""
     targets = data.pop("Targets").to_list()
     return data, self.tensor_predictor(targets)
Example #19
0
    def _get_next_state(
        state: pd.DataFrame,
        seperation: float,
        cohesion: float,
        alignment: float,
        visibility: float,
        dimensions: tp.List[str],
        step: float,
    ) -> pd.DataFrame:
        """Advance the flock one time step.

        Cross-joins every Boid with every other, restricts to neighbors
        within `visibility`, aggregates separation/cohesion/alignment
        contributions per Boid, then integrates accelerations into new
        velocities and positions. Returns the updated state frame.

        NOTE(review): the local name `np` below is a list of neighbor-position
        column names — it shadows any module-level numpy alias inside this
        function.
        """

        # Self-cross-product Boids for all (center, neighbor) pairs.
        state["i"] = range(len(state))
        state["j"] = 0
        pairs = pd.merge(
            left=state,
            right=state.add_prefix(prefix="n"),
            left_on="j",
            right_on="nj",
            how="outer",
        )

        # Unpack columns.
        cols = [
            (
                f"p{i}",  # Positions
                f"v{i}",  # Velocitys
                f"np{i}",  # Neighbor positions.
                f"nv{i}",  # Neighbor velocitys.
                f"nd{i}",  # Neighbor distances.
            ) for i in dimensions
        ]
        p, v, np, nv, nd = map(list, zip(*cols))

        # For each dimension:
        for pi, npi, ndi in zip(p, np, nd):
            # Compute neighbor-to-center translations.
            pairs[ndi] = pairs[pi] - pairs[npi]

        # Compute neighbor-to-center distances.
        ndmag = pairs[nd].pow(2).sum(axis=1).pow(0.5)

        # Subset pairs to visible neighbors.
        pairs = pairs.loc[ndmag.le(visibility)]

        # For each dimension:
        for ndi in nd:
            # Transform neighbor-to-center translations to repulsions.
            # (inverse-square falloff: divide the unit direction by distance^2)
            pairs[ndi] /= ndmag.pow(2)

        # Compute neighbor velocity magnitudes.
        nvmag = pairs[nv].pow(2).sum(axis=1).pow(0.5)

        # For each dimension:
        for nvi in nv:
            # Transform neighbor velocities to (unit) neighbor directions.
            pairs[nvi] /= nvmag
            # Guard against division by zero for stationary neighbors.
            pairs[nvi].where(cond=nvmag.gt(0), other=0, inplace=True)

        # Nullify neighbors that are centers.
        pairs.loc[pairs["i"] == pairs["ni"], [*np, *nv, *nd]] = None

        # Augment repulsor behaviour.
        # Repulsor centers ignore neighbors; repulsor neighbors only repel
        # (strongly, x30), contributing nothing to cohesion/alignment.
        centers = pairs["t"].eq("repulsor")
        pairs.loc[centers, np] = None
        pairs.loc[centers, nv] = None
        pairs.loc[centers, nd] = None
        neighbors = pairs["nt"].eq("repulsor")
        pairs.loc[neighbors, np] = None
        pairs.loc[neighbors, nv] = None
        pairs.loc[neighbors, nd] *= 30

        # Aggregate neighbor information per center Boid.
        # "last" keeps the center's own fields; "mean" averages (skipping the
        # NaNs introduced above) over its visible neighbors.
        agg_last = {col: "last" for col in ("t", *p, *v)}
        agg_mean = {col: "mean" for col in (*np, *nv, *nd)}
        agg = {**agg_last, **agg_mean}
        groups = pairs.groupby(by="i", as_index=False, sort=False)
        state = groups.agg(func=agg).drop(columns="i")

        # For each dimension:
        for pi, npi in zip(p, np):
            # Transform mean-neighbor positions to center-to-mean-neighbor translations.
            state[npi] -= state[pi]

        # For each dimension:
        for pi, vi, npi, nvi, ndi in zip(p, v, np, nv, nd):
            # Compute accelerations.
            ai = 0
            ai += seperation * state.pop(ndi).where(cond=pd.notnull, other=0)
            ai += cohesion * state.pop(npi).where(cond=pd.notnull, other=0)
            ai += alignment * state.pop(nvi).where(cond=pd.notnull, other=0)
            # Update velocities and positions.
            state[vi] += ai * step**2
            state[pi] += state[vi] * step

        return state
Example #20
0
def create_frame_from_record(
    record: pd.DataFrame,
    col_start_dt: str,
    col_end_dt: str,
    frequency: str,
    col_date_nm: str,
    **kwargs,
) -> pd.DataFrame:
    """
    Create a frame with a date column ranging from the start_dt to the end_dt from a record.

    This function is to be used when you have a record of data from a DataFrame and you want to
    expand the data out over a date range. Note the DataFrame must contain a column for a start date
    and an end date. All columns from the original record will be duplicated the length of the date
    range. See example for usage.

    If wanting to do this for many records use the function ``footings.actuarial_tools.expand_frame_per_record``.
    When doing one record ``create_frame_from_record`` is faster than expand_frame_per_record.

    Parameters
    ----------
    record : pd.DataFrame
        A single record from a DataFrame.
    col_start_dt : str
        The name of the start date column.
    col_end_dt : str
        The name of the end date column.
    frequency : str
        The frequency at which records are created.
    col_date_nm : str
        The column name to assign the date column.
    kwargs :
        See kwargs under footings.actuarial_tools.create_frame

    Returns
    -------
    pandas.DataFrame
        A DataFrame with a date column, any passed kwargs, and columns from the record.

    Raises
    ------
    ValueError
        If the length of record is not 1.

    See Also
    --------
    footings.actuarial_tools.create_frame
    footings.actuarial_tools.expand_frame_per_record

    Examples
    --------
    >>> import pandas as pd
    >>> from footings.actuarial_tools import create_frame_from_record
    >>> record = pd.DataFrame(
    >>>     {
    >>>         "POLICY": ["P1"],
    >>>         "GENDER": ["M"],
    >>>         "START_DATE": [pd.Timestamp("2020-01-10")],
    >>>         "END_DATE": [pd.Timestamp("2020-05-30")]
    >>>     }
    >>> )
    >>> frame = create_frame_from_record(
    >>>     record=record,
    >>>     col_start_dt="START_DATE",
    >>>     col_end_dt="END_DATE",
    >>>     frequency="M",
    >>>     col_date_nm="DATE",
    >>>     duration_month="DURATION_MONTH"
    >>> )
    >>> frame
    >>> #       DATE            DURATION_MONTH	    POLICY      GENDER
    >>> # 0     2020-01-10	1	            P1	        M
    >>> # 1     2020-02-10	2	            P1	        M
    >>> # 2     2020-03-10	3	            P1	        M
    >>> # 3     2020-04-10	4	            P1	        M
    >>> # 4     2020-05-10	5	            P1	        M
    >>> # 5     2020-06-10	6	            P1	        M
    """
    records = record.to_dict(orient="records")
    if len(records) != 1:
        msg = f"The record must be a pd.DataFrame with one row. The record passed has {len(records)} rows."
        raise ValueError(msg)
    record_dict = records[0]
    # pop() both reads and removes the date columns, so only the remaining
    # record fields get broadcast over the generated date range
    start_dt = record_dict.pop(col_start_dt)
    end_dt = record_dict.pop(col_end_dt)

    return create_frame(
        start_dt=start_dt,
        end_dt=end_dt,
        frequency=frequency,
        col_date_nm=col_date_nm,
        **kwargs,
    ).assign(**record_dict)
Example #21
0
def remove_fea(df: pd.DataFrame, fea_list: list):
    """Drop the feature columns named in *fea_list* from *df* in place.

    Raises KeyError if any listed column is absent. A single vectorized
    drop replaces the original per-column pop loop.
    """
    df.drop(columns=fea_list, inplace=True)
Example #22
0
class TestJoin(tm.TestCase):

    _multiprocess_can_split_ = True

    def setUp(self):
        """Build shared fixtures: two key/value frames (df, df2) for merge
        tests, and a string-indexed target/source pair for join-on tests."""
        # aggregate multiple columns
        self.df = DataFrame({'key1': get_test_data(),
                             'key2': get_test_data(),
                             'data1': np.random.randn(N),
                             'data2': np.random.randn(N)})

        # exclude a couple keys for fun
        self.df = self.df[self.df['key2'] > 1]

        self.df2 = DataFrame({'key1': get_test_data(n=N // 5),
                              'key2': get_test_data(ngroups=NGROUPS // 2,
                                                    n=N // 5),
                              'value': np.random.randn(N // 5)})

        index, data = tm.getMixedTypeDict()
        self.target = DataFrame(data, index=index)

        # Join on string value
        self.source = DataFrame({'MergedA': data['A'], 'MergedD': data['D']},
                                index=data['C'])

    def test_cython_left_outer_join(self):
        """Check _join.left_outer_join against hand-computed take-indexers
        (-1 marks an unmatched left row)."""
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
        max_group = 5

        ls, rs = _join.left_outer_join(left, right, max_group)

        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5,
                     6, 6, 7, 7, 8, 8, 9, 10])
        exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3,
                     4, 5, 4, 5, 4, 5, -1, -1])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        self.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        self.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_cython_right_outer_join(self):
        """Right-outer join == left_outer_join with operands swapped; compare
        against hand-computed take-indexers."""
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
        max_group = 5

        rs, ls = _join.left_outer_join(right, left, max_group)

        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        #            0        1        1        1
        exp_li = a_([0, 1, 2, 3, 4, 5, 3, 4, 5, 3, 4, 5,
                     #            2        2        4
                     6, 7, 8, 6, 7, 8, -1])
        exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3,
                     4, 4, 4, 5, 5, 5, 6])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        self.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        self.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_cython_inner_join(self):
        """Check _join.inner_join: only matched key pairs appear (no -1
        sentinel rows in the expected indexers)."""
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64)
        max_group = 5

        ls, rs = _join.inner_join(left, right, max_group)

        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5,
                     6, 6, 7, 7, 8, 8])
        exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3,
                     4, 5, 4, 5, 4, 5])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        self.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        self.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_left_outer_join(self):
        """Left-style join checked on a single key and on both keys."""
        by_key2 = merge(self.df, self.df2, on='key2')
        _check_join(self.df, self.df2, by_key2, ['key2'], how='left')

        by_both = merge(self.df, self.df2)
        _check_join(self.df, self.df2, by_both, ['key1', 'key2'],
                    how='left')

    def test_right_outer_join(self):
        """Right join checked on a single key and on both keys."""
        by_key2 = merge(self.df, self.df2, on='key2', how='right')
        _check_join(self.df, self.df2, by_key2, ['key2'], how='right')

        by_both = merge(self.df, self.df2, how='right')
        _check_join(self.df, self.df2, by_both, ['key1', 'key2'],
                    how='right')

    def test_full_outer_join(self):
        """Full outer join checked on a single key and on both keys."""
        by_key2 = merge(self.df, self.df2, on='key2', how='outer')
        _check_join(self.df, self.df2, by_key2, ['key2'], how='outer')

        by_both = merge(self.df, self.df2, how='outer')
        _check_join(self.df, self.df2, by_both, ['key1', 'key2'],
                    how='outer')

    def test_inner_join(self):
        """Inner join checked on a single key and on both keys."""
        by_key2 = merge(self.df, self.df2, on='key2', how='inner')
        _check_join(self.df, self.df2, by_key2, ['key2'], how='inner')

        by_both = merge(self.df, self.df2, how='inner')
        _check_join(self.df, self.df2, by_both, ['key1', 'key2'],
                    how='inner')

    def test_handle_overlap(self):
        """Overlapping non-key columns get the caller-supplied suffixes."""
        merged = merge(self.df, self.df2, on='key2',
                       suffixes=['.foo', '.bar'])

        self.assertIn('key1.foo', merged)
        self.assertIn('key1.bar', merged)

    def test_handle_overlap_arbitrary_key(self):
        """Suffixes also apply when joining on different left/right keys."""
        merged = merge(self.df, self.df2,
                       left_on='key2', right_on='key1',
                       suffixes=['.foo', '.bar'])
        self.assertIn('key1.foo', merged)
        self.assertIn('key2.bar', merged)

    def test_join_on(self):
        """Exercise DataFrame.join(on=...): a basic column-on-index join,
        duplicate keys, chained joins with missing keys, a non-existent
        join column (KeyError), and an overlapping column (ValueError)."""
        target = self.target
        source = self.source

        merged = target.join(source, on='C')
        self.assert_series_equal(merged['MergedA'], target['A'],
                                 check_names=False)
        self.assert_series_equal(merged['MergedD'], target['D'],
                                 check_names=False)

        # join with duplicates (fix regression from DataFrame/Matrix merge)
        df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])
        joined = df.join(df2, on='key')
        expected = DataFrame({'key': ['a', 'a', 'b', 'b', 'c'],
                              'value': [0, 0, 1, 1, 2]})
        assert_frame_equal(joined, expected)

        # Test when some are missing
        df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'],
                         columns=['one'])
        df_b = DataFrame([['foo'], ['bar']], index=[1, 2],
                         columns=['two'])
        df_c = DataFrame([[1], [2]], index=[1, 2],
                         columns=['three'])
        joined = df_a.join(df_b, on='one')
        joined = joined.join(df_c, on='one')
        self.assertTrue(np.isnan(joined['two']['c']))
        self.assertTrue(np.isnan(joined['three']['c']))

        # merge column not present
        self.assertRaises(KeyError, target.join, source, on='E')

        # overlap
        source_copy = source.copy()
        source_copy['A'] = 0
        self.assertRaises(ValueError, target.join, source_copy, on='A')

    def test_join_on_fails_with_different_right_index(self):
        """right_index=True with a mismatched MultiIndex raises ValueError."""
        with tm.assertRaises(ValueError):
            left = DataFrame({'a': np.random.choice(['m', 'f'], size=3),
                              'b': np.random.randn(3)})
            right = DataFrame({'a': np.random.choice(['m', 'f'], size=10),
                               'b': np.random.randn(10)},
                              index=tm.makeCustomIndex(10, 2))
            merge(left, right, left_on='a', right_index=True)

    def test_join_on_fails_with_different_left_index(self):
        """left_index=True with a mismatched MultiIndex raises ValueError."""
        with tm.assertRaises(ValueError):
            left = DataFrame({'a': np.random.choice(['m', 'f'], size=3),
                              'b': np.random.randn(3)},
                             index=tm.makeCustomIndex(10, 2))
            right = DataFrame({'a': np.random.choice(['m', 'f'], size=10),
                               'b': np.random.randn(10)})
            merge(left, right, right_on='b', left_index=True)

    def test_join_on_fails_with_different_column_counts(self):
        """Mismatched numbers of left/right join keys raise ValueError."""
        with tm.assertRaises(ValueError):
            left = DataFrame({'a': np.random.choice(['m', 'f'], size=3),
                              'b': np.random.randn(3)})
            right = DataFrame({'a': np.random.choice(['m', 'f'], size=10),
                               'b': np.random.randn(10)},
                              index=tm.makeCustomIndex(10, 2))
            merge(left, right, right_on='a', left_on=['a', 'b'])

    def test_join_on_fails_with_wrong_object_type(self):
        """GH12081: merging a non-DataFrame raises ValueError naming the
        offending type, whichever side it appears on."""
        df = DataFrame({'a': [1, 1]})

        for bad in [Series([0, 1]), 2, 'str', None, np.array([0, 1])]:
            with tm.assertRaisesRegexp(ValueError, str(type(bad))):
                merge(bad, df, left_on='a', right_on='a')
            with tm.assertRaisesRegexp(ValueError, str(type(bad))):
                merge(df, bad, left_on='a', right_on='a')

    def test_join_on_pass_vector(self):
        """Passing a Series as `on` behaves like naming the same column."""
        expected = self.target.join(self.source, on='C')
        del expected['C']

        key_vector = self.target.pop('C')
        result = self.target.join(self.source, on=key_vector)
        assert_frame_equal(result, expected)

    def test_join_with_len0(self):
        """Joining against an emptied right frame keeps its columns all-NaN;
        the inner variant yields zero rows with the same columns."""
        empty_source = self.source.reindex([])

        merged = self.target.join(empty_source, on='C')
        for col in self.source:
            self.assertIn(col, merged)
            self.assertTrue(merged[col].isnull().all())

        merged2 = self.target.join(empty_source, on='C',
                                   how='inner')
        self.assert_index_equal(merged2.columns, merged.columns)
        self.assertEqual(len(merged2), 0)

    def test_join_on_inner(self):
        """Inner join-on equals the default join-on restricted to rows with
        a non-null match."""
        left = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']})
        lookup = DataFrame({'value': [0, 1]}, index=['a', 'b'])

        joined = left.join(lookup, on='key', how='inner')

        expected = left.join(lookup, on='key')
        expected = expected[expected['value'].notnull()]
        self.assert_series_equal(joined['key'], expected['key'],
                                 check_dtype=False)
        self.assert_series_equal(joined['value'], expected['value'],
                                 check_dtype=False)
        self.assert_index_equal(joined.index, expected.index)

    def test_join_on_singlekey_list(self):
        """A one-element key list (on=['key']) is equivalent to on='key'."""
        left = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
        lookup = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])

        # corner cases
        via_list = left.join(lookup, on=['key'])
        via_scalar = left.join(lookup, on='key')

        assert_frame_equal(via_list, via_scalar)

    def test_join_on_series(self):
        """Joining on a Series matches joining on the one-column frame."""
        as_series = self.source['MergedA']
        as_frame = self.source[['MergedA']]
        result = self.target.join(as_series, on='C')
        expected = self.target.join(as_frame, on='C')
        assert_frame_equal(result, expected)

    def test_join_on_series_buglet(self):
        """GH #638: join on a column against a named Series broadcasts the
        matched value to every duplicate key."""
        frame = DataFrame({'a': [1, 1]})
        lookup = Series([2], index=[1], name='b')
        result = frame.join(lookup, on='a')
        expected = DataFrame({'a': [1, 1],
                              'b': [2, 2]}, index=frame.index)
        tm.assert_frame_equal(result, expected)

    def test_join_index_mixed(self):
        """Index joins across mixed dtypes: suffixed overlap compared against
        a hand-built join, then all four `how` kinds on frames with no
        overlapping columns/blocks."""
        df1 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True},
                        index=np.arange(10),
                        columns=['A', 'B', 'C', 'D'])
        self.assertEqual(df1['B'].dtype, np.int64)
        self.assertEqual(df1['D'].dtype, np.bool_)

        df2 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True},
                        index=np.arange(0, 10, 2),
                        columns=['A', 'B', 'C', 'D'])

        # overlap
        joined = df1.join(df2, lsuffix='_one', rsuffix='_two')
        expected_columns = ['A_one', 'B_one', 'C_one', 'D_one',
                            'A_two', 'B_two', 'C_two', 'D_two']
        df1.columns = expected_columns[:4]
        df2.columns = expected_columns[4:]
        expected = _join_by_hand(df1, df2)
        assert_frame_equal(joined, expected)

        # no overlapping blocks
        df1 = DataFrame(index=np.arange(10))
        df1['bool'] = True
        df1['string'] = 'foo'

        df2 = DataFrame(index=np.arange(5, 15))
        df2['int'] = 1
        df2['float'] = 1.

        for kind in ['inner', 'outer', 'left', 'right']:

            joined = df1.join(df2, how=kind)
            expected = _join_by_hand(df1, df2, how=kind)
            assert_frame_equal(joined, expected)

            joined = df2.join(df1, how=kind)
            expected = _join_by_hand(df2, df1, how=kind)
            assert_frame_equal(joined, expected)

    def test_join_empty_bug(self):
        """Smoke test: outer-joining from an empty frame raised in 0.4.3."""
        empty = DataFrame()
        other = DataFrame([3], index=[0], columns=['A'])
        empty.join(other, how='outer')

    def test_join_unconsolidated(self):
        """GH #331: joining frames with unconsolidated blocks must work."""
        left = DataFrame(randn(30, 2), columns=['a', 'b'])
        # appending a column leaves the frame's blocks unconsolidated
        left['c'] = Series(randn(30))
        right = DataFrame(randn(30, 1), columns=['q'])

        # it works!
        left.join(right)
        right.join(left)

    def test_join_multiindex(self):
        """Outer join of two MultiIndexed frames preserves index names,
        whether the frames are sorted by level 0 or level 1.

        NOTE(review): relies on long-removed APIs (sortlevel,
        MultiIndex._tuple_index) — this is legacy pandas test code."""
        index1 = MultiIndex.from_arrays([['a', 'a', 'a', 'b', 'b', 'b'],
                                         [1, 2, 3, 1, 2, 3]],
                                        names=['first', 'second'])

        index2 = MultiIndex.from_arrays([['b', 'b', 'b', 'c', 'c', 'c'],
                                         [1, 2, 3, 1, 2, 3]],
                                        names=['first', 'second'])

        df1 = DataFrame(data=np.random.randn(6), index=index1,
                        columns=['var X'])
        df2 = DataFrame(data=np.random.randn(6), index=index2,
                        columns=['var Y'])

        df1 = df1.sortlevel(0)
        df2 = df2.sortlevel(0)

        joined = df1.join(df2, how='outer')
        ex_index = index1._tuple_index.union(index2._tuple_index)
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names
        assert_frame_equal(joined, expected)
        self.assertEqual(joined.index.names, index1.names)

        df1 = df1.sortlevel(1)
        df2 = df2.sortlevel(1)

        joined = df1.join(df2, how='outer').sortlevel(0)
        ex_index = index1._tuple_index.union(index2._tuple_index)
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names

        assert_frame_equal(joined, expected)
        self.assertEqual(joined.index.names, index1.names)

    def test_join_inner_multiindex(self):
        """Inner join of key columns onto a MultiIndexed frame equals the
        corresponding merge against the reset-index frame, and yields a
        monotonic index."""
        key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
                'qux', 'snap']
        key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
                'three', 'one']

        data = np.random.randn(len(key1))
        data = DataFrame({'key1': key1, 'key2': key2,
                          'data': data})

        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        to_join = DataFrame(np.random.randn(10, 3), index=index,
                            columns=['j_one', 'j_two', 'j_three'])

        joined = data.join(to_join, on=['key1', 'key2'], how='inner')
        expected = merge(data, to_join.reset_index(),
                         left_on=['key1', 'key2'],
                         right_on=['first', 'second'], how='inner',
                         sort=False)

        expected2 = merge(to_join, data,
                          right_on=['key1', 'key2'], left_index=True,
                          how='inner', sort=False)
        assert_frame_equal(joined, expected2.reindex_like(joined))

        expected2 = merge(to_join, data, right_on=['key1', 'key2'],
                          left_index=True, how='inner', sort=False)

        # drop the key columns the reset_index merge re-introduced
        expected = expected.drop(['first', 'second'], axis=1)
        expected.index = joined.index

        self.assertTrue(joined.index.is_monotonic)
        assert_frame_equal(joined, expected)

        # _assert_same_contents(expected, expected2.ix[:, expected.columns])

    def test_join_hierarchical_mixed(self):
        """GH 2024: merging a frame with MultiIndex (hierarchical) columns
        against a flat-column frame warns (GH 9455/12219) but keeps both
        tuple and flat column labels accessible."""
        df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c'])
        new_df = df.groupby(['a']).agg({'b': [np.mean, np.sum]})
        other_df = DataFrame(
            [(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd'])
        other_df.set_index('a', inplace=True)
        # GH 9455, 12219
        with tm.assert_produces_warning(UserWarning):
            result = merge(new_df, other_df, left_index=True, right_index=True)
        self.assertTrue(('b', 'mean') in result)
        self.assertTrue('b' in result)

    def test_join_float64_float32(self):
        """Joins/merges preserve per-column dtypes: float32 columns stay
        float32 and do not get upcast by the join."""
        a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64)
        b = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32)
        joined = a.join(b)
        self.assertEqual(joined.dtypes['a'], 'float64')
        self.assertEqual(joined.dtypes['b'], 'float64')
        self.assertEqual(joined.dtypes['c'], 'float32')

        a = np.random.randint(0, 5, 100).astype('int64')
        b = np.random.random(100).astype('float64')
        c = np.random.random(100).astype('float32')
        df = DataFrame({'a': a, 'b': b, 'c': c})
        xpdf = DataFrame({'a': a, 'b': b, 'c': c})
        s = DataFrame(np.random.random(5).astype('float32'), columns=['md'])
        rs = df.merge(s, left_on='a', right_index=True)
        self.assertEqual(rs.dtypes['a'], 'int64')
        self.assertEqual(rs.dtypes['b'], 'float64')
        self.assertEqual(rs.dtypes['c'], 'float32')
        self.assertEqual(rs.dtypes['md'], 'float32')

        xp = xpdf.merge(s, left_on='a', right_index=True)
        assert_frame_equal(rs, xp)

    def test_join_many_non_unique_index(self):
        """Multi-frame join ([idf2, idf3]) on a non-unique MultiIndex matches
        chained pairwise merges for both outer and inner; also GH 11519:
        joining a Series with a duplicated index across all four `how`s.

        NOTE(review): uses the removed `.ix` indexer — legacy pandas test."""
        df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
        df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
        df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])

        result = idf1.join([idf2, idf3], how='outer')

        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer')
        expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer')

        result = result.reset_index()
        expected = expected[result.columns]
        expected['a'] = expected.a.astype('int64')
        expected['b'] = expected.b.astype('int64')
        assert_frame_equal(result, expected)

        df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
        df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
        df3 = DataFrame(
            {"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])
        result = idf1.join([idf2, idf3], how='inner')

        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner')
        expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner')

        result = result.reset_index()

        assert_frame_equal(result, expected.ix[:, result.columns])

        # GH 11519
        df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                              'foo', 'bar', 'foo', 'foo'],
                        'B': ['one', 'one', 'two', 'three',
                              'two', 'two', 'one', 'three'],
                        'C': np.random.randn(8),
                        'D': np.random.randn(8)})
        s = Series(np.repeat(np.arange(8), 2),
                   index=np.repeat(np.arange(8), 2), name='TEST')
        inner = df.join(s, how='inner')
        outer = df.join(s, how='outer')
        left = df.join(s, how='left')
        right = df.join(s, how='right')
        assert_frame_equal(inner, outer)
        assert_frame_equal(inner, left)
        assert_frame_equal(inner, right)

    def test_join_sort(self):
        """sort=True orders the result by join key; sort=False keeps the
        left frame's original row order."""
        left = DataFrame({'key': ['foo', 'bar', 'baz', 'foo'],
                          'value': [1, 2, 3, 4]})
        right = DataFrame({'value2': ['a', 'b', 'c']},
                          index=['bar', 'baz', 'foo'])

        sorted_join = left.join(right, on='key', sort=True)
        expected = DataFrame({'key': ['bar', 'baz', 'foo', 'foo'],
                              'value': [2, 3, 1, 4],
                              'value2': ['a', 'b', 'c', 'c']},
                             index=[1, 2, 0, 3])
        assert_frame_equal(sorted_join, expected)

        # smoke test
        unsorted_join = left.join(right, on='key', sort=False)
        self.assert_index_equal(unsorted_join.index, pd.Index(lrange(4)))

    def test_join_mixed_non_unique_index(self):
        """Joining frames whose indexes mix ints and strings and contain
        duplicates aligns correctly instead of raising (GH 12814,
        unorderable types in py3 with a non-unique index)."""
        # GH 12814, unorderable types in py3 with a non-unique index
        df1 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 3, 'a'])
        df2 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 3, 3, 4])
        result = df1.join(df2)
        # right-side duplicate index label 3 duplicates the left row
        expected = DataFrame({'a': [1, 2, 3, 3, 4],
                              'b': [5, np.nan, 6, 7, np.nan]},
                             index=[1, 2, 3, 3, 'a'])
        tm.assert_frame_equal(result, expected)

        # same scenario with the duplicate label on the left side
        df3 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 2, 'a'])
        df4 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 2, 3, 4])
        result = df3.join(df4)
        expected = DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 6, np.nan]},
                             index=[1, 2, 2, 'a'])
        tm.assert_frame_equal(result, expected)

    def test_mixed_type_join_with_suffix(self):
        """Self-join of a mixed-dtype (numeric + object) groupby result with
        rsuffix must not raise (GH #916)."""
        # GH #916
        df = DataFrame(np.random.randn(20, 6),
                       columns=['a', 'b', 'c', 'd', 'e', 'f'])
        df.insert(0, 'id', 0)
        df.insert(5, 'dt', 'foo')

        grouped = df.groupby('id')
        mn = grouped.mean()
        cn = grouped.count()

        # it works!
        mn.join(cn, rsuffix='_right')

    def test_join_many(self):
        """Joining a list of frames reconstructs the original frame and
        respects the ``how`` argument; ``on=`` with a frame list raises."""
        df = DataFrame(np.random.randn(10, 6), columns=list('abcdef'))
        df_list = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]]

        joined = df_list[0].join(df_list[1:])
        tm.assert_frame_equal(joined, df)

        # frames with partially overlapping indexes
        df_list = [df[['a', 'b']][:-2],
                   df[['c', 'd']][2:], df[['e', 'f']][1:9]]

        def _check_diff_index(df_list, result, exp_index):
            # expected result: reindex each piece to the target index first
            reindexed = [x.reindex(exp_index) for x in df_list]
            expected = reindexed[0].join(reindexed[1:])
            tm.assert_frame_equal(result, expected)

        # different join types
        joined = df_list[0].join(df_list[1:], how='outer')
        _check_diff_index(df_list, joined, df.index)

        joined = df_list[0].join(df_list[1:])
        _check_diff_index(df_list, joined, df_list[0].index)

        joined = df_list[0].join(df_list[1:], how='inner')
        _check_diff_index(df_list, joined, df.index[2:8])

        # on= is not supported when joining a list of frames
        self.assertRaises(ValueError, df_list[0].join, df_list[1:], on='a')

    def test_join_many_mixed(self):
        """Joining a list of mixed-dtype column slices reassembles the
        original frame."""
        df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
        df['key'] = ['foo', 'bar'] * 4
        df1 = df.ix[:, ['A', 'B']]
        df2 = df.ix[:, ['C', 'D']]
        df3 = df.ix[:, ['key']]

        result = df1.join([df2, df3])
        assert_frame_equal(result, df)

    def test_join_dups(self):
        """Joins and merges involving duplicate column names apply suffixes
        consistently (GH 4975)."""

        # joining dups
        df = concat([DataFrame(np.random.randn(10, 4),
                               columns=['A', 'A', 'B', 'B']),
                     DataFrame(np.random.randint(0, 10, size=20)
                               .reshape(10, 2),
                               columns=['A', 'C'])],
                    axis=1)

        expected = concat([df, df], axis=1)
        result = df.join(df, rsuffix='_2')
        result.columns = expected.columns
        assert_frame_equal(result, expected)

        # GH 4975, invalid join on dups
        w = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        x = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        y = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        z = DataFrame(np.random.randn(4, 2), columns=["x", "y"])

        # chained merges generate duplicate suffixed column names
        dta = x.merge(y, left_index=True, right_index=True).merge(
            z, left_index=True, right_index=True, how="outer")
        dta = dta.merge(w, left_index=True, right_index=True)
        expected = concat([x, y, z, w], axis=1)
        expected.columns = ['x_x', 'y_x', 'x_y',
                            'y_y', 'x_x', 'y_x', 'x_y', 'y_y']
        assert_frame_equal(dta, expected)

    def test_panel_join(self):
        """Panel.join supports left/right/inner/outer joins across the
        items, major and minor axes."""
        panel = tm.makePanel()
        tm.add_nans(panel)

        # two overlapping slices of the same panel
        p1 = panel.ix[:2, :10, :3]
        p2 = panel.ix[2:, 5:, 2:]

        # left join
        result = p1.join(p2)
        expected = p1.copy()
        expected['ItemC'] = p2['ItemC']
        tm.assert_panel_equal(result, expected)

        # right join
        result = p1.join(p2, how='right')
        expected = p2.copy()
        expected['ItemA'] = p1['ItemA']
        expected['ItemB'] = p1['ItemB']
        expected = expected.reindex(items=['ItemA', 'ItemB', 'ItemC'])
        tm.assert_panel_equal(result, expected)

        # inner join
        result = p1.join(p2, how='inner')
        expected = panel.ix[:, 5:10, 2:3]
        tm.assert_panel_equal(result, expected)

        # outer join
        result = p1.join(p2, how='outer')
        expected = p1.reindex(major=panel.major_axis,
                              minor=panel.minor_axis)
        expected = expected.join(p2.reindex(major=panel.major_axis,
                                            minor=panel.minor_axis))
        tm.assert_panel_equal(result, expected)

    def test_panel_join_overlap(self):
        """Panel.join applies lsuffix/rsuffix to the overlapping items and
        leaves non-overlapping items untouched."""
        panel = tm.makePanel()
        tm.add_nans(panel)

        p1 = panel.ix[['ItemA', 'ItemB', 'ItemC']]
        p2 = panel.ix[['ItemB', 'ItemC']]

        # Expected index is
        #
        # ItemA, ItemB_p1, ItemC_p1, ItemB_p2, ItemC_p2
        joined = p1.join(p2, lsuffix='_p1', rsuffix='_p2')
        p1_suf = p1.ix[['ItemB', 'ItemC']].add_suffix('_p1')
        p2_suf = p2.ix[['ItemB', 'ItemC']].add_suffix('_p2')
        no_overlap = panel.ix[['ItemA']]
        expected = no_overlap.join(p1_suf.join(p2_suf))
        tm.assert_panel_equal(joined, expected)

    def test_panel_join_many(self):
        """Panel.join with a list of panels reassembles the original, and
        invalid argument combinations raise ValueError."""
        # temporarily enlarge the test-panel item count
        tm.K = 10
        panel = tm.makePanel()
        tm.K = 4

        panels = [panel.ix[:2], panel.ix[2:6], panel.ix[6:]]

        joined = panels[0].join(panels[1:])
        tm.assert_panel_equal(joined, panel)

        # overlapping major/minor axes
        panels = [panel.ix[:2, :-5], panel.ix[2:6, 2:], panel.ix[6:, 5:-7]]

        data_dict = {}
        for p in panels:
            data_dict.update(p.iteritems())

        joined = panels[0].join(panels[1:], how='inner')
        expected = pd.Panel.from_dict(data_dict, intersect=True)
        tm.assert_panel_equal(joined, expected)

        joined = panels[0].join(panels[1:], how='outer')
        expected = pd.Panel.from_dict(data_dict, intersect=False)
        tm.assert_panel_equal(joined, expected)

        # edge cases: suffixes are meaningless for list joins; 'right' is
        # not a supported direction for multi-panel join
        self.assertRaises(ValueError, panels[0].join, panels[1:],
                          how='outer', lsuffix='foo', rsuffix='bar')
        self.assertRaises(ValueError, panels[0].join, panels[1:],
                          how='right')
Example #23
0
def functional_connectivity(data: pd.DataFrame, metric: str = 'cov', **kwargs) -> np.ndarray:
    """Calculate functional connectivity of node timeseries in data.

    Parameters
    ----------
    data
        Pandas dataframe containing the simulation results. If a `time`
        column is present it is popped and used as the index (note: this
        mutates the passed dataframe).
    metric
        Type of connectivity measurement that should be used.
            - `cov` for covariance (uses `np.cov`)
            - `corr` for Pearson correlation (uses `np.corrcoef`)
            - `csd` for cross-spectral density (uses `mne.time_frequency.csd_array_morlet`)
            - `coh` for coherence (uses `mne.connectivity.spectral_connectivity`)
            - `cohy` for coherency (uses `mne.connectivity.spectral_connectivity`)
            - `imcoh` for imaginary coherence (uses `mne.connectivity.spectral_connectivity`)
            - `plv` for phase locking value (uses `mne.connectivity.spectral_connectivity`)
            - `ppc` for pairwise phase consistency (uses `mne.connectivity.spectral_connectivity`)
            - `pli` for phase lag index (uses `mne.connectivity.spectral_connectivity`)
            - `pli2_unbiased` for unbiased estimate of squared phase lag index
               (uses `mne.connectivity.spectral_connectivity`)
            - `wpli` for weighted phase lag index (uses `mne.connectivity.spectral_connectivity`)
            - `wpli2_debiased` for debiased weighted phase lag index (uses `mne.connectivity.spectral_connectivity`)
    kwargs
        Additional keyword arguments passed to respective function used for fc calculation.

    Returns
    -------
    np.ndarray
        Pairwise functional connectivity

    Raises
    ------
    ValueError
        If `metric` is not one of the supported options.

    """

    if 'time' in data.columns.values:
        idx = data.pop('time')
        data.index = idx

    # Metrics handled by mne.connectivity.spectral_connectivity. Membership is
    # checked against an explicit tuple: the previous substring test
    # (`metric in 'cohcohyimcoh...'`) wrongly accepted any substring of that
    # string, e.g. 'ohc' or 'li'.
    spectral_metrics = ('coh', 'cohy', 'imcoh', 'plv', 'ppc', 'pli',
                        'pli2_unbiased', 'wpli', 'wpli2_debiased')

    # calculate functional connectivity
    ###################################

    if metric == 'cov':

        # covariance
        fc = np.cov(data.values.T, **kwargs)

    elif metric == 'corr':

        # Pearson correlation coefficient
        fc = np.corrcoef(data.values.T, **kwargs)

    elif metric == 'csd':

        # cross-spectral density, magnitude averaged over frequencies.
        # sampling frequency is derived from the index step (assumes a
        # uniformly sampled time index).
        from mne.time_frequency import csd_array_morlet
        fc = np.abs(csd_array_morlet(X=np.reshape(data.values, (1, data.shape[1], data.shape[0])),
                                     sfreq=1./(data.index[1] - data.index[0]),
                                     ch_names=data.columns.values,
                                     **kwargs).mean().get_data())

    elif metric in spectral_metrics:

        # phase-based connectivity/synchronization measurement
        from mne.connectivity import spectral_connectivity
        fc, _, _, _, _ = spectral_connectivity(np.reshape(data.values.T, (1, data.shape[1], data.shape[0])),
                                               method=metric,
                                               sfreq=1./(data.index[1] - data.index[0]),
                                               **kwargs)
        fc = fc.squeeze()

    else:

        raise ValueError(f'FC metric is not supported by this function: {metric}. Check the documentation of the '
                         f'argument `metric` for valid options.')

    return fc
Example #24
0
 def labeling(self, data: pd.DataFrame) -> pd.Series:
     """Derive binary labels from ``price_range``: ranges 0 and 1 map to
     class 0, everything else to class 1. Records the class list on
     ``self`` and drops ``price_range`` from ``data`` in place."""
     labels = data['price_range'].apply(lambda price: 1 if price not in (0, 1) else 0)
     self.classes = [0, 1]
     data.pop('price_range')
     return labels
Example #25
0
def explode_id_list(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """Replace a column holding lists with one integer-labelled column per
    list position, joined back onto the frame by row position.

    Note: ``pop`` removes the column from the caller's frame in place.
    """
    id_lists = df.pop(column_name)
    position_columns = pd.DataFrame(id_lists.to_list())
    return df.join(position_columns)
# Fill missing values in f2 with its most frequent value
age_maxf = df['f2'].value_counts().index[0]
df['f2'].fillna(age_maxf, inplace=True)

# Select the rows of the Weight column whose unit is lbs
rows_with_lbs = df['Weight'].str.contains('lbs').fillna(False)
# Convert lbs to kgs (2.2 lbs = 1 kg)
for i, lbs_row in df[rows_with_lbs].iterrows():
    # Slice off the last three characters, i.e. drop the 'lbs' suffix
    weight = int(float(lbs_row['Weight'][:-3]) / 2.2)
    df.at[i, 'Weight'] = '{}kgs'.format(weight)

# Fill missing weights with the most frequent value
weight_maxf = df['Weight'].value_counts().index[0]
df['Weight'].fillna(weight_maxf, inplace=True)

# Split the Name column; the second argument of split expands the result
# into a DataFrame
df['FirstName'], df['LastName'] = df['Name'].str.split(' ', True).str

# Drop the original Name column
df.drop(['Name'], axis=1, inplace=True)

# Move the FirstName and LastName columns to positions 1 and 2
first_name = df.pop('FirstName')
df.insert(1, 'FirstName', first_name)
last_name = df.pop('LastName')
df.insert(2, 'LastName', last_name)

print(df)
Example #27
0
    def __get_empty_price_symbols(all_prices_df: pd.DataFrame):
        """Return the symbols whose mean price across all columns is zero.

        The ``symbol`` column is popped and installed as the index (this
        mutates the passed frame) before averaging row-wise.
        """
        symbols = all_prices_df.pop('symbol')
        all_prices_df.index = symbols
        mean_prices = all_prices_df.mean(axis=1)

        return list(mean_prices.index[mean_prices == 0])
Example #28
0
 def __init__(self, data: pd.DataFrame, primary_id_column: str,
              output_column_name: str) -> None:
     """Store the frame and column bookkeeping for later processing.

     The primary-id column is popped out of ``data`` — since the frame is
     stored, not copied, this also removes the column from the caller's
     dataframe.
     """
     self._data = data
     self._id_column = primary_id_column
     self._id_series = data.pop(self._id_column)
     self._output_column_name = output_column_name
Example #29
0
    def _tidy_up_df(self, df: pd.DataFrame, dataset) -> pd.DataFrame:
        """
        Implementation of _tidy_up_df for DWD Observations

        Melts the wide observation DataFrame into tidy (long) format and
        attaches the quality series that matches each melted value.

        :param df: untidy DataFrame
        :param dataset: dataset enumeration
        :return: tidied DataFrame
        """

        droppable_columns = [
            # Hourly
            # Cloud type
            DwdObservationParameter.HOURLY.CLOUD_TYPE.
            CLOUD_TYPE_LAYER1_ABBREVIATION.value,
            DwdObservationParameter.HOURLY.CLOUD_TYPE.
            CLOUD_TYPE_LAYER2_ABBREVIATION.value,
            DwdObservationParameter.HOURLY.CLOUD_TYPE.
            CLOUD_TYPE_LAYER3_ABBREVIATION.value,
            DwdObservationParameter.HOURLY.CLOUD_TYPE.
            CLOUD_TYPE_LAYER4_ABBREVIATION.value,
            # Cloudiness
            DwdObservationParameter.HOURLY.CLOUDINESS.
            CLOUD_COVER_TOTAL_INDICATOR.value,
            # Solar
            DwdObservationParameter.HOURLY.SOLAR.END_OF_INTERVAL.value,
            DwdObservationParameter.HOURLY.SOLAR.TRUE_LOCAL_TIME.value,
            # Visibility
            DwdObservationParameter.HOURLY.VISIBILITY.
            VISIBILITY_RANGE_INDICATOR.value,
            # Weather
            DwdObservationParameter.HOURLY.WEATHER_PHENOMENA.WEATHER_TEXT.
            value,
        ]

        # Drop string columns, can't be coerced to float
        df = df.drop(
            columns=droppable_columns,
            errors="ignore",
        )

        resolution = self.sr.stations.resolution

        if dataset == DwdObservationDataset.CLIMATE_SUMMARY:
            if resolution == Resolution.DAILY:
                # the two quality columns cover 2 and 12 parameters
                # respectively; repeat each so one quality value lines up
                # with every melted parameter value
                quality_wind = df.pop(DwdObservationParameter.DAILY.
                                      CLIMATE_SUMMARY.QUALITY_WIND.value)
                quality_general = df.pop(DwdObservationParameter.DAILY.
                                         CLIMATE_SUMMARY.QUALITY_GENERAL.value)

                quality = pd.concat([
                    pd.Series(repeat(quality_wind.tolist(), 2)).explode(),
                    pd.Series(repeat(quality_general.tolist(), 12)).explode(),
                ])

            elif resolution in (Resolution.MONTHLY, Resolution.ANNUAL):
                quality_general = df.pop(DwdObservationParameter.MONTHLY.
                                         CLIMATE_SUMMARY.QUALITY_GENERAL.value)
                quality_precipitation = df.pop(
                    DwdObservationParameter.MONTHLY.CLIMATE_SUMMARY.
                    QUALITY_PRECIPITATION.value)
                quality = pd.concat([
                    pd.Series(repeat(quality_general, 9)).explode(),
                    pd.Series(repeat(quality_precipitation, 2)).explode(),
                ])
            # NOTE(review): for CLIMATE_SUMMARY at any other resolution
            # ``quality`` stays unbound and the assignment below raises
            # NameError — presumably unreachable; confirm against callers.
        elif (resolution == Resolution.SUBDAILY
                and dataset == DwdObservationDataset.WIND_EXTREME):
            # BUGFIX: the condition previously read
            # ``... and DwdObservationDataset.WIND_EXTREME`` — a truthy enum
            # member rather than a comparison — so this branch fired for
            # every subdaily dataset, not just WIND_EXTREME.
            quality_fx_3 = df.pop("qn_8_3")
            quality_fx_6 = df.pop("qn_8_6")
            quality = pd.concat([quality_fx_3, quality_fx_6])
        else:
            # generic case: the third column is the single quality column,
            # repeated once per remaining parameter column
            quality = df.pop(df.columns[2])
            quality = pd.Series(repeat(quality, df.shape[1])).explode()

        possible_id_vars = (
            Columns.STATION_ID.value,
            Columns.DATE.value,
            Columns.FROM_DATE.value,
            Columns.TO_DATE.value,
        )

        id_vars = df.columns.intersection(possible_id_vars)

        df_tidy = df.melt(
            id_vars=id_vars,
            var_name=Columns.PARAMETER.value,
            value_name=Columns.VALUE.value,
        )
        # align by position, not index, since quality was built by concat
        df_tidy[Columns.QUALITY.value] = quality.reset_index(drop=True)

        return df_tidy
Example #30
0
def format_data(df: pd.DataFrame, stats: Dict, metadata: pd.DataFrame, ids: Ids) -> Tuple[List, List, List]:
    """Prepare a result frame for a Handsontable-style front-end table.

    Builds (1) per-column format options, (2) merged-cell spans for the
    header rows, and (3) the table payload: header rows (annotated with
    analysis name/tech/method/edge counts) followed by the data rows.

    :param df: result frame; expected to contain a 'TARGET' column that is
        moved to position 7 as 'Gene ID'
    :param stats: dict with 'edge_counts', 'total' and
        'induce_repress_count' entries (pandas objects keyed by column pairs)
    :param metadata: per-analysis metadata, looked up by column label
    :param ids: mapping from (name, uuid) column pairs to display ids
        — assumed; confirm against callers
    :return: (column_formats, merged_cells, columns + data rows)
    """
    df = df.reset_index()
    df.insert(7, 'Gene ID', df.pop('TARGET'))

    # classify every column (e.g. 'Pvalue', 'Log2FC') for formatting below
    col_types = np.fromiter(map(get_col_type, df.columns), 'U20', df.shape[1])

    num_cols = is_numeric_column(col_types)
    num_df = df.loc[:, num_cols]
    p_values = col_types == 'Pvalue'

    edge_counts = stats['edge_counts']
    edge_counts.index = edge_counts.index.tolist()

    total_edge_counts = stats['total']
    total_edge_counts.index = total_edge_counts.index.tolist()

    fc_cols = col_types == 'Log2FC'
    induce_repress = stats['induce_repress_count']
    induce_repress.index = induce_repress.index.tolist()

    empty_cols = df.isnull().all(axis=0)

    # for JSON response, can't have NaN or Inf
    df.loc[:, num_cols] = num_df.mask(np.isinf(num_df), np.nan)
    df = df.replace({np.nan: None})

    # first 8 columns are fixed metadata columns; the rest are analyses
    data_col_len = 8

    columns = list(map(list, zip_longest(*((col,) for col in df.columns[:data_col_len]),
                                         *df.columns[data_col_len:])))

    none_cols = [None] * len(columns[0])

    # insert three extra header rows (filled in the loop below)
    columns[-1: -1] = [none_cols.copy(),
                       none_cols.copy(),
                       none_cols.copy()]  # avoid references

    prev = (None,) * 5
    name = None
    tech = None
    method = None
    edge = None
    ind_rep = None
    for i, col in enumerate(islice(zip(*columns), data_col_len, None), data_col_len):
        try:
            # only recompute the annotations when the analysis changes
            if col[0:2] != prev[0:2]:
                name, _, uuid_ = col[0]
                analysis = metadata.loc[:, col[1]]

                name = get_name(name, analysis, ids[col[0:2]])

                tech = get_tech(analysis)
                method = get_analysis_method(analysis)

                edge_count = edge_counts.at[col[:2]]
                total_edge_count = total_edge_counts.at[col[:2]]

                edge = get_edge(analysis).format(edge_count)
                if edge_count != total_edge_count:
                    edge += " ({})".format(total_edge_count)

                try:
                    ind_rep = "Induced-{0[induced]:} Repressed-{0[repressed]:}".format(
                        induce_repress[(*col[:2], 'Log2FC')])
                except KeyError:
                    ind_rep = None

            columns[0][i] = name
            columns[1][i] = tech
            columns[2][i] = method
            columns[3][i] = edge
            columns[4][i] = ind_rep
        except KeyError:
            # analyses missing from metadata/ids keep blank header cells
            pass

        if col[-1] == 'DAP':
            columns[2][i] = columns[1][i] = None
        prev = col

    merged_cells = get_merge_cells(columns)

    # Column formatting for Handsontable
    column_formats = []
    for i, (num, p, fc, empty, col) in enumerate(zip(num_cols, p_values, fc_cols, empty_cols, zip(*columns))):
        opt = {}
        if num:
            if p:
                opt['type'] = 'p_value'
            else:
                opt['type'] = 'numeric'

            if fc:
                opt['renderer'] = 'renderFc'
            elif i >= data_col_len and not p:
                opt['renderer'] = 'renderExp'

            if i >= data_col_len:
                opt['validator'] = 'exponential'
        else:
            opt['type'] = 'text'

            if col[-1] == 'EDGE':
                opt['className'] = 'htCenter'
                opt['renderer'] = 'renderEdge'

        if empty:
            # collapse all-null columns to minimal width
            opt['width'] = 1

        column_formats.append(opt)
    return column_formats, merged_cells, columns + df.values.tolist()
Example #31
0
    def test_set_index_cast_datetimeindex(self):
        """set_index casts datetime columns to DatetimeIndex; tz-aware
        datetimes stay object dtype when converted (GH 6032, GH 6785)."""
        df = DataFrame({
            'A': [datetime(2000, 1, 1) + timedelta(i) for i in range(1000)],
            'B':
            np.random.randn(1000)
        })

        idf = df.set_index('A')
        assert isinstance(idf.index, pd.DatetimeIndex)

        # don't cast a DatetimeIndex WITH a tz, leave as object
        # GH 6032
        i = (pd.DatetimeIndex(
            to_datetime(['2013-1-1 13:00', '2013-1-2 14:00'],
                        errors="raise")).tz_localize('US/Pacific'))
        df = DataFrame(np.random.randn(2, 1), columns=['A'])

        expected = Series(
            np.array([
                pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),
                pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')
            ],
                     dtype="object"))

        # convert index to series
        result = Series(i)
        assert_series_equal(result, expected)

        # assign to frame
        df['B'] = i
        result = df['B']
        assert_series_equal(result, expected, check_names=False)
        assert result.name == 'B'

        # keep the timezone
        result = i.to_series(keep_tz=True)
        assert_series_equal(result.reset_index(drop=True), expected)

        # convert to utc
        df['C'] = i.to_series().reset_index(drop=True)
        result = df['C']
        comp = pd.DatetimeIndex(expected.values)
        comp = comp.tz_localize(None)
        tm.assert_numpy_array_equal(result.values, comp.values)

        # list of datetimes with a tz
        df['D'] = i.to_pydatetime()
        result = df['D']
        assert_series_equal(result, expected, check_names=False)
        assert result.name == 'D'

        # GH 6785
        # set the index manually
        import pytz
        df = DataFrame([{
            'ts': datetime(2014, 4, 1, tzinfo=pytz.utc),
            'foo': 1
        }])
        expected = df.set_index('ts')
        df.index = df['ts']
        df.pop('ts')
        assert_frame_equal(df, expected)
Example #32
0
class TestJoin(object):

    def setup_method(self, method):
        """Build the shared join/merge fixtures used by the TestJoin cases."""
        # aggregate multiple columns
        self.df = DataFrame({'key1': get_test_data(),
                             'key2': get_test_data(),
                             'data1': np.random.randn(N),
                             'data2': np.random.randn(N)})

        # exclude a couple keys for fun
        self.df = self.df[self.df['key2'] > 1]

        self.df2 = DataFrame({'key1': get_test_data(n=N // 5),
                              'key2': get_test_data(ngroups=NGROUPS // 2,
                                                    n=N // 5),
                              'value': np.random.randn(N // 5)})

        index, data = tm.getMixedTypeDict()
        self.target = DataFrame(data, index=index)

        # Join on string value
        self.source = DataFrame({'MergedA': data['A'], 'MergedD': data['D']},
                                index=data['C'])

    def test_cython_left_outer_join(self):
        """libjoin.left_outer_join returns mergesort-ordered take indexers,
        with -1 marking unmatched left keys."""
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
        max_group = 5

        ls, rs = libjoin.left_outer_join(left, right, max_group)

        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5,
                     6, 6, 7, 7, 8, 8, 9, 10])
        exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3,
                     4, 5, 4, 5, 4, 5, -1, -1])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_cython_right_outer_join(self):
        """Right outer join is left_outer_join with the operands swapped."""
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
        max_group = 5

        rs, ls = libjoin.left_outer_join(right, left, max_group)

        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        #            0        1        1        1
        exp_li = a_([0, 1, 2, 3, 4, 5, 3, 4, 5, 3, 4, 5,
                     #            2        2        4
                     6, 7, 8, 6, 7, 8, -1])
        exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3,
                     4, 4, 4, 5, 5, 5, 6])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_cython_inner_join(self):
        """libjoin.inner_join drops keys present on only one side (note the
        unmatched 3s on the left and 4s on the right)."""
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64)
        max_group = 5

        ls, rs = libjoin.inner_join(left, right, max_group)

        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5,
                     6, 6, 7, 7, 8, 8])
        exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3,
                     4, 5, 4, 5, 4, 5])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_left_outer_join(self):
        """merge defaults verified against _check_join for how='left'."""
        joined_key2 = merge(self.df, self.df2, on='key2')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='left')

        joined_both = merge(self.df, self.df2)
        _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
                    how='left')

    def test_right_outer_join(self):
        """merge(how='right') verified against _check_join."""
        joined_key2 = merge(self.df, self.df2, on='key2', how='right')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='right')

        joined_both = merge(self.df, self.df2, how='right')
        _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
                    how='right')

    def test_full_outer_join(self):
        """merge(how='outer') verified against _check_join."""
        joined_key2 = merge(self.df, self.df2, on='key2', how='outer')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='outer')

        joined_both = merge(self.df, self.df2, how='outer')
        _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
                    how='outer')

    def test_inner_join(self):
        """merge(how='inner') verified against _check_join."""
        joined_key2 = merge(self.df, self.df2, on='key2', how='inner')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='inner')

        joined_both = merge(self.df, self.df2, how='inner')
        _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
                    how='inner')

    def test_handle_overlap(self):
        """Overlapping non-key columns receive the user-supplied suffixes."""
        result = merge(self.df, self.df2, on='key2',
                       suffixes=['.foo', '.bar'])

        for suffixed in ('key1.foo', 'key1.bar'):
            assert suffixed in result

    def test_handle_overlap_arbitrary_key(self):
        """Suffixes also apply when joining on differently named key
        columns, so both original key names survive with suffixes."""
        result = merge(self.df, self.df2,
                       left_on='key2', right_on='key1',
                       suffixes=['.foo', '.bar'])
        for suffixed in ('key1.foo', 'key2.bar'):
            assert suffixed in result

    def test_join_on(self):
        """join(on=...) aligns on a column's values: duplicates broadcast,
        missing keys produce NaN, absent/incompatible columns raise."""
        target = self.target
        source = self.source

        merged = target.join(source, on='C')
        tm.assert_series_equal(merged['MergedA'], target['A'],
                               check_names=False)
        tm.assert_series_equal(merged['MergedD'], target['D'],
                               check_names=False)

        # join with duplicates (fix regression from DataFrame/Matrix merge)
        df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])
        joined = df.join(df2, on='key')
        expected = DataFrame({'key': ['a', 'a', 'b', 'b', 'c'],
                              'value': [0, 0, 1, 1, 2]})
        assert_frame_equal(joined, expected)

        # Test when some are missing
        df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'],
                         columns=['one'])
        df_b = DataFrame([['foo'], ['bar']], index=[1, 2],
                         columns=['two'])
        df_c = DataFrame([[1], [2]], index=[1, 2],
                         columns=['three'])
        joined = df_a.join(df_b, on='one')
        joined = joined.join(df_c, on='one')
        assert np.isnan(joined['two']['c'])
        assert np.isnan(joined['three']['c'])

        # merge column not present
        with pytest.raises(KeyError, match="^'E'$"):
            target.join(source, on='E')

        # overlap: joining a float column against an object index
        source_copy = source.copy()
        source_copy['A'] = 0
        msg = ("You are trying to merge on float64 and object columns. If"
               " you wish to proceed you should use pd.concat")
        with pytest.raises(ValueError, match=msg):
            target.join(source_copy, on='A')

    def test_join_on_fails_with_different_right_index(self):
        """A single left_on key against a right MultiIndex raises."""
        df = DataFrame({'a': np.random.choice(['m', 'f'], size=3),
                        'b': np.random.randn(3)})
        df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10),
                         'b': np.random.randn(10)},
                        index=tm.makeCustomIndex(10, 2))
        msg = (r'len\(left_on\) must equal the number of levels in the index'
               ' of "right"')
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, left_on='a', right_index=True)

    def test_join_on_fails_with_different_left_index(self):
        """A single right_on key against a left MultiIndex raises."""
        df = DataFrame({'a': np.random.choice(['m', 'f'], size=3),
                        'b': np.random.randn(3)},
                       index=tm.makeCustomIndex(3, 2))
        df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10),
                         'b': np.random.randn(10)})
        msg = (r'len\(right_on\) must equal the number of levels in the index'
               ' of "left"')
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, right_on='b', left_index=True)

    def test_join_on_fails_with_different_column_counts(self):
        """Mismatched numbers of left_on/right_on keys raise ValueError."""
        df = DataFrame({'a': np.random.choice(['m', 'f'], size=3),
                        'b': np.random.randn(3)})
        df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10),
                         'b': np.random.randn(10)},
                        index=tm.makeCustomIndex(10, 2))
        msg = r"len\(right_on\) must equal len\(left_on\)"
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, right_on='a', left_on=['a', 'b'])

    @pytest.mark.parametrize("wrong_type", [2, 'str', None, np.array([0, 1])])
    def test_join_on_fails_with_wrong_object_type(self, wrong_type):
        """merge rejects non-Series/DataFrame operands with TypeError."""
        # GH12081 - original issue

        # GH21220 - merging of Series and DataFrame is now allowed
        # Edited test to remove the Series object from test parameters

        df = DataFrame({'a': [1, 1]})
        msg = ("Can only merge Series or DataFrame objects, a {} was passed"
               .format(str(type(wrong_type))))
        with pytest.raises(TypeError, match=msg):
            merge(wrong_type, df, left_on='a', right_on='a')
        with pytest.raises(TypeError, match=msg):
            merge(df, wrong_type, left_on='a', right_on='a')

    def test_join_on_pass_vector(self):
        """Passing a Series (rather than a column name) as ``on`` works."""
        # build the expectation before popping 'C' from the fixture
        expected = self.target.join(self.source, on='C')
        del expected['C']

        key_series = self.target.pop('C')
        result = self.target.join(self.source, on=key_series)
        assert_frame_equal(result, expected)

    def test_join_with_len0(self):
        """Joining against an empty frame keeps the columns: left join
        yields all-NaN values, inner join yields zero rows."""
        # nothing to merge
        merged = self.target.join(self.source.reindex([]), on='C')
        for col in self.source:
            assert col in merged
            assert merged[col].isna().all()

        merged2 = self.target.join(self.source.reindex([]), on='C',
                                   how='inner')
        tm.assert_index_equal(merged2.columns, merged.columns)
        assert len(merged2) == 0

    def test_join_on_inner(self):
        """join(on=..., how='inner') equals the left join filtered to
        matched rows."""
        df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1]}, index=['a', 'b'])

        joined = df.join(df2, on='key', how='inner')

        expected = df.join(df2, on='key')
        expected = expected[expected['value'].notna()]
        tm.assert_series_equal(joined['key'], expected['key'],
                               check_dtype=False)
        tm.assert_series_equal(joined['value'], expected['value'],
                               check_dtype=False)
        tm.assert_index_equal(joined.index, expected.index)

    def test_join_on_singlekey_list(self):
        df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])

        # corner cases
        joined = df.join(df2, on=['key'])
        expected = df.join(df2, on='key')

        assert_frame_equal(joined, expected)

    def test_join_on_series(self):
        # joining on a single Series must match joining on the
        # one-column DataFrame that contains it
        result = self.target.join(self.source['MergedA'], on='C')
        expected = self.target.join(self.source[['MergedA']], on='C')
        assert_frame_equal(result, expected)

    def test_join_on_series_buglet(self):
        # GH #638
        df = DataFrame({'a': [1, 1]})
        ds = Series([2], index=[1], name='b')
        result = df.join(ds, on='a')
        expected = DataFrame({'a': [1, 1],
                              'b': [2, 2]}, index=df.index)
        tm.assert_frame_equal(result, expected)

    def test_join_index_mixed(self, join_type):
        # Frames with no overlapping blocks (bool/object vs int/float) must
        # join the same way a manual column-by-column reindex does, in
        # either order.
        left = DataFrame(index=np.arange(10))
        left['bool'] = True
        left['string'] = 'foo'

        right = DataFrame(index=np.arange(5, 15))
        right['int'] = 1
        right['float'] = 1.

        for first, second in ((left, right), (right, left)):
            joined = first.join(second, how=join_type)
            expected = _join_by_hand(first, second, how=join_type)
            assert_frame_equal(joined, expected)

    def test_join_index_mixed_overlap(self):
        # Fully overlapping column names require suffixes; the result must
        # match a manual join of the suffixed frames.
        left = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True},
                         index=np.arange(10),
                         columns=['A', 'B', 'C', 'D'])
        assert left['B'].dtype == np.int64
        assert left['D'].dtype == np.bool_

        right = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True},
                          index=np.arange(0, 10, 2),
                          columns=['A', 'B', 'C', 'D'])

        # every column overlaps, so each side gets its suffix
        joined = left.join(right, lsuffix='_one', rsuffix='_two')
        expected_columns = ['A_one', 'B_one', 'C_one', 'D_one',
                            'A_two', 'B_two', 'C_two', 'D_two']
        left.columns = expected_columns[:4]
        right.columns = expected_columns[4:]
        expected = _join_by_hand(left, right)
        assert_frame_equal(joined, expected)

    def test_join_empty_bug(self):
        # generated an exception in 0.4.3
        x = DataFrame()
        x.join(DataFrame([3], index=[0], columns=['A']), how='outer')

    def test_join_unconsolidated(self):
        # GH #331
        a = DataFrame(randn(30, 2), columns=['a', 'b'])
        c = Series(randn(30))
        a['c'] = c
        d = DataFrame(randn(30, 1), columns=['q'])

        # it works!
        a.join(d)
        d.join(a)

    def test_join_multiindex(self):
        # Outer-joining two frames with partially overlapping MultiIndexes
        # must union the index tuples and preserve the level names,
        # regardless of which level the inputs were sorted by.
        index1 = MultiIndex.from_arrays([['a', 'a', 'a', 'b', 'b', 'b'],
                                         [1, 2, 3, 1, 2, 3]],
                                        names=['first', 'second'])

        index2 = MultiIndex.from_arrays([['b', 'b', 'b', 'c', 'c', 'c'],
                                         [1, 2, 3, 1, 2, 3]],
                                        names=['first', 'second'])

        df1 = DataFrame(data=np.random.randn(6), index=index1,
                        columns=['var X'])
        df2 = DataFrame(data=np.random.randn(6), index=index2,
                        columns=['var Y'])

        # case 1: both inputs sorted by the first level
        df1 = df1.sort_index(level=0)
        df2 = df2.sort_index(level=0)

        joined = df1.join(df2, how='outer')
        # expected: reindex both sides to the union of the index tuples;
        # the tuple-level union drops the names, so restore them explicitly
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names
        assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

        # case 2: both inputs sorted by the second level; the join result is
        # re-sorted by level 0 so it can be compared against the same
        # expectation
        df1 = df1.sort_index(level=1)
        df2 = df2.sort_index(level=1)

        joined = df1.join(df2, how='outer').sort_index(level=0)
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names

        assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

    def test_join_inner_multiindex(self):
        key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
                'qux', 'snap']
        key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
                'three', 'one']

        data = np.random.randn(len(key1))
        data = DataFrame({'key1': key1, 'key2': key2,
                          'data': data})

        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                  [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        to_join = DataFrame(np.random.randn(10, 3), index=index,
                            columns=['j_one', 'j_two', 'j_three'])

        joined = data.join(to_join, on=['key1', 'key2'], how='inner')
        expected = merge(data, to_join.reset_index(),
                         left_on=['key1', 'key2'],
                         right_on=['first', 'second'], how='inner',
                         sort=False)

        expected2 = merge(to_join, data,
                          right_on=['key1', 'key2'], left_index=True,
                          how='inner', sort=False)
        assert_frame_equal(joined, expected2.reindex_like(joined))

        expected2 = merge(to_join, data, right_on=['key1', 'key2'],
                          left_index=True, how='inner', sort=False)

        expected = expected.drop(['first', 'second'], axis=1)
        expected.index = joined.index

        assert joined.index.is_monotonic
        assert_frame_equal(joined, expected)

        # _assert_same_contents(expected, expected2.loc[:, expected.columns])

    def test_join_hierarchical_mixed(self):
        # GH 2024: merging a frame with hierarchical columns (produced by a
        # multi-function agg) into a frame with flat columns
        flat = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c'])
        hier = flat.groupby(['a']).agg({'b': [np.mean, np.sum]})
        other = DataFrame(
            [(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd'])
        other.set_index('a', inplace=True)
        # GH 9455, 12219: merging frames with differing column levels warns
        with tm.assert_produces_warning(UserWarning):
            result = merge(hier, other, left_index=True, right_index=True)
        assert ('b', 'mean') in result
        assert 'b' in result

    def test_join_float64_float32(self):

        a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64)
        b = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32)
        joined = a.join(b)
        assert joined.dtypes['a'] == 'float64'
        assert joined.dtypes['b'] == 'float64'
        assert joined.dtypes['c'] == 'float32'

        a = np.random.randint(0, 5, 100).astype('int64')
        b = np.random.random(100).astype('float64')
        c = np.random.random(100).astype('float32')
        df = DataFrame({'a': a, 'b': b, 'c': c})
        xpdf = DataFrame({'a': a, 'b': b, 'c': c})
        s = DataFrame(np.random.random(5).astype('float32'), columns=['md'])
        rs = df.merge(s, left_on='a', right_index=True)
        assert rs.dtypes['a'] == 'int64'
        assert rs.dtypes['b'] == 'float64'
        assert rs.dtypes['c'] == 'float32'
        assert rs.dtypes['md'] == 'float32'

        xp = xpdf.merge(s, left_on='a', right_index=True)
        assert_frame_equal(rs, xp)

    def test_join_many_non_unique_index(self):
        # Joining a list of frames that share a non-unique MultiIndex must
        # match the equivalent chain of two-key merges.
        df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
        df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
        df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])

        result = idf1.join([idf2, idf3], how='outer')

        # expected: the same join expressed as pairwise merges on the keys
        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer')
        expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer')

        result = result.reset_index()
        expected = expected[result.columns]
        # cast the key columns back to int64 (the outer merge may have
        # upcast them)
        expected['a'] = expected.a.astype('int64')
        expected['b'] = expected.b.astype('int64')
        assert_frame_equal(result, expected)

        # same comparison for an inner join with more duplicated keys
        df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
        df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
        df3 = DataFrame(
            {"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])
        result = idf1.join([idf2, idf3], how='inner')

        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner')
        expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner')

        result = result.reset_index()

        assert_frame_equal(result, expected.loc[:, result.columns])

        # GH 11519: joining a Series with a repeated integer index must give
        # the same result for every ``how`` when all keys exist on both sides
        df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                              'foo', 'bar', 'foo', 'foo'],
                        'B': ['one', 'one', 'two', 'three',
                              'two', 'two', 'one', 'three'],
                        'C': np.random.randn(8),
                        'D': np.random.randn(8)})
        s = Series(np.repeat(np.arange(8), 2),
                   index=np.repeat(np.arange(8), 2), name='TEST')
        inner = df.join(s, how='inner')
        outer = df.join(s, how='outer')
        left = df.join(s, how='left')
        right = df.join(s, how='right')
        assert_frame_equal(inner, outer)
        assert_frame_equal(inner, left)
        assert_frame_equal(inner, right)

    def test_join_sort(self):
        left = DataFrame({'key': ['foo', 'bar', 'baz', 'foo'],
                          'value': [1, 2, 3, 4]})
        right = DataFrame({'value2': ['a', 'b', 'c']},
                          index=['bar', 'baz', 'foo'])

        joined = left.join(right, on='key', sort=True)
        expected = DataFrame({'key': ['bar', 'baz', 'foo', 'foo'],
                              'value': [2, 3, 1, 4],
                              'value2': ['a', 'b', 'c', 'c']},
                             index=[1, 2, 0, 3])
        assert_frame_equal(joined, expected)

        # smoke test
        joined = left.join(right, on='key', sort=False)
        tm.assert_index_equal(joined.index, pd.Index(lrange(4)))

    def test_join_mixed_non_unique_index(self):
        # GH 12814, unorderable types in py3 with a non-unique index
        df1 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 3, 'a'])
        df2 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 3, 3, 4])
        result = df1.join(df2)
        expected = DataFrame({'a': [1, 2, 3, 3, 4],
                              'b': [5, np.nan, 6, 7, np.nan]},
                             index=[1, 2, 3, 3, 'a'])
        tm.assert_frame_equal(result, expected)

        df3 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 2, 'a'])
        df4 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 2, 3, 4])
        result = df3.join(df4)
        expected = DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 6, np.nan]},
                             index=[1, 2, 2, 'a'])
        tm.assert_frame_equal(result, expected)

    def test_join_non_unique_period_index(self):
        # GH #16871
        index = pd.period_range('2016-01-01', periods=16, freq='M')
        df = DataFrame([i for i in range(len(index))],
                       index=index, columns=['pnum'])
        df2 = concat([df, df])
        result = df.join(df2, how='inner', rsuffix='_df2')
        expected = DataFrame(
            np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2),
            columns=['pnum', 'pnum_df2'], index=df2.sort_index().index)
        tm.assert_frame_equal(result, expected)

    def test_mixed_type_join_with_suffix(self):
        # GH #916: smoke test — joining two frames derived from a
        # mixed-dtype groupby (numeric plus object columns) used to fail
        frame = DataFrame(np.random.randn(20, 6),
                          columns=['a', 'b', 'c', 'd', 'e', 'f'])
        frame.insert(0, 'id', 0)
        frame.insert(5, 'dt', 'foo')

        grouped = frame.groupby('id')
        means = grouped.mean()
        counts = grouped.count()

        # it works!
        means.join(counts, rsuffix='_right')

    def test_join_many(self):
        # Joining a list of frames must reassemble the original columns and
        # honor ``how`` across all of them.
        base = DataFrame(np.random.randn(10, 6), columns=list('abcdef'))
        pieces = [base[['a', 'b']], base[['c', 'd']], base[['e', 'f']]]

        joined = pieces[0].join(pieces[1:])
        tm.assert_frame_equal(joined, base)

        # pieces trimmed to different, partially overlapping index ranges
        pieces = [base[['a', 'b']][:-2],
                  base[['c', 'd']][2:], base[['e', 'f']][1:9]]

        def _check_diff_index(frames, result, exp_index):
            # align all pieces to exp_index, re-join, and compare
            aligned = [x.reindex(exp_index) for x in frames]
            expected = aligned[0].join(aligned[1:])
            tm.assert_frame_equal(result, expected)

        # different join types
        _check_diff_index(pieces, pieces[0].join(pieces[1:], how='outer'),
                          base.index)
        _check_diff_index(pieces, pieces[0].join(pieces[1:]),
                          pieces[0].index)
        _check_diff_index(pieces, pieces[0].join(pieces[1:], how='inner'),
                          base.index[2:8])

        # joining a list of frames with ``on=`` is not supported
        msg = "Joining multiple DataFrames only supported for joining on index"
        with pytest.raises(ValueError, match=msg):
            pieces[0].join(pieces[1:], on='a')

    def test_join_many_mixed(self):
        df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
        df['key'] = ['foo', 'bar'] * 4
        df1 = df.loc[:, ['A', 'B']]
        df2 = df.loc[:, ['C', 'D']]
        df3 = df.loc[:, ['key']]

        result = df1.join([df2, df3])
        assert_frame_equal(result, df)

    def test_join_dups(self):

        # joining dups: a frame whose column labels repeat ('A', 'A', ...)
        df = concat([DataFrame(np.random.randn(10, 4),
                               columns=['A', 'A', 'B', 'B']),
                     DataFrame(np.random.randint(0, 10, size=20)
                               .reshape(10, 2),
                               columns=['A', 'C'])],
                    axis=1)

        # a self-join with a suffix must line up, column for column, with a
        # plain concat of the frame with itself (labels normalized first)
        expected = concat([df, df], axis=1)
        result = df.join(df, rsuffix='_2')
        result.columns = expected.columns
        assert_frame_equal(result, expected)

        # GH 4975, invalid join on dups
        w = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        x = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        y = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        z = DataFrame(np.random.randn(4, 2), columns=["x", "y"])

        # chained index-merges whose default suffixing produces duplicate
        # labels ('x_x' twice, etc.)
        dta = x.merge(y, left_index=True, right_index=True).merge(
            z, left_index=True, right_index=True, how="outer")
        dta = dta.merge(w, left_index=True, right_index=True)
        expected = concat([x, y, z, w], axis=1)
        expected.columns = ['x_x', 'y_x', 'x_y',
                            'y_y', 'x_x', 'y_x', 'x_y', 'y_y']
        assert_frame_equal(dta, expected)

    def test_panel_join(self):
        # Exercise join on the legacy Panel container for all four ``how``
        # modes; catch_warnings suppresses its deprecation warnings.
        with catch_warnings(record=True):
            panel = tm.makePanel()
            tm.add_nans(panel)

            p1 = panel.iloc[:2, :10, :3]
            p2 = panel.iloc[2:, 5:, 2:]

            # left join: keeps p1's axes and pulls in p2's extra item
            result = p1.join(p2)
            expected = p1.copy()
            expected['ItemC'] = p2['ItemC']
            tm.assert_panel_equal(result, expected)

            # right join: keeps p2's axes and pulls in p1's items
            result = p1.join(p2, how='right')
            expected = p2.copy()
            expected['ItemA'] = p1['ItemA']
            expected['ItemB'] = p1['ItemB']
            expected = expected.reindex(items=['ItemA', 'ItemB', 'ItemC'])
            tm.assert_panel_equal(result, expected)

            # inner join: intersection of the major/minor axes
            result = p1.join(p2, how='inner')
            expected = panel.iloc[:, 5:10, 2:3]
            tm.assert_panel_equal(result, expected)

            # outer join: union of the major/minor axes
            result = p1.join(p2, how='outer')
            expected = p1.reindex(major=panel.major_axis,
                                  minor=panel.minor_axis)
            expected = expected.join(p2.reindex(major=panel.major_axis,
                                                minor=panel.minor_axis))
            tm.assert_panel_equal(result, expected)

    def test_panel_join_overlap(self):
        # Overlapping Panel items require suffixes, mirroring DataFrame
        # column suffixing on join.
        with catch_warnings(record=True):
            panel = tm.makePanel()
            tm.add_nans(panel)

            p1 = panel.loc[['ItemA', 'ItemB', 'ItemC']]
            p2 = panel.loc[['ItemB', 'ItemC']]

            # Expected index is
            #
            # ItemA, ItemB_p1, ItemC_p1, ItemB_p2, ItemC_p2
            joined = p1.join(p2, lsuffix='_p1', rsuffix='_p2')
            p1_suf = p1.loc[['ItemB', 'ItemC']].add_suffix('_p1')
            p2_suf = p2.loc[['ItemB', 'ItemC']].add_suffix('_p2')
            # ItemA exists on only one side, so it keeps its plain name
            no_overlap = panel.loc[['ItemA']]
            expected = no_overlap.join(p1_suf.join(p2_suf))
            tm.assert_panel_equal(joined, expected)

    def test_panel_join_many(self):
        # Joining a list of Panels, including axis-subset pieces and the
        # unsupported suffix / right-join combinations.
        with catch_warnings(record=True):
            # temporarily enlarge the test fixture so the panel can be
            # split three ways by item, then restore the default
            tm.K = 10
            panel = tm.makePanel()
            tm.K = 4

            panels = [panel.iloc[:2], panel.iloc[2:6], panel.iloc[6:]]

            joined = panels[0].join(panels[1:])
            tm.assert_panel_equal(joined, panel)

            # pieces with differing major/minor axes
            panels = [panel.iloc[:2, :-5],
                      panel.iloc[2:6, 2:],
                      panel.iloc[6:, 5:-7]]

            data_dict = {}
            for p in panels:
                data_dict.update(p.iteritems())

            # inner: intersect the axes; outer: union them
            joined = panels[0].join(panels[1:], how='inner')
            expected = pd.Panel.from_dict(data_dict, intersect=True)
            tm.assert_panel_equal(joined, expected)

            joined = panels[0].join(panels[1:], how='outer')
            expected = pd.Panel.from_dict(data_dict, intersect=False)
            tm.assert_panel_equal(joined, expected)

            # edge cases
            msg = "Suffixes not supported when passing multiple panels"
            with pytest.raises(ValueError, match=msg):
                panels[0].join(panels[1:], how='outer', lsuffix='foo',
                               rsuffix='bar')
            msg = "Right join not supported with multiple panels"
            with pytest.raises(ValueError, match=msg):
                panels[0].join(panels[1:], how='right')

    def test_join_multi_to_multi(self, join_type):
        # GH 20475: joining when ``on`` names a subset of the caller's
        # MultiIndex levels and the full index of the other frame
        leftindex = MultiIndex.from_product([list('abc'), list('xy'), [1, 2]],
                                            names=['abc', 'xy', 'num'])
        left = DataFrame({'v1': range(12)}, index=leftindex)

        rightindex = MultiIndex.from_product([list('abc'), list('xy')],
                                             names=['abc', 'xy'])
        right = DataFrame({'v2': [100 * i for i in range(1, 7)]},
                          index=rightindex)

        result = left.join(right, on=['abc', 'xy'], how=join_type)
        # the same operation spelled out as reset_index / merge / set_index
        flat_left = left.reset_index()
        flat_right = right.reset_index()
        expected = (flat_left
                    .merge(flat_right, on=['abc', 'xy'], how=join_type)
                    .set_index(['abc', 'xy', 'num']))
        assert_frame_equal(expected, result)

        # the join keys must cover every level of the other frame's index
        msg = (r'len\(left_on\) must equal the number of levels in the index'
               ' of "right"')
        for bad_left, bad_right, bad_on in ((left, right, 'xy'),
                                            (right, left, ['abc', 'xy'])):
            with pytest.raises(ValueError, match=msg):
                bad_left.join(bad_right, on=bad_on, how=join_type)
Example #33
0
    def test_set_index_datetime(self):
        # GH#3950
        # tz-aware datetime values used as (Multi)Index levels must keep
        # their timezone through set_index, swaplevel and get_level_values.
        df = DataFrame({
            "label": ["a", "a", "a", "b", "b", "b"],
            "datetime": [
                "2011-07-19 07:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 09:00:00",
                "2011-07-19 07:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 09:00:00",
            ],
            "value":
            range(6),
        })
        # parse as UTC then convert, so the index is tz-aware US/Pacific
        df.index = to_datetime(df.pop("datetime"), utc=True)
        df.index = df.index.tz_convert("US/Pacific")

        # the datetime level should hold only the three distinct timestamps
        expected = DatetimeIndex(
            [
                "2011-07-19 07:00:00", "2011-07-19 08:00:00",
                "2011-07-19 09:00:00"
            ],
            name="datetime",
        )
        expected = expected.tz_localize("UTC").tz_convert("US/Pacific")

        df = df.set_index("label", append=True)
        tm.assert_index_equal(df.index.levels[0], expected)
        tm.assert_index_equal(df.index.levels[1],
                              Index(["a", "b"], name="label"))
        assert df.index.names == ["datetime", "label"]

        # swaplevel must not disturb either level's values or tz
        df = df.swaplevel(0, 1)
        tm.assert_index_equal(df.index.levels[0],
                              Index(["a", "b"], name="label"))
        tm.assert_index_equal(df.index.levels[1], expected)
        assert df.index.names == ["label", "datetime"]

        df = DataFrame(np.random.random(6))
        idx1 = DatetimeIndex(
            [
                "2011-07-19 07:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 09:00:00",
                "2011-07-19 07:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 09:00:00",
            ],
            tz="US/Eastern",
        )
        idx2 = DatetimeIndex(
            [
                "2012-04-01 09:00",
                "2012-04-01 09:00",
                "2012-04-01 09:00",
                "2012-04-02 09:00",
                "2012-04-02 09:00",
                "2012-04-02 09:00",
            ],
            tz="US/Eastern",
        )
        idx3 = date_range("2011-01-01 09:00", periods=6, tz="Asia/Tokyo")
        # drop the freq so idx3 compares equal to the rebuilt index level
        idx3 = idx3._with_freq(None)

        # stack the three tz-aware indexes as successive index levels
        df = df.set_index(idx1)
        df = df.set_index(idx2, append=True)
        df = df.set_index(idx3, append=True)

        expected1 = DatetimeIndex(
            [
                "2011-07-19 07:00:00", "2011-07-19 08:00:00",
                "2011-07-19 09:00:00"
            ],
            tz="US/Eastern",
        )
        expected2 = DatetimeIndex(["2012-04-01 09:00", "2012-04-02 09:00"],
                                  tz="US/Eastern")

        # the levels are deduplicated ...
        tm.assert_index_equal(df.index.levels[0], expected1)
        tm.assert_index_equal(df.index.levels[1], expected2)
        tm.assert_index_equal(df.index.levels[2], idx3)

        # GH#7092
        # ... but get_level_values must return the full, repeated values
        tm.assert_index_equal(df.index.get_level_values(0), idx1)
        tm.assert_index_equal(df.index.get_level_values(1), idx2)
        tm.assert_index_equal(df.index.get_level_values(2), idx3)
Example #34
0
            'Diagnosis'], axis=1).drop(labels=0)
# Cast every model input column to float with one vectorized call instead of
# eight separate, near-identical assignments.
_numeric_cols = ['General feeling', 'Age', 'Gender', 'Sexuality', 'Anger',
                 'Disgust', 'Fear', 'Joy', 'Sadness']
riskModelData[_numeric_cols] = riskModelData[_numeric_cols].astype(float)

# Composite risk score: negative emotions and the general-feeling score add
# to risk, Joy subtracts from it.
riskModelData['Risk Factor'] = (
    riskModelData['Anger'] + riskModelData['Disgust']
    + riskModelData['Fear'] - riskModelData['Joy']
    + riskModelData['Sadness'] + riskModelData['General feeling']
)
# The emotion columns are folded into the score, so drop them as features.
riskModelData = riskModelData.drop(
    labels=['General feeling', 'Anger', 'Disgust', 'Fear', 'Joy', 'Sadness'],
    axis=1)

# pop() both returns the target column and removes it from the features
riskModelOutput = riskModelData.pop('Risk Factor')

pp.pprint(riskModelData)
pp.pprint(riskModelOutput)

# Ridge regression with built-in cross-validated alpha selection
predictionModel = linear_model.RidgeCV()
predictionModel.fit(riskModelData, riskModelOutput)
print(predictionModel.score(riskModelData, riskModelOutput))
print(predictionModel.coef_)

predictedRisk = predictionModel.predict(riskModelData)
pp.pprint(predictedRisk)
Example #35
0
def platform_bill(bill, timestamp):
    """Sum a bill's numeric fields and upsert the daily platform totals.

    Parameters
    ----------
    bill : records accepted by ``DataFrame(bill)`` (e.g. list of dicts).
    timestamp : day key used as the upsert filter.
    """
    totals = DataFrame(bill).sum().to_dict()
    # the timestamp is the document key, not a value to store; pop with a
    # default replaces the LBYL ``if 'timestamp' in ...`` check.
    # NOTE: the original used a Python-2 print statement, which is a syntax
    # error under Python 3 — converted to the print() function.
    totals.pop('timestamp', None)
    print(timestamp, totals)
    LyDaily.update_one({'timestamp': timestamp}, totals, upsert=True)
Example #36
0
# coding:UTF-8
Example #37
0
    def test_convert_dti_to_series(self):
        # don't cast a DatetimeIndex WITH a tz, leave as object
        # GH 6032
        idx = DatetimeIndex(to_datetime(["2013-1-1 13:00", "2013-1-2 14:00"]),
                            name="B").tz_localize("US/Pacific")
        df = DataFrame(np.random.randn(2, 1), columns=["A"])

        # the expected Series boxes the tz-aware values as object-dtype
        # Timestamps rather than casting them
        expected = Series(
            np.array(
                [
                    Timestamp("2013-01-01 13:00:00-0800", tz="US/Pacific"),
                    Timestamp("2013-01-02 14:00:00-0800", tz="US/Pacific"),
                ],
                dtype="object",
            ),
            name="B",
        )

        # convert index to series
        result = Series(idx)
        tm.assert_series_equal(result, expected)

        # assign to frame
        df["B"] = idx
        result = df["B"]
        tm.assert_series_equal(result, expected)

        # convert to series while keeping the timezone
        # keep_tz is deprecated: passing it must raise a FutureWarning with
        # this message while still honoring the requested behavior
        msg = "stop passing 'keep_tz'"
        with tm.assert_produces_warning(FutureWarning) as m:
            result = idx.to_series(keep_tz=True, index=[0, 1])
        tm.assert_series_equal(result, expected)
        assert msg in str(m[0].message)

        # convert to utc: keep_tz=False strips the tz and points at the
        # explicit tz_convert(None) replacement in the warning
        with tm.assert_produces_warning(FutureWarning) as m:
            df["B"] = idx.to_series(keep_tz=False, index=[0, 1])
        result = df["B"]
        comp = Series(DatetimeIndex(expected.values).tz_localize(None),
                      name="B")
        tm.assert_series_equal(result, comp)
        msg = "do 'idx.tz_convert(None)' before calling"
        assert msg in str(m[0].message)

        # the default (no keep_tz argument) keeps the tz without warning
        result = idx.to_series(index=[0, 1])
        tm.assert_series_equal(result, expected)

        with tm.assert_produces_warning(FutureWarning) as m:
            result = idx.to_series(keep_tz=False, index=[0, 1])
        tm.assert_series_equal(result, expected.dt.tz_convert(None))
        msg = "do 'idx.tz_convert(None)' before calling"
        assert msg in str(m[0].message)

        # list of datetimes with a tz
        df["B"] = idx.to_pydatetime()
        result = df["B"]
        tm.assert_series_equal(result, expected)

        # GH 6785
        # set the index manually
        import pytz

        df = DataFrame([{
            "ts": datetime(2014, 4, 1, tzinfo=pytz.utc),
            "foo": 1
        }])
        expected = df.set_index("ts")
        df.index = df["ts"]
        df.pop("ts")
        tm.assert_frame_equal(df, expected)
Example #38
0
def pop_target(df: pd.DataFrame, params: FeatureParams) -> pd.Series:
    """Remove the configured target column from *df* and return it.

    Note: this mutates *df* in place — the target column is dropped.
    """
    return df.pop(params.target_col)
Example #39
0
def preprocess(df: pd.DataFrame) -> tuple:
    """Build the feature matrix X (indexed by ``url``) and target ``y``.

    The target is ``total_fee``; ``neighborhood`` is target-encoded with the
    per-neighborhood median fee, numeric columns are cast and NaN-filled,
    the date columns become day-count features, and the categorical columns
    are one-hot encoded.

    Returns
    -------
    tuple
        ``(X, y)``.
    """
    with timeit("Preprocessamento", log):
        df = df.copy()

        # target-encode the neighborhood with its median total_fee
        df["neighborhood"] = df["neighborhood"].map(
            df.groupby("neighborhood")["total_fee"].median()
        )

        y = df.pop("total_fee")

        X = df[
            [
                "neighborhood",
                "url",
                "usable_area",
                "floors",
                "type_unit",
                "bedrooms",
                "bathrooms",
                "suites",
                "parking_spaces",
                "amenities",
                "address_lat",
                "address_lon",
                "estacao",
                "distance",
                "created_date",
                "updated_date",
            ]
        ].set_index("url")

        #
        # Colunas Numéricas
        #
        numeric_columns = [
            "usable_area",
            "floors",
            "bedrooms",
            "bathrooms",
            "suites",
            "parking_spaces",
            "address_lat",
            "address_lon",
            "distance",
        ]
        # -999 flags missing values without dropping rows
        X[numeric_columns] = X[numeric_columns].astype(float).fillna(-999)

        #
        # Transforma datas de criação e atualização em features
        #
        X["qtd_days_created"] = (
            (
                (datetime.now() - pd.to_datetime(X["created_date"]))
                / np.timedelta64(1, "D")
            )
            .round()
            .fillna(-1)
            .astype(int)
        )
        X["qtd_days_updated"] = (
            (
                (datetime.now() - pd.to_datetime(X["updated_date"]))
                / np.timedelta64(1, "D")
            )
            .round()
            .fillna(-1)
            .astype(int)
        )

        #
        # Transforma colunas não numéricas
        #

        # Type Unit
        valid_type_unit = ["APARTMENT", "HOME", "CONDOMINIUM", "PENTHOUSE", "FLAT"]
        X.loc[~X.type_unit.isin(valid_type_unit), "type_unit"] = "OTHERS"
        dummies_type_units = onehot(
            X.type_unit, valid_type_unit + ["OTHERS"], "type_unit"
        )
        X[dummies_type_units.columns] = dummies_type_units.values

        # Estação de trem/metrô
        estacoes_validas = [
            "Estação_Jardim_Oceânico",
            "Estação_Uruguai",
            "Estação_Botafogo/Coca-Cola",
            "Estação_Afonso_Pena",
            "Estação_Saens_Peña",
            "Estação_Flamengo",
            "Estação_São_Francisco_Xavier_(Metrô_Rio)",
            "Estação_Madureira",
        ]
        # estacoes = X.loc[X.estacao != "", "estacao"].value_counts(normalize=True)
        # estacoes_validas = list(estacoes[estacoes > 0.05].index)
        X.loc[~X.estacao.isin(estacoes_validas), "estacao"] = "OTHERS"
        # BUG FIX: the dummies were previously computed from X.type_unit,
        # which made every estação feature a copy of the type_unit one-hots;
        # encode the estacao column itself instead.
        dummies_estacoes = onehot(X.estacao, estacoes_validas + ["OTHERS"], "estação")
        X[dummies_estacoes.columns] = dummies_estacoes.values

        # Amenities
        valid_amenities = [
            "ELEVATOR",
            "POOL",
            "BARBECUE_GRILL",
            "PARTY_HALL",
            "PLAYGROUND",
            "GATED_COMMUNITY",
            "BALCONY",
            "INTERCOM",
            "KITCHEN_CABINETS",
            "GYM",
            "SAUNA",
            "FURNISHED",
            "SPORTS_COURT",
        ]
        amenities = X.amenities.explode()
        # BUG FIX: ``isin([valid_amenities])`` wrapped the list inside
        # another list, so no amenity ever matched and everything collapsed
        # to "OTHERS"; pass the list itself.
        amenities[~amenities.isin(valid_amenities)] = "OTHERS"
        dummies_amenities = (
            onehot(amenities, valid_amenities, "amenity").groupby(level=0).max()
        )
        X[dummies_amenities.columns] = dummies_amenities.values

        # Remove colunas
        X = X.drop(
            columns=[
                "type_unit",
                "estacao",
                "created_date",
                "updated_date",
                "amenities",
            ]
        )

        log.debug(f"shape: {X.shape}, columns: {X.columns.tolist()}")

        return X, y
Example #40
0
def analytic_signal(data: pd.DataFrame, fmin: float, fmax: float, nodes: List[str] = None, **kwargs) -> pd.DataFrame:
    """Calculates analytic signal from simulation results, using the hilbert transform.

    Parameters
    ----------
    data
        Simulation results.
    fmin
        Lower bound frequency for bandpass filter that will be applied to the data.
    fmax
        Upper bound frequency for bandpass filter that will be applied to the data.
    nodes
        List of node names (strings, selected by label) or positional indices
        (selected by position) for which to calculate the analytic signal.
        If omitted, all columns are used.
    kwargs
        Additional keyword arguments that will be passed to the `mne.Raw.filter` method.

    Returns
    -------
    pd.DataFrame
        Dataframe containing the fields `time`, `node`, `amplitude` and `phase`.

    """

    # If a 'time' column exists, move it to the index so it survives the
    # round-trip through the mne raw object.
    if 'time' in data.columns.values:
        data.index = data.pop('time')

    # Restrict to the requested nodes. The type of the first entry decides
    # whether selection is by label or by position.
    if nodes:
        if isinstance(nodes[0], str):
            data = data.loc[:, nodes]
        else:
            data = data.iloc[:, nodes]

    # create mne raw data object
    from pyrates.utility.mne_wrapper import mne_from_dataframe
    raw = mne_from_dataframe(data)

    # bandpass filter the raw data
    raw.filter(l_freq=fmin, h_freq=fmax, **kwargs)

    # apply hilbert transform (channels now hold the complex analytic signal)
    raw.apply_hilbert()

    # get phase of analytic signal, shifted into [0, 2*pi) and unwrapped over time
    def get_angle(x):
        return np.angle(x) + np.pi
    raw_phase = raw.copy()
    raw_phase.apply_function(get_angle)
    raw_phase.apply_function(np.real, dtype=np.float32)
    raw_phase.apply_function(np.unwrap)

    # get amplitude (envelope) of analytic signal
    raw_amplitude = raw.copy()
    raw_amplitude.apply_function(np.abs)
    raw_amplitude.apply_function(np.real, dtype=np.float32)

    # combine phase and amplitude into a single long-format dataframe
    time = data.index
    data_phase = raw_phase.to_data_frame(scalings={'eeg': 1.})
    data_phase['time'] = time
    data_amp = raw_amplitude.to_data_frame(scalings={'eeg': 1.})
    data_amp['time'] = time
    data = pd.melt(data_phase, id_vars=['time'], var_name='node', value_name='phase')
    data_tmp = pd.melt(data_amp, id_vars=['time'], var_name='node', value_name='amplitude')
    data['amplitude'] = data_tmp['amplitude']

    return data
Example #41
0
    def test_set_index_cast_datetimeindex(self):
        """set_index / tz-aware DatetimeIndex round-trips (GH 6032, 6785, 3950)."""
        df = DataFrame({'A': [datetime(2000, 1, 1) + timedelta(i)
                              for i in range(1000)],
                        'B': np.random.randn(1000)})

        idf = df.set_index('A')
        assert isinstance(idf.index, pd.DatetimeIndex)

        # don't cast a DatetimeIndex WITH a tz, leave as object
        # GH 6032
        i = (pd.DatetimeIndex(
            to_datetime(['2013-1-1 13:00',
                         '2013-1-2 14:00'], errors="raise"))
             .tz_localize('US/Pacific'))
        df = DataFrame(np.random.randn(2, 1), columns=['A'])

        expected = Series(np.array([pd.Timestamp('2013-01-01 13:00:00-0800',
                                                 tz='US/Pacific'),
                                    pd.Timestamp('2013-01-02 14:00:00-0800',
                                                 tz='US/Pacific')],
                                   dtype="object"))

        # convert index to series
        result = Series(i)
        assert_series_equal(result, expected)

        # assign to frame
        df['B'] = i
        result = df['B']
        assert_series_equal(result, expected, check_names=False)
        assert result.name == 'B'

        # keep the timezone
        result = i.to_series(keep_tz=True)
        assert_series_equal(result.reset_index(drop=True), expected)

        # convert to utc
        df['C'] = i.to_series().reset_index(drop=True)
        result = df['C']
        # Strip the tz via tz_localize(None): DatetimeIndex.tz is a read-only
        # property, so the previous direct assignment (comp.tz = None) fails.
        comp = pd.DatetimeIndex(expected.values)
        comp = comp.tz_localize(None)
        tm.assert_numpy_array_equal(result.values, comp.values)

        # list of datetimes with a tz
        df['D'] = i.to_pydatetime()
        result = df['D']
        assert_series_equal(result, expected, check_names=False)
        assert result.name == 'D'

        # GH 6785
        # set the index manually
        import pytz
        df = DataFrame(
            [{'ts': datetime(2014, 4, 1, tzinfo=pytz.utc), 'foo': 1}])
        expected = df.set_index('ts')
        df.index = df['ts']
        df.pop('ts')
        assert_frame_equal(df, expected)

        # GH 3950
        # reset_index with single level
        for tz in ['UTC', 'Asia/Tokyo', 'US/Eastern']:
            idx = pd.date_range('1/1/2011', periods=5,
                                freq='D', tz=tz, name='idx')
            df = pd.DataFrame(
                {'a': range(5), 'b': ['A', 'B', 'C', 'D', 'E']}, index=idx)

            expected = pd.DataFrame({'idx': [datetime(2011, 1, 1),
                                             datetime(2011, 1, 2),
                                             datetime(2011, 1, 3),
                                             datetime(2011, 1, 4),
                                             datetime(2011, 1, 5)],
                                     'a': range(5),
                                     'b': ['A', 'B', 'C', 'D', 'E']},
                                    columns=['idx', 'a', 'b'])
            expected['idx'] = expected['idx'].apply(
                lambda d: pd.Timestamp(d, tz=tz))
            assert_frame_equal(df.reset_index(), expected)
Example #42
0
        target.append(10)
    elif c == 'watertower':
        target.append(11)
    elif c == 'library':
        target.append(12)
    elif c == 'sportcenter':
        target.append(13)

# Classes: ['outdoorligths', 'trafficlights', 'kindergarten', 'playgroundwithbuilding', 'n
# ursinghome', 'school', 'healthcenter', 'officebuilding', 'indoorswimmingpool', 'hospital
# ', 'firestation', 'watertower', 'library', 'sportcenter']
# Total number of classes: 14

# We remove the column with class names and add the column with class numbers
df['TARGET'] = target
df.pop('CLASS')

#########################################
##  Building the model                 ##
#########################################

# Two hidden ReLU layers feeding a softmax output over the 14 classes.
model = keras.Sequential([
    keras.layers.Dense(25, activation=tf.nn.relu),
    keras.layers.Dense(64, activation=tf.nn.relu),

    # We could generalize the number of nodes of the output layer
    # according to the number of classes our data contains.
    keras.layers.Dense(14, activation=tf.nn.softmax)
])

model.compile(
Example #43
0
def get_labels(dataset: pd.DataFrame) -> pd.Series:
    """Remove the 'MPG' column from *dataset* in place and return it as the label series."""
    labels = dataset.pop('MPG')
    return labels
Example #44
0
print tableViews(df[0:40])
#SaveInFile(df[0:40], "b-techs.ma-10-2014.txt")

# Drop a few columns that we are not going to use.

del df['%V'];
del df['%i'];
del df['%l'];
del df['%{User-Agent}i']

# Rename the remaining columns to readable names.

df = df.rename(columns={'%>s': 'Status', '%b':'b', '%h':'IP', '%r':'Request', '%t': 'Time'})
#print df.head()
# Convert the 'Time' column to datetime and use it as the index.
df.index = pd.to_datetime(df.pop('Time'))

# Convert the Status column to int.
df['Status'] = df['Status'].astype('int')
df['b'][93]

#df['b'] = df['b'].replace('-', 'NaN', regex=True).astype('float')
#df['b'] = df['b']/1048576.
#print df['b']
#df['b'][93]

# Replace '-' markers with NaN; otherwise scale the byte count.
def dash2nan(x):
       if x == '-':
              x = np.nan
       else:
              x = float(x) * 10**(-3) #Convert bytes to K-bytes (multiply by 10**-3)