def test_default_na_values(all_parsers):
    _NA_VALUES = {"-1.#IND", "1.#QNAN", "1.#IND", "-1.#QNAN", "#N/A",
                  "N/A", "n/a", "NA", "#NA", "NULL", "null", "NaN", "nan",
                  "-NaN", "-nan", "#N/A N/A", ""}
    assert _NA_VALUES == com._NA_VALUES

    parser = all_parsers
    nv = len(_NA_VALUES)

    def f(i, v):
        # row i: i empty fields, then the NA token, then padding out to nv
        # columns -- one token per row, walking along the diagonal
        buf = "," * i + v
        if i < nv - 1:
            buf += "," * (nv - i - 1)
        return buf

    data = StringIO("\n".join(f(i, v) for i, v in enumerate(_NA_VALUES)))
    expected = DataFrame(np.nan, columns=range(nv), index=range(nv))

    result = parser.read_csv(data, header=None)
    tm.assert_frame_equal(result, expected)
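A quick standalone illustration (not part of the test) of the rows f() produces: each NA token is placed along the diagonal and the remaining fields are left empty (the empty string is itself an NA value), so the whole frame parses to NaN. Shown with a hypothetical three-token subset of _NA_VALUES:

tokens = ["NA", "null", ""]
rows = [",".join(tokens[i] if j == i else "" for j in range(len(tokens)))
        for i in range(len(tokens))]
print("\n".join(rows))
# NA,,
# ,null,
# ,,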
Example #2
    def setup(self):
        self.n = 10 ** 3
        self.to_ts = {i: pd.Timestamp(i) for i in range(self.n)}
        self.to_td = {i: pd.Timedelta(i) for i in range(self.n)}
        self.s = Series(np.random.randint(self.n, size=self.n))
        self.df = DataFrame({'A': np.random.randint(self.n, size=self.n),
                             'B': np.random.randint(self.n, size=self.n)})
Example #3
def test_get_duplicates():
    # GH5873
    for a in [101, 102]:
        mi = MultiIndex.from_arrays([[101, a], [3.5, np.nan]])
        assert not mi.has_duplicates

        with tm.assert_produces_warning(FutureWarning):
            # Deprecated - see GH20239
            assert mi.get_duplicates().equals(MultiIndex.from_arrays([[], []]))

        tm.assert_numpy_array_equal(mi.duplicated(),
                                    np.zeros(2, dtype='bool'))

    for n in range(1, 6):  # 1st level shape
        for m in range(1, 5):  # 2nd level shape
            # all possible unique combinations, including nan
            codes = product(range(-1, n), range(-1, m))
            mi = MultiIndex(levels=[list('abcde')[:n], list('WXYZ')[:m]],
                            codes=np.random.permutation(list(codes)).T)
            assert len(mi) == (n + 1) * (m + 1)
            assert not mi.has_duplicates

            with tm.assert_produces_warning(FutureWarning):
                # Deprecated - see GH20239
                assert mi.get_duplicates().equals(MultiIndex.from_arrays(
                    [[], []]))

            tm.assert_numpy_array_equal(mi.duplicated(),
                                        np.zeros(len(mi), dtype='bool'))
Example #4
def test_is_():
    mi = MultiIndex.from_tuples(lzip(range(10), range(10)))
    assert mi.is_(mi)
    assert mi.is_(mi.view())
    assert mi.is_(mi.view().view().view().view())
    mi2 = mi.view()
    # names are metadata, they don't change id
    mi2.names = ["A", "B"]
    assert mi2.is_(mi)
    assert mi.is_(mi2)

    assert mi.is_(mi.set_names(["C", "D"]))
    mi2 = mi.view()
    mi2.set_names(["E", "F"], inplace=True)
    assert mi.is_(mi2)
    # levels are inherent properties, they change identity
    mi3 = mi2.set_levels([lrange(10), lrange(10)])
    assert not mi3.is_(mi2)
    # shouldn't change
    assert mi2.is_(mi)
    mi4 = mi3.view()

    # GH 17464 - Remove duplicate MultiIndex levels
    mi4.set_levels([lrange(10), lrange(10)], inplace=True)
    assert not mi4.is_(mi3)
    mi5 = mi.view()
    mi5.set_levels(mi5.levels, inplace=True)
    assert not mi5.is_(mi)
Example #5
    def test_constructor_coverage(self):
        rng = timedelta_range("1 days", periods=10.5)
        exp = timedelta_range("1 days", periods=10)
        self.assertTrue(rng.equals(exp))

        self.assertRaises(ValueError, TimedeltaIndex, start="1 days", periods="foo", freq="D")

        self.assertRaises(ValueError, TimedeltaIndex, start="1 days", end="10 days")

        self.assertRaises(ValueError, TimedeltaIndex, "1 days")

        # generator expression
        gen = (timedelta(i) for i in range(10))
        result = TimedeltaIndex(gen)
        expected = TimedeltaIndex([timedelta(i) for i in range(10)])
        self.assertTrue(result.equals(expected))

        # NumPy string array
        strings = np.array(["1 days", "2 days", "3 days"])
        result = TimedeltaIndex(strings)
        expected = to_timedelta([1, 2, 3], unit="d")
        self.assertTrue(result.equals(expected))

        from_ints = TimedeltaIndex(expected.asi8)
        self.assertTrue(from_ints.equals(expected))

        # non-conforming freq
        self.assertRaises(ValueError, TimedeltaIndex, ["1 days", "2 days", "4 days"], freq="D")

        self.assertRaises(ValueError, TimedeltaIndex, periods=10, freq="D")
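For reference, a small hedged demo of the constructions this test exercises (output from a recent pandas; the exact repr varies by version):

import pandas as pd

pd.timedelta_range("1 days", periods=3)
# TimedeltaIndex(['1 days', '2 days', '3 days'], dtype='timedelta64[ns]', freq='D')

pd.to_timedelta([1, 2, 3], unit="d")
# TimedeltaIndex(['1 days', '2 days', '3 days'], dtype='timedelta64[ns]', freq=None)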
Example #6
    def insert(self, value):
        # find first node on each level where node.next[level].value > value
        chain = [None] * self.maxlevels
        steps_at_level = [0] * self.maxlevels
        node = self.head
        for level in reversed(range(self.maxlevels)):
            while node.next[level].value <= value:
                steps_at_level[level] += node.width[level]
                node = node.next[level]
            chain[level] = node

        # insert a link to the newnode at each level
        d = min(self.maxlevels, 1 - int(log(random(), 2.0)))
        newnode = Node(value, [None] * d, [None] * d)
        steps = 0
        for level in range(d):
            prevnode = chain[level]
            newnode.next[level] = prevnode.next[level]
            prevnode.next[level] = newnode
            newnode.width[level] = prevnode.width[level] - steps
            prevnode.width[level] = steps + 1
            steps += steps_at_level[level]
        for level in range(d, self.maxlevels):
            chain[level].width[level] += 1
        self.size += 1
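This insert() follows the classic indexable-skiplist recipe (the structure old pandas' rolling medians were built on). A minimal sketch, assumed rather than taken from this source, of the pieces the method relies on:

from math import log

class Node:
    __slots__ = ('value', 'next', 'width')

    def __init__(self, value, next, width):
        self.value, self.next, self.width = value, next, width

# Tail sentinel: float('inf') is never <= any finite key, which is what
# terminates the search loop in insert().
NIL = Node(float('inf'), [], [])

class IndexableSkiplist:
    def __init__(self, expected_size=256):
        self.size = 0
        self.maxlevels = 1 + int(log(expected_size, 2))
        self.head = Node('HEAD', [NIL] * self.maxlevels, [1] * self.maxlevels)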
Example #7
    def test_basic(self):

        labels, uniques = algos.factorize(['a', 'b', 'b', 'a',
                                           'a', 'c', 'c', 'c'])
        # self.assert_numpy_array_equal(labels, np.array([0, 1, 1, 0, 0, 2, 2, 2], dtype=np.int64))
        self.assert_numpy_array_equal(uniques, np.array(['a', 'b', 'c'], dtype=object))

        labels, uniques = algos.factorize(['a', 'b', 'b', 'a',
                                           'a', 'c', 'c', 'c'], sort=True)
        self.assert_numpy_array_equal(labels, np.array([0, 1, 1, 0, 0, 2, 2, 2], dtype=np.int64))
        self.assert_numpy_array_equal(uniques, np.array(['a', 'b', 'c'], dtype=object))

        labels, uniques = algos.factorize(list(reversed(range(5))))
        self.assert_numpy_array_equal(labels, np.array([0, 1, 2, 3, 4], dtype=np.int64))
        self.assert_numpy_array_equal(uniques, np.array([4, 3, 2, 1, 0], dtype=np.int64))

        labels, uniques = algos.factorize(list(reversed(range(5))), sort=True)
        self.assert_numpy_array_equal(labels, np.array([4, 3, 2, 1, 0], dtype=np.int64))
        self.assert_numpy_array_equal(uniques, np.array([0, 1, 2, 3, 4], dtype=np.int64))

        # labels are always int64; uniques carry the input dtype (float64 here)
        labels, uniques = algos.factorize(list(reversed(np.arange(5.))))
        self.assert_numpy_array_equal(labels, np.array([0, 1, 2, 3, 4], dtype=np.int64))
        self.assert_numpy_array_equal(uniques, np.array([4., 3., 2., 1., 0.], dtype=np.float64))

        labels, uniques = algos.factorize(list(reversed(np.arange(5.))), sort=True)
        self.assert_numpy_array_equal(labels, np.array([4, 3, 2, 1, 0], dtype=np.int64))
        self.assert_numpy_array_equal(uniques, np.array([0., 1., 2., 3., 4.], dtype=np.float64))
Example #8
    def check(nlevels, with_nulls):
        labels = np.tile(np.arange(500), 2)
        level = np.arange(500)

        if with_nulls:  # inject some null values
            labels[500] = -1  # common nan value
            labels = [labels.copy() for i in range(nlevels)]
            for i in range(nlevels):
                labels[i][500 + i - nlevels // 2] = -1

            labels += [np.array([-1, 1]).repeat(500)]
        else:
            labels = [labels] * nlevels + [np.arange(2).repeat(500)]

        levels = [level] * nlevels + [[0, 1]]

        # no dups
        index = MultiIndex(levels=levels, labels=labels)
        assert not index.has_duplicates

        # with a dup
        if with_nulls:
            def f(a):
                return np.insert(a, 1000, a[0])
            labels = list(map(f, labels))
            index = MultiIndex(levels=levels, labels=labels)
        else:
            values = index.values.tolist()
            index = MultiIndex.from_tuples(values + [values[0]])

        assert index.has_duplicates
Example #9
def test_split_ranges():
    def _bin(x, width):
        "return int(x) as a base2 string of given width"
        return ''.join(str((x >> i) & 1) for i in range(width - 1, -1, -1))

    def test_locs(mask):
        nfalse = sum(np.array(mask) == 0)

        remaining = 0
        for s, e in com.split_ranges(mask):
            remaining += e - s

            assert 0 not in mask[s:e]

        # make sure the total items covered by the ranges are a complete cover
        assert remaining + nfalse == len(mask)

    # exhaustively test all possible mask sequences of length 8
    ncols = 8
    for i in range(2 ** ncols):
        cols = lmap(int, list(_bin(i, ncols)))  # count up in base2
        mask = [cols[i] == 1 for i in range(len(cols))]
        test_locs(mask)

    # base cases
    test_locs([])
    test_locs([0])
    test_locs([1])
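com.split_ranges is an old pandas internal; the contract the test checks is captured by this pure-Python equivalent (an assumed reimplementation, not the library code), which yields half-open [start, end) runs of truthy entries:

def split_ranges_equiv(mask):
    start = None
    for i, flag in enumerate(mask):
        if flag and start is None:
            start = i                    # a run of truthy values begins
        elif not flag and start is not None:
            yield start, i               # the run ends before index i
            start = None
    if start is not None:
        yield start, len(mask)           # mask ended inside a run

assert list(split_ranges_equiv([1, 1, 0, 1])) == [(0, 2), (3, 4)]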
Example #10
    def test_rolling_max_how_resample(self):

        indices = [datetime(1975, 1, i) for i in range(1, 6)]
        # So that we can have 3 datapoints on last day (4, 10, and 20)
        indices.append(datetime(1975, 1, 5, 1))
        indices.append(datetime(1975, 1, 5, 2))
        series = Series(list(range(0, 5)) + [10, 20], index=indices)
        # Use floats instead of ints as values
        series = series.map(lambda x: float(x))
        # Sort chronologically
        series = series.sort_index()

        # Default how should be max
        expected = Series([0.0, 1.0, 2.0, 3.0, 20.0],
                          index=[datetime(1975, 1, i, 0)
                                 for i in range(1, 6)])
        x = mom.rolling_max(series, window=1, freq='D')
        assert_series_equal(expected, x)

        # Now specify median (10.0)
        expected = Series([0.0, 1.0, 2.0, 3.0, 10.0],
                          index=[datetime(1975, 1, i, 0)
                                 for i in range(1, 6)])
        x = mom.rolling_max(series, window=1, freq='D', how='median')
        assert_series_equal(expected, x)

        # Now specify mean (4+10+20)/3
        v = (4.0+10.0+20.0)/3.0
        expected = Series([0.0, 1.0, 2.0, 3.0, v],
                          index=[datetime(1975, 1, i, 0)
                                 for i in range(1, 6)])
        x = mom.rolling_max(series, window=1, freq='D', how='mean')
        assert_series_equal(expected, x)
Example #11
def _get_join_keys(llab, rlab, shape, sort):
    from pandas.core.groupby import _int64_overflow_possible

    # how many levels can be done without overflow
    pred = lambda i: not _int64_overflow_possible(shape[:i])
    nlev = next(filter(pred, range(len(shape), 0, -1)))

    # get keys for the first `nlev` levels
    stride = np.prod(shape[1:nlev], dtype='i8')
    lkey = stride * llab[0].astype('i8', subok=False, copy=False)
    rkey = stride * rlab[0].astype('i8', subok=False, copy=False)

    for i in range(1, nlev):
        stride //= shape[i]
        lkey += llab[i] * stride
        rkey += rlab[i] * stride

    if nlev == len(shape):  # all done!
        return lkey, rkey

    # densify current keys to avoid overflow
    lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)

    llab = [lkey] + llab[nlev:]
    rlab = [rkey] + rlab[nlev:]
    shape = [count] + shape[nlev:]

    return _get_join_keys(llab, rlab, shape, sort)
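A worked illustration of the stride encoding above: with shape = (3, 4), the label pair (i, j) collapses to the single int64 key i * 4 + j, which stays collision-free as long as the product of the level sizes fits in int64 (the _int64_overflow_possible check):

import numpy as np

llab = [np.array([0, 1, 2]), np.array([3, 0, 1])]
shape = (3, 4)

stride = np.prod(shape[1:], dtype='i8')          # 4
lkey = stride * llab[0].astype('i8') + llab[1]   # array([3, 4, 9])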
Example #12
    def test_constructor_coverage(self):
        rng = timedelta_range('1 days', periods=10.5)
        exp = timedelta_range('1 days', periods=10)
        self.assertTrue(rng.equals(exp))

        self.assertRaises(ValueError, TimedeltaIndex, start='1 days',
                          periods='foo', freq='D')

        self.assertRaises(ValueError, TimedeltaIndex, start='1 days',
                          end='10 days')

        self.assertRaises(ValueError, TimedeltaIndex, '1 days')

        # generator expression
        gen = (timedelta(i) for i in range(10))
        result = TimedeltaIndex(gen)
        expected = TimedeltaIndex([timedelta(i) for i in range(10)])
        self.assertTrue(result.equals(expected))

        # NumPy string array
        strings = np.array(['1 days', '2 days', '3 days'])
        result = TimedeltaIndex(strings)
        expected = to_timedelta([1,2,3],unit='d')
        self.assertTrue(result.equals(expected))

        from_ints = TimedeltaIndex(expected.asi8)
        self.assertTrue(from_ints.equals(expected))

        # non-conforming freq
        self.assertRaises(ValueError, TimedeltaIndex,
                          ['1 days', '2 days', '4 days'],
                          freq='D')

        self.assertRaises(ValueError, TimedeltaIndex, periods=10, freq='D')
Example #13
    def test_basic(self):

        labels, uniques = algos.factorize(["a", "b", "b", "a", "a", "c", "c", "c"])
        # self.assert_numpy_array_equal(labels, np.array([ 0, 1, 1, 0, 0, 2, 2, 2],dtype=np.int64))
        self.assert_numpy_array_equal(uniques, np.array(["a", "b", "c"], dtype=object))

        labels, uniques = algos.factorize(["a", "b", "b", "a", "a", "c", "c", "c"], sort=True)
        self.assert_numpy_array_equal(labels, np.array([0, 1, 1, 0, 0, 2, 2, 2], dtype=np.int64))
        self.assert_numpy_array_equal(uniques, np.array(["a", "b", "c"], dtype=object))

        labels, uniques = algos.factorize(list(reversed(range(5))))
        self.assert_numpy_array_equal(labels, np.array([0, 1, 2, 3, 4], dtype=np.int64))
        self.assert_numpy_array_equal(uniques, np.array([4, 3, 2, 1, 0], dtype=np.int64))

        labels, uniques = algos.factorize(list(reversed(range(5))), sort=True)
        self.assert_numpy_array_equal(labels, np.array([4, 3, 2, 1, 0], dtype=np.int64))
        self.assert_numpy_array_equal(uniques, np.array([0, 1, 2, 3, 4], dtype=np.int64))

        # labels are always int64; uniques carry the input dtype (float64 here)
        labels, uniques = algos.factorize(list(reversed(np.arange(5.0))))
        self.assert_numpy_array_equal(labels, np.array([0, 1, 2, 3, 4], dtype=np.int64))
        self.assert_numpy_array_equal(uniques, np.array([4.0, 3.0, 2.0, 1.0, 0.0], dtype=np.float64))

        labels, uniques = algos.factorize(list(reversed(np.arange(5.0))), sort=True)
        self.assert_numpy_array_equal(labels, np.array([4, 3, 2, 1, 0], dtype=np.int64))
        self.assert_numpy_array_equal(uniques, np.array([0.0, 1.0, 2.0, 3.0, 4.0], dtype=np.float64))
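The public API behaves as these assertions describe; a quick standalone demonstration (modern pandas names the first return value codes rather than labels):

import pandas as pd

codes, uniques = pd.factorize(['b', 'a', 'b', 'c'])
# codes -> [0, 1, 0, 2], uniques -> ['b', 'a', 'c']  (first-seen order)

codes, uniques = pd.factorize(['b', 'a', 'b', 'c'], sort=True)
# codes -> [1, 0, 1, 2], uniques -> ['a', 'b', 'c']  (sorted)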
Example #14
    def _check_stat_op(self, name, alternative, obj=None, has_skipna=True):
        if obj is None:
            obj = self.panel4d

            # # set some NAs
            # obj.ix[5:10] = np.nan
            # obj.ix[15:20, -2:] = np.nan

        f = getattr(obj, name)

        if has_skipna:
            def skipna_wrapper(x):
                nona = remove_na(x)
                if len(nona) == 0:
                    return np.nan
                return alternative(nona)

            def wrapper(x):
                return alternative(np.asarray(x))

            for i in range(obj.ndim):
                result = f(axis=i, skipna=False)
                assert_panel_equal(result, obj.apply(wrapper, axis=i))
        else:
            skipna_wrapper = alternative
            wrapper = alternative

        for i in range(obj.ndim):
            result = f(axis=i)
            assert_panel_equal(result, obj.apply(skipna_wrapper, axis=i))

        self.assertRaises(Exception, f, axis=obj.ndim)
Example #15
def test_index_unique(dups):
    uniques = dups.index.unique()
    expected = DatetimeIndex([datetime(2000, 1, 2), datetime(2000, 1, 3),
                              datetime(2000, 1, 4), datetime(2000, 1, 5)])
    assert uniques.dtype == 'M8[ns]'  # sanity
    tm.assert_index_equal(uniques, expected)
    assert dups.index.nunique() == 4

    # #2563
    assert isinstance(uniques, DatetimeIndex)

    dups_local = dups.index.tz_localize('US/Eastern')
    dups_local.name = 'foo'
    result = dups_local.unique()
    expected = DatetimeIndex(expected, name='foo')
    expected = expected.tz_localize('US/Eastern')
    assert result.tz is not None
    assert result.name == 'foo'
    tm.assert_index_equal(result, expected)

    # NaT, note this is excluded
    arr = [1370745748 + t for t in range(20)] + [tslib.iNaT]
    idx = DatetimeIndex(arr * 3)
    tm.assert_index_equal(idx.unique(), DatetimeIndex(arr))
    assert idx.nunique() == 20
    assert idx.nunique(dropna=False) == 21

    arr = [Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t)
           for t in range(20)] + [NaT]
    idx = DatetimeIndex(arr * 3)
    tm.assert_index_equal(idx.unique(), DatetimeIndex(arr))
    assert idx.nunique() == 20
    assert idx.nunique(dropna=False) == 21
Example #16
    def test_constructor_empty(self, input_class):
        empty = Series()
        empty2 = Series(input_class())

        # these are Index() and RangeIndex() which don't compare type equal
        # but are just .equals
        assert_series_equal(empty, empty2, check_index_type=False)

        # With explicit dtype:
        empty = Series(dtype='float64')
        empty2 = Series(input_class(), dtype='float64')
        assert_series_equal(empty, empty2, check_index_type=False)

        # GH 18515 : with dtype=category:
        empty = Series(dtype='category')
        empty2 = Series(input_class(), dtype='category')
        assert_series_equal(empty, empty2, check_index_type=False)

        if input_class is not list:
            # With index:
            empty = Series(index=lrange(10))
            empty2 = Series(input_class(), index=lrange(10))
            assert_series_equal(empty, empty2)

            # With index and dtype float64:
            empty = Series(np.nan, index=lrange(10))
            empty2 = Series(input_class(), index=lrange(10), dtype='float64')
            assert_series_equal(empty, empty2)

            # GH 19853 : with empty string, index and dtype str
            empty = Series('', dtype=str, index=range(3))
            empty2 = Series('', index=range(3))
            assert_series_equal(empty, empty2)
Example #17
def _maybe_promote_shape(values, naxes):
    # check whether we have an ndarray; if not, leave as-is since it must be a scalar
    if not isinstance(values, np.ndarray):
        return values

    ndims = values.ndim
    if ndims > naxes:
        raise AssertionError('cannot have more dims than axes, '
                             '{0} > {1}'.format(ndims, naxes))
    if ndims == naxes:
        return values

    ndim, nax = range(ndims), range(naxes)

    axes_slice = [slice(None)] * naxes

    # set difference of naxes and ndims: the axes that must be added
    slices = list(set(nax) - set(ndim))

    # ndims < naxes at this point, so there must be axes left to promote
    if not slices:
        raise AssertionError('slices should NOT be empty if ndim != naxes '
                             '{0}'.format(slices))

    for sl in slices:
        axes_slice[sl] = np.newaxis

    return values[tuple(axes_slice)]
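Concretely, the missing axes are appended as length-1 dimensions via np.newaxis; a small assumed-usage sketch:

import numpy as np

arr = np.arange(5)
promoted = _maybe_promote_shape(arr, naxes=3)
# axes 1 and 2 form the set difference, so they become np.newaxis:
# this is equivalent to arr[:, np.newaxis, np.newaxis]
assert promoted.shape == (5, 1, 1)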
Example #18
    def _write_regular_rows(self, fmt_values, indent):
        truncate_h = self.fmt.truncate_h
        truncate_v = self.fmt.truncate_v

        ncols = len(self.fmt.tr_frame.columns)
        nrows = len(self.fmt.tr_frame)
        fmt = self.fmt._get_formatter('__index__')
        if fmt is not None:
            index_values = self.fmt.tr_frame.index.map(fmt)
        else:
            index_values = self.fmt.tr_frame.index.format()

        row = []
        for i in range(nrows):

            if truncate_v and i == (self.fmt.tr_row_num):
                str_sep_row = ['...'] * len(row)
                self.write_tr(str_sep_row, indent, self.indent_delta,
                              tags=None, nindex_levels=1)

            row = []
            row.append(index_values[i])
            row.extend(fmt_values[j][i] for j in range(ncols))

            if truncate_h:
                dot_col_ix = self.fmt.tr_col_num + 1
                row.insert(dot_col_ix, '...')
            self.write_tr(row, indent, self.indent_delta, tags=None,
                          nindex_levels=1)
Example #19
    def _apply_1d(self, func, axis):

        axis_name = self._get_axis_name(axis)
        ax = self._get_axis(axis)
        ndim = self.ndim
        values = self.values

        # iter thru the axes
        slice_axis = self._get_axis(axis)
        slice_indexer = [0] * (ndim - 1)
        indexer = np.zeros(ndim, 'O')
        indlist = list(range(ndim))
        indlist.remove(axis)
        indexer[axis] = slice(None, None)
        indexer.put(indlist, slice_indexer)
        planes = [self._get_axis(axi) for axi in indlist]
        shape = np.array(self.shape).take(indlist)

        # all the iteration points
        points = cartesian_product(planes)

        results = []
        for i in range(np.prod(shape)):

            # construct the object
            pts = tuple(p[i] for p in points)
            indexer.put(indlist, slice_indexer)

            obj = Series(values[tuple(indexer)], index=slice_axis, name=pts)
            result = func(obj)

            results.append(result)

            # increment the indexer
            slice_indexer[-1] += 1
            n = -1
            while (slice_indexer[n] >= shape[n]) and (n > (1-ndim)):
                slice_indexer[n-1] += 1
                slice_indexer[n] = 0
                n -= 1

        # empty object
        if not len(results):
            return self._constructor(**self._construct_axes_dict())

        # same ndim as current
        if isinstance(results[0], Series):
            arr = np.vstack([r.values for r in results])
            arr = arr.T.reshape(tuple([len(slice_axis)] + list(shape)))
            transp = np.array([axis] + indlist).argsort()
            arr = arr.transpose(tuple(list(transp)))
            return self._constructor(arr, **self._construct_axes_dict())

        # ndim-1 shape
        results = np.array(results).reshape(shape)
        if results.ndim == 2 and axis_name != self._info_axis_name:
            results = results.T
            planes = planes[::-1]
        return self._construct_return_type(results, planes)
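The slice_indexer bookkeeping at the end of the loop is an odometer over shape, visiting the same points itertools.product would; a hedged standalone equivalent:

import itertools

shape = (2, 3)
points = list(itertools.product(*(range(n) for n in shape)))
# [(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2)]
# The last index moves fastest, matching the carry from slice_indexer[-1] upward.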
Example #20
    def test_astype_categorical_to_other(self):

        df = DataFrame({'value': np.random.randint(0, 10000, 100)})
        labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]
        cat_labels = Categorical(labels, labels)

        df = df.sort_values(by=['value'], ascending=True)
        df['value_group'] = pd.cut(df.value, range(0, 10500, 500),
                                   right=False, labels=cat_labels)

        s = df['value_group']
        expected = s
        tm.assert_series_equal(s.astype('category'), expected)
        tm.assert_series_equal(s.astype(CategoricalDtype()), expected)
        msg = (r"could not convert string to float: '(0 - 499|9500 - 9999)'|"
               r"invalid literal for float\(\): (0 - 499|9500 - 9999)")
        with pytest.raises(ValueError, match=msg):
            s.astype('float64')

        cat = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']))
        exp = Series(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
        tm.assert_series_equal(cat.astype('str'), exp)
        s2 = Series(Categorical(['1', '2', '3', '4']))
        exp2 = Series([1, 2, 3, 4]).astype(int)
        tm.assert_series_equal(s2.astype('int'), exp2)

        # objects don't sort correctly, so just compare that we have the same
        # values
        def cmp(a, b):
            tm.assert_almost_equal(
                np.sort(np.unique(a)), np.sort(np.unique(b)))

        expected = Series(np.array(s.values), name='value_group')
        cmp(s.astype('object'), expected)
        cmp(s.astype(np.object_), expected)

        # array conversion
        tm.assert_almost_equal(np.array(s), np.array(s.values))

        # valid conversion
        for valid in [lambda x: x.astype('category'),
                      lambda x: x.astype(CategoricalDtype()),
                      lambda x: x.astype('object').astype('category'),
                      lambda x: x.astype('object').astype(
                          CategoricalDtype())
                      ]:

            result = valid(s)
            # compare series values
            # internal .categories can't be compared because it is sorted
            tm.assert_series_equal(result, s, check_categorical=False)

        # invalid conversion (these are NOT a dtype)
        msg = (r"invalid type <class 'pandas\.core\.arrays\.categorical\."
               "Categorical'> for astype")
        for invalid in [lambda x: x.astype(Categorical),
                        lambda x: x.astype('object').astype(Categorical)]:
            with pytest.raises(TypeError, match=msg):
                invalid(s)
Example #21
    def get_result(self):

        # series only
        if self._is_series:

            # stack blocks
            if self.axis == 0:
                new_data = com._concat_compat([x._values for x in self.objs])
                name = com._consensus_name_attr(self.objs)
                return (Series(new_data, index=self.new_axes[0],
                               name=name,
                               dtype=new_data.dtype)
                        .__finalize__(self, method='concat'))

            # combine as columns in a frame
            else:
                data = dict(zip(range(len(self.objs)), self.objs))
                index, columns = self.new_axes
                tmpdf = DataFrame(data, index=index)
                # check whether the columns variable already stores valid
                # column names (because they were set via the 'key' argument
                # of the 'concat' call); if not, use the series names as
                # column names
                if (columns.equals(Index(np.arange(len(self.objs)))) and
                        not self.ignore_index):
                    columns = np.array([data[i].name
                                        for i in range(len(data))],
                                       dtype='object')
                    indexer = isnull(columns)
                    if indexer.any():
                        columns[indexer] = np.arange(len(indexer[indexer]))
                tmpdf.columns = columns
                return tmpdf.__finalize__(self, method='concat')

        # combine block managers
        else:
            mgrs_indexers = []
            for obj in self.objs:
                mgr = obj._data
                indexers = {}
                for ax, new_labels in enumerate(self.new_axes):
                    if ax == self.axis:
                        # Suppress reindexing on concat axis
                        continue

                    obj_labels = mgr.axes[ax]
                    if not new_labels.equals(obj_labels):
                        indexers[ax] = obj_labels.reindex(new_labels)[1]

                mgrs_indexers.append((obj._data, indexers))

            new_data = concatenate_block_managers(
                mgrs_indexers, self.new_axes,
                concat_axis=self.axis, copy=self.copy)
            if not self.copy:
                new_data._consolidate_inplace()

            return (self.objs[0]._from_axes(new_data, self.new_axes)
                    .__finalize__(self, method='concat'))
Example #22
def test_rename():
    # GH 17407
    s = Series(range(1, 6), index=pd.Index(range(2, 7), name='IntIndex'))
    result = s.rename(str)
    expected = s.rename(lambda i: str(i))
    assert_series_equal(result, expected)

    assert result.name == expected.name
Example #23
    def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0):
        # Write the frame cells using openpyxl.
        from openpyxl.cell import get_column_letter

        sheet_name = self._get_sheet_name(sheet_name)

        if sheet_name in self.sheets:
            wks = self.sheets[sheet_name]
        else:
            wks = self.book.create_sheet()
            wks.title = sheet_name
            self.sheets[sheet_name] = wks

        for cell in cells:
            colletter = get_column_letter(startcol + cell.col + 1)
            xcell = wks.cell("%s%s" % (colletter, startrow + cell.row + 1))
            xcell.value = _conv_value(cell.val)
            style_kwargs = {}

            # Apply format codes before cell.style to allow override
            if isinstance(cell.val, datetime.datetime):
                style_kwargs.update(self._convert_to_style_kwargs({
                    'number_format': {'format_code': self.datetime_format}}))
            elif isinstance(cell.val, datetime.date):
                style_kwargs.update(self._convert_to_style_kwargs({
                    'number_format': {'format_code': self.date_format}}))

            if cell.style:
                style_kwargs.update(self._convert_to_style_kwargs(cell.style))

            if style_kwargs:
                xcell.style = xcell.style.copy(**style_kwargs)

            if cell.mergestart is not None and cell.mergeend is not None:
                cletterstart = get_column_letter(startcol + cell.col + 1)
                cletterend = get_column_letter(startcol + cell.mergeend + 1)

                wks.merge_cells('%s%s:%s%s' % (cletterstart,
                                               startrow + cell.row + 1,
                                               cletterend,
                                               startrow + cell.mergestart + 1))

                # Excel requires that the format of the first cell in a merged
                # range is repeated in the rest of the merged range.
                if style_kwargs:
                    first_row = startrow + cell.row + 1
                    last_row = startrow + cell.mergestart + 1
                    first_col = startcol + cell.col + 1
                    last_col = startcol + cell.mergeend + 1

                    for row in range(first_row, last_row + 1):
                        for col in range(first_col, last_col + 1):
                            if row == first_row and col == first_col:
                                # Ignore first cell. It is already handled.
                                continue
                            colletter = get_column_letter(col)
                            xcell = wks.cell("%s%s" % (colletter, row))
                            xcell.style = xcell.style.copy(**style_kwargs)
Example #24
    def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0):
        # Write the frame cells using openpyxl.
        from openpyxl.cell import get_column_letter

        sheet_name = self._get_sheet_name(sheet_name)

        if sheet_name in self.sheets:
            wks = self.sheets[sheet_name]
        else:
            wks = self.book.create_sheet()
            wks.title = sheet_name
            self.sheets[sheet_name] = wks

        for cell in cells:
            colletter = get_column_letter(startcol + cell.col + 1)
            xcell = wks.cell("%s%s" % (colletter, startrow + cell.row + 1))
            xcell.value = _conv_value(cell.val)
            style = None
            if cell.style:
                style = self._convert_to_style(cell.style)
                for field in style.__fields__:
                    xcell.style.__setattr__(field,
                                            style.__getattribute__(field))

            if isinstance(cell.val, datetime.datetime):
                xcell.style.number_format.format_code = self.datetime_format
            elif isinstance(cell.val, datetime.date):
                xcell.style.number_format.format_code = self.date_format
            elif isinstance(cell.val, datetime.time):
                xcell.style.number_format.format_code = self.time_format

            if cell.mergestart is not None and cell.mergeend is not None:
                cletterstart = get_column_letter(startcol + cell.col + 1)
                cletterend = get_column_letter(startcol + cell.mergeend + 1)

                wks.merge_cells('%s%s:%s%s' % (cletterstart,
                                               startrow + cell.row + 1,
                                               cletterend,
                                               startrow + cell.mergestart + 1))

                # Excel requires that the format of the first cell in a merged
                # range is repeated in the rest of the merged range.
                if style:
                    first_row = startrow + cell.row + 1
                    last_row = startrow + cell.mergestart + 1
                    first_col = startcol + cell.col + 1
                    last_col = startcol + cell.mergeend + 1

                    for row in range(first_row, last_row + 1):
                        for col in range(first_col, last_col + 1):
                            if row == first_row and col == first_col:
                                # Ignore first cell. It is already handled.
                                continue
                            colletter = get_column_letter(col)
                            xcell = wks.cell("%s%s" % (colletter, row))
                            for field in style.__fields__:
                                xcell.style.__setattr__(
                                    field, style.__getattribute__(field))
Example #25
    def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0):
        # Write the frame cells using openpyxl.
        from openpyxl import styles

        sheet_name = self._get_sheet_name(sheet_name)

        _style_cache = {}

        if sheet_name in self.sheets:
            wks = self.sheets[sheet_name]
        else:
            wks = self.book.create_sheet()
            wks.title = sheet_name
            self.sheets[sheet_name] = wks

        for cell in cells:
            xcell = wks.cell(row=startrow + cell.row + 1,
                             column=startcol + cell.col + 1)
            xcell.value = _conv_value(cell.val)

            style_kwargs = {}
            if cell.style:
                key = str(cell.style)
                style_kwargs = _style_cache.get(key)
                if style_kwargs is None:
                    style_kwargs = self._convert_to_style_kwargs(cell.style)
                    _style_cache[key] = style_kwargs

            if style_kwargs:
                for k, v in style_kwargs.items():
                    setattr(xcell, k, v)

            if cell.mergestart is not None and cell.mergeend is not None:

                wks.merge_cells(
                    start_row=startrow + cell.row + 1,
                    start_column=startcol + cell.col + 1,
                    end_column=startcol + cell.mergeend + 1,
                    end_row=startrow + cell.mergestart + 1,
                )

                # When cells are merged only the top-left cell is preserved
                # The behaviour of the other cells in a merged range is undefined
                if style_kwargs:
                    first_row = startrow + cell.row + 1
                    last_row = startrow + cell.mergestart + 1
                    first_col = startcol + cell.col + 1
                    last_col = startcol + cell.mergeend + 1

                    for row in range(first_row, last_row + 1):
                        for col in range(first_col, last_col + 1):
                            if row == first_row and col == first_col:
                                # Ignore first cell. It is already handled.
                                continue
                            xcell = wks.cell(column=col, row=row)
                            for k, v in style_kwargs.items():
                                setattr(xcell, k, v)
Example #26
def test_where_unsafe():
    # see gh-9731
    s = Series(np.arange(10), dtype="int64")
    values = [2.5, 3.5, 4.5, 5.5]

    mask = s > 5
    expected = Series(lrange(6) + values, dtype="float64")

    s[mask] = values
    assert_series_equal(s, expected)

    # see gh-3235
    s = Series(np.arange(10), dtype='int64')
    mask = s < 5
    s[mask] = lrange(2, 7)
    expected = Series(lrange(2, 7) + lrange(5, 10), dtype='int64')
    assert_series_equal(s, expected)
    assert s.dtype == expected.dtype

    s = Series(np.arange(10), dtype='int64')
    mask = s > 5
    s[mask] = [0] * 4
    expected = Series([0, 1, 2, 3, 4, 5] + [0] * 4, dtype='int64')
    assert_series_equal(s, expected)

    s = Series(np.arange(10))
    mask = s > 5

    def f():
        s[mask] = [5, 4, 3, 2, 1]

    pytest.raises(ValueError, f)

    def f():
        s[mask] = [0] * 5

    pytest.raises(ValueError, f)

    # dtype changes
    s = Series([1, 2, 3, 4])
    result = s.where(s > 2, np.nan)
    expected = Series([np.nan, np.nan, 3, 4])
    assert_series_equal(result, expected)

    # GH 4667
    # setting with None changes dtype
    s = Series(range(10)).astype(float)
    s[8] = None
    result = s[8]
    assert isna(result)

    s = Series(range(10)).astype(float)
    s[s > 8] = None
    result = s[isna(s)]
    expected = Series(np.nan, index=[9])
    assert_series_equal(result, expected)
Example #27
    def test_index_type_coercion(self):

        with catch_warnings(record=True):
            simplefilter("ignore")

            # GH 11836
            # if we have an index type and set it with something that looks
            # to numpy like the same, but is actually, not
            # (e.g. setting with a float or string '0')
            # then we need to coerce to object

            # integer indexes
            for s in [Series(range(5)),
                      Series(range(5), index=range(1, 6))]:

                assert s.index.is_integer()

                for indexer in [lambda x: x.ix,
                                lambda x: x.loc,
                                lambda x: x]:
                    s2 = s.copy()
                    indexer(s2)[0.1] = 0
                    assert s2.index.is_floating()
                    assert indexer(s2)[0.1] == 0

                    s2 = s.copy()
                    indexer(s2)[0.0] = 0
                    exp = s.index
                    if 0 not in s:
                        exp = Index(s.index.tolist() + [0])
                    tm.assert_index_equal(s2.index, exp)

                    s2 = s.copy()
                    indexer(s2)['0'] = 0
                    assert s2.index.is_object()

            for s in [Series(range(5), index=np.arange(5.))]:

                assert s.index.is_floating()

                for idxr in [lambda x: x.ix,
                             lambda x: x.loc,
                             lambda x: x]:

                    s2 = s.copy()
                    idxr(s2)[0.1] = 0
                    assert s2.index.is_floating()
                    assert idxr(s2)[0.1] == 0

                    s2 = s.copy()
                    idxr(s2)[0.0] = 0
                    tm.assert_index_equal(s2.index, s.index)

                    s2 = s.copy()
                    idxr(s2)['0'] = 0
                    assert s2.index.is_object()
Example #28
    def test_range_in_series_indexing(self):
        # range can cause an indexing error
        # GH 11652
        for x in [5, 999999, 1000000]:
            s = Series(index=range(x))
            s.loc[range(1)] = 42
            tm.assert_series_equal(s.loc[range(1)], Series(42.0, index=[0]))

            s.loc[range(2)] = 43
            tm.assert_series_equal(s.loc[range(2)], Series(43.0, index=[0, 1]))
Example #29
    def test_metadata_propagation_indiv(self):
        # check that the metadata matches up on the resulting ops

        o = Series(range(3), range(3))
        o.name = 'foo'
        o2 = Series(range(3), range(3))
        o2.name = 'bar'

        result = o.T
        self.check_metadata(o, result)
Example #30
    def trellis(self, layers):
        """
        Create a trellis structure for a list of layers. Each layer will be
        cloned with different data into a two-dimensional grid.

        Parameters
        ----------
        layers: a list of Layer objects

        Returns
        -------
        trellised_layers: Clones of each layer in the list, arranged in a
        trellised lattice
        """
        trellised_layers = []
        for layer in layers:
            data = layer.data
            if self.by[0] == '.':
                grouped = data.groupby(self.by[1])
            elif self.by[1] == '.':
                grouped = data.groupby(self.by[0])
            else:
                grouped = data.groupby(self.by)
            groups = list(grouped.groups.keys())
            if self.by[0] == '.' or self.by[1] == '.':
                shingle1 = set([g for g in groups])
            else:
                shingle1 = set([g[0] for g in groups])
                shingle2 = set([g[1] for g in groups])
            if self.by[0] == '.':
                self.rows = 1
                self.cols = len(shingle1)
            elif self.by[1] == '.':
                self.rows = len(shingle1)
                self.cols = 1
            else:
                self.rows = len(shingle1)
                self.cols = len(shingle2)
            trellised = [[None for _ in range(self.cols)]
                         for _ in range(self.rows)]
            self.group_grid = [[None for _ in range(
                self.cols)] for _ in range(self.rows)]
            row = 0
            col = 0
            for group, data in grouped:
                new_layer = deepcopy(layer)
                new_layer.data = data
                trellised[row][col] = new_layer
                self.group_grid[row][col] = group
                col += 1
                if col >= self.cols:
                    col = 0
                    row += 1
            trellised_layers.append(trellised)
        return trellised_layers
Example #31
    def test_operators_bitwise(self):
        # GH 9016: support bitwise op for integer types
        index = list('bca')

        s_tft = Series([True, False, True], index=index)
        s_fff = Series([False, False, False], index=index)
        s_tff = Series([True, False, False], index=index)
        s_empty = Series([])

        # TODO: unused
        # s_0101 = Series([0, 1, 0, 1])

        s_0123 = Series(range(4), dtype='int64')
        s_3333 = Series([3] * 4)
        s_4444 = Series([4] * 4)

        res = s_tft & s_empty
        expected = s_fff
        assert_series_equal(res, expected)

        res = s_tft | s_empty
        expected = s_tft
        assert_series_equal(res, expected)

        res = s_0123 & s_3333
        expected = Series(range(4), dtype='int64')
        assert_series_equal(res, expected)

        res = s_0123 | s_4444
        expected = Series(range(4, 8), dtype='int64')
        assert_series_equal(res, expected)

        s_a0b1c0 = Series([1], list('b'))

        res = s_tft & s_a0b1c0
        expected = s_tff.reindex(list('abc'))
        assert_series_equal(res, expected)

        res = s_tft | s_a0b1c0
        expected = s_tft.reindex(list('abc'))
        assert_series_equal(res, expected)

        n0 = 0
        res = s_tft & n0
        expected = s_fff
        assert_series_equal(res, expected)

        res = s_0123 & n0
        expected = Series([0] * 4)
        assert_series_equal(res, expected)

        n1 = 1
        res = s_tft & n1
        expected = s_tft
        assert_series_equal(res, expected)

        res = s_0123 & n1
        expected = Series([0, 1, 0, 1])
        assert_series_equal(res, expected)

        s_1111 = Series([1] * 4, dtype='int8')
        res = s_0123 & s_1111
        expected = Series([0, 1, 0, 1], dtype='int64')
        assert_series_equal(res, expected)

        res = s_0123.astype(np.int16) | s_1111.astype(np.int32)
        expected = Series([1, 1, 3, 3], dtype='int32')
        assert_series_equal(res, expected)

        with pytest.raises(TypeError):
            s_1111 & 'a'
        with pytest.raises(TypeError):
            s_1111 & ['a', 'b', 'c', 'd']
        with pytest.raises(TypeError):
            s_0123 & np.NaN
        with pytest.raises(TypeError):
            s_0123 & 3.14
        with pytest.raises(TypeError):
            s_0123 & [0.1, 4, 3.14, 2]

        # s_0123 will be all false now because of reindexing like s_tft
        if compat.PY3:
            # unable to sort incompatible object via .union.
            exp = Series([False] * 7, index=['b', 'c', 'a', 0, 1, 2, 3])
            with tm.assert_produces_warning(RuntimeWarning):
                assert_series_equal(s_tft & s_0123, exp)
        else:
            exp = Series([False] * 7, index=[0, 1, 2, 3, 'a', 'b', 'c'])
            assert_series_equal(s_tft & s_0123, exp)

        # s_tft will be all false now because of reindexing like s_0123
        if compat.PY3:
            # unable to sort incompatible object via .union.
            exp = Series([False] * 7, index=[0, 1, 2, 3, 'b', 'c', 'a'])
            with tm.assert_produces_warning(RuntimeWarning):
                assert_series_equal(s_0123 & s_tft, exp)
        else:
            exp = Series([False] * 7, index=[0, 1, 2, 3, 'a', 'b', 'c'])
            assert_series_equal(s_0123 & s_tft, exp)

        assert_series_equal(s_0123 & False, Series([False] * 4))
        assert_series_equal(s_0123 ^ False, Series([False, True, True, True]))
        assert_series_equal(s_0123 & [False], Series([False] * 4))
        assert_series_equal(s_0123 & (False), Series([False] * 4))
        assert_series_equal(s_0123 & Series([False, np.NaN, False, False]),
                            Series([False] * 4))

        s_ftft = Series([False, True, False, True])
        assert_series_equal(s_0123 & Series([0.1, 4, -3.14, 2]), s_ftft)

        s_abNd = Series(['a', 'b', np.NaN, 'd'])
        res = s_0123 & s_abNd
        expected = s_ftft
        assert_series_equal(res, expected)
Example #32
def _take_multi(data, indexer, out):
    if not data.flags.c_contiguous:
        data = data.copy()
    for i in range(data.shape[0]):
        data[i].take(indexer, out=out[i])
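A small assumed-usage sketch: the same column indexer is applied to every row of a 2-D block, writing into a preallocated output buffer.

import numpy as np

data = np.arange(6).reshape(2, 3)    # [[0, 1, 2], [3, 4, 5]]
out = np.empty((2, 2), dtype=data.dtype)
_take_multi(data, np.array([2, 0]), out)
# out -> [[2, 0], [5, 3]]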
Example #33
    def test_pairs(self):
        data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008',
                            '21dec2008', '11jan2009'],
                'birthwt': [1766, 3301, 1454, 3139, 4133],
                'id': [101, 102, 103, 104, 105],
                'sex': ['Male', 'Female', 'Female', 'Female', 'Female'],
                'visitdt1': ['11jan2009', '22dec2008', '04jan2009',
                             '29dec2008', '20jan2009'],
                'visitdt2': ['21jan2009', nan, '22jan2009', '31dec2008', '03feb2009'],
                'visitdt3': ['05feb2009', nan, nan, '02jan2009', '15feb2009'],
                'wt1': [1823, 3338, 1549, 3298, 4306],
                'wt2': [2011.0, nan, 1892.0, 3338.0, 4575.0],
                'wt3': [2293.0, nan, nan, 3377.0, 4805.0]}

        df = DataFrame(data)

        spec = {'visitdt': ['visitdt%d' % i for i in range(1, 4)],
                'wt': ['wt%d' % i for i in range(1, 4)]}
        result = lreshape(df, spec)

        exp_data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008',
                                '21dec2008', '11jan2009', '08jan2009',
                                '30dec2008', '21dec2008', '11jan2009',
                                '08jan2009', '21dec2008', '11jan2009'],
                    'birthwt': [1766, 3301, 1454, 3139, 4133, 1766,
                                1454, 3139, 4133, 1766, 3139, 4133],
                    'id': [101, 102, 103, 104, 105, 101,
                           103, 104, 105, 101, 104, 105],
                    'sex': ['Male', 'Female', 'Female', 'Female', 'Female',
                            'Male', 'Female', 'Female', 'Female', 'Male',
                            'Female', 'Female'],
                    'visitdt': ['11jan2009', '22dec2008', '04jan2009', '29dec2008',
                                '20jan2009', '21jan2009', '22jan2009', '31dec2008',
                                '03feb2009', '05feb2009', '02jan2009', '15feb2009'],
                    'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0,
                           1892.0, 3338.0, 4575.0, 2293.0, 3377.0, 4805.0]}
        exp = DataFrame(exp_data, columns=result.columns)
        tm.assert_frame_equal(result, exp)

        result = lreshape(df, spec, dropna=False)
        exp_data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008',
                                '21dec2008', '11jan2009',
                                '08jan2009', '20dec2008', '30dec2008',
                                '21dec2008', '11jan2009',
                                '08jan2009', '20dec2008', '30dec2008',
                                '21dec2008', '11jan2009'],
                    'birthwt': [1766, 3301, 1454, 3139, 4133,
                                1766, 3301, 1454, 3139, 4133,
                                1766, 3301, 1454, 3139, 4133],
                    'id': [101, 102, 103, 104, 105,
                           101, 102, 103, 104, 105,
                           101, 102, 103, 104, 105],
                    'sex': ['Male', 'Female', 'Female', 'Female', 'Female',
                            'Male', 'Female', 'Female', 'Female', 'Female',
                            'Male', 'Female', 'Female', 'Female', 'Female'],
                    'visitdt': ['11jan2009', '22dec2008', '04jan2009',
                                '29dec2008', '20jan2009',
                                '21jan2009', nan, '22jan2009',
                                '31dec2008', '03feb2009',
                                '05feb2009', nan, nan, '02jan2009', '15feb2009'],
                    'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0,
                           nan, 1892.0, 3338.0, 4575.0, 2293.0, nan, nan,
                           3377.0, 4805.0]}
        exp = DataFrame(exp_data, columns=result.columns)
        tm.assert_frame_equal(result, exp)

        spec = {'visitdt': ['visitdt%d' % i for i in range(1, 3)],
                'wt': ['wt%d' % i for i in range(1, 4)]}
        self.assertRaises(ValueError, lreshape, df, spec)
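lreshape is the wide-to-long helper this test drives; a minimal hedged usage sketch (rows are stacked group column by group column):

import pandas as pd

wide = pd.DataFrame({'id': [1, 2], 'wt1': [10., 20.], 'wt2': [11., 21.]})
pd.lreshape(wide, {'wt': ['wt1', 'wt2']})
#    id    wt
# 0   1  10.0
# 1   2  20.0
# 2   1  11.0
# 3   2  21.0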
Example #34
    def test_date(self):
        import datetime
        dates = [datetime.date(2012, 1, x) for x in range(1, 20)]
        index = Index(dates)
        self.assertEqual(index.inferred_type, 'date')
Example #35
def pivot_table(data,
                values=None,
                index=None,
                columns=None,
                aggfunc='mean',
                fill_value=None,
                margins=False,
                dropna=True,
                margins_name='All'):
    index = _convert_by(index)
    columns = _convert_by(columns)

    if isinstance(aggfunc, list):
        pieces = []
        keys = []
        for func in aggfunc:
            table = pivot_table(data,
                                values=values,
                                index=index,
                                columns=columns,
                                fill_value=fill_value,
                                aggfunc=func,
                                margins=margins,
                                margins_name=margins_name)
            pieces.append(table)
            keys.append(func.__name__)
        return concat(pieces, keys=keys, axis=1)

    keys = index + columns

    values_passed = values is not None
    if values_passed:
        if is_list_like(values):
            values_multi = True
            values = list(values)
        else:
            values_multi = False
            values = [values]

        # GH14938 Make sure value labels are in data
        for i in values:
            if i not in data:
                raise KeyError(i)

        to_filter = []
        for x in keys + values:
            if isinstance(x, Grouper):
                x = x.key
            try:
                if x in data:
                    to_filter.append(x)
            except TypeError:
                pass
        if len(to_filter) < len(data.columns):
            data = data[to_filter]

    else:
        values = data.columns
        for key in keys:
            try:
                values = values.drop(key)
            except (TypeError, ValueError):
                pass
        values = list(values)

    grouped = data.groupby(keys)
    agged = grouped.agg(aggfunc)

    table = agged
    if table.index.nlevels > 1:
        # Related GH #17123
        # If index_names are integers, determine whether the integers refer
        # to the level position or name.
        index_names = agged.index.names[:len(index)]
        to_unstack = []
        for i in range(len(index), len(keys)):
            name = agged.index.names[i]
            if name is None or name in index_names:
                to_unstack.append(i)
            else:
                to_unstack.append(name)
        table = agged.unstack(to_unstack)

    if not dropna:
        from pandas import MultiIndex
        try:
            m = MultiIndex.from_arrays(cartesian_product(table.index.levels),
                                       names=table.index.names)
            table = table.reindex_axis(m, axis=0)
        except AttributeError:
            pass  # it's a single level

        try:
            m = MultiIndex.from_arrays(cartesian_product(table.columns.levels),
                                       names=table.columns.names)
            table = table.reindex_axis(m, axis=1)
        except AttributeError:
            pass  # it's a single level or a series

    if isinstance(table, ABCDataFrame):
        table = table.sort_index(axis=1)

    if fill_value is not None:
        table = table.fillna(value=fill_value, downcast='infer')

    if margins:
        if dropna:
            data = data[data.notna().all(axis=1)]
        table = _add_margins(table,
                             data,
                             values,
                             rows=index,
                             cols=columns,
                             aggfunc=aggfunc,
                             margins_name=margins_name,
                             fill_value=fill_value)

    # discard the top level
    if values_passed and not values_multi and not table.empty and \
       (table.columns.nlevels > 1):
        table = table[values[0]]

    if len(index) == 0 and len(columns) > 0:
        table = table.T

    # GH 15193 Make sure empty columns are removed if dropna=True
    if isinstance(table, ABCDataFrame) and dropna:
        table = table.dropna(how='all', axis=1)

    return table
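A short usage sketch of the public entry point implemented above:

import pandas as pd

df = pd.DataFrame({'A': ['x', 'x', 'y', 'y'],
                   'B': ['a', 'b', 'a', 'b'],
                   'C': [1.0, 2.0, 3.0, 4.0]})
pd.pivot_table(df, values='C', index='A', columns='B', aggfunc='mean')
# B      a    b
# A
# x    1.0  2.0
# y    3.0  4.0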
Example #36
    def format(self, formatter, subset=None):
        """
        Format the text display value of cells.

        .. versionadded:: 0.18.0

        Parameters
        ----------
        formatter: str, callable, or dict
        subset: IndexSlice
            An argument to ``DataFrame.loc`` that restricts which elements
            ``formatter`` is applied to.

        Returns
        -------
        self : Styler

        Notes
        -----

        ``formatter`` is either a single ``a`` or a dict ``{column name: a}``,
        where ``a`` is one of

        - str: this will be wrapped in: ``a.format(x)``
        - callable: called with the value of an individual cell

        The default display value for numeric values is the "general" (``g``)
        format with ``pd.options.display.precision`` precision.

        Examples
        --------

        >>> df = pd.DataFrame(np.random.randn(4, 2), columns=['a', 'b'])
        >>> df.style.format("{:.2%}")
        >>> df['c'] = ['a', 'b', 'c', 'd']
        >>> df.style.format({'c': str.upper})
        """
        if subset is None:
            row_locs = range(len(self.data))
            col_locs = range(len(self.data.columns))
        else:
            subset = _non_reducing_slice(subset)
            if len(subset) == 1:
                subset = subset, self.data.columns

            sub_df = self.data.loc[subset]
            row_locs = self.data.index.get_indexer_for(sub_df.index)
            col_locs = self.data.columns.get_indexer_for(sub_df.columns)

        if isinstance(formatter, MutableMapping):
            for col, col_formatter in formatter.items():
                # formatter must be callable, so '{}' are converted to lambdas
                col_formatter = _maybe_wrap_formatter(col_formatter)
                col_num = self.data.columns.get_indexer_for([col])[0]

                for row_num in row_locs:
                    self._display_funcs[(row_num, col_num)] = col_formatter
        else:
            # single scalar to format all cells with; wrap it once rather
            # than rebinding on every iteration
            formatter = _maybe_wrap_formatter(formatter)
            locs = product(*(row_locs, col_locs))
            for i, j in locs:
                self._display_funcs[(i, j)] = formatter
        return self
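# A minimal usage sketch of the method above; the column names and format
# specs are illustrative.
import pandas as pd

df = pd.DataFrame({'a': [0.1234, 0.5678], 'b': ['x', 'y']})
styler = df.style.format({'a': '{:.1%}', 'b': str.upper})
html = styler.render()    # display values become '12.3%', '56.8%', 'X', 'Y'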
Beispiel #37
0
def _subplots(naxes=None,
              sharex=False,
              sharey=False,
              squeeze=True,
              subplot_kw=None,
              ax=None,
              layout=None,
              layout_type='box',
              **fig_kw):
    """Create a figure with a set of subplots already made.

    This utility wrapper makes it convenient to create common layouts of
    subplots, including the enclosing figure object, in a single call.

    Keyword arguments:

    naxes : int
      Number of required axes. Axes beyond this count are hidden. Default is
      nrows * ncols.

    sharex : bool
      If True, the X axis will be shared amongst all subplots.

    sharey : bool
      If True, the Y axis will be shared amongst all subplots.

    squeeze : bool
      If True, extra dimensions are squeezed out from the returned axis
      object:
        - if only one subplot is constructed (nrows=ncols=1), the resulting
        single Axis object is returned as a scalar.
        - Nx1 or 1xN subplots are returned as a 1-d numpy object array of
        Axis objects.
        - NxM subplots with N>1 and M>1 are returned as a 2-d array.

      If False, no squeezing at all is done: the returned axis object is always
      a 2-d array containing Axis instances, even if it ends up being 1x1.

    subplot_kw : dict
      Dict with keywords passed to the add_subplot() call used to create
      each subplot.

    ax : Matplotlib axis object, optional

    layout : tuple
      Number of rows and columns of the subplot grid.
      If not specified, calculated from naxes and layout_type.

    layout_type : {'box', 'horizontal', 'vertical'}, default 'box'
      Specify how to lay out the subplot grid.

    fig_kw : Other keyword arguments to be passed to the figure() call.
        Note that all keywords not recognized above will be
        automatically included here.

    Returns:

    fig, ax : tuple
      - fig is the Matplotlib Figure object
      - ax can be either a single axis object or an array of axis objects if
      more than one subplot was created.  The dimensions of the resulting array
      can be controlled with the squeeze keyword, see above.

    **Examples:**

    x = np.linspace(0, 2*np.pi, 400)
    y = np.sin(x**2)

    # Just a figure and one subplot
    f, ax = plt.subplots()
    ax.plot(x, y)
    ax.set_title('Simple plot')

    # Two subplots, unpack the output array immediately
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
    ax1.plot(x, y)
    ax1.set_title('Sharing Y axis')
    ax2.scatter(x, y)

    # Four polar axes
    plt.subplots(2, 2, subplot_kw=dict(polar=True))
    """
    import matplotlib.pyplot as plt

    if subplot_kw is None:
        subplot_kw = {}

    if ax is None:
        fig = plt.figure(**fig_kw)
    else:
        if is_list_like(ax):
            ax = _flatten(ax)
            if layout is not None:
                warnings.warn(
                    "When passing multiple axes, layout keyword is "
                    "ignored", UserWarning)
            if sharex or sharey:
                warnings.warn(
                    "When passing multiple axes, sharex and sharey "
                    "are ignored. These settings must be specified "
                    "when creating axes",
                    UserWarning,
                    stacklevel=4)
            if len(ax) == naxes:
                fig = ax[0].get_figure()
                return fig, ax
            else:
                raise ValueError("The number of passed axes must be {0}, the "
                                 "same as the output plot".format(naxes))

        fig = ax.get_figure()
        # if ax is passed and the number of subplots is 1, return ax as it is
        if naxes == 1:
            if squeeze:
                return fig, ax
            else:
                return fig, _flatten(ax)
        else:
            warnings.warn(
                "To output multiple subplots, the figure containing "
                "the passed axes is being cleared",
                UserWarning,
                stacklevel=4)
            fig.clear()

    nrows, ncols = _get_layout(naxes, layout=layout, layout_type=layout_type)
    nplots = nrows * ncols

    # Create empty object array to hold all axes.  It's easiest to make it 1-d
    # so we can just append subplots upon creation, and then reshape to the
    # final desired dimensions at the end.
    axarr = np.empty(nplots, dtype=object)

    # Create first subplot separately, so we can share it if requested
    ax0 = fig.add_subplot(nrows, ncols, 1, **subplot_kw)

    if sharex:
        subplot_kw['sharex'] = ax0
    if sharey:
        subplot_kw['sharey'] = ax0
    axarr[0] = ax0

    # Note off-by-one counting because add_subplot uses the MATLAB 1-based
    # convention.
    for i in range(1, nplots):
        kwds = subplot_kw.copy()
        # Set sharex and sharey to None for blank/dummy axes, these can
        # interfere with proper axis limits on the visible axes if
        # they share axes e.g. issue #7528
        if i >= naxes:
            kwds['sharex'] = None
            kwds['sharey'] = None
        ax = fig.add_subplot(nrows, ncols, i + 1, **kwds)
        axarr[i] = ax

    if naxes != nplots:
        for ax in axarr[naxes:]:
            ax.set_visible(False)

    _handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey)

    if squeeze:
        # Reshape the array to have the final desired dimension (nrow,ncol),
        # though discarding unneeded dimensions that equal 1.  If we only have
        # one subplot, just return it instead of a 1-element array.
        if nplots == 1:
            axes = axarr[0]
        else:
            axes = axarr.reshape(nrows, ncols).squeeze()
    else:
        # returned axis array will be always 2-d, even if nrows=ncols=1
        axes = axarr.reshape(nrows, ncols)

    return fig, axes
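# A hedged driver for the helper above. _subplots is a private helper, so
# invoking it directly like this is an assumption about internal usage.
fig, axes = _subplots(naxes=3, layout=(2, 2), squeeze=False)
assert axes.shape == (2, 2)           # the full grid is always allocated
visible = [ax for ax in axes.ravel() if ax.get_visible()]
assert len(visible) == 3              # the unused fourth slot is hidden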
Beispiel #38
0
    def test_frame_from_json_to_json(self):
        def _check_orient(df,
                          orient,
                          dtype=None,
                          numpy=False,
                          convert_axes=True,
                          check_dtype=True,
                          raise_ok=None,
                          sort=None,
                          check_index_type=True,
                          check_column_type=True,
                          check_numpy_dtype=False):
            if sort is not None:
                df = df.sort_values(sort)
            else:
                df = df.sort_index()

            # if we are not unique, then check that we are raising ValueError
            # for the appropriate orients
            if not df.index.is_unique and orient in ['index', 'columns']:
                self.assertRaises(ValueError,
                                  lambda: df.to_json(orient=orient))
                return
            if (not df.columns.is_unique
                    and orient in ['index', 'columns', 'records']):
                self.assertRaises(ValueError,
                                  lambda: df.to_json(orient=orient))
                return

            dfjson = df.to_json(orient=orient)

            try:
                unser = read_json(dfjson,
                                  orient=orient,
                                  dtype=dtype,
                                  numpy=numpy,
                                  convert_axes=convert_axes)
            except Exception as detail:
                if raise_ok is not None:
                    if isinstance(detail, raise_ok):
                        return
                raise

            if sort is not None and sort in unser.columns:
                unser = unser.sort_values(sort)
            else:
                unser = unser.sort_index()

            if dtype is False:
                check_dtype = False

            if not convert_axes and df.index.dtype.type == np.datetime64:
                unser.index = DatetimeIndex(
                    unser.index.values.astype('i8') * 1e6)
            if orient == "records":
                # index is not captured in this orientation
                assert_almost_equal(df.values,
                                    unser.values,
                                    check_dtype=check_numpy_dtype)
                self.assert_index_equal(df.columns,
                                        unser.columns,
                                        exact=check_column_type)
            elif orient == "values":
                # index and cols are not captured in this orientation
                if numpy is True and df.shape == (0, 0):
                    assert unser.shape[0] == 0
                else:
                    assert_almost_equal(df.values,
                                        unser.values,
                                        check_dtype=check_numpy_dtype)
            elif orient == "split":
                # index and col labels might not be strings
                unser.index = [str(i) for i in unser.index]
                unser.columns = [str(i) for i in unser.columns]

                if sort is None:
                    unser = unser.sort_index()
                assert_almost_equal(df.values,
                                    unser.values,
                                    check_dtype=check_numpy_dtype)
            else:
                if convert_axes:
                    assert_frame_equal(df,
                                       unser,
                                       check_dtype=check_dtype,
                                       check_index_type=check_index_type,
                                       check_column_type=check_column_type)
                else:
                    assert_frame_equal(df,
                                       unser,
                                       check_less_precise=False,
                                       check_dtype=check_dtype)

        def _check_all_orients(df,
                               dtype=None,
                               convert_axes=True,
                               raise_ok=None,
                               sort=None,
                               check_index_type=True,
                               check_column_type=True):

            orients = ["columns", "records", "split", "index", "values"]

            # numpy=False
            if convert_axes:
                for orient in orients:
                    _check_orient(df,
                                  orient,
                                  dtype=dtype,
                                  sort=sort,
                                  check_index_type=False,
                                  check_column_type=False)

            for orient in orients:
                _check_orient(df,
                              orient,
                              dtype=dtype,
                              convert_axes=False,
                              sort=sort)

            # numpy=True and raise_ok might be not None, so ignore the error
            if convert_axes:
                for orient in orients:
                    _check_orient(df,
                                  orient,
                                  dtype=dtype,
                                  numpy=True,
                                  raise_ok=raise_ok,
                                  sort=sort,
                                  check_index_type=False,
                                  check_column_type=False)

            for orient in orients:
                _check_orient(df,
                              orient,
                              dtype=dtype,
                              numpy=True,
                              convert_axes=False,
                              raise_ok=raise_ok,
                              sort=sort)

        # basic
        _check_all_orients(self.frame)
        self.assertEqual(self.frame.to_json(),
                         self.frame.to_json(orient="columns"))

        _check_all_orients(self.intframe, dtype=self.intframe.values.dtype)
        _check_all_orients(self.intframe, dtype=False)

        # big one
        # index and columns are strings as all unserialised JSON object keys
        # are assumed to be strings
        biggie = DataFrame(np.zeros((200, 4)),
                           columns=[str(i) for i in range(4)],
                           index=[str(i) for i in range(200)])
        _check_all_orients(biggie, dtype=False, convert_axes=False)

        # dtypes
        _check_all_orients(DataFrame(biggie, dtype=np.float64),
                           dtype=np.float64,
                           convert_axes=False)
        _check_all_orients(DataFrame(biggie, dtype=np.int),
                           dtype=np.int,
                           convert_axes=False)
        _check_all_orients(DataFrame(biggie, dtype='U3'),
                           dtype='U3',
                           convert_axes=False,
                           raise_ok=ValueError)

        # categorical
        _check_all_orients(self.categorical, sort='sort', raise_ok=ValueError)

        # empty
        _check_all_orients(self.empty_frame,
                           check_index_type=False,
                           check_column_type=False)

        # time series data
        _check_all_orients(self.tsframe)

        # mixed data
        index = pd.Index(['a', 'b', 'c', 'd', 'e'])
        data = {
            'A': [0., 1., 2., 3., 4.],
            'B': [0., 1., 0., 1., 0.],
            'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
            'D': [True, False, True, False, True]
        }
        df = DataFrame(data=data, index=index)
        _check_orient(df, "split", check_dtype=False)
        _check_orient(df, "records", check_dtype=False)
        _check_orient(df, "values", check_dtype=False)
        _check_orient(df, "columns", check_dtype=False)
        # index-oriented is problematic, as it is read back in a transposed
        # state: the columns are interpreted as having mixed data and are
        # given object dtypes.
        # force everything to object dtype beforehand
        _check_orient(df.transpose().transpose(), "index", dtype=False)
Beispiel #39
0
    def test_float_index_at_iat(self):
        s = Series([1, 2, 3], index=[0.1, 0.2, 0.3])
        for el, item in s.iteritems():
            assert s.at[el] == item
        for i in range(len(s)):
            assert s.iat[i] == i + 1
Beispiel #40
0
    def test_xticklabels(self):
        # GH11529
        s = Series(np.arange(10), index=['P%02d' % i for i in range(10)])
        ax = s.plot(xticks=[0, 3, 5, 9])
        exp = ['P%02d' % i for i in [0, 3, 5, 9]]
        self._check_text_labels(ax.get_xticklabels(), exp)
Beispiel #41
0
def melt(frame, id_vars=None, value_vars=None,
         var_name=None, value_name='value', col_level=None):
    """
    "Unpivots" a DataFrame from wide format to long format, optionally leaving
    identifier variables set.

    This function is useful to massage a DataFrame into a format where one
    or more columns are identifier variables (`id_vars`), while all other
    columns, considered measured variables (`value_vars`), are "unpivoted" to
    the row axis, leaving just two non-identifier columns, 'variable' and
    'value'.

    Parameters
    ----------
    frame : DataFrame
    id_vars : tuple, list, or ndarray, optional
        Column(s) to use as identifier variables.
    value_vars : tuple, list, or ndarray, optional
        Column(s) to unpivot. If not specified, uses all columns that
        are not set as `id_vars`.
    var_name : scalar
        Name to use for the 'variable' column. If None it uses
        ``frame.columns.name`` or 'variable'.
    value_name : scalar, default 'value'
        Name to use for the 'value' column.
    col_level : int or string, optional
        If columns are a MultiIndex then use this level to melt.

    See also
    --------
    pivot_table
    DataFrame.pivot

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},
    ...                    'B': {0: 1, 1: 3, 2: 5},
    ...                    'C': {0: 2, 1: 4, 2: 6}})
    >>> df
       A  B  C
    0  a  1  2
    1  b  3  4
    2  c  5  6

    >>> pd.melt(df, id_vars=['A'], value_vars=['B'])
       A variable  value
    0  a        B      1
    1  b        B      3
    2  c        B      5

    >>> pd.melt(df, id_vars=['A'], value_vars=['B', 'C'])
       A variable  value
    0  a        B      1
    1  b        B      3
    2  c        B      5
    3  a        C      2
    4  b        C      4
    5  c        C      6

    The names of 'variable' and 'value' columns can be customized:

    >>> pd.melt(df, id_vars=['A'], value_vars=['B'],
    ...         var_name='myVarname', value_name='myValname')
       A myVarname  myValname
    0  a         B          1
    1  b         B          3
    2  c         B          5

    If you have multi-index columns:

    >>> df.columns = [list('ABC'), list('DEF')]
    >>> df
       A  B  C
       D  E  F
    0  a  1  2
    1  b  3  4
    2  c  5  6

    >>> pd.melt(df, col_level=0, id_vars=['A'], value_vars=['B'])
       A variable  value
    0  a        B      1
    1  b        B      3
    2  c        B      5

    >>> pd.melt(df, id_vars=[('A', 'D')], value_vars=[('B', 'E')])
      (A, D) variable_0 variable_1  value
    0      a          B          E      1
    1      b          B          E      3
    2      c          B          E      5

    """
    # TODO: what about the existing index?
    if id_vars is not None:
        if not isinstance(id_vars, (tuple, list, np.ndarray)):
            id_vars = [id_vars]
        else:
            id_vars = list(id_vars)
    else:
        id_vars = []

    if value_vars is not None:
        if not isinstance(value_vars, (tuple, list, np.ndarray)):
            value_vars = [value_vars]
        frame = frame.ix[:, id_vars + value_vars]
    else:
        frame = frame.copy()

    if col_level is not None:  # allow list or other?
        # frame is a copy
        frame.columns = frame.columns.get_level_values(col_level)

    if var_name is None:
        if isinstance(frame.columns, MultiIndex):
            if len(frame.columns.names) == len(set(frame.columns.names)):
                var_name = frame.columns.names
            else:
                var_name = ['variable_%s' % i for i in
                            range(len(frame.columns.names))]
        else:
            var_name = [frame.columns.name if frame.columns.name is not None
                        else 'variable']
    if isinstance(var_name, compat.string_types):
        var_name = [var_name]

    N, K = frame.shape
    K -= len(id_vars)

    mdata = {}
    for col in id_vars:
        mdata[col] = np.tile(frame.pop(col).values, K)

    mcolumns = id_vars + var_name + [value_name]

    mdata[value_name] = frame.values.ravel('F')
    for i, col in enumerate(var_name):
        # asanyarray will keep the columns as an Index
        mdata[col] = np.asanyarray(frame.columns.get_level_values(i)).repeat(N)

    return DataFrame(mdata, columns=mcolumns)
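# A worked trace of the reshaping core above, on illustrative data:
# id_vars are tiled K times, the value block is raveled column-major ('F'),
# and each variable label is repeated N times, so the arrays line up row
# for row.
import numpy as np
import pandas as pd

df = pd.DataFrame({'A': ['a', 'b'], 'B': [1, 3], 'C': [2, 4]})
N, K = len(df), 2                                # two measured columns
id_col = np.tile(df['A'].values, K)              # ['a' 'b' 'a' 'b']
values = df[['B', 'C']].values.ravel('F')        # [1 3 2 4]
variable = np.asarray(['B', 'C']).repeat(N)      # ['B' 'B' 'C' 'C']
manual = pd.DataFrame({'A': id_col, 'variable': variable, 'value': values},
                      columns=['A', 'variable', 'value'])
# manual matches pd.melt(df, id_vars=['A'])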
Beispiel #42
0
def _stack_multi_columns(frame, level_num=-1, dropna=True):
    def _convert_level_number(level_num, columns):
        """
        Logic for converting the level number to something
        we can safely pass to swaplevel:

        We generally want to convert the level number into
        a level name, except when columns do not have names,
        in which case we must leave as a level number
        """
        if level_num in columns.names:
            return columns.names[level_num]
        else:
            if columns.names[level_num] is None:
                return level_num
            else:
                return columns.names[level_num]

    this = frame.copy()

    # this makes life much simpler
    if level_num != frame.columns.nlevels - 1:
        # roll levels to put selected level at end
        roll_columns = this.columns
        for i in range(level_num, frame.columns.nlevels - 1):
            # Need to check if the ints conflict with level names
            lev1 = _convert_level_number(i, roll_columns)
            lev2 = _convert_level_number(i + 1, roll_columns)
            roll_columns = roll_columns.swaplevel(lev1, lev2)
        this.columns = roll_columns

    if not this.columns.is_lexsorted():
        # Workaround the edge case where 0 is one of the column names,
        # which interferes with trying to sort based on the first
        # level
        level_to_sort = _convert_level_number(0, this.columns)
        this = this.sortlevel(level_to_sort, axis=1)

    # tuple list excluding level for grouping columns
    if len(frame.columns.levels) > 2:
        tuples = list(zip(*[
            lev.take(lab) for lev, lab in
            zip(this.columns.levels[:-1], this.columns.labels[:-1])
        ]))
        unique_groups = [key for key, _ in itertools.groupby(tuples)]
        new_names = this.columns.names[:-1]
        new_columns = MultiIndex.from_tuples(unique_groups, names=new_names)
    else:
        new_columns = unique_groups = this.columns.levels[0]

    # time to ravel the values
    new_data = {}
    level_vals = this.columns.levels[-1]
    level_labels = sorted(set(this.columns.labels[-1]))
    level_vals_used = level_vals[level_labels]
    levsize = len(level_labels)
    drop_cols = []
    for key in unique_groups:
        loc = this.columns.get_loc(key)
        slice_len = loc.stop - loc.start
        # can make more efficient?

        if slice_len == 0:
            drop_cols.append(key)
            continue
        elif slice_len != levsize:
            chunk = this.ix[:, this.columns[loc]]
            chunk.columns = level_vals.take(chunk.columns.labels[-1])
            value_slice = chunk.reindex(columns=level_vals_used).values
        else:
            if frame._is_mixed_type:
                value_slice = this.ix[:, this.columns[loc]].values
            else:
                value_slice = this.values[:, loc]

        new_data[key] = value_slice.ravel()

    if len(drop_cols) > 0:
        new_columns = new_columns.difference(drop_cols)

    N = len(this)

    if isinstance(this.index, MultiIndex):
        new_levels = list(this.index.levels)
        new_names = list(this.index.names)
        new_labels = [lab.repeat(levsize) for lab in this.index.labels]
    else:
        new_levels = [this.index]
        new_labels = [np.arange(N).repeat(levsize)]
        new_names = [this.index.name]  # something better?

    new_levels.append(frame.columns.levels[level_num])
    new_labels.append(np.tile(level_labels, N))
    new_names.append(frame.columns.names[level_num])

    new_index = MultiIndex(levels=new_levels, labels=new_labels,
                           names=new_names, verify_integrity=False)

    result = DataFrame(new_data, index=new_index, columns=new_columns)

    # more efficient way to go about this? can do the whole masking biz but
    # will only save a small amount of time...
    if dropna:
        result = result.dropna(axis=0, how='all')

    return result
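# A public-API illustration of what the helper above implements: stacking
# the innermost level of MultiIndex columns into the row index.
import numpy as np
import pandas as pd

columns = pd.MultiIndex.from_product([['x', 'y'], ['a', 'b']])
df = pd.DataFrame(np.arange(8).reshape(2, 4), columns=columns)
stacked = df.stack()      # inner level ('a'/'b') moves into the row index
assert stacked.shape == (4, 2)
assert list(stacked.columns) == ['x', 'y']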
Beispiel #43
0
    def test_week_of_month(self):
        days = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN']

        for day in days:
            for i in range(1, 5):
                self._check_generated_range('1/1/2000', 'WOM-%d%s' % (i, day))
Beispiel #44
0
    def create_cols(name):
        return ["%s%03d" % (name, i) for i in range(5)]
Beispiel #45
0
def _in_chunks(seq, size):
    """
    Return sequence in 'chunks' of size defined by size
    """
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))
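# A quick demonstration of the generator above:
assert list(_in_chunks('abcdefg', 3)) == ['abc', 'def', 'g']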
Beispiel #46
0
    def _write_hierarchical_rows(self, fmt_values, indent):
        template = 'rowspan="{span}" valign="top"'

        truncate_h = self.fmt.truncate_h
        truncate_v = self.fmt.truncate_v
        frame = self.fmt.tr_frame
        ncols = len(frame.columns)
        nrows = len(frame)
        row_levels = self.frame.index.nlevels

        idx_values = frame.index.format(sparsify=False,
                                        adjoin=False,
                                        names=False)
        idx_values = lzip(*idx_values)

        if self.fmt.sparsify:
            # GH3547
            sentinel = com.sentinel_factory()
            levels = frame.index.format(sparsify=sentinel,
                                        adjoin=False,
                                        names=False)

            level_lengths = get_level_lengths(levels, sentinel)
            inner_lvl = len(level_lengths) - 1
            if truncate_v:
                # Insert ... row and adjust idx_values and
                # level_lengths to take this into account.
                ins_row = self.fmt.tr_row_num
                inserted = False
                for lnum, records in enumerate(level_lengths):
                    rec_new = {}
                    for tag, span in list(records.items()):
                        if tag >= ins_row:
                            rec_new[tag + 1] = span
                        elif tag + span > ins_row:
                            rec_new[tag] = span + 1

                            # GH 14882 - Make sure insertion done once
                            if not inserted:
                                dot_row = list(idx_values[ins_row - 1])
                                dot_row[-1] = u('...')
                                idx_values.insert(ins_row, tuple(dot_row))
                                inserted = True
                            else:
                                dot_row = list(idx_values[ins_row])
                                dot_row[inner_lvl - lnum] = u('...')
                                idx_values[ins_row] = tuple(dot_row)
                        else:
                            rec_new[tag] = span
                        # If ins_row lies between tags, all index columns
                        # receive the '...' marker
                        if tag + span == ins_row:
                            rec_new[ins_row] = 1
                            if lnum == 0:
                                idx_values.insert(
                                    ins_row,
                                    tuple([u('...')] * len(level_lengths)))

                            # GH 14882 - Place ... in correct level
                            elif inserted:
                                dot_row = list(idx_values[ins_row])
                                dot_row[inner_lvl - lnum] = u('...')
                                idx_values[ins_row] = tuple(dot_row)
                    level_lengths[lnum] = rec_new

                level_lengths[inner_lvl][ins_row] = 1
                for ix_col in range(len(fmt_values)):
                    fmt_values[ix_col].insert(ins_row, '...')
                nrows += 1

            for i in range(nrows):
                row = []
                tags = {}

                sparse_offset = 0
                j = 0
                for records, v in zip(level_lengths, idx_values[i]):
                    if i in records:
                        if records[i] > 1:
                            tags[j] = template.format(span=records[i])
                    else:
                        sparse_offset += 1
                        continue

                    j += 1
                    row.append(v)

                row.extend(fmt_values[j][i] for j in range(ncols))
                if truncate_h:
                    row.insert(
                        row_levels - sparse_offset + self.fmt.tr_col_num,
                        '...')
                self.write_tr(row,
                              indent,
                              self.indent_delta,
                              tags=tags,
                              nindex_levels=len(levels) - sparse_offset)
        else:
            for i in range(len(frame)):
                idx_values = list(
                    zip(*frame.index.format(
                        sparsify=False, adjoin=False, names=False)))
                row = []
                row.extend(idx_values[i])
                row.extend(fmt_values[j][i] for j in range(ncols))
                if truncate_h:
                    row.insert(row_levels + self.fmt.tr_col_num, '...')
                self.write_tr(row,
                              indent,
                              self.indent_delta,
                              tags=None,
                              nindex_levels=frame.index.nlevels)
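# A public-API view of the writer above: a sparsified MultiIndex row label
# that spans several rows is emitted with a rowspan attribute.
import pandas as pd

idx = pd.MultiIndex.from_product([['a', 'b'], [1, 2]])
html = pd.DataFrame({'v': range(4)}, index=idx).to_html()
assert 'rowspan' in html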
Beispiel #47
0
    def test_getitem_duplicates_multiindex(self):
        # GH 5725: the 'A' happens to be a valid Timestamp, so it doesn't
        # raise the appropriate error; only in PY3 of course!

        index = MultiIndex(levels=[['D', 'B', 'C'],
                                   [0, 26, 27, 37, 57, 67, 75, 82]],
                           codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2],
                                  [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]],
                           names=['tag', 'day'])
        arr = np.random.randn(len(index), 1)
        df = DataFrame(arr, index=index, columns=['val'])
        result = df.val['D']
        expected = Series(arr.ravel()[0:3],
                          name='val',
                          index=Index([26, 37, 57], name='day'))
        tm.assert_series_equal(result, expected)

        def f():
            df.val['A']

        pytest.raises(KeyError, f)

        def f():
            df.val['X']

        pytest.raises(KeyError, f)

        # A is treated as a special Timestamp
        index = MultiIndex(levels=[['A', 'B', 'C'],
                                   [0, 26, 27, 37, 57, 67, 75, 82]],
                           codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2],
                                  [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]],
                           names=['tag', 'day'])
        df = DataFrame(arr, index=index, columns=['val'])
        result = df.val['A']
        expected = Series(arr.ravel()[0:3],
                          name='val',
                          index=Index([26, 37, 57], name='day'))
        tm.assert_series_equal(result, expected)

        def f():
            df.val['X']

        pytest.raises(KeyError, f)

        # GH 7866
        # multi-index slicing with missing indexers
        idx = MultiIndex.from_product([['A', 'B', 'C'], ['foo', 'bar', 'baz']],
                                      names=['one', 'two'])
        s = Series(np.arange(9, dtype='int64'), index=idx).sort_index()

        exp_idx = MultiIndex.from_product([['A'], ['foo', 'bar', 'baz']],
                                          names=['one', 'two'])
        expected = Series(np.arange(3, dtype='int64'),
                          index=exp_idx).sort_index()

        result = s.loc[['A']]
        tm.assert_series_equal(result, expected)
        result = s.loc[['A', 'D']]
        tm.assert_series_equal(result, expected)

        # not any values found
        pytest.raises(KeyError, lambda: s.loc[['D']])

        # empty ok
        result = s.loc[[]]
        expected = s.iloc[[]]
        tm.assert_series_equal(result, expected)

        idx = pd.IndexSlice
        expected = Series(
            [0, 3, 6],
            index=MultiIndex.from_product([['A', 'B', 'C'], ['foo']],
                                          names=['one', 'two'])).sort_index()

        result = s.loc[idx[:, ['foo']]]
        tm.assert_series_equal(result, expected)
        result = s.loc[idx[:, ['foo', 'bah']]]
        tm.assert_series_equal(result, expected)

        # GH 8737
        # empty indexer
        multi_index = MultiIndex.from_product((['foo', 'bar',
                                                'baz'], ['alpha', 'beta']))
        df = DataFrame(np.random.randn(5, 6),
                       index=range(5),
                       columns=multi_index)
        df = df.sort_index(level=0, axis=1)

        expected = DataFrame(index=range(5),
                             columns=multi_index.reindex([])[0])
        result1 = df.loc[:, ([], slice(None))]
        result2 = df.loc[:, (['foo'], [])]
        tm.assert_frame_equal(result1, expected)
        tm.assert_frame_equal(result2, expected)

        # regression from < 0.14.0
        # GH 7914
        df = DataFrame([[np.mean, np.median], ['mean', 'median']],
                       columns=MultiIndex.from_tuples([('functs', 'mean'),
                                                       ('functs', 'median')]),
                       index=['function', 'name'])
        result = df.loc['function', ('functs', 'mean')]
        assert result == np.mean
Beispiel #48
0
    def test_object_refcount_bug(self):
        lst = ['A', 'B', 'C', 'D', 'E']
        for i in range(1000):
            len(algos.unique(lst))
Beispiel #49
0
    def _translate(self):
        """
        Convert the DataFrame in `self.data` and the attrs from `_build_styles`
        into a dictionary of {head, body, uuid, cellstyle}
        """
        table_styles = self.table_styles or []
        caption = self.caption
        ctx = self.ctx
        precision = self.precision
        uuid = self.uuid or str(uuid1()).replace("-", "_")
        ROW_HEADING_CLASS = "row_heading"
        COL_HEADING_CLASS = "col_heading"
        INDEX_NAME_CLASS = "index_name"

        DATA_CLASS = "data"
        BLANK_CLASS = "blank"
        BLANK_VALUE = ""

        def format_attr(pair):
            return "{key}={value}".format(**pair)

        # for sparsifying a MultiIndex
        idx_lengths = _get_level_lengths(self.index)
        col_lengths = _get_level_lengths(self.columns)

        cell_context = dict()

        n_rlvls = self.data.index.nlevels
        n_clvls = self.data.columns.nlevels
        rlabels = self.data.index.tolist()
        clabels = self.data.columns.tolist()

        if n_rlvls == 1:
            rlabels = [[x] for x in rlabels]
        if n_clvls == 1:
            clabels = [[x] for x in clabels]
        clabels = list(zip(*clabels))

        cellstyle = []
        head = []

        for r in range(n_clvls):
            # Blank for Index columns...
            row_es = [{"type": "th",
                       "value": BLANK_VALUE,
                       "display_value": BLANK_VALUE,
                       "is_visible": True,
                       "class": " ".join([BLANK_CLASS])}] * (n_rlvls - 1)

            # ... except maybe the last for columns.names
            name = self.data.columns.names[r]
            cs = [BLANK_CLASS if name is None else INDEX_NAME_CLASS,
                  "level%s" % r]
            name = BLANK_VALUE if name is None else name
            row_es.append({"type": "th",
                           "value": name,
                           "display_value": name,
                           "class": " ".join(cs),
                           "is_visible": True})

            for c, value in enumerate(clabels[r]):
                cs = [COL_HEADING_CLASS, "level%s" % r, "col%s" % c]
                cs.extend(cell_context.get(
                    "col_headings", {}).get(r, {}).get(c, []))
                es = {
                    "type": "th",
                    "value": value,
                    "display_value": value,
                    "class": " ".join(cs),
                    "is_visible": _is_visible(c, r, col_lengths),
                }
                colspan = col_lengths.get((r, c), 0)
                if colspan > 1:
                    es["attributes"] = [
                        format_attr({"key": "colspan", "value": colspan})
                    ]
                row_es.append(es)
            head.append(row_es)

        if self.data.index.names and not all(x is None
                                             for x in self.data.index.names):
            index_header_row = []

            for c, name in enumerate(self.data.index.names):
                cs = [INDEX_NAME_CLASS,
                      "level%s" % c]
                name = '' if name is None else name
                index_header_row.append({"type": "th", "value": name,
                                         "class": " ".join(cs)})

            index_header_row.extend(
                [{"type": "th",
                  "value": BLANK_VALUE,
                  "class": " ".join([BLANK_CLASS])
                  }] * len(clabels[0]))

            head.append(index_header_row)

        body = []
        for r, idx in enumerate(self.data.index):
            row_es = []
            for c, value in enumerate(rlabels[r]):
                es = {
                    "type": "th",
                    "is_visible": _is_visible(r, c, idx_lengths),
                    "value": value,
                    "display_value": value,
                    "class": " ".join([ROW_HEADING_CLASS, "level%s" % c,
                                       "row%s" % r]),
                }
                rowspan = idx_lengths.get((c, r), 0)
                if rowspan > 1:
                    es["attributes"] = [
                        format_attr({"key": "rowspan", "value": rowspan})
                    ]
                row_es.append(es)

            for c, col in enumerate(self.data.columns):
                cs = [DATA_CLASS, "row%s" % r, "col%s" % c]
                cs.extend(cell_context.get("data", {}).get(r, {}).get(c, []))
                formatter = self._display_funcs[(r, c)]
                value = self.data.iloc[r, c]
                row_es.append({
                    "type": "td",
                    "value": value,
                    "class": " ".join(cs),
                    "id": "_".join(cs[1:]),
                    "display_value": formatter(value)
                })
                props = []
                for x in ctx[r, c]:
                    # have to handle empty styles like ['']
                    if x.count(":"):
                        props.append(x.split(":"))
                    else:
                        props.append(['', ''])
                cellstyle.append({'props': props,
                                  'selector': "row%s_col%s" % (r, c)})
            body.append(row_es)

        return dict(head=head, cellstyle=cellstyle, body=body, uuid=uuid,
                    precision=precision, table_styles=table_styles,
                    caption=caption, table_attributes=self.table_attributes)
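# A hedged probe of the dictionary built above. _translate is private, so
# the exact key set is an implementation detail of this pandas version.
import pandas as pd

d = pd.DataFrame({'x': [1, 2]}).style._translate()
assert {'head', 'body', 'uuid', 'cellstyle'} <= set(d)
assert d['body'][0][0]['type'] == 'th'    # each body row starts with labels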
Beispiel #50
0
class TestDataFrameOperators(TestData):

    def test_operators(self):
        garbage = random.random(4)
        colSeries = Series(garbage, index=np.array(self.frame.columns))

        idSum = self.frame + self.frame
        seriesSum = self.frame + colSeries

        for col, series in compat.iteritems(idSum):
            for idx, val in compat.iteritems(series):
                origVal = self.frame[col][idx] * 2
                if not np.isnan(val):
                    assert val == origVal
                else:
                    assert np.isnan(origVal)

        for col, series in compat.iteritems(seriesSum):
            for idx, val in compat.iteritems(series):
                origVal = self.frame[col][idx] + colSeries[col]
                if not np.isnan(val):
                    assert val == origVal
                else:
                    assert np.isnan(origVal)

        added = self.frame2 + self.frame2
        expected = self.frame2 * 2
        assert_frame_equal(added, expected)

        df = DataFrame({'a': ['a', None, 'b']})
        assert_frame_equal(df + df, DataFrame({'a': ['aa', np.nan, 'bb']}))

        # Test for issue #10181
        for dtype in ('float', 'int64'):
            frames = [
                DataFrame(dtype=dtype),
                DataFrame(columns=['A'], dtype=dtype),
                DataFrame(index=[0], dtype=dtype),
            ]
            for df in frames:
                assert (df + df).equals(df)
                assert_frame_equal(df + df, df)

    def test_ops_np_scalar(self):
        vals, xs = np.random.rand(5, 3), [nan, 7, -23, 2.718, -3.14, np.inf]
        f = lambda x: DataFrame(x, index=list('ABCDE'),
                                columns=['jim', 'joe', 'jolie'])

        df = f(vals)

        for x in xs:
            assert_frame_equal(df / np.array(x), f(vals / x))
            assert_frame_equal(np.array(x) * df, f(vals * x))
            assert_frame_equal(df + np.array(x), f(vals + x))
            assert_frame_equal(np.array(x) - df, f(x - vals))

    def test_operators_boolean(self):

        # GH 5808
        # empty frames, non-mixed dtype

        result = DataFrame(index=[1]) & DataFrame(index=[1])
        assert_frame_equal(result, DataFrame(index=[1]))

        result = DataFrame(index=[1]) | DataFrame(index=[1])
        assert_frame_equal(result, DataFrame(index=[1]))

        result = DataFrame(index=[1]) & DataFrame(index=[1, 2])
        assert_frame_equal(result, DataFrame(index=[1, 2]))

        result = DataFrame(index=[1], columns=['A']) & DataFrame(
            index=[1], columns=['A'])
        assert_frame_equal(result, DataFrame(index=[1], columns=['A']))

        result = DataFrame(True, index=[1], columns=['A']) & DataFrame(
            True, index=[1], columns=['A'])
        assert_frame_equal(result, DataFrame(True, index=[1], columns=['A']))

        result = DataFrame(True, index=[1], columns=['A']) | DataFrame(
            True, index=[1], columns=['A'])
        assert_frame_equal(result, DataFrame(True, index=[1], columns=['A']))

        # boolean ops
        result = DataFrame(1, index=[1], columns=['A']) | DataFrame(
            True, index=[1], columns=['A'])
        assert_frame_equal(result, DataFrame(1, index=[1], columns=['A']))

        def f():
            DataFrame(1.0, index=[1], columns=['A']) | DataFrame(
                True, index=[1], columns=['A'])
        pytest.raises(TypeError, f)

        def f():
            DataFrame('foo', index=[1], columns=['A']) | DataFrame(
                True, index=[1], columns=['A'])
        pytest.raises(TypeError, f)

    def test_operators_none_as_na(self):
        df = DataFrame({"col1": [2, 5.0, 123, None],
                        "col2": [1, 2, 3, 4]}, dtype=object)

        ops = [operator.add, operator.sub, operator.mul, operator.truediv]

        # since filling converts dtypes from object, changed expected to be
        # object
        for op in ops:
            filled = df.fillna(np.nan)
            result = op(df, 3)
            expected = op(filled, 3).astype(object)
            expected[com.isna(expected)] = None
            assert_frame_equal(result, expected)

            result = op(df, df)
            expected = op(filled, filled).astype(object)
            expected[com.isna(expected)] = None
            assert_frame_equal(result, expected)

            result = op(df, df.fillna(7))
            assert_frame_equal(result, expected)

            result = op(df.fillna(7), df)
            assert_frame_equal(result, expected, check_dtype=False)

    def test_comparison_invalid(self):

        def check(df, df2):

            for (x, y) in [(df, df2), (df2, df)]:
                pytest.raises(TypeError, lambda: x == y)
                pytest.raises(TypeError, lambda: x != y)
                pytest.raises(TypeError, lambda: x >= y)
                pytest.raises(TypeError, lambda: x > y)
                pytest.raises(TypeError, lambda: x < y)
                pytest.raises(TypeError, lambda: x <= y)

        # GH4968
        # invalid date/int comparisons
        df = DataFrame(np.random.randint(10, size=(10, 1)), columns=['a'])
        df['dates'] = date_range('20010101', periods=len(df))

        df2 = df.copy()
        df2['dates'] = df['a']
        check(df, df2)

        df = DataFrame(np.random.randint(10, size=(10, 2)), columns=['a', 'b'])
        df2 = DataFrame({'a': date_range('20010101', periods=len(
            df)), 'b': date_range('20100101', periods=len(df))})
        check(df, df2)

    def test_timestamp_compare(self):
        # make sure we can compare Timestamps on the right AND left hand side
        # GH4982
        df = DataFrame({'dates1': date_range('20010101', periods=10),
                        'dates2': date_range('20010102', periods=10),
                        'intcol': np.random.randint(1000000000, size=10),
                        'floatcol': np.random.randn(10),
                        'stringcol': list(tm.rands(10))})
        df.loc[np.random.rand(len(df)) > 0.5, 'dates2'] = pd.NaT
        ops = {'gt': 'lt', 'lt': 'gt', 'ge': 'le', 'le': 'ge', 'eq': 'eq',
               'ne': 'ne'}

        for left, right in ops.items():
            left_f = getattr(operator, left)
            right_f = getattr(operator, right)

            # no nats
            expected = left_f(df, Timestamp('20010109'))
            result = right_f(Timestamp('20010109'), df)
            assert_frame_equal(result, expected)

            # nats
            expected = left_f(df, Timestamp('nat'))
            result = right_f(Timestamp('nat'), df)
            assert_frame_equal(result, expected)

    def test_modulo(self):
        # GH3590, modulo as ints
        p = DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]})

        # this is technically wrong, as the integer portion is coerced to
        # float
        expected = DataFrame({'first': Series([0, 0, 0, 0], dtype='float64'),
                              'second': Series([np.nan, np.nan, np.nan, 0])})
        result = p % p
        assert_frame_equal(result, expected)

        # numpy has a slightly different (wrong) treatment
        with np.errstate(all='ignore'):
            arr = p.values % p.values
        result2 = DataFrame(arr, index=p.index,
                            columns=p.columns, dtype='float64')
        result2.iloc[0:3, 1] = np.nan
        assert_frame_equal(result2, expected)

        result = p % 0
        expected = DataFrame(np.nan, index=p.index, columns=p.columns)
        assert_frame_equal(result, expected)

        # numpy has a slightly different (wrong) treatment
        with np.errstate(all='ignore'):
            arr = p.values.astype('float64') % 0
        result2 = DataFrame(arr, index=p.index, columns=p.columns)
        assert_frame_equal(result2, expected)

        # not commutative with series
        p = DataFrame(np.random.randn(10, 5))
        s = p[0]
        res = s % p
        res2 = p % s
        assert not res.fillna(0).equals(res2.fillna(0))

    def test_div(self):

        # integer div, but deal with the 0's (GH 9144)
        p = DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]})
        result = p / p

        expected = DataFrame({'first': Series([1.0, 1.0, 1.0, 1.0]),
                              'second': Series([nan, nan, nan, 1])})
        assert_frame_equal(result, expected)

        with np.errstate(all='ignore'):
            arr = p.values.astype('float') / p.values
        result2 = DataFrame(arr, index=p.index,
                            columns=p.columns)
        assert_frame_equal(result2, expected)

        result = p / 0
        expected = DataFrame(np.inf, index=p.index, columns=p.columns)
        expected.iloc[0:3, 1] = nan
        assert_frame_equal(result, expected)

        # numpy has a slightly different (wrong) treatment
        with np.errstate(all='ignore'):
            arr = p.values.astype('float64') / 0
        result2 = DataFrame(arr, index=p.index,
                            columns=p.columns)
        assert_frame_equal(result2, expected)

        p = DataFrame(np.random.randn(10, 5))
        s = p[0]
        res = s / p
        res2 = p / s
        assert not res.fillna(0).equals(res2.fillna(0))

    def test_logical_operators(self):

        def _check_bin_op(op):
            result = op(df1, df2)
            expected = DataFrame(op(df1.values, df2.values), index=df1.index,
                                 columns=df1.columns)
            assert result.values.dtype == np.bool_
            assert_frame_equal(result, expected)

        def _check_unary_op(op):
            result = op(df1)
            expected = DataFrame(op(df1.values), index=df1.index,
                                 columns=df1.columns)
            assert result.values.dtype == np.bool_
            assert_frame_equal(result, expected)

        df1 = {'a': {'a': True, 'b': False, 'c': False, 'd': True, 'e': True},
               'b': {'a': False, 'b': True, 'c': False,
                     'd': False, 'e': False},
               'c': {'a': False, 'b': False, 'c': True,
                     'd': False, 'e': False},
               'd': {'a': True, 'b': False, 'c': False, 'd': True, 'e': True},
               'e': {'a': True, 'b': False, 'c': False, 'd': True, 'e': True}}

        df2 = {'a': {'a': True, 'b': False, 'c': True, 'd': False, 'e': False},
               'b': {'a': False, 'b': True, 'c': False,
                     'd': False, 'e': False},
               'c': {'a': True, 'b': False, 'c': True, 'd': False, 'e': False},
               'd': {'a': False, 'b': False, 'c': False,
                     'd': True, 'e': False},
               'e': {'a': False, 'b': False, 'c': False,
                     'd': False, 'e': True}}

        df1 = DataFrame(df1)
        df2 = DataFrame(df2)

        _check_bin_op(operator.and_)
        _check_bin_op(operator.or_)
        _check_bin_op(operator.xor)

        # operator.neg is deprecated in numpy >= 1.9
        _check_unary_op(operator.inv)

    @pytest.mark.parametrize('op,res', [('__eq__', False),
                                        ('__ne__', True)])
    def test_logical_typeerror_with_non_valid(self, op, res):
        # we are comparing floats vs a string
        result = getattr(self.frame, op)('foo')
        assert bool(result.all().all()) is res

    def test_logical_with_nas(self):
        d = DataFrame({'a': [np.nan, False], 'b': [True, True]})

        # GH4947
        # bool comparisons should return bool
        result = d['a'] | d['b']
        expected = Series([False, True])
        assert_series_equal(result, expected)

        # GH4604, automatic casting here
        result = d['a'].fillna(False) | d['b']
        expected = Series([True, True])
        assert_series_equal(result, expected)

        result = d['a'].fillna(False, downcast=False) | d['b']
        expected = Series([True, True])
        assert_series_equal(result, expected)

    def test_neg(self):
        # what to do?
        assert_frame_equal(-self.frame, -1 * self.frame)

    def test_invert(self):
        assert_frame_equal(-(self.frame < 0), ~(self.frame < 0))

    def test_arith_flex_frame(self):
        ops = ['add', 'sub', 'mul', 'div', 'truediv', 'pow', 'floordiv', 'mod']
        if not compat.PY3:
            aliases = {}
        else:
            aliases = {'div': 'truediv'}

        for op in ops:
            try:
                alias = aliases.get(op, op)
                f = getattr(operator, alias)
                result = getattr(self.frame, op)(2 * self.frame)
                exp = f(self.frame, 2 * self.frame)
                assert_frame_equal(result, exp)

                # vs mix float
                result = getattr(self.mixed_float, op)(2 * self.mixed_float)
                exp = f(self.mixed_float, 2 * self.mixed_float)
                assert_frame_equal(result, exp)
                _check_mixed_float(result, dtype=dict(C=None))

                # vs mix int
                if op in ['add', 'sub', 'mul']:
                    result = getattr(self.mixed_int, op)(2 + self.mixed_int)
                    exp = f(self.mixed_int, 2 + self.mixed_int)

                    # no overflow in the uint
                    dtype = None
                    if op in ['sub']:
                        dtype = dict(B='uint64', C=None)
                    elif op in ['add', 'mul']:
                        dtype = dict(C=None)
                    assert_frame_equal(result, exp)
                    _check_mixed_int(result, dtype=dtype)

                    # rops
                    r_f = lambda x, y: f(y, x)
                    result = getattr(self.frame, 'r' + op)(2 * self.frame)
                    exp = r_f(self.frame, 2 * self.frame)
                    assert_frame_equal(result, exp)

                    # vs mix float
                    result = getattr(self.mixed_float, op)(
                        2 * self.mixed_float)
                    exp = f(self.mixed_float, 2 * self.mixed_float)
                    assert_frame_equal(result, exp)
                    _check_mixed_float(result, dtype=dict(C=None))

                    result = getattr(self.intframe, op)(2 * self.intframe)
                    exp = f(self.intframe, 2 * self.intframe)
                    assert_frame_equal(result, exp)

                    # vs mix int
                    if op in ['add', 'sub', 'mul']:
                        result = getattr(self.mixed_int, op)(
                            2 + self.mixed_int)
                        exp = f(self.mixed_int, 2 + self.mixed_int)

                        # no overflow in the uint
                        dtype = None
                        if op in ['sub']:
                            dtype = dict(B='uint64', C=None)
                        elif op in ['add', 'mul']:
                            dtype = dict(C=None)
                        assert_frame_equal(result, exp)
                        _check_mixed_int(result, dtype=dtype)
            except Exception:
                printing.pprint_thing("Failing operation %r" % op)
                raise

            # ndim >= 3
            ndim_5 = np.ones(self.frame.shape + (3, 4, 5))
            msg = "Unable to coerce to Series/DataFrame"
            with tm.assert_raises_regex(ValueError, msg):
                f(self.frame, ndim_5)

            with tm.assert_raises_regex(ValueError, msg):
                getattr(self.frame, op)(ndim_5)

        # res_add = self.frame.add(self.frame)
        # res_sub = self.frame.sub(self.frame)
        # res_mul = self.frame.mul(self.frame)
        # res_div = self.frame.div(2 * self.frame)

        # assert_frame_equal(res_add, self.frame + self.frame)
        # assert_frame_equal(res_sub, self.frame - self.frame)
        # assert_frame_equal(res_mul, self.frame * self.frame)
        # assert_frame_equal(res_div, self.frame / (2 * self.frame))

        const_add = self.frame.add(1)
        assert_frame_equal(const_add, self.frame + 1)

        # corner cases
        result = self.frame.add(self.frame[:0])
        assert_frame_equal(result, self.frame * np.nan)

        result = self.frame[:0].add(self.frame)
        assert_frame_equal(result, self.frame * np.nan)
        with tm.assert_raises_regex(NotImplementedError, 'fill_value'):
            self.frame.add(self.frame.iloc[0], fill_value=3)
        with tm.assert_raises_regex(NotImplementedError, 'fill_value'):
            self.frame.add(self.frame.iloc[0], axis='index', fill_value=3)

    def test_binary_ops_align(self):

        # test aligning binary ops

        # GH 6681
        index = MultiIndex.from_product([list('abc'),
                                         ['one', 'two', 'three'],
                                         [1, 2, 3]],
                                        names=['first', 'second', 'third'])

        df = DataFrame(np.arange(27 * 3).reshape(27, 3),
                       index=index,
                       columns=['value1', 'value2', 'value3']).sort_index()

        idx = pd.IndexSlice
        for op in ['add', 'sub', 'mul', 'div', 'truediv']:
            opa = getattr(operator, op, None)
            if opa is None:
                continue

            x = Series([1.0, 10.0, 100.0], [1, 2, 3])
            result = getattr(df, op)(x, level='third', axis=0)

            expected = pd.concat([opa(df.loc[idx[:, :, i], :], v)
                                  for i, v in x.iteritems()]).sort_index()
            assert_frame_equal(result, expected)

            x = Series([1.0, 10.0], ['two', 'three'])
            result = getattr(df, op)(x, level='second', axis=0)

            expected = (pd.concat([opa(df.loc[idx[:, i], :], v)
                                   for i, v in x.iteritems()])
                        .reindex_like(df).sort_index())
            assert_frame_equal(result, expected)

        # GH9463 (alignment level of dataframe with series)

        midx = MultiIndex.from_product([['A', 'B'], ['a', 'b']])
        df = DataFrame(np.ones((2, 4), dtype='int64'), columns=midx)
        s = pd.Series({'a': 1, 'b': 2})

        df2 = df.copy()
        df2.columns.names = ['lvl0', 'lvl1']
        s2 = s.copy()
        s2.index.name = 'lvl1'

        # different cases of integer/string level names:
        res1 = df.mul(s, axis=1, level=1)
        res2 = df.mul(s2, axis=1, level=1)
        res3 = df2.mul(s, axis=1, level=1)
        res4 = df2.mul(s2, axis=1, level=1)
        res5 = df2.mul(s, axis=1, level='lvl1')
        res6 = df2.mul(s2, axis=1, level='lvl1')

        exp = DataFrame(np.array([[1, 2, 1, 2], [1, 2, 1, 2]], dtype='int64'),
                        columns=midx)

        for res in [res1, res2]:
            assert_frame_equal(res, exp)

        exp.columns.names = ['lvl0', 'lvl1']
        for res in [res3, res4, res5, res6]:
            assert_frame_equal(res, exp)

    def test_arith_mixed(self):

        left = DataFrame({'A': ['a', 'b', 'c'],
                          'B': [1, 2, 3]})

        result = left + left
        expected = DataFrame({'A': ['aa', 'bb', 'cc'],
                              'B': [2, 4, 6]})
        assert_frame_equal(result, expected)

    def test_arith_getitem_commute(self):
        df = DataFrame({'A': [1.1, 3.3], 'B': [2.5, -3.9]})

        self._test_op(df, operator.add)
        self._test_op(df, operator.sub)
        self._test_op(df, operator.mul)
        self._test_op(df, operator.truediv)
        self._test_op(df, operator.floordiv)
        self._test_op(df, operator.pow)

        self._test_op(df, lambda x, y: y + x)
        self._test_op(df, lambda x, y: y - x)
        self._test_op(df, lambda x, y: y * x)
        self._test_op(df, lambda x, y: y / x)
        self._test_op(df, lambda x, y: y ** x)

        self._test_op(df, lambda x, y: x + y)
        self._test_op(df, lambda x, y: x - y)
        self._test_op(df, lambda x, y: x * y)
        self._test_op(df, lambda x, y: x / y)
        self._test_op(df, lambda x, y: x ** y)

    @staticmethod
    def _test_op(df, op):
        result = op(df, 1)

        if not df.columns.is_unique:
            raise ValueError("Only unique columns supported by this test")

        for col in result.columns:
            assert_series_equal(result[col], op(df[col], 1))

    def test_bool_flex_frame(self):
        data = np.random.randn(5, 3)
        other_data = np.random.randn(5, 3)
        df = DataFrame(data)
        other = DataFrame(other_data)
        ndim_5 = np.ones(df.shape + (1, 3))

        # Unaligned
        def _check_unaligned_frame(meth, op, df, other):
            part_o = other.loc[3:, 1:].copy()
            rs = meth(part_o)
            xp = op(df, part_o.reindex(index=df.index, columns=df.columns))
            assert_frame_equal(rs, xp)

        # DataFrame
        assert df.eq(df).values.all()
        assert not df.ne(df).values.any()
        for op in ['eq', 'ne', 'gt', 'lt', 'ge', 'le']:
            f = getattr(df, op)
            o = getattr(operator, op)
            # No NAs
            assert_frame_equal(f(other), o(df, other))
            _check_unaligned_frame(f, o, df, other)
            # ndarray
            assert_frame_equal(f(other.values), o(df, other.values))
            # scalar
            assert_frame_equal(f(0), o(df, 0))
            # NAs
            msg = "Unable to coerce to Series/DataFrame"
            assert_frame_equal(f(np.nan), o(df, np.nan))
            with tm.assert_raises_regex(ValueError, msg):
                f(ndim_5)

        # Series
        def _test_seq(df, idx_ser, col_ser):
            idx_eq = df.eq(idx_ser, axis=0)
            col_eq = df.eq(col_ser)
            idx_ne = df.ne(idx_ser, axis=0)
            col_ne = df.ne(col_ser)
            assert_frame_equal(col_eq, df == Series(col_ser))
            assert_frame_equal(col_eq, -col_ne)
            assert_frame_equal(idx_eq, -idx_ne)
            assert_frame_equal(idx_eq, df.T.eq(idx_ser).T)
            assert_frame_equal(col_eq, df.eq(list(col_ser)))
            assert_frame_equal(idx_eq, df.eq(Series(idx_ser), axis=0))
            assert_frame_equal(idx_eq, df.eq(list(idx_ser), axis=0))

            idx_gt = df.gt(idx_ser, axis=0)
            col_gt = df.gt(col_ser)
            idx_le = df.le(idx_ser, axis=0)
            col_le = df.le(col_ser)

            assert_frame_equal(col_gt, df > Series(col_ser))
            assert_frame_equal(col_gt, -col_le)
            assert_frame_equal(idx_gt, -idx_le)
            assert_frame_equal(idx_gt, df.T.gt(idx_ser).T)

            idx_ge = df.ge(idx_ser, axis=0)
            col_ge = df.ge(col_ser)
            idx_lt = df.lt(idx_ser, axis=0)
            col_lt = df.lt(col_ser)
            assert_frame_equal(col_ge, df >= Series(col_ser))
            assert_frame_equal(col_ge, -col_lt)
            assert_frame_equal(idx_ge, -idx_lt)
            assert_frame_equal(idx_ge, df.T.ge(idx_ser).T)

        idx_ser = Series(np.random.randn(5))
        col_ser = Series(np.random.randn(3))
        _test_seq(df, idx_ser, col_ser)

        # list/tuple
        _test_seq(df, idx_ser.values, col_ser.values)

        # NA
        df.loc[0, 0] = np.nan
        rs = df.eq(df)
        assert not rs.loc[0, 0]
        rs = df.ne(df)
        assert rs.loc[0, 0]
        rs = df.gt(df)
        assert not rs.loc[0, 0]
        rs = df.lt(df)
        assert not rs.loc[0, 0]
        rs = df.ge(df)
        assert not rs.loc[0, 0]
        rs = df.le(df)
        assert not rs.loc[0, 0]

        # complex
        arr = np.array([np.nan, 1, 6, np.nan])
        arr2 = np.array([2j, np.nan, 7, None])
        df = DataFrame({'a': arr})
        df2 = DataFrame({'a': arr2})
        rs = df.gt(df2)
        assert not rs.values.any()
        rs = df.ne(df2)
        assert rs.values.all()

        arr3 = np.array([2j, np.nan, None])
        df3 = DataFrame({'a': arr3})
        rs = df3.gt(2j)
        assert not rs.values.any()

        # corner, dtype=object
        df1 = DataFrame({'col': ['foo', np.nan, 'bar']})
        df2 = DataFrame({'col': ['foo', datetime.now(), 'bar']})
        result = df1.ne(df2)
        exp = DataFrame({'col': [False, True, False]})
        assert_frame_equal(result, exp)

    def test_return_dtypes_bool_op_constant(self):
        # GH15077
        df = DataFrame({'x': [1, 2, 3], 'y': [1., 2., 3.]})
        const = 2

        # not empty DataFrame
        for op in ['eq', 'ne', 'gt', 'lt', 'ge', 'le']:
            result = getattr(df, op)(const).get_dtype_counts()
            tm.assert_series_equal(result, Series([2], ['bool']))

        # empty DataFrame
        empty = df.iloc[:0]
        for op in ['eq', 'ne', 'gt', 'lt', 'ge', 'le']:
            result = getattr(empty, op)(const).get_dtype_counts()
            tm.assert_series_equal(result, Series([2], ['bool']))

    def test_dti_tz_convert_to_utc(self):
        base = pd.DatetimeIndex(['2011-01-01', '2011-01-02',
                                 '2011-01-03'], tz='UTC')
        idx1 = base.tz_convert('Asia/Tokyo')[:2]
        idx2 = base.tz_convert('US/Eastern')[1:]

        df1 = DataFrame({'A': [1, 2]}, index=idx1)
        df2 = DataFrame({'A': [1, 1]}, index=idx2)
        exp = DataFrame({'A': [np.nan, 3, np.nan]}, index=base)
        assert_frame_equal(df1 + df2, exp)

    def test_arith_flex_series(self):
        df = self.simple

        row = df.xs('a')
        col = df['two']
        # after arithmetic refactor, add truediv here
        ops = ['add', 'sub', 'mul', 'mod']
        for op in ops:
            f = getattr(df, op)
            op = getattr(operator, op)
            assert_frame_equal(f(row), op(df, row))
            assert_frame_equal(f(col, axis=0), op(df.T, col).T)

        # special case for some reason
        assert_frame_equal(df.add(row, axis=None), df + row)

        # cases which will be refactored after big arithmetic refactor
        assert_frame_equal(df.div(row), df / row)
        assert_frame_equal(df.div(col, axis=0), (df.T / col).T)

        # broadcasting issue in GH7325
        df = DataFrame(np.arange(3 * 2).reshape((3, 2)), dtype='int64')
        expected = DataFrame([[np.nan, np.inf], [1.0, 1.5], [1.0, 1.25]])
        result = df.div(df[0], axis='index')
        assert_frame_equal(result, expected)

        df = DataFrame(np.arange(3 * 2).reshape((3, 2)), dtype='float64')
        expected = DataFrame([[np.nan, np.inf], [1.0, 1.5], [1.0, 1.25]])
        result = df.div(df[0], axis='index')
        assert_frame_equal(result, expected)

    def test_arith_non_pandas_object(self):
        df = self.simple

        val1 = df.xs('a').values
        added = DataFrame(df.values + val1, index=df.index, columns=df.columns)
        assert_frame_equal(df + val1, added)

        added = DataFrame((df.values.T + val1).T,
                          index=df.index, columns=df.columns)
        assert_frame_equal(df.add(val1, axis=0), added)

        val2 = list(df['two'])

        added = DataFrame(df.values + val2, index=df.index, columns=df.columns)
        assert_frame_equal(df + val2, added)

        added = DataFrame((df.values.T + val2).T, index=df.index,
                          columns=df.columns)
        assert_frame_equal(df.add(val2, axis='index'), added)

        val3 = np.random.rand(*df.shape)
        added = DataFrame(df.values + val3, index=df.index, columns=df.columns)
        assert_frame_equal(df.add(val3), added)

    @pytest.mark.parametrize('values', [[1, 2], (1, 2), np.array([1, 2]),
                                        range(1, 3), deque([1, 2])])
    def test_arith_alignment_non_pandas_object(self, values):
        # GH 17901
        df = DataFrame({'A': [1, 1], 'B': [1, 1]})
        expected = DataFrame({'A': [2, 2], 'B': [3, 3]})
        result = df + values
        assert_frame_equal(result, expected)

    def test_combineFrame(self):
        frame_copy = self.frame.reindex(self.frame.index[::2])

        del frame_copy['D']
        frame_copy.loc[frame_copy.index[:5], 'C'] = np.nan

        added = self.frame + frame_copy

        indexer = added['A'].dropna().index
        exp = (self.frame['A'] * 2).copy()

        tm.assert_series_equal(added['A'].dropna(), exp.loc[indexer])

        exp.loc[~exp.index.isin(indexer)] = np.nan
        tm.assert_series_equal(added['A'], exp.loc[added['A'].index])

        assert np.isnan(added['C'].reindex(frame_copy.index)[:5]).all()

        assert np.isnan(added['D']).all()

        self_added = self.frame + self.frame
        tm.assert_index_equal(self_added.index, self.frame.index)

        added_rev = frame_copy + self.frame
        assert np.isnan(added['D']).all()
        assert np.isnan(added_rev['D']).all()

        # corner cases

        # empty
        plus_empty = self.frame + self.empty
        assert np.isnan(plus_empty.values).all()

        empty_plus = self.empty + self.frame
        assert np.isnan(empty_plus.values).all()

        empty_empty = self.empty + self.empty
        assert empty_empty.empty

        # out of order
        reverse = self.frame.reindex(columns=self.frame.columns[::-1])

        assert_frame_equal(reverse + self.frame, self.frame * 2)

        # mix vs float64, upcast
        added = self.frame + self.mixed_float
        _check_mixed_float(added, dtype='float64')
        added = self.mixed_float + self.frame
        _check_mixed_float(added, dtype='float64')

        # mix vs mix
        added = self.mixed_float + self.mixed_float2
        _check_mixed_float(added, dtype=dict(C=None))
        added = self.mixed_float2 + self.mixed_float
        _check_mixed_float(added, dtype=dict(C=None))

        # with int
        added = self.frame + self.mixed_int
        _check_mixed_float(added, dtype='float64')

    def test_combineSeries(self):

        # Series
        series = self.frame.xs(self.frame.index[0])

        added = self.frame + series

        for key, s in compat.iteritems(added):
            assert_series_equal(s, self.frame[key] + series[key])

        larger_series = series.to_dict()
        larger_series['E'] = 1
        larger_series = Series(larger_series)
        larger_added = self.frame + larger_series

        for key, s in compat.iteritems(self.frame):
            assert_series_equal(larger_added[key], s + series[key])
        assert 'E' in larger_added
        assert np.isnan(larger_added['E']).all()

        # no upcast needed
        added = self.mixed_float + series
        _check_mixed_float(added)

        # vs mix (upcast) as needed
        added = self.mixed_float + series.astype('float32')
        _check_mixed_float(added, dtype=dict(C=None))
        added = self.mixed_float + series.astype('float16')
        _check_mixed_float(added, dtype=dict(C=None))

        # these raise with numexpr, as we are adding an int64 to a
        # uint64, which is weird

        # added = self.mixed_int + (100*series).astype('int64')
        # _check_mixed_int(added, dtype = dict(A = 'int64', B = 'float64', C =
        # 'int64', D = 'int64'))
        # added = self.mixed_int + (100*series).astype('int32')
        # _check_mixed_int(added, dtype = dict(A = 'int32', B = 'float64', C =
        # 'int32', D = 'int64'))

        # TimeSeries
        ts = self.tsframe['A']

        # 10890
        # we no longer allow automatic timeseries broadcasting
        # and require explicit broadcasting
        added = self.tsframe.add(ts, axis='index')

        for key, col in compat.iteritems(self.tsframe):
            result = col + ts
            assert_series_equal(added[key], result, check_names=False)
            assert added[key].name == key
            if col.name == ts.name:
                assert result.name == 'A'
            else:
                assert result.name is None

        smaller_frame = self.tsframe[:-5]
        smaller_added = smaller_frame.add(ts, axis='index')

        tm.assert_index_equal(smaller_added.index, self.tsframe.index)

        smaller_ts = ts[:-5]
        smaller_added2 = self.tsframe.add(smaller_ts, axis='index')
        assert_frame_equal(smaller_added, smaller_added2)

        # length 0, result is all-nan
        result = self.tsframe.add(ts[:0], axis='index')
        expected = DataFrame(np.nan, index=self.tsframe.index,
                             columns=self.tsframe.columns)
        assert_frame_equal(result, expected)

        # Frame is all-nan
        result = self.tsframe[:0].add(ts, axis='index')
        expected = DataFrame(np.nan, index=self.tsframe.index,
                             columns=self.tsframe.columns)
        assert_frame_equal(result, expected)

        # empty but with non-empty index
        frame = self.tsframe[:1].reindex(columns=[])
        result = frame.mul(ts, axis='index')
        assert len(result) == len(ts)

    def test_combineFunc(self):
        result = self.frame * 2
        tm.assert_numpy_array_equal(result.values, self.frame.values * 2)

        # vs mix
        result = self.mixed_float * 2
        for c, s in compat.iteritems(result):
            tm.assert_numpy_array_equal(
                s.values, self.mixed_float[c].values * 2)
        _check_mixed_float(result, dtype=dict(C=None))

        result = self.empty * 2
        assert result.index is self.empty.index
        assert len(result.columns) == 0

    def test_comparisons(self):
        df1 = tm.makeTimeDataFrame()
        df2 = tm.makeTimeDataFrame()

        row = self.simple.xs('a')
        ndim_5 = np.ones(df1.shape + (1, 1, 1))

        def test_comp(func):
            result = func(df1, df2)
            tm.assert_numpy_array_equal(result.values,
                                        func(df1.values, df2.values))
            with tm.assert_raises_regex(ValueError,
                                        'Wrong number of dimensions'):
                func(df1, ndim_5)

            result2 = func(self.simple, row)
            tm.assert_numpy_array_equal(result2.values,
                                        func(self.simple.values, row.values))

            result3 = func(self.frame, 0)
            tm.assert_numpy_array_equal(result3.values,
                                        func(self.frame.values, 0))

            with tm.assert_raises_regex(ValueError,
                                        'Can only compare identically'
                                        '-labeled DataFrame'):
                func(self.simple, self.simple[:2])

        test_comp(operator.eq)
        test_comp(operator.ne)
        test_comp(operator.lt)
        test_comp(operator.gt)
        test_comp(operator.ge)
        test_comp(operator.le)

    def test_comparison_protected_from_errstate(self):
        missing_df = tm.makeDataFrame()
        missing_df.loc[missing_df.index[0], 'A'] = np.nan
        with np.errstate(invalid='ignore'):
            expected = missing_df.values < 0
        with np.errstate(invalid='raise'):
            result = (missing_df < 0).values
        tm.assert_numpy_array_equal(result, expected)

    def test_string_comparison(self):
        df = DataFrame([{"a": 1, "b": "foo"}, {"a": 2, "b": "bar"}])
        mask_a = df.a > 1
        assert_frame_equal(df[mask_a], df.loc[1:1, :])
        assert_frame_equal(df[-mask_a], df.loc[0:0, :])

        mask_b = df.b == "foo"
        assert_frame_equal(df[mask_b], df.loc[0:0, :])
        assert_frame_equal(df[-mask_b], df.loc[1:1, :])

    def test_float_none_comparison(self):
        df = DataFrame(np.random.randn(8, 3), index=lrange(8),
                       columns=['A', 'B', 'C'])

        pytest.raises(TypeError, df.__eq__, None)

    def test_boolean_comparison(self):

        # GH 4576
        # boolean comparisons with a tuple/list give unexpected results
        df = DataFrame(np.arange(6).reshape((3, 2)))
        b = np.array([2, 2])
        b_r = np.atleast_2d([2, 2])
        b_c = b_r.T
        lst = [2, 2, 2]
        tup = tuple(lst)

        # gt
        expected = DataFrame([[False, False], [False, True], [True, True]])
        result = df > b
        assert_frame_equal(result, expected)

        result = df.values > b
        assert_numpy_array_equal(result, expected.values)

        result = df > lst
        assert_frame_equal(result, expected)

        result = df > tup
        assert_frame_equal(result, expected)

        result = df > b_r
        assert_frame_equal(result, expected)

        result = df.values > b_r
        assert_numpy_array_equal(result, expected.values)

        pytest.raises(ValueError, df.__gt__, b_c)
        pytest.raises(ValueError, df.values.__gt__, b_c)

        # ==
        expected = DataFrame([[False, False], [True, False], [False, False]])
        result = df == b
        assert_frame_equal(result, expected)

        result = df == lst
        assert_frame_equal(result, expected)

        result = df == tup
        assert_frame_equal(result, expected)

        result = df == b_r
        assert_frame_equal(result, expected)

        result = df.values == b_r
        assert_numpy_array_equal(result, expected.values)

        pytest.raises(ValueError, lambda: df == b_c)
        assert df.values.shape != b_c.shape

        # with alignment
        df = DataFrame(np.arange(6).reshape((3, 2)),
                       columns=list('AB'), index=list('abc'))
        expected.index = df.index
        expected.columns = df.columns

        result = df == lst
        assert_frame_equal(result, expected)

        result = df == tup
        assert_frame_equal(result, expected)

    def test_boolean_comparison_error(self):

        # GH 4576
        # boolean comparisons with a tuple/list give unexpected results
        df = DataFrame(np.arange(6).reshape((3, 2)))

        # not shape compatible
        pytest.raises(ValueError, lambda: df == (2, 2))
        pytest.raises(ValueError, lambda: df == [2, 2])

    def test_combine_generic(self):
        df1 = self.frame
        df2 = self.frame.loc[self.frame.index[:-5], ['A', 'B', 'C']]

        combined = df1.combine(df2, np.add)
        combined2 = df2.combine(df1, np.add)
        assert combined['D'].isna().all()
        assert combined2['D'].isna().all()

        chunk = combined.loc[combined.index[:-5], ['A', 'B', 'C']]
        chunk2 = combined2.loc[combined2.index[:-5], ['A', 'B', 'C']]

        exp = self.frame.loc[self.frame.index[:-5],
                             ['A', 'B', 'C']].reindex_like(chunk) * 2
        assert_frame_equal(chunk, exp)
        assert_frame_equal(chunk2, exp)

    def test_inplace_ops_alignment(self):

        # inplace ops / ops alignment
        # GH 8511

        columns = list('abcdefg')
        X_orig = DataFrame(np.arange(10 * len(columns))
                           .reshape(-1, len(columns)),
                           columns=columns, index=range(10))
        Z = 100 * X_orig.iloc[:, 1:-1].copy()
        block1 = list('bedcf')
        subs = list('bcdef')

        # add
        X = X_orig.copy()
        result1 = (X[block1] + Z).reindex(columns=subs)

        X[block1] += Z
        result2 = X.reindex(columns=subs)

        X = X_orig.copy()
        result3 = (X[block1] + Z[block1]).reindex(columns=subs)

        X[block1] += Z[block1]
        result4 = X.reindex(columns=subs)

        assert_frame_equal(result1, result2)
        assert_frame_equal(result1, result3)
        assert_frame_equal(result1, result4)

        # sub
        X = X_orig.copy()
        result1 = (X[block1] - Z).reindex(columns=subs)

        X[block1] -= Z
        result2 = X.reindex(columns=subs)

        X = X_orig.copy()
        result3 = (X[block1] - Z[block1]).reindex(columns=subs)

        X[block1] -= Z[block1]
        result4 = X.reindex(columns=subs)

        assert_frame_equal(result1, result2)
        assert_frame_equal(result1, result3)
        assert_frame_equal(result1, result4)

    def test_inplace_ops_identity(self):

        # GH 5104
        # make sure that we are actually changing the object
        s_orig = Series([1, 2, 3])
        df_orig = DataFrame(np.random.randint(0, 5, size=10).reshape(-1, 5))

        # no dtype change
        s = s_orig.copy()
        s2 = s
        s += 1
        assert_series_equal(s, s2)
        assert_series_equal(s_orig + 1, s)
        assert s is s2
        assert s._data is s2._data

        df = df_orig.copy()
        df2 = df
        df += 1
        assert_frame_equal(df, df2)
        assert_frame_equal(df_orig + 1, df)
        assert df is df2
        assert df._data is df2._data

        # dtype change
        s = s_orig.copy()
        s2 = s
        s += 1.5
        assert_series_equal(s, s2)
        assert_series_equal(s_orig + 1.5, s)

        df = df_orig.copy()
        df2 = df
        df += 1.5
        assert_frame_equal(df, df2)
        assert_frame_equal(df_orig + 1.5, df)
        assert df is df2
        assert df._data is df2._data

        # mixed dtype
        arr = np.random.randint(0, 10, size=5)
        df_orig = DataFrame({'A': arr.copy(), 'B': 'foo'})
        df = df_orig.copy()
        df2 = df
        df['A'] += 1
        expected = DataFrame({'A': arr.copy() + 1, 'B': 'foo'})
        assert_frame_equal(df, expected)
        assert_frame_equal(df2, expected)
        assert df._data is df2._data

        df = df_orig.copy()
        df2 = df
        df['A'] += 1.5
        expected = DataFrame({'A': arr.copy() + 1.5, 'B': 'foo'})
        assert_frame_equal(df, expected)
        assert_frame_equal(df2, expected)
        assert df._data is df2._data

    @pytest.mark.parametrize('op', ['add', 'and', 'div', 'floordiv', 'mod',
                                    'mul', 'or', 'pow', 'sub', 'truediv',
                                    'xor'])
    def test_inplace_ops_identity2(self, op):

        if compat.PY3 and op == 'div':
            return

        df = DataFrame({'a': [1., 2., 3.],
                        'b': [1, 2, 3]})

        operand = 2
        if op in ('and', 'or', 'xor'):
            # cannot use floats for boolean ops
            df['a'] = [True, False, True]

        df_copy = df.copy()
        iop = '__i{}__'.format(op)
        op = '__{}__'.format(op)

        # no id change and value is correct
        orig_id = id(df)
        getattr(df, iop)(operand)
        expected = getattr(df_copy, op)(operand)
        assert_frame_equal(df, expected)
        assert id(df) == orig_id

    def test_alignment_non_pandas(self):
        index = ['A', 'B', 'C']
        columns = ['X', 'Y', 'Z']
        df = pd.DataFrame(np.random.randn(3, 3), index=index, columns=columns)

        align = pd.core.ops._align_method_FRAME
        for val in [[1, 2, 3], (1, 2, 3), np.array([1, 2, 3], dtype=np.int64),
                    range(1, 4)]:

            tm.assert_series_equal(align(df, val, 'index'),
                                   Series([1, 2, 3], index=df.index))
            tm.assert_series_equal(align(df, val, 'columns'),
                                   Series([1, 2, 3], index=df.columns))

        # length mismatch
        msg = 'Unable to coerce to Series, length must be 3: given 2'
        for val in [[1, 2], (1, 2), np.array([1, 2]), range(1, 3)]:

            with tm.assert_raises_regex(ValueError, msg):
                align(df, val, 'index')

            with tm.assert_raises_regex(ValueError, msg):
                align(df, val, 'columns')

        val = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
        tm.assert_frame_equal(align(df, val, 'index'),
                              DataFrame(val, index=df.index,
                                        columns=df.columns))
        tm.assert_frame_equal(align(df, val, 'columns'),
                              DataFrame(val, index=df.index,
                                        columns=df.columns))

        # shape mismatch
        msg = 'Unable to coerce to DataFrame, shape must be'
        val = np.array([[1, 2, 3], [4, 5, 6]])
        with tm.assert_raises_regex(ValueError, msg):
            align(df, val, 'index')

        with tm.assert_raises_regex(ValueError, msg):
            align(df, val, 'columns')

        val = np.zeros((3, 3, 3))
        with pytest.raises(ValueError):
            align(df, val, 'index')
        with pytest.raises(ValueError):
            align(df, val, 'columns')
Example #51
def test_slice_locs_indexerror():
    times = [
        datetime(2000, 1, 1) + timedelta(minutes=i * 10) for i in range(100000)
    ]
    s = Series(lrange(100000), times)
    # should not raise even though both slice bounds fall outside the index
    s.loc[datetime(1900, 1, 1):datetime(2100, 1, 1)]
Example #52
    def _parse_excel(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
                     index_col=None, has_index_names=None, parse_cols=None,
                     parse_dates=False, date_parser=None, na_values=None,
                     thousands=None, chunksize=None, convert_float=True,
                     **kwds):
        import xlrd
        from xlrd import (xldate, XL_CELL_DATE,
                          XL_CELL_ERROR, XL_CELL_BOOLEAN,
                          XL_CELL_NUMBER)

        epoch1904 = self.book.datemode

        # xlrd >= 0.9.3 can return datetime objects directly.
        if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"):
            xlrd_0_9_3 = True
        else:
            xlrd_0_9_3 = False

        if isinstance(sheetname, compat.string_types):
            sheet = self.book.sheet_by_name(sheetname)
        else:  # assume an integer if not a string
            sheet = self.book.sheet_by_index(sheetname)

        data = []
        should_parse = {}
        for i in range(sheet.nrows):
            row = []
            for j, (value, typ) in enumerate(zip(sheet.row_values(i),
                                                 sheet.row_types(i))):
                if parse_cols is not None and j not in should_parse:
                    should_parse[j] = self._should_parse(j, parse_cols)

                if parse_cols is None or should_parse[j]:
                    if typ == XL_CELL_DATE:
                        if xlrd_0_9_3:
                            # Use the newer xlrd datetime handling.
                            value = xldate.xldate_as_datetime(value, epoch1904)

                            # Excel doesn't distinguish between dates and times,
                            # so we treat dates on the epoch as times only.
                            # Also, Excel supports 1900 and 1904 epochs.
                            year = (value.timetuple())[0:3]
                            if ((not epoch1904 and year == (1899, 12, 31))
                                    or (epoch1904 and year == (1904, 1, 1))):
                                value = datetime.time(value.hour,
                                                      value.minute,
                                                      value.second,
                                                      value.microsecond)
                        else:
                            # Use the xlrd <= 0.9.2 date handling.
                            dt = xldate.xldate_as_tuple(value, epoch1904)

                            if dt[0] < datetime.MINYEAR:
                                value = datetime.time(*dt[3:])
                            else:
                                value = datetime.datetime(*dt)

                    elif typ == XL_CELL_ERROR:
                        value = np.nan
                    elif typ == XL_CELL_BOOLEAN:
                        value = bool(value)
                    elif convert_float and typ == XL_CELL_NUMBER:
                        # GH5394 - Excel 'numbers' are always floats
                        # it's a minimal perf hit and less surprising
                        val = int(value)
                        if val == value:
                            value = val

                    row.append(value)

            data.append(row)

        if header is not None:
            data[header] = _trim_excel_header(data[header])

        parser = TextParser(data, header=header, index_col=index_col,
                            has_index_names=has_index_names,
                            na_values=na_values,
                            thousands=thousands,
                            parse_dates=parse_dates,
                            date_parser=date_parser,
                            skiprows=skiprows,
                            skip_footer=skip_footer,
                            chunksize=chunksize,
                            **kwds)

        return parser.read()
Example #53
    def __iter__(self):
        # the generator simply ends when the loop is exhausted (PEP 479
        # forbids raising StopIteration inside a generator body)
        for i in range(len(self)):
            yield self._get_val_at(i)
Example #54
def _bin(x, width):
    "return int(x) as a base2 string of given width"
    return ''.join(str((x >> i) & 1) for i in range(width - 1, -1, -1))
Example #55
    def _parse_excel(self,
                     sheetname,
                     header=0,
                     skiprows=None,
                     skip_footer=0,
                     index_col=None,
                     has_index_names=None,
                     parse_cols=None,
                     parse_dates=False,
                     date_parser=None,
                     na_values=None,
                     thousands=None,
                     chunksize=None,
                     **kwds):
        from xlrd import (xldate_as_tuple, XL_CELL_DATE, XL_CELL_ERROR,
                          XL_CELL_BOOLEAN)

        datemode = self.book.datemode
        if isinstance(sheetname, compat.string_types):
            sheet = self.book.sheet_by_name(sheetname)
        else:  # assume an integer if not a string
            sheet = self.book.sheet_by_index(sheetname)

        data = []
        should_parse = {}
        for i in range(sheet.nrows):
            row = []
            for j, (value, typ) in enumerate(
                    zip(sheet.row_values(i), sheet.row_types(i))):
                if parse_cols is not None and j not in should_parse:
                    should_parse[j] = self._should_parse(j, parse_cols)

                if parse_cols is None or should_parse[j]:
                    if typ == XL_CELL_DATE:
                        dt = xldate_as_tuple(value, datemode)
                        # how to produce this first case?
                        if dt[0] < datetime.MINYEAR:  # pragma: no cover
                            value = datetime.time(*dt[3:])
                        else:
                            value = datetime.datetime(*dt)
                    elif typ == XL_CELL_ERROR:
                        value = np.nan
                    elif typ == XL_CELL_BOOLEAN:
                        value = bool(value)
                    row.append(value)

            data.append(row)

        if header is not None:
            data[header] = _trim_excel_header(data[header])

        parser = TextParser(data,
                            header=header,
                            index_col=index_col,
                            has_index_names=has_index_names,
                            na_values=na_values,
                            thousands=thousands,
                            parse_dates=parse_dates,
                            date_parser=date_parser,
                            skiprows=skiprows,
                            skip_footer=skip_footer,
                            chunksize=chunksize,
                            **kwds)

        return parser.read()
Example #56
def pivot_table(data,
                values=None,
                index=None,
                columns=None,
                aggfunc='mean',
                fill_value=None,
                margins=False,
                dropna=True):
    """
    Create a spreadsheet-style pivot table as a DataFrame. The levels in the
    pivot table will be stored in MultiIndex objects (hierarchical indexes) on
    the index and columns of the result DataFrame.

    Parameters
    ----------
    data : DataFrame
    values : column to aggregate, optional
    index : a column, Grouper, array of the same length as data, or list of them.
        Keys to group by on the pivot table index.
        If an array is passed, it is used in the same manner as column values.
    columns : a column, Grouper, array of the same length as data, or list of them.
        Keys to group by on the pivot table columns.
        If an array is passed, it is used in the same manner as column values.
    aggfunc : function, default numpy.mean, or list of functions
        If list of functions passed, the resulting pivot table will have
        hierarchical columns whose top level are the function names (inferred
        from the function objects themselves)
    fill_value : scalar, default None
        Value to replace missing values with
    margins : boolean, default False
        Add all row / column margins (e.g. for subtotals / grand totals)
    dropna : boolean, default True
        Do not include columns whose entries are all NaN
    rows : kwarg only alias of index [deprecated]
    cols : kwarg only alias of columns [deprecated]

    Examples
    --------
    >>> df
       A   B   C      D
    0  foo one small  1
    1  foo one large  2
    2  foo one large  2
    3  foo two small  3
    4  foo two small  3
    5  bar one large  4
    6  bar one small  5
    7  bar two small  6
    8  bar two large  7

    >>> table = pivot_table(df, values='D', index=['A', 'B'],
    ...                     columns=['C'], aggfunc=np.sum)
    >>> table
              small  large
    foo  one  1      4
         two  6      NaN
    bar  one  5      4
         two  6      7

    Returns
    -------
    table : DataFrame
    """
    index = _convert_by(index)
    columns = _convert_by(columns)

    if isinstance(aggfunc, list):
        pieces = []
        keys = []
        for func in aggfunc:
            table = pivot_table(data,
                                values=values,
                                index=index,
                                columns=columns,
                                fill_value=fill_value,
                                aggfunc=func,
                                margins=margins)
            pieces.append(table)
            keys.append(func.__name__)
        return concat(pieces, keys=keys, axis=1)

    keys = index + columns

    values_passed = values is not None
    if values_passed:
        if isinstance(values, (list, tuple)):
            values_multi = True
        else:
            values_multi = False
            values = [values]
    else:
        values = list(data.columns.drop(keys))

    if values_passed:
        to_filter = []
        for x in keys + values:
            if isinstance(x, Grouper):
                x = x.key
            try:
                if x in data:
                    to_filter.append(x)
            except TypeError:
                pass
        if len(to_filter) < len(data.columns):
            data = data[to_filter]

    grouped = data.groupby(keys)
    agged = grouped.agg(aggfunc)

    table = agged
    if table.index.nlevels > 1:
        to_unstack = [
            agged.index.names[i] for i in range(len(index), len(keys))
        ]
        table = agged.unstack(to_unstack)

    if not dropna:
        try:
            m = MultiIndex.from_arrays(cartesian_product(table.index.levels))
            table = table.reindex_axis(m, axis=0)
        except AttributeError:
            pass  # it's a single level

        try:
            m = MultiIndex.from_arrays(cartesian_product(table.columns.levels))
            table = table.reindex_axis(m, axis=1)
        except AttributeError:
            pass  # it's a single level or a series

    if isinstance(table, DataFrame):
        if isinstance(table.columns, MultiIndex):
            table = table.sortlevel(axis=1)
        else:
            table = table.sort_index(axis=1)

    if fill_value is not None:
        table = table.fillna(value=fill_value, downcast='infer')

    if margins:
        table = _add_margins(table,
                             data,
                             values,
                             rows=index,
                             cols=columns,
                             aggfunc=aggfunc)

    # discard the top level
    if values_passed and not values_multi:
        table = table[values[0]]

    if len(index) == 0 and len(columns) > 0:
        table = table.T

    return table
Example #57
def test_to_frame():
    tuples = [(1, 'one'), (1, 'two'), (2, 'one'), (2, 'two')]

    index = MultiIndex.from_tuples(tuples)
    result = index.to_frame(index=False)
    expected = DataFrame(tuples)
    tm.assert_frame_equal(result, expected)

    result = index.to_frame()
    expected.index = index
    tm.assert_frame_equal(result, expected)

    tuples = [(1, 'one'), (1, 'two'), (2, 'one'), (2, 'two')]
    index = MultiIndex.from_tuples(tuples, names=['first', 'second'])
    result = index.to_frame(index=False)
    expected = DataFrame(tuples)
    expected.columns = ['first', 'second']
    tm.assert_frame_equal(result, expected)

    result = index.to_frame()
    expected.index = index
    tm.assert_frame_equal(result, expected)

    # See GH-22580
    index = MultiIndex.from_tuples(tuples)
    result = index.to_frame(index=False, name=['first', 'second'])
    expected = DataFrame(tuples)
    expected.columns = ['first', 'second']
    tm.assert_frame_equal(result, expected)

    result = index.to_frame(name=['first', 'second'])
    expected.index = index
    expected.columns = ['first', 'second']
    tm.assert_frame_equal(result, expected)

    msg = "'name' must be a list / sequence of column names."
    with tm.assert_raises_regex(TypeError, msg):
        index.to_frame(name='first')

    msg = "'name' should have same length as number of levels on index."
    with tm.assert_raises_regex(ValueError, msg):
        index.to_frame(name=['first'])

    # Tests for datetime index
    index = MultiIndex.from_product(
        [range(5), pd.date_range('20130101', periods=3)])
    result = index.to_frame(index=False)
    expected = DataFrame({
        0: np.repeat(np.arange(5, dtype='int64'), 3),
        1: np.tile(pd.date_range('20130101', periods=3), 5)
    })
    tm.assert_frame_equal(result, expected)

    result = index.to_frame()
    expected.index = index
    tm.assert_frame_equal(result, expected)

    # See GH-22580
    result = index.to_frame(index=False, name=['first', 'second'])
    expected = DataFrame({
        'first':
        np.repeat(np.arange(5, dtype='int64'), 3),
        'second':
        np.tile(pd.date_range('20130101', periods=3), 5)
    })
    tm.assert_frame_equal(result, expected)

    result = index.to_frame(name=['first', 'second'])
    expected.index = index
    tm.assert_frame_equal(result, expected)
Example #58
def get_data_yahoo_actions(symbol,
                           start=None,
                           end=None,
                           retry_count=3,
                           pause=0.001):
    """
  Returns DataFrame of historical corporate actions (dividends and stock
  splits) from symbols, over date range, start to end. All dates in the
  resulting DataFrame correspond with dividend and stock split ex-dates.

  Parameters
  ----------
    sym : string with a single Single stock symbol (ticker).
    start : string, (defaults to '1/1/2010')
        Starting date, timestamp. Parses many different kind of date
        representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980')
    end : string, (defaults to today)
        Ending date, timestamp. Same format as starting date.
    retry_count : int, default 3
        Number of times to retry query request.
    pause : int, default 0
        Time, in seconds, of the pause between retries.
  """

    start, end = _sanitize_dates(start, end)
    url = (_HISTORICAL_YAHOO_ACTIONS_URL + 's=%s' % symbol + '&a=%s' %
           (start.month - 1) + '&b=%s' % start.day + '&c=%s' % start.year +
           '&d=%s' % (end.month - 1) + '&e=%s' % end.day + '&f=%s' % end.year +
           '&g=v')

    for _ in range(retry_count):
        time.sleep(pause)

        try:
            with urlopen(url) as resp:
                lines = resp.read()
        except _network_error_classes:
            pass
        else:
            actions_index = []
            actions_entries = []

            for line in csv.reader(StringIO(bytes_to_str(lines))):
                # Ignore lines that aren't dividends or splits (Yahoo
                # adds a bunch of irrelevant fields).
                if len(line) != 3 or line[0] not in ('DIVIDEND', 'SPLIT'):
                    continue

                action, date, value = line
                if action == 'DIVIDEND':
                    actions_index.append(to_datetime(date))
                    actions_entries.append({
                        'action': action,
                        'value': float(value)
                    })
                elif action == 'SPLIT' and ':' in value:
                    # Convert the split ratio to a fraction. For example a
                    # 4:1 split expressed as a fraction is 1/4 = 0.25.
                    denominator, numerator = value.split(':', 1)
                    split_fraction = float(numerator) / float(denominator)

                    actions_index.append(to_datetime(date))
                    actions_entries.append({
                        'action': action,
                        'value': split_fraction
                    })

            return DataFrame(actions_entries, index=actions_index)

    raise IOError("after %d tries, Yahoo! did not "
                  "return a 200 for url %r" % (retry_count, url))
Example #59
def _unstack_multiple(data, clocs):
    from pandas.core.groupby import decons_obs_group_ids

    if len(clocs) == 0:
        return data

    # NOTE: This doesn't deal with hierarchical columns yet

    index = data.index

    clocs = [index._get_level_number(i) for i in clocs]

    rlocs = [i for i in range(index.nlevels) if i not in clocs]

    clevels = [index.levels[i] for i in clocs]
    clabels = [index.labels[i] for i in clocs]
    cnames = [index.names[i] for i in clocs]
    rlevels = [index.levels[i] for i in rlocs]
    rlabels = [index.labels[i] for i in rlocs]
    rnames = [index.names[i] for i in rlocs]

    shape = [len(x) for x in clevels]
    group_index = get_group_index(clabels, shape, sort=False, xnull=False)

    comp_ids, obs_ids = _compress_group_index(group_index, sort=False)
    recons_labels = decons_obs_group_ids(comp_ids, obs_ids, shape, clabels,
                                         xnull=False)

    dummy_index = MultiIndex(levels=rlevels + [obs_ids],
                             labels=rlabels + [comp_ids],
                             names=rnames + ['__placeholder__'],
                             verify_integrity=False)

    if isinstance(data, Series):
        dummy = Series(data.values, index=dummy_index)
        unstacked = dummy.unstack('__placeholder__')
        new_levels = clevels
        new_names = cnames
        new_labels = recons_labels
    else:
        if isinstance(data.columns, MultiIndex):
            result = data
            for i in range(len(clocs)):
                val = clocs[i]
                result = result.unstack(val)
                clocs = [v if i > v else v - 1 for v in clocs]

            return result

        dummy = DataFrame(data.values, index=dummy_index,
                          columns=data.columns)

        unstacked = dummy.unstack('__placeholder__')
        if isinstance(unstacked, Series):
            unstcols = unstacked.index
        else:
            unstcols = unstacked.columns
        new_levels = [unstcols.levels[0]] + clevels
        new_names = [data.columns.name] + cnames

        new_labels = [unstcols.labels[0]]
        for rec in recons_labels:
            new_labels.append(rec.take(unstcols.labels[-1]))

    new_columns = MultiIndex(levels=new_levels, labels=new_labels,
                             names=new_names, verify_integrity=False)

    if isinstance(unstacked, Series):
        unstacked.index = new_columns
    else:
        unstacked.columns = new_columns

    return unstacked
Example #60
        def _do_test(df,
                     r_dtype=None,
                     c_dtype=None,
                     rnlvl=None,
                     cnlvl=None,
                     dupe_col=False):

            kwargs = dict(parse_dates=False)
            if cnlvl:
                if rnlvl is not None:
                    kwargs['index_col'] = lrange(rnlvl)
                kwargs['header'] = lrange(cnlvl)
                with ensure_clean('__tmp_to_csv_moar__') as path:
                    df.to_csv(path,
                              encoding='utf8',
                              chunksize=chunksize,
                              tupleize_cols=False)
                    recons = DataFrame.from_csv(path,
                                                tupleize_cols=False,
                                                **kwargs)
            else:
                kwargs['header'] = 0
                with ensure_clean('__tmp_to_csv_moar__') as path:
                    df.to_csv(path, encoding='utf8', chunksize=chunksize)
                    recons = DataFrame.from_csv(path, **kwargs)

            def _to_uni(x):
                if not isinstance(x, compat.text_type):
                    return x.decode('utf8')
                return x

            if dupe_col:
                # read_csv disambiguates duplicate columns by
                # labeling them dupe.1, dupe.2, etc. Monkey-patch the columns.
                recons.columns = df.columns
            if rnlvl and not cnlvl:
                delta_lvl = [
                    recons.iloc[:, i].values for i in range(rnlvl - 1)
                ]
                ix = MultiIndex.from_arrays([list(recons.index)] + delta_lvl)
                recons.index = ix
                recons = recons.iloc[:, rnlvl - 1:]

            type_map = dict(i='i', f='f', s='O', u='O', dt='O', p='O')
            if r_dtype:
                if r_dtype == 'u':  # unicode
                    r_dtype = 'O'
                    recons.index = np.array(lmap(_to_uni, recons.index),
                                            dtype=r_dtype)
                    df.index = np.array(lmap(_to_uni, df.index), dtype=r_dtype)
                elif r_dtype == 'dt':  # datetime
                    r_dtype = 'O'
                    recons.index = np.array(lmap(Timestamp, recons.index),
                                            dtype=r_dtype)
                    df.index = np.array(lmap(Timestamp, df.index),
                                        dtype=r_dtype)
                elif r_dtype == 'p':
                    r_dtype = 'O'
                    recons.index = np.array(list(
                        map(Timestamp, to_datetime(recons.index))),
                                            dtype=r_dtype)
                    df.index = np.array(list(
                        map(Timestamp, df.index.to_timestamp())),
                                        dtype=r_dtype)
                else:
                    r_dtype = type_map.get(r_dtype)
                    recons.index = np.array(recons.index, dtype=r_dtype)
                    df.index = np.array(df.index, dtype=r_dtype)
            if c_dtype:
                if c_dtype == 'u':
                    c_dtype = 'O'
                    recons.columns = np.array(lmap(_to_uni, recons.columns),
                                              dtype=c_dtype)
                    df.columns = np.array(lmap(_to_uni, df.columns),
                                          dtype=c_dtype)
                elif c_dtype == 'dt':
                    c_dtype = 'O'
                    recons.columns = np.array(lmap(Timestamp, recons.columns),
                                              dtype=c_dtype)
                    df.columns = np.array(lmap(Timestamp, df.columns),
                                          dtype=c_dtype)
                elif c_dtype == 'p':
                    c_dtype = 'O'
                    recons.columns = np.array(lmap(Timestamp,
                                                   to_datetime(
                                                       recons.columns)),
                                              dtype=c_dtype)
                    df.columns = np.array(lmap(Timestamp,
                                               df.columns.to_timestamp()),
                                          dtype=c_dtype)
                else:
                    c_dtype = type_map.get(c_dtype)
                    recons.columns = np.array(recons.columns, dtype=c_dtype)
                    df.columns = np.array(df.columns, dtype=c_dtype)

            assert_frame_equal(df,
                               recons,
                               check_names=False,
                               check_less_precise=True)