Example #1
    def test_replace_str_to_str_chain(self):
        a = np.arange(1, 5)
        astr = a.astype(str)
        bstr = np.arange(2, 6).astype(str)
        df = DataFrame({'a': astr})
        with tm.assertRaisesRegexp(ValueError, "Replacement not allowed .+"):
            df.replace({'a': dict(zip(astr, bstr))})
Example #2
    def test_time(self):
        import matplotlib.pyplot as plt

        plt.close("all")

        t = datetime(1, 1, 1, 3, 30, 0)
        deltas = np.random.randint(1, 20, 3).cumsum()
        ts = np.array([(t + timedelta(minutes=int(x))).time() for x in deltas])
        df = DataFrame({"a": np.random.randn(len(ts)), "b": np.random.randn(len(ts))}, index=ts)
        ax = df.plot()

        # verify tick labels
        ticks = ax.get_xticks()
        labels = ax.get_xticklabels()
        for t, l in zip(ticks, labels):
            m, s = divmod(int(t), 60)
            h, m = divmod(m, 60)
            xp = l.get_text()
            if len(xp) > 0:
                rs = time(h, m, s).strftime("%H:%M:%S")
                self.assertEqual(xp, rs)

        # change xlim
        ax.set_xlim("1:30", "5:00")

        # check tick labels again
        ticks = ax.get_xticks()
        labels = ax.get_xticklabels()
        for t, l in zip(ticks, labels):
            m, s = divmod(int(t), 60)
            h, m = divmod(m, 60)
            xp = l.get_text()
            if len(xp) > 0:
                rs = time(h, m, s).strftime("%H:%M:%S")
                self.assertEqual(xp, rs)
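
The tick values iterated over in both loops above are interpreted here as seconds since midnight; the double divmod splits them into hour/minute/second components before formatting. A minimal standalone sketch of that conversion (12600 is just an illustrative tick value):

from datetime import time

tick = 12600                   # hypothetical tick value, in seconds since midnight
m, s = divmod(int(tick), 60)   # seconds -> (minutes, seconds)
h, m = divmod(m, 60)           # minutes -> (hours, minutes)
assert time(h, m, s).strftime("%H:%M:%S") == "03:30:00"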
Example #3
    def _check_colors(self, collections, linecolors=None, facecolors=None,
                      mapping=None):
        """
        Check that each artist has the expected line colors and face colors

        Parameters
        ----------
        collections : list-like
            list or collection of target artists
        linecolors : list-like which has the same length as collections
            list of expected line colors
        facecolors : list-like which has the same length as collections
            list of expected face colors
        mapping : Series
            Series used for color grouping key
            used for the andrews_curves, parallel_coordinates and radviz tests
        """

        from matplotlib.lines import Line2D
        from matplotlib.collections import Collection, PolyCollection
        conv = self.colorconverter
        if linecolors is not None:

            if mapping is not None:
                linecolors = self._get_colors_mapped(mapping, linecolors)
                linecolors = linecolors[:len(collections)]

            self.assertEqual(len(collections), len(linecolors))
            for patch, color in zip(collections, linecolors):
                if isinstance(patch, Line2D):
                    result = patch.get_color()
                    # Line2D may contain a string color expression
                    result = conv.to_rgba(result)
                elif isinstance(patch, PolyCollection):
                    result = tuple(patch.get_edgecolor()[0])
                else:
                    result = patch.get_edgecolor()

                expected = conv.to_rgba(color)
                self.assertEqual(result, expected)

        if facecolors is not None:

            if mapping is not None:
                facecolors = self._get_colors_mapped(mapping, facecolors)
                facecolors = facecolors[:len(collections)]

            self.assertEqual(len(collections), len(facecolors))
            for patch, color in zip(collections, facecolors):
                if isinstance(patch, Collection):
                    # returned as list of np.array
                    result = patch.get_facecolor()[0]
                else:
                    result = patch.get_facecolor()

                if isinstance(result, np.ndarray):
                    result = tuple(result)

                expected = conv.to_rgba(color)
                self.assertEqual(result, expected)
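
The helper above only compares colors after normalizing both sides through conv.to_rgba, so named colors, hex strings and RGB(A) tuples all collapse to the same 4-tuple. A small standalone illustration using matplotlib's ColorConverter (the color values are arbitrary):

from matplotlib.colors import ColorConverter

conv = ColorConverter()
# a named color, a hex string and an RGB tuple normalize to the same RGBA tuple
assert conv.to_rgba('r') == (1.0, 0.0, 0.0, 1.0)
assert conv.to_rgba('#ff0000') == (1.0, 0.0, 0.0, 1.0)
assert conv.to_rgba((1.0, 0.0, 0.0)) == (1.0, 0.0, 0.0, 1.0)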
Example #4
def create_data():
    """ create the pickle/msgpack data """

    data = {
        'A': [0., 1., 2., 3., np.nan],
        'B': [0, 1, 0, 1, 0],
        'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
        'D': date_range('1/1/2009', periods=5),
        'E': [0., 1, Timestamp('20100101'), 'foo', 2.]
    }

    index = dict(int=Index(np.arange(10)),
                 date=date_range('20130101', periods=10),
                 period=period_range('2013-01-01', freq='M', periods=10))

    mi = dict(reg2=MultiIndex.from_tuples(tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
                                                      ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']])),
                                          names=['first', 'second']))
    series = dict(float=Series(data['A']),
                  int=Series(data['B']),
                  mixed=Series(data['E']),
                  ts=TimeSeries(np.arange(10).astype(np.int64), index=date_range('20130101',periods=10)),
                  mi=Series(np.arange(5).astype(np.float64),
                            index=MultiIndex.from_tuples(tuple(zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])),
                                                         names=['one', 'two'])),
                  dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A']),
                  cat=Series(Categorical(['foo', 'bar', 'baz'])),
                  per=Series([Period('2000Q1')] * 5))

    mixed_dup_df = DataFrame(data)
    mixed_dup_df.columns = list("ABCDA")
    frame = dict(float=DataFrame(dict(A=series['float'], B=series['float'] + 1)),
                 int=DataFrame(dict(A=series['int'], B=series['int'] + 1)),
                 mixed=DataFrame(dict([(k, data[k]) for k in ['A', 'B', 'C', 'D']])),
                 mi=DataFrame(dict(A=np.arange(5).astype(np.float64), B=np.arange(5).astype(np.int64)),
                              index=MultiIndex.from_tuples(tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'baz'],
                                                                       ['one', 'two', 'one', 'two', 'three']])),
                                                           names=['first', 'second'])),
                 dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
                               columns=['A', 'B', 'A']),
                 cat_onecol=DataFrame(dict(A=Categorical(['foo', 'bar']))),
                 cat_and_float=DataFrame(dict(A=Categorical(['foo', 'bar', 'baz']),
                                              B=np.arange(3).astype(np.int64))),
                 mixed_dup=mixed_dup_df)

    mixed_dup_panel = Panel(dict(ItemA=frame['float'], ItemB=frame['int']))
    mixed_dup_panel.items = ['ItemA', 'ItemA']
    panel = dict(float=Panel(dict(ItemA=frame['float'], ItemB=frame['float'] + 1)),
                 dup=Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64),
                           items=['A', 'B', 'A']),
                 mixed_dup=mixed_dup_panel)

    return dict(series=series,
                frame=frame,
                panel=panel,
                index=index,
                mi=mi,
                sp_series=dict(float=_create_sp_series(),
                               ts=_create_sp_tsseries()),
                sp_frame=dict(float=_create_sp_frame()))
Example #5
    def test_time(self):
        t = datetime(1, 1, 1, 3, 30, 0)
        deltas = np.random.randint(1, 20, 3).cumsum()
        ts = np.array([(t + timedelta(minutes=int(x))).time() for x in deltas])
        df = DataFrame({'a': np.random.randn(len(ts)),
                        'b': np.random.randn(len(ts))},
                       index=ts)
        ax = df.plot()

        # verify tick labels
        ticks = ax.get_xticks()
        labels = ax.get_xticklabels()
        for t, l in zip(ticks, labels):
            m, s = divmod(int(t), 60)
            h, m = divmod(m, 60)
            xp = l.get_text()
            if len(xp) > 0:
                rs = time(h, m, s).strftime('%H:%M:%S')
                self.assertEqual(xp, rs)

        # change xlim
        ax.set_xlim('1:30', '5:00')

        # check tick labels again
        ticks = ax.get_xticks()
        labels = ax.get_xticklabels()
        for t, l in zip(ticks, labels):
            m, s = divmod(int(t), 60)
            h, m = divmod(m, 60)
            xp = l.get_text()
            if len(xp) > 0:
                rs = time(h, m, s).strftime('%H:%M:%S')
                self.assertEqual(xp, rs)
Example #6
def _range_from_fields(year=None, month=None, quarter=None, day=None,
                       hour=None, minute=None, second=None, freq=None):
    if hour is None:
        hour = 0
    if minute is None:
        minute = 0
    if second is None:
        second = 0
    if day is None:
        day = 1

    ordinals = []

    if quarter is not None:
        if freq is None:
            freq = 'Q'
            base = frequencies.FreqGroup.FR_QTR
        else:
            base, mult = frequencies.get_freq_code(freq)
            if base != frequencies.FreqGroup.FR_QTR:
                raise AssertionError("base must equal FR_QTR")

        year, quarter = _make_field_arrays(year, quarter)
        for y, q in compat.zip(year, quarter):
            y, m = libperiod.quarter_to_myear(y, q, freq)
            val = libperiod.period_ordinal(y, m, 1, 1, 1, 1, 0, 0, base)
            ordinals.append(val)
    else:
        base, mult = frequencies.get_freq_code(freq)
        arrays = _make_field_arrays(year, month, day, hour, minute, second)
        for y, mth, d, h, mn, s in compat.zip(*arrays):
            ordinals.append(libperiod.period_ordinal(
                y, mth, d, h, mn, s, 0, 0, base))

    return np.array(ordinals, dtype=np.int64), freq
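
For context, _range_from_fields is the machinery behind building period ranges from separate field arrays. A short sketch of the equivalent public-facing construction, assuming an older pandas in which PeriodIndex still accepts field arrays directly (the years and quarters are made up):

import pandas as pd

# quarterly periods described by separate year/quarter arrays
pi = pd.PeriodIndex(year=[2013, 2013, 2014], quarter=[3, 4, 1], freq='Q')
# -> the periods 2013Q3, 2013Q4, 2014Q1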
Example #7
def create_data():
    """ create the pickle/msgpack data """

    data = {
        'A': [0., 1., 2., 3., np.nan],
        'B': [0, 1, 0, 1, 0],
        'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
        'D': date_range('1/1/2009', periods=5),
        'E': [0., 1, Timestamp('20100101'), 'foo', 2.]
    }

    scalars = dict(timestamp=Timestamp('20130101'))
    if LooseVersion(pandas.__version__) >= '0.17.0':
        scalars['period'] = Period('2012','M')

    index = dict(int=Index(np.arange(10)),
                 date=date_range('20130101', periods=10),
                 period=period_range('2013-01-01', freq='M', periods=10))

    mi = dict(reg2=MultiIndex.from_tuples(tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
                                                      ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']])),
                                          names=['first', 'second']))
    series = dict(float=Series(data['A']),
                  int=Series(data['B']),
                  mixed=Series(data['E']),
                  ts=Series(np.arange(10).astype(np.int64), index=date_range('20130101',periods=10)),
                  mi=Series(np.arange(5).astype(np.float64),
                            index=MultiIndex.from_tuples(tuple(zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])),
                                                         names=['one', 'two'])),
                  dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A']),
                  cat=Series(Categorical(['foo', 'bar', 'baz'])))
    if LooseVersion(pandas.__version__) >= '0.17.0':
        series['period'] = Series([Period('2000Q1')] * 5)
Example #8
def df2dbiter(dataframe, table_class, harmonize_cols_first=True, harmonize_rows_first=True,
              parse_dates=None):
    """
        Returns a generator of ORM model instances (reflecting the rows of the database table
        mapped by table_class) from the given dataframe. The returned generator can be used in
        loops and each element can be e.g. added to the database by means of SQLAlchemy's
        `session.add` method.
        NOTE: Only the dataframe columns whose names match the table_class columns will be used.
        Therefore, it is safe to append to dataframe any column whose name is not in table_class
        columns. Some iterations might return None according to the parameters (see below)
        :param table_class: the CLASS of the table whose rows are instantiated and returned
        :param dataframe: the input dataframe
        :param harmonize_cols_first: if True (default when missing), the dataframe column types
        are harmonized to reflect the table_class column types, and columns without a match
        in table_class are filtered out. See `harmonize_columns(dataframe)`. If
        `harmonize_cols_first` and `harmonize_rows_first` are both True, the harmonization is
        executed in that order (first columns, then rows).
        :param harmonize_rows_first: if True (default when missing), NA values are checked for
        those table columns which have the property nullable set to False. In this case, the
        generator **might return None** (the user should handle that case, e.g. by skipping
        the model instances which are not writable to the db). If
        `harmonize_cols_first` and `harmonize_rows_first` are both True, the harmonization is
        executed in that order (first columns, then rows).
        :param parse_dates: a list of strings denoting additional column names whose values should
        be parsed as dates. Ignored if harmonize_cols_first is False
    """
    if harmonize_cols_first:
        colnames, dataframe = _harmonize_columns(table_class, dataframe, parse_dates)
    else:
        colnames = list(shared_colnames(table_class, dataframe))
#         table_col_names = get_col_names(table_class)
#         colnames = [c for c in dataframe.columns if c in table_col_names]  # FIXME: optimize this?

    new_df = dataframe[colnames]

    if dataframe.empty:
        for _ in range(len(dataframe)):
            yield None
        return

    valid_rows = cycle([True])
    if harmonize_rows_first:
        non_nullable_cols = list(shared_colnames(table_class, new_df, nullable=False))
        if non_nullable_cols:
            valid_rows = new_df[non_nullable_cols].notnull().all(axis=1).values
#             df = ~pd_isnull(new_df[non_nullable_cols])
#             valid_rows = df.apply(lambda row: row.all(), axis=1).values

    cols, datalist = _insert_data(new_df)
    # Note below: datalist is an array of N columns, each of M rows (it would be nicer to return an
    # array of N rows, each of them representing a table row. But we do not want to touch pandas
    # code.
    # See _insert_table below). Thus we zip it:
    for is_ok, row_values in zip(valid_rows, zip(*datalist)):
        if is_ok:
            # we could make a single line statement, but two lines are more readable:
            row_args_dict = dict(zip(cols, row_values))
            yield table_class(**row_args_dict)
        else:
            yield None
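
A self-contained sketch of the two core steps above (masking rows that have nulls in non-nullable columns, then zipping per-column data back into per-row dicts), using a toy DataFrame and hypothetical column names; no database is involved:

import pandas as pd

df = pd.DataFrame({'id': [1, 2, 3], 'name': ['a', None, 'c'], 'extra': [9, 9, 9]})
non_nullable_cols = ['name']    # hypothetical NOT NULL columns
valid_rows = df[non_nullable_cols].notnull().all(axis=1).values

cols = list(df.columns)
datalist = [df[c].values for c in cols]   # one array per column, mirroring datalist above
for is_ok, row_values in zip(valid_rows, zip(*datalist)):
    row = dict(zip(cols, row_values)) if is_ok else None
    print(row)   # None for the row whose non-nullable column is null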
Example #9
def create_data():
    """ create the pickle data """

    import numpy as np
    import pandas
    from pandas import (Series,TimeSeries,DataFrame,Panel,
                        SparseSeries,SparseTimeSeries,SparseDataFrame,SparsePanel,
                        Index,MultiIndex,PeriodIndex,
                        date_range,period_range,bdate_range,Timestamp)
    nan = np.nan

    data = {
        'A': [0., 1., 2., 3., np.nan],
        'B': [0, 1, 0, 1, 0],
        'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
        'D': date_range('1/1/2009', periods=5),
        'E' : [0., 1, Timestamp('20100101'),'foo',2.],
        }

    index = dict(int = Index(np.arange(10)),
                 date = date_range('20130101',periods=10),
                 period = period_range('2013-01-01', freq='M', periods=10))

    mi = dict(reg2 = MultiIndex.from_tuples(tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
                                                      ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']])),
                                                 names=['first', 'second']))
    series = dict(float = Series(data['A']),
                  int = Series(data['B']),
                  mixed = Series(data['E']),
                  ts = TimeSeries(np.arange(10).astype(np.int64),index=date_range('20130101',periods=10)),
                  mi = Series(np.arange(5).astype(np.float64),index=MultiIndex.from_tuples(tuple(zip(*[[1,1,2,2,2],
                                                                                                    [3,4,3,4,5]])),
                                                                                           names=['one','two'])),
                  dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A']))

    frame = dict(float = DataFrame(dict(A = series['float'], B = series['float'] + 1)),
                 int = DataFrame(dict(A = series['int']  , B = series['int']   + 1)),
                 mixed = DataFrame(dict([ (k,data[k]) for k in ['A','B','C','D']])),
                 mi = DataFrame(dict(A = np.arange(5).astype(np.float64), B = np.arange(5).astype(np.int64)),
                                index=MultiIndex.from_tuples(tuple(zip(*[['bar','bar','baz','baz','baz'],
                                                                       ['one','two','one','two','three']])),
                                                             names=['first','second'])),
                 dup = DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
                                 columns=['A', 'B', 'A']))
    panel = dict(float = Panel(dict(ItemA = frame['float'], ItemB = frame['float']+1)),
                 dup = Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64),
                             items=['A', 'B', 'A']))



    return dict( series = series,
                 frame = frame,
                 panel = panel,
                 index = index,
                 mi = mi,
                 sp_series = dict(float = _create_sp_series(),
                                  ts = _create_sp_tsseries()),
                 sp_frame = dict(float = _create_sp_frame())
                 )
Example #10
def _align_core(terms):
    term_index = [i for i, term in enumerate(terms)
                  if hasattr(term.value, 'axes')]
    term_dims = [terms[i].value.ndim for i in term_index]
    ndims = pd.Series(dict(zip(term_index, term_dims)))

    # initial axes are the axes of the largest-axis'd term
    biggest = terms[ndims.idxmax()].value
    typ = biggest._constructor
    axes = biggest.axes
    naxes = len(axes)
    gt_than_one_axis = naxes > 1

    for value in (terms[i].value for i in term_index):
        is_series = isinstance(value, pd.Series)
        is_series_and_gt_one_axis = is_series and gt_than_one_axis

        for axis, items in enumerate(value.axes):
            if is_series_and_gt_one_axis:
                ax, itm = naxes - 1, value.index
            else:
                ax, itm = axis, items

            if not axes[ax].is_(itm):
                axes[ax] = axes[ax].join(itm, how='outer')

    for i, ndim in compat.iteritems(ndims):
        for axis, items in zip(range(ndim), axes):
            ti = terms[i].value

            if hasattr(ti, 'reindex_axis'):
                transpose = isinstance(ti, pd.Series) and naxes > 1
                reindexer = axes[naxes - 1] if transpose else items

                term_axis_size = len(ti.axes[axis])
                reindexer_size = len(reindexer)

                ordm = np.log10(abs(reindexer_size - term_axis_size))
                if ordm >= 1 and reindexer_size >= 10000:
                    warnings.warn('Alignment difference on axis {0} is larger '
                                  'than an order of magnitude on term {1!r}, '
                                  'by more than {2:.4g}; performance may '
                                  'suffer'.format(axis, terms[i].name, ordm),
                                  category=pd.io.common.PerformanceWarning,
                                  stacklevel=6)

                if transpose:
                    f = partial(ti.reindex, index=reindexer, copy=False)
                else:
                    f = partial(ti.reindex_axis, reindexer, axis=axis,
                                copy=False)

                terms[i].update(f())

        terms[i].update(terms[i].value.values)

    return typ, _zip_axes_from_type(typ, axes)
Example #11
def group_mean(lat, lon, data):
    indexer = np.lexsort((lon, lat))
    lat = lat.take(indexer)
    lon = lon.take(indexer)
    sorted_data = data.take(indexer)

    keys = 1000 * lat + lon
    unique_keys = np.unique(keys)

    result = ndi.mean(sorted_data, labels=keys, index=unique_keys)
    decoder = keys.searchsorted(unique_keys)

    return dict(zip(zip(lat.take(decoder), lon.take(decoder)), result))
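
A runnable toy version of the pattern above: lexsort by (lat, lon), build composite integer keys, aggregate per key with scipy.ndimage, then use searchsorted to recover one (lat, lon) pair per unique key. The coordinates and data are made up:

import numpy as np
from scipy import ndimage as ndi

lat = np.array([10, 10, 20, 20])
lon = np.array([5, 5, 7, 7])
data = np.array([1.0, 3.0, 10.0, 20.0])

indexer = np.lexsort((lon, lat))
lat, lon, sorted_data = lat.take(indexer), lon.take(indexer), data.take(indexer)

keys = 1000 * lat + lon                   # composite key per point
unique_keys = np.unique(keys)
result = ndi.mean(sorted_data, labels=keys, index=unique_keys)
decoder = keys.searchsorted(unique_keys)  # first position of each unique key
print(dict(zip(zip(lat.take(decoder), lon.take(decoder)), result)))
# means keyed by (lat, lon): 2.0 for (10, 5) and 15.0 for (20, 7)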
Example #12
def sequence_grids(layer_grids):
    """Go through the list of layer girds and perform the same thing as sequence_layers.

    Parameters:
    -----------
    layer_grids: a list of two dimensional layer grids
    """
    for grid1, grid2 in zip(layer_grids[:-1], layer_grids[1:]):
        for row1, row2 in zip(grid1, grid2):
            for layer1, layer2 in zip(row1, row2):
                if layer2.data is None:
                    layer2.data = layer1.data
                merge_aes(layer1, layer2)
    return layer_grids
Example #13
    def test_constructors(self, closed, name):
        left, right = Index([0, 1, 2, 3]), Index([1, 2, 3, 4])
        ivs = [Interval(l, r, closed=closed) for l, r in zip(left, right)]
        expected = IntervalIndex._simple_new(
            left=left, right=right, closed=closed, name=name)

        result = IntervalIndex(ivs, name=name)
        tm.assert_index_equal(result, expected)

        result = IntervalIndex.from_intervals(ivs, name=name)
        tm.assert_index_equal(result, expected)

        result = IntervalIndex.from_breaks(
            np.arange(5), closed=closed, name=name)
        tm.assert_index_equal(result, expected)

        result = IntervalIndex.from_arrays(
            left.values, right.values, closed=closed, name=name)
        tm.assert_index_equal(result, expected)

        result = IntervalIndex.from_tuples(
            zip(left, right), closed=closed, name=name)
        tm.assert_index_equal(result, expected)

        result = Index(ivs, name=name)
        assert isinstance(result, IntervalIndex)
        tm.assert_index_equal(result, expected)

        # idempotent
        tm.assert_index_equal(Index(expected), expected)
        tm.assert_index_equal(IntervalIndex(expected), expected)

        result = IntervalIndex.from_intervals(
            expected.values, name=expected.name)
        tm.assert_index_equal(result, expected)

        left, right = expected.left, expected.right
        result = IntervalIndex.from_arrays(
            left, right, closed=expected.closed, name=expected.name)
        tm.assert_index_equal(result, expected)

        result = IntervalIndex.from_tuples(
            expected.to_tuples(), closed=expected.closed, name=expected.name)
        tm.assert_index_equal(result, expected)

        breaks = expected.left.tolist() + [expected.right[-1]]
        result = IntervalIndex.from_breaks(
            breaks, closed=expected.closed, name=expected.name)
        tm.assert_index_equal(result, expected)
Example #14
    def insert(self):
        ins = self.insert_statement()
        data_list = []
        # to avoid if check for every row
        keys = self.frame.columns
        if self.index is not None:
            for t in self.frame.itertuples():
                data = dict((k, self.maybe_asscalar(v)) for k, v in zip(keys, t[1:]))
                data[self.index] = self.maybe_asscalar(t[0])
                data_list.append(data)
        else:
            for t in self.frame.itertuples():
                data = dict((k, self.maybe_asscalar(v)) for k, v in zip(keys, t[1:]))
                data_list.append(data)
        self.pd_sql.execute(ins, data_list)
Example #15
def groupby1(lat, lon, data):
    indexer = np.lexsort((lon, lat))
    lat = lat.take(indexer)
    lon = lon.take(indexer)
    sorted_data = data.take(indexer)

    keys = 1000.0 * lat + lon
    unique_keys = np.unique(keys)
    bounds = keys.searchsorted(unique_keys)

    result = group_agg(sorted_data, bounds, lambda x: x.mean())

    decoder = keys.searchsorted(unique_keys)

    return dict(zip(zip(lat.take(decoder), lon.take(decoder)), result))
Example #16
    def _combineFrame(self, other, func, axis=0):
        index, columns = self._get_plane_axes(axis)
        axis = self._get_axis_number(axis)

        other = other.reindex(index=index, columns=columns)

        if axis == 0:
            new_values = func(self.values, other.values)
        elif axis == 1:
            new_values = func(self.values.swapaxes(0, 1), other.values.T)
            new_values = new_values.swapaxes(0, 1)
        elif axis == 2:
            new_values = func(self.values.swapaxes(0, 2), other.values)
            new_values = new_values.swapaxes(0, 2)

        # TODO: make faster!
        new_frames = {}
        for item, item_slice in zip(self.items, new_values):
            old_frame = self[item]
            ofv = old_frame.default_fill_value
            ok = old_frame.default_kind
            new_frames[item] = SparseDataFrame(item_slice,
                                               index=self.major_axis,
                                               columns=self.minor_axis,
                                               default_fill_value=ofv,
                                               default_kind=ok)

        return self._new_like(new_frames)
Example #17
    def _create_table_statement(self):
        "Return a CREATE TABLE statement to suit the contents of a DataFrame."

        columns = list(map(str, self.frame.columns))
        pat = re.compile(r'\s+')
        if any(map(pat.search, columns)):
            warnings.warn(_SAFE_NAMES_WARNING)
        column_types = [self._sql_type_name(typ) for typ in self.frame.dtypes]

        if self.index is not None:
            for i, idx_label in enumerate(self.index[::-1]):
                columns.insert(0, idx_label)
                column_types.insert(0, self._sql_type_name(self.frame.index.get_level_values(i).dtype))

        flv = self.pd_sql.flavor

        br_l = _SQL_SYMB[flv]['br_l']  # left val quote char
        br_r = _SQL_SYMB[flv]['br_r']  # right val quote char

        col_template = br_l + '%s' + br_r + ' %s'

        columns = ',\n  '.join(col_template %
                               x for x in zip(columns, column_types))
        template = """CREATE TABLE %(name)s (
                      %(columns)s
                      )"""
        create_statement = template % {'name': self.name, 'columns': columns}
        return create_statement
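
The statement above is assembled in two steps: each (name, type) pair is wrapped in flavor-specific quote characters, then the pairs are joined into the CREATE TABLE template. A toy standalone version with made-up column names, types and quote characters:

columns = ['index', 'city name', 'value']    # hypothetical column names
column_types = ['INTEGER', 'TEXT', 'REAL']   # hypothetical SQL type names
br_l, br_r = '[', ']'                        # e.g. bracket-style identifier quoting

col_template = br_l + '%s' + br_r + ' %s'
cols = ',\n  '.join(col_template % x for x in zip(columns, column_types))
template = """CREATE TABLE %(name)s (
  %(columns)s
  )"""
print(template % {'name': 'demo_table', 'columns': cols})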
Example #18
    def _init_data(self, data, copy, dtype, **kwargs):
        """
        Generate ND initialization; axes are passed
        as required objects to __init__
        """
        if data is None:
            data = {}
        if dtype is not None:
            dtype = self._validate_dtype(dtype)

        passed_axes = [kwargs.get(a) for a in self._AXIS_ORDERS]
        axes = None
        if isinstance(data, BlockManager):
            if any(x is not None for x in passed_axes):
                axes = [x if x is not None else y
                        for x, y in zip(passed_axes, data.axes)]
            mgr = data
        elif isinstance(data, dict):
            mgr = self._init_dict(data, passed_axes, dtype=dtype)
            copy = False
            dtype = None
        elif isinstance(data, (np.ndarray, list)):
            mgr = self._init_matrix(data, passed_axes, dtype=dtype, copy=copy)
            copy = False
            dtype = None
        else:  # pragma: no cover
            raise PandasError('Panel constructor not properly called!')

        NDFrame.__init__(self, mgr, axes=axes, copy=copy, dtype=dtype)
Example #19
    def __unicode__(self):
        """
        Return a string representation for a particular Panel

        Invoked by unicode(df) in py2 only.
        Yields a Unicode String in both py2/py3.
        """

        class_name = str(self.__class__)

        shape = self.shape
        dims = u('Dimensions: %s') % ' x '.join(
            ["%d (%s)" % (s, a) for a, s in zip(self._AXIS_ORDERS, shape)])

        def axis_pretty(a):
            v = getattr(self, a)
            if len(v) > 0:
                return u('%s axis: %s to %s') % (a.capitalize(),
                                                 com.pprint_thing(v[0]),
                                                 com.pprint_thing(v[-1]))
            else:
                return u('%s axis: None') % a.capitalize()

        output = '\n'.join(
            [class_name, dims] + [axis_pretty(a) for a in self._AXIS_ORDERS])
        return output
Example #20
def _get_join_indexers(left_keys, right_keys, sort=False, how='inner'):
    """

    Parameters
    ----------

    Returns
    -------

    """
    from functools import partial

    assert len(left_keys) == len(right_keys), \
        'left_keys and right_keys must be the same length'

    # bind `sort` arg. of _factorize_keys
    fkeys = partial(_factorize_keys, sort=sort)

    # get left & right join labels and num. of levels at each location
    llab, rlab, shape = map(list, zip(* map(fkeys, left_keys, right_keys)))

    # get flat i8 keys from label lists
    lkey, rkey = _get_join_keys(llab, rlab, shape, sort)

    # factorize keys to a dense i8 space
    # `count` is the num. of unique keys
    # set(lkey) | set(rkey) == range(count)
    lkey, rkey, count = fkeys(lkey, rkey)

    # preserve left frame order if how == 'left' and sort == False
    kwargs = {'sort': sort} if how == 'left' else {}
    join_func = _join_functions[how]
    return join_func(lkey, rkey, count, **kwargs)
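
The densest line above is map(list, zip(*map(fkeys, left_keys, right_keys))): applying the factorizer to each key pair yields one (llab, rlab, count) triple per key location, and zip(*...) transposes those triples into three parallel lists. A tiny standalone illustration with a stand-in factorizer:

def fake_factorize(lk, rk):
    # stand-in for _factorize_keys: pretend the labels are the keys themselves
    return list(lk), list(rk), max(len(set(lk)), len(set(rk)))

left_keys = [[0, 1, 1], [5, 5, 6]]
right_keys = [[1, 1, 0], [6, 5, 5]]

llab, rlab, shape = map(list, zip(*map(fake_factorize, left_keys, right_keys)))
print(llab)    # per-location left labels:  [[0, 1, 1], [5, 5, 6]]
print(shape)   # per-location key counts:   [2, 2]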
Example #21
def get_data_famafrench(name):
    # path of zip files
    zip_file_path = '{0}/{1}.zip'.format(_FAMAFRENCH_URL, name)

    with urlopen(zip_file_path) as url:
        raw = url.read()

    with tempfile.TemporaryFile() as tmpf:
        tmpf.write(raw)

        with ZipFile(tmpf, 'r') as zf:
            data = zf.open(zf.namelist()[0]).readlines()

    line_lengths = np.array(lmap(len, data))
    file_edges = np.where(line_lengths == 2)[0]

    datasets = {}
    edges = zip(file_edges + 1, file_edges[1:])
    for i, (left_edge, right_edge) in enumerate(edges):
        dataset = [d.split() for d in data[left_edge:right_edge]]
        if len(dataset) > 10:
            ncol_raw = np.array(lmap(len, dataset))
            ncol = np.median(ncol_raw)
            header_index = np.where(ncol_raw == ncol - 1)[0][-1]
            header = dataset[header_index]
            ds_header = dataset[header_index + 1:]
            # to ensure the header is unique
            header = ['{0} {1}'.format(j, hj) for j, hj in enumerate(header,
                                                                     start=1)]
            index = np.array([d[0] for d in ds_header], dtype=int)
            dataset = np.array([d[1:] for d in ds_header], dtype=float)
            datasets[i] = DataFrame(dataset, index, columns=header)

    return datasets
Example #22
    def __init__(self, values, index, level=-1, value_columns=None):
        if values.ndim == 1:
            values = values[:, np.newaxis]
        self.values = values
        self.value_columns = value_columns

        if value_columns is None and values.shape[1] != 1:  # pragma: no cover
            raise ValueError('must pass column labels for multi-column data')

        self.index = index
        self.level = self.index._get_level_number(level)

        levels = index.levels
        labels = index.labels

        def _make_index(lev, lab):
            i = lev.__class__(_make_index_array_level(lev.values, lab))
            i.name = lev.name
            return i

        self.new_index_levels = [_make_index(lev, lab) for lev, lab in zip(levels, labels)]
        self.new_index_names = list(index.names)

        self.removed_name = self.new_index_names.pop(self.level)
        self.removed_level = self.new_index_levels.pop(self.level)

        self._make_sorted_values_labels()
        self._make_selectors()
Example #23
def test_convert_r_dataframe():

    is_na = robj.baseenv.get("is.na")

    seriesd = _test.getSeriesData()
    frame = pd.DataFrame(seriesd, columns=['D', 'C', 'B', 'A'])

    # Null data
    frame["E"] = [np.nan for item in frame["A"]]
    # Some mixed type data
    frame["F"] = ["text" if item % 2 == 0 else np.nan for item in range(30)]

    r_dataframe = convert_to_r_dataframe(frame)

    assert np.array_equal(convert_robj(r_dataframe.rownames), frame.index)
    assert np.array_equal(convert_robj(r_dataframe.colnames), frame.columns)
    assert all(is_na(item) for item in r_dataframe.rx2("E"))

    for column in frame[["A", "B", "C", "D"]]:
        coldata = r_dataframe.rx2(column)
        original_data = frame[column]
        assert np.array_equal(convert_robj(coldata), original_data)

    for column in frame[["D", "E"]]:
        for original, converted in zip(frame[column],
                                       r_dataframe.rx2(column)):

            if pd.isnull(original):
                assert is_na(converted)
            else:
                assert original == converted
Example #24
    def test_format_timedelta_ticks_wide(self):
        if is_platform_mac():
            pytest.skip("skip on mac for precision display issue on older mpl")

        expected_labels = [
            '00:00:00',
            '1 days 03:46:40',
            '2 days 07:33:20',
            '3 days 11:20:00',
            '4 days 15:06:40',
            '5 days 18:53:20',
            '6 days 22:40:00',
            '8 days 02:26:40',
            ''
        ]

        rng = timedelta_range('0', periods=10, freq='1 d')
        df = DataFrame(np.random.randn(len(rng), 3), rng)
        ax = df.plot(fontsize=2)
        fig = ax.get_figure()
        fig.canvas.draw()
        labels = ax.get_xticklabels()
        self.assertEqual(len(labels), len(expected_labels))
        for l, l_expected in zip(labels, expected_labels):
            self.assertEqual(l.get_text(), l_expected)
Example #25
def _get_multiindex_indexer(join_keys, index, sort):
    from functools import partial

    # bind `sort` argument
    fkeys = partial(_factorize_keys, sort=sort)

    # left & right join labels and num. of levels at each location
    rlab, llab, shape = map(list, zip(* map(fkeys, index.levels, join_keys)))
    if sort:
        rlab = list(map(np.take, rlab, index.labels))
    else:
        i8copy = lambda a: a.astype('i8', subok=False, copy=True)
        rlab = list(map(i8copy, index.labels))

    # fix right labels if there were any nulls
    for i in range(len(join_keys)):
        mask = index.labels[i] == -1
        if mask.any():
            # check if there already was any nulls at this location
            # if there was, it is factorized to `shape[i] - 1`
            a = join_keys[i][llab[i] == shape[i] - 1]
            if a.size == 0 or not a[0] != a[0]:
                shape[i] += 1

            rlab[i][mask] = shape[i] - 1

    # get flat i8 join keys
    lkey, rkey = _get_join_keys(llab, rlab, shape, sort)

    # factorize keys to a dense i8 space
    lkey, rkey, count = fkeys(lkey, rkey)

    return _algos.left_outer_join(lkey, rkey, count, sort=sort)
Example #26
    def _get_new_axes(self):
        ndim = self._get_result_dim()
        new_axes = [None] * ndim

        if self.join_axes is None:
            for i in range(ndim):
                if i == self.axis:
                    continue
                new_axes[i] = self._get_comb_axis(i)
        else:
            if len(self.join_axes) != ndim - 1:
                raise AssertionError("length of join_axes must not be " "equal to {0}".format(ndim - 1))

            # ufff...
            indices = lrange(ndim)
            indices.remove(self.axis)

            for i, ax in zip(indices, self.join_axes):
                new_axes[i] = ax

        if self.ignore_index:
            concat_axis = None
        else:
            concat_axis = self._get_concat_axis()

        new_axes[self.axis] = concat_axis

        return new_axes
Example #27
    def _get_expiry_dates_and_links(self):
        """
        Gets available expiry dates.

        Returns
        -------
        Tuple of:
        List of datetime.date objects
        Dict of datetime.date objects as keys and corresponding links
        """

        url = self._OPTIONS_BASE_URL.format(sym=self.symbol)
        root = self._parse_url(url)

        try:
            links = root.xpath('//*[@id="options_menu"]/form/select/option')
        except IndexError:
            raise RemoteDataError('Expiry dates not available')

        expiry_dates = [dt.datetime.strptime(element.text, "%B %d, %Y").date() for element in links]
        links = [element.attrib['data-selectbox-link'] for element in links]
        expiry_links = dict(zip(expiry_dates, links))
        self._expiry_links = expiry_links
        self._expiry_dates = expiry_dates

        return expiry_dates, expiry_links
Example #28
    def test_time_musec(self):
        t = datetime(1, 1, 1, 3, 30, 0)
        deltas = np.random.randint(1, 20, 3).cumsum()
        ts = np.array([(t + timedelta(microseconds=int(x))).time()
                       for x in deltas])
        df = DataFrame({'a': np.random.randn(len(ts)),
                        'b': np.random.randn(len(ts))},
                       index=ts)
        _, ax = self.plt.subplots()
        ax = df.plot(ax=ax)

        # verify tick labels
        ticks = ax.get_xticks()
        labels = ax.get_xticklabels()
        for t, l in zip(ticks, labels):
            m, s = divmod(int(t), 60)

            # TODO: unused?
            # us = int((t - int(t)) * 1e6)

            h, m = divmod(m, 60)
            xp = l.get_text()
            if len(xp) > 0:
                rs = time(h, m, s).strftime('%H:%M:%S.%f')
                assert xp == rs
Example #29
def get_data_fred(name, start=dt.datetime(2010, 1, 1), end=dt.datetime.today()):
    """
    Get data for the given name from the St. Louis FED (FRED).
    Date format is datetime

    Returns a DataFrame.

    If multiple names are passed for "series" then the index of the
    DataFrame is the outer join of the indices of each series.
    """
    start, end = _sanitize_dates(start, end)

    if not is_list_like(name):
        names = [name]
    else:
        names = name

    urls = [_FRED_URL + "%s" % n + "/downloaddata/%s" % n + ".csv" for n in names]

    def fetch_data(url, name):
        with urlopen(url) as resp:
            data = read_csv(
                resp, index_col=0, parse_dates=True, header=None, skiprows=1, names=["DATE", name], na_values="."
            )
        try:
            return data.truncate(start, end)
        except KeyError:
            if data.ix[3].name[7:12] == "Error":
                raise IOError("Failed to get the data. Check that {0!r} is " "a valid FRED series.".format(name))
            raise

    df = concat([fetch_data(url, n) for url, n in zip(urls, names)], axis=1, join="outer")
    return df
Example #30
    def _save_chunk(self, start_i, end_i):

        data_index = self.data_index

        # create the data for a chunk
        slicer = slice(start_i, end_i)
        for i in range(len(self.blocks)):
            b = self.blocks[i]
            d = b.to_native_types(slicer=slicer, na_rep=self.na_rep,
                                  float_format=self.float_format,
                                  decimal=self.decimal,
                                  date_format=self.date_format,
                                  quoting=self.quoting)

            for col_loc, col in zip(b.mgr_locs, d):
                # self.data is a preallocated list
                self.data[col_loc] = col

        ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep,
                                        float_format=self.float_format,
                                        decimal=self.decimal,
                                        date_format=self.date_format,
                                        quoting=self.quoting)

        libwriters.write_csv_rows(self.data, ix, self.nlevels,
                                  self.cols, self.writer)
Example #31
    def _get_merge_keys(self):
        """
        Note: has side effects (copy/delete key columns)

        Parameters
        ----------
        left
        right
        on

        Returns
        -------
        left_keys, right_keys
        """
        self._validate_specification()

        left_keys = []
        right_keys = []
        join_names = []
        right_drop = []
        left_drop = []
        left, right = self.left, self.right

        is_lkey = lambda x: isinstance(
            x, (np.ndarray, ABCSeries)) and len(x) == len(left)
        is_rkey = lambda x: isinstance(
            x, (np.ndarray, ABCSeries)) and len(x) == len(right)

        # ugh, spaghetti re #733
        if _any(self.left_on) and _any(self.right_on):
            for lk, rk in zip(self.left_on, self.right_on):
                if is_lkey(lk):
                    left_keys.append(lk)
                    if is_rkey(rk):
                        right_keys.append(rk)
                        join_names.append(None)  # what to do?
                    else:
                        right_keys.append(right[rk]._values)
                        join_names.append(rk)
                else:
                    if not is_rkey(rk):
                        right_keys.append(right[rk]._values)
                        if lk == rk:
                            # avoid key upcast in corner case (length-0)
                            if len(left) > 0:
                                right_drop.append(rk)
                            else:
                                left_drop.append(lk)
                    else:
                        right_keys.append(rk)
                    left_keys.append(left[lk]._values)
                    join_names.append(lk)
        elif _any(self.left_on):
            for k in self.left_on:
                if is_lkey(k):
                    left_keys.append(k)
                    join_names.append(None)
                else:
                    left_keys.append(left[k]._values)
                    join_names.append(k)
            if isinstance(self.right.index, MultiIndex):
                right_keys = [lev._values.take(lab)
                              for lev, lab in zip(self.right.index.levels,
                                                  self.right.index.labels)]
            else:
                right_keys = [self.right.index.values]
        elif _any(self.right_on):
            for k in self.right_on:
                if is_rkey(k):
                    right_keys.append(k)
                    join_names.append(None)
                else:
                    right_keys.append(right[k]._values)
                    join_names.append(k)
            if isinstance(self.left.index, MultiIndex):
                left_keys = [lev._values.take(lab)
                             for lev, lab in zip(self.left.index.levels,
                                                 self.left.index.labels)]
            else:
                left_keys = [self.left.index.values]

        if left_drop:
            self.left = self.left.drop(left_drop, axis=1)

        if right_drop:
            self.right = self.right.drop(right_drop, axis=1)

        return left_keys, right_keys, join_names
Example #32
    def __init__(self, objs, axis=0, join='outer', join_axes=None,
                 keys=None, levels=None, names=None,
                 ignore_index=False, verify_integrity=False, copy=True):
        if isinstance(objs, (NDFrame, compat.string_types)):
            raise TypeError('first argument must be an iterable of pandas '
                            'objects, you passed an object of type '
                            '"{0}"'.format(type(objs).__name__))

        if join == 'outer':
            self.intersect = False
        elif join == 'inner':
            self.intersect = True
        else:  # pragma: no cover
            raise ValueError('Only can inner (intersect) or outer (union) '
                             'join the other axis')

        if isinstance(objs, dict):
            if keys is None:
                keys = sorted(objs)
            objs = [objs[k] for k in keys]
        else:
            objs = list(objs)

        if len(objs) == 0:
            raise ValueError('No objects to concatenate')

        if keys is None:
            objs = [obj for obj in objs if obj is not None]
        else:
            # #1649
            clean_keys = []
            clean_objs = []
            for k, v in zip(keys, objs):
                if v is None:
                    continue
                clean_keys.append(k)
                clean_objs.append(v)
            objs = clean_objs
            keys = clean_keys

        if len(objs) == 0:
            raise ValueError('All objects passed were None')

        # consolidate data & figure out what our result ndim is going to be
        ndims = set()
        for obj in objs:
            if not isinstance(obj, NDFrame):
                raise TypeError("cannot concatenate a non-NDFrame object")

            # consolidate
            obj.consolidate(inplace=True)
            ndims.add(obj.ndim)

        # get the sample
        # want the highest ndim that we have, and it must be non-empty
        # unless all objs are empty
        sample = None
        if len(ndims) > 1:
            max_ndim = max(ndims)
            for obj in objs:
                if obj.ndim == max_ndim and np.sum(obj.shape):
                    sample = obj
                    break

        else:
            # filter out the empties if we do not have multi-index possibilities
            # note: keep empty Series as they affect the result columns / name
            non_empties = [obj for obj in objs
                           if sum(obj.shape) > 0 or isinstance(obj, Series)]

            if (len(non_empties) and (keys is None and names is None and
                                      levels is None and join_axes is None)):
                objs = non_empties
                sample = objs[0]

        if sample is None:
            sample = objs[0]
        self.objs = objs

        # Need to flip BlockManager axis in the DataFrame special case
        self._is_frame = isinstance(sample, DataFrame)
        if self._is_frame:
            axis = 1 if axis == 0 else 0

        self._is_series = isinstance(sample, ABCSeries)
        if not 0 <= axis <= sample.ndim:
            raise AssertionError("axis must be between 0 and {0}, "
                                 "input was {1}".format(sample.ndim, axis))

        # if we have mixed ndims, then convert to highest ndim
        # creating column numbers as needed
        if len(ndims) > 1:
            current_column = 0
            max_ndim = sample.ndim
            self.objs, objs = [], self.objs
            for obj in objs:

                ndim = obj.ndim
                if ndim == max_ndim:
                    pass

                elif ndim != max_ndim - 1:
                    raise ValueError("cannot concatenate unaligned mixed "
                                     "dimensional NDFrame objects")

                else:
                    name = getattr(obj, 'name', None)
                    if ignore_index or name is None:
                        name = current_column
                        current_column += 1

                    # doing a row-wise concatenation so need everything
                    # to line up
                    if self._is_frame and axis == 1:
                        name = 0
                    obj = sample._constructor({name: obj})

                self.objs.append(obj)

        # note: this is the BlockManager axis (since DataFrame is transposed)
        self.axis = axis
        self.join_axes = join_axes
        self.keys = keys
        self.names = names
        self.levels = levels

        self.ignore_index = ignore_index
        self.verify_integrity = verify_integrity
        self.copy = copy

        self.new_axes = self._get_new_axes()
Example #33
    def groupings(self):
        from pandas.core.groupby.grouper import Grouping
        return [
            Grouping(lvl, lvl, in_axis=False, level=None, name=name)
            for lvl, name in zip(self.levels, self.names)
        ]
Example #34
    def _get_colors_mapped(self, series, colors):
        unique = series.unique()
        # unique and colors can differ in length
        # depending on the slice value
        mapped = dict(zip(unique, colors))
        return [mapped[v] for v in series.values]
Example #35
    def _maybe_add_join_keys(self, result, left_indexer, right_indexer):

        left_has_missing = None
        right_has_missing = None

        keys = zip(self.join_names, self.left_on, self.right_on)
        for i, (name, lname, rname) in enumerate(keys):
            if not _should_fill(lname, rname):
                continue

            take_left, take_right = None, None

            if name in result:

                if left_indexer is not None and right_indexer is not None:
                    if name in self.left:

                        if left_has_missing is None:
                            left_has_missing = any(left_indexer == -1)

                        if left_has_missing:
                            take_right = self.right_join_keys[i]

                            if not com.is_dtype_equal(result[name].dtype,
                                                      self.left[name].dtype):
                                take_left = self.left[name]._values

                    elif name in self.right:

                        if right_has_missing is None:
                            right_has_missing = any(right_indexer == -1)

                        if right_has_missing:
                            take_left = self.left_join_keys[i]

                            if not com.is_dtype_equal(result[name].dtype,
                                                      self.right[name].dtype):
                                take_right = self.right[name]._values

            elif left_indexer is not None \
                    and isinstance(self.left_join_keys[i], np.ndarray):

                take_left = self.left_join_keys[i]
                take_right = self.right_join_keys[i]

            if take_left is not None or take_right is not None:

                if take_left is None:
                    lvals = result[name]._values
                else:
                    lfill = na_value_for_dtype(take_left.dtype)
                    lvals = algos.take_1d(take_left, left_indexer,
                                          fill_value=lfill)

                if take_right is None:
                    rvals = result[name]._values
                else:
                    rfill = na_value_for_dtype(take_right.dtype)
                    rvals = algos.take_1d(take_right, right_indexer,
                                          fill_value=rfill)

                # if we have an all missing left_indexer
                # make sure to just use the right values
                mask = left_indexer == -1
                if mask.all():
                    key_col = rvals
                else:
                    key_col = Index(lvals).where(~mask, rvals)

                if name in result:
                    result[name] = key_col
                else:
                    result.insert(i, name or 'key_%d' % i, key_col)
Example #36
    def _check_colors(self,
                      collections,
                      linecolors=None,
                      facecolors=None,
                      mapping=None):
        """
        Check that each artist has the expected line colors and face colors

        Parameters
        ----------
        collections : list-like
            list or collection of target artists
        linecolors : list-like which has the same length as collections
            list of expected line colors
        facecolors : list-like which has the same length as collections
            list of expected face colors
        mapping : Series
            Series used for color grouping key
            used for the andrews_curves, parallel_coordinates and radviz tests
        """

        from matplotlib.lines import Line2D
        from matplotlib.collections import (Collection, PolyCollection,
                                            LineCollection)
        conv = self.colorconverter
        if linecolors is not None:

            if mapping is not None:
                linecolors = self._get_colors_mapped(mapping, linecolors)
                linecolors = linecolors[:len(collections)]

            self.assertEqual(len(collections), len(linecolors))
            for patch, color in zip(collections, linecolors):
                if isinstance(patch, Line2D):
                    result = patch.get_color()
                    # Line2D may contain a string color expression
                    result = conv.to_rgba(result)
                elif isinstance(patch, (PolyCollection, LineCollection)):
                    result = tuple(patch.get_edgecolor()[0])
                else:
                    result = patch.get_edgecolor()

                expected = conv.to_rgba(color)
                self.assertEqual(result, expected)

        if facecolors is not None:

            if mapping is not None:
                facecolors = self._get_colors_mapped(mapping, facecolors)
                facecolors = facecolors[:len(collections)]

            self.assertEqual(len(collections), len(facecolors))
            for patch, color in zip(collections, facecolors):
                if isinstance(patch, Collection):
                    # returned as list of np.array
                    result = patch.get_facecolor()[0]
                else:
                    result = patch.get_facecolor()

                if isinstance(result, np.ndarray):
                    result = tuple(result)

                expected = conv.to_rgba(color)
                self.assertEqual(result, expected)
Example #37
def _make_concat_multiindex(indexes, keys, levels=None, names=None):

    if ((levels is None and isinstance(keys[0], tuple)) or
            (levels is not None and len(levels) > 1)):
        zipped = lzip(*keys)
        if names is None:
            names = [None] * len(zipped)

        if levels is None:
            levels = [Categorical.from_array(
                zp, ordered=True).categories for zp in zipped]
        else:
            levels = [_ensure_index(x) for x in levels]
    else:
        zipped = [keys]
        if names is None:
            names = [None]

        if levels is None:
            levels = [_ensure_index(keys)]
        else:
            levels = [_ensure_index(x) for x in levels]

    if not _all_indexes_same(indexes):
        label_list = []

        # things are potentially different sizes, so compute the exact labels
        # for each level and pass those to MultiIndex.from_arrays

        for hlevel, level in zip(zipped, levels):
            to_concat = []
            for key, index in zip(hlevel, indexes):
                try:
                    i = level.get_loc(key)
                except KeyError:
                    raise ValueError('Key %s not in level %s'
                                     % (str(key), str(level)))

                to_concat.append(np.repeat(i, len(index)))
            label_list.append(np.concatenate(to_concat))

        concat_index = _concat_indexes(indexes)

        # these go at the end
        if isinstance(concat_index, MultiIndex):
            levels.extend(concat_index.levels)
            label_list.extend(concat_index.labels)
        else:
            factor = Categorical.from_array(concat_index, ordered=True)
            levels.append(factor.categories)
            label_list.append(factor.codes)

        if len(names) == len(levels):
            names = list(names)
        else:
            # make sure that all of the passed indices have the same nlevels
            if not len(set([idx.nlevels for idx in indexes])) == 1:
                raise AssertionError("Cannot concat indices that do"
                                     " not have the same number of levels")

            # also copies
            names = names + _get_consensus_names(indexes)

        return MultiIndex(levels=levels, labels=label_list, names=names,
                          verify_integrity=False)

    new_index = indexes[0]
    n = len(new_index)
    kpieces = len(indexes)

    # also copies
    new_names = list(names)
    new_levels = list(levels)

    # construct labels
    new_labels = []

    # do something a bit more speedy

    for hlevel, level in zip(zipped, levels):
        hlevel = _ensure_index(hlevel)
        mapped = level.get_indexer(hlevel)

        mask = mapped == -1
        if mask.any():
            raise ValueError('Values not found in passed level: %s'
                             % str(hlevel[mask]))

        new_labels.append(np.repeat(mapped, n))

    if isinstance(new_index, MultiIndex):
        new_levels.extend(new_index.levels)
        new_labels.extend([np.tile(lab, kpieces) for lab in new_index.labels])
    else:
        new_levels.append(new_index)
        new_labels.append(np.tile(np.arange(n), kpieces))

    if len(new_names) < len(new_levels):
        new_names.extend(new_index.names)

    return MultiIndex(levels=new_levels, labels=new_labels, names=new_names,
                      verify_integrity=False)
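For orientation, this helper backs the MultiIndex that the public concat builds when `keys` are passed; a small self-contained sketch (the frames and keys are made up):

import pandas as pd

df1 = pd.DataFrame({'x': [1, 2]})
df2 = pd.DataFrame({'x': [3, 4]})

# the keys become the outer level of the resulting MultiIndex,
# the original row indexes become the inner level
out = pd.concat([df1, df2], keys=['first', 'second'])
print(out.index.tolist())
# [('first', 0), ('first', 1), ('second', 0), ('second', 1)]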
Example No. 38
def _stack_multi_columns(frame, level_num=-1, dropna=True):
    def _convert_level_number(level_num, columns):
        """
        Logic for converting the level number to something we can safely pass
        to swaplevel:

        We generally want to convert the level number into a level name, except
        when columns do not have names, in which case we must leave as a level
        number
        """
        if level_num in columns.names:
            return columns.names[level_num]
        else:
            if columns.names[level_num] is None:
                return level_num
            else:
                return columns.names[level_num]

    this = frame.copy()

    # this makes life much simpler
    if level_num != frame.columns.nlevels - 1:
        # roll levels to put selected level at end
        roll_columns = this.columns
        for i in range(level_num, frame.columns.nlevels - 1):
            # Need to check if the ints conflict with level names
            lev1 = _convert_level_number(i, roll_columns)
            lev2 = _convert_level_number(i + 1, roll_columns)
            roll_columns = roll_columns.swaplevel(lev1, lev2)
        this.columns = roll_columns

    if not this.columns.is_lexsorted():
        # Workaround the edge case where 0 is one of the column names,
        # which interferes with trying to sort based on the first
        # level
        level_to_sort = _convert_level_number(0, this.columns)
        this = this.sort_index(level=level_to_sort, axis=1)

    # tuple list excluding level for grouping columns
    if len(frame.columns.levels) > 2:
        tuples = list(
            zip(*[
                lev.take(lab) for lev, lab in zip(this.columns.levels[:-1],
                                                  this.columns.labels[:-1])
            ]))
        unique_groups = [key for key, _ in itertools.groupby(tuples)]
        new_names = this.columns.names[:-1]
        new_columns = MultiIndex.from_tuples(unique_groups, names=new_names)
    else:
        new_columns = unique_groups = this.columns.levels[0]

    # time to ravel the values
    new_data = {}
    level_vals = this.columns.levels[-1]
    level_labels = sorted(set(this.columns.labels[-1]))
    level_vals_used = level_vals[level_labels]
    levsize = len(level_labels)
    drop_cols = []
    for key in unique_groups:
        loc = this.columns.get_loc(key)

        # can make more efficient?
        # we almost always return a slice
        # but if unsorted can get a boolean
        # indexer
        if not isinstance(loc, slice):
            slice_len = len(loc)
        else:
            slice_len = loc.stop - loc.start

        if slice_len == 0:
            drop_cols.append(key)
            continue
        elif slice_len != levsize:
            chunk = this.loc[:, this.columns[loc]]
            chunk.columns = level_vals.take(chunk.columns.labels[-1])
            value_slice = chunk.reindex(columns=level_vals_used).values
        else:
            if frame._is_mixed_type:
                value_slice = this.loc[:, this.columns[loc]].values
            else:
                value_slice = this.values[:, loc]

        new_data[key] = value_slice.ravel()

    if len(drop_cols) > 0:
        new_columns = new_columns.difference(drop_cols)

    N = len(this)

    if isinstance(this.index, MultiIndex):
        new_levels = list(this.index.levels)
        new_names = list(this.index.names)
        new_labels = [lab.repeat(levsize) for lab in this.index.labels]
    else:
        new_levels = [this.index]
        new_labels = [np.arange(N).repeat(levsize)]
        new_names = [this.index.name]  # something better?

    new_levels.append(level_vals)
    new_labels.append(np.tile(level_labels, N))
    new_names.append(frame.columns.names[level_num])

    new_index = MultiIndex(levels=new_levels,
                           labels=new_labels,
                           names=new_names,
                           verify_integrity=False)

    result = DataFrame(new_data, index=new_index, columns=new_columns)

    # more efficient way to go about this? can do the whole masking biz but
    # will only save a small amount of time...
    if dropna:
        result = result.dropna(axis=0, how='all')

    return result
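A short sketch of the behaviour this helper backs, seen through the public API: stacking the innermost level of MultiIndex columns into the row index (the data are made up):

import numpy as np
import pandas as pd

columns = pd.MultiIndex.from_product([['A', 'B'], ['x', 'y']])
df = pd.DataFrame(np.arange(8).reshape(2, 4), columns=columns)

# the last column level ('x'/'y') moves into the row index;
# the remaining level ('A'/'B') stays as the columns
stacked = df.stack()
print(stacked)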
Example No. 39
 def create_data(constructor):
     return dict(zip((constructor(x) for x in dates_as_str), values))
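The snippet above assumes `dates_as_str` and `values` defined elsewhere; a self-contained sketch with made-up data, keyed by whatever constructor is passed in:

import pandas as pd

dates_as_str = ['2011-01-01', '2011-01-02']
values = [1.0, 2.0]

def create_data(constructor):
    return dict(zip((constructor(x) for x in dates_as_str), values))

by_timestamp = create_data(pd.Timestamp)   # keys are Timestamps
by_string = create_data(str)               # keys stay as plain strings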
Example No. 40
def _get_dummies_1d(data,
                    prefix,
                    prefix_sep='_',
                    dummy_na=False,
                    sparse=False,
                    drop_first=False,
                    dtype=None):
    # Series avoids inconsistent NaN handling
    codes, levels = _factorize_from_iterable(Series(data))

    if dtype is None:
        dtype = np.uint8
    dtype = np.dtype(dtype)

    if is_object_dtype(dtype):
        raise ValueError("dtype=object is not a valid dtype for get_dummies")

    def get_empty_Frame(data, sparse):
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        if not sparse:
            return DataFrame(index=index)
        else:
            return SparseDataFrame(index=index, default_fill_value=0)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_Frame(data, sparse)

    codes = codes.copy()
    if dummy_na:
        codes[codes == -1] = len(levels)
        levels = np.append(levels, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_Frame(data, sparse)

    number_of_cols = len(levels)

    if prefix is not None:
        dummy_strs = [
            u'{prefix}{sep}{level}'
            if isinstance(v, text_type) else '{prefix}{sep}{level}'
            for v in levels
        ]
        dummy_cols = [
            dummy_str.format(prefix=prefix, sep=prefix_sep, level=v)
            for dummy_str, v in zip(dummy_strs, levels)
        ]
    else:
        dummy_cols = levels

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:
        sparse_series = {}
        N = len(data)
        sp_indices = [[] for _ in range(len(dummy_cols))]
        for ndx, code in enumerate(codes):
            if code == -1:
                # Blank entries if not dummy_na and code == -1, #GH4446
                continue
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(np.ones(len(ixs), dtype=dtype),
                               sparse_index=IntIndex(N, ixs),
                               fill_value=0,
                               dtype=dtype)
            sparse_series[col] = SparseSeries(data=sarr, index=index)

        out = SparseDataFrame(sparse_series,
                              index=index,
                              columns=dummy_cols,
                              default_fill_value=0,
                              dtype=dtype)
        return out

    else:
        dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0)

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols)
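The dense branch above boils down to factorizing the values and row-selecting from an identity matrix; a stripped-down sketch of that idea using only public functions (not the internals themselves):

import numpy as np
import pandas as pd

values = pd.Series(['a', 'b', 'a', 'c'])
codes, levels = pd.factorize(values)

# one identity row per level, picked out by the integer codes;
# the real code additionally zeroes rows where code == -1 (NaN)
dummy_mat = np.eye(len(levels), dtype=np.uint8).take(codes, axis=0)
dummies = pd.DataFrame(dummy_mat, columns=levels)
print(dummies)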
Example No. 41
def get_dummies(data,
                prefix=None,
                prefix_sep='_',
                dummy_na=False,
                columns=None,
                sparse=False,
                drop_first=False):
    """
    Convert categorical variable into dummy/indicator variables

    Parameters
    ----------
    data : array-like, Series, or DataFrame
    prefix : string, list of strings, or dict of strings, default None
        String to append to DataFrame column names.
        Pass a list with length equal to the number of columns
        when calling get_dummies on a DataFrame. Alternatively, `prefix`
        can be a dictionary mapping column names to prefixes.
    prefix_sep : string, default '_'
        If appending prefix, separator/delimiter to use. Or pass a
        list or dictionary as with `prefix`.
    dummy_na : bool, default False
        Add a column to indicate NaNs; if False, NaNs are ignored.
    columns : list-like, default None
        Column names in the DataFrame to be encoded.
        If `columns` is None then all the columns with
        `object` or `category` dtype will be converted.
    sparse : bool, default False
        Whether the dummy columns should be sparse or not.  Returns
        SparseDataFrame if `data` is a Series or if all columns are included.
        Otherwise returns a DataFrame with some SparseBlocks.
    drop_first : bool, default False
        Whether to get k-1 dummies out of k categorical levels by removing the
        first level.

        .. versionadded:: 0.18.0

    Returns
    -------
    dummies : DataFrame or SparseDataFrame

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> s = pd.Series(list('abca'))

    >>> pd.get_dummies(s)
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0

    >>> s1 = ['a', 'b', np.nan]

    >>> pd.get_dummies(s1)
       a  b
    0  1  0
    1  0  1
    2  0  0

    >>> pd.get_dummies(s1, dummy_na=True)
       a  b  NaN
    0  1  0    0
    1  0  1    0
    2  0  0    1

    >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
    ...                    'C': [1, 2, 3]})

    >>> pd.get_dummies(df, prefix=['col1', 'col2'])
       C  col1_a  col1_b  col2_a  col2_b  col2_c
    0  1       1       0       0       1       0
    1  2       0       1       1       0       0
    2  3       1       0       0       0       1

    >>> pd.get_dummies(pd.Series(list('abcaa')))
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0
    4  1  0  0

    >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
       b  c
    0  0  0
    1  1  0
    2  0  1
    3  0  0
    4  0  0

    See Also
    --------
    Series.str.get_dummies
    """
    from pandas.core.reshape.concat import concat
    from itertools import cycle

    if isinstance(data, DataFrame):
        # determine columns being encoded

        if columns is None:
            columns_to_encode = data.select_dtypes(
                include=['object', 'category']).columns
        else:
            columns_to_encode = columns

        # validate prefixes and separator to avoid silently dropping cols
        def check_len(item, name):
            len_msg = ("Length of '{name}' ({len_item}) did not match the "
                       "length of the columns being encoded ({len_enc}).")

            if is_list_like(item):
                if not len(item) == len(columns_to_encode):
                    len_msg = len_msg.format(name=name,
                                             len_item=len(item),
                                             len_enc=len(columns_to_encode))
                    raise ValueError(len_msg)

        check_len(prefix, 'prefix')
        check_len(prefix_sep, 'prefix_sep')
        if isinstance(prefix, compat.string_types):
            prefix = cycle([prefix])
        if isinstance(prefix, dict):
            prefix = [prefix[col] for col in columns_to_encode]

        if prefix is None:
            prefix = columns_to_encode

        # validate separators
        if isinstance(prefix_sep, compat.string_types):
            prefix_sep = cycle([prefix_sep])
        elif isinstance(prefix_sep, dict):
            prefix_sep = [prefix_sep[col] for col in columns_to_encode]

        if set(columns_to_encode) == set(data.columns):
            with_dummies = []
        else:
            with_dummies = [data.drop(columns_to_encode, axis=1)]

        for (col, pre, sep) in zip(columns_to_encode, prefix, prefix_sep):

            dummy = _get_dummies_1d(data[col],
                                    prefix=pre,
                                    prefix_sep=sep,
                                    dummy_na=dummy_na,
                                    sparse=sparse,
                                    drop_first=drop_first)
            with_dummies.append(dummy)
        result = concat(with_dummies, axis=1)
    else:
        result = _get_dummies_1d(data,
                                 prefix,
                                 prefix_sep,
                                 dummy_na,
                                 sparse=sparse,
                                 drop_first=drop_first)
    return result
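The docstring above mentions but does not demonstrate the dict form of `prefix`; a brief sketch (column names and prefixes are invented):

import pandas as pd

df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c']})

# per-column prefixes via a dict keyed by the encoded column names
out = pd.get_dummies(df, prefix={'A': 'first', 'B': 'second'}, prefix_sep='.')
print(out.columns.tolist())
# ['first.a', 'first.b', 'second.a', 'second.b', 'second.c']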