def test_replace_str_to_str_chain(self):
    a = np.arange(1, 5)
    astr = a.astype(str)
    bstr = np.arange(2, 6).astype(str)
    df = DataFrame({'a': astr})
    with tm.assertRaisesRegexp(ValueError, "Replacement not allowed .+"):
        df.replace({'a': dict(zip(astr, bstr))})
def test_time(self):
    import matplotlib.pyplot as plt
    plt.close("all")

    t = datetime(1, 1, 1, 3, 30, 0)
    deltas = np.random.randint(1, 20, 3).cumsum()
    ts = np.array([(t + timedelta(minutes=int(x))).time() for x in deltas])
    df = DataFrame({"a": np.random.randn(len(ts)),
                    "b": np.random.randn(len(ts))}, index=ts)
    ax = df.plot()

    # verify tick labels
    ticks = ax.get_xticks()
    labels = ax.get_xticklabels()
    for t, l in zip(ticks, labels):
        m, s = divmod(int(t), 60)
        h, m = divmod(m, 60)
        xp = l.get_text()
        if len(xp) > 0:
            rs = time(h, m, s).strftime("%H:%M:%S")
            # assert_ only checks truthiness of its first argument;
            # assertEqual actually compares the label to the expectation
            self.assertEqual(xp, rs)

    # change xlim
    ax.set_xlim("1:30", "5:00")

    # check tick labels again
    ticks = ax.get_xticks()
    labels = ax.get_xticklabels()
    for t, l in zip(ticks, labels):
        m, s = divmod(int(t), 60)
        h, m = divmod(m, 60)
        xp = l.get_text()
        if len(xp) > 0:
            rs = time(h, m, s).strftime("%H:%M:%S")
            self.assertEqual(xp, rs)
def _check_colors(self, collections, linecolors=None, facecolors=None, mapping=None): """ Check each artist has expected line colors and face colors Parameters ---------- collections : list-like list or collection of target artist linecolors : list-like which has the same length as collections list of expected line colors facecolors : list-like which has the same length as collections list of expected face colors mapping : Series Series used for color grouping key used for andrew_curves, parallel_coordinates, radviz test """ from matplotlib.lines import Line2D from matplotlib.collections import Collection, PolyCollection conv = self.colorconverter if linecolors is not None: if mapping is not None: linecolors = self._get_colors_mapped(mapping, linecolors) linecolors = linecolors[:len(collections)] self.assertEqual(len(collections), len(linecolors)) for patch, color in zip(collections, linecolors): if isinstance(patch, Line2D): result = patch.get_color() # Line2D may contains string color expression result = conv.to_rgba(result) elif isinstance(patch, PolyCollection): result = tuple(patch.get_edgecolor()[0]) else: result = patch.get_edgecolor() expected = conv.to_rgba(color) self.assertEqual(result, expected) if facecolors is not None: if mapping is not None: facecolors = self._get_colors_mapped(mapping, facecolors) facecolors = facecolors[:len(collections)] self.assertEqual(len(collections), len(facecolors)) for patch, color in zip(collections, facecolors): if isinstance(patch, Collection): # returned as list of np.array result = patch.get_facecolor()[0] else: result = patch.get_facecolor() if isinstance(result, np.ndarray): result = tuple(result) expected = conv.to_rgba(color) self.assertEqual(result, expected)
def create_data(): """ create the pickle/msgpack data """ data = { 'A': [0., 1., 2., 3., np.nan], 'B': [0, 1, 0, 1, 0], 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], 'D': date_range('1/1/2009', periods=5), 'E': [0., 1, Timestamp('20100101'), 'foo', 2.] } index = dict(int=Index(np.arange(10)), date=date_range('20130101', periods=10), period=period_range('2013-01-01', freq='M', periods=10)) mi = dict(reg2=MultiIndex.from_tuples(tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']])), names=['first', 'second'])) series = dict(float=Series(data['A']), int=Series(data['B']), mixed=Series(data['E']), ts=TimeSeries(np.arange(10).astype(np.int64), index=date_range('20130101',periods=10)), mi=Series(np.arange(5).astype(np.float64), index=MultiIndex.from_tuples(tuple(zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])), names=['one', 'two'])), dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A']), cat=Series(Categorical(['foo', 'bar', 'baz'])), per=Series([Period('2000Q1')] * 5)) mixed_dup_df = DataFrame(data) mixed_dup_df.columns = list("ABCDA") frame = dict(float=DataFrame(dict(A=series['float'], B=series['float'] + 1)), int=DataFrame(dict(A=series['int'], B=series['int'] + 1)), mixed=DataFrame(dict([(k, data[k]) for k in ['A', 'B', 'C', 'D']])), mi=DataFrame(dict(A=np.arange(5).astype(np.float64), B=np.arange(5).astype(np.int64)), index=MultiIndex.from_tuples(tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'baz'], ['one', 'two', 'one', 'two', 'three']])), names=['first', 'second'])), dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64), columns=['A', 'B', 'A']), cat_onecol=DataFrame(dict(A=Categorical(['foo', 'bar']))), cat_and_float=DataFrame(dict(A=Categorical(['foo', 'bar', 'baz']), B=np.arange(3).astype(np.int64))), mixed_dup=mixed_dup_df) mixed_dup_panel = Panel(dict(ItemA=frame['float'], ItemB=frame['int'])) mixed_dup_panel.items = ['ItemA', 'ItemA'] panel = dict(float=Panel(dict(ItemA=frame['float'], ItemB=frame['float'] + 1)), dup=Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64), items=['A', 'B', 'A']), mixed_dup=mixed_dup_panel) return dict(series=series, frame=frame, panel=panel, index=index, mi=mi, sp_series=dict(float=_create_sp_series(), ts=_create_sp_tsseries()), sp_frame=dict(float=_create_sp_frame()))
def test_time(self):
    t = datetime(1, 1, 1, 3, 30, 0)
    deltas = np.random.randint(1, 20, 3).cumsum()
    ts = np.array([(t + timedelta(minutes=int(x))).time() for x in deltas])
    df = DataFrame({'a': np.random.randn(len(ts)),
                    'b': np.random.randn(len(ts))}, index=ts)
    ax = df.plot()

    # verify tick labels
    ticks = ax.get_xticks()
    labels = ax.get_xticklabels()
    for t, l in zip(ticks, labels):
        m, s = divmod(int(t), 60)
        h, m = divmod(m, 60)
        xp = l.get_text()
        if len(xp) > 0:
            rs = time(h, m, s).strftime('%H:%M:%S')
            self.assertEqual(xp, rs)

    # change xlim
    ax.set_xlim('1:30', '5:00')

    # check tick labels again
    ticks = ax.get_xticks()
    labels = ax.get_xticklabels()
    for t, l in zip(ticks, labels):
        m, s = divmod(int(t), 60)
        h, m = divmod(m, 60)
        xp = l.get_text()
        if len(xp) > 0:
            rs = time(h, m, s).strftime('%H:%M:%S')
            self.assertEqual(xp, rs)
def _range_from_fields(year=None, month=None, quarter=None, day=None,
                       hour=None, minute=None, second=None, freq=None):
    if hour is None:
        hour = 0
    if minute is None:
        minute = 0
    if second is None:
        second = 0
    if day is None:
        day = 1

    ordinals = []

    if quarter is not None:
        if freq is None:
            freq = 'Q'
            base = frequencies.FreqGroup.FR_QTR
        else:
            base, mult = frequencies.get_freq_code(freq)
            if base != frequencies.FreqGroup.FR_QTR:
                raise AssertionError("base must equal FR_QTR")

        year, quarter = _make_field_arrays(year, quarter)
        for y, q in compat.zip(year, quarter):
            y, m = libperiod.quarter_to_myear(y, q, freq)
            val = libperiod.period_ordinal(y, m, 1, 1, 1, 1, 0, 0, base)
            ordinals.append(val)
    else:
        base, mult = frequencies.get_freq_code(freq)
        arrays = _make_field_arrays(year, month, day, hour, minute, second)
        for y, mth, d, h, mn, s in compat.zip(*arrays):
            ordinals.append(libperiod.period_ordinal(
                y, mth, d, h, mn, s, 0, 0, base))

    return np.array(ordinals, dtype=np.int64), freq
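# Hedged illustration (not from the original source): _range_from_fields is a
# private helper, but a direct call with quarterly fields looks roughly like
# this; it returns the period ordinals plus the (possibly inferred) frequency.
ordinals, freq = _range_from_fields(year=[2013] * 4,
                                    quarter=[1, 2, 3, 4],
                                    freq='Q')
# ordinals is an int64 ndarray with one ordinal per (year, quarter) pair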
def create_data(): """ create the pickle/msgpack data """ data = { 'A': [0., 1., 2., 3., np.nan], 'B': [0, 1, 0, 1, 0], 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], 'D': date_range('1/1/2009', periods=5), 'E': [0., 1, Timestamp('20100101'), 'foo', 2.] } scalars = dict(timestamp=Timestamp('20130101')) if LooseVersion(pandas.__version__) >= '0.17.0': scalars['period'] = Period('2012','M') index = dict(int=Index(np.arange(10)), date=date_range('20130101', periods=10), period=period_range('2013-01-01', freq='M', periods=10)) mi = dict(reg2=MultiIndex.from_tuples(tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']])), names=['first', 'second'])) series = dict(float=Series(data['A']), int=Series(data['B']), mixed=Series(data['E']), ts=Series(np.arange(10).astype(np.int64), index=date_range('20130101',periods=10)), mi=Series(np.arange(5).astype(np.float64), index=MultiIndex.from_tuples(tuple(zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])), names=['one', 'two'])), dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A']), <<<<<<< HEAD <<<<<<< HEAD cat=Series(Categorical(['foo', 'bar', 'baz']))) if LooseVersion(pandas.__version__) >= '0.17.0': series['period'] = Series([Period('2000Q1')] * 5)
def df2dbiter(dataframe, table_class, harmonize_cols_first=True,
              harmonize_rows_first=True, parse_dates=None):
    """
    Returns a generator of ORM model instances (reflecting the rows of the
    database table mapped by table_class) from the given dataframe. The
    returned generator can be used in loops and each element can be e.g.
    added to the database by means of the sqlAlchemy `session.add` method.

    NOTE: Only the dataframe columns whose name match the table_class columns
    will be used. Therefore, it is safe to append to dataframe any column
    whose name is not in table_class columns. Some iterations might return
    None according to the parameters (see below).

    :param table_class: the CLASS of the table whose rows are instantiated
        and returned
    :param dataframe: the input dataframe
    :param harmonize_cols_first: if True (default when missing), the dataframe
        column types are harmonized to reflect the table_class column types,
        and columns without a match in table_class are filtered out. See
        `harmonize_columns(dataframe)`. If `harmonize_cols_first` and
        `harmonize_rows_first` are both True, the harmonization is executed in
        that order (first columns, then rows).
    :param harmonize_rows_first: if True (default when missing), NA values are
        checked for those table columns which have the property nullable set
        to False. In this case, the generator **might return None** (the user
        should handle it, e.g. by skipping these model instances which are not
        writable to the db). If `harmonize_cols_first` and
        `harmonize_rows_first` are both True, the harmonization is executed in
        that order (first columns, then rows).
    :param parse_dates: a list of strings denoting additional column names
        whose values should be parsed as dates. Ignored if
        harmonize_cols_first is False.
    """
    if harmonize_cols_first:
        colnames, dataframe = _harmonize_columns(table_class, dataframe,
                                                 parse_dates)
    else:
        colnames = list(shared_colnames(table_class, dataframe))
        # table_col_names = get_col_names(table_class)
        # colnames = [c for c in dataframe.columns if c in table_col_names]

    # FIXME: optimize this?
    new_df = dataframe[colnames]

    if dataframe.empty:
        # `len(dataframe)` is an int and is not iterable; wrap it in range()
        for _ in range(len(dataframe)):
            yield None
        return

    valid_rows = cycle([True])
    if harmonize_rows_first:
        non_nullable_cols = list(shared_colnames(table_class, new_df,
                                                 nullable=False))
        if non_nullable_cols:
            valid_rows = new_df[non_nullable_cols].notnull().all(axis=1).values
            # df = ~pd_isnull(new_df[non_nullable_cols])
            # valid_rows = df.apply(lambda row: row.all(), axis=1).values

    cols, datalist = _insert_data(new_df)
    # Note below: datalist is an array of N columns, each of M rows (it would
    # be nicer to return an array of N rows, each of them representing a table
    # row. But we do not want to touch pandas code. See _insert_table below).
    # Thus we zip it:
    for is_ok, row_values in zip(valid_rows, zip(*datalist)):
        if is_ok:
            # we could make a single line statement, but two lines are more readable:
            row_args_dict = dict(zip(cols, row_values))
            yield table_class(**row_args_dict)
        else:
            yield None
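# Hedged usage sketch (not part of the original module): a minimal SQLAlchemy
# 1.4+ setup showing how the generator is typically consumed. The model, the
# engine and the example DataFrame are all illustrative, not from the source.
import pandas as pd
from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()

class Event(Base):
    __tablename__ = 'events'
    id = Column(Integer, primary_key=True)
    name = Column(String, nullable=False)

engine = create_engine('sqlite://')
Base.metadata.create_all(engine)

df = pd.DataFrame({'id': [1, 2], 'name': ['quake', None], 'extra': ['x', 'y']})

with Session(engine) as session:
    for instance in df2dbiter(df, Event):
        if instance is not None:  # None marks rows with NA in a non-nullable column
            session.add(instance)
    session.commit()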
def create_data(): """ create the pickle data """ import numpy as np import pandas from pandas import (Series,TimeSeries,DataFrame,Panel, SparseSeries,SparseTimeSeries,SparseDataFrame,SparsePanel, Index,MultiIndex,PeriodIndex, date_range,period_range,bdate_range,Timestamp) nan = np.nan data = { 'A': [0., 1., 2., 3., np.nan], 'B': [0, 1, 0, 1, 0], 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], 'D': date_range('1/1/2009', periods=5), 'E' : [0., 1, Timestamp('20100101'),'foo',2.], } index = dict(int = Index(np.arange(10)), date = date_range('20130101',periods=10), period = period_range('2013-01-01', freq='M', periods=10)) mi = dict(reg2 = MultiIndex.from_tuples(tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']])), names=['first', 'second'])) series = dict(float = Series(data['A']), int = Series(data['B']), mixed = Series(data['E']), ts = TimeSeries(np.arange(10).astype(np.int64),index=date_range('20130101',periods=10)), mi = Series(np.arange(5).astype(np.float64),index=MultiIndex.from_tuples(tuple(zip(*[[1,1,2,2,2], [3,4,3,4,5]])), names=['one','two'])), dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A'])) frame = dict(float = DataFrame(dict(A = series['float'], B = series['float'] + 1)), int = DataFrame(dict(A = series['int'] , B = series['int'] + 1)), mixed = DataFrame(dict([ (k,data[k]) for k in ['A','B','C','D']])), mi = DataFrame(dict(A = np.arange(5).astype(np.float64), B = np.arange(5).astype(np.int64)), index=MultiIndex.from_tuples(tuple(zip(*[['bar','bar','baz','baz','baz'], ['one','two','one','two','three']])), names=['first','second'])), dup = DataFrame(np.arange(15).reshape(5, 3).astype(np.float64), columns=['A', 'B', 'A'])) panel = dict(float = Panel(dict(ItemA = frame['float'], ItemB = frame['float']+1)), dup = Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64), items=['A', 'B', 'A'])) return dict( series = series, frame = frame, panel = panel, index = index, mi = mi, sp_series = dict(float = _create_sp_series(), ts = _create_sp_tsseries()), sp_frame = dict(float = _create_sp_frame()) )
def _align_core(terms): term_index = [i for i, term in enumerate(terms) if hasattr(term.value, 'axes')] term_dims = [terms[i].value.ndim for i in term_index] ndims = pd.Series(dict(zip(term_index, term_dims))) # initial axes are the axes of the largest-axis'd term biggest = terms[ndims.idxmax()].value typ = biggest._constructor axes = biggest.axes naxes = len(axes) gt_than_one_axis = naxes > 1 for value in (terms[i].value for i in term_index): is_series = isinstance(value, pd.Series) is_series_and_gt_one_axis = is_series and gt_than_one_axis for axis, items in enumerate(value.axes): if is_series_and_gt_one_axis: ax, itm = naxes - 1, value.index else: ax, itm = axis, items if not axes[ax].is_(itm): axes[ax] = axes[ax].join(itm, how='outer') for i, ndim in compat.iteritems(ndims): for axis, items in zip(range(ndim), axes): ti = terms[i].value if hasattr(ti, 'reindex_axis'): transpose = isinstance(ti, pd.Series) and naxes > 1 reindexer = axes[naxes - 1] if transpose else items term_axis_size = len(ti.axes[axis]) reindexer_size = len(reindexer) ordm = np.log10(abs(reindexer_size - term_axis_size)) if ordm >= 1 and reindexer_size >= 10000: warnings.warn('Alignment difference on axis {0} is larger ' 'than an order of magnitude on term {1!r}, ' 'by more than {2:.4g}; performance may ' 'suffer'.format(axis, terms[i].name, ordm), category=pd.io.common.PerformanceWarning, stacklevel=6) if transpose: f = partial(ti.reindex, index=reindexer, copy=False) else: f = partial(ti.reindex_axis, reindexer, axis=axis, copy=False) terms[i].update(f()) terms[i].update(terms[i].value.values) return typ, _zip_axes_from_type(typ, axes)
def group_mean(lat, lon, data):
    indexer = np.lexsort((lon, lat))
    lat = lat.take(indexer)
    lon = lon.take(indexer)
    sorted_data = data.take(indexer)

    keys = 1000 * lat + lon
    unique_keys = np.unique(keys)

    result = ndi.mean(sorted_data, labels=keys, index=unique_keys)
    decoder = keys.searchsorted(unique_keys)

    return dict(zip(zip(lat.take(decoder), lon.take(decoder)), result))
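# Hedged usage sketch (not from the original source): group_mean relies on
# scipy.ndimage being imported as `ndi` in its module; the coordinate and data
# arrays below are made up for illustration.
import numpy as np

lat = np.array([10, 10, 20, 20])
lon = np.array([1, 1, 2, 3])
data = np.array([1.0, 3.0, 5.0, 7.0])

# one mean per (lat, lon) pair:
# {(10, 1): 2.0, (20, 2): 5.0, (20, 3): 7.0}
print(group_mean(lat, lon, data))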
def sequence_grids(layer_grids):
    """Go through the list of layer grids and apply the same sequencing as
    sequence_layers.

    Parameters:
    -----------
    layer_grids: a list of two dimensional layer grids
    """
    for grid1, grid2 in zip(layer_grids[:-1], layer_grids[1:]):
        for row1, row2 in zip(grid1, grid2):
            for layer1, layer2 in zip(row1, row2):
                if layer2.data is None:
                    layer2.data = layer1.data
                merge_aes(layer1, layer2)
    return layer_grids
def test_constructors(self, closed, name):
    left, right = Index([0, 1, 2, 3]), Index([1, 2, 3, 4])
    ivs = [Interval(l, r, closed=closed) for l, r in zip(left, right)]
    expected = IntervalIndex._simple_new(
        left=left, right=right, closed=closed, name=name)

    result = IntervalIndex(ivs, name=name)
    tm.assert_index_equal(result, expected)

    result = IntervalIndex.from_intervals(ivs, name=name)
    tm.assert_index_equal(result, expected)

    result = IntervalIndex.from_breaks(
        np.arange(5), closed=closed, name=name)
    tm.assert_index_equal(result, expected)

    result = IntervalIndex.from_arrays(
        left.values, right.values, closed=closed, name=name)
    tm.assert_index_equal(result, expected)

    result = IntervalIndex.from_tuples(
        zip(left, right), closed=closed, name=name)
    tm.assert_index_equal(result, expected)

    result = Index(ivs, name=name)
    assert isinstance(result, IntervalIndex)
    tm.assert_index_equal(result, expected)

    # idempotent
    tm.assert_index_equal(Index(expected), expected)
    tm.assert_index_equal(IntervalIndex(expected), expected)

    result = IntervalIndex.from_intervals(
        expected.values, name=expected.name)
    tm.assert_index_equal(result, expected)

    left, right = expected.left, expected.right
    result = IntervalIndex.from_arrays(
        left, right, closed=expected.closed, name=expected.name)
    tm.assert_index_equal(result, expected)

    result = IntervalIndex.from_tuples(
        expected.to_tuples(), closed=expected.closed, name=expected.name)
    tm.assert_index_equal(result, expected)

    breaks = expected.left.tolist() + [expected.right[-1]]
    result = IntervalIndex.from_breaks(
        breaks, closed=expected.closed, name=expected.name)
    tm.assert_index_equal(result, expected)
def insert(self):
    ins = self.insert_statement()
    data_list = []

    # to avoid if check for every row
    keys = self.frame.columns
    if self.index is not None:
        for t in self.frame.itertuples():
            data = dict((k, self.maybe_asscalar(v))
                        for k, v in zip(keys, t[1:]))
            data[self.index] = self.maybe_asscalar(t[0])
            data_list.append(data)
    else:
        for t in self.frame.itertuples():
            data = dict((k, self.maybe_asscalar(v))
                        for k, v in zip(keys, t[1:]))
            data_list.append(data)
    self.pd_sql.execute(ins, data_list)
def groupby1(lat, lon, data):
    indexer = np.lexsort((lon, lat))
    lat = lat.take(indexer)
    lon = lon.take(indexer)
    sorted_data = data.take(indexer)

    keys = 1000.0 * lat + lon
    unique_keys = np.unique(keys)
    bounds = keys.searchsorted(unique_keys)
    result = group_agg(sorted_data, bounds, lambda x: x.mean())
    decoder = keys.searchsorted(unique_keys)

    return dict(zip(zip(lat.take(decoder), lon.take(decoder)), result))
def _combineFrame(self, other, func, axis=0):
    index, columns = self._get_plane_axes(axis)
    axis = self._get_axis_number(axis)

    other = other.reindex(index=index, columns=columns)

    if axis == 0:
        new_values = func(self.values, other.values)
    elif axis == 1:
        new_values = func(self.values.swapaxes(0, 1), other.values.T)
        new_values = new_values.swapaxes(0, 1)
    elif axis == 2:
        new_values = func(self.values.swapaxes(0, 2), other.values)
        new_values = new_values.swapaxes(0, 2)

    # TODO: make faster!
    new_frames = {}
    for item, item_slice in zip(self.items, new_values):
        old_frame = self[item]
        ofv = old_frame.default_fill_value
        ok = old_frame.default_kind
        new_frames[item] = SparseDataFrame(item_slice,
                                           index=self.major_axis,
                                           columns=self.minor_axis,
                                           default_fill_value=ofv,
                                           default_kind=ok)

    return self._new_like(new_frames)
def _create_table_statement(self):
    "Return a CREATE TABLE statement to suit the contents of a DataFrame."

    columns = list(map(str, self.frame.columns))
    # raw string avoids an invalid-escape warning on newer Python versions
    pat = re.compile(r'\s+')
    if any(map(pat.search, columns)):
        warnings.warn(_SAFE_NAMES_WARNING)
    column_types = [self._sql_type_name(typ) for typ in self.frame.dtypes]

    if self.index is not None:
        for i, idx_label in enumerate(self.index[::-1]):
            columns.insert(0, idx_label)
            column_types.insert(0, self._sql_type_name(
                self.frame.index.get_level_values(i).dtype))

    flv = self.pd_sql.flavor

    br_l = _SQL_SYMB[flv]['br_l']  # left val quote char
    br_r = _SQL_SYMB[flv]['br_r']  # right val quote char
    col_template = br_l + '%s' + br_r + ' %s'

    columns = ',\n '.join(col_template % x
                          for x in zip(columns, column_types))
    template = """CREATE TABLE %(name)s (
                  %(columns)s
                  )"""
    create_statement = template % {'name': self.name, 'columns': columns}
    return create_statement
def _init_data(self, data, copy, dtype, **kwargs):
    """
    Generate ND initialization; axes are passed
    as required objects to __init__
    """
    if data is None:
        data = {}
    if dtype is not None:
        dtype = self._validate_dtype(dtype)

    passed_axes = [kwargs.get(a) for a in self._AXIS_ORDERS]
    axes = None
    if isinstance(data, BlockManager):
        if any(x is not None for x in passed_axes):
            axes = [x if x is not None else y
                    for x, y in zip(passed_axes, data.axes)]
        mgr = data
    elif isinstance(data, dict):
        mgr = self._init_dict(data, passed_axes, dtype=dtype)
        copy = False
        dtype = None
    elif isinstance(data, (np.ndarray, list)):
        mgr = self._init_matrix(data, passed_axes, dtype=dtype, copy=copy)
        copy = False
        dtype = None
    else:  # pragma: no cover
        raise PandasError('Panel constructor not properly called!')

    NDFrame.__init__(self, mgr, axes=axes, copy=copy, dtype=dtype)
def __unicode__(self):
    """
    Return a string representation for a particular Panel

    Invoked by unicode(df) in py2 only.
    Yields a Unicode String in both py2/py3.
    """
    class_name = str(self.__class__)

    shape = self.shape
    dims = u('Dimensions: %s') % ' x '.join(
        ["%d (%s)" % (s, a) for a, s in zip(self._AXIS_ORDERS, shape)])

    def axis_pretty(a):
        v = getattr(self, a)
        if len(v) > 0:
            return u('%s axis: %s to %s') % (a.capitalize(),
                                             com.pprint_thing(v[0]),
                                             com.pprint_thing(v[-1]))
        else:
            return u('%s axis: None') % a.capitalize()

    output = '\n'.join(
        [class_name, dims] + [axis_pretty(a) for a in self._AXIS_ORDERS])
    return output
def _get_join_indexers(left_keys, right_keys, sort=False, how='inner'):
    """

    Parameters
    ----------

    Returns
    -------

    """
    from functools import partial

    assert len(left_keys) == len(right_keys), \
        'left_key and right_keys must be the same length'

    # bind `sort` arg. of _factorize_keys
    fkeys = partial(_factorize_keys, sort=sort)

    # get left & right join labels and num. of levels at each location
    llab, rlab, shape = map(list, zip(* map(fkeys, left_keys, right_keys)))

    # get flat i8 keys from label lists
    lkey, rkey = _get_join_keys(llab, rlab, shape, sort)

    # factorize keys to a dense i8 space
    # `count` is the num. of unique keys
    # set(lkey) | set(rkey) == range(count)
    lkey, rkey, count = fkeys(lkey, rkey)

    # preserve left frame order if how == 'left' and sort == False
    kwargs = {'sort': sort} if how == 'left' else {}
    join_func = _join_functions[how]
    return join_func(lkey, rkey, count, **kwargs)
def get_data_famafrench(name):
    # path of zip files
    zip_file_path = '{0}/{1}.zip'.format(_FAMAFRENCH_URL, name)

    with urlopen(zip_file_path) as url:
        raw = url.read()

    with tempfile.TemporaryFile() as tmpf:
        tmpf.write(raw)

        with ZipFile(tmpf, 'r') as zf:
            data = zf.open(zf.namelist()[0]).readlines()

    line_lengths = np.array(lmap(len, data))
    file_edges = np.where(line_lengths == 2)[0]

    datasets = {}
    edges = zip(file_edges + 1, file_edges[1:])
    for i, (left_edge, right_edge) in enumerate(edges):
        dataset = [d.split() for d in data[left_edge:right_edge]]
        if len(dataset) > 10:
            ncol_raw = np.array(lmap(len, dataset))
            ncol = np.median(ncol_raw)
            header_index = np.where(ncol_raw == ncol - 1)[0][-1]
            header = dataset[header_index]
            ds_header = dataset[header_index + 1:]
            # to ensure the header is unique
            header = ['{0} {1}'.format(j, hj)
                      for j, hj in enumerate(header, start=1)]
            index = np.array([d[0] for d in ds_header], dtype=int)
            dataset = np.array([d[1:] for d in ds_header], dtype=float)
            datasets[i] = DataFrame(dataset, index, columns=header)

    return datasets
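# Hedged usage sketch (not part of the original module): download one of the
# Fama/French zip files and inspect the sub-tables it contains. The dataset
# name below is only an example of a valid Fama/French file name.
ff_factors = get_data_famafrench('F-F_Research_Data_Factors')
for key, frame in ff_factors.items():
    print(key, frame.shape)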
def __init__(self, values, index, level=-1, value_columns=None):
    if values.ndim == 1:
        values = values[:, np.newaxis]
    self.values = values
    self.value_columns = value_columns

    if value_columns is None and values.shape[1] != 1:  # pragma: no cover
        raise ValueError('must pass column labels for multi-column data')

    self.index = index
    self.level = self.index._get_level_number(level)

    levels = index.levels
    labels = index.labels

    def _make_index(lev, lab):
        i = lev.__class__(_make_index_array_level(lev.values, lab))
        i.name = lev.name
        return i

    self.new_index_levels = list([_make_index(lev, lab)
                                  for lev, lab in zip(levels, labels)])
    self.new_index_names = list(index.names)

    self.removed_name = self.new_index_names.pop(self.level)
    self.removed_level = self.new_index_levels.pop(self.level)

    self._make_sorted_values_labels()
    self._make_selectors()
def test_convert_r_dataframe():
    is_na = robj.baseenv.get("is.na")

    seriesd = _test.getSeriesData()
    frame = pd.DataFrame(seriesd, columns=['D', 'C', 'B', 'A'])

    # Null data
    frame["E"] = [np.nan for item in frame["A"]]
    # Some mixed type data
    frame["F"] = ["text" if item % 2 == 0 else np.nan for item in range(30)]

    r_dataframe = convert_to_r_dataframe(frame)

    assert np.array_equal(convert_robj(r_dataframe.rownames), frame.index)
    assert np.array_equal(convert_robj(r_dataframe.colnames), frame.columns)
    assert all(is_na(item) for item in r_dataframe.rx2("E"))

    for column in frame[["A", "B", "C", "D"]]:
        coldata = r_dataframe.rx2(column)
        original_data = frame[column]
        assert np.array_equal(convert_robj(coldata), original_data)

    for column in frame[["D", "E"]]:
        for original, converted in zip(frame[column],
                                       r_dataframe.rx2(column)):
            if pd.isnull(original):
                assert is_na(converted)
            else:
                assert original == converted
def test_format_timedelta_ticks_wide(self):
    if is_platform_mac():
        pytest.skip("skip on mac for precision display issue on older mpl")

    expected_labels = [
        '00:00:00',
        '1 days 03:46:40',
        '2 days 07:33:20',
        '3 days 11:20:00',
        '4 days 15:06:40',
        '5 days 18:53:20',
        '6 days 22:40:00',
        '8 days 02:26:40',
        ''
    ]

    rng = timedelta_range('0', periods=10, freq='1 d')
    df = DataFrame(np.random.randn(len(rng), 3), rng)
    ax = df.plot(fontsize=2)
    fig = ax.get_figure()
    fig.canvas.draw()
    labels = ax.get_xticklabels()
    self.assertEqual(len(labels), len(expected_labels))
    for l, l_expected in zip(labels, expected_labels):
        self.assertEqual(l.get_text(), l_expected)
def _get_multiindex_indexer(join_keys, index, sort):
    from functools import partial

    # bind `sort` argument
    fkeys = partial(_factorize_keys, sort=sort)

    # left & right join labels and num. of levels at each location
    rlab, llab, shape = map(list, zip(* map(fkeys, index.levels, join_keys)))
    if sort:
        rlab = list(map(np.take, rlab, index.labels))
    else:
        i8copy = lambda a: a.astype('i8', subok=False, copy=True)
        rlab = list(map(i8copy, index.labels))

    # fix right labels if there were any nulls
    for i in range(len(join_keys)):
        mask = index.labels[i] == -1
        if mask.any():
            # check if there already was any nulls at this location
            # if there was, it is factorized to `shape[i] - 1`
            a = join_keys[i][llab[i] == shape[i] - 1]
            if a.size == 0 or not a[0] != a[0]:
                shape[i] += 1

            rlab[i][mask] = shape[i] - 1

    # get flat i8 join keys
    lkey, rkey = _get_join_keys(llab, rlab, shape, sort)

    # factorize keys to a dense i8 space
    lkey, rkey, count = fkeys(lkey, rkey)

    return _algos.left_outer_join(lkey, rkey, count, sort=sort)
def _get_new_axes(self):
    ndim = self._get_result_dim()
    new_axes = [None] * ndim

    if self.join_axes is None:
        for i in range(ndim):
            if i == self.axis:
                continue
            new_axes[i] = self._get_comb_axis(i)
    else:
        if len(self.join_axes) != ndim - 1:
            raise AssertionError("length of join_axes must be "
                                 "equal to {0}".format(ndim - 1))

        # ufff...
        indices = lrange(ndim)
        indices.remove(self.axis)

        for i, ax in zip(indices, self.join_axes):
            new_axes[i] = ax

    if self.ignore_index:
        concat_axis = None
    else:
        concat_axis = self._get_concat_axis()

    new_axes[self.axis] = concat_axis
    return new_axes
def _get_expiry_dates_and_links(self):
    """
    Gets available expiry dates.

    Returns
    -------
    Tuple of:
    List of datetime.date objects
    Dict of datetime.date objects as keys and corresponding links
    """
    url = self._OPTIONS_BASE_URL.format(sym=self.symbol)
    root = self._parse_url(url)

    try:
        links = root.xpath('//*[@id="options_menu"]/form/select/option')
    except IndexError:
        raise RemoteDataError('Expiry dates not available')

    expiry_dates = [dt.datetime.strptime(element.text, "%B %d, %Y").date()
                    for element in links]
    links = [element.attrib['data-selectbox-link'] for element in links]
    expiry_links = dict(zip(expiry_dates, links))
    self._expiry_links = expiry_links
    self._expiry_dates = expiry_dates

    return expiry_dates, expiry_links
def test_time_musec(self):
    t = datetime(1, 1, 1, 3, 30, 0)
    deltas = np.random.randint(1, 20, 3).cumsum()
    ts = np.array([(t + timedelta(microseconds=int(x))).time()
                   for x in deltas])
    df = DataFrame({'a': np.random.randn(len(ts)),
                    'b': np.random.randn(len(ts))},
                   index=ts)
    _, ax = self.plt.subplots()
    ax = df.plot(ax=ax)

    # verify tick labels
    ticks = ax.get_xticks()
    labels = ax.get_xticklabels()
    for t, l in zip(ticks, labels):
        m, s = divmod(int(t), 60)

        # TODO: unused?
        # us = int((t - int(t)) * 1e6)

        h, m = divmod(m, 60)
        xp = l.get_text()
        if len(xp) > 0:
            rs = time(h, m, s).strftime('%H:%M:%S.%f')
            assert xp == rs
def get_data_fred(name, start=dt.datetime(2010, 1, 1),
                  end=dt.datetime.today()):
    """
    Get data for the given name from the St. Louis FED (FRED).
    Date format is datetime

    Returns a DataFrame.

    If multiple names are passed for "series" then the index of the
    DataFrame is the outer join of the indices of each series.
    """
    start, end = _sanitize_dates(start, end)

    if not is_list_like(name):
        names = [name]
    else:
        names = name

    urls = [_FRED_URL + "%s" % n + "/downloaddata/%s" % n + ".csv"
            for n in names]

    def fetch_data(url, name):
        with urlopen(url) as resp:
            data = read_csv(resp, index_col=0, parse_dates=True,
                            header=None, skiprows=1, names=["DATE", name],
                            na_values=".")
        try:
            return data.truncate(start, end)
        except KeyError:
            if data.ix[3].name[7:12] == "Error":
                raise IOError("Failed to get the data. Check that {0!r} is "
                              "a valid FRED series.".format(name))
            raise

    df = concat([fetch_data(url, n) for url, n in zip(urls, names)],
                axis=1, join="outer")
    return df
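# Hedged usage sketch (not part of the original module): fetch two FRED series
# and outer-join them on their date index. The series codes are examples only.
import datetime as dt

fred_df = get_data_fred(["GDP", "CPIAUCSL"],
                        start=dt.datetime(2010, 1, 1),
                        end=dt.datetime(2013, 1, 1))
print(fred_df.head())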
def _save_chunk(self, start_i, end_i):

    data_index = self.data_index

    # create the data for a chunk
    slicer = slice(start_i, end_i)
    for i in range(len(self.blocks)):
        b = self.blocks[i]
        d = b.to_native_types(slicer=slicer, na_rep=self.na_rep,
                              float_format=self.float_format,
                              decimal=self.decimal,
                              date_format=self.date_format,
                              quoting=self.quoting)

        for col_loc, col in zip(b.mgr_locs, d):
            # self.data is a preallocated list
            self.data[col_loc] = col

    ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep,
                                    float_format=self.float_format,
                                    decimal=self.decimal,
                                    date_format=self.date_format,
                                    quoting=self.quoting)

    libwriters.write_csv_rows(self.data, ix, self.nlevels,
                              self.cols, self.writer)
def _get_merge_keys(self): """ Note: has side effects (copy/delete key columns) Parameters ---------- left right on Returns ------- left_keys, right_keys """ self._validate_specification() left_keys = [] right_keys = [] join_names = [] right_drop = [] left_drop = [] left, right = self.left, self.right is_lkey = lambda x: isinstance( x, (np.ndarray, ABCSeries)) and len(x) == len(left) is_rkey = lambda x: isinstance( x, (np.ndarray, ABCSeries)) and len(x) == len(right) # ugh, spaghetti re #733 if _any(self.left_on) and _any(self.right_on): for lk, rk in zip(self.left_on, self.right_on): if is_lkey(lk): left_keys.append(lk) if is_rkey(rk): right_keys.append(rk) join_names.append(None) # what to do? else: right_keys.append(right[rk]._values) join_names.append(rk) else: if not is_rkey(rk): right_keys.append(right[rk]._values) if lk == rk: # avoid key upcast in corner case (length-0) if len(left) > 0: right_drop.append(rk) else: left_drop.append(lk) else: right_keys.append(rk) left_keys.append(left[lk]._values) join_names.append(lk) elif _any(self.left_on): for k in self.left_on: if is_lkey(k): left_keys.append(k) join_names.append(None) else: left_keys.append(left[k]._values) join_names.append(k) if isinstance(self.right.index, MultiIndex): right_keys = [lev._values.take(lab) for lev, lab in zip(self.right.index.levels, self.right.index.labels)] else: right_keys = [self.right.index.values] elif _any(self.right_on): for k in self.right_on: if is_rkey(k): right_keys.append(k) join_names.append(None) else: right_keys.append(right[k]._values) join_names.append(k) if isinstance(self.left.index, MultiIndex): left_keys = [lev._values.take(lab) for lev, lab in zip(self.left.index.levels, self.left.index.labels)] else: left_keys = [self.left.index.values] if left_drop: self.left = self.left.drop(left_drop, axis=1) if right_drop: self.right = self.right.drop(right_drop, axis=1) return left_keys, right_keys, join_names
def __init__(self, objs, axis=0, join='outer', join_axes=None, keys=None, levels=None, names=None, ignore_index=False, verify_integrity=False, copy=True): if isinstance(objs, (NDFrame, compat.string_types)): raise TypeError('first argument must be an iterable of pandas ' 'objects, you passed an object of type ' '"{0}"'.format(type(objs).__name__)) if join == 'outer': self.intersect = False elif join == 'inner': self.intersect = True else: # pragma: no cover raise ValueError('Only can inner (intersect) or outer (union) ' 'join the other axis') if isinstance(objs, dict): if keys is None: keys = sorted(objs) objs = [objs[k] for k in keys] else: objs = list(objs) if len(objs) == 0: raise ValueError('No objects to concatenate') if keys is None: objs = [obj for obj in objs if obj is not None] else: # #1649 clean_keys = [] clean_objs = [] for k, v in zip(keys, objs): if v is None: continue clean_keys.append(k) clean_objs.append(v) objs = clean_objs keys = clean_keys if len(objs) == 0: raise ValueError('All objects passed were None') # consolidate data & figure out what our result ndim is going to be ndims = set() for obj in objs: if not isinstance(obj, NDFrame): raise TypeError("cannot concatenate a non-NDFrame object") # consolidate obj.consolidate(inplace=True) ndims.add(obj.ndim) # get the sample # want the higest ndim that we have, and must be non-empty # unless all objs are empty sample = None if len(ndims) > 1: max_ndim = max(ndims) for obj in objs: if obj.ndim == max_ndim and np.sum(obj.shape): sample = obj break else: # filter out the empties if we have not multi-index possibiltes # note to keep empty Series as it affect to result columns / name non_empties = [obj for obj in objs if sum(obj.shape) > 0 or isinstance(obj, Series)] if (len(non_empties) and (keys is None and names is None and levels is None and join_axes is None)): objs = non_empties sample = objs[0] if sample is None: sample = objs[0] self.objs = objs # Need to flip BlockManager axis in the DataFrame special case self._is_frame = isinstance(sample, DataFrame) if self._is_frame: axis = 1 if axis == 0 else 0 self._is_series = isinstance(sample, ABCSeries) if not 0 <= axis <= sample.ndim: raise AssertionError("axis must be between 0 and {0}, " "input was {1}".format(sample.ndim, axis)) # if we have mixed ndims, then convert to highest ndim # creating column numbers as needed if len(ndims) > 1: current_column = 0 max_ndim = sample.ndim self.objs, objs = [], self.objs for obj in objs: ndim = obj.ndim if ndim == max_ndim: pass elif ndim != max_ndim - 1: raise ValueError("cannot concatenate unaligned mixed " "dimensional NDFrame objects") else: name = getattr(obj, 'name', None) if ignore_index or name is None: name = current_column current_column += 1 # doing a row-wise concatenation so need everything # to line up if self._is_frame and axis == 1: name = 0 obj = sample._constructor({name: obj}) self.objs.append(obj) # note: this is the BlockManager axis (since DataFrame is transposed) self.axis = axis self.join_axes = join_axes self.keys = keys self.names = names self.levels = levels self.ignore_index = ignore_index self.verify_integrity = verify_integrity self.copy = copy self.new_axes = self._get_new_axes()
def groupings(self):
    from pandas.core.groupby.grouper import Grouping
    return [
        Grouping(lvl, lvl, in_axis=False, level=None, name=name)
        for lvl, name in zip(self.levels, self.names)
    ]
def _get_colors_mapped(self, series, colors):
    unique = series.unique()
    # the number of unique values and the number of colors can differ,
    # depending on the slice value
    mapped = dict(zip(unique, colors))
    return [mapped[v] for v in series.values]
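# Hedged illustration (not from the original tests): each unique value in the
# Series gets one color, and the returned list repeats that color wherever the
# value repeats. The data below is made up; this reproduces the core logic
# without needing the test-class instance.
import pandas as pd

s = pd.Series(['a', 'b', 'a', 'c'])
colors = ['red', 'green', 'blue']

unique = s.unique()
mapped = dict(zip(unique, colors))
print([mapped[v] for v in s.values])  # ['red', 'green', 'red', 'blue']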
def _maybe_add_join_keys(self, result, left_indexer, right_indexer): left_has_missing = None right_has_missing = None keys = zip(self.join_names, self.left_on, self.right_on) for i, (name, lname, rname) in enumerate(keys): if not _should_fill(lname, rname): continue take_left, take_right = None, None if name in result: if left_indexer is not None and right_indexer is not None: if name in self.left: if left_has_missing is None: left_has_missing = any(left_indexer == -1) if left_has_missing: take_right = self.right_join_keys[i] if not com.is_dtype_equal(result[name].dtype, self.left[name].dtype): take_left = self.left[name]._values elif name in self.right: if right_has_missing is None: right_has_missing = any(right_indexer == -1) if right_has_missing: take_left = self.left_join_keys[i] if not com.is_dtype_equal(result[name].dtype, self.right[name].dtype): take_right = self.right[name]._values elif left_indexer is not None \ and isinstance(self.left_join_keys[i], np.ndarray): take_left = self.left_join_keys[i] take_right = self.right_join_keys[i] if take_left is not None or take_right is not None: if take_left is None: lvals = result[name]._values else: lfill = na_value_for_dtype(take_left.dtype) lvals = algos.take_1d(take_left, left_indexer, fill_value=lfill) if take_right is None: rvals = result[name]._values else: rfill = na_value_for_dtype(take_right.dtype) rvals = algos.take_1d(take_right, right_indexer, fill_value=rfill) # if we have an all missing left_indexer # make sure to just use the right values mask = left_indexer == -1 if mask.all(): key_col = rvals else: key_col = Index(lvals).where(~mask, rvals) if name in result: result[name] = key_col else: result.insert(i, name or 'key_%d' % i, key_col)
def _check_colors(self, collections, linecolors=None, facecolors=None, mapping=None): """ Check each artist has expected line colors and face colors Parameters ---------- collections : list-like list or collection of target artist linecolors : list-like which has the same length as collections list of expected line colors facecolors : list-like which has the same length as collections list of expected face colors mapping : Series Series used for color grouping key used for andrew_curves, parallel_coordinates, radviz test """ from matplotlib.lines import Line2D from matplotlib.collections import (Collection, PolyCollection, LineCollection) conv = self.colorconverter if linecolors is not None: if mapping is not None: linecolors = self._get_colors_mapped(mapping, linecolors) linecolors = linecolors[:len(collections)] self.assertEqual(len(collections), len(linecolors)) for patch, color in zip(collections, linecolors): if isinstance(patch, Line2D): result = patch.get_color() # Line2D may contains string color expression result = conv.to_rgba(result) elif isinstance(patch, (PolyCollection, LineCollection)): result = tuple(patch.get_edgecolor()[0]) else: result = patch.get_edgecolor() expected = conv.to_rgba(color) self.assertEqual(result, expected) if facecolors is not None: if mapping is not None: facecolors = self._get_colors_mapped(mapping, facecolors) facecolors = facecolors[:len(collections)] self.assertEqual(len(collections), len(facecolors)) for patch, color in zip(collections, facecolors): if isinstance(patch, Collection): # returned as list of np.array result = patch.get_facecolor()[0] else: result = patch.get_facecolor() if isinstance(result, np.ndarray): result = tuple(result) expected = conv.to_rgba(color) self.assertEqual(result, expected)
def _make_concat_multiindex(indexes, keys, levels=None, names=None): if ((levels is None and isinstance(keys[0], tuple)) or (levels is not None and len(levels) > 1)): zipped = lzip(*keys) if names is None: names = [None] * len(zipped) if levels is None: levels = [Categorical.from_array( zp, ordered=True).categories for zp in zipped] else: levels = [_ensure_index(x) for x in levels] else: zipped = [keys] if names is None: names = [None] if levels is None: levels = [_ensure_index(keys)] else: levels = [_ensure_index(x) for x in levels] if not _all_indexes_same(indexes): label_list = [] # things are potentially different sizes, so compute the exact labels # for each level and pass those to MultiIndex.from_arrays for hlevel, level in zip(zipped, levels): to_concat = [] for key, index in zip(hlevel, indexes): try: i = level.get_loc(key) except KeyError: raise ValueError('Key %s not in level %s' % (str(key), str(level))) to_concat.append(np.repeat(i, len(index))) label_list.append(np.concatenate(to_concat)) concat_index = _concat_indexes(indexes) # these go at the end if isinstance(concat_index, MultiIndex): levels.extend(concat_index.levels) label_list.extend(concat_index.labels) else: factor = Categorical.from_array(concat_index, ordered=True) levels.append(factor.categories) label_list.append(factor.codes) if len(names) == len(levels): names = list(names) else: # make sure that all of the passed indices have the same nlevels if not len(set([idx.nlevels for idx in indexes])) == 1: raise AssertionError("Cannot concat indices that do" " not have the same number of levels") # also copies names = names + _get_consensus_names(indexes) return MultiIndex(levels=levels, labels=label_list, names=names, verify_integrity=False) new_index = indexes[0] n = len(new_index) kpieces = len(indexes) # also copies new_names = list(names) new_levels = list(levels) # construct labels new_labels = [] # do something a bit more speedy for hlevel, level in zip(zipped, levels): hlevel = _ensure_index(hlevel) mapped = level.get_indexer(hlevel) mask = mapped == -1 if mask.any(): raise ValueError('Values not found in passed level: %s' % str(hlevel[mask])) new_labels.append(np.repeat(mapped, n)) if isinstance(new_index, MultiIndex): new_levels.extend(new_index.levels) new_labels.extend([np.tile(lab, kpieces) for lab in new_index.labels]) else: new_levels.append(new_index) new_labels.append(np.tile(np.arange(n), kpieces)) if len(new_names) < len(new_levels): new_names.extend(new_index.names) return MultiIndex(levels=new_levels, labels=new_labels, names=new_names, verify_integrity=False)
def _stack_multi_columns(frame, level_num=-1, dropna=True): def _convert_level_number(level_num, columns): """ Logic for converting the level number to something we can safely pass to swaplevel: We generally want to convert the level number into a level name, except when columns do not have names, in which case we must leave as a level number """ if level_num in columns.names: return columns.names[level_num] else: if columns.names[level_num] is None: return level_num else: return columns.names[level_num] this = frame.copy() # this makes life much simpler if level_num != frame.columns.nlevels - 1: # roll levels to put selected level at end roll_columns = this.columns for i in range(level_num, frame.columns.nlevels - 1): # Need to check if the ints conflict with level names lev1 = _convert_level_number(i, roll_columns) lev2 = _convert_level_number(i + 1, roll_columns) roll_columns = roll_columns.swaplevel(lev1, lev2) this.columns = roll_columns if not this.columns.is_lexsorted(): # Workaround the edge case where 0 is one of the column names, # which interferes with trying to sort based on the first # level level_to_sort = _convert_level_number(0, this.columns) this = this.sort_index(level=level_to_sort, axis=1) # tuple list excluding level for grouping columns if len(frame.columns.levels) > 2: tuples = list( zip(*[ lev.take(lab) for lev, lab in zip(this.columns.levels[:-1], this.columns.labels[:-1]) ])) unique_groups = [key for key, _ in itertools.groupby(tuples)] new_names = this.columns.names[:-1] new_columns = MultiIndex.from_tuples(unique_groups, names=new_names) else: new_columns = unique_groups = this.columns.levels[0] # time to ravel the values new_data = {} level_vals = this.columns.levels[-1] level_labels = sorted(set(this.columns.labels[-1])) level_vals_used = level_vals[level_labels] levsize = len(level_labels) drop_cols = [] for key in unique_groups: loc = this.columns.get_loc(key) # can make more efficient? # we almost always return a slice # but if unsorted can get a boolean # indexer if not isinstance(loc, slice): slice_len = len(loc) else: slice_len = loc.stop - loc.start if slice_len == 0: drop_cols.append(key) continue elif slice_len != levsize: chunk = this.loc[:, this.columns[loc]] chunk.columns = level_vals.take(chunk.columns.labels[-1]) value_slice = chunk.reindex(columns=level_vals_used).values else: if frame._is_mixed_type: value_slice = this.loc[:, this.columns[loc]].values else: value_slice = this.values[:, loc] new_data[key] = value_slice.ravel() if len(drop_cols) > 0: new_columns = new_columns.difference(drop_cols) N = len(this) if isinstance(this.index, MultiIndex): new_levels = list(this.index.levels) new_names = list(this.index.names) new_labels = [lab.repeat(levsize) for lab in this.index.labels] else: new_levels = [this.index] new_labels = [np.arange(N).repeat(levsize)] new_names = [this.index.name] # something better? new_levels.append(level_vals) new_labels.append(np.tile(level_labels, N)) new_names.append(frame.columns.names[level_num]) new_index = MultiIndex(levels=new_levels, labels=new_labels, names=new_names, verify_integrity=False) result = DataFrame(new_data, index=new_index, columns=new_columns) # more efficient way to go about this? can do the whole masking biz but # will only save a small amount of time... if dropna: result = result.dropna(axis=0, how='all') return result
def create_data(constructor): return dict(zip((constructor(x) for x in dates_as_str), values))
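# Hedged usage sketch: `dates_as_str` and `values` are module-level fixtures in
# the original test file and are not shown here; the definitions below are
# stand-ins only, to show the intended shape of the result.
from pandas import Timestamp

dates_as_str = ['2013-01-01', '2013-01-02']
values = [1, 2]

print(create_data(str))        # {'2013-01-01': 1, '2013-01-02': 2}
print(create_data(Timestamp))  # {Timestamp('2013-01-01 00:00:00'): 1, ...}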
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, sparse=False, drop_first=False, dtype=None): # Series avoids inconsistent NaN handling codes, levels = _factorize_from_iterable(Series(data)) if dtype is None: dtype = np.uint8 dtype = np.dtype(dtype) if is_object_dtype(dtype): raise ValueError("dtype=object is not a valid dtype for get_dummies") def get_empty_Frame(data, sparse): if isinstance(data, Series): index = data.index else: index = np.arange(len(data)) if not sparse: return DataFrame(index=index) else: return SparseDataFrame(index=index, default_fill_value=0) # if all NaN if not dummy_na and len(levels) == 0: return get_empty_Frame(data, sparse) codes = codes.copy() if dummy_na: codes[codes == -1] = len(levels) levels = np.append(levels, np.nan) # if dummy_na, we just fake a nan level. drop_first will drop it again if drop_first and len(levels) == 1: return get_empty_Frame(data, sparse) number_of_cols = len(levels) if prefix is not None: dummy_strs = [ u'{prefix}{sep}{level}' if isinstance(v, text_type) else '{prefix}{sep}{level}' for v in levels ] dummy_cols = [ dummy_str.format(prefix=prefix, sep=prefix_sep, level=v) for dummy_str, v in zip(dummy_strs, levels) ] else: dummy_cols = levels if isinstance(data, Series): index = data.index else: index = None if sparse: sparse_series = {} N = len(data) sp_indices = [[] for _ in range(len(dummy_cols))] for ndx, code in enumerate(codes): if code == -1: # Blank entries if not dummy_na and code == -1, #GH4446 continue sp_indices[code].append(ndx) if drop_first: # remove first categorical level to avoid perfect collinearity # GH12042 sp_indices = sp_indices[1:] dummy_cols = dummy_cols[1:] for col, ixs in zip(dummy_cols, sp_indices): sarr = SparseArray(np.ones(len(ixs), dtype=dtype), sparse_index=IntIndex(N, ixs), fill_value=0, dtype=dtype) sparse_series[col] = SparseSeries(data=sarr, index=index) out = SparseDataFrame(sparse_series, index=index, columns=dummy_cols, default_fill_value=0, dtype=dtype) return out else: dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0) if not dummy_na: # reset NaN GH4446 dummy_mat[codes == -1] = 0 if drop_first: # remove first GH12042 dummy_mat = dummy_mat[:, 1:] dummy_cols = dummy_cols[1:] return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=False): """ Convert categorical variable into dummy/indicator variables Parameters ---------- data : array-like, Series, or DataFrame prefix : string, list of strings, or dict of strings, default None String to append DataFrame column names Pass a list with length equal to the number of columns when calling get_dummies on a DataFrame. Alternatively, `prefix` can be a dictionary mapping column names to prefixes. prefix_sep : string, default '_' If appending prefix, separator/delimiter to use. Or pass a list or dictionary as with `prefix.` dummy_na : bool, default False Add a column to indicate NaNs, if False NaNs are ignored. columns : list-like, default None Column names in the DataFrame to be encoded. If `columns` is None then all the columns with `object` or `category` dtype will be converted. sparse : bool, default False Whether the dummy columns should be sparse or not. Returns SparseDataFrame if `data` is a Series or if all columns are included. Otherwise returns a DataFrame with some SparseBlocks. drop_first : bool, default False Whether to get k-1 dummies out of k categorical levels by removing the first level. .. versionadded:: 0.18.0 Returns ------- dummies : DataFrame or SparseDataFrame Examples -------- >>> import pandas as pd >>> s = pd.Series(list('abca')) >>> pd.get_dummies(s) a b c 0 1 0 0 1 0 1 0 2 0 0 1 3 1 0 0 >>> s1 = ['a', 'b', np.nan] >>> pd.get_dummies(s1) a b 0 1 0 1 0 1 2 0 0 >>> pd.get_dummies(s1, dummy_na=True) a b NaN 0 1 0 0 1 0 1 0 2 0 0 1 >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], ... 'C': [1, 2, 3]}) >>> pd.get_dummies(df, prefix=['col1', 'col2']) C col1_a col1_b col2_a col2_b col2_c 0 1 1 0 0 1 0 1 2 0 1 1 0 0 2 3 1 0 0 0 1 >>> pd.get_dummies(pd.Series(list('abcaa'))) a b c 0 1 0 0 1 0 1 0 2 0 0 1 3 1 0 0 4 1 0 0 >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True) b c 0 0 0 1 1 0 2 0 1 3 0 0 4 0 0 See Also -------- Series.str.get_dummies """ from pandas.core.reshape.concat import concat from itertools import cycle if isinstance(data, DataFrame): # determine columns being encoded if columns is None: columns_to_encode = data.select_dtypes( include=['object', 'category']).columns else: columns_to_encode = columns # validate prefixes and separator to avoid silently dropping cols def check_len(item, name): len_msg = ("Length of '{name}' ({len_item}) did not match the " "length of the columns being encoded ({len_enc}).") if is_list_like(item): if not len(item) == len(columns_to_encode): len_msg = len_msg.format(name=name, len_item=len(item), len_enc=len(columns_to_encode)) raise ValueError(len_msg) check_len(prefix, 'prefix') check_len(prefix_sep, 'prefix_sep') if isinstance(prefix, compat.string_types): prefix = cycle([prefix]) if isinstance(prefix, dict): prefix = [prefix[col] for col in columns_to_encode] if prefix is None: prefix = columns_to_encode # validate separators if isinstance(prefix_sep, compat.string_types): prefix_sep = cycle([prefix_sep]) elif isinstance(prefix_sep, dict): prefix_sep = [prefix_sep[col] for col in columns_to_encode] if set(columns_to_encode) == set(data.columns): with_dummies = [] else: with_dummies = [data.drop(columns_to_encode, axis=1)] for (col, pre, sep) in zip(columns_to_encode, prefix, prefix_sep): dummy = _get_dummies_1d(data[col], prefix=pre, prefix_sep=sep, dummy_na=dummy_na, sparse=sparse, drop_first=drop_first) with_dummies.append(dummy) result = concat(with_dummies, axis=1) else: 
        result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na,
                                 sparse=sparse, drop_first=drop_first)
    return result