def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object): from pandas.core.series import Series if not len(arr): return np.ndarray(0, dtype=dtype) if isinstance(arr, Series): arr = arr.values if not isinstance(arr, np.ndarray): arr = np.asarray(arr, dtype=object) if na_mask: mask = isnull(arr) try: result = lib.map_infer_mask(arr, f, mask.view(np.uint8)) except (TypeError, AttributeError): def g(x): try: return f(x) except (TypeError, AttributeError): return na_value return _map(g, arr, dtype=dtype) if na_value is not np.nan: np.putmask(result, mask, na_value) if result.dtype == object: result = lib.maybe_convert_objects(result) return result else: return lib.map_infer(arr, f)
def _format_strings(self):
    """
    Format ``self.values`` element by element and return the results as a
    list.

    Non-null floats go through the float formatter (``self.float_format``,
    else ``print_config.float_format``, else a ``%g`` format built from
    ``print_config.precision``).  Every other value is rendered through
    ``self.formatter`` (default ``com.pprint_thing``) and prefixed with a
    single space; null values are rendered as ``self.na_rep`` when it is
    set, except ``None`` which is rendered as ``'None'``.
    """
    float_format = self.float_format
    if float_format is None:
        float_format = print_config.float_format
        if float_format is None:
            fmt_str = '%% .%dg' % print_config.precision

            def _percent_g(x):
                return fmt_str % x

            float_format = _percent_g

    if self.formatter is None:
        formatter = com.pprint_thing
    else:
        formatter = self.formatter

    def _render(value):
        is_missing = self.na_rep is not None and lib.checknull(value)
        if not is_missing:
            # object dtype
            return '%s' % formatter(value)
        return 'None' if value is None else self.na_rep

    vals = self.values
    float_flags = lib.map_infer(vals, com.is_float) & notnull(vals)

    # Non-floats receive the same one-space prefix whether or not any float
    # is present, so only the per-element float flag matters here.
    return [float_format(v) if flagged else ' %s' % _render(v)
            for v, flagged in zip(vals, float_flags)]
def _map(f, arr, na_mask=False, na_value=np.nan): if isinstance(arr, Series): arr = arr.values if not isinstance(arr, np.ndarray): arr = np.asarray(arr, dtype=object) if na_mask: mask = isnull(arr) try: result = lib.map_infer_mask(arr, f, mask.view(np.uint8)) except (TypeError, AttributeError): def g(x): try: return f(x) except (TypeError, AttributeError): return na_value return _map(g, arr) if na_value is not np.nan: np.putmask(result, mask, na_value) if result.dtype == object: result = lib.maybe_convert_objects(result) return result else: return lib.map_infer(arr, f)
def str_get_dummies(arr, sep='|'):
    """
    Build a frame of dummy/indicator columns from ``sep``-delimited strings.

    Each string in the Series is split on ``sep``; every distinct non-empty
    piece becomes a column holding 1 where the row contains that piece and
    0 otherwise.

    Parameters
    ----------
    sep : string, default "|"
        String to split on.

    Returns
    -------
    dummies : DataFrame

    Raises
    ------
    TypeError
        If ``arr`` is an Index.

    Examples
    --------
    >>> Series(['a|b', 'a', 'a|c']).str.get_dummies()
       a  b  c
    0  1  1  0
    1  1  0  0
    2  1  0  1

    >>> Series(['a|b', np.nan, 'a|c']).str.get_dummies()
       a  b  c
    0  1  1  0
    1  0  0  0
    2  1  0  1

    See Also
    --------
    pandas.get_dummies
    """
    from pandas.core.frame import DataFrame
    from pandas.core.index import Index

    # GH9980, Index.str does not support get_dummies() as it returns a frame
    if isinstance(arr, Index):
        raise TypeError(
            "get_dummies is not supported for string methods on Index")

    # TODO remove this hack?
    filled = arr.fillna('')
    try:
        wrapped = sep + filled + sep
    except TypeError:
        wrapped = sep + filled.astype(str) + sep

    # Collect every distinct piece, then drop the empty string produced by
    # the separators added at both ends.
    seen = set()
    for pieces in wrapped.str.split(sep):
        seen.update(pieces)
    seen.discard("")
    tags = sorted(seen)

    dummies = np.empty((len(wrapped), len(tags)), dtype=np.int64)
    raw = wrapped.values
    for col, tag in enumerate(tags):
        # Matching on sep + tag + sep avoids hits on substrings of a piece.
        needle = sep + tag + sep
        dummies[:, col] = lib.map_infer(raw, lambda s: needle in s)
    return DataFrame(dummies, wrapped.index, tags)
def str_get_dummies(arr, sep='|'):
    """
    Turn ``sep``-delimited strings into a DataFrame of indicator columns.

    The Series values are split on ``sep``; the sorted distinct non-empty
    pieces become the columns, and each cell is 1 if the row's string
    contains that piece, else 0.

    Parameters
    ----------
    sep : string, default "|"
        String to split on.

    Returns
    -------
    dummies : DataFrame

    Raises
    ------
    TypeError
        If ``arr`` is an Index.

    Examples
    --------
    >>> Series(['a|b', 'a', 'a|c']).str.get_dummies()
       a  b  c
    0  1  1  0
    1  1  0  0
    2  1  0  1

    >>> Series(['a|b', np.nan, 'a|c']).str.get_dummies()
       a  b  c
    0  1  1  0
    1  0  0  0
    2  1  0  1

    See Also
    --------
    pandas.get_dummies
    """
    from pandas.core.frame import DataFrame
    from pandas.core.index import Index

    # GH9980, Index.str does not support get_dummies() as it returns a frame
    if isinstance(arr, Index):
        message = "get_dummies is not supported for string methods on Index"
        raise TypeError(message)

    # TODO remove this hack?
    no_na = arr.fillna('')
    try:
        padded = sep + no_na + sep
    except TypeError:
        padded = sep + no_na.astype(str) + sep

    labels = set()
    for parts in padded.str.split(sep):
        labels.update(parts)
    # The padding separators always yield an empty piece; it is not a tag.
    labels.discard("")
    tags = sorted(labels)

    n_rows, n_tags = len(padded), len(tags)
    dummies = np.empty((n_rows, n_tags), dtype=np.int64)
    cells = padded.values
    for position, tag in enumerate(tags):
        token = sep + tag + sep
        dummies[:, position] = lib.map_infer(cells, lambda s: token in s)
    return DataFrame(dummies, padded.index, tags)
def _dt_box_array(arr, offset=None, tz=None): if arr is None: return arr if not isinstance(arr, np.ndarray): return arr boxfunc = lambda x: Timestamp(x, offset=offset, tz=tz) return lib.map_infer(arr, boxfunc)
def auto_map(arr, f, otherargs, n_results=1, required="all"):
    """
    Map ``f`` over ``arr`` with extra arguments and wrap the output in
    Series indexed like ``arr``.

    When every entry of ``otherargs`` is a scalar, ``f(v, *otherargs)`` is
    applied to each element through ``lib.map_infer`` and one Series is
    returned.  Otherwise the work is delegated to ``map_iter_args`` with the
    zipped ``otherargs``; ``required="all"`` is expanded to the list of all
    argument positions.  A single Series is returned when ``n_results`` is
    1, else a list of Series.
    """
    from pandas.core.series import Series

    def _as_series(values):
        return Series(values, index=arr.index, copy=False)

    if all(np.isscalar(extra) for extra in otherargs):
        def _call(value):
            return f(value, *otherargs)

        return _as_series(lib.map_infer(arr, _call))

    n_otherargs = len(otherargs)
    if required == "all":
        required = list(range(n_otherargs))

    raw_columns = map_iter_args(arr, f, azip(*otherargs), n_otherargs,
                                required, n_results)
    wrapped = [_as_series(column) for column in raw_columns]
    return wrapped[0] if n_results == 1 else wrapped
def auto_map(arr, f, otherargs, n_results=1, required='all'):
    """
    Apply ``f`` element-wise to ``arr`` together with ``otherargs`` and
    return the result(s) as Series sharing ``arr.index``.

    All-scalar ``otherargs`` take the fast path: ``lib.map_infer`` calls
    ``f(v, *otherargs)`` per element and a single Series comes back.
    Otherwise ``map_iter_args`` is used with ``azip(*otherargs)``;
    ``required='all'`` stands for every position in ``otherargs``.  With
    ``n_results == 1`` the single Series is returned, otherwise the list of
    Series.
    """
    from pandas.core.series import Series

    scalars_only = all(np.isscalar(item) for item in otherargs)
    if scalars_only:
        mapped = lib.map_infer(arr, lambda elem: f(elem, *otherargs))
        return Series(mapped, index=arr.index, copy=False)

    n_otherargs = len(otherargs)
    if required == 'all':
        required = list(range(n_otherargs))

    pieces = map_iter_args(arr, f, azip(*otherargs), n_otherargs, required,
                           n_results)
    series_list = []
    for piece in pieces:
        series_list.append(Series(piece, index=arr.index, copy=False))

    if n_results != 1:
        return series_list
    return series_list[0]
def _format_strings(self, use_unicode=False):
    """
    Format ``self.values`` element by element and return the results as a
    list.

    Parameters
    ----------
    use_unicode : bool, default False
        Selects the default formatter used when ``self.formatter`` is None:
        ``_stringify`` with ``print_config.encoding`` when true, plain
        ``str`` otherwise.

    Non-null floats are rendered by the float formatter
    (``self.float_format``, else ``print_config.float_format``, else a
    ``%g`` format built from ``print_config.precision``).  All other values
    are rendered by the formatter and prefixed with one space; nulls become
    ``self.na_rep`` when it is set, except ``None`` which becomes "None".
    """
    float_format = self.float_format
    if float_format is None:
        float_format = print_config.float_format
        if float_format is None:
            fmt_str = "%% .%dg" % print_config.precision

            def _default_float(x):
                return fmt_str % x

            float_format = _default_float

    formatter = self.formatter
    if formatter is None:
        if use_unicode:
            def _strify(x):
                return _stringify(x, print_config.encoding)

            formatter = _strify
        else:
            formatter = str

    def _render(value):
        if self.na_rep is None or not lib.checknull(value):
            # object dtype
            return "%s" % formatter(value)
        if value is None:
            return "None"
        return self.na_rep

    vals = self.values
    float_flags = lib.map_infer(vals, com.is_float) & notnull(vals)

    # Every non-float gets the same one-space prefix regardless of whether
    # a float is present, so a single conditional per element is enough.
    fmt_values = []
    for value, flagged in zip(vals, float_flags):
        if flagged:
            fmt_values.append(float_format(value))
        else:
            fmt_values.append(" %s" % _render(value))
    return fmt_values
def str_get_dummies(arr, sep='|'):
    """
    Split each string by sep and return a frame of dummy/indicator
    variables, one column per distinct non-empty piece (sorted).

    Examples
    --------
    >>> Series(['a|b', 'a', 'a|c']).str.get_dummies()
       a  b  c
    0  1  1  0
    1  1  0  0
    2  1  0  1

    >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies()
       a  b  c
    0  1  1  0
    1  0  0  0
    2  1  0  1

    See also ``pd.get_dummies``.
    """
    from pandas.core.frame import DataFrame

    # TODO remove this hack?
    filled = arr.fillna('')
    try:
        wrapped = sep + filled + sep
    except TypeError:
        wrapped = sep + filled.astype(str) + sep

    # Gather the distinct pieces; the separators added at both ends always
    # contribute an empty piece, which is not a tag.
    pieces_seen = set()
    for pieces in wrapped.str.split(sep):
        pieces_seen.update(pieces)
    pieces_seen.discard("")
    tags = sorted(pieces_seen)

    dummies = np.empty((len(wrapped), len(tags)), dtype=np.int64)
    raw = wrapped.values
    for col, tag in enumerate(tags):
        needle = sep + tag + sep
        dummies[:, col] = lib.map_infer(raw, lambda s: needle in s)
    return DataFrame(dummies, wrapped.index, tags)
def _format_strings(self):
    """
    Format ``self.values`` element by element and return the results as a
    list.

    Non-null floats are rendered by the float formatter
    (``self.float_format``, else the ``display.float_format`` option, else
    a ``%g`` format built from the ``display.precision`` option).  All
    other values are rendered by ``self.formatter`` (default:
    ``com.pprint_thing`` escaping tab, carriage return and newline) and
    prefixed with one space; nulls become ``self.na_rep`` when it is set,
    except ``None`` which becomes ``'None'``.
    """
    float_format = self.float_format
    if float_format is None:
        float_format = get_option("display.float_format")
        if float_format is None:
            fmt_str = '%% .%dg' % get_option("display.precision")

            def _default_float(x):
                return fmt_str % x

            float_format = _default_float

    formatter = self.formatter
    if formatter is None:
        def formatter(x):
            return com.pprint_thing(x, escape_chars=('\t', '\r', '\n'))

    def _render(value):
        is_missing = self.na_rep is not None and lib.checknull(value)
        if not is_missing:
            # object dtype
            return '%s' % formatter(value)
        return 'None' if value is None else self.na_rep

    vals = self.values
    float_flags = lib.map_infer(vals, com.is_float) & notnull(vals)

    # Non-floats receive the same one-space prefix whether or not any float
    # is present, so only the per-element float flag matters here.
    return [float_format(v) if flagged else ' %s' % _render(v)
            for v, flagged in zip(vals, float_flags)]
def _format_strings(self):
    """
    Format ``self.values`` element by element and return the results as a
    list.

    Non-null floats are rendered by the float formatter
    (``self.float_format``, else the ``print.float_format`` option, else a
    ``%g`` format built from the ``print.precision`` option).  All other
    values are rendered by ``self.formatter`` (default:
    ``com.pprint_thing`` escaping tab, carriage return and newline) and
    prefixed with one space; nulls become ``self.na_rep`` when it is set,
    except ``None`` which becomes ``'None'``.
    """
    if self.float_format is not None:
        float_format = self.float_format
    else:
        float_format = get_option("print.float_format")
        if float_format is None:
            fmt_str = '%% .%dg' % get_option("print.precision")

            def _percent_g(x):
                return fmt_str % x

            float_format = _percent_g

    if self.formatter is not None:
        formatter = self.formatter
    else:
        def formatter(x):
            return com.pprint_thing(x, escape_chars=('\t', '\r', '\n'))

    def _render(value):
        if self.na_rep is None or not lib.checknull(value):
            # object dtype
            return '%s' % formatter(value)
        if value is None:
            return 'None'
        return self.na_rep

    vals = self.values
    float_flags = lib.map_infer(vals, com.is_float) & notnull(vals)

    # The padded rendering of a non-float does not depend on whether any
    # float is present, so one conditional per element suffices.
    fmt_values = []
    for value, flagged in zip(vals, float_flags):
        if flagged:
            fmt_values.append(float_format(value))
        else:
            fmt_values.append(' %s' % _render(value))
    return fmt_values
def _convert_to_indexer(self, obj, axis=0):
    """
    Convert indexing key into something we can use to do actual fancy
    indexing on an ndarray

    Examples
    ix[:5] -> slice(0, 5)
    ix[[1,2,3]] -> [1,2,3]
    ix[['foo', 'bar', 'baz']] -> [i, j, k] (indices of foo, bar, baz)

    Going by Zen of Python?
    "In the face of ambiguity, refuse the temptation to guess."
    raise AmbiguousIndexError with integer labels?
    - No, prefer label-based indexing

    Parameters
    ----------
    obj : scalar, slice or list-like
        The indexing key.
    axis : int, default 0
        Axis of ``self.obj`` whose labels the key is resolved against.

    Returns
    -------
    The key itself, a location from ``labels.get_loc``, a slice, or an
    array of positions / a boolean indexer, depending on the key type.

    Raises
    ------
    KeyError
        If a list-like key contains entries not found in the labels.
    """
    labels = self.obj._get_axis(axis)
    is_int_index = _is_integer_index(labels)

    # An integer key is passed through untouched unless
    # ``_is_integer_index`` reports the labels as integer.
    if com.is_integer(obj) and not is_int_index:
        return obj

    # Try a direct label lookup first; fall through on failure so that
    # slices and list-likes get their dedicated handling below.
    try:
        return labels.get_loc(obj)
    except (KeyError, TypeError):
        pass

    if isinstance(obj, slice):
        ltype = labels.inferred_type

        if ltype == 'floating':
            int_slice = _is_int_slice(obj)
        else:
            # floats that are within tolerance of int used
            int_slice = _is_index_slice(obj)

        null_slice = obj.start is None and obj.stop is None
        # could have integers in the first level of the MultiIndex
        position_slice = (int_slice and not ltype == 'integer' and not isinstance(labels, MultiIndex))

        start, stop = obj.start, obj.stop

        # last ditch effort: if we are mixed and have integers
        try:
            if 'mixed' in ltype and int_slice:
                # The lookups are done only for their KeyError; i and j are
                # not used unless reassigned by slice_locs below.
                if start is not None:
                    i = labels.get_loc(start)
                if stop is not None:
                    j = labels.get_loc(stop)
                # Both bounds exist as labels: slice by label, not position.
                position_slice = False
        except KeyError:
            if ltype == 'mixed-integer-float':
                raise

        if null_slice or position_slice:
            # Use the slice as given.
            slicer = obj
        else:
            try:
                i, j = labels.slice_locs(start, stop)
                slicer = slice(i, j, obj.step)
            except Exception:
                # Label-based slicing failed: fall back to the raw slice
                # only when _is_index_slice accepts it and the labels are
                # not inferred as integer.
                if _is_index_slice(obj):
                    if labels.inferred_type == 'integer':
                        raise
                    slicer = obj
                else:
                    raise

        return slicer

    elif _is_list_like(obj):
        if com._is_bool_indexer(obj):
            objarr = _check_bool_indexer(labels, obj)
            return objarr
        else:
            if isinstance(obj, Index):
                objarr = obj.values
            else:
                objarr = _asarray_tuplesafe(obj)

            # If have integer labels, defer to label-based indexing
            if _is_integer_dtype(objarr) and not is_int_index:
                if labels.inferred_type != 'integer':
                    # Negative entries are shifted by len(labels), i.e.
                    # counted from the end.
                    objarr = np.where(objarr < 0, len(labels) + objarr, objarr)
                return objarr

            # this is not the most robust, but...
            if (isinstance(labels, MultiIndex) and not isinstance(objarr[0], tuple)):
                # Non-tuple keys against a MultiIndex are matched on level 0.
                level = 0
                _, indexer = labels.reindex(objarr, level=level)

                # ``check`` is what gets tested for missing (-1) entries.
                check = labels.levels[0].get_indexer(objarr)
            else:
                level = None
                # XXX
                if labels.is_unique:
                    indexer = check = labels.get_indexer(objarr)
                else:
                    # Non-unique labels: OR together an equality mask per
                    # key and return the positions of all matches.
                    mask = np.zeros(len(labels), dtype=bool)
                    lvalues = labels.values
                    for x in objarr:
                        # ugh
                        to_or = lib.map_infer(lvalues, x.__eq__)
                        if not to_or.any():
                            raise KeyError('%s not in index' % str(x))
                        mask |= to_or

                    indexer = check = mask.nonzero()[0]

            # ``mask`` is rebound here: it now flags keys that were not found.
            mask = check == -1
            if mask.any():
                raise KeyError('%s not in index' % objarr[mask])

            return indexer

    else:
        # Neither slice nor list-like: repeat the label lookup, this time
        # letting any error propagate.
        return labels.get_loc(obj)
def _box_values(self, values):
    """
    Box each element of ``values`` as ``Period(ordinal=x, freq=self.freq)``
    and return the array produced by ``lib.map_infer``.
    """
    def _to_period(ordinal):
        return Period(ordinal=ordinal, freq=self.freq)

    return lib.map_infer(values, _to_period)
def _get_ordinals(data, freq): f = lambda x: Period(x, freq=freq).ordinal if isinstance(data[0], Period): return period.extract_ordinals(data, freq) else: return lib.map_infer(data, f)
def get_chunk(self, rows=None):
    """
    Read the next chunk of lines and assemble it into a DataFrame.

    Parameters
    ----------
    rows : int, optional
        Forwarded to ``self._get_lines``.

    Returns
    -------
    DataFrame, or the single column ``df[df.columns[0]]`` when
    ``self.squeeze`` is set and the frame has exactly one column.

    Raises
    ------
    ValueError
        If ``rows`` is given while ``self.skip_footer`` is set, or if the
        number of parsed columns differs from ``len(self.columns)``.
    StopIteration
        Re-raised from ``self._get_lines`` on any call after the first.
    """
    if rows is not None and self.skip_footer:
        raise ValueError('skip_footer not supported for iteration')

    try:
        content = self._get_lines(rows)
    except StopIteration:
        # An exhausted source on the very first read yields an empty frame
        # instead of ending iteration.
        if self._first_chunk:
            content = []
        else:
            raise

    # done with first read, next time raise StopIteration
    self._first_chunk = False

    if len(content) == 0: # pragma: no cover
        # Build an empty index matching the configured index column(s).
        if self.index_col is not None:
            if np.isscalar(self.index_col):
                index = Index([], name=self.index_name)
            else:
                index = MultiIndex.from_arrays([[]] * len(self.index_col), names=self.index_name)
        else:
            index = Index([])

        return DataFrame(index=index, columns=self.columns)

    # Transpose the row-wise content into a list of per-column arrays.
    zipped_content = list(lib.to_object_array(content).T)

    if not self._has_complex_date_col and self.index_col is not None:
        index = self._get_simple_index(zipped_content)
        index = self._agg_index(index)
    else:
        # Default positional index; replaced further down when a complex
        # date column supplies the index.
        index = Index(np.arange(len(content)))

    col_len, zip_len = len(self.columns), len(zipped_content)
    if col_len != zip_len:
        row_num = -1  # placeholder; always overwritten below
        # Find the first row whose length differs from the expected count;
        # ``i`` keeps that position (or the last row if none differs).
        for (i, l) in enumerate(content):
            if len(l) != col_len:
                break

        footers = 0
        if self.skip_footer:
            footers = self.skip_footer
        # Row number reported in the message, derived from self.pos.
        row_num = self.pos - (len(content) - i + footers)

        msg = ('Expecting %d columns, got %d in row %d' % (col_len, zip_len, row_num))
        raise ValueError(msg)

    data = dict((k, v) for k, v in izip(self.columns, zipped_content))

    # apply converters
    for col, f in self.converters.iteritems():
        # An integer key that is not itself a column name is treated as a
        # position into self.columns.
        if isinstance(col, int) and col not in self.columns:
            col = self.columns[col]
        data[col] = lib.map_infer(data[col], f)

    columns = list(self.columns)
    if self.parse_dates is not None:
        data, columns = self._process_date_conversion(data)

    data = _convert_to_ndarrays(data, self.na_values, self.verbose)

    df = DataFrame(data=data, columns=columns, index=index)
    if self._has_complex_date_col and self.index_col is not None:
        # The index depends on date-converted columns, so it is derived
        # from the converted frame and the frame is rebuilt around it.
        if not self._name_processed:
            self.index_name = self._get_index_name(list(columns))
            self._name_processed = True
        data = dict(((k, v) for k, v in df.iteritems()))
        index = self._get_complex_date_index(data, col_names=columns, parse_dates=False)
        index = self._agg_index(index, False)
        data = dict(((k, v.values) for k, v in data.iteritems()))
        df = DataFrame(data=data, columns=columns, index=index)

    if self.squeeze and len(df.columns) == 1:
        return df[df.columns[0]]
    return df
def _box_values(self, values): """ apply box func to passed values """ return lib.map_infer(values, self._box_func)
def get_chunk(self, rows=None):
    """
    Read the next chunk of lines and assemble it into a DataFrame.

    Parameters
    ----------
    rows : int, optional
        Forwarded to ``self._get_lines``.

    Returns
    -------
    DataFrame, or the single column ``df[df.columns[0]]`` when
    ``self.squeeze`` is set and the frame has exactly one column.

    Raises
    ------
    ValueError
        If ``rows`` is given while ``self.skip_footer`` is set.
    StopIteration
        Re-raised from ``self._get_lines`` on any call after the first.
    """
    if rows is not None and self.skip_footer:
        raise ValueError("skip_footer not supported for iteration")

    try:
        content = self._get_lines(rows)
    except StopIteration:
        # An exhausted source on the very first read yields an empty frame
        # instead of ending iteration.
        if self._first_chunk:
            content = []
        else:
            raise

    # done with first read, next time raise StopIteration
    self._first_chunk = False

    columns = list(self.orig_columns)
    if len(content) == 0:  # pragma: no cover
        if self.index_col is not None:
            if np.isscalar(self.index_col):
                index = Index([], name=self.index_name)
                columns.pop(self.index_col)
            else:
                index = MultiIndex.from_arrays([[]] * len(self.index_col),
                                               names=self.index_name)
                # Pop from the highest position down: removing a lower
                # position first shifts the remaining ones, so ascending
                # order would drop the wrong columns.
                for n in sorted(self.index_col, reverse=True):
                    columns.pop(n)
        else:
            index = Index([])

        return DataFrame(index=index, columns=columns)

    alldata = self._rows_to_cols(content)
    data = self._exclude_implicit_index(alldata)

    # apply converters
    for col, f in self.converters.iteritems():
        # An integer key that is not itself a column name is treated as a
        # position into self.orig_columns.
        if isinstance(col, int) and col not in self.orig_columns:
            col = self.orig_columns[col]
        data[col] = lib.map_infer(data[col], f)

    data = _convert_to_ndarrays(data, self.na_values, self.verbose)

    if self.parse_dates is not None:
        data, columns = self._process_date_conversion(data)

    if self.index_col is None:
        numrows = len(content)
        index = Index(np.arange(numrows))
    elif not self._has_complex_date_col:
        index = self._get_simple_index(alldata, columns)
        index = self._agg_index(index)
    else:
        # Complex date column: index names are resolved once, then the
        # index is built from the date-converted data.
        if not self._name_processed:
            self.index_name = self._explicit_index_names(list(columns))
            self._name_processed = True
        index = self._get_complex_date_index(data, columns)
        index = self._agg_index(index, False)

    df = DataFrame(data=data, columns=columns, index=index)

    if self.squeeze and len(df.columns) == 1:
        return df[df.columns[0]]
    return df
def _map(f, arr): if not isinstance(arr, np.ndarray): arr = np.asarray(arr, dtype=object) return lib.map_infer(arr, f)
def _box_values(self, values):
    """
    Apply ``self._box_func`` to every element of ``values`` via
    ``lib.map_infer`` and return the resulting array.

    ``pandas.lib`` is imported inside the function rather than at module
    level.
    """
    import pandas.lib as lib

    boxer = self._box_func
    return lib.map_infer(values, boxer)
def _have_unicode(self):
    """
    Return whether any element of ``self.values`` is a ``unicode`` instance.
    """
    def _is_unicode(item):
        return isinstance(item, unicode)

    return lib.map_infer(self.values, _is_unicode).any()
def _box_values(self, values):
    """
    Box every element of ``values`` with ``lib.Timestamp`` via
    ``lib.map_infer`` and return the resulting array.
    """
    boxed = lib.map_infer(values, lib.Timestamp)
    return boxed
def _get_object_index(self):
    """
    Return an object-dtype ``Index`` whose elements are the values of
    ``self.asi8`` boxed as ``Timestamp(x, offset=self.offset, tz=self.tz)``.
    """
    def _box(stamp):
        return Timestamp(stamp, offset=self.offset, tz=self.tz)

    return Index(lib.map_infer(self.asi8, _box), dtype=object)