def _construct_return_type(self, result, axes=None, **kwargs): """ return the type for the ndim of the result """ ndim = getattr(result, "ndim", None) # need to assume they are the same if ndim is None: if isinstance(result, dict): ndim = getattr(list(compat.itervalues(result))[0], "ndim", None) # a saclar result if ndim is None: ndim = 0 # have a dict, so top-level is +1 dim else: ndim += 1 # scalar if ndim == 0: return Series(result) # same as self elif self.ndim == ndim: """ return the construction dictionary for these axes """ if axes is None: return self._constructor(result) return self._constructor(result, **self._construct_axes_dict()) # sliced elif self.ndim == ndim + 1: if axes is None: return self._constructor_sliced(result) return self._constructor_sliced(result, **self._extract_axes_for_slice(self, axes)) raise PandasError("invalid _construct_return_type [self->%s] " "[result->%s]" % (self, result))
def test_default_encoding(self): for frame in compat.itervalues(self.frame): result = frame.to_msgpack() expected = frame.to_msgpack(encoding='utf8') self.assertEqual(result, expected) result = self.encode_decode(frame) assert_frame_equal(result, frame)
def _f(*args, **kwargs): obj_iter = itertools.chain(args, compat.itervalues(kwargs)) if any(self.check(obj) for obj in obj_iter): raise TypeError('reduction operation {0!r} not allowed for ' 'this dtype'.format(f.__name__.replace('nan', ''))) return f(*args, **kwargs)
def _get_series_result_type(result): """ return appropriate class of Series concat input is either dict or array-like """ if isinstance(result, dict): # concat Series with axis 1 if all(is_sparse(c) for c in compat.itervalues(result)): from pandas.sparse.api import SparseDataFrame return SparseDataFrame else: from pandas.core.frame import DataFrame return DataFrame elif is_sparse(result): # concat Series with axis 1 from pandas.sparse.api import SparseSeries return SparseSeries else: from pandas.core.series import Series return Series
def params(self): if isinstance(self.symbols, compat.string_types): sym_list = self.symbols else: sym_list = '+'.join(self.symbols) # for codes see: http://www.gummy-stuff.org/Yahoo-data.htm request = ''.join(compat.itervalues(_yahoo_codes)) # code request string params = {'s': sym_list, 'f': request} return params
def __set__(self, obj, value): value = _ensure_index(value) if isinstance(value, MultiIndex): raise NotImplementedError for v in compat.itervalues(obj._frames): setattr(v, self.frame_attr, value) setattr(obj, self.cache_field, value)
def _f(*args, **kwargs): obj_iter = itertools.chain(args, compat.itervalues(kwargs)) if any(self.check(obj) for obj in obj_iter): raise TypeError( "reduction operation {0!r} not allowed for " "this dtype".format(f.__name__.replace("nan", "")) ) try: return f(*args, **kwargs) except ValueError as e: # we want to transform an object array # ValueError message to the more typical TypeError # e.g. this is normally a disallowed function on # object arrays that contain strings if is_object_dtype(args[0]): raise TypeError(e) raise
def _f(*args, **kwargs): obj_iter = itertools.chain(args, compat.itervalues(kwargs)) if any(self.check(obj) for obj in obj_iter): msg = 'reduction operation {name!r} not allowed for this dtype' raise TypeError(msg.format(name=f.__name__.replace('nan', ''))) try: with np.errstate(invalid='ignore'): return f(*args, **kwargs) except ValueError as e: # we want to transform an object array # ValueError message to the more typical TypeError # e.g. this is normally a disallowed function on # object arrays that contain strings if is_object_dtype(args[0]): raise TypeError(e) raise
def _get_data(symbols): """ Get current yahoo quote Returns a DataFrame """ if isinstance(symbols, compat.string_types): sym_list = symbols else: sym_list = '+'.join(symbols) # for codes see: http://www.gummy-stuff.org/Yahoo-data.htm request = ''.join(compat.itervalues(_yahoo_codes)) # code request string header = list(_yahoo_codes.keys()) data = defaultdict(list) params = { 's': sym_list, 'f': request } url = _encode_url(_URL, params) with urlopen(url) as response: lines = response.readlines() def line_gen(lines): for line in lines: yield line.decode('utf-8').strip() for line in csv.reader(line_gen(lines)): for i, field in enumerate(line): if field[-2:] == '%"': v = float(field.strip('"%')) elif field[0] == '"': v = field.strip('"') else: try: v = float(field) except ValueError: v = field data[header[i]].append(v) idx = data.pop('symbol') return DataFrame(data, index=idx)
def _get_series_result_type(result, objs=None): """ return appropriate class of Series concat input is either dict or array-like """ # concat Series with axis 1 if isinstance(result, dict): # concat Series with axis 1 if all(is_sparse(c) for c in compat.itervalues(result)): from pandas.core.sparse.api import SparseDataFrame return SparseDataFrame else: from pandas.core.frame import DataFrame return DataFrame # otherwise it is a SingleBlockManager (axis = 0) if result._block.is_sparse: from pandas.core.sparse.api import SparseSeries return SparseSeries else: return objs[0]._constructor
def get_quote_yahoo(symbols): """ Get current yahoo quote Returns a DataFrame """ if isinstance(symbols, compat.string_types): sym_list = symbols else: sym_list = '+'.join(symbols) # for codes see: http://www.gummy-stuff.org/Yahoo-data.htm request = ''.join(compat.itervalues(_yahoo_codes)) # code request string header = list(_yahoo_codes.keys()) data = defaultdict(list) url_str = 'http://finance.yahoo.com/d/quotes.csv?s=%s&f=%s' % (sym_list, request) with urlopen(url_str) as url: lines = url.readlines() for line in lines: fields = line.decode('utf-8').strip().split(',') for i, field in enumerate(fields): if field[-2:] == '%"': v = float(field.strip('"%')) elif field[0] == '"': v = field.strip('"') else: try: v = float(field) except ValueError: v = np.nan data[header[i]].append(v) idx = data.pop('symbol') return DataFrame(data, index=idx)
def _get_one(self, symbol, *args, **kwargs): """ Get current Yahoo Quote for a symbol Returns a DataFrame """ # ['symbol', 'last', 'change_pct', 'PE', 'time', 'short_ratio'] # for codes see: http://www.gummy-stuff.org/Yahoo-data.htm request = ''.join(compat.itervalues(self._yahoo_codes)) # code request string header = list(self._yahoo_codes.keys()) data = defaultdict(list) url = self._url('/d/quotes.csv') params = { 's': symbol, 'f': request } response = self.session.get(url, params=params, stream=True) for line in response.iter_lines(): fields = line.decode('utf-8').strip().split(',') for i, field in enumerate(fields): if field[-2:] == '%"': v = float(field.strip('"%')) elif field[0] == '"': v = field.strip('"') else: try: v = float(field) except ValueError: v = field data[header[i]].append(v) idx = data.pop('symbol') return pd.DataFrame(data, index=idx)
def is_any_frame(): # return a boolean if we have *any* nested series return any(isinstance(r, ABCDataFrame) for r in compat.itervalues(result))
def _aggregate(self, arg, *args, **kwargs): """ provide an implementation for the aggregators Parameters ---------- arg : string, dict, function *args : args to pass on to the function **kwargs : kwargs to pass on to the function Returns ------- tuple of result, how Notes ----- how can be a string describe the required post-processing, or None if not required """ is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) is_nested_renamer = False _axis = kwargs.pop('_axis', None) if _axis is None: _axis = getattr(self, 'axis', 0) _level = kwargs.pop('_level', None) if isinstance(arg, compat.string_types): return self._try_aggregate_string_function(arg, *args, **kwargs), None if isinstance(arg, dict): # aggregate based on the passed dict if _axis != 0: # pragma: no cover raise ValueError('Can only pass dict with axis=0') obj = self._selected_obj def nested_renaming_depr(level=4): # deprecation of nested renaming # GH 15931 warnings.warn( ("using a dict with renaming " "is deprecated and will be removed in a future " "version"), FutureWarning, stacklevel=level) # if we have a dict of any non-scalars # eg. {'A' : ['mean']}, normalize all to # be list-likes if any(is_aggregator(x) for x in compat.itervalues(arg)): new_arg = compat.OrderedDict() for k, v in compat.iteritems(arg): if not isinstance(v, (tuple, list, dict)): new_arg[k] = [v] else: new_arg[k] = v # the keys must be in the columns # for ndim=2, or renamers for ndim=1 # ok for now, but deprecated # {'A': { 'ra': 'mean' }} # {'A': { 'ra': ['mean'] }} # {'ra': ['mean']} # not ok # {'ra' : { 'A' : 'mean' }} if isinstance(v, dict): is_nested_renamer = True if k not in obj.columns: msg = ('cannot perform renaming for {key} with a ' 'nested dictionary').format(key=k) raise SpecificationError(msg) nested_renaming_depr(4 + (_level or 0)) elif isinstance(obj, ABCSeries): nested_renaming_depr() elif isinstance(obj, ABCDataFrame) and \ k not in obj.columns: raise KeyError( "Column '{col}' does not exist!".format(col=k)) arg = new_arg else: # deprecation of renaming keys # GH 15931 keys = list(compat.iterkeys(arg)) if (isinstance(obj, ABCDataFrame) and len(obj.columns.intersection(keys)) != len(keys)): nested_renaming_depr() from pandas.core.reshape.concat import concat def _agg_1dim(name, how, subset=None): """ aggregate a 1-dim with how """ colg = self._gotitem(name, ndim=1, subset=subset) if colg.ndim != 1: raise SpecificationError("nested dictionary is ambiguous " "in aggregation") return colg.aggregate(how, _level=(_level or 0) + 1) def _agg_2dim(name, how): """ aggregate a 2-dim with how """ colg = self._gotitem(self._selection, ndim=2, subset=obj) return colg.aggregate(how, _level=None) def _agg(arg, func): """ run the aggregations over the arg with func return an OrderedDict """ result = compat.OrderedDict() for fname, agg_how in compat.iteritems(arg): result[fname] = func(fname, agg_how) return result # set the final keys keys = list(compat.iterkeys(arg)) result = compat.OrderedDict() # nested renamer if is_nested_renamer: result = list(_agg(arg, _agg_1dim).values()) if all(isinstance(r, dict) for r in result): result, results = compat.OrderedDict(), result for r in results: result.update(r) keys = list(compat.iterkeys(result)) else: if self._selection is not None: keys = None # some selection on the object elif self._selection is not None: sl = set(self._selection_list) # we are a Series like object, # but may have multiple aggregations if len(sl) == 1: result = _agg(arg, lambda fname, agg_how: _agg_1dim(self._selection, agg_how)) # we are selecting the same set as we are aggregating elif not len(sl - set(keys)): result = _agg(arg, _agg_1dim) # we are a DataFrame, with possibly multiple aggregations else: result = _agg(arg, _agg_2dim) # no selection else: try: result = _agg(arg, _agg_1dim) except SpecificationError: # we are aggregating expecting all 1d-returns # but we have 2d result = _agg(arg, _agg_2dim) # combine results def is_any_series(): # return a boolean if we have *any* nested series return any(isinstance(r, ABCSeries) for r in compat.itervalues(result)) def is_any_frame(): # return a boolean if we have *any* nested series return any(isinstance(r, ABCDataFrame) for r in compat.itervalues(result)) if isinstance(result, list): return concat(result, keys=keys, axis=1), True elif is_any_frame(): # we have a dict of DataFrames # return a MI DataFrame return concat([result[k] for k in keys], keys=keys, axis=1), True elif isinstance(self, ABCSeries) and is_any_series(): # we have a dict of Series # return a MI Series try: result = concat(result) except TypeError: # we want to give a nice error here if # we have non-same sized objects, so # we don't automatically broadcast raise ValueError("cannot perform both aggregation " "and transformation operations " "simultaneously") return result, True # fall thru from pandas import DataFrame, Series try: result = DataFrame(result) except ValueError: # we have a dict of scalars result = Series(result, name=getattr(self, 'name', None)) return result, True elif is_list_like(arg) and arg not in compat.string_types: # we require a list, but not an 'str' return self._aggregate_multiple_funcs(arg, _level=_level, _axis=_axis), None else: result = None f = self._is_cython_func(arg) if f and not args and not kwargs: return getattr(self, f)(), None # caller can react return result, True
def test_utf(self): # GH10581 for encoding in self.utf_encodings: for frame in compat.itervalues(self.frame): result = self.encode_decode(frame, encoding=encoding) assert_frame_equal(result, frame)
def json_normalize(data, record_path=None, meta=None, meta_prefix=None, record_prefix=None, errors='raise', sep='.'): """ "Normalize" semi-structured JSON data into a flat table Parameters ---------- data : dict or list of dicts Unserialized JSON objects record_path : string or list of strings, default None Path in each object to list of records. If not passed, data will be assumed to be an array of records meta : list of paths (string or list of strings), default None Fields to use as metadata for each record in resulting table record_prefix : string, default None If True, prefix records with dotted (?) path, e.g. foo.bar.field if path to records is ['foo', 'bar'] meta_prefix : string, default None errors : {'raise', 'ignore'}, default 'raise' * 'ignore' : will ignore KeyError if keys listed in meta are not always present * 'raise' : will raise KeyError if keys listed in meta are not always present .. versionadded:: 0.20.0 sep : string, default '.' Nested records will generate names separated by sep, e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar .. versionadded:: 0.20.0 Returns ------- frame : DataFrame Examples -------- >>> from pandas.io.json import json_normalize >>> data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}}, ... {'name': {'given': 'Mose', 'family': 'Regner'}}, ... {'id': 2, 'name': 'Faye Raker'}] >>> json_normalize(data) id name name.family name.first name.given name.last 0 1.0 NaN NaN Coleen NaN Volk 1 NaN NaN Regner NaN Mose NaN 2 2.0 Faye Raker NaN NaN NaN NaN >>> data = [{'state': 'Florida', ... 'shortname': 'FL', ... 'info': { ... 'governor': 'Rick Scott' ... }, ... 'counties': [{'name': 'Dade', 'population': 12345}, ... {'name': 'Broward', 'population': 40000}, ... {'name': 'Palm Beach', 'population': 60000}]}, ... {'state': 'Ohio', ... 'shortname': 'OH', ... 'info': { ... 'governor': 'John Kasich' ... }, ... 'counties': [{'name': 'Summit', 'population': 1234}, ... {'name': 'Cuyahoga', 'population': 1337}]}] >>> result = json_normalize(data, 'counties', ['state', 'shortname', ... ['info', 'governor']]) >>> result name population info.governor state shortname 0 Dade 12345 Rick Scott Florida FL 1 Broward 40000 Rick Scott Florida FL 2 Palm Beach 60000 Rick Scott Florida FL 3 Summit 1234 John Kasich Ohio OH 4 Cuyahoga 1337 John Kasich Ohio OH >>> data = {'A': [1, 2]} >>> json_normalize(data, 'A', record_prefix='Prefix.') Prefix.0 0 1 1 2 """ def _pull_field(js, spec): result = js if isinstance(spec, list): for field in spec: result = result[field] else: result = result[spec] return result if isinstance(data, list) and not data: return DataFrame() # A bit of a hackjob if isinstance(data, dict): data = [data] if record_path is None: if any([isinstance(x, dict) for x in compat.itervalues(y)] for y in data): # naive normalization, this is idempotent for flat records # and potentially will inflate the data considerably for # deeply nested structures: # {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:@} # # TODO: handle record value which are lists, at least error # reasonably data = nested_to_record(data, sep=sep) return DataFrame(data) elif not isinstance(record_path, list): record_path = [record_path] if meta is None: meta = [] elif not isinstance(meta, list): meta = [meta] meta = [m if isinstance(m, list) else [m] for m in meta] # Disastrously inefficient for now records = [] lengths = [] meta_vals = defaultdict(list) if not isinstance(sep, compat.string_types): sep = str(sep) meta_keys = [sep.join(val) for val in meta] def _recursive_extract(data, path, seen_meta, level=0): if len(path) > 1: for obj in data: for val, key in zip(meta, meta_keys): if level + 1 == len(val): seen_meta[key] = _pull_field(obj, val[-1]) _recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1) else: for obj in data: recs = _pull_field(obj, path[0]) # For repeating the metadata later lengths.append(len(recs)) for val, key in zip(meta, meta_keys): if level + 1 > len(val): meta_val = seen_meta[key] else: try: meta_val = _pull_field(obj, val[level:]) except KeyError as e: if errors == 'ignore': meta_val = np.nan else: raise KeyError( "Try running with " "errors='ignore' as key " "{err} is not always present".format( err=e)) meta_vals[key].append(meta_val) records.extend(recs) _recursive_extract(data, record_path, {}, level=0) result = DataFrame(records) if record_prefix is not None: result = result.rename( columns=lambda x: "{p}{c}".format(p=record_prefix, c=x)) # Data types, a problem for k, v in compat.iteritems(meta_vals): if meta_prefix is not None: k = meta_prefix + k if k in result: raise ValueError('Conflicting metadata name {name}, ' 'need distinguishing prefix '.format(name=k)) result[k] = np.array(v).repeat(lengths) return result
def test_dict_iterators(self): assert next(itervalues({1: 2})) == 2 assert next(iterkeys({1: 2})) == 1 assert next(iteritems({1: 2})) == (1, 2)
def json_normalize(data, record_path=None, meta=None, meta_prefix=None, record_prefix=None, errors='raise'): """ "Normalize" semi-structured JSON data into a flat table Parameters ---------- data : dict or list of dicts Unserialized JSON objects record_path : string or list of strings, default None Path in each object to list of records. If not passed, data will be assumed to be an array of records meta : list of paths (string or list of strings), default None Fields to use as metadata for each record in resulting table record_prefix : string, default None If True, prefix records with dotted (?) path, e.g. foo.bar.field if path to records is ['foo', 'bar'] meta_prefix : string, default None errors : {'raise', 'ignore'}, default 'raise' * ignore : will ignore KeyError if keys listed in meta are not always present * raise : will raise KeyError if keys listed in meta are not always present .. versionadded:: 0.20.0 Returns ------- frame : DataFrame Examples -------- >>> data = [{'state': 'Florida', ... 'shortname': 'FL', ... 'info': { ... 'governor': 'Rick Scott' ... }, ... 'counties': [{'name': 'Dade', 'population': 12345}, ... {'name': 'Broward', 'population': 40000}, ... {'name': 'Palm Beach', 'population': 60000}]}, ... {'state': 'Ohio', ... 'shortname': 'OH', ... 'info': { ... 'governor': 'John Kasich' ... }, ... 'counties': [{'name': 'Summit', 'population': 1234}, ... {'name': 'Cuyahoga', 'population': 1337}]}] >>> from pandas.io.json import json_normalize >>> result = json_normalize(data, 'counties', ['state', 'shortname', ... ['info', 'governor']]) >>> result name population info.governor state shortname 0 Dade 12345 Rick Scott Florida FL 1 Broward 40000 Rick Scott Florida FL 2 Palm Beach 60000 Rick Scott Florida FL 3 Summit 1234 John Kasich Ohio OH 4 Cuyahoga 1337 John Kasich Ohio OH """ def _pull_field(js, spec): result = js if isinstance(spec, list): for field in spec: result = result[field] else: result = result[spec] return result # A bit of a hackjob if isinstance(data, dict): data = [data] if record_path is None: if any([isinstance(x, dict) for x in compat.itervalues(data[0])]): # naive normalization, this is idempotent for flat records # and potentially will inflate the data considerably for # deeply nested structures: # {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:@} # # TODO: handle record value which are lists, at least error # reasonably data = nested_to_record(data) return DataFrame(data) elif not isinstance(record_path, list): record_path = [record_path] if meta is None: meta = [] elif not isinstance(meta, list): meta = [meta] for i, x in enumerate(meta): if not isinstance(x, list): meta[i] = [x] # Disastrously inefficient for now records = [] lengths = [] meta_vals = defaultdict(list) meta_keys = ['.'.join(val) for val in meta] def _recursive_extract(data, path, seen_meta, level=0): if len(path) > 1: for obj in data: for val, key in zip(meta, meta_keys): if level + 1 == len(val): seen_meta[key] = _pull_field(obj, val[-1]) _recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1) else: for obj in data: recs = _pull_field(obj, path[0]) # For repeating the metadata later lengths.append(len(recs)) for val, key in zip(meta, meta_keys): if level + 1 > len(val): meta_val = seen_meta[key] else: try: meta_val = _pull_field(obj, val[level:]) except KeyError as e: if errors == 'ignore': meta_val = np.nan else: raise \ KeyError("Try running with " "errors='ignore' as key " "%s is not always present", e) meta_vals[key].append(meta_val) records.extend(recs) _recursive_extract(data, record_path, {}, level=0) result = DataFrame(records) if record_prefix is not None: result.rename(columns=lambda x: record_prefix + x, inplace=True) # Data types, a problem for k, v in compat.iteritems(meta_vals): if meta_prefix is not None: k = meta_prefix + k if k in result: raise ValueError('Conflicting metadata name %s, ' 'need distinguishing prefix ' % k) result[k] = np.array(v).repeat(lengths) return result
def shuffle_uri(df, grouped): perm = np.r_[tuple([np.random.permutation( idxs) for idxs in compat.itervalues(grouped.groups)])] df['state_permuted'] = np.asarray(df.ix[perm]['value'])
def is_any_series(): # return a boolean if we have *any* nested series return any( isinstance(r, ABCSeries) for r in compat.itervalues(result))
def _aggregate(self, arg, *args, **kwargs): """ provide an implementation for the aggregators Parameters ---------- arg : string, dict, function *args : args to pass on to the function **kwargs : kwargs to pass on to the function Returns ------- tuple of result, how Notes ----- how can be a string describe the required post-processing, or None if not required """ is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) is_nested_renamer = False _axis = kwargs.pop('_axis', None) if _axis is None: _axis = getattr(self, 'axis', 0) _level = kwargs.pop('_level', None) if isinstance(arg, compat.string_types): return self._try_aggregate_string_function(arg, *args, **kwargs), None if isinstance(arg, dict): # aggregate based on the passed dict if _axis != 0: # pragma: no cover raise ValueError('Can only pass dict with axis=0') obj = self._selected_obj def nested_renaming_depr(level=4): # deprecation of nested renaming # GH 15931 warnings.warn(("using a dict with renaming " "is deprecated and will be removed in a future " "version"), FutureWarning, stacklevel=level) # if we have a dict of any non-scalars # eg. {'A' : ['mean']}, normalize all to # be list-likes if any(is_aggregator(x) for x in compat.itervalues(arg)): new_arg = compat.OrderedDict() for k, v in compat.iteritems(arg): if not isinstance(v, (tuple, list, dict)): new_arg[k] = [v] else: new_arg[k] = v # the keys must be in the columns # for ndim=2, or renamers for ndim=1 # ok for now, but deprecated # {'A': { 'ra': 'mean' }} # {'A': { 'ra': ['mean'] }} # {'ra': ['mean']} # not ok # {'ra' : { 'A' : 'mean' }} if isinstance(v, dict): is_nested_renamer = True if k not in obj.columns: raise SpecificationError('cannot perform renaming ' 'for {0} with a nested ' 'dictionary'.format(k)) nested_renaming_depr(4 + (_level or 0)) elif isinstance(obj, ABCSeries): nested_renaming_depr() arg = new_arg else: # deprecation of renaming keys # GH 15931 keys = list(compat.iterkeys(arg)) if (isinstance(obj, ABCDataFrame) and len(obj.columns.intersection(keys)) != len(keys)): nested_renaming_depr() from pandas.tools.concat import concat def _agg_1dim(name, how, subset=None): """ aggregate a 1-dim with how """ colg = self._gotitem(name, ndim=1, subset=subset) if colg.ndim != 1: raise SpecificationError("nested dictionary is ambiguous " "in aggregation") return colg.aggregate(how, _level=(_level or 0) + 1) def _agg_2dim(name, how): """ aggregate a 2-dim with how """ colg = self._gotitem(self._selection, ndim=2, subset=obj) return colg.aggregate(how, _level=None) def _agg(arg, func): """ run the aggregations over the arg with func return an OrderedDict """ result = compat.OrderedDict() for fname, agg_how in compat.iteritems(arg): result[fname] = func(fname, agg_how) return result # set the final keys keys = list(compat.iterkeys(arg)) result = compat.OrderedDict() # nested renamer if is_nested_renamer: result = list(_agg(arg, _agg_1dim).values()) if all(isinstance(r, dict) for r in result): result, results = compat.OrderedDict(), result for r in results: result.update(r) keys = list(compat.iterkeys(result)) else: if self._selection is not None: keys = None # some selection on the object elif self._selection is not None: sl = set(self._selection_list) # we are a Series like object, # but may have multiple aggregations if len(sl) == 1: result = _agg( arg, lambda fname, agg_how: _agg_1dim( self._selection, agg_how)) # we are selecting the same set as we are aggregating elif not len(sl - set(keys)): result = _agg(arg, _agg_1dim) # we are a DataFrame, with possibly multiple aggregations else: result = _agg(arg, _agg_2dim) # no selection else: try: result = _agg(arg, _agg_1dim) except SpecificationError: # we are aggregating expecting all 1d-returns # but we have 2d result = _agg(arg, _agg_2dim) # combine results def is_any_series(): # return a boolean if we have *any* nested series return any([ isinstance(r, ABCSeries) for r in compat.itervalues(result) ]) def is_any_frame(): # return a boolean if we have *any* nested series return any([ isinstance(r, ABCDataFrame) for r in compat.itervalues(result) ]) if isinstance(result, list): return concat(result, keys=keys, axis=1), True elif is_any_frame(): # we have a dict of DataFrames # return a MI DataFrame return concat([result[k] for k in keys], keys=keys, axis=1), True elif isinstance(self, ABCSeries) and is_any_series(): # we have a dict of Series # return a MI Series try: result = concat(result) except TypeError: # we want to give a nice error here if # we have non-same sized objects, so # we don't automatically broadcast raise ValueError("cannot perform both aggregation " "and transformation operations " "simultaneously") return result, True # fall thru from pandas import DataFrame, Series try: result = DataFrame(result) except ValueError: # we have a dict of scalars result = Series(result, name=getattr(self, 'name', None)) return result, True elif is_list_like(arg) and arg not in compat.string_types: # we require a list, but not an 'str' return self._aggregate_multiple_funcs(arg, _level=_level, _axis=_axis), None else: result = None f = self._is_cython_func(arg) if f and not args and not kwargs: return getattr(self, f)(), None # caller can react return result, True
from pandas.io.data import _yahoo_codes from pandas.io.common import urlopen import pandas.compat as compat from collections import defaultdict sym_list = 'APT' _yahoo_codes.update({'MarketCap' : 'j1'}) request = ''.join(compat.itervalues(_yahoo_codes)) # code request string header = list(_yahoo_codes.keys()) print header data = defaultdict(list) url_str = 'http://finance.yahoo.com/d/quotes.csv?s=%s&f=%s' % (sym_list, request) with urlopen(url_str) as url: lines = url.readlines() print lines
def is_any_frame(): # return a boolean if we have *any* nested series return any([ isinstance(r, ABCDataFrame) for r in compat.itervalues(result) ])
def is_any_series(): # return a boolean if we have *any* nested series return any([isinstance(r, ABCSeries) for r in compat.itervalues(result)])
def _aggregate(self, arg, *args, **kwargs): """ provide an implementation for the aggregators Parameters ---------- arg : string, dict, function *args : args to pass on to the function **kwargs : kwargs to pass on to the function Returns ------- tuple of result, how Notes ----- how can be a string describe the required post-processing, or None if not required """ is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) is_nested_renamer = False _level = kwargs.pop('_level', None) if isinstance(arg, compat.string_types): return getattr(self, arg)(*args, **kwargs), None if isinstance(arg, dict): # aggregate based on the passed dict if self.axis != 0: # pragma: no cover raise ValueError('Can only pass dict with axis=0') obj = self._selected_obj # if we have a dict of any non-scalars # eg. {'A' : ['mean']}, normalize all to # be list-likes if any(is_aggregator(x) for x in compat.itervalues(arg)): new_arg = compat.OrderedDict() for k, v in compat.iteritems(arg): if not isinstance(v, (tuple, list, dict)): new_arg[k] = [v] else: new_arg[k] = v # the keys must be in the columns # for ndim=2, or renamers for ndim=1 # ok # {'A': { 'ra': 'mean' }} # {'A': { 'ra': ['mean'] }} # {'ra': ['mean']} # not ok # {'ra' : { 'A' : 'mean' }} if isinstance(v, dict): is_nested_renamer = True if k not in obj.columns: raise SpecificationError('cannot perform renaming ' 'for {0} with a nested ' 'dictionary'.format(k)) arg = new_arg from pandas.tools.merge import concat def _agg_1dim(name, how, subset=None): """ aggregate a 1-dim with how """ colg = self._gotitem(name, ndim=1, subset=subset) if colg.ndim != 1: raise SpecificationError("nested dictionary is ambiguous " "in aggregation") return colg.aggregate(how, _level=(_level or 0) + 1) def _agg_2dim(name, how): """ aggregate a 2-dim with how """ colg = self._gotitem(self._selection, ndim=2, subset=obj) return colg.aggregate(how, _level=None) def _agg(arg, func): """ run the aggregations over the arg with func return an OrderedDict """ result = compat.OrderedDict() for fname, agg_how in compat.iteritems(arg): result[fname] = func(fname, agg_how) return result # set the final keys keys = list(compat.iterkeys(arg)) result = compat.OrderedDict() # nested renamer if is_nested_renamer: result = list(_agg(arg, _agg_1dim).values()) if all(isinstance(r, dict) for r in result): result, results = compat.OrderedDict(), result for r in results: result.update(r) keys = list(compat.iterkeys(result)) else: if self._selection is not None: keys = None # some selection on the object elif self._selection is not None: sl = set(self._selection_list) # we are a Series like object, # but may have multiple aggregations if len(sl) == 1: result = _agg(arg, lambda fname, agg_how: _agg_1dim(self._selection, agg_how)) # we are selecting the same set as we are aggregating elif not len(sl - set(compat.iterkeys(arg))): result = _agg(arg, _agg_1dim) # we are a DataFrame, with possibly multiple aggregations else: result = _agg(arg, _agg_2dim) # no selection else: try: result = _agg(arg, _agg_1dim) except SpecificationError: # we are aggregating expecting all 1d-returns # but we have 2d result = _agg(arg, _agg_2dim) # combine results if isinstance(result, list): result = concat(result, keys=keys, axis=1) elif isinstance(list(compat.itervalues(result))[0], gt.ABCDataFrame): result = concat([result[k] for k in keys], keys=keys, axis=1) else: from pandas import DataFrame result = DataFrame(result) return result, True elif hasattr(arg, '__iter__'): return self._aggregate_multiple_funcs(arg, _level=_level), None else: result = None cy_func = self._is_cython_func(arg) if cy_func and not args and not kwargs: return getattr(self, cy_func)(), None # caller can react return result, True
def test_dict_iterators(self): self.assertEqual(next(itervalues({1: 2})), 2) self.assertEqual(next(iterkeys({1: 2})), 1) self.assertEqual(next(iteritems({1: 2})), (1, 2))
def _aggregate(self, arg, *args, **kwargs): """ provide an implementation for the aggregators Parameters ---------- arg : string, dict, function *args : args to pass on to the function **kwargs : kwargs to pass on to the function Returns ------- tuple of result, how Notes ----- how can be a string describe the required post-processing, or None if not required """ is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) is_nested_renamer = False _level = kwargs.pop('_level', None) if isinstance(arg, compat.string_types): return getattr(self, arg)(*args, **kwargs), None if isinstance(arg, dict): # aggregate based on the passed dict if self.axis != 0: # pragma: no cover raise ValueError('Can only pass dict with axis=0') obj = self._selected_obj # if we have a dict of any non-scalars # eg. {'A' : ['mean']}, normalize all to # be list-likes if any(is_aggregator(x) for x in compat.itervalues(arg)): new_arg = compat.OrderedDict() for k, v in compat.iteritems(arg): if not isinstance(v, (tuple, list, dict)): new_arg[k] = [v] else: new_arg[k] = v # the keys must be in the columns # for ndim=2, or renamers for ndim=1 # ok # {'A': { 'ra': 'mean' }} # {'A': { 'ra': ['mean'] }} # {'ra': ['mean']} # not ok # {'ra' : { 'A' : 'mean' }} if isinstance(v, dict): is_nested_renamer = True if k not in obj.columns: raise SpecificationError('cannot perform renaming ' 'for {0} with a nested ' 'dictionary'.format(k)) arg = new_arg from pandas.tools.merge import concat def _agg_1dim(name, how, subset=None): """ aggregate a 1-dim with how """ colg = self._gotitem(name, ndim=1, subset=subset) if colg.ndim != 1: raise SpecificationError("nested dictionary is ambiguous " "in aggregation") return colg.aggregate(how, _level=(_level or 0) + 1) def _agg_2dim(name, how): """ aggregate a 2-dim with how """ colg = self._gotitem(self._selection, ndim=2, subset=obj) return colg.aggregate(how, _level=None) def _agg(arg, func): """ run the aggregations over the arg with func return an OrderedDict """ result = compat.OrderedDict() for fname, agg_how in compat.iteritems(arg): result[fname] = func(fname, agg_how) return result # set the final keys keys = list(compat.iterkeys(arg)) result = compat.OrderedDict() # nested renamer if is_nested_renamer: result = list(_agg(arg, _agg_1dim).values()) if all(isinstance(r, dict) for r in result): result, results = compat.OrderedDict(), result for r in results: result.update(r) keys = list(compat.iterkeys(result)) else: if self._selection is not None: keys = None # some selection on the object elif self._selection is not None: sl = set(self._selection_list) # we are a Series like object, # but may have multiple aggregations if len(sl) == 1: result = _agg(arg, lambda fname, agg_how: _agg_1dim(self._selection, agg_how)) # we are selecting the same set as we are aggregating elif not len(sl - set(compat.iterkeys(arg))): result = _agg(arg, _agg_1dim) # we are a DataFrame, with possibly multiple aggregations else: result = _agg(arg, _agg_2dim) # no selection else: try: result = _agg(arg, _agg_1dim) except SpecificationError: # we are aggregating expecting all 1d-returns # but we have 2d result = _agg(arg, _agg_2dim) # combine results if isinstance(result, list): result = concat(result, keys=keys, axis=1) elif isinstance(list(compat.itervalues(result))[0], com.ABCDataFrame): result = concat([result[k] for k in keys], keys=keys, axis=1) else: from pandas import DataFrame result = DataFrame(result) return result, True elif hasattr(arg, '__iter__'): return self._aggregate_multiple_funcs(arg, _level=_level), None else: result = None cy_func = self._is_cython_func(arg) if cy_func and not args and not kwargs: return getattr(self, cy_func)(), None # caller can react return result, True
def json_normalize(data, record_path=None, meta=None, meta_prefix=None, record_prefix=None, errors='raise', sep='.'): """ Normalize semi-structured JSON data into a flat table. Parameters ---------- data : dict or list of dicts Unserialized JSON objects record_path : string or list of strings, default None Path in each object to list of records. If not passed, data will be assumed to be an array of records meta : list of paths (string or list of strings), default None Fields to use as metadata for each record in resulting table meta_prefix : string, default None record_prefix : string, default None If True, prefix records with dotted (?) path, e.g. foo.bar.field if path to records is ['foo', 'bar'] errors : {'raise', 'ignore'}, default 'raise' * 'ignore' : will ignore KeyError if keys listed in meta are not always present * 'raise' : will raise KeyError if keys listed in meta are not always present .. versionadded:: 0.20.0 sep : string, default '.' Nested records will generate names separated by sep, e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar .. versionadded:: 0.20.0 Returns ------- frame : DataFrame Examples -------- >>> from pandas.io.json import json_normalize >>> data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}}, ... {'name': {'given': 'Mose', 'family': 'Regner'}}, ... {'id': 2, 'name': 'Faye Raker'}] >>> json_normalize(data) id name name.family name.first name.given name.last 0 1.0 NaN NaN Coleen NaN Volk 1 NaN NaN Regner NaN Mose NaN 2 2.0 Faye Raker NaN NaN NaN NaN >>> data = [{'state': 'Florida', ... 'shortname': 'FL', ... 'info': { ... 'governor': 'Rick Scott' ... }, ... 'counties': [{'name': 'Dade', 'population': 12345}, ... {'name': 'Broward', 'population': 40000}, ... {'name': 'Palm Beach', 'population': 60000}]}, ... {'state': 'Ohio', ... 'shortname': 'OH', ... 'info': { ... 'governor': 'John Kasich' ... }, ... 'counties': [{'name': 'Summit', 'population': 1234}, ... {'name': 'Cuyahoga', 'population': 1337}]}] >>> result = json_normalize(data, 'counties', ['state', 'shortname', ... ['info', 'governor']]) >>> result name population info.governor state shortname 0 Dade 12345 Rick Scott Florida FL 1 Broward 40000 Rick Scott Florida FL 2 Palm Beach 60000 Rick Scott Florida FL 3 Summit 1234 John Kasich Ohio OH 4 Cuyahoga 1337 John Kasich Ohio OH >>> data = {'A': [1, 2]} >>> json_normalize(data, 'A', record_prefix='Prefix.') Prefix.0 0 1 1 2 """ def _pull_field(js, spec): result = js if isinstance(spec, list): for field in spec: result = result[field] else: result = result[spec] return result if isinstance(data, list) and not data: return DataFrame() # A bit of a hackjob if isinstance(data, dict): data = [data] if record_path is None: if any([isinstance(x, dict) for x in compat.itervalues(y)] for y in data): # naive normalization, this is idempotent for flat records # and potentially will inflate the data considerably for # deeply nested structures: # {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:@} # # TODO: handle record value which are lists, at least error # reasonably data = nested_to_record(data, sep=sep) return DataFrame(data) elif not isinstance(record_path, list): record_path = [record_path] if meta is None: meta = [] elif not isinstance(meta, list): meta = [meta] meta = [m if isinstance(m, list) else [m] for m in meta] # Disastrously inefficient for now records = [] lengths = [] meta_vals = defaultdict(list) if not isinstance(sep, compat.string_types): sep = str(sep) meta_keys = [sep.join(val) for val in meta] def _recursive_extract(data, path, seen_meta, level=0): if isinstance(data, dict): data = [data] if len(path) > 1: for obj in data: for val, key in zip(meta, meta_keys): if level + 1 == len(val): seen_meta[key] = _pull_field(obj, val[-1]) _recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1) else: for obj in data: recs = _pull_field(obj, path[0]) # For repeating the metadata later lengths.append(len(recs)) for val, key in zip(meta, meta_keys): if level + 1 > len(val): meta_val = seen_meta[key] else: try: meta_val = _pull_field(obj, val[level:]) except KeyError as e: if errors == 'ignore': meta_val = np.nan else: raise KeyError("Try running with " "errors='ignore' as key " "{err} is not always present" .format(err=e)) meta_vals[key].append(meta_val) records.extend(recs) _recursive_extract(data, record_path, {}, level=0) result = DataFrame(records) if record_prefix is not None: result = result.rename( columns=lambda x: "{p}{c}".format(p=record_prefix, c=x)) # Data types, a problem for k, v in compat.iteritems(meta_vals): if meta_prefix is not None: k = meta_prefix + k if k in result: raise ValueError('Conflicting metadata name {name}, ' 'need distinguishing prefix '.format(name=k)) result[k] = np.array(v).repeat(lengths) return result