Beispiel #1
0
    def _construct_return_type(self, result, axes=None, **kwargs):
        """ return the type for the ndim of the result """
        ndim = getattr(result, "ndim", None)

        # need to assume they are the same
        if ndim is None:
            if isinstance(result, dict):
                ndim = getattr(list(compat.itervalues(result))[0], "ndim", None)

                # a saclar result
                if ndim is None:
                    ndim = 0

                # have a dict, so top-level is +1 dim
                else:
                    ndim += 1

        # scalar
        if ndim == 0:
            return Series(result)

        # same as self
        elif self.ndim == ndim:
            """ return the construction dictionary for these axes """
            if axes is None:
                return self._constructor(result)
            return self._constructor(result, **self._construct_axes_dict())

        # sliced
        elif self.ndim == ndim + 1:
            if axes is None:
                return self._constructor_sliced(result)
            return self._constructor_sliced(result, **self._extract_axes_for_slice(self, axes))

        raise PandasError("invalid _construct_return_type [self->%s] " "[result->%s]" % (self, result))
Beispiel #2
0
 def test_default_encoding(self):
     # to_msgpack() without an encoding must produce the same output as an
     # explicit encoding='utf8', and each frame must survive a round trip
     for df in compat.itervalues(self.frame):
         packed_default = df.to_msgpack()
         packed_utf8 = df.to_msgpack(encoding='utf8')
         self.assertEqual(packed_default, packed_utf8)

         round_tripped = self.encode_decode(df)
         assert_frame_equal(round_tripped, df)
Beispiel #3
0
 def _f(*args, **kwargs):
     # refuse to run the wrapped function as soon as one positional or
     # keyword argument is flagged by self.check
     for candidate in itertools.chain(args, compat.itervalues(kwargs)):
         if self.check(candidate):
             op_name = f.__name__.replace('nan', '')
             raise TypeError('reduction operation {0!r} not allowed for '
                             'this dtype'.format(op_name))
     return f(*args, **kwargs)
Beispiel #4
0
def _get_series_result_type(result):
    """
    Return the class to use for the result of a Series concat.

    ``result`` is either a dict or an array-like.  A dict maps to
    SparseDataFrame when every value is sparse and to DataFrame otherwise;
    anything else maps to SparseSeries when sparse and to Series otherwise.
    """
    if not isinstance(result, dict):
        # array-like input
        if is_sparse(result):
            from pandas.sparse.api import SparseSeries

            return SparseSeries

        from pandas.core.series import Series

        return Series

    # dict input: concat Series with axis 1
    every_value_sparse = all(is_sparse(col)
                             for col in compat.itervalues(result))
    if every_value_sparse:
        from pandas.sparse.api import SparseDataFrame

        return SparseDataFrame

    from pandas.core.frame import DataFrame

    return DataFrame
 def params(self):
     """
     Build the query parameters: ``'s'`` holds the symbols (a string is
     used as-is, any other iterable is joined with ``'+'``) and ``'f'``
     holds the concatenated values of ``_yahoo_codes``.
     """
     symbols = self.symbols
     joined = symbols if isinstance(symbols, compat.string_types) \
         else '+'.join(symbols)
     # for codes see: http://www.gummy-stuff.org/Yahoo-data.htm
     code_request = ''.join(compat.itervalues(_yahoo_codes))
     return {'s': joined, 'f': code_request}
Beispiel #6
0
    def __set__(self, obj, value):
        """
        Descriptor setter.

        Converts ``value`` with ``_ensure_index``, assigns it to the
        ``self.frame_attr`` attribute of every value in ``obj._frames`` and
        finally stores it on ``obj`` under ``self.cache_field``.

        Raises
        ------
        NotImplementedError
            If the converted value is a MultiIndex.
        """
        new_index = _ensure_index(value)
        if isinstance(new_index, MultiIndex):
            raise NotImplementedError

        # update the contained frames first, then the cached copy on obj
        for frame in compat.itervalues(obj._frames):
            setattr(frame, self.frame_attr, new_index)
        setattr(obj, self.cache_field, new_index)
Beispiel #7
0
 def _f(*args, **kwargs):
     # refuse the reduction when any positional or keyword argument is
     # flagged by self.check
     for candidate in itertools.chain(args, compat.itervalues(kwargs)):
         if self.check(candidate):
             template = "reduction operation {0!r} not allowed for this dtype"
             raise TypeError(template.format(f.__name__.replace("nan", "")))

     try:
         outcome = f(*args, **kwargs)
     except ValueError as err:
         # for an object-dtype first argument, turn the ValueError into the
         # more typical TypeError (e.g. a function that is normally
         # disallowed on object arrays that contain strings); any other
         # ValueError propagates unchanged
         if not is_object_dtype(args[0]):
             raise
         raise TypeError(err)
     return outcome
Beispiel #8
0
 def _f(*args, **kwargs):
     # every positional and keyword argument has to pass self.check
     inputs = itertools.chain(args, compat.itervalues(kwargs))
     if any(self.check(item) for item in inputs):
         op_name = f.__name__.replace('nan', '')
         raise TypeError(
             'reduction operation {name!r} not allowed for this dtype'.format(
                 name=op_name))

     try:
         # floating point "invalid" warnings are silenced during the call
         with np.errstate(invalid='ignore'):
             value = f(*args, **kwargs)
     except ValueError as exc:
         # for an object-dtype first argument, report the ValueError as the
         # more typical TypeError (e.g. a function that is normally
         # disallowed on object arrays that contain strings)
         if is_object_dtype(args[0]):
             raise TypeError(exc)
         raise
     else:
         return value
Beispiel #9
0
def _get_data(symbols):
    """
    Get current yahoo quote

    Parameters
    ----------
    symbols : string or iterable of strings
        A string is used as-is; any other iterable is joined with ``'+'``.

    Returns
    -------
    DataFrame
        One column per key of ``_yahoo_codes`` except ``'symbol'``, which
        becomes the index.  Fields ending in ``%"`` and plain numeric fields
        are converted to float; other fields are kept as strings.
    """
    if isinstance(symbols, compat.string_types):
        sym_list = symbols
    else:
        sym_list = '+'.join(symbols)

    # for codes see: http://www.gummy-stuff.org/Yahoo-data.htm
    request = ''.join(compat.itervalues(_yahoo_codes))  # code request string
    header = list(_yahoo_codes.keys())

    data = defaultdict(list)

    params = {
        's': sym_list,
        'f': request
    }
    url = _encode_url(_URL, params)

    with urlopen(url) as response:
        lines = response.readlines()

    decoded_lines = (line.decode('utf-8').strip() for line in lines)

    for row in csv.reader(decoded_lines):
        for i, field in enumerate(row):
            # startswith/endswith instead of indexing: an empty field must
            # not raise IndexError, it falls through to the float() attempt
            # and is kept as the empty string
            if field.endswith('%"'):
                v = float(field.strip('"%'))
            elif field.startswith('"'):
                v = field.strip('"')
            else:
                try:
                    v = float(field)
                except ValueError:
                    v = field
            data[header[i]].append(v)

    idx = data.pop('symbol')
    return DataFrame(data, index=idx)
Beispiel #10
0
def _get_series_result_type(result, objs=None):
    """
    return appropriate class of Series concat
    input is either dict or array-like
    """
    # concat Series with axis 1
    if isinstance(result, dict):
        # concat Series with axis 1
        if all(is_sparse(c) for c in compat.itervalues(result)):
            from pandas.core.sparse.api import SparseDataFrame
            return SparseDataFrame
        else:
            from pandas.core.frame import DataFrame
            return DataFrame

    # otherwise it is a SingleBlockManager (axis = 0)
    if result._block.is_sparse:
        from pandas.core.sparse.api import SparseSeries
        return SparseSeries
    else:
        return objs[0]._constructor
Beispiel #11
0
def get_quote_yahoo(symbols):
    """
    Get current yahoo quote

    Parameters
    ----------
    symbols : string or iterable of strings
        A string is used as-is; any other iterable is joined with ``'+'``.

    Returns
    -------
    DataFrame
        One column per key of ``_yahoo_codes`` except ``'symbol'``, which
        becomes the index.  Fields ending in ``%"`` and plain numeric fields
        are converted to float, quoted fields are kept as strings, and
        anything else becomes NaN.
    """
    if isinstance(symbols, compat.string_types):
        sym_list = symbols
    else:
        sym_list = '+'.join(symbols)

    # for codes see: http://www.gummy-stuff.org/Yahoo-data.htm
    request = ''.join(compat.itervalues(_yahoo_codes))  # code request string
    header = list(_yahoo_codes.keys())

    data = defaultdict(list)

    url_str = 'http://finance.yahoo.com/d/quotes.csv?s=%s&f=%s' % (sym_list,
                                                                   request)

    with urlopen(url_str) as url:
        lines = url.readlines()

    for line in lines:
        text = line.decode('utf-8').strip()
        # a blank line carries no record; skipping it avoids a bogus row
        if not text:
            continue
        for i, field in enumerate(text.split(',')):
            # startswith/endswith instead of indexing: an empty field must
            # not raise IndexError, it falls through and becomes NaN
            if field.endswith('%"'):
                v = float(field.strip('"%'))
            elif field.startswith('"'):
                v = field.strip('"')
            else:
                try:
                    v = float(field)
                except ValueError:
                    v = np.nan
            data[header[i]].append(v)

    idx = data.pop('symbol')
    return DataFrame(data, index=idx)
    def _get_one(self, symbol, *args, **kwargs):
        """
        Get current Yahoo Quote for a symbol

        Requests ``/d/quotes.csv`` through ``self.session`` and parses every
        comma-separated line of the streamed response into one row.

        Parameters
        ----------
        symbol
            Sent as the ``s`` query parameter.
        *args, **kwargs
            Accepted but not used by this method.

        Returns
        -------
        DataFrame
            One column per key of ``self._yahoo_codes`` except ``'symbol'``,
            which becomes the index.  Fields ending in ``%"`` and plain
            numeric fields are converted to float; other fields are kept as
            strings.
        """
        # ['symbol', 'last', 'change_pct', 'PE', 'time', 'short_ratio']
        # for codes see: http://www.gummy-stuff.org/Yahoo-data.htm
        request = ''.join(compat.itervalues(self._yahoo_codes))  # code request string
        header = list(self._yahoo_codes.keys())

        data = defaultdict(list)

        url = self._url('/d/quotes.csv')
        params = {
            's': symbol,
            'f': request
        }

        response = self.session.get(url, params=params, stream=True)
        try:
            for line in response.iter_lines():
                text = line.decode('utf-8').strip()
                # a streamed response can yield empty (keep-alive) lines;
                # they carry no record
                if not text:
                    continue
                for i, field in enumerate(text.split(',')):
                    # startswith/endswith instead of indexing so an empty
                    # field cannot raise IndexError
                    if field.endswith('%"'):
                        v = float(field.strip('"%'))
                    elif field.startswith('"'):
                        v = field.strip('"')
                    else:
                        try:
                            v = float(field)
                        except ValueError:
                            v = field
                    data[header[i]].append(v)
        finally:
            # stream=True keeps the connection open until the body is fully
            # consumed or the response is closed, so release it explicitly
            # NOTE(review): assumes self.session is a requests-style session
            response.close()

        idx = data.pop('symbol')
        return pd.DataFrame(data, index=idx)
Beispiel #13
0
 def is_any_frame():
     # True as soon as one of the values in ``result`` is a DataFrame
     for value in compat.itervalues(result):
         if isinstance(value, ABCDataFrame):
             return True
     return False
Beispiel #14
0
    def _aggregate(self, arg, *args, **kwargs):
        """
        provide an implementation for the aggregators

        Parameters
        ----------
        arg : string, dict, function
            A string is dispatched to ``_try_aggregate_string_function``,
            a dict is aggregated key by key, a list-like is dispatched to
            ``_aggregate_multiple_funcs``; anything else is only checked
            against ``_is_cython_func``.
        *args : args to pass on to the function
        **kwargs : kwargs to pass on to the function
            The private keys ``_axis`` and ``_level`` are popped from kwargs
            before anything is forwarded.

        Returns
        -------
        tuple of result, how

        Notes
        -----
        In this implementation ``how`` is ``None`` for the string, list-like
        and cython-function paths, and ``True`` for the dict path and for the
        final fall-through (where ``result`` is ``None`` so the caller can
        react).

        Raises
        ------
        ValueError
            If a dict is passed with an axis other than 0, or if concatenating
            a dict of Series raises TypeError.
        SpecificationError
            If a nested dict is keyed by something that is not a column.
        KeyError
            If the selected object is a DataFrame and a key of a normalized
            dict is not one of its columns.
        """
        is_aggregator = lambda x: isinstance(x, (list, tuple, dict))
        is_nested_renamer = False

        # _axis falls back to self.axis (default 0) when not supplied
        _axis = kwargs.pop('_axis', None)
        if _axis is None:
            _axis = getattr(self, 'axis', 0)
        _level = kwargs.pop('_level', None)

        if isinstance(arg, compat.string_types):
            return self._try_aggregate_string_function(arg, *args,
                                                       **kwargs), None

        if isinstance(arg, dict):

            # aggregate based on the passed dict
            if _axis != 0:  # pragma: no cover
                raise ValueError('Can only pass dict with axis=0')

            obj = self._selected_obj

            def nested_renaming_depr(level=4):
                # deprecation of nested renaming
                # GH 15931
                warnings.warn(
                    ("using a dict with renaming "
                     "is deprecated and will be removed in a future "
                     "version"),
                    FutureWarning, stacklevel=level)

            # if we have a dict of any non-scalars
            # eg. {'A' : ['mean']}, normalize all to
            # be list-likes
            if any(is_aggregator(x) for x in compat.itervalues(arg)):
                new_arg = compat.OrderedDict()
                for k, v in compat.iteritems(arg):
                    if not isinstance(v, (tuple, list, dict)):
                        new_arg[k] = [v]
                    else:
                        new_arg[k] = v

                    # the keys must be in the columns
                    # for ndim=2, or renamers for ndim=1

                    # ok for now, but deprecated
                    # {'A': { 'ra': 'mean' }}
                    # {'A': { 'ra': ['mean'] }}
                    # {'ra': ['mean']}

                    # not ok
                    # {'ra' : { 'A' : 'mean' }}
                    if isinstance(v, dict):
                        is_nested_renamer = True

                        if k not in obj.columns:
                            msg = ('cannot perform renaming for {key} with a '
                                   'nested dictionary').format(key=k)
                            raise SpecificationError(msg)
                        # deeper recursion levels need a larger stacklevel so
                        # the warning still points at the original caller
                        nested_renaming_depr(4 + (_level or 0))

                    elif isinstance(obj, ABCSeries):
                        nested_renaming_depr()
                    elif isinstance(obj, ABCDataFrame) and \
                            k not in obj.columns:
                        raise KeyError(
                            "Column '{col}' does not exist!".format(col=k))

                arg = new_arg

            else:
                # deprecation of renaming keys
                # GH 15931
                keys = list(compat.iterkeys(arg))
                if (isinstance(obj, ABCDataFrame) and
                        len(obj.columns.intersection(keys)) != len(keys)):
                    nested_renaming_depr()

            # imported here rather than at module level
            from pandas.core.reshape.concat import concat

            def _agg_1dim(name, how, subset=None):
                """
                aggregate a 1-dim with how
                """
                colg = self._gotitem(name, ndim=1, subset=subset)
                if colg.ndim != 1:
                    raise SpecificationError("nested dictionary is ambiguous "
                                             "in aggregation")
                return colg.aggregate(how, _level=(_level or 0) + 1)

            def _agg_2dim(name, how):
                """
                aggregate a 2-dim with how
                """
                # note: ``name`` is not used; the current selection is
                # aggregated as a whole
                colg = self._gotitem(self._selection, ndim=2,
                                     subset=obj)
                return colg.aggregate(how, _level=None)

            def _agg(arg, func):
                """
                run the aggregations over the arg with func
                return an OrderedDict
                """
                result = compat.OrderedDict()
                for fname, agg_how in compat.iteritems(arg):
                    result[fname] = func(fname, agg_how)
                return result

            # set the final keys
            keys = list(compat.iterkeys(arg))
            result = compat.OrderedDict()

            # nested renamer
            if is_nested_renamer:
                result = list(_agg(arg, _agg_1dim).values())

                if all(isinstance(r, dict) for r in result):

                    # merge the per-key dicts into a single ordered dict
                    result, results = compat.OrderedDict(), result
                    for r in results:
                        result.update(r)
                    keys = list(compat.iterkeys(result))

                else:

                    if self._selection is not None:
                        keys = None

            # some selection on the object
            elif self._selection is not None:

                sl = set(self._selection_list)

                # we are a Series like object,
                # but may have multiple aggregations
                if len(sl) == 1:

                    result = _agg(arg, lambda fname,
                                  agg_how: _agg_1dim(self._selection, agg_how))

                # we are selecting the same set as we are aggregating
                elif not len(sl - set(keys)):

                    result = _agg(arg, _agg_1dim)

                # we are a DataFrame, with possibly multiple aggregations
                else:

                    result = _agg(arg, _agg_2dim)

            # no selection
            else:

                try:
                    result = _agg(arg, _agg_1dim)
                except SpecificationError:

                    # we are aggregating expecting all 1d-returns
                    # but we have 2d
                    result = _agg(arg, _agg_2dim)

            # combine results

            def is_any_series():
                # return a boolean if we have *any* nested series
                return any(isinstance(r, ABCSeries)
                           for r in compat.itervalues(result))

            def is_any_frame():
                # return a boolean if we have *any* nested DataFrame
                return any(isinstance(r, ABCDataFrame)
                           for r in compat.itervalues(result))

            # a list result comes from the nested-renamer branch above
            if isinstance(result, list):
                return concat(result, keys=keys, axis=1), True

            elif is_any_frame():
                # we have a dict of DataFrames
                # return a MI DataFrame

                return concat([result[k] for k in keys],
                              keys=keys, axis=1), True

            elif isinstance(self, ABCSeries) and is_any_series():

                # we have a dict of Series
                # return a MI Series
                try:
                    result = concat(result)
                except TypeError:
                    # we want to give a nice error here if
                    # we have non-same sized objects, so
                    # we don't automatically broadcast

                    raise ValueError("cannot perform both aggregation "
                                     "and transformation operations "
                                     "simultaneously")

                return result, True

            # fall thru
            from pandas import DataFrame, Series
            try:
                result = DataFrame(result)
            except ValueError:

                # we have a dict of scalars
                result = Series(result,
                                name=getattr(self, 'name', None))

            return result, True
        # NOTE(review): ``arg not in compat.string_types`` tests membership of
        # arg in the tuple of string *types*, not whether arg is a string;
        # strings have already returned above -- confirm this is intended
        elif is_list_like(arg) and arg not in compat.string_types:
            # we require a list, but not an 'str'
            return self._aggregate_multiple_funcs(arg,
                                                  _level=_level,
                                                  _axis=_axis), None
        else:
            result = None

        # use the matching method on self when _is_cython_func finds one and
        # no extra arguments were passed
        f = self._is_cython_func(arg)
        if f and not args and not kwargs:
            return getattr(self, f)(), None

        # caller can react
        return result, True
Beispiel #15
0
 def test_utf(self):
     # GH10581
     # each frame must round-trip unchanged under every UTF encoding
     for enc in self.utf_encodings:
         for df in compat.itervalues(self.frame):
             decoded = self.encode_decode(df, encoding=enc)
             assert_frame_equal(decoded, df)
Beispiel #16
0
def json_normalize(data,
                   record_path=None,
                   meta=None,
                   meta_prefix=None,
                   record_prefix=None,
                   errors='raise',
                   sep='.'):
    """
    "Normalize" semi-structured JSON data into a flat table

    Parameters
    ----------
    data : dict or list of dicts
        Unserialized JSON objects
    record_path : string or list of strings, default None
        Path in each object to list of records. If not passed, data will be
        assumed to be an array of records
    meta : list of paths (string or list of strings), default None
        Fields to use as metadata for each record in resulting table
    record_prefix : string, default None
        If not None, prepended to the name of every record column, e.g.
        'Prefix.' turns column 0 into 'Prefix.0' (see the last example)
    meta_prefix : string, default None
        If not None, prepended to the name of every metadata column
    errors : {'raise', 'ignore'}, default 'raise'

        * 'ignore' : will ignore KeyError if keys listed in meta are not
          always present
        * 'raise' : will raise KeyError if keys listed in meta are not
          always present

        .. versionadded:: 0.20.0

    sep : string, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar.
        Also used to join the parts of a metadata path into a column name.

        .. versionadded:: 0.20.0


    Returns
    -------
    frame : DataFrame

    Raises
    ------
    KeyError
        If a metadata key is missing from a record and errors is not 'ignore'
    ValueError
        If a metadata column name is already present in the record columns

    Examples
    --------

    >>> from pandas.io.json import json_normalize
    >>> data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}},
    ...         {'name': {'given': 'Mose', 'family': 'Regner'}},
    ...         {'id': 2, 'name': 'Faye Raker'}]
    >>> json_normalize(data)
        id        name name.family name.first name.given name.last
    0  1.0         NaN         NaN     Coleen        NaN      Volk
    1  NaN         NaN      Regner        NaN       Mose       NaN
    2  2.0  Faye Raker         NaN        NaN        NaN       NaN

    >>> data = [{'state': 'Florida',
    ...          'shortname': 'FL',
    ...          'info': {
    ...               'governor': 'Rick Scott'
    ...          },
    ...          'counties': [{'name': 'Dade', 'population': 12345},
    ...                      {'name': 'Broward', 'population': 40000},
    ...                      {'name': 'Palm Beach', 'population': 60000}]},
    ...         {'state': 'Ohio',
    ...          'shortname': 'OH',
    ...          'info': {
    ...               'governor': 'John Kasich'
    ...          },
    ...          'counties': [{'name': 'Summit', 'population': 1234},
    ...                       {'name': 'Cuyahoga', 'population': 1337}]}]
    >>> result = json_normalize(data, 'counties', ['state', 'shortname',
    ...                                           ['info', 'governor']])
    >>> result
             name  population info.governor    state shortname
    0        Dade       12345    Rick Scott  Florida        FL
    1     Broward       40000    Rick Scott  Florida        FL
    2  Palm Beach       60000    Rick Scott  Florida        FL
    3      Summit        1234   John Kasich     Ohio        OH
    4    Cuyahoga        1337   John Kasich     Ohio        OH

    >>> data = {'A': [1, 2]}
    >>> json_normalize(data, 'A', record_prefix='Prefix.')
        Prefix.0
    0          1
    1          2
    """
    def _pull_field(js, spec):
        # follow ``spec`` into ``js``: a list is a path of successive keys,
        # anything else is a single key
        result = js
        if isinstance(spec, list):
            for field in spec:
                result = result[field]
        else:
            result = result[spec]

        return result

    # an empty list has nothing to normalize
    if isinstance(data, list) and not data:
        return DataFrame()

    # A bit of a hackjob
    if isinstance(data, dict):
        data = [data]

    if record_path is None:
        # NOTE(review): the outer any() iterates over *lists* of booleans, so
        # it is True whenever some record has at least one value, whether or
        # not that value is a dict -- confirm this is intended
        if any([isinstance(x, dict) for x in compat.itervalues(y)]
               for y in data):
            # naive normalization, this is idempotent for flat records
            # and potentially will inflate the data considerably for
            # deeply nested structures:
            #  {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:@}
            #
            # TODO: handle record value which are lists, at least error
            #       reasonably
            data = nested_to_record(data, sep=sep)
        return DataFrame(data)
    elif not isinstance(record_path, list):
        record_path = [record_path]

    if meta is None:
        meta = []
    elif not isinstance(meta, list):
        meta = [meta]

    # every metadata entry becomes a path (list of keys)
    meta = [m if isinstance(m, list) else [m] for m in meta]

    # Disastrously inefficient for now
    records = []
    lengths = []

    meta_vals = defaultdict(list)
    if not isinstance(sep, compat.string_types):
        sep = str(sep)
    # column name of each metadata path: its parts joined by sep
    meta_keys = [sep.join(val) for val in meta]

    def _recursive_extract(data, path, seen_meta, level=0):
        # walk ``path`` one key per recursion level; metadata found on the
        # way down is remembered in ``seen_meta``, and at the last level the
        # records, their counts and the metadata values are appended to the
        # enclosing ``records``, ``lengths`` and ``meta_vals``
        if len(path) > 1:
            for obj in data:
                for val, key in zip(meta, meta_keys):
                    if level + 1 == len(val):
                        seen_meta[key] = _pull_field(obj, val[-1])

                _recursive_extract(obj[path[0]],
                                   path[1:],
                                   seen_meta,
                                   level=level + 1)
        else:
            for obj in data:
                recs = _pull_field(obj, path[0])

                # For repeating the metadata later
                lengths.append(len(recs))

                for val, key in zip(meta, meta_keys):
                    if level + 1 > len(val):
                        # this metadata was collected at a shallower level
                        meta_val = seen_meta[key]
                    else:
                        try:
                            meta_val = _pull_field(obj, val[level:])
                        except KeyError as e:
                            if errors == 'ignore':
                                meta_val = np.nan
                            else:
                                raise KeyError(
                                    "Try running with "
                                    "errors='ignore' as key "
                                    "{err} is not always present".format(
                                        err=e))
                    meta_vals[key].append(meta_val)

                records.extend(recs)

    _recursive_extract(data, record_path, {}, level=0)

    result = DataFrame(records)

    if record_prefix is not None:
        result = result.rename(
            columns=lambda x: "{p}{c}".format(p=record_prefix, c=x))

    # Data types, a problem
    for k, v in compat.iteritems(meta_vals):
        if meta_prefix is not None:
            k = meta_prefix + k

        if k in result:
            raise ValueError('Conflicting metadata name {name}, '
                             'need distinguishing prefix '.format(name=k))

        # repeat each metadata value once per record it belongs to
        result[k] = np.array(v).repeat(lengths)

    return result
Beispiel #17
0
 def test_dict_iterators(self):
     # each helper must yield the only entry of a one-item dict
     first_value = next(itervalues({1: 2}))
     assert first_value == 2

     first_key = next(iterkeys({1: 2}))
     assert first_key == 1

     first_item = next(iteritems({1: 2}))
     assert first_item == (1, 2)
Beispiel #18
0
def json_normalize(data, record_path=None, meta=None,
                   meta_prefix=None,
                   record_prefix=None,
                   errors='raise'):

    """
    "Normalize" semi-structured JSON data into a flat table

    Parameters
    ----------
    data : dict or list of dicts
        Unserialized JSON objects
    record_path : string or list of strings, default None
        Path in each object to list of records. If not passed, data will be
        assumed to be an array of records
    meta : list of paths (string or list of strings), default None
        Fields to use as metadata for each record in resulting table.
        The passed list is not modified.
    record_prefix : string, default None
        If not None, prepended to the name of every record column
    meta_prefix : string, default None
        If not None, prepended to the name of every metadata column
    errors : {'raise', 'ignore'}, default 'raise'
        * ignore : will ignore KeyError if keys listed in meta are not
        always present
        * raise : will raise KeyError if keys listed in meta are not
        always present

        .. versionadded:: 0.20.0

    Returns
    -------
    frame : DataFrame
        An empty DataFrame when ``data`` is an empty list.

    Raises
    ------
    KeyError
        If a metadata key is missing from a record and errors is not 'ignore'
    ValueError
        If a metadata column name is already present in the record columns

    Examples
    --------

    >>> data = [{'state': 'Florida',
    ...          'shortname': 'FL',
    ...          'info': {
    ...               'governor': 'Rick Scott'
    ...          },
    ...          'counties': [{'name': 'Dade', 'population': 12345},
    ...                      {'name': 'Broward', 'population': 40000},
    ...                      {'name': 'Palm Beach', 'population': 60000}]},
    ...         {'state': 'Ohio',
    ...          'shortname': 'OH',
    ...          'info': {
    ...               'governor': 'John Kasich'
    ...          },
    ...          'counties': [{'name': 'Summit', 'population': 1234},
    ...                       {'name': 'Cuyahoga', 'population': 1337}]}]
    >>> from pandas.io.json import json_normalize
    >>> result = json_normalize(data, 'counties', ['state', 'shortname',
    ...                                           ['info', 'governor']])
    >>> result
             name  population info.governor    state shortname
    0        Dade       12345    Rick Scott  Florida        FL
    1     Broward       40000    Rick Scott  Florida        FL
    2  Palm Beach       60000    Rick Scott  Florida        FL
    3      Summit        1234   John Kasich     Ohio        OH
    4    Cuyahoga        1337   John Kasich     Ohio        OH

    """
    def _pull_field(js, spec):
        # follow ``spec`` into ``js``: a list is a path of successive keys,
        # anything else is a single key
        result = js
        if isinstance(spec, list):
            for field in spec:
                result = result[field]
        else:
            result = result[spec]

        return result

    # an empty list has nothing to normalize (and no first record to inspect)
    if isinstance(data, list) and not data:
        return DataFrame()

    # A bit of a hackjob
    if isinstance(data, dict):
        data = [data]

    if record_path is None:
        # look at every record, not only the first one: a nested dict may
        # appear in any of them
        if any(isinstance(x, dict)
               for y in data for x in compat.itervalues(y)):
            # naive normalization, this is idempotent for flat records
            # and potentially will inflate the data considerably for
            # deeply nested structures:
            #  {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:@}
            #
            # TODO: handle record value which are lists, at least error
            #       reasonably
            data = nested_to_record(data)
        return DataFrame(data)
    elif not isinstance(record_path, list):
        record_path = [record_path]

    if meta is None:
        meta = []
    elif not isinstance(meta, list):
        meta = [meta]

    # every metadata entry becomes a path (list of keys); build a new list
    # so the caller's ``meta`` argument is left untouched
    meta = [m if isinstance(m, list) else [m] for m in meta]

    # Disastrously inefficient for now
    records = []
    lengths = []

    meta_vals = defaultdict(list)
    # column name of each metadata path: its parts joined by '.'
    meta_keys = ['.'.join(val) for val in meta]

    def _recursive_extract(data, path, seen_meta, level=0):
        # walk ``path`` one key per recursion level; metadata found on the
        # way down is remembered in ``seen_meta``, and at the last level the
        # records, their counts and the metadata values are appended to the
        # enclosing ``records``, ``lengths`` and ``meta_vals``
        if len(path) > 1:
            for obj in data:
                for val, key in zip(meta, meta_keys):
                    if level + 1 == len(val):
                        seen_meta[key] = _pull_field(obj, val[-1])

                _recursive_extract(obj[path[0]], path[1:],
                                   seen_meta, level=level + 1)
        else:
            for obj in data:
                recs = _pull_field(obj, path[0])

                # For repeating the metadata later
                lengths.append(len(recs))

                for val, key in zip(meta, meta_keys):
                    if level + 1 > len(val):
                        # this metadata was collected at a shallower level
                        meta_val = seen_meta[key]
                    else:
                        try:
                            meta_val = _pull_field(obj, val[level:])
                        except KeyError as e:
                            if errors == 'ignore':
                                meta_val = np.nan
                            else:
                                # interpolate the missing key into the
                                # message instead of passing it as a second
                                # exception argument
                                raise KeyError("Try running with "
                                               "errors='ignore' as key "
                                               "%s is not always present" % e)
                    meta_vals[key].append(meta_val)

                records.extend(recs)

    _recursive_extract(data, record_path, {}, level=0)

    result = DataFrame(records)

    if record_prefix is not None:
        # format() rather than ``+`` so non-string column labels (e.g. the
        # integer columns of scalar records) can be prefixed as well
        result.rename(
            columns=lambda x: "{p}{c}".format(p=record_prefix, c=x),
            inplace=True)

    # Data types, a problem
    for k, v in compat.iteritems(meta_vals):
        if meta_prefix is not None:
            k = meta_prefix + k

        if k in result:
            raise ValueError('Conflicting metadata name %s, '
                             'need distinguishing prefix ' % k)

        # repeat each metadata value once per record it belongs to
        result[k] = np.array(v).repeat(lengths)

    return result
Beispiel #19
0
def shuffle_uri(df, grouped):
    """
    Shuffle the ``'value'`` column within each group and store the result.

    Parameters
    ----------
    df : DataFrame
        Frame holding a ``'value'`` column; a ``'state_permuted'`` column
        is added to (or overwritten in) this frame in place.
    grouped : GroupBy
        Grouping whose ``groups`` mapping supplies, per group, the row
        labels of ``df`` that belong to it.

    Returns
    -------
    None
        ``df`` is modified in place.
    """
    # Permute the row labels of every group independently, then glue the
    # per-group permutations together in the order the groups are listed.
    perm = np.r_[tuple([np.random.permutation(labels)
                        for labels in grouped.groups.values()])]
    # ``groups`` holds index *labels*, so use the label-based ``.loc``
    # indexer (``.ix`` is deprecated and was removed from pandas).
    df['state_permuted'] = np.asarray(df.loc[perm]['value'])
Beispiel #20
0
 def is_any_series():
     """Tell whether at least one value held in ``result`` is a Series."""
     for value in compat.itervalues(result):
         if isinstance(value, ABCSeries):
             return True
     return False
Beispiel #21
0
    def _aggregate(self, arg, *args, **kwargs):
        """
        Provide an implementation for the aggregators.

        Dispatches on the type of ``arg``: a string is handed to
        ``_try_aggregate_string_function``, a dict is aggregated key by key
        and the pieces are combined, any other list-like goes to
        ``_aggregate_multiple_funcs``, and anything else is looked up via
        ``_is_cython_func``.

        Parameters
        ----------
        arg : string, dict, function
        *args : args to pass on to the function
        **kwargs : kwargs to pass on to the function
            The private keys ``_axis`` and ``_level`` are popped from
            ``kwargs`` before anything is forwarded.

        Returns
        -------
        tuple of result, how

        Raises
        ------
        ValueError
            If a dict is passed while the axis is not 0, or if combining a
            dict of Series fails with a TypeError.
        SpecificationError
            If a nested dict is given for a key that is not a column of the
            selected object.

        Notes
        -----
        how describes the required post-processing, or is None if none is
        required. In this implementation the second element is always
        either ``None`` or ``True``; ``(None, True)`` is returned when
        ``arg`` was not handled here so that the caller can react.
        """
        is_aggregator = lambda x: isinstance(x, (list, tuple, dict))
        is_nested_renamer = False

        # private control kwargs: strip them so they are not forwarded
        _axis = kwargs.pop('_axis', None)
        if _axis is None:
            _axis = getattr(self, 'axis', 0)
        _level = kwargs.pop('_level', None)

        # a string names a function to apply
        if isinstance(arg, compat.string_types):
            return self._try_aggregate_string_function(arg, *args,
                                                       **kwargs), None

        if isinstance(arg, dict):

            # aggregate based on the passed dict
            if _axis != 0:  # pragma: no cover
                raise ValueError('Can only pass dict with axis=0')

            obj = self._selected_obj

            def nested_renaming_depr(level=4):
                # deprecation of nested renaming
                # GH 15931
                # ``level`` is forwarded as the warning's stacklevel
                warnings.warn(("using a dict with renaming "
                               "is deprecated and will be removed in a future "
                               "version"),
                              FutureWarning,
                              stacklevel=level)

            # if we have a dict of any non-scalars
            # eg. {'A' : ['mean']}, normalize all to
            # be list-likes
            if any(is_aggregator(x) for x in compat.itervalues(arg)):
                new_arg = compat.OrderedDict()
                for k, v in compat.iteritems(arg):
                    if not isinstance(v, (tuple, list, dict)):
                        new_arg[k] = [v]
                    else:
                        new_arg[k] = v

                    # the keys must be in the columns
                    # for ndim=2, or renamers for ndim=1

                    # ok for now, but deprecated
                    # {'A': { 'ra': 'mean' }}
                    # {'A': { 'ra': ['mean'] }}
                    # {'ra': ['mean']}

                    # not ok
                    # {'ra' : { 'A' : 'mean' }}
                    if isinstance(v, dict):
                        is_nested_renamer = True

                        if k not in obj.columns:
                            raise SpecificationError('cannot perform renaming '
                                                     'for {0} with a nested '
                                                     'dictionary'.format(k))
                        # deepen the stacklevel by the current nesting level
                        nested_renaming_depr(4 + (_level or 0))

                    elif isinstance(obj, ABCSeries):
                        nested_renaming_depr()

                arg = new_arg

            else:
                # deprecation of renaming keys
                # GH 15931
                # warn when some dict keys are not columns of the frame
                keys = list(compat.iterkeys(arg))
                if (isinstance(obj, ABCDataFrame)
                        and len(obj.columns.intersection(keys)) != len(keys)):
                    nested_renaming_depr()

            from pandas.tools.concat import concat

            def _agg_1dim(name, how, subset=None):
                """
                aggregate a 1-dim with how
                """
                colg = self._gotitem(name, ndim=1, subset=subset)
                if colg.ndim != 1:
                    raise SpecificationError("nested dictionary is ambiguous "
                                             "in aggregation")
                # track the nesting depth for the recursive call
                return colg.aggregate(how, _level=(_level or 0) + 1)

            def _agg_2dim(name, how):
                """
                aggregate a 2-dim with how

                ``name`` is not used; the current selection is aggregated.
                """
                colg = self._gotitem(self._selection, ndim=2, subset=obj)
                return colg.aggregate(how, _level=None)

            def _agg(arg, func):
                """
                run the aggregations over the arg with func
                return an OrderedDict
                """
                result = compat.OrderedDict()
                for fname, agg_how in compat.iteritems(arg):
                    result[fname] = func(fname, agg_how)
                return result

            # set the final keys
            keys = list(compat.iterkeys(arg))
            result = compat.OrderedDict()

            # nested renamer
            if is_nested_renamer:
                result = list(_agg(arg, _agg_1dim).values())

                if all(isinstance(r, dict) for r in result):

                    # every piece is itself a dict: merge them into one
                    # flat OrderedDict and take its keys as the final keys
                    result, results = compat.OrderedDict(), result
                    for r in results:
                        result.update(r)
                    keys = list(compat.iterkeys(result))

                else:

                    # result stays a list here; with a selection present the
                    # pieces are concatenated below without keys
                    if self._selection is not None:
                        keys = None

            # some selection on the object
            elif self._selection is not None:

                sl = set(self._selection_list)

                # we are a Series like object,
                # but may have multiple aggregations
                if len(sl) == 1:

                    result = _agg(
                        arg, lambda fname, agg_how: _agg_1dim(
                            self._selection, agg_how))

                # we are selecting the same set as we are aggregating
                elif not len(sl - set(keys)):

                    result = _agg(arg, _agg_1dim)

                # we are a DataFrame, with possibly multiple aggregations
                else:

                    result = _agg(arg, _agg_2dim)

            # no selection
            else:

                try:
                    result = _agg(arg, _agg_1dim)
                except SpecificationError:

                    # we are aggregating expecting all 1d-returns
                    # but we have 2d
                    result = _agg(arg, _agg_2dim)

            # combine results

            def is_any_series():
                # return a boolean if we have *any* nested series
                return any([
                    isinstance(r, ABCSeries) for r in compat.itervalues(result)
                ])

            def is_any_frame():
                # return a boolean if we have *any* nested frame
                return any([
                    isinstance(r, ABCDataFrame)
                    for r in compat.itervalues(result)
                ])

            # a list only arises from the nested-renamer branch above
            if isinstance(result, list):
                return concat(result, keys=keys, axis=1), True

            elif is_any_frame():
                # we have a dict of DataFrames
                # return a MI DataFrame

                return concat([result[k] for k in keys], keys=keys,
                              axis=1), True

            elif isinstance(self, ABCSeries) and is_any_series():

                # we have a dict of Series
                # return a MI Series
                try:
                    result = concat(result)
                except TypeError:
                    # we want to give a nice error here if
                    # we have non-same sized objects, so
                    # we don't automatically broadcast

                    raise ValueError("cannot perform both aggregation "
                                     "and transformation operations "
                                     "simultaneously")

                return result, True

            # fall thru
            from pandas import DataFrame, Series
            try:
                result = DataFrame(result)
            except ValueError:

                # we have a dict of scalars
                result = Series(result, name=getattr(self, 'name', None))

            return result, True
        # NOTE(review): ``arg not in compat.string_types`` is a membership
        # test against ``compat.string_types`` itself, not an isinstance
        # check; string args have already returned above - confirm intent.
        elif is_list_like(arg) and arg not in compat.string_types:
            # we require a list, but not an 'str'
            return self._aggregate_multiple_funcs(arg,
                                                  _level=_level,
                                                  _axis=_axis), None
        else:
            result = None

        # only taken when no extra args/kwargs were supplied
        f = self._is_cython_func(arg)
        if f and not args and not kwargs:
            return getattr(self, f)(), None

        # caller can react
        return result, True
Beispiel #22
0
from pandas.io.data import _yahoo_codes
from pandas.io.common import urlopen
import pandas.compat as compat
from collections import defaultdict

# Symbol(s) placed in the ``s=`` query parameter of the request URL.
sym_list = 'APT'
_yahoo_codes.update({'MarketCap' : 'j1'})
request = ''.join(compat.itervalues(_yahoo_codes))  # code request string
header = list(_yahoo_codes.keys())
# print() with a single argument behaves the same on Python 2 and 3,
# whereas the bare ``print x`` statement is a syntax error on Python 3.
print(header)
data = defaultdict(list)

url_str = 'http://finance.yahoo.com/d/quotes.csv?s=%s&f=%s' % (sym_list, request)

# Fetch the response and dump its raw lines.
with urlopen(url_str) as url:
    lines = url.readlines()
    print(lines)
Beispiel #23
0
 def is_any_frame():
     """Tell whether at least one value held in ``result`` is a DataFrame."""
     values = compat.itervalues(result)
     return any(isinstance(item, ABCDataFrame) for item in values)
Beispiel #24
0
 def is_any_series():
     """Report whether ``result`` contains any Series among its values."""
     found = False
     for entry in compat.itervalues(result):
         if isinstance(entry, ABCSeries):
             found = True
             break
     return found
Beispiel #25
0
    def _aggregate(self, arg, *args, **kwargs):
        """
        Provide an implementation for the aggregators.

        Dispatches on the type of ``arg``: a string is resolved to the
        attribute of that name on ``self`` and called, a dict is aggregated
        key by key and the pieces are combined into one object, any other
        object with ``__iter__`` goes to ``_aggregate_multiple_funcs``, and
        anything else is looked up via ``_is_cython_func``.

        Parameters
        ----------
        arg : string, dict, function
        *args : args to pass on to the function
        **kwargs : kwargs to pass on to the function
            The private key ``_level`` is popped from ``kwargs`` before
            anything is forwarded.

        Returns
        -------
        tuple of result, how

        Raises
        ------
        ValueError
            If a dict is passed while ``self.axis`` is not 0.
        SpecificationError
            If a nested dict is given for a key that is not a column of the
            selected object.

        Notes
        -----
        how describes the required post-processing, or is None if none is
        required. In this implementation the second element is always
        either ``None`` or ``True``; ``(None, True)`` is returned when
        ``arg`` was not handled here so that the caller can react.
        """

        is_aggregator = lambda x: isinstance(x, (list, tuple, dict))
        is_nested_renamer = False

        # private control kwarg: strip it so it is not forwarded
        _level = kwargs.pop('_level', None)
        # a string names a method of self to call
        if isinstance(arg, compat.string_types):
            return getattr(self, arg)(*args, **kwargs), None

        if isinstance(arg, dict):

            # aggregate based on the passed dict
            if self.axis != 0:  # pragma: no cover
                raise ValueError('Can only pass dict with axis=0')

            obj = self._selected_obj

            # if we have a dict of any non-scalars
            # eg. {'A' : ['mean']}, normalize all to
            # be list-likes
            if any(is_aggregator(x) for x in compat.itervalues(arg)):
                new_arg = compat.OrderedDict()
                for k, v in compat.iteritems(arg):
                    if not isinstance(v, (tuple, list, dict)):
                        new_arg[k] = [v]
                    else:
                        new_arg[k] = v

                    # the keys must be in the columns
                    # for ndim=2, or renamers for ndim=1

                    # ok
                    # {'A': { 'ra': 'mean' }}
                    # {'A': { 'ra': ['mean'] }}
                    # {'ra': ['mean']}

                    # not ok
                    # {'ra' : { 'A' : 'mean' }}
                    if isinstance(v, dict):
                        is_nested_renamer = True

                        if k not in obj.columns:
                            raise SpecificationError('cannot perform renaming '
                                                     'for {0} with a nested '
                                                     'dictionary'.format(k))

                arg = new_arg

            from pandas.tools.merge import concat

            def _agg_1dim(name, how, subset=None):
                """
                aggregate a 1-dim with how
                """
                colg = self._gotitem(name, ndim=1, subset=subset)
                if colg.ndim != 1:
                    raise SpecificationError("nested dictionary is ambiguous "
                                             "in aggregation")
                # track the nesting depth for the recursive call
                return colg.aggregate(how, _level=(_level or 0) + 1)

            def _agg_2dim(name, how):
                """
                aggregate a 2-dim with how

                ``name`` is not used; the current selection is aggregated.
                """
                colg = self._gotitem(self._selection, ndim=2,
                                     subset=obj)
                return colg.aggregate(how, _level=None)

            def _agg(arg, func):
                """
                run the aggregations over the arg with func
                return an OrderedDict
                """
                result = compat.OrderedDict()
                for fname, agg_how in compat.iteritems(arg):
                    result[fname] = func(fname, agg_how)
                return result

            # set the final keys
            keys = list(compat.iterkeys(arg))
            result = compat.OrderedDict()

            # nested renamer
            if is_nested_renamer:
                result = list(_agg(arg, _agg_1dim).values())

                if all(isinstance(r, dict) for r in result):

                    # every piece is itself a dict: merge them into one
                    # flat OrderedDict and take its keys as the final keys
                    result, results = compat.OrderedDict(), result
                    for r in results:
                        result.update(r)
                    keys = list(compat.iterkeys(result))

                else:

                    # result stays a list here; with a selection present the
                    # pieces are concatenated below without keys
                    if self._selection is not None:
                        keys = None

            # some selection on the object
            elif self._selection is not None:

                sl = set(self._selection_list)

                # we are a Series like object,
                # but may have multiple aggregations
                if len(sl) == 1:

                    result = _agg(arg, lambda fname,
                                  agg_how: _agg_1dim(self._selection, agg_how))

                # we are selecting the same set as we are aggregating
                elif not len(sl - set(compat.iterkeys(arg))):

                    result = _agg(arg, _agg_1dim)

                # we are a DataFrame, with possibly multiple aggregations
                else:

                    result = _agg(arg, _agg_2dim)

            # no selection
            else:

                try:
                    result = _agg(arg, _agg_1dim)
                except SpecificationError:

                    # we are aggregating expecting all 1d-returns
                    # but we have 2d
                    result = _agg(arg, _agg_2dim)

            # combine results
            # a list only arises from the nested-renamer branch above
            if isinstance(result, list):
                result = concat(result, keys=keys, axis=1)
            # NOTE(review): only the first value is inspected, and an empty
            # result dict would raise IndexError on the [0] lookup.
            elif isinstance(list(compat.itervalues(result))[0],
                            gt.ABCDataFrame):
                result = concat([result[k] for k in keys], keys=keys, axis=1)
            else:
                from pandas import DataFrame
                result = DataFrame(result)

            return result, True
        elif hasattr(arg, '__iter__'):
            return self._aggregate_multiple_funcs(arg, _level=_level), None
        else:
            result = None

        # only taken when no extra args/kwargs were supplied
        cy_func = self._is_cython_func(arg)
        if cy_func and not args and not kwargs:
            return getattr(self, cy_func)(), None

        # caller can react
        return result, True
Beispiel #26
0
 def test_dict_iterators(self):
     """Check the first item produced by each dict iterator helper."""
     mapping = {1: 2}
     first_value = next(itervalues(mapping))
     self.assertEqual(first_value, 2)
     first_key = next(iterkeys(mapping))
     self.assertEqual(first_key, 1)
     first_pair = next(iteritems(mapping))
     self.assertEqual(first_pair, (1, 2))
Beispiel #27
0
 def test_dict_iterators(self):
     """Each dict iterator helper yields the lone entry of a 1-item dict."""
     sample = {1: 2}

     value_iter = itervalues(sample)
     assert next(value_iter) == 2

     key_iter = iterkeys(sample)
     assert next(key_iter) == 1

     item_iter = iteritems(sample)
     assert next(item_iter) == (1, 2)
Beispiel #28
0
    def _aggregate(self, arg, *args, **kwargs):
        """
        Provide an implementation for the aggregators.

        Dispatches on the type of ``arg``: a string is resolved to the
        attribute of that name on ``self`` and called, a dict is aggregated
        key by key and the pieces are combined into one object, any other
        object with ``__iter__`` goes to ``_aggregate_multiple_funcs``, and
        anything else is looked up via ``_is_cython_func``.

        Parameters
        ----------
        arg : string, dict, function
        *args : args to pass on to the function
        **kwargs : kwargs to pass on to the function
            The private key ``_level`` is popped from ``kwargs`` before
            anything is forwarded.

        Returns
        -------
        tuple of result, how

        Raises
        ------
        ValueError
            If a dict is passed while ``self.axis`` is not 0.
        SpecificationError
            If a nested dict is given for a key that is not a column of the
            selected object.

        Notes
        -----
        how describes the required post-processing, or is None if none is
        required. In this implementation the second element is always
        either ``None`` or ``True``; ``(None, True)`` is returned when
        ``arg`` was not handled here so that the caller can react.
        """

        is_aggregator = lambda x: isinstance(x, (list, tuple, dict))
        is_nested_renamer = False

        # private control kwarg: strip it so it is not forwarded
        _level = kwargs.pop('_level', None)
        # a string names a method of self to call
        if isinstance(arg, compat.string_types):
            return getattr(self, arg)(*args, **kwargs), None

        if isinstance(arg, dict):

            # aggregate based on the passed dict
            if self.axis != 0:  # pragma: no cover
                raise ValueError('Can only pass dict with axis=0')

            obj = self._selected_obj

            # if we have a dict of any non-scalars
            # eg. {'A' : ['mean']}, normalize all to
            # be list-likes
            if any(is_aggregator(x) for x in compat.itervalues(arg)):
                new_arg = compat.OrderedDict()
                for k, v in compat.iteritems(arg):
                    if not isinstance(v, (tuple, list, dict)):
                        new_arg[k] = [v]
                    else:
                        new_arg[k] = v

                    # the keys must be in the columns
                    # for ndim=2, or renamers for ndim=1

                    # ok
                    # {'A': { 'ra': 'mean' }}
                    # {'A': { 'ra': ['mean'] }}
                    # {'ra': ['mean']}

                    # not ok
                    # {'ra' : { 'A' : 'mean' }}
                    if isinstance(v, dict):
                        is_nested_renamer = True

                        if k not in obj.columns:
                            raise SpecificationError('cannot perform renaming '
                                                     'for {0} with a nested '
                                                     'dictionary'.format(k))

                arg = new_arg

            from pandas.tools.merge import concat

            def _agg_1dim(name, how, subset=None):
                """
                aggregate a 1-dim with how
                """
                colg = self._gotitem(name, ndim=1, subset=subset)
                if colg.ndim != 1:
                    raise SpecificationError("nested dictionary is ambiguous "
                                             "in aggregation")
                # track the nesting depth for the recursive call
                return colg.aggregate(how, _level=(_level or 0) + 1)

            def _agg_2dim(name, how):
                """
                aggregate a 2-dim with how

                ``name`` is not used; the current selection is aggregated.
                """
                colg = self._gotitem(self._selection, ndim=2,
                                     subset=obj)
                return colg.aggregate(how, _level=None)

            def _agg(arg, func):
                """
                run the aggregations over the arg with func
                return an OrderedDict
                """
                result = compat.OrderedDict()
                for fname, agg_how in compat.iteritems(arg):
                    result[fname] = func(fname, agg_how)
                return result

            # set the final keys
            keys = list(compat.iterkeys(arg))
            result = compat.OrderedDict()

            # nested renamer
            if is_nested_renamer:
                result = list(_agg(arg, _agg_1dim).values())

                if all(isinstance(r, dict) for r in result):

                    # every piece is itself a dict: merge them into one
                    # flat OrderedDict and take its keys as the final keys
                    result, results = compat.OrderedDict(), result
                    for r in results:
                        result.update(r)
                    keys = list(compat.iterkeys(result))

                else:

                    # result stays a list here; with a selection present the
                    # pieces are concatenated below without keys
                    if self._selection is not None:
                        keys = None

            # some selection on the object
            elif self._selection is not None:

                sl = set(self._selection_list)

                # we are a Series like object,
                # but may have multiple aggregations
                if len(sl) == 1:

                    result = _agg(arg, lambda fname,
                                  agg_how: _agg_1dim(self._selection, agg_how))

                # we are selecting the same set as we are aggregating
                elif not len(sl - set(compat.iterkeys(arg))):

                    result = _agg(arg, _agg_1dim)

                # we are a DataFrame, with possibly multiple aggregations
                else:

                    result = _agg(arg, _agg_2dim)

            # no selection
            else:

                try:
                    result = _agg(arg, _agg_1dim)
                except SpecificationError:

                    # we are aggregating expecting all 1d-returns
                    # but we have 2d
                    result = _agg(arg, _agg_2dim)

            # combine results
            # a list only arises from the nested-renamer branch above
            if isinstance(result, list):
                result = concat(result, keys=keys, axis=1)
            # NOTE(review): only the first value is inspected, and an empty
            # result dict would raise IndexError on the [0] lookup.
            elif isinstance(list(compat.itervalues(result))[0],
                            com.ABCDataFrame):
                result = concat([result[k] for k in keys], keys=keys, axis=1)
            else:
                from pandas import DataFrame
                result = DataFrame(result)

            return result, True
        elif hasattr(arg, '__iter__'):
            return self._aggregate_multiple_funcs(arg, _level=_level), None
        else:
            result = None

        # only taken when no extra args/kwargs were supplied
        cy_func = self._is_cython_func(arg)
        if cy_func and not args and not kwargs:
            return getattr(self, cy_func)(), None

        # caller can react
        return result, True
Beispiel #29
0
def json_normalize(data,
                   record_path=None,
                   meta=None,
                   meta_prefix=None,
                   record_prefix=None,
                   errors='raise'):
    """
    "Normalize" semi-structured JSON data into a flat table

    Parameters
    ----------
    data : dict or list of dicts
        Unserialized JSON objects. A single dict is treated as a
        one-element list.
    record_path : string or list of strings, default None
        Path in each object to list of records. If not passed, data will be
        assumed to be an array of records
    meta : list of paths (string or list of strings), default None
        Fields to use as metadata for each record in resulting table.
        The passed list is not modified.
    meta_prefix : string, default None
        If given, prepended to the name of every metadata column
    record_prefix : string, default None
        If given, prepended to the name of every record column
    errors : {'raise', 'ignore'}, default 'raise'

        * 'ignore' : will ignore KeyError if keys listed in meta are not
          always present; the missing value becomes NaN
        * 'raise' : will raise KeyError if keys listed in meta are not
          always present

        .. versionadded:: 0.20.0

    Returns
    -------
    frame : DataFrame

    Raises
    ------
    KeyError
        If a metadata key is missing from an object and ``errors`` is not
        'ignore'.
    ValueError
        If a (prefixed) metadata name clashes with a record column.

    Examples
    --------

    >>> data = [{'state': 'Florida',
    ...          'shortname': 'FL',
    ...          'info': {
    ...               'governor': 'Rick Scott'
    ...          },
    ...          'counties': [{'name': 'Dade', 'population': 12345},
    ...                      {'name': 'Broward', 'population': 40000},
    ...                      {'name': 'Palm Beach', 'population': 60000}]},
    ...         {'state': 'Ohio',
    ...          'shortname': 'OH',
    ...          'info': {
    ...               'governor': 'John Kasich'
    ...          },
    ...          'counties': [{'name': 'Summit', 'population': 1234},
    ...                       {'name': 'Cuyahoga', 'population': 1337}]}]
    >>> from pandas.io.json import json_normalize
    >>> result = json_normalize(data, 'counties', ['state', 'shortname',
    ...                                           ['info', 'governor']])
    >>> result
             name  population info.governor    state shortname
    0        Dade       12345    Rick Scott  Florida        FL
    1     Broward       40000    Rick Scott  Florida        FL
    2  Palm Beach       60000    Rick Scott  Florida        FL
    3      Summit        1234   John Kasich     Ohio        OH
    4    Cuyahoga        1337   John Kasich     Ohio        OH

    (The order of the metadata columns may differ.)

    """
    def _pull_field(js, spec):
        # follow spec (a single key or a list of keys) down into js
        result = js
        if isinstance(spec, list):
            for field in spec:
                result = result[field]
        else:
            result = result[spec]

        return result

    # A bit of a hackjob
    if isinstance(data, dict):
        data = [data]

    if record_path is None:
        # guard on the length first so that empty input yields an empty
        # frame instead of an IndexError on data[0]
        if len(data) > 0 and any(isinstance(x, dict)
                                 for x in data[0].values()):
            # naive normalization, this is idempotent for flat records
            # and potentially will inflate the data considerably for
            # deeply nested structures:
            #  {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:@}
            #
            # TODO: handle record value which are lists, at least error
            #       reasonably
            data = nested_to_record(data)
        return DataFrame(data)
    elif not isinstance(record_path, list):
        record_path = [record_path]

    if meta is None:
        meta = []
    elif not isinstance(meta, list):
        meta = [meta]

    # build a new list of paths rather than rewriting the entries of the
    # caller's list in place
    meta = [m if isinstance(m, list) else [m] for m in meta]

    # Disastrously inefficient for now
    records = []
    lengths = []

    meta_vals = defaultdict(list)
    meta_keys = ['.'.join(val) for val in meta]

    def _recursive_extract(data, path, seen_meta, level=0):
        if len(path) > 1:
            # not yet at the records: remember metadata whose path ends at
            # this depth, then descend one level along the record path
            for obj in data:
                for val, key in zip(meta, meta_keys):
                    if level + 1 == len(val):
                        seen_meta[key] = _pull_field(obj, val[-1])

                _recursive_extract(obj[path[0]],
                                   path[1:],
                                   seen_meta,
                                   level=level + 1)
        else:
            for obj in data:
                recs = _pull_field(obj, path[0])

                # For repeating the metadata later
                lengths.append(len(recs))

                for val, key in zip(meta, meta_keys):
                    if level + 1 > len(val):
                        # metadata was collected higher up the path
                        meta_val = seen_meta[key]
                    else:
                        try:
                            meta_val = _pull_field(obj, val[level:])
                        except KeyError as e:
                            if errors == 'ignore':
                                meta_val = np.nan
                            else:
                                # interpolate the missing key into the
                                # message instead of passing the template
                                # and the exception as two separate args
                                raise \
                                    KeyError("Try running with "
                                             "errors='ignore' as key "
                                             "{err} is not always "
                                             "present".format(err=e))
                    meta_vals[key].append(meta_val)

                records.extend(recs)

    _recursive_extract(data, record_path, {}, level=0)

    result = DataFrame(records)

    if record_prefix is not None:
        result.rename(columns=lambda x: record_prefix + x, inplace=True)

    # Data types, a problem
    for k, v in meta_vals.items():
        if meta_prefix is not None:
            k = meta_prefix + k

        if k in result:
            raise ValueError('Conflicting metadata name %s, '
                             'need distinguishing prefix ' % k)

        # one metadata value per source object, repeated once per record
        result[k] = np.array(v).repeat(lengths)

    return result
Beispiel #30
0
def json_normalize(data, record_path=None, meta=None,
                   meta_prefix=None,
                   record_prefix=None,
                   errors='raise',
                   sep='.'):
    """
    Normalize semi-structured JSON data into a flat table.

    Parameters
    ----------
    data : dict or list of dicts
        Unserialized JSON objects
    record_path : string or list of strings, default None
        Path in each object to list of records. If not passed, data will be
        assumed to be an array of records
    meta : list of paths (string or list of strings), default None
        Fields to use as metadata for each record in resulting table
    meta_prefix : string, default None
        If given, prepended to the name of every metadata column
    record_prefix : string, default None
        If given, prepended to the name of every record column, e.g.
        record_prefix='Prefix.' turns column 0 into 'Prefix.0'
    errors : {'raise', 'ignore'}, default 'raise'

        * 'ignore' : will ignore KeyError if keys listed in meta are not
          always present
        * 'raise' : will raise KeyError if keys listed in meta are not
          always present

        .. versionadded:: 0.20.0

    sep : string, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar

        .. versionadded:: 0.20.0

    Returns
    -------
    frame : DataFrame

    Raises
    ------
    KeyError
        If a key listed in meta is missing from a record and errors is
        not 'ignore'
    ValueError
        If a metadata column name (after applying meta_prefix) collides
        with a record column name

    Examples
    --------

    >>> from pandas.io.json import json_normalize
    >>> data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}},
    ...         {'name': {'given': 'Mose', 'family': 'Regner'}},
    ...         {'id': 2, 'name': 'Faye Raker'}]
    >>> json_normalize(data)
        id        name name.family name.first name.given name.last
    0  1.0         NaN         NaN     Coleen        NaN      Volk
    1  NaN         NaN      Regner        NaN       Mose       NaN
    2  2.0  Faye Raker         NaN        NaN        NaN       NaN

    >>> data = [{'state': 'Florida',
    ...          'shortname': 'FL',
    ...          'info': {
    ...               'governor': 'Rick Scott'
    ...          },
    ...          'counties': [{'name': 'Dade', 'population': 12345},
    ...                      {'name': 'Broward', 'population': 40000},
    ...                      {'name': 'Palm Beach', 'population': 60000}]},
    ...         {'state': 'Ohio',
    ...          'shortname': 'OH',
    ...          'info': {
    ...               'governor': 'John Kasich'
    ...          },
    ...          'counties': [{'name': 'Summit', 'population': 1234},
    ...                       {'name': 'Cuyahoga', 'population': 1337}]}]
    >>> result = json_normalize(data, 'counties', ['state', 'shortname',
    ...                                           ['info', 'governor']])
    >>> result
             name  population info.governor    state shortname
    0        Dade       12345    Rick Scott  Florida        FL
    1     Broward       40000    Rick Scott  Florida        FL
    2  Palm Beach       60000    Rick Scott  Florida        FL
    3      Summit        1234   John Kasich     Ohio        OH
    4    Cuyahoga        1337   John Kasich     Ohio        OH

    >>> data = {'A': [1, 2]}
    >>> json_normalize(data, 'A', record_prefix='Prefix.')
        Prefix.0
    0          1
    1          2
    """
    def _pull_field(js, spec):
        # Follow ``spec`` (a single key or a list of keys) down into ``js``.
        result = js
        if isinstance(spec, list):
            for field in spec:
                result = result[field]
        else:
            result = result[spec]

        return result

    if isinstance(data, list) and not data:
        return DataFrame()

    # A bit of a hackjob
    if isinstance(data, dict):
        data = [data]

    if record_path is None:
        # Flatten only when some record really holds a nested dict.  The
        # generator is deliberately flat: any() over one list per record
        # would be true for every non-empty record, nested or not.
        if any(isinstance(x, dict)
               for y in data
               for x in compat.itervalues(y)):
            # naive normalization, this is idempotent for flat records
            # and potentially will inflate the data considerably for
            # deeply nested structures:
            #  {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:2}
            #
            # TODO: handle record value which are lists, at least error
            #       reasonably
            data = nested_to_record(data, sep=sep)
        return DataFrame(data)
    elif not isinstance(record_path, list):
        record_path = [record_path]

    if meta is None:
        meta = []
    elif not isinstance(meta, list):
        meta = [meta]

    # Every metadata entry becomes a list of keys (a path).
    meta = [m if isinstance(m, list) else [m] for m in meta]

    # Disastrously inefficient for now
    records = []
    lengths = []

    meta_vals = defaultdict(list)
    if not isinstance(sep, compat.string_types):
        sep = str(sep)
    # Column name of each metadata path, e.g. ['info', 'governor'] ->
    # 'info.governor' for sep='.'.
    meta_keys = [sep.join(val) for val in meta]

    def _recursive_extract(data, path, seen_meta, level=0):
        # Walk ``data`` along ``path``; fills ``records``, ``lengths`` and
        # ``meta_vals`` from the enclosing scope as a side effect.
        if isinstance(data, dict):
            data = [data]
        if len(path) > 1:
            for obj in data:
                # Remember metadata whose path ends exactly at this depth so
                # it can be attached to the records found further down.
                for val, key in zip(meta, meta_keys):
                    if level + 1 == len(val):
                        seen_meta[key] = _pull_field(obj, val[-1])

                _recursive_extract(obj[path[0]], path[1:],
                                   seen_meta, level=level + 1)
        else:
            for obj in data:
                recs = _pull_field(obj, path[0])

                # For repeating the metadata later
                lengths.append(len(recs))

                for val, key in zip(meta, meta_keys):
                    if level + 1 > len(val):
                        # Value was stored while descending.
                        meta_val = seen_meta[key]
                    else:
                        try:
                            meta_val = _pull_field(obj, val[level:])
                        except KeyError as e:
                            if errors == 'ignore':
                                meta_val = np.nan
                            else:
                                raise KeyError("Try running with "
                                               "errors='ignore' as key "
                                               "{err} is not always present"
                                               .format(err=e))
                    meta_vals[key].append(meta_val)

                records.extend(recs)

    _recursive_extract(data, record_path, {}, level=0)

    result = DataFrame(records)

    if record_prefix is not None:
        # format() rather than ``+`` so non-string column labels work too.
        result = result.rename(
            columns=lambda x: "{p}{c}".format(p=record_prefix, c=x))

    # Data types, a problem
    for k, v in compat.iteritems(meta_vals):
        if meta_prefix is not None:
            k = meta_prefix + k

        if k in result:
            raise ValueError('Conflicting metadata name {name}, '
                             'need distinguishing prefix '.format(name=k))

        # One metadata value per parent object, repeated once for each of
        # that object's records.
        result[k] = np.array(v).repeat(lengths)

    return result
Beispiel #31
0
 def test_utf(self):
     """Check ``encode_decode`` returns an equal frame for every encoding
     in ``self.utf_encodings`` (GH10581)."""
     for enc in self.utf_encodings:
         for original in compat.itervalues(self.frame):
             round_tripped = self.encode_decode(original, encoding=enc)
             assert_frame_equal(round_tripped, original)