コード例 #1
0
ファイル: excel.py プロジェクト: Avasse/larray
        def array(self, data, row_labels=None, column_labels=None, names=None):
            """
            Build an LArray from ranges of this object.

            Parameters
            ----------
            data : str
                Range holding the values of the array.
            row_labels : str, optional
                Range holding the labels of the first axis.
            column_labels : str, optional
                Range holding the labels of the second axis.
            names : list of str, optional
                Axis names, paired positionally with (row labels, column labels).
                When omitted, the raw labels are passed to LArray without wrapping them in Axis objects.

            Returns
            -------
            LArray
            """
            def fetch(key):
                # _converted_value is used implicitly via Range.__array__
                return np.asarray(self[key])

            rows = None if row_labels is None else fetch(row_labels)
            columns = None if column_labels is None else fetch(column_labels)
            per_axis_labels = (rows, columns)

            if names is None:
                axes = per_axis_labels
            else:
                # zip stops at the shortest input, so extra names (or labels) are ignored
                axes = [Axis(labels, axis_name) for labels, axis_name in zip(per_axis_labels, names)]

            values = fetch(data)
            return LArray(values, axes)
コード例 #2
0
 def __eq__(self, other):
     """Compare the items of self and other key by key.

     Returns an LArray with one result per key along an axis called 'name'. Keys of self come first,
     followed by the keys found only in other. Each result is whatever larray_nan_equal returns for the
     pair (self.get(key), other.get(key)).
     """
     known = set(self.keys())
     names = list(self.keys())
     # append the keys which only exist in other, keeping their order
     names.extend(name for name in other.keys() if name not in known)

     results = []
     for name in names:
         results.append(larray_nan_equal(self.get(name), other.get(name)))
     return LArray(results, [Axis(names, 'name')])
コード例 #3
0
def df_aslarray(df, sort_rows=False, sort_columns=False, raw=False, parse_header=True, **kwargs):
    """
    Converts a Pandas DataFrame into an LArray.

    Parameters
    ----------
    df : pandas.DataFrame
        Input dataframe.
    sort_rows : bool, optional
        Forwarded to cartesian_product_df. Defaults to False.
    sort_columns : bool, optional
        Forwarded to cartesian_product_df. Defaults to False.
    raw : bool, optional
        True when the dataframe was read without any index (plain 2D dataframe), irrespective of the actual
        dimensionality of the data. In that case the axes names are taken from the leading column names.
        Defaults to False.
    parse_header : bool, optional
        Whether or not to pass each column label through `parse`. Defaults to True.
    **kwargs
        Extra arguments forwarded to cartesian_product_df.

    Returns
    -------
    LArray
    """
    if not raw:
        axes_names = [decode(name, 'utf8') for name in df.index.names]
    else:
        columns = df.columns.values.tolist()
        # position of the first column whose name is a string containing '\' (None when there is no such column)
        pos_last = next((i for i, v in enumerate(columns) if isinstance(v, basestring) and '\\' in v), None)
        onedim = pos_last is None
        if onedim:
            # we assume first column will not contain data
            pos_last = 0

        axes_names = columns[:pos_last + 1]
        if onedim:
            df = df.iloc[:, 1:]
        else:
            # Going through the actual column objects is required to handle int column names (otherwise we could
            # simply use column positions in set_index).
            # This is NOT the same as df.columns[list(range(...))] !
            index_columns = [df.columns[i] for i in range(pos_last + 1)]
            # TODO: we should pass a flag to df_aslarray so that we can use set_index(..., inplace=True) here
            df = df.set_index(index_columns)

    last_name = axes_names[-1]
    if isinstance(last_name, basestring) and '\\' in last_name:
        # 2 or more dimensions with the names of the last two axes given as "before_last\last"
        axes_names = axes_names[:-1] + [part.strip() for part in last_name.split('\\')]
    elif len(df) == 1 and axes_names == [None]:
        # 1D
        axes_names = [df.columns.name]
    elif len(df) > 1:
        # 2 or more dimensions with the last axis name given as the columns index name
        axes_names.append(df.columns.name)

    if len(axes_names) > 1:
        df, axes_labels = cartesian_product_df(df, sort_rows=sort_rows, sort_columns=sort_columns, **kwargs)
    else:
        axes_labels = []

    # we could inline df_aslarray into the functions that use it, so that the
    # original (non-cartesian) df is freed from memory at this point, but it
    # would be much uglier and would not lower the peak memory usage which
    # happens during cartesian_product_df.reindex

    # Pandas treats column labels as column names (strings) so we need to convert them to values
    if parse_header:
        last_axis_labels = [parse(cell) for cell in df.columns.values]
    else:
        last_axis_labels = list(df.columns.values)
    axes_labels.append(last_axis_labels)

    axes_names = [None if name is None else str(name) for name in axes_names]
    axes = [Axis(labels, name) for labels, name in zip(axes_labels, axes_names)]
    shape = [len(axis) for axis in axes]
    return LArray(df.values.reshape(shape), axes)
コード例 #4
0
ファイル: array.py プロジェクト: Avasse/larray
def from_series(s, sort_rows=False):
    """
    Builds a one-dimensional LArray out of a Pandas Series.

    The axis is named after the series itself or, when the series has no name, after its index.

    Parameters
    ----------
    s : Pandas Series
        Series to convert.
    sort_rows : bool, optional
        Whether or not to sort the series by its index before converting it. Defaults to False.

    Returns
    -------
    LArray
    """
    axis_name = s.name
    if axis_name is None:
        axis_name = s.index.name
    if axis_name is not None:
        axis_name = str(axis_name)

    if sort_rows:
        s = s.sort_index()

    return LArray(s.values, Axis(s.index.values, axis_name))
コード例 #5
0
    def array_equals(self, other):
        """Check, array by array, whether the current session matches another session.

        Each array of the current session is compared with the array of the same name in the other session using
        :py:meth:`LArray.equals` with nan_equals=True. Names present in only one of the two sessions are part of
        the result too.

        Parameters
        ----------
        other : Session
            Session to compare with.

        Returns
        -------
        Boolean LArray
            One value per name, along an axis called 'name'.

        See Also
        --------
        Session.equals

        Examples
        --------
        >>> s1 = Session([('arr1', ndtest(2)), ('arr2', ndtest((2, 2)))])
        >>> s2 = Session([('arr1', ndtest(2)), ('arr2', ndtest((2, 2)))])
        >>> s1.array_equals(s2)
        name  arr1  arr2
              True  True

        Different value(s)

        >>> s2.arr1['a1'] = 0
        >>> s1.array_equals(s2)
        name   arr1  arr2
              False  True

        Different label(s)

        >>> s2.arr2 = ndtest("b=b0,b1; a=a0,a1")
        >>> s1.array_equals(s2)
        name   arr1   arr2
              False  False

        Extra/missing array(s)

        >>> s2.arr3 = ndtest((3, 3))
        >>> s1.array_equals(s2)
        name   arr1   arr2   arr3
              False  False  False
        """
        def nan_aware_equal(left, right):
            # anything which cannot be converted to an array is considered different
            try:
                left = aslarray(left)
            except Exception:
                return False
            return left.equals(right, nan_equals=True)

        known = set(self.keys())
        names = list(self.keys())
        # names which only exist in the other session go last
        names += [name for name in other.keys() if name not in known]

        flags = []
        for name in names:
            flags.append(nan_aware_equal(self.get(name), other.get(name)))
        return LArray(flags, [Axis(names, 'name')])
コード例 #6
0
ファイル: pandas.py プロジェクト: alixdamman/larray
def from_series(s, sort_rows=False, fill_value=nan, meta=None, **kwargs):
    r"""
    Builds an Array out of a Pandas Series.

    A series with a simple index gives a one-dimensional array. A series with a MultiIndex is first unstacked
    on its last level and then converted using from_frame.

    Parameters
    ----------
    s : Pandas Series
        Series to convert.
    sort_rows : bool, optional
        Whether or not to sort the rows alphabetically. Defaults to False.
    fill_value : scalar, optional
        Value used to fill cells corresponding to label combinations which are not present in the input Series.
        Defaults to NaN.
    meta : list of pairs or dict or OrderedDict or Metadata, optional
        Metadata (title, description, author, creation_date, ...) associated with the array.
        Keys must be strings. Values must be of type string, int, float, date, time or datetime.
    **kwargs
        Extra arguments forwarded to from_frame (only used when the series has a MultiIndex).

    Returns
    -------
    Array

    See Also
    --------
    Array.to_series

    Examples
    --------
    >>> from larray import ndtest
    >>> s = ndtest((2, 2, 2), dtype=float).to_series()
    >>> s                                                                             # doctest: +NORMALIZE_WHITESPACE
    a   b   c
    a0  b0  c0    0.0
            c1    1.0
        b1  c0    2.0
            c1    3.0
    a1  b0  c0    4.0
            c1    5.0
        b1  c0    6.0
            c1    7.0
    dtype: float64
    >>> from_series(s)
     a  b\c   c0   c1
    a0   b0  0.0  1.0
    a0   b1  2.0  3.0
    a1   b0  4.0  5.0
    a1   b1  6.0  7.0
    """
    if not isinstance(s.index, pd.MultiIndex):
        # 1D case: the axis takes the name of the series or, failing that, the name of its index
        raw_name = s.name if s.name is not None else s.index.name
        name = decode(raw_name, 'utf8')
        if sort_rows:
            s = s.sort_index()
        return Array(s.values, Axis(s.index.values, name), meta=meta)

    # TODO: use argument sort=False when it will be available
    # (see https://github.com/pandas-dev/pandas/issues/15105)
    df = s.unstack(level=-1, fill_value=fill_value)
    if not sort_rows:
        # pandas (un)stack and pivot(_table) methods return a Dataframe/Series with sorted index and columns,
        # so the original label order has to be restored explicitly
        labels = index_to_labels(s.index, sort=False)
        if isinstance(df.index, pd.MultiIndex):
            row_combinations = list(product(*labels[:-1]))
            index = pd.MultiIndex.from_tuples(row_combinations, names=s.index.names[:-1])
        else:
            index = labels[0]
        df = df.reindex(index=index, columns=labels[-1], fill_value=fill_value)

    return from_frame(df, sort_rows=sort_rows, sort_columns=sort_rows, fill_value=fill_value, meta=meta, **kwargs)
コード例 #7
0
ファイル: pandas.py プロジェクト: alixdamman/larray
def from_frame(df,
               sort_rows=False,
               sort_columns=False,
               parse_header=False,
               unfold_last_axis_name=False,
               fill_value=nan,
               meta=None,
               cartesian_prod=True,
               **kwargs):
    r"""
    Converts Pandas DataFrame into Array.

    Parameters
    ----------
    df : pandas.DataFrame
        Input dataframe. By default, name and labels of the last axis are defined by the name and labels of the
        columns Index of the dataframe unless argument unfold_last_axis_name is set to True.
    sort_rows : bool, optional
        Whether or not to sort the rows alphabetically (sorting is more efficient than not sorting).
        Must be False if `cartesian_prod` is set to False.
        Defaults to False.
    sort_columns : bool, optional
        Whether or not to sort the columns alphabetically (sorting is more efficient than not sorting).
        Must be False if `cartesian_prod` is set to False.
        Defaults to False.
    parse_header : bool, optional
        Whether or not to parse columns labels. Pandas treats column labels as strings.
        If True, column labels are converted into int, float or boolean when possible. Defaults to False.
    unfold_last_axis_name : bool, optional
        Whether or not to extract the names of the last two axes by splitting the name of the last index column of the
        dataframe using ``\``. Defaults to False.
    fill_value : scalar, optional
        Value used to fill cells corresponding to label combinations which are not present in the input DataFrame.
        Defaults to NaN.
    meta : list of pairs or dict or OrderedDict or Metadata, optional
        Metadata (title, description, author, creation_date, ...) associated with the array.
        Keys must be strings. Values must be of type string, int, float, date, time or datetime.
    cartesian_prod : bool, optional
        Whether or not to expand the dataframe to a cartesian product dataframe as needed by Array.
        This is an expensive operation but is absolutely required if you cannot guarantee your dataframe is already
        well formed. If False, arguments `sort_rows` and `sort_columns` must be set to False.
        Defaults to True.
    **kwargs
        Extra arguments forwarded to cartesian_product_df (only used when `cartesian_prod` is True).

    Returns
    -------
    Array

    Raises
    ------
    ValueError
        If `sort_rows` or `sort_columns` is True while `cartesian_prod` is False.

    See Also
    --------
    Array.to_frame

    Examples
    --------
    >>> from larray import ndtest
    >>> df = ndtest((2, 2, 2)).to_frame()
    >>> df                                                                             # doctest: +NORMALIZE_WHITESPACE
    c      c0  c1
    a  b
    a0 b0   0   1
       b1   2   3
    a1 b0   4   5
       b1   6   7
    >>> from_frame(df)
     a  b\c  c0  c1
    a0   b0   0   1
    a0   b1   2   3
    a1   b0   4   5
    a1   b1   6   7

    Names of the last two axes written as ``before_last_axis_name\\last_axis_name``

    >>> df = ndtest((2, 2, 2)).to_frame(fold_last_axis_name=True)
    >>> df                                                                             # doctest: +NORMALIZE_WHITESPACE
            c0  c1
    a  b\c
    a0 b0    0   1
       b1    2   3
    a1 b0    4   5
       b1    6   7
    >>> from_frame(df, unfold_last_axis_name=True)
     a  b\c  c0  c1
    a0   b0   0   1
    a0   b1   2   3
    a1   b0   4   5
    a1   b1   6   7
    """
    # sorting is implemented by cartesian_product_df, so it is not available when that step is skipped.
    # Fail before doing any work.
    if not cartesian_prod and (sort_rows or sort_columns):
        raise ValueError(
            'sort_rows and sort_columns cannot be used when cartesian_prod is set to False. '
            'Please call the method sort_axes on the returned array to sort rows or columns'
        )

    axes_names = [
        decode(name, 'utf8') if isinstance(name, bytes) else name
        for name in df.index.names
    ]

    # handle 2 or more dimensions with the last axis name given using \
    if unfold_last_axis_name:
        if isinstance(axes_names[-1], str) and '\\' in axes_names[-1]:
            last_axes = [name.strip() for name in axes_names[-1].split('\\')]
            axes_names = axes_names[:-1] + last_axes
        else:
            axes_names += [None]
    else:
        axes_names += [df.columns.name]

    if cartesian_prod:
        df, axes_labels = cartesian_product_df(df,
                                               sort_rows=sort_rows,
                                               sort_columns=sort_columns,
                                               fill_value=fill_value,
                                               **kwargs)
    else:
        axes_labels = index_to_labels(df.index, sort=False)

    # Pandas treats column labels as column names (strings) so we need to convert them to values
    if parse_header:
        last_axis_labels = [parse(cell) for cell in df.columns.values]
    else:
        last_axis_labels = list(df.columns.values)
    axes_labels.append(last_axis_labels)

    axes = AxisCollection(
        [Axis(labels, name) for labels, name in zip(axes_labels, axes_names)])
    data = df.values.reshape(axes.shape)
    return Array(data, axes, meta=meta)
コード例 #8
0
def _decode_hdf_labels(values):
    """Decode an array of utf-8 encoded labels read from an HDF file into a unicode array."""
    # this check is there because there are cases where dtype_kind is 'U' but pandas returns
    # an array with object dtype containing bytes instead of a string array, and in that case
    # np.char.decode does not work
    # this is at least the case for Python2 + Pandas 0.24.2 combination
    if values.dtype.kind == 'O':
        return np.array([value.decode('utf-8') for value in values], dtype='U')
    return np.char.decode(values, 'utf-8')


def read_hdf(filepath_or_buffer,
             key,
             fill_value=nan,
             na=nan,
             sort_rows=False,
             sort_columns=False,
             name=None,
             **kwargs):
    r"""Reads a scalar or an axis or group or array named key from a HDF5 file in filepath (path+name)

    Parameters
    ----------
    filepath_or_buffer : str or pandas.HDFStore
        Path and name where the HDF5 file is stored or a HDFStore object.
    key : str or Group
        Name of the scalar or axis or group or array.
    fill_value : scalar or Array, optional
        Value used to fill cells corresponding to label combinations which are not present in the input.
        Defaults to NaN.
    na : scalar, optional
        Deprecated name of `fill_value`. When it is not NaN, it overrides `fill_value` and a FutureWarning is
        issued. Defaults to NaN.
    sort_rows : bool, optional
        Whether or not to sort the rows alphabetically.
        Must be False if the read array has been dumped with an larray version >= 0.30.
        Defaults to False.
    sort_columns : bool, optional
        Whether or not to sort the columns alphabetically.
        Must be False if the read array has been dumped with an larray version >= 0.30.
        Defaults to False.
    name : str, optional
        Name of the axis or group to return. If None, name is set to passed key.
        Defaults to None.
    **kwargs
        Accepted but not used by this function.

    Returns
    -------
    Array

    Raises
    ------
    KeyError
        If no item named `key` exists in the file.

    Examples
    --------
    >>> fname = get_example_filepath('examples.h5')

    Read array by passing its identifier (key) inside the HDF file

    >>> # The data below is derived from a subset of the demo_pjan table from Eurostat
    >>> read_hdf(fname, 'pop')                     # doctest: +SKIP
    country  gender\time      2013      2014      2015
    Belgium         Male   5472856   5493792   5524068
    Belgium       Female   5665118   5687048   5713206
     France         Male  31772665  32045129  32174258
     France       Female  33827685  34120851  34283895
    Germany         Male  39380976  39556923  39835457
    Germany       Female  41142770  41210540  41362080
    """
    if not np.isnan(na):
        fill_value = na
        warnings.warn(
            "read_hdf `na` argument has been renamed to `fill_value`. Please use that instead.",
            FutureWarning,
            stacklevel=2)

    key = _translate_group_key_hdf(key)
    res = None
    with LHDFStore(filepath_or_buffer) as store:
        try:
            pd_obj = store.get(key)
        except KeyError as err:
            filepath = filepath_or_buffer if isinstance(
                filepath_or_buffer, HDFStore) else store.filename
            # chain explicitly so the traceback shows the lookup failure as the direct cause
            raise KeyError(
                f'No item with name {key} has been found in file {filepath}') from err
        attrs = store.get_storer(key).attrs
        writer = attrs.writer if 'writer' in attrs else None
        _type = _get_type_from_attrs(attrs)
        _meta = attrs.metadata if 'metadata' in attrs else None
        # labels/keys flagged with dtype_kind 'U' are stored encoded and must be decoded after reading
        stored_as_unicode = 'dtype_kind' in attrs and attrs['dtype_kind'] == 'U'
        if _type == 'Array':
            # cartesian product is not necessary if the array was written by LArray
            cartesian_prod = writer != 'LArray'
            res = df_asarray(pd_obj,
                             sort_rows=sort_rows,
                             sort_columns=sort_columns,
                             fill_value=fill_value,
                             parse_header=False,
                             cartesian_prod=cartesian_prod)
            if _meta is not None:
                res.meta = _meta
        elif _type == 'Axis':
            if name is None:
                name = str(pd_obj.name)
            # a missing name comes back as the string 'None'
            if name == 'None':
                name = None
            labels = pd_obj.values
            if stored_as_unicode:
                labels = _decode_hdf_labels(labels)
            res = Axis(labels=labels, name=name)
            res._iswildcard = attrs['wildcard']
        elif _type == 'Group':
            if name is None:
                name = str(pd_obj.name)
            if name == 'None':
                name = None
            group_key = pd_obj.values
            if stored_as_unicode:
                # same decoding as for axes, so that object arrays of bytes are handled here too
                group_key = _decode_hdf_labels(group_key)
            axis = read_hdf(filepath_or_buffer, attrs['axis_key'])
            res = LGroup(key=group_key, name=name, axis=axis)
        elif _type in _supported_typenames:
            # scalars are stored as a single-element container
            res = pd_obj.values
            assert len(res) == 1
            res = res[0]
    return res
コード例 #9
0
ファイル: array.py プロジェクト: Avasse/larray
def from_frame(df,
               sort_rows=False,
               sort_columns=False,
               parse_header=False,
               unfold_last_axis_name=False,
               **kwargs):
    """
    Builds an LArray out of a Pandas DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to convert. Unless unfold_last_axis_name is True, the name and labels of the last axis are
        taken from the name and labels of the columns Index of the dataframe.
    sort_rows : bool, optional
        Whether or not to sort the rows alphabetically (sorting is more efficient than not sorting). Defaults to False.
    sort_columns : bool, optional
        Whether or not to sort the columns alphabetically (sorting is more efficient than not sorting).
        Defaults to False.
    parse_header : bool, optional
        Whether or not to parse columns labels. Pandas treats column labels as strings.
        If True, column labels are converted into int, float or boolean when possible. Defaults to False.
    unfold_last_axis_name : bool, optional
        Whether or not to extract the names of the last two axes by splitting the name of the last index column of the
        dataframe using ``\\``. Defaults to False.
    **kwargs
        Extra arguments forwarded to cartesian_product_df.

    Returns
    -------
    LArray

    See Also
    --------
    LArray.to_frame

    Examples
    --------
    >>> df = ndtest((2, 2, 2)).to_frame()
    >>> df                                                                             # doctest: +NORMALIZE_WHITESPACE
    c      c0  c1
    a  b
    a0 b0   0   1
       b1   2   3
    a1 b0   4   5
       b1   6   7
    >>> from_frame(df)
     a  b\\c  c0  c1
    a0   b0   0   1
    a0   b1   2   3
    a1   b0   4   5
    a1   b1   6   7

    Names of the last two axes written as ``before_last_axis_name\\last_axis_name``

    >>> df = ndtest((2, 2, 2)).to_frame(fold_last_axis_name=True)
    >>> df                                                                             # doctest: +NORMALIZE_WHITESPACE
            c0  c1
    a  b\\c
    a0 b0    0   1
       b1    2   3
    a1 b0    4   5
       b1    6   7
    >>> from_frame(df, unfold_last_axis_name=True)
     a  b\\c  c0  c1
    a0   b0   0   1
    a0   b1   2   3
    a1   b0   4   5
    a1   b1   6   7
    """
    axes_names = [decode(name, 'utf8') for name in df.index.names]

    if not unfold_last_axis_name:
        # the last axis is named after the columns index
        axes_names.append(df.columns.name)
    else:
        # 2 or more dimensions with the names of the last two axes given as "before_last\last"
        last_name = axes_names[-1]
        if isinstance(last_name, basestring) and '\\' in last_name:
            axes_names = axes_names[:-1] + [part.strip() for part in last_name.split('\\')]
        else:
            axes_names.append(None)

    df, axes_labels = cartesian_product_df(df, sort_rows=sort_rows, sort_columns=sort_columns, **kwargs)

    # Pandas treats column labels as column names (strings) so we need to convert them to values
    if parse_header:
        last_axis_labels = [parse(cell) for cell in df.columns.values]
    else:
        last_axis_labels = list(df.columns.values)
    axes_labels.append(last_axis_labels)

    axes_names = [None if name is None else str(name) for name in axes_names]
    axes = [Axis(labels, name) for labels, name in zip(axes_labels, axes_names)]
    shape = [len(axis) for axis in axes]
    return LArray(df.values.reshape(shape), axes)
コード例 #10
0
ファイル: hdf.py プロジェクト: liam2/larray
def read_hdf(filepath_or_buffer, key, fill_value=nan, na=nan, sort_rows=False, sort_columns=False,
             name=None, **kwargs):
    """Loads the axis, group or array stored under key in a HDF5 file.

    Parameters
    ----------
    filepath_or_buffer : str or pandas.HDFStore
        Path and name where the HDF5 file is stored or a HDFStore object.
    key : str or Group
        Name of the object to read.
    fill_value : scalar or LArray, optional
        Value used to fill cells corresponding to label combinations which are not present in the input.
        Defaults to NaN.
    na : scalar, optional
        Former name of `fill_value`. When it is not NaN, it replaces `fill_value` and a FutureWarning is issued.
        Defaults to NaN.
    sort_rows : bool, optional
        Whether or not to sort the rows alphabetically.
        Must be False if the read array has been dumped with an larray version >= 0.30.
        Defaults to False.
    sort_columns : bool, optional
        Whether or not to sort the columns alphabetically.
        Must be False if the read array has been dumped with an larray version >= 0.30.
        Defaults to False.
    name : str, optional
        Name of the axis or group to return. If None, name is set to passed key.
        Defaults to None.
    **kwargs
        Accepted but not used by this function.

    Returns
    -------
    LArray

    Examples
    --------
    >>> fname = get_example_filepath('examples.h5')

    Read array by passing its identifier (key) inside the HDF file

    >>> # The data below is derived from a subset of the demo_pjan table from Eurostat
    >>> read_hdf(fname, 'pop')
    country  gender\\time      2013      2014      2015
    Belgium         Male   5472856   5493792   5524068
    Belgium       Female   5665118   5687048   5713206
     France         Male  31772665  31936596  32175328
     France       Female  33827685  34005671  34280951
    Germany         Male  39380976  39556923  39835457
    Germany       Female  41142770  41210540  41362080
    """
    if not np.isnan(na):
        fill_value = na
        warnings.warn("read_hdf `na` argument has been renamed to `fill_value`. Please use that instead.",
                      FutureWarning, stacklevel=2)

    key = _translate_group_key_hdf(key)
    res = None
    with LHDFStore(filepath_or_buffer) as store:
        pd_obj = store.get(key)
        attrs = store.get_storer(key).attrs
        writer = attrs.writer if 'writer' in attrs else None
        # for backward compatibility but any object read from an hdf file should have an attribute 'type'
        _type = attrs.type if 'type' in attrs else 'Array'
        _meta = attrs.metadata if 'metadata' in attrs else None

        if _type in ('Axis', 'Group'):
            # axes and groups share the same naming rule: fall back on the stored name, and treat the
            # string 'None' as "no name"
            if name is None:
                name = str(pd_obj.name)
            if name == 'None':
                name = None

        if _type == 'Array':
            # cartesian product is not necessary if the array was written by LArray
            needs_cartesian_prod = writer != 'LArray'
            res = df_aslarray(pd_obj, sort_rows=sort_rows, sort_columns=sort_columns, fill_value=fill_value,
                              parse_header=False, cartesian_prod=needs_cartesian_prod)
            if _meta is not None:
                res.meta = _meta
        elif _type == 'Axis':
            res = Axis(labels=pd_obj.values, name=name)
            res._iswildcard = attrs['wildcard']
        elif _type == 'Group':
            # the axis of a group is stored separately, under the key recorded in the group attributes
            group_axis = read_hdf(filepath_or_buffer, attrs['axis_key'])
            res = LGroup(key=pd_obj.values, name=name, axis=group_axis)
    return res