def array(self, data, row_labels=None, column_labels=None, names=None):
    """
    Build an LArray from a data range and optional label ranges.

    Parameters
    ----------
    data : str
        range for data
    row_labels : str, optional
        range for row labels
    column_labels : str, optional
        range for column labels
    names : list of str, optional
        Axis names, paired in order with (row labels, column labels). When given, each pair is wrapped
        in an Axis; otherwise the (possibly None) labels are passed to LArray unchanged.

    Returns
    -------
    LArray
    """
    row_values = None if row_labels is None else np.asarray(self[row_labels])
    column_values = None if column_labels is None else np.asarray(self[column_labels])
    if names is None:
        axes = (row_values, column_values)
    else:
        axes = [Axis(axis_labels, axis_name)
                for axis_labels, axis_name in zip((row_values, column_values), names)]
    # _converted_value is used implicitly via Range.__array__
    values = np.asarray(self[data])
    return LArray(values, axes)
def __eq__(self, other):
    """
    Compare the items of this object with those of `other`, key by key.

    The keys of this object come first, followed by the keys found only in `other`. Each pair of
    items (fetched with ``get``) is compared using ``larray_nan_equal``.

    Returns
    -------
    LArray
        One comparison result per key, along an axis named 'name'.
    """
    known = set(self.keys())
    all_keys = list(self.keys())
    for key in other.keys():
        if key not in known:
            all_keys.append(key)
    res = []
    for key in all_keys:
        res.append(larray_nan_equal(self.get(key), other.get(key)))
    name_axis = Axis(all_keys, 'name')
    return LArray(res, [name_axis])
def df_aslarray(df, sort_rows=False, sort_columns=False, raw=False, parse_header=True, **kwargs):
    """
    Convert a Pandas DataFrame into an LArray.

    Parameters
    ----------
    df : pandas.DataFrame
        Input dataframe.
    sort_rows : bool, optional
        Forwarded to cartesian_product_df. Defaults to False.
    sort_columns : bool, optional
        Forwarded to cartesian_product_df. Defaults to False.
    raw : bool, optional
        Whether the dataframe was read without any index (ie 2D dataframe), irrespective of the actual
        data dimensionality. In that case the axes names are taken from the leading columns.
        Defaults to False.
    parse_header : bool, optional
        Whether or not to pass the column labels through ``parse``. Defaults to True.
    **kwargs
        Forwarded to cartesian_product_df.

    Returns
    -------
    LArray
    """
    if raw:
        columns = df.columns.values.tolist()
        # position of the first column whose name is a string containing '\' (None if there is none)
        pos_last = next((pos for pos, col in enumerate(columns) if isinstance(col, basestring) and '\\' in col),
                        None)
        if pos_last is None:
            # one dimension: we assume first column will not contain data
            axes_names = columns[:1]
            df = df.iloc[:, 1:]
        else:
            axes_names = columns[:pos_last + 1]
            # This is required to handle int column names (otherwise we can simply use column positions in
            # set_index). This is NOT the same as df.columns[list(range(...))] !
            index_columns = [df.columns[i] for i in range(pos_last + 1)]
            # TODO: we should pass a flag to df_aslarray so that we can use inplace=True here
            df = df.set_index(index_columns)
    else:
        axes_names = [decode(index_name, 'utf8') for index_name in df.index.names]

    last_name = axes_names[-1]
    if isinstance(last_name, basestring) and '\\' in last_name:
        # 2 or more dimensions with the last axis name given using \
        axes_names = axes_names[:-1] + [part.strip() for part in last_name.split('\\')]
    elif len(df) == 1 and axes_names == [None]:
        # 1D
        axes_names = [df.columns.name]
    elif len(df) > 1:
        # 2 or more dimensions with the last axis name given as the columns index name
        axes_names = axes_names + [df.columns.name]

    if len(axes_names) > 1:
        df, axes_labels = cartesian_product_df(df, sort_rows=sort_rows, sort_columns=sort_columns, **kwargs)
    else:
        axes_labels = []

    # we could inline df_aslarray into the functions that use it, so that the original (non-cartesian) df is
    # freed from memory at this point, but it would be much uglier and would not lower the peak memory usage
    # which happens during cartesian_product_df.reindex

    # Pandas treats column labels as column names (strings) so we need to convert them to values
    if parse_header:
        last_axis_labels = [parse(cell) for cell in df.columns.values]
    else:
        last_axis_labels = list(df.columns.values)
    axes_labels.append(last_axis_labels)

    axes_names = [axis_name if axis_name is None else str(axis_name) for axis_name in axes_names]
    axes = [Axis(labels, axis_name) for labels, axis_name in zip(axes_labels, axes_names)]
    shape = [len(axis) for axis in axes]
    return LArray(df.values.reshape(shape), axes)
def from_series(s, sort_rows=False):
    """
    Converts Pandas Series into 1D LArray.

    The axis is named after the series name or, when the series has no name, after the name of its
    index. A name which is not None is converted to a string.

    Parameters
    ----------
    s : Pandas Series
        Input Pandas Series.
    sort_rows : bool, optional
        Whether or not to sort the rows alphabetically. Defaults to False.

    Returns
    -------
    LArray
    """
    axis_name = s.name
    if axis_name is None:
        axis_name = s.index.name
    if axis_name is not None:
        axis_name = str(axis_name)
    if sort_rows:
        s = s.sort_index()
    axis = Axis(s.index.values, axis_name)
    return LArray(s.values, axis)
def array_equals(self, other):
    """Test if arrays of the current session are equal to those of another session.

    Equivalent to apply :py:meth:`LArray.equals` with flag nan_equals=True to all arrays from two sessions.
    An item of the current session which cannot be converted to an array is reported as not equal.

    Parameters
    ----------
    other : Session
        Session to compare with.

    Returns
    -------
    Boolean LArray

    See Also
    --------
    Session.equals

    Examples
    --------
    >>> s1 = Session([('arr1', ndtest(2)), ('arr2', ndtest((2, 2)))])
    >>> s2 = Session([('arr1', ndtest(2)), ('arr2', ndtest((2, 2)))])
    >>> s1.array_equals(s2)
    name  arr1  arr2
          True  True

    Different value(s)

    >>> s2.arr1['a1'] = 0
    >>> s1.array_equals(s2)
    name   arr1  arr2
          False  True

    Different label(s)

    >>> s2.arr2 = ndtest("b=b0,b1; a=a0,a1")
    >>> s1.array_equals(s2)
    name   arr1   arr2
          False  False

    Extra/missing array(s)

    >>> s2.arr3 = ndtest((3, 3))
    >>> s1.array_equals(s2)
    name   arr1   arr2   arr3
          False  False  False
    """
    def _nan_equal(left, right):
        # anything which cannot be converted to an array is considered different
        try:
            left = aslarray(left)
        except Exception:
            return False
        return left.equals(right, nan_equals=True)

    known = set(self.keys())
    # keys of this session first, then those only present in the other session
    all_keys = list(self.keys())
    for key in other.keys():
        if key not in known:
            all_keys.append(key)

    res = []
    for key in all_keys:
        res.append(_nan_equal(self.get(key), other.get(key)))
    return LArray(res, [Axis(all_keys, 'name')])
def from_series(s, sort_rows=False, fill_value=nan, meta=None, **kwargs):
    r"""
    Converts Pandas Series into Array.

    A series with a simple index gives a 1D array. A series with a MultiIndex is first unstacked on
    its last level and then converted using from_frame.

    Parameters
    ----------
    s : Pandas Series
        Input Pandas Series.
    sort_rows : bool, optional
        Whether or not to sort the rows alphabetically. Defaults to False.
    fill_value : scalar, optional
        Value used to fill cells corresponding to label combinations which are not present in the input Series.
        Defaults to NaN.
    meta : list of pairs or dict or OrderedDict or Metadata, optional
        Metadata (title, description, author, creation_date, ...) associated with the array.
        Keys must be strings. Values must be of type string, int, float, date, time or datetime.
    **kwargs
        Forwarded to from_frame (only used when the series has a MultiIndex).

    Returns
    -------
    Array

    See Also
    --------
    Array.to_series

    Examples
    --------
    >>> from larray import ndtest
    >>> s = ndtest((2, 2, 2), dtype=float).to_series()
    >>> s                                                                             # doctest: +NORMALIZE_WHITESPACE
    a   b   c
    a0  b0  c0    0.0
            c1    1.0
        b1  c0    2.0
            c1    3.0
    a1  b0  c0    4.0
            c1    5.0
        b1  c0    6.0
            c1    7.0
    dtype: float64
    >>> from_series(s)
     a  b\c   c0   c1
    a0   b0  0.0  1.0
    a0   b1  2.0  3.0
    a1   b0  4.0  5.0
    a1   b1  6.0  7.0
    """
    if not isinstance(s.index, pd.MultiIndex):
        # 1D case: the axis is named after the series or, failing that, after its index
        raw_name = s.name if s.name is not None else s.index.name
        name = decode(raw_name, 'utf8')
        if sort_rows:
            s = s.sort_index()
        return Array(s.values, Axis(s.index.values, name), meta=meta)

    # TODO: use argument sort=False when it will be available
    #       (see https://github.com/pandas-dev/pandas/issues/15105)
    df = s.unstack(level=-1, fill_value=fill_value)
    if not sort_rows:
        # pandas (un)stack and pivot(_table) methods return a Dataframe/Series with sorted index and columns,
        # so the original order of the labels must be restored
        labels = index_to_labels(s.index, sort=False)
        if isinstance(df.index, pd.MultiIndex):
            row_combinations = list(product(*labels[:-1]))
            index = pd.MultiIndex.from_tuples(row_combinations, names=s.index.names[:-1])
        else:
            index = labels[0]
        df = df.reindex(index=index, columns=labels[-1], fill_value=fill_value)
    return from_frame(df, sort_rows=sort_rows, sort_columns=sort_rows, fill_value=fill_value, meta=meta, **kwargs)
def from_frame(df, sort_rows=False, sort_columns=False, parse_header=False, unfold_last_axis_name=False,
               fill_value=nan, meta=None, cartesian_prod=True, **kwargs):
    r"""
    Converts Pandas DataFrame into Array.

    Parameters
    ----------
    df : pandas.DataFrame
        Input dataframe. By default, name and labels of the last axis are defined by the name and labels of the
        columns Index of the dataframe unless argument unfold_last_axis_name is set to True.
    sort_rows : bool, optional
        Whether or not to sort the rows alphabetically (sorting is more efficient than not sorting).
        Must be False if `cartesian_prod` is set to False.
        Defaults to False.
    sort_columns : bool, optional
        Whether or not to sort the columns alphabetically (sorting is more efficient than not sorting).
        Must be False if `cartesian_prod` is set to False.
        Defaults to False.
    parse_header : bool, optional
        Whether or not to parse columns labels. Pandas treats column labels as strings.
        If True, column labels are converted into int, float or boolean when possible. Defaults to False.
    unfold_last_axis_name : bool, optional
        Whether or not to extract the names of the last two axes by splitting the name of the last index column of the
        dataframe using ``\``. Defaults to False.
    fill_value : scalar, optional
        Value used to fill cells corresponding to label combinations which are not present in the input DataFrame.
        Defaults to NaN.
    meta : list of pairs or dict or OrderedDict or Metadata, optional
        Metadata (title, description, author, creation_date, ...) associated with the array.
        Keys must be strings. Values must be of type string, int, float, date, time or datetime.
    cartesian_prod : bool, optional
        Whether or not to expand the dataframe to a cartesian product dataframe as needed by Array.
        This is an expensive operation but is absolutely required if you cannot guarantee your dataframe is already
        well formed. If False, arguments `sort_rows` and `sort_columns` must be set to False.
        Defaults to True.
    **kwargs
        Forwarded to cartesian_product_df (only used when `cartesian_prod` is True).

    Returns
    -------
    Array

    Raises
    ------
    ValueError
        If `sort_rows` or `sort_columns` is True while `cartesian_prod` is False.

    See Also
    --------
    Array.to_frame

    Examples
    --------
    >>> from larray import ndtest
    >>> df = ndtest((2, 2, 2)).to_frame()
    >>> df                                                                             # doctest: +NORMALIZE_WHITESPACE
    c      c0  c1
    a  b
    a0 b0   0   1
       b1   2   3
    a1 b0   4   5
       b1   6   7
    >>> from_frame(df)
     a  b\c  c0  c1
    a0   b0   0   1
    a0   b1   2   3
    a1   b0   4   5
    a1   b1   6   7

    Names of the last two axes written as ``before_last_axis_name\\last_axis_name``

    >>> df = ndtest((2, 2, 2)).to_frame(fold_last_axis_name=True)
    >>> df                                                                             # doctest: +NORMALIZE_WHITESPACE
            c0  c1
    a  b\c
    a0 b0    0   1
       b1    2   3
    a1 b0    4   5
       b1    6   7
    >>> from_frame(df, unfold_last_axis_name=True)
     a  b\c  c0  c1
    a0   b0   0   1
    a0   b1   2   3
    a1   b0   4   5
    a1   b1   6   7
    """
    # only bytes names need decoding; other names (str, None, int, ...) are kept as they are
    axes_names = [decode(name, 'utf8') if isinstance(name, bytes) else name
                  for name in df.index.names]

    # handle 2 or more dimensions with the last axis name given using \
    if unfold_last_axis_name:
        if isinstance(axes_names[-1], str) and '\\' in axes_names[-1]:
            last_axes = [name.strip() for name in axes_names[-1].split('\\')]
            axes_names = axes_names[:-1] + last_axes
        else:
            # nothing to unfold: the last axis is anonymous
            axes_names += [None]
    else:
        axes_names += [df.columns.name]

    if cartesian_prod:
        df, axes_labels = cartesian_product_df(df, sort_rows=sort_rows, sort_columns=sort_columns,
                                               fill_value=fill_value, **kwargs)
    else:
        # sorting is done by cartesian_product_df, so it cannot be honored when that step is skipped
        if sort_rows or sort_columns:
            raise ValueError('sort_rows and sort_columns cannot be used when cartesian_prod is set to False. '
                             'Please call the method sort_axes on the returned array to sort rows or columns')
        axes_labels = index_to_labels(df.index, sort=False)

    # Pandas treats column labels as column names (strings) so we need to convert them to values
    last_axis_labels = [parse(cell) for cell in df.columns.values] if parse_header else list(df.columns.values)
    axes_labels.append(last_axis_labels)

    axes = AxisCollection([Axis(labels, name) for labels, name in zip(axes_labels, axes_names)])
    data = df.values.reshape(axes.shape)
    return Array(data, axes, meta=meta)
def read_hdf(filepath_or_buffer, key, fill_value=nan, na=nan, sort_rows=False, sort_columns=False,
             name=None, **kwargs):
    r"""Reads a scalar or an axis or group or array named key from a HDF5 file in filepath (path+name)

    Parameters
    ----------
    filepath_or_buffer : str or pandas.HDFStore
        Path and name where the HDF5 file is stored or a HDFStore object.
    key : str or Group
        Name of the scalar or axis or group or array.
    fill_value : scalar or Array, optional
        Value used to fill cells corresponding to label combinations which are not present in the input.
        Defaults to NaN.
    na : scalar, optional
        Deprecated name of `fill_value`. When it is not NaN, it overrides `fill_value` and a FutureWarning
        is issued.
    sort_rows : bool, optional
        Whether or not to sort the rows alphabetically.
        Must be False if the read array has been dumped with an larray version >= 0.30.
        Defaults to False.
    sort_columns : bool, optional
        Whether or not to sort the columns alphabetically.
        Must be False if the read array has been dumped with an larray version >= 0.30.
        Defaults to False.
    name : str, optional
        Name of the axis or group to return. If None, name is set to passed key.
        Defaults to None.
    **kwargs
        Accepted but not used by this function.

    Returns
    -------
    Array

    Raises
    ------
    KeyError
        If no item named `key` is found in the file.

    Examples
    --------
    >>> fname = get_example_filepath('examples.h5')

    Read array by passing its identifier (key) inside the HDF file

    >>> # The data below is derived from a subset of the demo_pjan table from Eurostat
    >>> read_hdf(fname, 'pop')                     # doctest: +SKIP
    country  gender\time      2013      2014      2015
    Belgium         Male   5472856   5493792   5524068
    Belgium       Female   5665118   5687048   5713206
     France         Male  31772665  32045129  32174258
     France       Female  33827685  34120851  34283895
    Germany         Male  39380976  39556923  39835457
    Germany       Female  41142770  41210540  41362080
    """
    if not np.isnan(na):
        fill_value = na
        warnings.warn("read_hdf `na` argument has been renamed to `fill_value`. Please use that instead.",
                      FutureWarning, stacklevel=2)

    key = _translate_group_key_hdf(key)
    res = None
    with LHDFStore(filepath_or_buffer) as store:
        try:
            pd_obj = store.get(key)
        except KeyError:
            if isinstance(filepath_or_buffer, HDFStore):
                filepath = filepath_or_buffer
            else:
                filepath = store.filename
            raise KeyError(f'No item with name {key} has been found in file {filepath}')

        storer_attrs = store.get_storer(key).attrs
        writer = storer_attrs.writer if 'writer' in storer_attrs else None
        obj_type = _get_type_from_attrs(storer_attrs)
        metadata = storer_attrs.metadata if 'metadata' in storer_attrs else None

        if obj_type == 'Array':
            # cartesian product is not necessary if the array was written by LArray
            res = df_asarray(pd_obj, sort_rows=sort_rows, sort_columns=sort_columns, fill_value=fill_value,
                             parse_header=False, cartesian_prod=(writer != 'LArray'))
            if metadata is not None:
                res.meta = metadata
        elif obj_type in ('Axis', 'Group'):
            # axes and groups share the same rule to determine their name
            if name is None:
                name = str(pd_obj.name)
            if name == 'None':
                name = None
            stored_as_unicode = 'dtype_kind' in storer_attrs and storer_attrs['dtype_kind'] == 'U'
            if obj_type == 'Axis':
                labels = pd_obj.values
                if stored_as_unicode:
                    # this check is there because there are cases where dtype_kind is 'U' but pandas returns
                    # an array with object dtype containing bytes instead of a string array, and in that case
                    # np.char.decode does not work
                    # this is at least the case for Python2 + Pandas 0.24.2 combination
                    if labels.dtype.kind == 'O':
                        labels = np.array([label.decode('utf-8') for label in labels], dtype='U')
                    else:
                        labels = np.char.decode(labels, 'utf-8')
                res = Axis(labels=labels, name=name)
                res._iswildcard = storer_attrs['wildcard']
            else:
                group_key = pd_obj.values
                if stored_as_unicode:
                    group_key = np.char.decode(group_key, 'utf-8')
                # the axis the group refers to is stored under its own key in the same file
                axis = read_hdf(filepath_or_buffer, storer_attrs['axis_key'])
                res = LGroup(key=group_key, name=name, axis=axis)
        elif obj_type in _supported_typenames:
            # scalars are stored as a single value
            res = pd_obj.values
            assert len(res) == 1
            res = res[0]
    return res
def from_frame(df, sort_rows=False, sort_columns=False, parse_header=False, unfold_last_axis_name=False, **kwargs):
    """
    Converts Pandas DataFrame into LArray.

    Parameters
    ----------
    df : pandas.DataFrame
        Input dataframe. By default, name and labels of the last axis are defined by the name and labels of the
        columns Index of the dataframe unless argument unfold_last_axis_name is set to True.
    sort_rows : bool, optional
        Whether or not to sort the rows alphabetically (sorting is more efficient than not sorting).
        Defaults to False.
    sort_columns : bool, optional
        Whether or not to sort the columns alphabetically (sorting is more efficient than not sorting).
        Defaults to False.
    parse_header : bool, optional
        Whether or not to parse columns labels. Pandas treats column labels as strings.
        If True, column labels are converted into int, float or boolean when possible. Defaults to False.
    unfold_last_axis_name : bool, optional
        Whether or not to extract the names of the last two axes by splitting the name of the last index column of the
        dataframe using ``\\``. Defaults to False.
    **kwargs
        Forwarded to cartesian_product_df.

    Returns
    -------
    LArray

    See Also
    --------
    LArray.to_frame

    Examples
    --------
    >>> df = ndtest((2, 2, 2)).to_frame()
    >>> df                                                                             # doctest: +NORMALIZE_WHITESPACE
    c      c0  c1
    a  b
    a0 b0   0   1
       b1   2   3
    a1 b0   4   5
       b1   6   7
    >>> from_frame(df)
     a  b\\c  c0  c1
    a0   b0   0   1
    a0   b1   2   3
    a1   b0   4   5
    a1   b1   6   7

    Names of the last two axes written as ``before_last_axis_name\\last_axis_name``

    >>> df = ndtest((2, 2, 2)).to_frame(fold_last_axis_name=True)
    >>> df                                                                             # doctest: +NORMALIZE_WHITESPACE
            c0  c1
    a  b\\c
    a0 b0    0   1
       b1    2   3
    a1 b0    4   5
       b1    6   7
    >>> from_frame(df, unfold_last_axis_name=True)
     a  b\\c  c0  c1
    a0   b0   0   1
    a0   b1   2   3
    a1   b0   4   5
    a1   b1   6   7
    """
    axes_names = [decode(index_name, 'utf8') for index_name in df.index.names]

    if not unfold_last_axis_name:
        # the last axis is named after the columns index
        axes_names = axes_names + [df.columns.name]
    else:
        # handle 2 or more dimensions with the last axis name given using \
        folded_name = axes_names[-1]
        if isinstance(folded_name, basestring) and '\\' in folded_name:
            axes_names = axes_names[:-1] + [part.strip() for part in folded_name.split('\\')]
        else:
            axes_names = axes_names + [None]

    df, axes_labels = cartesian_product_df(df, sort_rows=sort_rows, sort_columns=sort_columns, **kwargs)

    # Pandas treats column labels as column names (strings) so we need to convert them to values
    if parse_header:
        last_axis_labels = [parse(cell) for cell in df.columns.values]
    else:
        last_axis_labels = list(df.columns.values)
    axes_labels.append(last_axis_labels)

    axes_names = [axis_name if axis_name is None else str(axis_name) for axis_name in axes_names]
    axes = [Axis(labels, axis_name) for labels, axis_name in zip(axes_labels, axes_names)]
    shape = [len(axis) for axis in axes]
    return LArray(df.values.reshape(shape), axes)
def read_hdf(filepath_or_buffer, key, fill_value=nan, na=nan, sort_rows=False, sort_columns=False,
             name=None, **kwargs):
    """Reads an axis or group or array named key from a HDF5 file in filepath (path+name)

    Parameters
    ----------
    filepath_or_buffer : str or pandas.HDFStore
        Path and name where the HDF5 file is stored or a HDFStore object.
    key : str or Group
        Name of the axis or group or array.
    fill_value : scalar or LArray, optional
        Value used to fill cells corresponding to label combinations which are not present in the input.
        Defaults to NaN.
    na : scalar, optional
        Deprecated name of `fill_value`. When it is not NaN, it overrides `fill_value` and a FutureWarning
        is issued.
    sort_rows : bool, optional
        Whether or not to sort the rows alphabetically.
        Must be False if the read array has been dumped with an larray version >= 0.30.
        Defaults to False.
    sort_columns : bool, optional
        Whether or not to sort the columns alphabetically.
        Must be False if the read array has been dumped with an larray version >= 0.30.
        Defaults to False.
    name : str, optional
        Name of the axis or group to return. If None, name is set to passed key.
        Defaults to None.
    **kwargs
        Accepted but not used by this function.

    Returns
    -------
    LArray

    Examples
    --------
    >>> fname = get_example_filepath('examples.h5')

    Read array by passing its identifier (key) inside the HDF file

    >>> # The data below is derived from a subset of the demo_pjan table from Eurostat
    >>> read_hdf(fname, 'pop')
    country  gender\\time      2013      2014      2015
    Belgium         Male   5472856   5493792   5524068
    Belgium       Female   5665118   5687048   5713206
     France         Male  31772665  31936596  32175328
     France       Female  33827685  34005671  34280951
    Germany         Male  39380976  39556923  39835457
    Germany       Female  41142770  41210540  41362080
    """
    if not np.isnan(na):
        fill_value = na
        warnings.warn("read_hdf `na` argument has been renamed to `fill_value`. Please use that instead.",
                      FutureWarning, stacklevel=2)

    key = _translate_group_key_hdf(key)
    res = None
    with LHDFStore(filepath_or_buffer) as store:
        pd_obj = store.get(key)
        storer_attrs = store.get_storer(key).attrs
        writer = storer_attrs.writer if 'writer' in storer_attrs else None
        # for backward compatibility but any object read from an hdf file should have an attribute 'type'
        obj_type = storer_attrs.type if 'type' in storer_attrs else 'Array'
        metadata = storer_attrs.metadata if 'metadata' in storer_attrs else None

        if obj_type == 'Array':
            # cartesian product is not necessary if the array was written by LArray
            res = df_aslarray(pd_obj, sort_rows=sort_rows, sort_columns=sort_columns, fill_value=fill_value,
                              parse_header=False, cartesian_prod=(writer != 'LArray'))
            if metadata is not None:
                res.meta = metadata
        elif obj_type in ('Axis', 'Group'):
            # axes and groups share the same rule to determine their name
            if name is None:
                name = str(pd_obj.name)
            if name == 'None':
                name = None
            if obj_type == 'Axis':
                res = Axis(labels=pd_obj.values, name=name)
                res._iswildcard = storer_attrs['wildcard']
            else:
                # the axis the group refers to is stored under its own key in the same file
                axis = read_hdf(filepath_or_buffer, storer_attrs['axis_key'])
                res = LGroup(key=pd_obj.values, name=name, axis=axis)
    return res