Example #1
import warnings

from astropy.table import meta, serialize
from astropy.utils.exceptions import AstropyUserWarning


def _decode_mixins(tbl):
    """Decode a Table ``tbl`` that has astropy Columns plus appropriate
    metadata into the corresponding table with mixin columns.
    """
    # If available, read the __serialized_columns__ meta info, which is
    # stored in FITS COMMENT cards between two sentinels.
    try:
        i0 = tbl.meta['comments'].index('--BEGIN-ASTROPY-SERIALIZED-COLUMNS--')
        i1 = tbl.meta['comments'].index('--END-ASTROPY-SERIALIZED-COLUMNS--')
    except (ValueError, KeyError):
        return tbl

    # The YAML data are split into COMMENT cards, with lines longer than 70
    # characters being split with a continuation character \ (backslash).
    # Strip the backslashes and join together.
    continuation_line = False
    lines = []
    for line in tbl.meta['comments'][i0 + 1:i1]:
        if continuation_line:
            lines[-1] = lines[-1] + line[:70]
        else:
            lines.append(line[:70])
        continuation_line = len(line) == 71

    del tbl.meta['comments'][i0:i1 + 1]
    if not tbl.meta['comments']:
        del tbl.meta['comments']

    try:
        info = meta.get_header_from_yaml(lines)
    except ImportError as exc:
        if 'PyYAML package is required' in str(exc):
            warnings.warn(
                "the file contains information about Astropy native objects "
                "(mixin columns) that have been serialized when writing it, "
                "but the PyYAML package is required to read those. Without "
                "this package some information will be missing in the table",
                AstropyUserWarning
            )
            return tbl
        else:
            raise

    # Add serialized column information to table meta for use in constructing mixins
    tbl.meta['__serialized_columns__'] = info['meta']['__serialized_columns__']

    # Use the `datatype` attribute info to update column attributes that are
    # NOT already handled via standard FITS column keys (name, dtype, unit).
    for col in info['datatype']:
        for attr in ['description', 'meta']:
            if attr in col:
                setattr(tbl[col['name']].info, attr, col[attr])

    # Construct new table with mixins, using tbl.meta['__serialized_columns__']
    # as guidance.
    tbl = serialize._construct_mixins_from_columns(tbl)

    return tbl
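For context, here is a minimal round-trip sketch (assuming astropy with PyYAML installed; the filename is illustrative) that produces the COMMENT sentinels `_decode_mixins` looks for:

from astropy.table import Table
from astropy.coordinates import SkyCoord
from astropy.io import fits

# Write a table containing a SkyCoord mixin column; astropy serializes the
# mixin as plain columns plus a YAML block in FITS COMMENT cards.
t = Table({'sc': SkyCoord([10.0, 20.0], [-30.0, -40.0], unit='deg')})
t.write('mixin_demo.fits', overwrite=True)  # filename is illustrative

with fits.open('mixin_demo.fits') as hdul:
    comments = list(hdul[1].header['COMMENT'])
# comments should contain '--BEGIN-ASTROPY-SERIALIZED-COLUMNS--' and the
# matching END sentinel around the YAML block that _decode_mixins parses.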
Example #2
    def __call__(self, cols, meta):
        # Convert to a Table with all plain Column subclass columns
        out = super().__call__(cols, meta)

        # If mixin columns exist (based on the special '__mixin_columns__'
        # key in the table ``meta``), then use that information to construct
        # appropriate mixin columns and remove the original data columns.
        # If no __mixin_columns__ exists then this function just passes back
        # the input table.
        out = serialize._construct_mixins_from_columns(out)

        return out
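As a quick illustration of what `_construct_mixins_from_columns` undoes, here is a hedged round-trip sketch; it assumes a recent astropy where the encoding counterpart `represent_mixins_as_columns` is exposed in `astropy.table.serialize`:

from astropy.table import Table, serialize
from astropy.time import Time

t = Table({'time': Time([1.0, 2.0], format='mjd')})
encoded = serialize.represent_mixins_as_columns(t)
# encoded now holds plain Columns (e.g. 'time.jd1', 'time.jd2') plus the
# serialization info in encoded.meta; reconstruction reverses this:
decoded = serialize._construct_mixins_from_columns(encoded)
assert isinstance(decoded['time'], Time)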
Example #3
import numpy as np
from astropy import units as u
from astropy.table import Table, meta, serialize


def recursively_read_dict_contents(input):
    """
    Recursively read a dictionary, initializing quantities and tables
    from a dictionary read from an HDF5 file.

    Parameters
    ----------
    input : dict
        Dictionary read from HDF5.

    Returns
    -------
    dict
        The same dictionary, with quantities and tables reconstructed.

    """
    new_keys = list(input.keys())
    # if all(elem in new_keys for elem in ['wl_grid', 'data', 'time_grid']):
    #     wl = input['wl_grid']['value'] * u.Unit(input['wl_grid']['unit'])
    #     data = input['data']['value'] * u.Unit(input['data']['unit'])
    #     time = input['time_grid']['value'] * u.Unit(input['time_grid']['unit'])
    #     input = signal[str(input['data']['unit'])](wl, data, time)
    if all(elem in new_keys for elem in ['value', 'unit']):
        input['value'] = input['value'] * u.Unit(input['unit'])
    if any('.__table_column_meta__' in elem for elem in new_keys):
        table_keys = [
            elem for elem in new_keys if '.__table_column_meta__' in elem
        ]
        table_keys = (elem.split('.')[0] for elem in table_keys)
        for k in table_keys:
            table = Table(np.array(input[k]))
            header = meta.get_header_from_yaml(
                h.decode('utf-8')
                for h in input['{}.__table_column_meta__'.format(k)])
            header_cols = dict((x['name'], x) for x in header['datatype'])
            for col in table.columns.values():
                for attr in ('description', 'format', 'unit', 'meta'):
                    if attr in header_cols[col.name]:
                        setattr(col, attr, header_cols[col.name][attr])
            table = serialize._construct_mixins_from_columns(table)
            try:
                header['meta'].pop('__serialized_columns__')
                table.meta = header['meta']
            except KeyError:
                pass
            input[k] = table
    for key in new_keys:
        if isinstance(input[key], dict):
            input[key] = recursively_read_dict_contents(input[key])
    return input
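The function expects a plain nested dict, not an open HDF5 handle. A hypothetical companion loader (name and structure assumed, not part of the original code) that builds such a dict with h5py:

import h5py

def read_hdf5_to_dict(group):
    """Recursively copy an h5py group into the nested dict expected above."""
    out = {}
    for key, item in group.items():
        if isinstance(item, h5py.Group):
            out[key] = read_hdf5_to_dict(item)
        else:
            out[key] = item[()]  # read the dataset into memory
    return out

with h5py.File('data.h5', 'r') as f:  # filename is illustrative
    contents = recursively_read_dict_contents(read_hdf5_to_dict(f))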
Example #4
import numpy as np
from astropy.table import Table, meta, serialize


def load_table(input_table, k):
    table = Table(np.array(input_table[k]))
    header = meta.get_header_from_yaml(
        h.decode('utf-8')
        for h in input_table['{}.__table_column_meta__'.format(k)])
    header_cols = dict((x['name'], x) for x in header['datatype'])
    for col in table.columns.values():
        for attr in ('description', 'format', 'unit', 'meta'):
            if attr in header_cols[col.name]:
                setattr(col, attr, header_cols[col.name][attr])
    table = serialize._construct_mixins_from_columns(table)
    try:
        header['meta'].pop('__serialized_columns__')
        table.meta = header['meta']
    except KeyError:
        pass
    return table
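Hypothetical usage, assuming the table was written with astropy's HDF5 writer and `serialize_meta=True` so that the `<path>.__table_column_meta__` dataset exists (filename and key are illustrative):

import h5py

with h5py.File('samples.h5', 'r') as f:
    tbl = load_table(f, 'chain')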
Example #5
    @classmethod
    def _read_tables(cls, group, path=None):
        if path is None:
            path = cls._hdf5_path

        samples = group[f'{path}']
        metadata = group[f'{path}.__table_column_meta__']

        header = meta.get_header_from_yaml(
            h.decode('utf-8') for h in metadata.read())

        table = Table(np.array(samples.read()))
        if 'meta' in list(header.keys()):
            table.meta = header['meta']

        table = serialize._construct_mixins_from_columns(table)

        return cls(table)
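Note that `samples.read()` implies a pytables-style dataset interface; with plain h5py one would index with `samples[()]` instead. The on-disk layout this method assumes (names illustrative) is:

# /<path>                         -> structured array holding the table data
# /<path>.__table_column_meta__   -> YAML header lines stored as byte strings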
Example #6
from astropy.table import meta, serialize


def _decode_mixins(tbl):
    """Decode a Table ``tbl`` that has astropy Columns plus appropriate
    metadata into the corresponding table with mixin columns.
    """
    # If available, read the __serialized_columns__ meta info, which is
    # stored in FITS COMMENT cards between two sentinels.
    try:
        i0 = tbl.meta['comments'].index('--BEGIN-ASTROPY-SERIALIZED-COLUMNS--')
        i1 = tbl.meta['comments'].index('--END-ASTROPY-SERIALIZED-COLUMNS--')
    except (ValueError, KeyError):
        return tbl

    # The YAML data are split into COMMENT cards, with lines longer than 70
    # characters being split with a continuation character \ (backslash).
    # Strip the backslashes and join together.
    continuation_line = False
    lines = []
    for line in tbl.meta['comments'][i0 + 1:i1]:
        if continuation_line:
            lines[-1] = lines[-1] + line[:70]
        else:
            lines.append(line[:70])
        continuation_line = len(line) == 71

    del tbl.meta['comments'][i0:i1 + 1]
    if not tbl.meta['comments']:
        del tbl.meta['comments']
    info = meta.get_header_from_yaml(lines)

    # Add serialized column information to table meta for use in constructing mixins
    tbl.meta['__serialized_columns__'] = info['meta']['__serialized_columns__']

    # Use the `datatype` attribute info to update column attributes that are
    # NOT already handled via standard FITS column keys (name, dtype, unit).
    for col in info['datatype']:
        for attr in ['description', 'meta']:
            if attr in col:
                setattr(tbl[col['name']].info, attr, col[attr])

    # Construct new table with mixins, using tbl.meta['__serialized_columns__']
    # as guidance.
    tbl = serialize._construct_mixins_from_columns(tbl)

    return tbl
Example #7
def read_table_hdf5(input, path=None, character_as_bytes=True):
    """
    Read a Table object from an HDF5 file

    This requires `h5py <http://www.h5py.org/>`_ to be installed. If more than one
    table is present in the HDF5 file or group, the first table is read in and
    a warning is displayed.

    Parameters
    ----------
    input : str or :class:`h5py.File` or :class:`h5py.Group` or :class:`h5py.Dataset`
        If a string, the filename to read the table from. If an h5py object,
        either the file or the group object to read the table from.
    path : str
        The path from which to read the table inside the HDF5 file.
        This should be relative to the input file or group.
    character_as_bytes : bool
        If `True` then Table columns are left as bytes.
        If `False` then Table columns are converted to unicode.
    """

    try:
        import h5py
    except ImportError:
        raise ImportError("h5py is required to read and write HDF5 files")

    # This function is recursive, and only gets to converting a Dataset into
    # a Table at the very end. Moreover, the `input` variable is reassigned
    # along the way, so we save its original value here to look up the
    # serialized metadata dataset later.
    input_save = input
    if isinstance(input, (h5py.File, h5py.Group)):

        # If a path was specified, follow the path

        if path is not None:
            try:
                input = input[path]
            except (KeyError, ValueError):
                raise OSError(f"Path {path} does not exist")

        # `input` is now either a group or a dataset. If it is a group, we
        # will search for all structured arrays inside the group, and if there
        # is one we can proceed otherwise an error is raised. If it is a
        # dataset, we just proceed with the reading.

        if isinstance(input, h5py.Group):

            # Find all structured arrays in group
            arrays = _find_all_structured_arrays(input)

            if len(arrays) == 0:
                raise ValueError(f"no table found in HDF5 group {path}")
            else:
                path = arrays[0] if path is None else path + '/' + arrays[0]
                if len(arrays) > 1:
                    warnings.warn(
                        "path= was not specified but multiple tables"
                        " are present, reading in first available"
                        f" table (path={path})", AstropyUserWarning)
                return read_table_hdf5(input, path=path,
                                       character_as_bytes=character_as_bytes)

    elif not isinstance(input, h5py.Dataset):

        # If a file object was passed, then we need to extract the filename
        # because h5py cannot properly read in file objects.

        if hasattr(input, 'read'):
            try:
                input = input.name
            except AttributeError:
                raise TypeError("h5py can only open regular files")

        # Open the file for reading, and recursively call read_table_hdf5 with
        # the file object and the path.

        f = h5py.File(input, 'r')

        try:
            return read_table_hdf5(f,
                                   path=path,
                                   character_as_bytes=character_as_bytes)
        finally:
            f.close()

    # If we are here, `input` should be a Dataset object, which we can now
    # convert to a Table.

    # Create a Table object
    from astropy.table import Table, meta, serialize

    table = Table(np.array(input))

    # Read the meta-data from the file. For back-compatibility, we can read
    # the old file format where the serialized metadata were saved in the
    # attributes of the HDF5 dataset.
    # In the new format, instead, metadata are stored in a new dataset in the
    # same file. This is introduced in Astropy 3.0
    old_version_meta = META_KEY in input.attrs
    new_version_meta = path is not None and meta_path(path) in input_save
    if old_version_meta or new_version_meta:
        if new_version_meta:
            header = meta.get_header_from_yaml(
                h.decode('utf-8') for h in input_save[meta_path(path)])
        else:
            # new_version_meta is False here, so old_version_meta must be
            # True: (A or B) and not A implies B.
            header = meta.get_header_from_yaml(
                h.decode('utf-8') for h in input.attrs[META_KEY])
        if 'meta' in list(header.keys()):
            table.meta = header['meta']

        header_cols = dict((x['name'], x) for x in header['datatype'])
        for col in table.columns.values():
            for attr in ('description', 'format', 'unit', 'meta'):
                if attr in header_cols[col.name]:
                    setattr(col, attr, header_cols[col.name][attr])

        # Construct new table with mixins, using table.meta['__serialized_columns__']
        # as guidance.
        table = serialize._construct_mixins_from_columns(table)

    else:
        # Read the meta-data from the file
        table.meta.update(input.attrs)

    if not character_as_bytes:
        table.convert_bytestring_to_unicode()

    return table
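A minimal round-trip sketch (assuming h5py is installed; the filename and path are illustrative). Writing with `serialize_meta=True` exercises the new-style metadata branch above:

from astropy.table import Table
from astropy.time import Time

t = Table({'t': Time([1.0, 2.0], format='mjd'), 'flux': [3.5, 4.5]})
t.write('demo.h5', path='obs/table', serialize_meta=True, overwrite=True)

t2 = read_table_hdf5('demo.h5', path='obs/table')
assert isinstance(t2['t'], Time)  # mixin column reconstructed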
Example #8
def read_table_hdf5(input, path=None, character_as_bytes=True):
    """
    Read a Table object from an HDF5 file

    This requires `h5py <http://www.h5py.org/>`_ to be installed. If more than one
    table is present in the HDF5 file or group, the first table is read in and
    a warning is displayed.

    Parameters
    ----------
    input : str or :class:`h5py:File` or :class:`h5py:Group` or :class:`h5py:Dataset`
        If a string, the filename to read the table from. If an h5py object,
        either the file or the group object to read the table from.
        If an h5py object, either the file or the group object to read the
        table from.
    path : str
        The path from which to read the table inside the HDF5 file.
        This should be relative to the input file or group.
    character_as_bytes : bool
        If `True` then Table columns are left as bytes.
        If `False` then Table columns are converted to unicode.
    """

    try:
        import h5py
    except ImportError:
        raise ImportError("h5py is required to read and write HDF5 files")

    # This function is recursive, and only gets to converting a Dataset into
    # a Table at the very end. Moreover, the `input` variable is reassigned
    # along the way, so we save its original value here to look up the
    # serialized metadata dataset later.
    input_save = input
    if isinstance(input, (h5py.File, h5py.Group)):

        # If a path was specified, follow the path

        if path is not None:
            try:
                input = input[path]
            except (KeyError, ValueError):
                raise OSError("Path {0} does not exist".format(path))

        # `input` is now either a group or a dataset. If it is a group, we
        # will search for all structured arrays inside the group, and if there
        # is one we can proceed otherwise an error is raised. If it is a
        # dataset, we just proceed with the reading.

        if isinstance(input, h5py.Group):

            # Find all structured arrays in group
            arrays = _find_all_structured_arrays(input)

            if len(arrays) == 0:
                raise ValueError("no table found in HDF5 group {0}".
                                 format(path))
            else:
                path = arrays[0] if path is None else path + '/' + arrays[0]
                if len(arrays) > 1:
                    warnings.warn("path= was not specified but multiple tables"
                                  " are present, reading in first available"
                                  " table (path={0})".format(path),
                                  AstropyUserWarning)
                return read_table_hdf5(input, path=path,
                                       character_as_bytes=character_as_bytes)

    elif not isinstance(input, h5py.Dataset):

        # If a file object was passed, then we need to extract the filename
        # because h5py cannot properly read in file objects.

        if hasattr(input, 'read'):
            try:
                input = input.name
            except AttributeError:
                raise TypeError("h5py can only open regular files")

        # Open the file for reading, and recursively call read_table_hdf5 with
        # the file object and the path.

        f = h5py.File(input, 'r')

        try:
            return read_table_hdf5(f, path=path, character_as_bytes=character_as_bytes)
        finally:
            f.close()

    # If we are here, `input` should be a Dataset object, which we can now
    # convert to a Table.

    # Create a Table object
    from astropy.table import Table, meta, serialize

    table = Table(np.array(input))

    # Read the meta-data from the file. For back-compatibility, we can read
    # the old file format where the serialized metadata were saved in the
    # attributes of the HDF5 dataset.
    # In the new format, instead, metadata are stored in a new dataset in the
    # same file. This is introduced in Astropy 3.0
    old_version_meta = META_KEY in input.attrs
    new_version_meta = path is not None and meta_path(path) in input_save
    if old_version_meta or new_version_meta:
        if new_version_meta:
            header = meta.get_header_from_yaml(
                h.decode('utf-8') for h in input_save[meta_path(path)])
        else:
            # new_version_meta is False here, so old_version_meta must be True.
            header = meta.get_header_from_yaml(
                h.decode('utf-8') for h in input.attrs[META_KEY])
        if 'meta' in list(header.keys()):
            table.meta = header['meta']

        header_cols = dict((x['name'], x) for x in header['datatype'])
        for col in table.columns.values():
            for attr in ('description', 'format', 'unit', 'meta'):
                if attr in header_cols[col.name]:
                    setattr(col, attr, header_cols[col.name][attr])

        # Construct new table with mixins, using table.meta['__serialized_columns__']
        # as guidance.
        table = serialize._construct_mixins_from_columns(table)

    else:
        # Read the meta-data from the file
        table.meta.update(input.attrs)

    if not character_as_bytes:
        table.convert_bytestring_to_unicode()

    return table
Example #9
def read_table_parquet(input,
                       include_names=None,
                       exclude_names=None,
                       schema_only=False,
                       filters=None):
    """
    Read a Table object from a Parquet file.

    This requires `pyarrow <https://arrow.apache.org/docs/python/>`_
    to be installed.

    The ``filters`` parameter consists of predicates that are expressed
    in disjunctive normal form (DNF), like ``[[('x', '=', 0), ...], ...]``.
    DNF allows arbitrary boolean logical combinations of single column
    predicates. The innermost tuples each describe a single column predicate.
    The list of inner predicates is interpreted as a conjunction (AND),
    forming a more selective, multi-column predicate. Finally, the
    outermost list combines these filters as a disjunction (OR).

    Predicates may also be passed as List[Tuple]. This form is interpreted
    as a single conjunction. To express OR in predicates, one must
    use the (preferred) List[List[Tuple]] notation.

    Each tuple has format: (``key``, ``op``, ``value``) and compares the
    ``key`` with the ``value``.
    The supported ``op`` are:  ``=`` or ``==``, ``!=``, ``<``, ``>``, ``<=``,
    ``>=``, ``in`` and ``not in``. If the ``op`` is ``in`` or ``not in``, the
    ``value`` must be a collection such as a ``list``, a ``set`` or a
    ``tuple``.

    Examples:

    .. code-block:: python

        ('x', '=', 0)
        ('y', 'in', ['a', 'b', 'c'])
        ('z', 'not in', {'a','b'})

    Parameters
    ----------
    input : str or path-like or file-like object
        If a string or path-like object, the filename to read the table from.
        If a file-like object, the stream to read data.
    include_names : list [str], optional
        List of names to include in output. If not supplied, then
        include all columns.
    exclude_names : list [str], optional
        List of names to exclude from output (applied after ``include_names``).
        If not supplied then no columns are excluded.
    schema_only : bool, optional
        Only read the schema/metadata with table information.
    filters : list [tuple] or list [list [tuple]] or None, optional
        Rows which do not match the filter predicate will be removed from
        scanned data.  See `pyarrow.parquet.read_table()` for details.

    Returns
    -------
    table : `~astropy.table.Table`
        Table will have zero rows and only metadata information
        if schema_only is True.
    """
    pa, parquet, _ = get_pyarrow()

    if not isinstance(input, (str, os.PathLike)):
        # The 'read' attribute is the key component of a generic
        # file-like object.
        if not hasattr(input, 'read'):
            raise TypeError(
                "pyarrow can only open path-like or file-like objects.")

    schema = parquet.read_schema(input)

    # Pyarrow stores all metadata as byte-strings, so we convert
    # to UTF-8 strings here.
    if schema.metadata is not None:
        md = {
            k.decode('UTF-8'): v.decode('UTF-8')
            for k, v in schema.metadata.items()
        }
    else:
        md = {}

    from astropy.table import Table, meta, serialize

    # parse metadata from table yaml
    meta_dict = {}
    if 'table_meta_yaml' in md:
        meta_yaml = md.pop('table_meta_yaml').split('\n')
        meta_hdr = meta.get_header_from_yaml(meta_yaml)
        if 'meta' in meta_hdr:
            meta_dict = meta_hdr['meta']
    else:
        meta_hdr = None

    # parse and set serialized columns
    full_table_columns = {name: name for name in schema.names}
    has_serialized_columns = False
    if '__serialized_columns__' in meta_dict:
        has_serialized_columns = True
        serialized_columns = meta_dict['__serialized_columns__']
        for scol in serialized_columns:
            for name in _get_names(serialized_columns[scol]):
                full_table_columns[name] = scol

    use_names = set(full_table_columns.values())
    # Apply include_names before exclude_names
    if include_names is not None:
        use_names.intersection_update(include_names)
    if exclude_names is not None:
        use_names.difference_update(exclude_names)
    # Preserve column ordering via list, and use this dict trick
    # to remove duplicates and preserve ordering (for mixin columns)
    use_names = list(
        dict.fromkeys(
            [x for x in full_table_columns.values() if x in use_names]))

    # names_to_read is a list of actual serialized column names, where
    # e.g. the requested name 'time' becomes ['time.jd1', 'time.jd2']
    names_to_read = []
    for name in use_names:
        names = [n for n, col in full_table_columns.items() if name == col]
        names_to_read.extend(names)

    if not names_to_read:
        raise ValueError("No include_names specified were found in the table.")

    # We need to pop any unread serialized columns out of the meta_dict.
    if has_serialized_columns:
        for scol in list(meta_dict['__serialized_columns__'].keys()):
            if scol not in use_names:
                meta_dict['__serialized_columns__'].pop(scol)

    # whether to return the whole table or a formatted empty table.
    if not schema_only:
        # Read the pyarrow table, specifying columns and filters.
        pa_table = parquet.read_table(input,
                                      columns=names_to_read,
                                      filters=filters)
        num_rows = pa_table.num_rows
    else:
        num_rows = 0

    # Now need to convert parquet table to Astropy
    dtype = []
    for name in names_to_read:
        # Pyarrow string and byte columns do not have native length information
        # so we must determine those here.
        if schema.field(name).type not in (pa.string(), pa.binary()):
            # Convert the pyarrow type into a numpy dtype (which is returned
            # by the to_pandas_type() method).
            dtype.append(schema.field(name).type.to_pandas_dtype())
            continue

        # Special-case for string and binary columns
        md_name = f'table::len::{name}'
        if md_name in md:
            # String/bytes length from header.
            strlen = int(md[md_name])
        elif schema_only:
            # Not reading the table, so choose an arbitrary string length.
            strlen = 10
            warnings.warn(
                f"No {md_name} found in metadata. "
                f"Guessing {strlen} for schema.", AstropyUserWarning)
        else:
            # Find the maximum string length in the data.
            strlen = max(len(row.as_py()) for row in pa_table[name])
            warnings.warn(
                f"No {md_name} found in metadata. "
                f"Using longest string ({strlen} characters).",
                AstropyUserWarning)
        dtype.append(f'U{strlen}' if schema.field(name).type ==
                     pa.string() else f'|S{strlen}')

    # Create the empty numpy record array to store the pyarrow data.
    data = np.zeros(num_rows, dtype=list(zip(names_to_read, dtype)))

    if not schema_only:
        # Convert each column in the pyarrow table to a numpy array
        for name in names_to_read:
            data[name][:] = pa_table[name].to_numpy()

    table = Table(data=data, meta=meta_dict)

    if meta_hdr is not None:
        # Set description, format, unit, meta from the column
        # metadata that was serialized with the table.
        header_cols = dict((x['name'], x) for x in meta_hdr['datatype'])
        for col in table.columns.values():
            for attr in ('description', 'format', 'unit', 'meta'):
                if attr in header_cols[col.name]:
                    setattr(col, attr, header_cols[col.name][attr])

    # Convert all compound columns to astropy objects
    # (e.g. time.jd1, time.jd2 into a single time column)
    table = serialize._construct_mixins_from_columns(table)

    return table
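Hypothetical usage (file and column names are illustrative), combining column selection with a DNF row filter as described in the docstring:

t = read_table_parquet('catalog.parq',
                       include_names=['ra', 'dec', 'mag'],
                       filters=[[('mag', '<', 20.0)]])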