Example #1
    def write_table(self, table_name, table, metadata=None):
        """Write ``table`` to this writer's HDF5 entry under ``table_name``,
        with YAML-serialized column metadata and per-column value/unit groups.
        """
        table = _encode_mixins(table)
        # HDF5 cannot store numpy unicode columns; convert on a shallow copy.
        if any(col.info.dtype.kind == 'U' for col in table.itercols()):
            table = table.copy(copy_data=False)
            table.convert_unicode_to_bytestring()

        self._entry.create_dataset(str(table_name), data=table.as_array())
        header_yaml = meta.get_yaml_from_table(table)

        header_encoded = [h.encode('utf-8') for h in header_yaml]
        self._entry.create_dataset(str(table_name) + '.' + META_KEY,
                                   data=header_encoded)

        tg = self._entry.create_group('{}_to_group'.format(str(table_name)))
        for col in table.keys():
            tg_c = tg.create_group(str(col))
            tg_c.create_dataset('value', data=table[col])
            if table[col].unit is not None:
                tg_c.create_dataset('unit', data=str(table[col].unit))
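
A standalone sketch of what write_table() produces, assuming META_KEY is a
module-level constant (its value '__table_column_meta__' below is an
assumption) and using an open h5py group in place of self._entry; the
mixin-encoding step is a no-op here since the column is a plain float Column.

import h5py
import numpy as np
from astropy.table import Table, meta

META_KEY = '__table_column_meta__'  # assumed value of the module constant

t = Table()
t['flux'] = np.array([1.0, 2.0])
t['flux'].unit = 'Jy'

with h5py.File('demo.h5', 'w') as entry:
    # Main data: the structured-array view of the table.
    entry.create_dataset('obs', data=t.as_array())
    # YAML header describing the columns, stored as encoded bytes.
    header = [h.encode('utf-8') for h in meta.get_yaml_from_table(t)]
    entry.create_dataset('obs.' + META_KEY, data=header)
    # Per-column value/unit groups, mirroring the loop above.
    tg = entry.create_group('obs_to_group')
    for col in t.keys():
        tg_c = tg.create_group(col)
        tg_c.create_dataset('value', data=t[col])
        if t[col].unit is not None:
            tg_c.create_dataset('unit', data=str(t[col].unit))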
Example #2
def write_table_hdf5(table,
                     output,
                     path=None,
                     compression=False,
                     append=False,
                     overwrite=False,
                     serialize_meta=False,
                     **create_dataset_kwargs):
    """
    Write a Table object to an HDF5 file

    This requires `h5py <http://www.h5py.org/>`_ to be installed.

    Parameters
    ----------
    table : `~astropy.table.Table`
        Data table that is to be written to file.
    output : str or :class:`h5py.File` or :class:`h5py.Group`
        If a string, the filename to write the table to. If an h5py object,
        either the file or the group object to write the table to.
    path : str
        The path to which to write the table inside the HDF5 file.
        This should be relative to the input file or group.
        If not specified, defaults to ``__astropy_table__``.
    compression : bool or str or int
        Whether to compress the table inside the HDF5 file. If set to `True`,
        ``'gzip'`` compression is used. If a string is specified, it should be
        one of ``'gzip'``, ``'szip'``, or ``'lzf'``. If an integer is
        specified (in the range 0-9), ``'gzip'`` compression is used, and the
        integer denotes the compression level.
    append : bool
        Whether to append the table to an existing HDF5 file.
    overwrite : bool
        Whether to overwrite any existing file without warning.
        If ``append=True`` and ``overwrite=True`` then only the dataset will be
        replaced; the file/group will not be overwritten.
    serialize_meta : bool
        Whether to serialize rich table meta-data when writing the HDF5 file, in
        particular such data required to write and read back mixin columns like
        ``Time``, ``SkyCoord``, or ``Quantity`` to the file.
    **create_dataset_kwargs
        Additional keyword arguments are passed to
        ``h5py.File.create_dataset()`` or ``h5py.Group.create_dataset()``.
    """

    from astropy.table import meta
    try:
        import h5py
    except ImportError:
        raise Exception("h5py is required to read and write HDF5 files")

    if path is None:
        # '__astropy_table__' is just an arbitrary, hardcoded default path.
        path = '__astropy_table__'
    elif path.endswith('/'):
        raise ValueError("table path should end with table name, not /")

    if '/' in path:
        group, name = path.rsplit('/', 1)
    else:
        group, name = None, path

    if isinstance(output, (h5py.File, h5py.Group)):
        if len(list(output.keys())) > 0 and name == '__astropy_table__':
            raise ValueError("table path should always be set via the "
                             "path= argument when writing to existing "
                             "files")
        elif name == '__astropy_table__':
            warnings.warn("table path was not set via the path= argument; "
                          "using default path {}".format(path))

        if group:
            try:
                output_group = output[group]
            except (KeyError, ValueError):
                output_group = output.create_group(group)
        else:
            output_group = output

    elif isinstance(output, str):

        if os.path.exists(output) and not append:
            if overwrite:
                os.remove(output)
            else:
                raise OSError(NOT_OVERWRITING_MSG.format(output))

        # Open the file for appending or writing
        f = h5py.File(output, 'a' if append else 'w')

        # Recursively call the write function
        try:
            return write_table_hdf5(table,
                                    f,
                                    path=path,
                                    compression=compression,
                                    append=append,
                                    overwrite=overwrite,
                                    serialize_meta=serialize_meta,
                                    **create_dataset_kwargs)
        finally:
            f.close()

    else:

        raise TypeError('output should be a string or an h5py File or '
                        'Group object')

    # Check whether table already exists
    if name in output_group:
        if append and overwrite:
            # Delete only the dataset itself
            del output_group[name]
            if serialize_meta and meta_path(name) in output_group:
                del output_group[meta_path(name)]
        else:
            raise OSError(f"Table {path} already exists")

    # Encode any mixin columns as plain columns + appropriate metadata
    table = _encode_mixins(table)

    # HDF5 cannot store numpy unicode strings, so make a shallow copy of the
    # table with any unicode columns converted to bytestrings before writing.
    if any(col.info.dtype.kind == 'U' for col in table.itercols()):
        table = table.copy(copy_data=False)
        table.convert_unicode_to_bytestring()

    # Warn if information will be lost when serialize_meta=False.  This is
    # hardcoded to the set difference between column info attributes and what
    # HDF5 can store natively (name, dtype) with no meta.
    if serialize_meta is False:
        for col in table.itercols():
            for attr in ('unit', 'format', 'description', 'meta'):
                if getattr(col.info, attr, None) not in (None, {}):
                    warnings.warn(
                        "table contains column(s) with defined 'unit', 'format',"
                        " 'description', or 'meta' info attributes. These will"
                        " be dropped since serialize_meta=False.",
                        AstropyUserWarning)

    # Write the table to the file
    if compression:
        if compression is True:
            compression = 'gzip'
        dset = output_group.create_dataset(name,
                                           data=table.as_array(),
                                           compression=compression,
                                           **create_dataset_kwargs)
    else:
        dset = output_group.create_dataset(name,
                                           data=table.as_array(),
                                           **create_dataset_kwargs)

    if serialize_meta:
        header_yaml = meta.get_yaml_from_table(table)
        header_encoded = np.array([h.encode('utf-8') for h in header_yaml])
        output_group.create_dataset(meta_path(name), data=header_encoded)

    else:
        # Write the Table meta dict key:value pairs to the file as HDF5
        # attributes.  This works only for a limited set of scalar data types
        # like numbers, strings, etc., but not any complex types.  This path
        # also ignores column meta like unit or format.
        for key in table.meta:
            val = table.meta[key]
            try:
                dset.attrs[key] = val
            except TypeError:
                warnings.warn(
                    "Attribute `{}` of type {} cannot be written to "
                    "HDF5 files - skipping. (Consider specifying "
                    "serialize_meta=True to write all meta data)".format(
                        key, type(val)), AstropyUserWarning)
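
A minimal usage sketch for write_table_hdf5 (importable from
astropy.io.misc.hdf5); serialize_meta=True preserves the unit that the plain
path would drop with an AstropyUserWarning.

import numpy as np
from astropy.table import Table
from astropy.io.misc.hdf5 import write_table_hdf5

t = Table()
t['a'] = np.arange(5)
t['a'].unit = 'm'

# Create the file; path='group/table' makes a group and a dataset inside it.
write_table_hdf5(t, 'data.h5', path='group/table', serialize_meta=True,
                 overwrite=True)

# Add a second table to the same file without clobbering the first.
write_table_hdf5(t, 'data.h5', path='group/table2', append=True,
                 serialize_meta=True)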
Example #3
def _encode_mixins(tbl):
    """Encode a Table ``tbl`` that may have mixin columns to a Table with only
    astropy Columns + appropriate meta-data to allow subsequent decoding.
    """
    # Determine if information will be lost without serializing meta.  This is hardcoded
    # to the set difference between column info attributes and what FITS can store
    # natively (name, dtype, unit).  See _get_col_attributes() in table/meta.py for where
    # this comes from.
    info_lost = any(any(getattr(col.info, attr, None) not in (None, {})
                        for attr in ('description', 'meta'))
                    for col in tbl.itercols())

    # Convert the table to one with no mixins, only Column objects.  This adds
    # meta data which is extracted with meta.get_yaml_from_table.  This ignores
    # Time-subclass columns and leaves them in the table so that the downstream
    # FITS Time handling does the right thing.

    with serialize_context_as('fits'):
        encode_tbl = serialize.represent_mixins_as_columns(
            tbl, exclude_classes=(Time,))

    # If the encoded table is unchanged then there were no mixins.  But if there
    # is column metadata (format, description, meta) that would be lost, then
    # still go through the serialized columns machinery.
    if encode_tbl is tbl and not info_lost:
        return tbl

    # Copy the meta dict if it was not copied by represent_mixins_as_columns.
    # We will modify .meta['comments'] below and we do not want to see these
    # comments in the input table.
    if encode_tbl is tbl:
        meta_copy = deepcopy(tbl.meta)
        encode_tbl = Table(tbl.columns, meta=meta_copy, copy=False)

    # Get the YAML serialization of information describing the table columns.
    # This is re-using ECSV code that combines the existing table.meta with
    # the extra __serialized_columns__ key.  For FITS the table.meta is handled
    # by the native FITS connect code, so don't include that in the YAML
    # output.
    ser_col = '__serialized_columns__'

    # encode_tbl might not have a __serialized_columns__ key if there were no mixins,
    # but machinery below expects it to be available, so just make an empty dict.
    encode_tbl.meta.setdefault(ser_col, {})

    tbl_meta_copy = encode_tbl.meta.copy()
    try:
        encode_tbl.meta = {ser_col: encode_tbl.meta[ser_col]}
        meta_yaml_lines = meta.get_yaml_from_table(encode_tbl)
    finally:
        encode_tbl.meta = tbl_meta_copy
    del encode_tbl.meta[ser_col]

    if 'comments' not in encode_tbl.meta:
        encode_tbl.meta['comments'] = []
    encode_tbl.meta['comments'].append('--BEGIN-ASTROPY-SERIALIZED-COLUMNS--')

    for line in meta_yaml_lines:
        if len(line) == 0:
            lines = ['']
        else:
            # Split line into 70 character chunks for COMMENT cards
            idxs = list(range(0, len(line) + 70, 70))
            lines = [line[i0:i1] + '\\' for i0, i1 in zip(idxs[:-1], idxs[1:])]
            lines[-1] = lines[-1][:-1]
        encode_tbl.meta['comments'].extend(lines)

    encode_tbl.meta['comments'].append('--END-ASTROPY-SERIALIZED-COLUMNS--')

    return encode_tbl
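
A sketch of the core transformation this function wraps, calling the same
public helper (serialize.represent_mixins_as_columns): the Quantity mixin
column comes back as a plain Column, and the decoding recipe lands in
meta['__serialized_columns__'].

import astropy.units as u
from astropy.table import Table, serialize

t = Table()
t['v'] = [1.0, 2.0] * u.km / u.s        # Quantity mixin column

enc = serialize.represent_mixins_as_columns(t)
print(type(enc['v']).__name__)               # -> Column
print('__serialized_columns__' in enc.meta)  # -> True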
Example #4
def _encode_mixins(tbl):
    """Encode a Table ``tbl`` that may have mixin columns to a Table with only
    astropy Columns + appropriate meta-data to allow subsequent decoding.
    """
    # Determine if information will be lost without serializing meta.  This is hardcoded
    # to the set difference between column info attributes and what FITS can store
    # natively (name, dtype, unit).  See _get_col_attributes() in table/meta.py for where
    # this comes from.
    info_lost = any(
        any(
            getattr(col.info, attr, None) not in (None, {})
            for attr in ('description', 'meta')) for col in tbl.itercols())

    # If PyYAML is not available then check to see if there are any mixin cols
    # that *require* YAML serialization.  FITS already has support for Time,
    # Quantity, so if those are the only mixins then proceed without doing the
    # YAML bit, for backward compatibility (i.e. not requiring YAML to write
    # Time or Quantity).  In this case other mixin column meta (e.g.
    # description or meta) will be silently dropped, consistent with astropy <=
    # 2.0 behavior.
    try:
        import yaml  # noqa
    except ImportError:
        for col in tbl.itercols():
            if (has_info_class(col, MixinInfo)
                    and col.__class__ not in (u.Quantity, Time)):
                raise TypeError("cannot write type {} column '{}' "
                                "to FITS without PyYAML installed.".format(
                                    col.__class__.__name__, col.info.name))
        else:  # for/else: runs only if the loop completes without raising
            if info_lost:
                warnings.warn(
                    "table contains column(s) with defined 'format',"
                    " 'description', or 'meta' info attributes. These"
                    " will be dropped unless you install PyYAML.",
                    AstropyUserWarning)
            return tbl

    # Convert the table to one with no mixins, only Column objects.  This adds
    # meta data which is extracted with meta.get_yaml_from_table.  This ignores
    # Time-subclass columns and leaves them in the table so that the downstream
    # FITS Time handling does the right thing.

    with serialize_context_as('fits'):
        encode_tbl = serialize._represent_mixins_as_columns(
            tbl, exclude_classes=(Time, ))

    # If the encoded table is unchanged then there were no mixins.  But if there
    # is column metadata (format, description, meta) that would be lost, then
    # still go through the serialized columns machinery.
    if encode_tbl is tbl and not info_lost:
        return tbl

    # Get the YAML serialization of information describing the table columns.
    # This is re-using ECSV code that combines the existing table.meta with
    # the extra __serialized_columns__ key.  For FITS the table.meta is handled
    # by the native FITS connect code, so don't include that in the YAML
    # output.
    ser_col = '__serialized_columns__'

    # encode_tbl might not have a __serialized_columns__ key if there were no mixins,
    # but machinery below expects it to be available, so just make an empty dict.
    encode_tbl.meta.setdefault(ser_col, {})

    tbl_meta_copy = encode_tbl.meta.copy()
    try:
        encode_tbl.meta = {ser_col: encode_tbl.meta[ser_col]}
        meta_yaml_lines = meta.get_yaml_from_table(encode_tbl)
    finally:
        encode_tbl.meta = tbl_meta_copy
    del encode_tbl.meta[ser_col]

    if 'comments' not in encode_tbl.meta:
        encode_tbl.meta['comments'] = []
    encode_tbl.meta['comments'].append('--BEGIN-ASTROPY-SERIALIZED-COLUMNS--')

    for line in meta_yaml_lines:
        if len(line) == 0:
            lines = ['']
        else:
            # Split line into 70 character chunks for COMMENT cards
            idxs = list(range(0, len(line) + 70, 70))
            lines = [line[i0:i1] + '\\' for i0, i1 in zip(idxs[:-1], idxs[1:])]
            lines[-1] = lines[-1][:-1]
        encode_tbl.meta['comments'].extend(lines)

    encode_tbl.meta['comments'].append('--END-ASTROPY-SERIALIZED-COLUMNS--')

    return encode_tbl
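
A sketch isolating the fallback decision from the except branch above, with
has_info_class redefined locally to mirror astropy's internal helper of the
same name: a table whose only mixins are Quantity and Time would be accepted
by the legacy no-YAML path.

import astropy.units as u
from astropy.table import Table
from astropy.time import Time
from astropy.utils.data_info import MixinInfo

def has_info_class(obj, cls):
    # Local stand-in for astropy's internal helper.
    return hasattr(obj, 'info') and isinstance(obj.info, cls)

def yaml_free_mixins_ok(tbl):
    """True if every mixin column is a plain Quantity or Time, i.e. the
    table could be written without PyYAML under the legacy path."""
    return all(not has_info_class(col, MixinInfo)
               or col.__class__ in (u.Quantity, Time)
               for col in tbl.itercols())

t = Table()
t['q'] = [1.0, 2.0] * u.m
t['t'] = Time(['2020-01-01', '2020-01-02'])
print(yaml_free_mixins_ok(t))   # -> True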
Example #5
def _encode_mixins(tbl):
    """Encode a Table ``tbl`` that may have mixin columns to a Table with only
    astropy Columns + appropriate meta-data to allow subsequent decoding.
    """
    # Determine if information will be lost without serializing meta.  This is hardcoded
    # to the set difference between column info attributes and what FITS can store
    # natively (name, dtype, unit).  See _get_col_attributes() in table/meta.py for where
    # this comes from.
    info_lost = any(any(getattr(col.info, attr, None) not in (None, {})
                        for attr in ('description', 'meta'))
                    for col in tbl.itercols())

    # If PyYAML is not available then check to see if there are any mixin cols
    # that *require* YAML serialization.  FITS already has support for Time,
    # Quantity, so if those are the only mixins then proceed without doing the
    # YAML bit, for backward compatibility (i.e. not requiring YAML to write
    # Time or Quantity).  In this case other mixin column meta (e.g.
    # description or meta) will be silently dropped, consistent with astropy <=
    # 2.0 behavior.
    try:
        import yaml  # noqa
    except ImportError:
        for col in tbl.itercols():
            if (has_info_class(col, MixinInfo) and
                    col.__class__ not in (u.Quantity, Time)):
                raise TypeError("cannot write type {} column '{}' "
                                "to FITS without PyYAML installed."
                                .format(col.__class__.__name__, col.info.name))
        else:  # for/else: runs only if the loop completes without raising
            if info_lost:
                warnings.warn("table contains column(s) with defined 'format',"
                              " 'description', or 'meta' info attributes. These"
                              " will be dropped unless you install PyYAML.",
                              AstropyUserWarning)
            return tbl

    # Convert the table to one with no mixins, only Column objects.  This adds
    # meta data which is extracted with meta.get_yaml_from_table.  This ignores
    # Time-subclass columns and leaves them in the table so that the downstream
    # FITS Time handling does the right thing.

    with serialize_context_as('fits'):
        encode_tbl = serialize._represent_mixins_as_columns(
            tbl, exclude_classes=(Time,))

    # If the encoded table is unchanged then there were no mixins.  But if there
    # is column metadata (format, description, meta) that would be lost, then
    # still go through the serialized columns machinery.
    if encode_tbl is tbl and not info_lost:
        return tbl

    # Get the YAML serialization of information describing the table columns.
    # This is re-using ECSV code that combines the existing table.meta with
    # the extra __serialized_columns__ key.  For FITS the table.meta is handled
    # by the native FITS connect code, so don't include that in the YAML
    # output.
    ser_col = '__serialized_columns__'

    # encode_tbl might not have a __serialized_columns__ key if there were no mixins,
    # but machinery below expects it to be available, so just make an empty dict.
    encode_tbl.meta.setdefault(ser_col, {})

    tbl_meta_copy = encode_tbl.meta.copy()
    try:
        encode_tbl.meta = {ser_col: encode_tbl.meta[ser_col]}
        meta_yaml_lines = meta.get_yaml_from_table(encode_tbl)
    finally:
        encode_tbl.meta = tbl_meta_copy
    del encode_tbl.meta[ser_col]

    if 'comments' not in encode_tbl.meta:
        encode_tbl.meta['comments'] = []
    encode_tbl.meta['comments'].append('--BEGIN-ASTROPY-SERIALIZED-COLUMNS--')

    for line in meta_yaml_lines:
        if len(line) == 0:
            lines = ['']
        else:
            # Split line into 70 character chunks for COMMENT cards
            idxs = list(range(0, len(line) + 70, 70))
            lines = [line[i0:i1] + '\\' for i0, i1 in zip(idxs[:-1], idxs[1:])]
            lines[-1] = lines[-1][:-1]
        encode_tbl.meta['comments'].extend(lines)

    encode_tbl.meta['comments'].append('--END-ASTROPY-SERIALIZED-COLUMNS--')

    return encode_tbl
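
A standalone sketch of the 70-character chunking loop above, pulled out as a
pure function: every chunk except the last carries a trailing backslash so a
reader can rejoin the original YAML line from the COMMENT cards.

def chunk_line(line, width=70):
    if len(line) == 0:
        return ['']
    idxs = list(range(0, len(line) + width, width))
    lines = [line[i0:i1] + '\\' for i0, i1 in zip(idxs[:-1], idxs[1:])]
    lines[-1] = lines[-1][:-1]    # last chunk has no continuation marker
    return lines

line = 'x' * 150
chunks = chunk_line(line)
print([len(c) for c in chunks])   # -> [71, 71, 10]
rejoined = ''.join(c[:-1] for c in chunks[:-1]) + chunks[-1]
assert rejoined == line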
Example #6
def write_table_hdf5(table, output, path=None, compression=False,
                     append=False, overwrite=False, serialize_meta=False,
                     compatibility_mode=False):
    """
    Write a Table object to an HDF5 file

    This requires `h5py <http://www.h5py.org/>`_ to be installed.

    Parameters
    ----------
    table : `~astropy.table.Table`
        Data table that is to be written to file.
    output : str or :class:`h5py:File` or :class:`h5py:Group`
        If a string, the filename to write the table to. If an h5py object,
        either the file or the group object to write the table to.
    path : str
        The path to which to write the table inside the HDF5 file.
        This should be relative to the input file or group.
        If not specified, defaults to ``__astropy_table__``.
    compression : bool or str or int
        Whether to compress the table inside the HDF5 file. If set to `True`,
        ``'gzip'`` compression is used. If a string is specified, it should be
        one of ``'gzip'``, ``'szip'``, or ``'lzf'``. If an integer is
        specified (in the range 0-9), ``'gzip'`` compression is used, and the
        integer denotes the compression level.
    append : bool
        Whether to append the table to an existing HDF5 file.
    overwrite : bool
        Whether to overwrite any existing file without warning.
        If ``append=True`` and ``overwrite=True`` then only the dataset will be
        replaced; the file/group will not be overwritten.
    serialize_meta : bool
        Whether to serialize rich table meta-data when writing the HDF5 file, in
        particular such data required to write and read back mixin columns like
        ``Time``, ``SkyCoord``, or ``Quantity`` to the file.
    compatibility_mode : bool
        Deprecated. If `True`, write the serialized metadata into the
        dataset's HDF5 attributes instead of a separate dataset, for
        compatibility with older readers.
    """

    from astropy.table import meta
    try:
        import h5py
    except ImportError:
        raise Exception("h5py is required to read and write HDF5 files")

    if path is None:
        # '__astropy_table__' is just an arbitrary, hardcoded default path.
        path = '__astropy_table__'
    elif path.endswith('/'):
        raise ValueError("table path should end with table name, not /")

    if '/' in path:
        group, name = path.rsplit('/', 1)
    else:
        group, name = None, path

    if isinstance(output, (h5py.File, h5py.Group)):
        if len(list(output.keys())) > 0 and name == '__astropy_table__':
            raise ValueError("table path should always be set via the "
                             "path= argument when writing to existing "
                             "files")
        elif name == '__astropy_table__':
            warnings.warn("table path was not set via the path= argument; "
                          "using default path {}".format(path))

        if group:
            try:
                output_group = output[group]
            except (KeyError, ValueError):
                output_group = output.create_group(group)
        else:
            output_group = output

    elif isinstance(output, str):

        if os.path.exists(output) and not append:
            if overwrite:
                os.remove(output)
            else:
                raise OSError("File exists: {0}".format(output))

        # Open the file for appending or writing
        f = h5py.File(output, 'a' if append else 'w')

        # Recursively call the write function
        try:
            return write_table_hdf5(table, f, path=path,
                                    compression=compression, append=append,
                                    overwrite=overwrite,
                                    serialize_meta=serialize_meta,
                                    compatibility_mode=compatibility_mode)
        finally:
            f.close()

    else:

        raise TypeError('output should be a string or an h5py File or '
                        'Group object')

    # Check whether table already exists
    if name in output_group:
        if append and overwrite:
            # Delete only the dataset itself
            del output_group[name]
        else:
            raise OSError("Table {0} already exists".format(path))

    # Encode any mixin columns as plain columns + appropriate metadata
    table = _encode_mixins(table)

    # HDF5 cannot store numpy unicode strings, so make a shallow copy of the
    # table with any unicode columns converted to bytestrings before writing.
    if any(col.info.dtype.kind == 'U' for col in table.itercols()):
        table = table.copy(copy_data=False)
        table.convert_unicode_to_bytestring()

    # Warn if information will be lost when serialize_meta=False.  This is
    # hardcoded to the set difference between column info attributes and what
    # HDF5 can store natively (name, dtype) with no meta.
    if serialize_meta is False:
        for col in table.itercols():
            for attr in ('unit', 'format', 'description', 'meta'):
                if getattr(col.info, attr, None) not in (None, {}):
                    warnings.warn("table contains column(s) with defined 'unit', 'format',"
                                  " 'description', or 'meta' info attributes. These will"
                                  " be dropped since serialize_meta=False.",
                                  AstropyUserWarning)

    # Write the table to the file
    if compression:
        if compression is True:
            compression = 'gzip'
        dset = output_group.create_dataset(name, data=table.as_array(),
                                           compression=compression)
    else:
        dset = output_group.create_dataset(name, data=table.as_array())

    if serialize_meta:
        header_yaml = meta.get_yaml_from_table(table)

        header_encoded = [h.encode('utf-8') for h in header_yaml]
        if compatibility_mode:
            warnings.warn("compatibility mode for writing is deprecated",
                          AstropyDeprecationWarning)
            try:
                dset.attrs[META_KEY] = header_encoded
            except Exception as e:
                warnings.warn(
                    "Attributes could not be written to the output HDF5 "
                    "file: {0}".format(e))

        else:
            output_group.create_dataset(meta_path(name),
                                        data=header_encoded)

    else:
        # Write the Table meta dict key:value pairs to the file as HDF5
        # attributes.  This works only for a limited set of scalar data types
        # like numbers, strings, etc., but not any complex types.  This path
        # also ignores column meta like unit or format.
        for key in table.meta:
            val = table.meta[key]
            try:
                dset.attrs[key] = val
            except TypeError:
                warnings.warn("Attribute `{0}` of type {1} cannot be written to "
                              "HDF5 files - skipping. (Consider specifying "
                              "serialize_meta=True to write all meta data)".format(key, type(val)),
                              AstropyUserWarning)
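
A usage sketch for the deprecated compatibility path, assuming this
astropy 3.x-era writer is the one imported: with compatibility_mode=True the
serialized header is stored in the dataset's HDF5 attributes under META_KEY
rather than in a separate metadata dataset, and an AstropyDeprecationWarning
is emitted.

import warnings
import numpy as np
from astropy.table import Table
from astropy.io.misc.hdf5 import write_table_hdf5  # 3.x-era signature assumed

t = Table({'a': np.arange(3)})
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    write_table_hdf5(t, 'legacy.h5', path='tbl', serialize_meta=True,
                     compatibility_mode=True, overwrite=True)
# caught now includes the AstropyDeprecationWarning from the writer.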
Example #7
def write_table_parquet(table, output, overwrite=False):
    """
    Write a Table object to a Parquet file

    This requires `pyarrow <https://arrow.apache.org/docs/python/>`_
    to be installed.

    Parameters
    ----------
    table : `~astropy.table.Table`
        Data table that is to be written to file.
    output : str or path-like
        The filename to write the table to.
    overwrite : bool, optional
        Whether to overwrite any existing file without warning. Default `False`.
    """

    from astropy.table import meta, serialize
    from astropy.utils.data_info import serialize_context_as

    pa, parquet, writer_version = get_pyarrow()

    if not isinstance(output, (str, os.PathLike)):
        raise TypeError(
            f'`output` should be a string or path-like, not {output}')

    # Convert all compound columns into serialized column names, where
    # e.g. 'time' becomes ['time.jd1', 'time.jd2'].
    with serialize_context_as('parquet'):
        encode_table = serialize.represent_mixins_as_columns(table)
    # We store the encoded serialization metadata as a yaml string.
    meta_yaml = meta.get_yaml_from_table(encode_table)
    meta_yaml_str = '\n'.join(meta_yaml)

    metadata = {}
    for name, col in encode_table.columns.items():
        # Parquet will retain the datatypes of columns, but string and
        # byte column length is lost.  Therefore, we special-case these
        # types to record the length for precise round-tripping.
        if col.dtype.type is np.str_:
            metadata[f'table::len::{name}'] = str(col.dtype.itemsize // 4)
        elif col.dtype.type is np.bytes_:
            metadata[f'table::len::{name}'] = str(col.dtype.itemsize)

    # Store the YAML serialization once, outside the per-column loop.
    metadata['table_meta_yaml'] = meta_yaml_str

    # Pyarrow stores all metadata as byte strings, so we explicitly encode
    # our unicode strings in metadata as UTF-8 byte strings here.
    metadata_encode = {
        k.encode('UTF-8'): v.encode('UTF-8')
        for k, v in metadata.items()
    }

    # Build the pyarrow schema by converting from the numpy dtype of each
    # column to an equivalent pyarrow type with from_numpy_dtype()
    type_list = [(name, pa.from_numpy_dtype(encode_table.dtype[name].type))
                 for name in encode_table.dtype.names]
    schema = pa.schema(type_list, metadata=metadata_encode)

    if os.path.exists(output):
        if overwrite:
            # We must remove the file prior to writing below.
            os.remove(output)
        else:
            raise OSError(NOT_OVERWRITING_MSG.format(output))

    # Use the writer version returned by get_pyarrow() (e.g. '2.0') for full
    # support of datatypes including uint32.
    with parquet.ParquetWriter(output, schema,
                               version=writer_version) as writer:
        # Convert each Table column to a pyarrow array
        arrays = [pa.array(col) for col in encode_table.itercols()]
        # Create a pyarrow table from the list of arrays and the schema
        pa_table = pa.Table.from_arrays(arrays, schema=schema)
        # Write the pyarrow table to a file
        writer.write_table(pa_table)
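
A round-trip sketch: this writer sits behind Table.write(..., format='parquet'),
and the recorded string lengths plus the YAML metadata let the matching reader
restore column dtypes and units exactly.

import numpy as np
from astropy.table import Table

t = Table()
t['name'] = np.array(['a', 'bc'])     # unicode column; its length is recorded
t['flux'] = np.array([1.0, 2.0])
t['flux'].unit = 'Jy'

t.write('demo.parquet', format='parquet', overwrite=True)
t2 = Table.read('demo.parquet', format='parquet')
assert t2['name'].dtype == t['name'].dtype   # '<U2' restored via table::len
assert t2['flux'].unit == t['flux'].unit     # unit restored via the YAML meta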
Example #8
def write_table_hdf5(table,
                     output,
                     path=None,
                     compression=False,
                     append=False,
                     overwrite=False,
                     serialize_meta=False,
                     metadata_conflicts='error',
                     **create_dataset_kwargs):
    """
    Write a Table object to an HDF5 file

    This requires `h5py <http://www.h5py.org/>`_ to be installed.

    Parameters
    ----------
    table : `~astropy.table.Table`
        Data table that is to be written to file.
    output : str or :class:`h5py:File` or :class:`h5py:Group`
        If a string, the filename to write the table to. If an h5py object,
        either the file or the group object to write the table to.
    path : str
        The path to which to write the table inside the HDF5 file.
        This should be relative to the input file or group.
        If not specified, defaults to ``__astropy_table__``.
    compression : bool or str or int
        Whether to compress the table inside the HDF5 file. If set to `True`,
        ``'gzip'`` compression is used. If a string is specified, it should be
        one of ``'gzip'``, ``'szip'``, or ``'lzf'``. If an integer is
        specified (in the range 0-9), ``'gzip'`` compression is used, and the
        integer denotes the compression level.
    append : bool
        Whether to append the table to an existing HDF5 file.
    overwrite : bool
        Whether to overwrite any existing file without warning.
        If ``append=True`` and ``overwrite=True`` then only the dataset will be
        replaced; the file/group will not be overwritten.
    metadata_conflicts : str
        How to proceed with metadata conflicts when appending to an existing
        table. This should be one of:
            * ``'silent'``: silently pick the last conflicting meta-data value
            * ``'warn'``: pick the last conflicting meta-data value, but emit a
              warning
            * ``'error'``: raise an exception (default).
    **create_dataset_kwargs
        Additional keyword arguments are passed to `h5py.File.create_dataset`.
    """

    from astropy.table import meta
    from astropy.utils import metadata
    try:
        import h5py
    except ImportError:
        raise Exception("h5py is required to read and write HDF5 files")

    if path is None:
        # '__astropy_table__' is just an arbitrary, hardcoded default path.
        path = '__astropy_table__'
    elif path.endswith('/'):
        raise ValueError("table path should end with table name, not /")

    if '/' in path:
        group, name = path.rsplit('/', 1)
    else:
        group, name = None, path

    if isinstance(output, (h5py.File, h5py.Group)):
        if len(list(output.keys())) > 0 and name == '__astropy_table__':
            raise ValueError("table path should always be set via the "
                             "path= argument when writing to existing "
                             "files")
        elif name == '__astropy_table__':
            warnings.warn("table path was not set via the path= argument; "
                          "using default path {}".format(path))

        if group:
            try:
                output_group = output[group]
            except (KeyError, ValueError):
                output_group = output.create_group(group)
        else:
            output_group = output

    elif isinstance(output, str):

        if os.path.exists(output) and not append:
            if overwrite:
                os.remove(output)
            else:
                raise OSError(f"File exists: {output}")

        # Open the file for appending or writing
        f = h5py.File(output, 'a' if append else 'w')

        # Recursively call the write function
        try:
            return write_table_hdf5(table,
                                    f,
                                    path=path,
                                    compression=compression,
                                    append=append,
                                    overwrite=overwrite,
                                    serialize_meta=serialize_meta,
                                    metadata_conflicts=metadata_conflicts,
                                    **create_dataset_kwargs)
        finally:
            f.close()

    else:

        raise TypeError('output should be a string or an h5py File or '
                        'Group object')

    # Check whether table already exists
    existing_header = None
    if name in output_group:
        if append and overwrite:
            # Delete only the dataset itself
            del output_group[name]
        elif append:
            # Data table exists, so we interpret "append" to mean "extend
            # existing table with the table passed in". However, this requires
            # the table to have been written by this function in the past, so it
            # should have a metadata header
            if meta_path(name) not in output_group:
                raise ValueError("No metadata exists for existing table. We "
                                 "can only append tables if metadata "
                                 "is consistent for all tables")

            # Load existing table header:
            existing_header = meta.get_header_from_yaml(
                h.decode('utf-8') for h in output_group[meta_path(name)])
        else:
            raise OSError(f"Table {path} already exists")

    # Encode any mixin columns as plain columns + appropriate metadata
    table = _encode_mixins(table)

    # HDF5 cannot store numpy unicode strings, so make a shallow copy of the
    # table with any unicode columns converted to bytestrings before writing.
    if any(col.info.dtype.kind == 'U' for col in table.itercols()):
        table = table.copy(copy_data=False)
        table.convert_unicode_to_bytestring()

    # Warn if information will be lost when serialize_meta=False.  This is
    # hardcoded to the set difference between column info attributes and what
    # HDF5 can store natively (name, dtype) with no meta.
    if serialize_meta is False:
        for col in table.itercols():
            for attr in ('unit', 'format', 'description', 'meta'):
                if getattr(col.info, attr, None) not in (None, {}):
                    warnings.warn(
                        "table contains column(s) with defined 'unit', 'format',"
                        " 'description', or 'meta' info attributes. These will"
                        " be dropped since serialize_meta=False.",
                        AstropyUserWarning)

    if existing_header is None:  # Just write the table and metadata
        # Write the table to the file
        if compression:
            if compression is True:
                compression = 'gzip'
            dset = output_group.create_dataset(name,
                                               data=table.as_array(),
                                               compression=compression,
                                               **create_dataset_kwargs)
        else:
            dset = output_group.create_dataset(name,
                                               data=table.as_array(),
                                               **create_dataset_kwargs)

        if serialize_meta:
            header_yaml = meta.get_yaml_from_table(table)

            header_encoded = [h.encode('utf-8') for h in header_yaml]
            output_group.create_dataset(meta_path(name), data=header_encoded)

        else:
            # Write the Table meta dict key:value pairs to the file as HDF5
            # attributes.  This works only for a limited set of scalar data types
            # like numbers, strings, etc., but not any complex types.  This path
            # also ignores column meta like unit or format.
            for key in table.meta:
                val = table.meta[key]
                try:
                    dset.attrs[key] = val
                except TypeError:
                    warnings.warn(
                        "Attribute `{}` of type {} cannot be written to "
                        "HDF5 files - skipping. (Consider specifying "
                        "serialize_meta=True to write all meta data)".format(
                            key, type(val)), AstropyUserWarning)

    else:  # We need to append the tables!
        try:
            # FIXME: do something with the merged metadata!
            metadata.merge(existing_header['meta'],
                           table.meta,
                           metadata_conflicts=metadata_conflicts)
        except metadata.MergeConflictError:
            raise metadata.MergeConflictError(
                "Cannot append table to existing file because "
                "the existing file table metadata and this "
                "table object's metadata do not match. If you "
                "want to ignore this issue, or change to a "
                "warning, set metadata_conflicts='silent' or 'warn'.")

        # Now compare datatype of this object and on disk
        this_header = meta.get_header_from_yaml(
            meta.get_yaml_from_table(table))

        if not _custom_tbl_dtype_compare(existing_header['datatype'],
                                         this_header['datatype']):
            raise ValueError(
                "Cannot append table to existing file because "
                "the existing file table datatype and this "
                "object's table datatype do not match. "
                f"{existing_header['datatype']} vs. {this_header['datatype']}")

        # If we got here, we can now try to append:
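        # h5py can only resize chunked datasets created with a maxshape that
        # permits growth (e.g. pass maxshape=(None,) via
        # **create_dataset_kwargs on the initial write); a fixed-shape
        # dataset raises TypeError here.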
        current_size = len(output_group[name])
        output_group[name].resize((current_size + len(table), ))
        output_group[name][current_size:] = table.as_array()
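
An append-usage sketch for this variant (assuming it is importable; the stock
astropy writer has no metadata_conflicts argument). The first write passes
maxshape through **create_dataset_kwargs so the dataset is resizable, and the
second call then extends it in place.

import numpy as np
from astropy.table import Table
# write_table_hdf5 here refers to the variant defined above.

t = Table({'a': np.arange(3)})
write_table_hdf5(t, 'grow.h5', path='data', serialize_meta=True,
                 overwrite=True, maxshape=(None,))
write_table_hdf5(t, 'grow.h5', path='data', append=True,
                 metadata_conflicts='warn')
# The dataset at 'data' now holds 6 rows.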