Example #1
def test_represent_mixins_as_columns_unit_fix():
    """
    If the unit is invalid for a column that gets serialized this would
    cause an exception.  Fixed in #7481.
    """
    t = Table({'a': [1, 2]}, masked=True)
    t['a'].unit = 'not a valid unit'
    t['a'].mask[1] = True
    serialize.represent_mixins_as_columns(t)
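
For context, a minimal round-trip sketch (assuming current astropy; `_construct_mixins_from_columns` is the private inverse that the astropy I/O readers use):

from astropy.table import Table, serialize

t = Table({'a': [1, 2]}, masked=True)
t['a'].mask[1] = True
# Forward: mixin/masked columns become plain Columns + meta entries.
encoded = serialize.represent_mixins_as_columns(t)
# Inverse used by the I/O readers to rebuild the original column types.
decoded = serialize._construct_mixins_from_columns(encoded)
assert decoded['a'].mask[1]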
Example #2
def test_bad_info_class():
    """Make a mixin column class that does not trigger the machinery to generate
    a pure column representation"""
    class MyArrayWrapper(ArrayWrapper):
        info = ParentDtypeInfo()

    t = Table()
    t['tm'] = MyArrayWrapper([0, 1, 2])
    out = StringIO()
    match = r"failed to represent column 'tm' \(MyArrayWrapper\) as one or more Column subclasses"
    with pytest.raises(TypeError, match=match):
        represent_mixins_as_columns(t)
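
The failure comes from `ParentDtypeInfo` declaring nothing to serialize. As a hedged sketch of a fix (the `_represent_as_dict_attrs` hook is from `astropy.utils.data_info`; the 'data' attribute is an assumption about where this wrapper stores its payload):

from astropy.utils.data_info import ParentDtypeInfo

class MyArrayWrapperInfo(ParentDtypeInfo):
    # Hypothetical: declare which attributes of the parent object the
    # serialization machinery should represent ('data' assumes the
    # wrapper stores its payload there).
    _represent_as_dict_attrs = ('data',)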
Example #3
def _encode_mixins(tbl):
    """Encode a Table ``tbl`` that may have mixin columns to a Table with only
    astropy Columns + appropriate meta-data to allow subsequent decoding.
    """
    from astropy.table import serialize
    from astropy.table.table import has_info_class
    from astropy import units as u
    from astropy.utils.data_info import MixinInfo, serialize_context_as

    # If PyYAML is not available then check to see if there are any mixin cols
    # that *require* YAML serialization.  HDF5 already has support for
    # Quantity, so if those are the only mixins then proceed without doing the
    # YAML bit, for backward compatibility (i.e. not requiring YAML to write
    # Quantity).
    try:
        import yaml
    except ImportError:
        for col in tbl.itercols():
            if (has_info_class(col, MixinInfo)
                    and col.__class__ is not u.Quantity):
                raise TypeError("cannot write type {} column '{}' "
                                "to HDF5 without PyYAML installed.".format(
                                    col.__class__.__name__, col.info.name))

    # Convert the table to one with no mixins, only Column objects.  This adds
    # meta data which is extracted with meta.get_yaml_from_table.
    with serialize_context_as('hdf5'):
        encode_tbl = serialize.represent_mixins_as_columns(tbl)

    return encode_tbl
Example #4
def _encode_mixins(tbl):
    """Encode a Table ``tbl`` that may have mixin columns to a Table with only
    astropy Columns + appropriate meta-data to allow subsequent decoding.
    """
    from astropy.table import serialize
    from astropy.table.table import has_info_class
    from astropy import units as u
    from astropy.utils.data_info import MixinInfo, serialize_context_as

    # If PyYAML is not available then check to see if there are any mixin cols
    # that *require* YAML serialization.  HDF5 already has support for
    # Quantity, so if those are the only mixins then proceed without doing the
    # YAML bit, for backward compatibility (i.e. not requiring YAML to write
    # Quantity).
    try:
        import yaml
    except ImportError:
        for col in tbl.itercols():
            if (has_info_class(col, MixinInfo) and
                    col.__class__ is not u.Quantity):
                raise TypeError("cannot write type {} column '{}' "
                                "to HDF5 without PyYAML installed."
                                .format(col.__class__.__name__, col.info.name))

    # Convert the table to one with no mixins, only Column objects.  This adds
    # meta data which is extracted with meta.get_yaml_from_table.
    with serialize_context_as('hdf5'):
        encode_tbl = serialize.represent_mixins_as_columns(tbl)

    return encode_tbl
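
A usage sketch of the public entry point that drives `_encode_mixins` (standard astropy API, requires h5py; the filename is illustrative):

import astropy.units as u
from astropy.table import QTable

t = QTable({'v': [1.0, 2.0] * u.km / u.s})
# serialize_meta=True routes the table through _encode_mixins() so the
# Quantity column and its unit survive the round trip.
t.write('example.hdf5', path='data', serialize_meta=True, overwrite=True)
t2 = QTable.read('example.hdf5', path='data')
assert t2['v'].unit == u.km / u.s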
Example #5
def test_primary_data_column_gets_description():
    """
    If the mixin defines a primary data column, that should get the
    description, format, etc., so no __info__ should be needed.
    """
    t = QTable({'a': [1, 2] * u.m})
    t['a'].info.description = 'parrot'
    t['a'].info.format = '7.2f'
    tser = serialize.represent_mixins_as_columns(t)
    assert '__info__' not in tser.meta['__serialized_columns__']['a']
    assert tser['a'].format == '7.2f'
    assert tser['a'].description == 'parrot'
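
To see what the test asserts against, a short sketch of the serialized-columns meta for a plain Quantity column (the '__class__' key is how astropy records the mixin type):

import astropy.units as u
from astropy.table import QTable, serialize

t = QTable({'a': [1.0, 2.0] * u.m})
tser = serialize.represent_mixins_as_columns(t)
# Reconstruction info for the Quantity mixin lives in the table meta.
print(tser.meta['__serialized_columns__']['a']['__class__'])
# -> 'astropy.units.quantity.Quantity'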
Example #6
def _encode_mixins(tbl):
    """Encode a Table ``tbl`` that may have mixin columns to a Table with only
    astropy Columns + appropriate meta-data to allow subsequent decoding.
    """
    from astropy.table import serialize
    from astropy import units as u
    from astropy.utils.data_info import serialize_context_as

    # Convert the table to one with no mixins, only Column objects.  This adds
    # meta data which is extracted with meta.get_yaml_from_table.
    with serialize_context_as('hdf5'):
        encode_tbl = serialize.represent_mixins_as_columns(tbl)

    return encode_tbl
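
For reference, `serialize_context_as` is a context manager that sets a global serialization context; each mixin's info class can then pick a format-appropriate serialize method. A minimal sketch:

import astropy.units as u
from astropy.table import QTable, serialize
from astropy.utils.data_info import serialize_context_as

t = QTable({'a': [1.0, 2.0] * u.m})
# Inside the context, columns serialize as they would for HDF5 output.
with serialize_context_as('hdf5'):
    enc = serialize.represent_mixins_as_columns(t)
print(enc.colnames)   # ['a'], now a plain Column carrying the unit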
Example #7
def _encode_mixins(tbl):
    from astropy.table import serialize
    from astropy.table.table import has_info_class
    from astropy import units as u
    from astropy.utils.data_info import MixinInfo, serialize_context_as

    try:
        import yaml
    except ImportError:
        for col in tbl.itercols():
            if (has_info_class(col, MixinInfo)
                    and col.__class__ is not u.Quantity):
                raise TypeError("cannot write type {} column '{}' "
                                "to HDF5 without PyYAML installed.".format(
                                    col.__class__.__name__, col.info.name))

    with serialize_context_as('hdf5'):
        encode_tbl = serialize.represent_mixins_as_columns(tbl)

    return encode_tbl
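
The mixin detection used above can be checked directly; `has_info_class` tests the class of a column's `.info` attribute. A small sketch:

import astropy.units as u
from astropy.table.table import has_info_class
from astropy.utils.data_info import MixinInfo

q = [1.0, 2.0] * u.m
# Quantity's info class derives from MixinInfo, so has_info_class() is
# True; the code above then exempts Quantity specifically because HDF5
# supports it without YAML.
print(has_info_class(q, MixinInfo))   # True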
Example #8
    def update_table_data(self, table):
        """
        Update table columns in place if mixin columns are present.

        This is a hook to allow updating the table columns after name
        filtering but before setting up to write the data.  This is currently
        only used by ECSV and is otherwise just a pass-through.

        Parameters
        ----------
        table : `astropy.table.Table`
            Input table for writing

        Returns
        -------
        table : `astropy.table.Table`
            Output table for writing
        """
        with serialize_context_as('ecsv'):
            out = serialize.represent_mixins_as_columns(table)
        return out
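
A usage sketch showing this hook in action via the public ECSV writer (standard astropy API):

import io
import astropy.units as u
from astropy.table import QTable

t = QTable({'a': [1.0, 2.0] * u.m})
out = io.StringIO()
# The ECSV writer calls update_table_data(), which serializes the
# Quantity mixin and records it in the ECSV YAML header.
t.write(out, format='ascii.ecsv')
print(out.getvalue().splitlines()[0])   # e.g. '# %ECSV 1.0'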
Example #9
def _encode_mixins(tbl):
    """Encode a Table ``tbl`` that may have mixin columns to a Table with only
    astropy Columns + appropriate meta-data to allow subsequent decoding.
    """
    # Determine if information will be lost without serializing meta.  This is hardcoded
    # to the set difference between column info attributes and what FITS can store
    # natively (name, dtype, unit).  See _get_col_attributes() in table/meta.py for where
    # this comes from.
    info_lost = any(any(getattr(col.info, attr, None) not in (None, {})
                        for attr in ('description', 'meta'))
                    for col in tbl.itercols())

    # Convert the table to one with no mixins, only Column objects.  This adds
    # meta data which is extracted with meta.get_yaml_from_table.  This ignores
    # Time-subclass columns and leaves them in the table so that the downstream
    # FITS Time handling does the right thing.

    with serialize_context_as('fits'):
        encode_tbl = serialize.represent_mixins_as_columns(
            tbl, exclude_classes=(Time,))

    # If the encoded table is unchanged then there were no mixins.  But if there
    # is column metadata (format, description, meta) that would be lost, then
    # still go through the serialized columns machinery.
    if encode_tbl is tbl and not info_lost:
        return tbl

    # Copy the meta dict if it was not copied by represent_mixins_as_columns.
    # We will modify .meta['comments'] below and we do not want to see these
    # comments in the input table.
    if encode_tbl is tbl:
        meta_copy = deepcopy(tbl.meta)
        encode_tbl = Table(tbl.columns, meta=meta_copy, copy=False)

    # Get the YAML serialization of information describing the table columns.
    # This is re-using ECSV code that combines existing table.meta with
    # the extra __serialized_columns__ key.  For FITS the table.meta is handled
    # by the native FITS connect code, so don't include that in the YAML
    # output.
    ser_col = '__serialized_columns__'

    # encode_tbl might not have a __serialized_columns__ key if there were no mixins,
    # but machinery below expects it to be available, so just make an empty dict.
    encode_tbl.meta.setdefault(ser_col, {})

    tbl_meta_copy = encode_tbl.meta.copy()
    try:
        encode_tbl.meta = {ser_col: encode_tbl.meta[ser_col]}
        meta_yaml_lines = meta.get_yaml_from_table(encode_tbl)
    finally:
        encode_tbl.meta = tbl_meta_copy
    del encode_tbl.meta[ser_col]

    if 'comments' not in encode_tbl.meta:
        encode_tbl.meta['comments'] = []
    encode_tbl.meta['comments'].append('--BEGIN-ASTROPY-SERIALIZED-COLUMNS--')

    for line in meta_yaml_lines:
        if len(line) == 0:
            lines = ['']
        else:
            # Split line into 70 character chunks for COMMENT cards
            idxs = list(range(0, len(line) + 70, 70))
            lines = [line[i0:i1] + '\\' for i0, i1 in zip(idxs[:-1], idxs[1:])]
            lines[-1] = lines[-1][:-1]
        encode_tbl.meta['comments'].extend(lines)

    encode_tbl.meta['comments'].append('--END-ASTROPY-SERIALIZED-COLUMNS--')

    return encode_tbl
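
The 70-character chunking at the end deserves a standalone illustration; every chunk except the last gets a trailing backslash as a continuation marker for the FITS COMMENT cards:

# Standalone sketch of the COMMENT-card chunking used above.
line = 'x' * 150
idxs = list(range(0, len(line) + 70, 70))
lines = [line[i0:i1] + '\\' for i0, i1 in zip(idxs[:-1], idxs[1:])]
lines[-1] = lines[-1][:-1]   # the final chunk has no continuation marker
print([len(chunk) for chunk in lines])   # [71, 71, 10]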
Example #10
def _encode_mixins(tbl):
    """Encode a Table ``tbl`` that may have mixin columns to a Table with only
    astropy Columns + appropriate meta-data to allow subsequent decoding.
    """
    # Determine if information will be lost without serializing meta.  This is hardcoded
    # to the set difference between column info attributes and what FITS can store
    # natively (name, dtype, unit).  See _get_col_attributes() in table/meta.py for where
    # this comes from.
    info_lost = any(
        any(
            getattr(col.info, attr, None) not in (None, {})
            for attr in ('description', 'meta')) for col in tbl.itercols())

    # If PyYAML is not available then check to see if there are any mixin cols
    # that *require* YAML serialization.  FITS already has support for Time,
    # Quantity, so if those are the only mixins then proceed without doing the
    # YAML bit, for backward compatibility (i.e. not requiring YAML to write
    # Time or Quantity).  In this case other mixin column meta (e.g.
    # description or meta) will be silently dropped, consistent with astropy <=
    # 2.0 behavior.
    try:
        import yaml  # noqa
    except ImportError:
        for col in tbl.itercols():
            if (has_info_class(col, MixinInfo)
                    and col.__class__ not in (u.Quantity, Time)):
                raise TypeError("cannot write type {} column '{}' "
                                "to FITS without PyYAML installed.".format(
                                    col.__class__.__name__, col.info.name))
        else:
            if info_lost:
                warnings.warn(
                    "table contains column(s) with defined 'format',"
                    " 'description', or 'meta' info attributes. These"
                    " will be dropped unless you install PyYAML.",
                    AstropyUserWarning)
            return tbl

    # Convert the table to one with no mixins, only Column objects.  This adds
    # meta data which is extracted with meta.get_yaml_from_table.  This ignores
    # Time-subclass columns and leaves them in the table so that the downstream
    # FITS Time handling does the right thing.

    with serialize_context_as('fits'):
        encode_tbl = serialize.represent_mixins_as_columns(
            tbl, exclude_classes=(Time, ))

    # If the encoded table is unchanged then there were no mixins.  But if there
    # is column metadata (format, description, meta) that would be lost, then
    # still go through the serialized columns machinery.
    if encode_tbl is tbl and not info_lost:
        return tbl

    # Copy the meta dict if it was not copied by represent_mixins_as_columns.
    # We will modify .meta['comments'] below and we do not want to see these
    # comments in the input table.
    if encode_tbl is tbl:
        meta_copy = deepcopy(tbl.meta)
        encode_tbl = Table(tbl.columns, meta=meta_copy, copy=False)

    # Get the YAML serialization of information describing the table columns.
    # This is re-using ECSV code that combines existing table.meta with
    # the extra __serialized_columns__ key.  For FITS the table.meta is handled
    # by the native FITS connect code, so don't include that in the YAML
    # output.
    ser_col = '__serialized_columns__'

    # encode_tbl might not have a __serialized_columns__ key if there were no mixins,
    # but machinery below expects it to be available, so just make an empty dict.
    encode_tbl.meta.setdefault(ser_col, {})

    tbl_meta_copy = encode_tbl.meta.copy()
    try:
        encode_tbl.meta = {ser_col: encode_tbl.meta[ser_col]}
        meta_yaml_lines = meta.get_yaml_from_table(encode_tbl)
    finally:
        encode_tbl.meta = tbl_meta_copy
    del encode_tbl.meta[ser_col]

    if 'comments' not in encode_tbl.meta:
        encode_tbl.meta['comments'] = []
    encode_tbl.meta['comments'].append('--BEGIN-ASTROPY-SERIALIZED-COLUMNS--')

    for line in meta_yaml_lines:
        if len(line) == 0:
            lines = ['']
        else:
            # Split line into 70 character chunks for COMMENT cards
            idxs = list(range(0, len(line) + 70, 70))
            lines = [line[i0:i1] + '\\' for i0, i1 in zip(idxs[:-1], idxs[1:])]
            lines[-1] = lines[-1][:-1]
        encode_tbl.meta['comments'].extend(lines)

    encode_tbl.meta['comments'].append('--END-ASTROPY-SERIALIZED-COLUMNS--')

    return encode_tbl
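
A round-trip sketch of the behavior this function enables (standard astropy API, requires PyYAML; the filename is illustrative):

import astropy.units as u
from astropy.table import QTable

t = QTable({'a': [1.0, 2.0] * u.m})
t['a'].info.description = 'length'
# The FITS writer runs the table through _encode_mixins(); the column
# description travels in the ASTROPY-SERIALIZED-COLUMNS comment cards.
t.write('example.fits', overwrite=True)
t2 = QTable.read('example.fits')
assert t2['a'].info.description == 'length'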
Example #11
def write_table_parquet(table, output, overwrite=False):
    """
    Write a Table object to a Parquet file.

    This requires `pyarrow <https://arrow.apache.org/docs/python/>`_
    to be installed.

    Parameters
    ----------
    table : `~astropy.table.Table`
        Data table that is to be written to file.
    output : str or path-like
        The filename to write the table to.
    overwrite : bool, optional
        Whether to overwrite any existing file without warning. Default `False`.
    """

    from astropy.table import meta, serialize
    from astropy.utils.data_info import serialize_context_as

    pa, parquet, writer_version = get_pyarrow()

    if not isinstance(output, (str, os.PathLike)):
        raise TypeError(
            f'`output` should be a string or path-like, not {output}')

    # Convert all compound columns into serialized column names, where
    # e.g. 'time' becomes ['time.jd1', 'time.jd2'].
    with serialize_context_as('parquet'):
        encode_table = serialize.represent_mixins_as_columns(table)
    # We store the encoded serialization metadata as a yaml string.
    meta_yaml = meta.get_yaml_from_table(encode_table)
    meta_yaml_str = '\n'.join(meta_yaml)

    metadata = {}
    for name, col in encode_table.columns.items():
        # Parquet will retain the datatypes of columns, but string and
        # byte column length is lost.  Therefore, we special-case these
        # types to record the length for precise round-tripping.
        if col.dtype.type is np.str_:
            metadata[f'table::len::{name}'] = str(col.dtype.itemsize // 4)
        elif col.dtype.type is np.bytes_:
            metadata[f'table::len::{name}'] = str(col.dtype.itemsize)

    # Store the YAML-serialized table meta once, outside the column loop.
    metadata['table_meta_yaml'] = meta_yaml_str

    # Pyarrow stores all metadata as byte strings, so we explicitly encode
    # our unicode strings in metadata as UTF-8 byte strings here.
    metadata_encode = {
        k.encode('UTF-8'): v.encode('UTF-8')
        for k, v in metadata.items()
    }

    # Build the pyarrow schema by converting from the numpy dtype of each
    # column to an equivalent pyarrow type with from_numpy_dtype()
    type_list = [(name, pa.from_numpy_dtype(encode_table.dtype[name].type))
                 for name in encode_table.dtype.names]
    schema = pa.schema(type_list, metadata=metadata_encode)

    if os.path.exists(output):
        if overwrite:
            # We must remove the file prior to writing below.
            os.remove(output)
        else:
            raise OSError(NOT_OVERWRITING_MSG.format(output))

    # We use a 2.x writer version for full support of datatypes including uint32.
    with parquet.ParquetWriter(output, schema,
                               version=writer_version) as writer:
        # Convert each Table column to a pyarrow array
        arrays = [pa.array(col) for col in encode_table.itercols()]
        # Create a pyarrow table from the list of arrays and the schema
        pa_table = pa.Table.from_arrays(arrays, schema=schema)
        # Write the pyarrow table to a file
        writer.write_table(pa_table)
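
A usage sketch (requires pyarrow; the filename is illustrative, and the format is inferred from the extension) showing the string-length round trip that the `table::len::` metadata above enables:

import numpy as np
from astropy.table import Table

t = Table({'name': np.array(['a', 'bb'], dtype='U10'), 'x': [1.0, 2.0]})
# write_table_parquet() records the U10 width in the schema metadata so
# the column dtype survives the round trip.
t.write('example.parquet', overwrite=True)
t2 = Table.read('example.parquet')
assert t2['name'].dtype == t['name'].dtype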