def _encode_mixins(tbl):
    """Return ``tbl`` with its mixin columns represented as plain Columns.

    The returned table also carries the meta-data needed to decode those
    columns again later.

    Parameters
    ----------
    tbl : `~astropy.table.Table`
        Table to encode; it may contain mixin columns.

    Returns
    -------
    encode_tbl : `~astropy.table.Table`
        Result of ``serialize.represent_mixins_as_columns(tbl)``, evaluated
        inside the ``'hdf5'`` serialization context.

    Raises
    ------
    TypeError
        If PyYAML cannot be imported and ``tbl`` has a mixin column whose
        class is not exactly `~astropy.units.Quantity`.
    """
    from astropy.table import serialize
    from astropy.table.table import has_info_class
    from astropy import units as u
    from astropy.utils.data_info import MixinInfo, serialize_context_as

    # PyYAML is only mandatory when some mixin column *requires* YAML
    # serialization.  HDF5 already supports Quantity, so a table whose only
    # mixins are Quantity columns is still written without PyYAML (backward
    # compatibility: writing Quantity never required YAML).
    try:
        import yaml
    except ImportError:
        for column in tbl.itercols():
            if not has_info_class(column, MixinInfo):
                continue
            column_cls = column.__class__
            if column_cls is u.Quantity:
                continue
            message = ("cannot write type {} column '{}' "
                       "to HDF5 without PyYAML installed.")
            raise TypeError(
                message.format(column_cls.__name__, column.info.name))

    # Replace every mixin with Column objects.  The meta data added here is
    # later extracted with meta.get_yaml_from_table.
    with serialize_context_as('hdf5'):
        return serialize.represent_mixins_as_columns(tbl)
def _encode_mixins(tbl):
    """Convert the mixin columns of ``tbl`` to ordinary astropy Columns.

    The output table includes the meta-data that allows the mixin columns to
    be decoded afterwards.

    Parameters
    ----------
    tbl : `~astropy.table.Table`
        Input table, possibly holding mixin columns.

    Returns
    -------
    encode_tbl : `~astropy.table.Table`
        Table produced by ``serialize.represent_mixins_as_columns`` while
        the serialization context is set to ``'hdf5'``.

    Raises
    ------
    TypeError
        When PyYAML is not importable and some mixin column has a class
        other than `~astropy.units.Quantity`.
    """
    from astropy.table import serialize
    from astropy.table.table import has_info_class
    from astropy import units as u
    from astropy.utils.data_info import MixinInfo, serialize_context_as

    # Without PyYAML, look for the first mixin column that cannot be written
    # natively.  Quantity is exempt because HDF5 already supports it, so
    # those tables keep working without YAML (backward compatibility).
    try:
        import yaml
    except ImportError:
        unsupported = next(
            (col for col in tbl.itercols()
             if has_info_class(col, MixinInfo)
             and col.__class__ is not u.Quantity),
            None)
        if unsupported is not None:
            raise TypeError("cannot write type {} column '{}' "
                            "to HDF5 without PyYAML installed."
                            .format(unsupported.__class__.__name__,
                                    unsupported.info.name))

    # Build the mixin-free table; the meta data it gains is read back with
    # meta.get_yaml_from_table.
    with serialize_context_as('hdf5'):
        encoded = serialize.represent_mixins_as_columns(tbl)
    return encoded
def _encode_mixins(tbl):
    """Encode a Table ``tbl`` that may have mixin columns to a Table with only
    astropy Columns + appropriate meta-data to allow subsequent decoding.

    Parameters
    ----------
    tbl : `~astropy.table.Table`
        Table to encode; it may contain mixin columns.

    Returns
    -------
    encode_tbl : `~astropy.table.Table`
        Result of ``serialize.represent_mixins_as_columns(tbl)``, evaluated
        inside the ``'hdf5'`` serialization context.
    """
    from astropy.table import serialize
    from astropy.utils.data_info import serialize_context_as

    # Convert the table to one with no mixins, only Column objects.  This adds
    # meta data which is extracted with meta.get_yaml_from_table.
    with serialize_context_as('hdf5'):
        encode_tbl = serialize.represent_mixins_as_columns(tbl)

    return encode_tbl
def _encode_mixins(tbl):
    """Represent the mixin columns of ``tbl`` as plain Columns for HDF5.

    Parameters
    ----------
    tbl : `~astropy.table.Table`
        Table that may contain mixin columns.

    Returns
    -------
    encode_tbl : `~astropy.table.Table`
        Output of ``serialize.represent_mixins_as_columns(tbl)`` computed in
        the ``'hdf5'`` serialization context.

    Raises
    ------
    TypeError
        If PyYAML is not importable and a mixin column has a class other
        than `~astropy.units.Quantity`.
    """
    from astropy.table import serialize
    from astropy.table.table import has_info_class
    from astropy import units as u
    from astropy.utils.data_info import MixinInfo, serialize_context_as

    def _requires_yaml(col):
        # A mixin column whose class is not exactly Quantity is rejected
        # when PyYAML is missing.
        return (has_info_class(col, MixinInfo)
                and col.__class__ is not u.Quantity)

    try:
        import yaml
    except ImportError:
        # PyYAML is unavailable: refuse the first column that needs it.
        for col in tbl.itercols():
            if _requires_yaml(col):
                cls_name = col.__class__.__name__
                raise TypeError("cannot write type {} column '{}' "
                                "to HDF5 without PyYAML installed.".format(
                                    cls_name, col.info.name))

    with serialize_context_as('hdf5'):
        result = serialize.represent_mixins_as_columns(tbl)
    return result
def update_table_data(self, table):
    """
    Represent any mixin columns of ``table`` as plain columns for writing.

    This hook runs after name filtering but before the data are set up for
    writing.  The conversion is done by
    ``serialize._represent_mixins_as_columns`` inside the ``'ecsv'``
    serialization context, and its result is returned.

    Parameters
    ----------
    table : `astropy.table.Table`
        Input table for writing

    Returns
    -------
    table : `astropy.table.Table`
        Output table for writing
    """
    with serialize_context_as('ecsv'):
        return serialize._represent_mixins_as_columns(table)
def update_table_data(self, table):
    """
    Return the table to write, with mixin columns represented as columns.

    Called as a hook once name filtering is done and before the writer sets
    up the data.  The work is delegated to
    ``serialize.represent_mixins_as_columns``, which is run with the
    serialization context set to ``'ecsv'``.

    Parameters
    ----------
    table : `astropy.table.Table`
        Input table for writing

    Returns
    -------
    table : `astropy.table.Table`
        Output table for writing
    """
    ecsv_context = serialize_context_as('ecsv')
    with ecsv_context:
        encoded = serialize.represent_mixins_as_columns(table)
    return encoded
def _encode_mixins(tbl):
    """Prepare ``tbl`` for FITS output by encoding its mixin columns.

    Mixin columns (other than ``Time`` ones, which are left for the
    downstream FITS Time handling) are represented as plain Columns, and the
    YAML description needed to decode them is stored as COMMENT lines in
    ``meta['comments']`` between BEGIN/END marker lines.

    Parameters
    ----------
    tbl : `~astropy.table.Table`
        Table to encode.

    Returns
    -------
    encode_tbl : `~astropy.table.Table`
        ``tbl`` itself when there is nothing to encode and no column info
        would be lost; otherwise a different table whose
        ``meta['comments']`` holds the serialized column description.
    """
    def _has_extra_info(column):
        # 'description' and 'meta' are the column info attributes that FITS
        # cannot store natively (it keeps name, dtype, unit).  See
        # _get_col_attributes() in table/meta.py for where this comes from.
        for attr_name in ('description', 'meta'):
            if getattr(column.info, attr_name, None) not in (None, {}):
                return True
        return False

    # Would information be lost if the column meta were not serialized?
    info_lost = any(_has_extra_info(column) for column in tbl.itercols())

    # Build a table with Column objects only.  Time-subclass columns are
    # excluded so the downstream FITS Time handling does the right thing.
    # The meta data added here is extracted with meta.get_yaml_from_table.
    with serialize_context_as('fits'):
        encode_tbl = serialize.represent_mixins_as_columns(
            tbl, exclude_classes=(Time,))

    if encode_tbl is tbl:
        # Unchanged table means there were no mixins.  With no column
        # metadata at stake either, nothing needs to be done.
        if not info_lost:
            return tbl
        # .meta['comments'] is modified below and those comments must not
        # show up in the input table, so work on a table with copied meta.
        encode_tbl = Table(tbl.columns, meta=deepcopy(tbl.meta), copy=False)

    # The YAML machinery is shared with ECSV, which combines the existing
    # table.meta with the extra __serialized_columns__ key.  For FITS the
    # table.meta is handled by the native FITS connect code, so only the
    # serialized-columns entry goes into the YAML output.
    ser_col = '__serialized_columns__'

    # Without mixins the key may be absent, but the code below needs it.
    encode_tbl.meta.setdefault(ser_col, {})

    saved_meta = encode_tbl.meta.copy()
    try:
        encode_tbl.meta = {ser_col: encode_tbl.meta[ser_col]}
        yaml_lines = meta.get_yaml_from_table(encode_tbl)
    finally:
        encode_tbl.meta = saved_meta
    del encode_tbl.meta[ser_col]

    comments = encode_tbl.meta.setdefault('comments', [])
    comments.append('--BEGIN-ASTROPY-SERIALIZED-COLUMNS--')

    for yaml_line in yaml_lines:
        if len(yaml_line) == 0:
            comments.append('')
            continue
        # COMMENT cards take 70-character chunks; every chunk except the
        # last gets a trailing backslash as a continuation marker.
        pieces = [yaml_line[start:start + 70]
                  for start in range(0, len(yaml_line), 70)]
        comments.extend(piece + '\\' for piece in pieces[:-1])
        comments.append(pieces[-1])

    comments.append('--END-ASTROPY-SERIALIZED-COLUMNS--')

    return encode_tbl
def _encode_mixins(tbl):
    """Encode a Table ``tbl`` that may have mixin columns to a Table with only
    astropy Columns + appropriate meta-data to allow subsequent decoding.

    The input table is never modified: when serialized-column comments have
    to be added to a table that contains no mixins, they are written to a
    shallow table with a deep-copied ``meta`` instead of to ``tbl.meta``.

    Parameters
    ----------
    tbl : `~astropy.table.Table`
        Table to encode for FITS output.

    Returns
    -------
    encode_tbl : `~astropy.table.Table`
        ``tbl`` itself when PyYAML is unavailable, or when there are no
        mixins to encode and no column info would be lost; otherwise a
        different table whose ``meta['comments']`` holds the YAML
        description of the serialized columns.

    Raises
    ------
    TypeError
        If PyYAML cannot be imported and ``tbl`` has a mixin column whose
        class is neither ``Quantity`` nor ``Time``.
    """
    from copy import deepcopy

    from astropy.table import Table

    # Determine if information will be lost without serializing meta.  This
    # is hardcoded to the set difference between column info attributes and
    # what FITS can store natively (name, dtype, unit).  See
    # _get_col_attributes() in table/meta.py for where this comes from.
    info_lost = any(
        any(getattr(col.info, attr, None) not in (None, {})
            for attr in ('description', 'meta'))
        for col in tbl.itercols())

    # If PyYAML is not available then check to see if there are any mixin
    # cols that *require* YAML serialization.  FITS already has support for
    # Time and Quantity, so if those are the only mixins then proceed without
    # doing the YAML bit, for backward compatibility (i.e. not requiring YAML
    # to write Time or Quantity).  In this case other column info (e.g.
    # description or meta) is dropped, with a warning.
    try:
        import yaml  # noqa
    except ImportError:
        for col in tbl.itercols():
            if (has_info_class(col, MixinInfo)
                    and col.__class__ not in (u.Quantity, Time)):
                raise TypeError("cannot write type {} column '{}' "
                                "to FITS without PyYAML installed.".format(
                                    col.__class__.__name__, col.info.name))
        if info_lost:
            warnings.warn(
                "table contains column(s) with defined 'format',"
                " 'description', or 'meta' info attributes. These"
                " will be dropped unless you install PyYAML.",
                AstropyUserWarning)
        return tbl

    # Convert the table to one with no mixins, only Column objects.  This
    # adds meta data which is extracted with meta.get_yaml_from_table.  This
    # ignores Time-subclass columns and leaves them in the table so that the
    # downstream FITS Time handling does the right thing.
    with serialize_context_as('fits'):
        encode_tbl = serialize._represent_mixins_as_columns(
            tbl, exclude_classes=(Time, ))

    # If the encoded table is unchanged then there were no mixins.  But if
    # there is column metadata (description, meta) that would be lost, then
    # still go through the serialized columns machinery.
    if encode_tbl is tbl and not info_lost:
        return tbl

    # The code below modifies .meta (including .meta['comments']).  When the
    # serializer handed back the input table itself, switch to a table that
    # shares the columns but owns a copy of the meta, so the caller's table
    # does not end up with the serialization comments.
    if encode_tbl is tbl:
        encode_tbl = Table(tbl.columns, meta=deepcopy(tbl.meta), copy=False)

    # Get the YAML serialization of information describing the table
    # columns.  This is re-using ECSV code that combined existing table.meta
    # with the extra __serialized_columns__ key.  For FITS the table.meta is
    # handled by the native FITS connect code, so don't include that in the
    # YAML output.
    ser_col = '__serialized_columns__'

    # encode_tbl might not have a __serialized_columns__ key if there were no
    # mixins, but machinery below expects it to be available, so just make an
    # empty dict.
    encode_tbl.meta.setdefault(ser_col, {})

    tbl_meta_copy = encode_tbl.meta.copy()
    try:
        encode_tbl.meta = {ser_col: encode_tbl.meta[ser_col]}
        meta_yaml_lines = meta.get_yaml_from_table(encode_tbl)
    finally:
        encode_tbl.meta = tbl_meta_copy
    del encode_tbl.meta[ser_col]

    if 'comments' not in encode_tbl.meta:
        encode_tbl.meta['comments'] = []
    encode_tbl.meta['comments'].append('--BEGIN-ASTROPY-SERIALIZED-COLUMNS--')

    for line in meta_yaml_lines:
        if len(line) == 0:
            lines = ['']
        else:
            # Split line into 70 character chunks for COMMENT cards; every
            # chunk but the last ends with a backslash continuation marker.
            idxs = list(range(0, len(line) + 70, 70))
            lines = [line[i0:i1] + '\\'
                     for i0, i1 in zip(idxs[:-1], idxs[1:])]
            lines[-1] = lines[-1][:-1]
        encode_tbl.meta['comments'].extend(lines)

    encode_tbl.meta['comments'].append('--END-ASTROPY-SERIALIZED-COLUMNS--')

    return encode_tbl
def _encode_mixins(tbl):
    """Encode a Table ``tbl`` that may have mixin columns to a Table with only
    astropy Columns + appropriate meta-data to allow subsequent decoding.

    ``tbl`` is left untouched.  If the serialized-column comments must be
    attached to a table without mixins, they go to a shallow table that
    shares the columns of ``tbl`` but has its own deep copy of ``tbl.meta``.

    Parameters
    ----------
    tbl : `~astropy.table.Table`
        Table to encode for FITS output.

    Returns
    -------
    encode_tbl : `~astropy.table.Table`
        ``tbl`` itself if PyYAML is unavailable, or if nothing needs
        encoding and no column info would be lost; otherwise a different
        table with the YAML column description stored in
        ``meta['comments']``.

    Raises
    ------
    TypeError
        If PyYAML cannot be imported and a mixin column of ``tbl`` has a
        class that is neither ``Quantity`` nor ``Time``.
    """
    from copy import deepcopy

    from astropy.table import Table

    def _info_would_be_lost(col):
        # 'description' and 'meta' are the column info attributes that FITS
        # cannot store natively (name, dtype, unit are kept).  See
        # _get_col_attributes() in table/meta.py for where this comes from.
        return any(getattr(col.info, attr, None) not in (None, {})
                   for attr in ('description', 'meta'))

    info_lost = any(_info_would_be_lost(col) for col in tbl.itercols())

    # PyYAML is required only if some mixin column *requires* YAML
    # serialization.  FITS already supports Time and Quantity, so tables
    # whose only mixins are of those classes are written without the YAML
    # bit (backward compatibility).  Other column info such as description
    # or meta is then dropped, with a warning.
    try:
        import yaml  # noqa
    except ImportError:
        natively_supported = (u.Quantity, Time)
        for col in tbl.itercols():
            if not has_info_class(col, MixinInfo):
                continue
            if col.__class__ in natively_supported:
                continue
            raise TypeError("cannot write type {} column '{}' "
                            "to FITS without PyYAML installed."
                            .format(col.__class__.__name__, col.info.name))
        if info_lost:
            warnings.warn("table contains column(s) with defined 'format',"
                          " 'description', or 'meta' info attributes. These"
                          " will be dropped unless you install PyYAML.",
                          AstropyUserWarning)
        return tbl

    # Convert the table to one with no mixins, only Column objects.  This
    # adds meta data which is extracted with meta.get_yaml_from_table.
    # Time-subclass columns are excluded and stay in the table so that the
    # downstream FITS Time handling does the right thing.
    with serialize_context_as('fits'):
        encode_tbl = serialize._represent_mixins_as_columns(
            tbl, exclude_classes=(Time,))

    if encode_tbl is tbl:
        # An unchanged table means there were no mixins.  If no column
        # metadata would be lost either, there is nothing more to do.
        if not info_lost:
            return tbl
        # Otherwise the serialized-columns machinery still runs and it
        # modifies .meta; use copied meta so the input table is not changed.
        encode_tbl = Table(tbl.columns, meta=deepcopy(tbl.meta), copy=False)

    # Get the YAML serialization of information describing the table
    # columns.  This re-uses ECSV code that combined existing table.meta with
    # the extra __serialized_columns__ key.  For FITS the table.meta is
    # handled by the native FITS connect code, so it is kept out of the YAML
    # output.
    ser_col = '__serialized_columns__'

    # The key may be missing if there were no mixins, but the machinery
    # below expects it, so default it to an empty dict.
    encode_tbl.meta.setdefault(ser_col, {})

    tbl_meta_copy = encode_tbl.meta.copy()
    try:
        encode_tbl.meta = {ser_col: encode_tbl.meta[ser_col]}
        meta_yaml_lines = meta.get_yaml_from_table(encode_tbl)
    finally:
        encode_tbl.meta = tbl_meta_copy
    del encode_tbl.meta[ser_col]

    if 'comments' not in encode_tbl.meta:
        encode_tbl.meta['comments'] = []
    comments = encode_tbl.meta['comments']
    comments.append('--BEGIN-ASTROPY-SERIALIZED-COLUMNS--')

    for line in meta_yaml_lines:
        if len(line) == 0:
            lines = ['']
        else:
            # Split line into 70 character chunks for COMMENT cards, marking
            # every chunk except the last with a trailing backslash.
            idxs = list(range(0, len(line) + 70, 70))
            lines = [line[i0:i1] + '\\'
                     for i0, i1 in zip(idxs[:-1], idxs[1:])]
            lines[-1] = lines[-1][:-1]
        comments.extend(lines)

    comments.append('--END-ASTROPY-SERIALIZED-COLUMNS--')

    return encode_tbl
def write_table_parquet(table, output, overwrite=False):
    """
    Write a Table object to a Parquet file

    This requires `pyarrow <https://arrow.apache.org/docs/python/>`_
    to be installed.

    Parameters
    ----------
    table : `~astropy.table.Table`
        Data table that is to be written to file.
    output : str or path-like
        The filename to write the table to.
    overwrite : bool, optional
        Whether to overwrite any existing file without warning. Default `False`.

    Raises
    ------
    TypeError
        If ``output`` is neither a string nor an `os.PathLike`.
    OSError
        If ``output`` already exists and ``overwrite`` is false.
    """
    from astropy.table import meta, serialize
    from astropy.utils.data_info import serialize_context_as

    pa, parquet, writer_version = get_pyarrow()

    if not isinstance(output, (str, os.PathLike)):
        raise TypeError(
            f'`output` should be a string or path-like, not {output}')

    # Compound columns become serialized column names, e.g. 'time' turns
    # into ['time.jd1', 'time.jd2'].
    with serialize_context_as('parquet'):
        encode_table = serialize.represent_mixins_as_columns(table)

    # The serialization metadata is stored as one YAML string.
    meta_yaml_str = '\n'.join(meta.get_yaml_from_table(encode_table))

    # Parquet retains column datatypes but loses the length of string and
    # byte columns, so those lengths are recorded for precise
    # round-tripping.  The value is the divisor applied to the dtype
    # itemsize for each kind.
    itemsize_divisor = {np.str_: 4, np.bytes_: 1}
    metadata = {}
    for name, col in encode_table.columns.items():
        divisor = itemsize_divisor.get(col.dtype.type)
        if divisor is not None:
            metadata[f'table::len::{name}'] = str(
                col.dtype.itemsize // divisor)
    metadata['table_meta_yaml'] = meta_yaml_str

    # Pyarrow stores all metadata as byte strings, so both keys and values
    # are explicitly encoded as UTF-8 here.
    metadata_encode = {}
    for key, value in metadata.items():
        metadata_encode[key.encode('UTF-8')] = value.encode('UTF-8')

    # The pyarrow schema maps each column's numpy dtype to the equivalent
    # pyarrow type via from_numpy_dtype().
    column_dtypes = encode_table.dtype
    type_list = [
        (name, pa.from_numpy_dtype(column_dtypes[name].type))
        for name in column_dtypes.names
    ]
    schema = pa.schema(type_list, metadata=metadata_encode)

    if os.path.exists(output):
        if not overwrite:
            raise OSError(NOT_OVERWRITING_MSG.format(output))
        # The existing file must be removed prior to writing below.
        os.remove(output)

    # writer_version is whatever get_pyarrow() returned; the original note
    # here says version '2.0' is used for full support of datatypes
    # including uint32.
    with parquet.ParquetWriter(output, schema,
                               version=writer_version) as writer:
        # One pyarrow array per Table column, combined under the schema and
        # written out as a single pyarrow table.
        arrays = [pa.array(col) for col in encode_table.itercols()]
        writer.write_table(pa.Table.from_arrays(arrays, schema=schema))