Example #1
File: connect.py, Project: vrodgom/astropy
def _decode_mixins(tbl):
    """Decode a Table ``tbl`` that has astropy Columns + appropriate meta-data into
    the corresponding table with mixin columns (as appropriate).
    """
    # If available read in __serialized_columns__ meta info which is stored
    # in FITS COMMENTS between two sentinels.
    try:
        i0 = tbl.meta['comments'].index('--BEGIN-ASTROPY-SERIALIZED-COLUMNS--')
        i1 = tbl.meta['comments'].index('--END-ASTROPY-SERIALIZED-COLUMNS--')
    except (ValueError, KeyError):
        return tbl

    # The YAML data are split into COMMENT cards, with lines longer than 70
    # characters being split with a continuation character \ (backslash).
    # Strip the backslashes and join together.
    continuation_line = False
    lines = []
    for line in tbl.meta['comments'][i0 + 1:i1]:
        if continuation_line:
            lines[-1] = lines[-1] + line[:70]
        else:
            lines.append(line[:70])
        continuation_line = len(line) == 71

    del tbl.meta['comments'][i0:i1 + 1]
    if not tbl.meta['comments']:
        del tbl.meta['comments']

    try:
        info = meta.get_header_from_yaml(lines)
    except ImportError as exc:
        if 'PyYAML package is required' in str(exc):
            warnings.warn(
                "the file contains information about Astropy native objects "
                "(mixin columns) that have been serialized when writing it, "
                "but the PyYAML package is required to read those. Without "
                "this package some information will be missing in the table",
                AstropyUserWarning
            )
            return tbl
        else:
            raise

    # Add serialized column information to table meta for use in constructing mixins
    tbl.meta['__serialized_columns__'] = info['meta']['__serialized_columns__']

    # Use the `datatype` attribute info to update column attributes that are
    # NOT already handled via standard FITS column keys (name, dtype, unit).
    for col in info['datatype']:
        for attr in ['description', 'meta']:
            if attr in col:
                setattr(tbl[col['name']].info, attr, col[attr])

    # Construct new table with mixins, using tbl.meta['__serialized_columns__']
    # as guidance.
    tbl = serialize._construct_mixins_from_columns(tbl)

    return tbl
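A minimal round-trip sketch of what _decode_mixins() supports (not from the project above; it assumes a recent astropy whose FITS writer stores the serialized-column YAML between the BEGIN/END sentinels): write a Table containing a SkyCoord mixin column, then read it back.

import astropy.units as u
from astropy.coordinates import SkyCoord
from astropy.table import Table

t = Table()
t['name'] = ['a', 'b']
t['coord'] = SkyCoord([10.0, 20.0] * u.deg, [-5.0, 5.0] * u.deg)

# Writing stores the coordinate as plain data columns plus YAML COMMENT cards
# between the two sentinels shown above.
t.write('coords.fits', overwrite=True)

# Reading runs through _decode_mixins() and rebuilds the SkyCoord column.
t2 = Table.read('coords.fits')
print(type(t2['coord']))   # SkyCoord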
Example #2
File: utils.py, Project: minaskar/thejoker
def table_contains_column(root, column):
    from .samples import JokerSamples

    path = meta_path(JokerSamples._hdf5_path)
    header = get_header_from_yaml(h.decode('utf-8') for h in root[path])

    columns = []
    for row in header['datatype']:
        columns.append(row['name'])

    return column in columns
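A hedged usage sketch of the helper above; the file name and the column name 'P' are invented, and it assumes the HDF5 file was written by thejoker so that the metadata dataset at meta_path(JokerSamples._hdf5_path) exists.

import h5py

with h5py.File('samples.hdf5', 'r') as root:      # hypothetical file
    if table_contains_column(root, 'P'):          # hypothetical column name
        print('column P is present in the stored samples table')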
Example #3
File: utils.py, Project: minaskar/thejoker
def table_header_to_units(header_dataset):
    """
    Convert a YAML-ized astropy.table header into a dictionary that maps from
    column name to unit.
    """

    header = get_header_from_yaml(h.decode('utf-8')
                                  for h in header_dataset)

    units = dict()
    for row in header['datatype']:
        units[row['name']] = u.Unit(row.get('unit', u.one))

    return units
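The header consumed here is the same YAML that astropy emits via get_yaml_from_table, so the unit mapping can be checked without touching a file. A small sketch using astropy's private astropy.table.meta module (as the examples on this page already do):

import astropy.units as u
from astropy.table import Table, meta

t = Table({'flux': [1.0, 2.0], 'name': ['a', 'b']})
t['flux'].unit = u.Jy

header = meta.get_header_from_yaml(meta.get_yaml_from_table(t))
units = {row['name']: u.Unit(row.get('unit', u.one))
         for row in header['datatype']}
print(units)   # {'flux': Unit("Jy"), 'name': dimensionless}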
Example #4
def recursively_read_dict_contents(input):
    """
    Will recursive read a dictionary, initializing quantities and table from a dictionary read from an hdf5 file.

    Parameters
    ----------
    input : dict
        dictionary read from hdf5

    Returns
    --------
    dict
        Dictionary we want to use

    """
    new_keys = [k for k in input.keys()]
    # if all(elem in new_keys for elem in ['wl_grid', 'data', 'time_grid']):
    #     wl = input['wl_grid']['value'] * u.Unit(input['wl_grid']['unit'])
    #     data = input['data']['value'] * u.Unit(input['data']['unit'])
    #     time = input['time_grid']['value'] * u.Unit(input['time_grid']['unit'])
    #     input = signal[str(input['data']['unit'])](wl, data, time)
    if all(elem in new_keys for elem in ['value', 'unit']):
        input['value'] = input['value'] * u.Unit(input['unit'])
    if any('.__table_column_meta__' in elem for elem in new_keys):
        table_keys = [
            elem for elem in new_keys if '.__table_column_meta__' in elem
        ]
        table_keys = (elem.split('.')[0] for elem in table_keys)
        for k in table_keys:
            table = Table(np.array(input[k]))
            header = meta.get_header_from_yaml(
                h.decode('utf-8')
                for h in input['{}.__table_column_meta__'.format(k)])
            header_cols = dict((x['name'], x) for x in header['datatype'])
            for col in table.columns.values():
                for attr in ('description', 'format', 'unit', 'meta'):
                    if attr in header_cols[col.name]:
                        setattr(col, attr, header_cols[col.name][attr])
            table = serialize._construct_mixins_from_columns(table)
            try:
                header['meta'].pop('__serialized_columns__')
                table.meta = header['meta']
            except KeyError:
                pass
            input[k] = table
    for key in new_keys:
        if isinstance(input[key], dict):
            input[key] = recursively_read_dict_contents(input[key])
    return input
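For context, the '<key>.__table_column_meta__' datasets this reader looks for are what astropy's HDF5 writer produces when serialize_meta=True. A small sketch (file and path names are arbitrary):

import h5py
import astropy.units as u
from astropy.table import Table

t = Table({'wl': [1.0, 2.0] * u.um})
t.write('archive.h5', path='spectrum', serialize_meta=True, overwrite=True)

with h5py.File('archive.h5', 'r') as f:
    # Two datasets: the data itself and its serialized column metadata.
    print(list(f.keys()))   # ['spectrum', 'spectrum.__table_column_meta__']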
Example #5
def load_table(input_table, k):
    table = Table(np.array(input_table[k]))
    header = meta.get_header_from_yaml(
        h.decode('utf-8')
        for h in input_table['{}.__table_column_meta__'.format(k)])
    header_cols = dict((x['name'], x) for x in header['datatype'])
    for col in table.columns.values():
        for attr in ('description', 'format', 'unit', 'meta'):
            if attr in header_cols[col.name]:
                setattr(col, attr, header_cols[col.name][attr])
    table = serialize._construct_mixins_from_columns(table)
    try:
        header['meta'].pop('__serialized_columns__')
        table.meta = header['meta']
    except KeyError:
        pass
    return table
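A hedged call sketch for load_table(), reusing the kind of file shown in the previous sketch (it assumes input_table is an open h5py file or group containing both 'spectrum' and 'spectrum.__table_column_meta__'):

import h5py

with h5py.File('archive.h5', 'r') as f:
    table = load_table(f, 'spectrum')
    print(table['wl'].unit)   # unit recovered from the YAML header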
Example #6
    def _read_tables(cls, group, path=None):
        if path is None:
            path = cls._hdf5_path

        samples = group[f'{path}']
        metadata = group[f'{path}.__table_column_meta__']

        header = meta.get_header_from_yaml(
            h.decode('utf-8') for h in metadata.read())

        table = Table(np.array(samples.read()))
        if 'meta' in list(header.keys()):
            table.meta = header['meta']

        table = serialize._construct_mixins_from_columns(table)

        return cls(table)
Example #7
def _decode_mixins(tbl):
    """Decode a Table ``tbl`` that has astropy Columns + appropriate meta-data into
    the corresponding table with mixin columns (as appropriate).
    """
    # If available read in __serialized_columns__ meta info which is stored
    # in FITS COMMENTS between two sentinels.
    try:
        i0 = tbl.meta['comments'].index('--BEGIN-ASTROPY-SERIALIZED-COLUMNS--')
        i1 = tbl.meta['comments'].index('--END-ASTROPY-SERIALIZED-COLUMNS--')
    except (ValueError, KeyError):
        return tbl

    # The YAML data are split into COMMENT cards, with lines longer than 70
    # characters being split with a continuation character \ (backslash).
    # Strip the backslashes and join together.
    continuation_line = False
    lines = []
    for line in tbl.meta['comments'][i0 + 1:i1]:
        if continuation_line:
            lines[-1] = lines[-1] + line[:70]
        else:
            lines.append(line[:70])
        continuation_line = len(line) == 71

    del tbl.meta['comments'][i0:i1 + 1]
    if not tbl.meta['comments']:
        del tbl.meta['comments']
    info = meta.get_header_from_yaml(lines)

    # Add serialized column information to table meta for use in constructing mixins
    tbl.meta['__serialized_columns__'] = info['meta']['__serialized_columns__']

    # Use the `datatype` attribute info to update column attributes that are
    # NOT already handled via standard FITS column keys (name, dtype, unit).
    for col in info['datatype']:
        for attr in ['description', 'meta']:
            if attr in col:
                setattr(tbl[col['name']].info, attr, col[attr])

    # Construct new table with mixins, using tbl.meta['__serialized_columns__']
    # as guidance.
    tbl = serialize._construct_mixins_from_columns(tbl)

    return tbl
Example #8
File: ecsv.py, Project: MaxNoe/astropy
    def get_cols(self, lines):
        """
        Initialize the header Column objects from the table ``lines``.

        Parameters
        ----------
        lines : list
            List of table lines

        """
        # Cache a copy of the original input lines before processing below
        raw_lines = lines

        # Extract non-blank comment (header) lines with comment character stripped
        lines = list(self.process_lines(lines))

        # Validate that this is an ECSV file
        ecsv_header_re = r"""%ECSV [ ]
                             (?P<major> \d+)
                             \. (?P<minor> \d+)
                             \.? (?P<bugfix> \d+)? $"""

        no_header_msg = ('ECSV header line like "# %ECSV <version>" not found as first line.'
                         '  This is required for an ECSV file.')

        if not lines:
            raise core.InconsistentTableError(no_header_msg)

        match = re.match(ecsv_header_re, lines[0].strip(), re.VERBOSE)
        if not match:
            raise core.InconsistentTableError(no_header_msg)
        # ecsv_version could be constructed here, but it is not currently used.

        try:
            header = meta.get_header_from_yaml(lines)
        except ImportError as exc:
            if 'PyYAML package is required' in str(exc):
                warnings.warn("file looks like ECSV format but PyYAML is not installed "
                              "so it cannot be parsed as ECSV",
                              AstropyWarning)
            raise core.InconsistentTableError('unable to parse yaml in meta header'
                                              ' (PyYAML package is required)')
        except meta.YamlParseError:
            raise core.InconsistentTableError('unable to parse yaml in meta header')

        if 'meta' in header:
            self.table_meta = header['meta']

        if 'delimiter' in header:
            delimiter = header['delimiter']
            if delimiter not in DELIMITERS:
                raise ValueError('only space and comma are allowed for delimiter in ECSV format')
            self.splitter.delimiter = delimiter
            self.data.splitter.delimiter = delimiter

        # Create the list of io.ascii column objects from `header`
        header_cols = OrderedDict((x['name'], x) for x in header['datatype'])
        self.names = [x['name'] for x in header['datatype']]

        # Read the first non-commented line of table and split to get the CSV
        # header column names.  This is essentially what the Basic reader does.
        header_line = next(super().process_lines(raw_lines))
        header_names = next(self.splitter([header_line]))

        # Check for consistency of the ECSV vs. CSV header column names
        if header_names != self.names:
            raise core.InconsistentTableError('column names from ECSV header {} do not '
                             'match names from header line of CSV data {}'
                             .format(self.names, header_names))

        # BaseHeader method to create self.cols, which is a list of
        # io.ascii.core.Column objects (*not* Table Column objects).
        self._set_cols_from_names()

        # Transfer attributes from the column descriptor stored in the input
        # header YAML metadata to the new columns to create this table.
        for col in self.cols:
            for attr in ('description', 'format', 'unit', 'meta'):
                if attr in header_cols[col.name]:
                    setattr(col, attr, header_cols[col.name][attr])
            col.dtype = header_cols[col.name]['datatype']
            # ECSV "string" means numpy dtype.kind == 'U' AKA str in Python 3
            if col.dtype == 'string':
                col.dtype = 'str'
            if col.dtype.startswith('complex'):
                raise TypeError('ecsv reader does not support complex number types')
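To see what get_cols() is parsing, here is a tiny ECSV payload read through the public interface; the column names, unit, and meta values are invented.

from astropy.table import Table

ecsv_text = """\
# %ECSV 0.9
# ---
# datatype:
# - {name: a, datatype: int64, description: row id}
# - {name: b, datatype: float64, unit: m}
# meta: {origin: example}
a b
1 2.5
2 3.5
"""

t = Table.read(ecsv_text, format='ascii.ecsv')
print(t['b'].unit)          # m
print(t['a'].description)   # row id
print(t.meta)               # {'origin': 'example'}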
Example #9
File: ecsv.py, Project: seberg/astropy
    def get_cols(self, lines):
        """
        READ: Initialize the header Column objects from the table ``lines``.

        Parameters
        ----------
        lines : list
            List of table lines

        """
        # Cache a copy of the original input lines before processing below
        raw_lines = lines

        # Extract non-blank comment (header) lines with comment character stripped
        lines = list(self.process_lines(lines))

        # Validate that this is an ECSV file
        ecsv_header_re = r"""%ECSV [ ]
                             (?P<major> \d+)
                             \. (?P<minor> \d+)
                             \.? (?P<bugfix> \d+)? $"""

        no_header_msg = (
            'ECSV header line like "# %ECSV <version>" not found as first line.'
            '  This is required for an ECSV file.')

        if not lines:
            raise core.InconsistentTableError(no_header_msg)

        match = re.match(ecsv_header_re, lines[0].strip(), re.VERBOSE)
        if not match:
            raise core.InconsistentTableError(no_header_msg)
        # ecsv_version could be constructed here, but it is not currently used.

        try:
            header = meta.get_header_from_yaml(lines)
        except ImportError as exc:
            if 'PyYAML package is required' in str(exc):
                warnings.warn(
                    "file looks like ECSV format but PyYAML is not installed "
                    "so it cannot be parsed as ECSV", AstropyWarning)
            raise core.InconsistentTableError(
                'unable to parse yaml in meta header'
                ' (PyYAML package is required)')
        except meta.YamlParseError:
            raise core.InconsistentTableError(
                'unable to parse yaml in meta header')

        if 'meta' in header:
            self.table_meta = header['meta']

        if 'delimiter' in header:
            delimiter = header['delimiter']
            if delimiter not in DELIMITERS:
                raise ValueError(
                    'only space and comma are allowed for delimiter in ECSV format'
                )
            self.splitter.delimiter = delimiter
            self.data.splitter.delimiter = delimiter

        # Create the list of io.ascii column objects from `header`
        header_cols = OrderedDict((x['name'], x) for x in header['datatype'])
        self.names = [x['name'] for x in header['datatype']]

        # Read the first non-commented line of table and split to get the CSV
        # header column names.  This is essentially what the Basic reader does.
        header_line = next(super().process_lines(raw_lines))
        header_names = next(self.splitter([header_line]))

        # Check for consistency of the ECSV vs. CSV header column names
        if header_names != self.names:
            raise core.InconsistentTableError(
                'column names from ECSV header {} do not '
                'match names from header line of CSV data {}'.format(
                    self.names, header_names))

        # BaseHeader method to create self.cols, which is a list of
        # io.ascii.core.Column objects (*not* Table Column objects).
        self._set_cols_from_names()

        # Transfer attributes from the column descriptor stored in the input
        # header YAML metadata to the new columns to create this table.
        for col in self.cols:
            for attr in ('description', 'format', 'unit', 'meta', 'subtype'):
                if attr in header_cols[col.name]:
                    setattr(col, attr, header_cols[col.name][attr])

            col.dtype = header_cols[col.name]['datatype']
            if col.dtype not in ECSV_DATATYPES:
                raise ValueError(
                    f'datatype {col.dtype!r} of column {col.name!r} '
                    f'is not in allowed values {ECSV_DATATYPES}')

            # Subtype is written like "int64[2,null]" and we want to split this
            # out to "int64" and [2, None].
            subtype = col.subtype
            if subtype and '[' in subtype:
                idx = subtype.index('[')
                col.subtype = subtype[:idx]
                col.shape = json.loads(subtype[idx:])

            # Convert ECSV "string" to numpy "str"
            for attr in ('dtype', 'subtype'):
                if getattr(col, attr) == 'string':
                    setattr(col, attr, 'str')

            # ECSV subtype of 'json' maps to numpy 'object' dtype
            if col.subtype == 'json':
                col.subtype = 'object'
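The subtype handling above only comes into play for multidimensional or object columns. A round-trip sketch, assuming an astropy recent enough to write ECSV with subtype (roughly 4.3 or later):

import io
import numpy as np
from astropy.table import Table

t = Table({'v': np.arange(6, dtype='int64').reshape(3, 2)})   # each row is a length-2 vector
buf = io.StringIO()
t.write(buf, format='ascii.ecsv')

# The column is stored as JSON-encoded strings with a subtype such as
# "int64[2]", which is what the split-on-'[' code above parses back out.
print([line for line in buf.getvalue().splitlines() if 'subtype' in line])

t2 = Table.read(buf.getvalue(), format='ascii.ecsv')
print(t2['v'].shape)   # (3, 2)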
Example #10
def read_table_hdf5(input, path=None, character_as_bytes=True):
    """
    Read a Table object from an HDF5 file

    This requires `h5py <http://www.h5py.org/>`_ to be installed. If more than one
    table is present in the HDF5 file or group, the first table is read in and
    a warning is displayed.

    Parameters
    ----------
    input : str or :class:`h5py.File` or :class:`h5py.Group` or
        :class:`h5py.Dataset`
        If a string, the filename to read the table from.
        If an h5py object, either the file or the group object to read the
        table from.
    path : str
        The path from which to read the table inside the HDF5 file.
        This should be relative to the input file or group.
    character_as_bytes : bool
        If `True` then Table columns are left as bytes.
        If `False` then Table columns are converted to unicode.
    """

    try:
        import h5py
    except ImportError:
        raise Exception("h5py is required to read and write HDF5 files")

    # This function is recursive, and only gets to reading the table once the
    # input is an hdf5 Dataset. Moreover, the input variable is changed in
    # place.
    # Here, we save its value to be used at the end when the conditions are
    # right.
    input_save = input
    if isinstance(input, (h5py.File, h5py.Group)):

        # If a path was specified, follow the path

        if path is not None:
            try:
                input = input[path]
            except (KeyError, ValueError):
                raise OSError(f"Path {path} does not exist")

        # `input` is now either a group or a dataset. If it is a group, we
        # will search for all structured arrays inside the group, and if there
        # is one we can proceed otherwise an error is raised. If it is a
        # dataset, we just proceed with the reading.

        if isinstance(input, h5py.Group):

            # Find all structured arrays in group
            arrays = _find_all_structured_arrays(input)

            if len(arrays) == 0:
                raise ValueError(f"no table found in HDF5 group {path}")
            elif len(arrays) > 0:
                path = arrays[0] if path is None else path + '/' + arrays[0]
                if len(arrays) > 1:
                    warnings.warn(
                        "path= was not specified but multiple tables"
                        " are present, reading in first available"
                        " table (path={})".format(path), AstropyUserWarning)
                return read_table_hdf5(input, path=path)

    elif not isinstance(input, h5py.Dataset):

        # If a file object was passed, then we need to extract the filename
        # because h5py cannot properly read in file objects.

        if hasattr(input, 'read'):
            try:
                input = input.name
            except AttributeError:
                raise TypeError("h5py can only open regular files")

        # Open the file for reading, and recursively call read_table_hdf5 with
        # the file object and the path.

        f = h5py.File(input, 'r')

        try:
            return read_table_hdf5(f,
                                   path=path,
                                   character_as_bytes=character_as_bytes)
        finally:
            f.close()

    # If we are here, `input` should be a Dataset object, which we can now
    # convert to a Table.

    # Create a Table object
    from astropy.table import Table, meta, serialize

    table = Table(np.array(input))

    # Read the meta-data from the file. For back-compatibility, we can read
    # the old file format where the serialized metadata were saved in the
    # attributes of the HDF5 dataset.
    # In the new format, instead, metadata are stored in a new dataset in the
    # same file. This is introduced in Astropy 3.0
    old_version_meta = META_KEY in input.attrs
    new_version_meta = path is not None and meta_path(path) in input_save
    if old_version_meta or new_version_meta:
        if new_version_meta:
            header = meta.get_header_from_yaml(
                h.decode('utf-8') for h in input_save[meta_path(path)])
        else:
            # Must be old_version_meta is True. if (A or B) and not A then B is True
            header = meta.get_header_from_yaml(
                h.decode('utf-8') for h in input.attrs[META_KEY])
        if 'meta' in list(header.keys()):
            table.meta = header['meta']

        header_cols = dict((x['name'], x) for x in header['datatype'])
        for col in table.columns.values():
            for attr in ('description', 'format', 'unit', 'meta'):
                if attr in header_cols[col.name]:
                    setattr(col, attr, header_cols[col.name][attr])

        # Construct new table with mixins, using tbl.meta['__serialized_columns__']
        # as guidance.
        table = serialize._construct_mixins_from_columns(table)

    else:
        # Read the meta-data from the file
        table.meta.update(input.attrs)

    if not character_as_bytes:
        table.convert_bytestring_to_unicode()

    return table
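This is the function behind Table.read(..., format='hdf5'). A usage sketch that writes a table with serialize_meta=True and reads it back; the file and path names are arbitrary.

import astropy.units as u
from astropy.table import Table

t = Table({'wl': [1.0, 2.0] * u.um})
t.write('spectra.h5', path='obs', serialize_meta=True, overwrite=True)

# character_as_bytes=False converts any bytes columns to unicode on read.
t2 = Table.read('spectra.h5', path='obs', character_as_bytes=False)
print(repr(t2['wl']))   # Quantity column rebuilt from the serialized metadata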
Example #11
File: hdf5.py, Project: Cadair/astropy
def read_table_hdf5(input, path=None, character_as_bytes=True):
    """
    Read a Table object from an HDF5 file

    This requires `h5py <http://www.h5py.org/>`_ to be installed. If more than one
    table is present in the HDF5 file or group, the first table is read in and
    a warning is displayed.

    Parameters
    ----------
    input : str or :class:`h5py:File` or :class:`h5py:Group` or
        :class:`h5py:Dataset`
        If a string, the filename to read the table from.
        If an h5py object, either the file or the group object to read the
        table from.
    path : str
        The path from which to read the table inside the HDF5 file.
        This should be relative to the input file or group.
    character_as_bytes : bool
        If `True` then Table columns are left as bytes.
        If `False` then Table columns are converted to unicode.
    """

    try:
        import h5py
    except ImportError:
        raise Exception("h5py is required to read and write HDF5 files")

    # This function is recursive, and only gets to reading the table once the
    # input is an hdf5 Dataset. Moreover, the input variable is changed in
    # place.
    # Here, we save its value to be used at the end when the conditions are
    # right.
    input_save = input
    if isinstance(input, (h5py.File, h5py.Group)):

        # If a path was specified, follow the path

        if path is not None:
            try:
                input = input[path]
            except (KeyError, ValueError):
                raise OSError("Path {0} does not exist".format(path))

        # `input` is now either a group or a dataset. If it is a group, we
        # will search for all structured arrays inside the group, and if there
        # is one we can proceed otherwise an error is raised. If it is a
        # dataset, we just proceed with the reading.

        if isinstance(input, h5py.Group):

            # Find all structured arrays in group
            arrays = _find_all_structured_arrays(input)

            if len(arrays) == 0:
                raise ValueError("no table found in HDF5 group {0}".
                                 format(path))
            elif len(arrays) > 0:
                path = arrays[0] if path is None else path + '/' + arrays[0]
                if len(arrays) > 1:
                    warnings.warn("path= was not specified but multiple tables"
                                  " are present, reading in first available"
                                  " table (path={0})".format(path),
                                  AstropyUserWarning)
                return read_table_hdf5(input, path=path)

    elif not isinstance(input, h5py.Dataset):

        # If a file object was passed, then we need to extract the filename
        # because h5py cannot properly read in file objects.

        if hasattr(input, 'read'):
            try:
                input = input.name
            except AttributeError:
                raise TypeError("h5py can only open regular files")

        # Open the file for reading, and recursively call read_table_hdf5 with
        # the file object and the path.

        f = h5py.File(input, 'r')

        try:
            return read_table_hdf5(f, path=path, character_as_bytes=character_as_bytes)
        finally:
            f.close()

    # If we are here, `input` should be a Dataset object, which we can now
    # convert to a Table.

    # Create a Table object
    from astropy.table import Table, meta, serialize

    table = Table(np.array(input))

    # Read the meta-data from the file. For back-compatibility, we can read
    # the old file format where the serialized metadata were saved in the
    # attributes of the HDF5 dataset.
    # In the new format, instead, metadata are stored in a new dataset in the
    # same file. This is introduced in Astropy 3.0
    old_version_meta = META_KEY in input.attrs
    new_version_meta = path is not None and meta_path(path) in input_save
    if old_version_meta or new_version_meta:
        if new_version_meta:
            header = meta.get_header_from_yaml(
                h.decode('utf-8') for h in input_save[meta_path(path)])
        elif old_version_meta:
            header = meta.get_header_from_yaml(
                h.decode('utf-8') for h in input.attrs[META_KEY])
        if 'meta' in list(header.keys()):
            table.meta = header['meta']

        header_cols = dict((x['name'], x) for x in header['datatype'])
        for col in table.columns.values():
            for attr in ('description', 'format', 'unit', 'meta'):
                if attr in header_cols[col.name]:
                    setattr(col, attr, header_cols[col.name][attr])

        # Construct new table with mixins, using tbl.meta['__serialized_columns__']
        # as guidance.
        table = serialize._construct_mixins_from_columns(table)

    else:
        # Read the meta-data from the file
        table.meta.update(input.attrs)

    if not character_as_bytes:
        table.convert_bytestring_to_unicode()

    return table
Example #12
def read_table_parquet(input,
                       include_names=None,
                       exclude_names=None,
                       schema_only=False,
                       filters=None):
    """
    Read a Table object from a Parquet file.

    This requires `pyarrow <https://arrow.apache.org/docs/python/>`_
    to be installed.

    The ``filters`` parameter consists of predicates that are expressed
    in disjunctive normal form (DNF), like ``[[('x', '=', 0), ...], ...]``.
    DNF allows arbitrary boolean logical combinations of single column
    predicates. The innermost tuples each describe a single column predicate.
    The list of inner predicates is interpreted as a conjunction (AND),
    forming a more selective and multiple column predicate. Finally, the most
    outer list combines these filters as a disjunction (OR).

    Predicates may also be passed as List[Tuple]. This form is interpreted
    as a single conjunction. To express OR in predicates, one must
    use the (preferred) List[List[Tuple]] notation.

    Each tuple has format: (``key``, ``op``, ``value``) and compares the
    ``key`` with the ``value``.
    The supported ``op`` are:  ``=`` or ``==``, ``!=``, ``<``, ``>``, ``<=``,
    ``>=``, ``in`` and ``not in``. If the ``op`` is ``in`` or ``not in``, the
    ``value`` must be a collection such as a ``list``, a ``set`` or a
    ``tuple``.

    Examples:

    .. code-block:: python

        ('x', '=', 0)
        ('y', 'in', ['a', 'b', 'c'])
        ('z', 'not in', {'a','b'})

    Parameters
    ----------
    input : str or path-like or file-like object
        If a string or path-like object, the filename to read the table from.
        If a file-like object, the stream to read data.
    include_names : list [str], optional
        List of names to include in output. If not supplied, then
        include all columns.
    exclude_names : list [str], optional
        List of names to exclude from output (applied after ``include_names``).
        If not supplied then no columns are excluded.
    schema_only : bool, optional
        Only read the schema/metadata with table information.
    filters : list [tuple] or list [list [tuple] ] or None, optional
        Rows which do not match the filter predicate will be removed from
        scanned data.  See `pyarrow.parquet.read_table()` for details.

    Returns
    -------
    table : `~astropy.table.Table`
        Table will have zero rows and only metadata information
        if schema_only is True.
    """
    pa, parquet, _ = get_pyarrow()

    if not isinstance(input, (str, os.PathLike)):
        # The 'read' attribute is the key component of a generic
        # file-like object.
        if not hasattr(input, 'read'):
            raise TypeError(
                "pyarrow can only open path-like or file-like objects.")

    schema = parquet.read_schema(input)

    # Pyarrow stores all metadata as byte-strings, so we convert
    # to UTF-8 strings here.
    if schema.metadata is not None:
        md = {
            k.decode('UTF-8'): v.decode('UTF-8')
            for k, v in schema.metadata.items()
        }
    else:
        md = {}

    from astropy.table import Table, meta, serialize

    # parse metadata from table yaml
    meta_dict = {}
    if 'table_meta_yaml' in md:
        meta_yaml = md.pop('table_meta_yaml').split('\n')
        meta_hdr = meta.get_header_from_yaml(meta_yaml)
        if 'meta' in meta_hdr:
            meta_dict = meta_hdr['meta']
    else:
        meta_hdr = None

    # parse and set serialized columns
    full_table_columns = {name: name for name in schema.names}
    has_serialized_columns = False
    if '__serialized_columns__' in meta_dict:
        has_serialized_columns = True
        serialized_columns = meta_dict['__serialized_columns__']
        for scol in serialized_columns:
            for name in _get_names(serialized_columns[scol]):
                full_table_columns[name] = scol

    use_names = set(full_table_columns.values())
    # Apply include_names before exclude_names
    if include_names is not None:
        use_names.intersection_update(include_names)
    if exclude_names is not None:
        use_names.difference_update(exclude_names)
    # Preserve column ordering via list, and use this dict trick
    # to remove duplicates and preserve ordering (for mixin columns)
    use_names = list(
        dict.fromkeys(
            [x for x in full_table_columns.values() if x in use_names]))

    # names_to_read is a list of actual serialized column names, where
    # e.g. the requested name 'time' becomes ['time.jd1', 'time.jd2']
    names_to_read = []
    for name in use_names:
        names = [n for n, col in full_table_columns.items() if name == col]
        names_to_read.extend(names)

    if not names_to_read:
        raise ValueError("No include_names specified were found in the table.")

    # We need to pop any unread serialized columns out of the meta_dict.
    if has_serialized_columns:
        for scol in list(meta_dict['__serialized_columns__'].keys()):
            if scol not in use_names:
                meta_dict['__serialized_columns__'].pop(scol)

    # whether to return the whole table or a formatted empty table.
    if not schema_only:
        # Read the pyarrow table, specifying columns and filters.
        pa_table = parquet.read_table(input,
                                      columns=names_to_read,
                                      filters=filters)
        num_rows = pa_table.num_rows
    else:
        num_rows = 0

    # Now need to convert parquet table to Astropy
    dtype = []
    for name in names_to_read:
        # Pyarrow string and byte columns do not have native length information
        # so we must determine those here.
        if schema.field(name).type not in (pa.string(), pa.binary()):
            # Convert the pyarrow type into a numpy dtype (which is returned
            # by the to_pandas_type() method).
            dtype.append(schema.field(name).type.to_pandas_dtype())
            continue

        # Special-case for string and binary columns
        md_name = f'table::len::{name}'
        if md_name in md:
            # String/bytes length from header.
            strlen = int(md[md_name])
        elif schema_only:
            # Choose an arbitrary string length since we are not reading in
            # the table.
            strlen = 10
            warnings.warn(
                f"No {md_name} found in metadata. "
                f"Guessing {strlen} for schema.", AstropyUserWarning)
        else:
            # Find the maximum string length in the data.
            strlen = max(len(row.as_py()) for row in pa_table[name])
            warnings.warn(
                f"No {md_name} found in metadata. "
                f"Using longest string ({strlen} characters).",
                AstropyUserWarning)
        dtype.append(f'U{strlen}' if schema.field(name).type ==
                     pa.string() else f'|S{strlen}')

    # Create the empty numpy record array to store the pyarrow data.
    data = np.zeros(num_rows, dtype=list(zip(names_to_read, dtype)))

    if not schema_only:
        # Convert each column in the pyarrow table to a numpy array
        for name in names_to_read:
            data[name][:] = pa_table[name].to_numpy()

    table = Table(data=data, meta=meta_dict)

    if meta_hdr is not None:
        # Set description, format, unit, meta from the column
        # metadata that was serialized with the table.
        header_cols = dict((x['name'], x) for x in meta_hdr['datatype'])
        for col in table.columns.values():
            for attr in ('description', 'format', 'unit', 'meta'):
                if attr in header_cols[col.name]:
                    setattr(col, attr, header_cols[col.name][attr])

    # Convert all compound columns to astropy objects
    # (e.g. time.jd1, time.jd2 into a single time column)
    table = serialize._construct_mixins_from_columns(table)

    return table
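Usage sketch through the registered reader (needs pyarrow; the file name, column names, and filter are invented):

from astropy.table import Table

# Read only two columns and push a row-level filter down to pyarrow.
cat = Table.read('catalog.parquet', format='parquet',
                 include_names=['ra', 'dec'],
                 filters=[('dec', '>', 0.0)])

# schema_only=True returns a zero-row table carrying just the column metadata.
schema = Table.read('catalog.parquet', format='parquet', schema_only=True)
print(len(schema), schema.colnames)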
Example #13
def write_table_hdf5(table,
                     output,
                     path=None,
                     compression=False,
                     append=False,
                     overwrite=False,
                     serialize_meta=False,
                     metadata_conflicts='error',
                     **create_dataset_kwargs):
    """
    Write a Table object to an HDF5 file

    This requires `h5py <http://www.h5py.org/>`_ to be installed.

    Parameters
    ----------
    table : `~astropy.table.Table`
        Data table that is to be written to file.
    output : str or :class:`h5py:File` or :class:`h5py:Group`
        If a string, the filename to write the table to. If an h5py object,
        either the file or the group object to write the table to.
    path : str
        The path to which to write the table inside the HDF5 file.
        This should be relative to the input file or group.
        If not specified, defaults to ``__astropy_table__``.
    compression : bool or str or int
        Whether to compress the table inside the HDF5 file. If set to `True`,
        ``'gzip'`` compression is used. If a string is specified, it should be
        one of ``'gzip'``, ``'szip'``, or ``'lzf'``. If an integer is
        specified (in the range 0-9), ``'gzip'`` compression is used, and the
        integer denotes the compression level.
    append : bool
        Whether to append the table to an existing HDF5 file.
    overwrite : bool
        Whether to overwrite any existing file without warning.
        If ``append=True`` and ``overwrite=True`` then only the dataset will be
        replaced; the file/group will not be overwritten.
    serialize_meta : bool
        Whether to serialize rich table meta-data when writing the HDF5 file,
        in particular such data required to write and read back mixin columns
        like ``Time``, ``SkyCoord``, or ``Quantity`` to the file.
    metadata_conflicts : str
        How to proceed with metadata conflicts. This should be one of:
            * ``'silent'``: silently pick the last conflicting meta-data value
            * ``'warn'``: pick the last conflicting meta-data value, but emit a
              warning
            * ``'error'``: raise an exception (default).
    **create_dataset_kwargs
        Additional keyword arguments are passed to `h5py.File.create_dataset`.
    """

    from astropy.table import meta
    try:
        import h5py
    except ImportError:
        raise Exception("h5py is required to read and write HDF5 files")

    if path is None:
        # table is just an arbitrary, hardcoded string here.
        path = '__astropy_table__'
    elif path.endswith('/'):
        raise ValueError("table path should end with table name, not /")

    if '/' in path:
        group, name = path.rsplit('/', 1)
    else:
        group, name = None, path

    if isinstance(output, (h5py.File, h5py.Group)):
        if len(list(output.keys())) > 0 and name == '__astropy_table__':
            raise ValueError("table path should always be set via the "
                             "path= argument when writing to existing "
                             "files")
        elif name == '__astropy_table__':
            warnings.warn("table path was not set via the path= argument; "
                          "using default path {}".format(path))

        if group:
            try:
                output_group = output[group]
            except (KeyError, ValueError):
                output_group = output.create_group(group)
        else:
            output_group = output

    elif isinstance(output, str):

        if os.path.exists(output) and not append:
            if overwrite and not append:
                os.remove(output)
            else:
                raise OSError(f"File exists: {output}")

        # Open the file for appending or writing
        f = h5py.File(output, 'a' if append else 'w')

        # Recursively call the write function
        try:
            return write_table_hdf5(table,
                                    f,
                                    path=path,
                                    compression=compression,
                                    append=append,
                                    overwrite=overwrite,
                                    serialize_meta=serialize_meta,
                                    **create_dataset_kwargs)
        finally:
            f.close()

    else:

        raise TypeError('output should be a string or an h5py File or '
                        'Group object')

    # Check whether table already exists
    existing_header = None
    if name in output_group:
        if append and overwrite:
            # Delete only the dataset itself
            del output_group[name]
        elif append:
            # Data table exists, so we interpret "append" to mean "extend
            # existing table with the table passed in". However, this requires
            # the table to have been written by this function in the past, so it
            # should have a metadata header
            if meta_path(name) not in output_group:
                raise ValueError("No metadata exists for existing table. We "
                                 "can only append tables if metadata "
                                 "is consistent for all tables")

            # Load existing table header:
            existing_header = get_header_from_yaml(
                h.decode('utf-8') for h in output_group[meta_path(name)])
        else:
            raise OSError(f"Table {path} already exists")

    # Encode any mixin columns as plain columns + appropriate metadata
    table = _encode_mixins(table)

    # Table with numpy unicode strings can't be written in HDF5 so
    # to write such a table a copy of table is made containing columns as
    # bytestrings.  Now this copy of the table can be written in HDF5.
    if any(col.info.dtype.kind == 'U' for col in table.itercols()):
        table = table.copy(copy_data=False)
        table.convert_unicode_to_bytestring()

    # Warn if information will be lost when serialize_meta=False.  This is
    # hardcoded to the set difference between column info attributes and what
    # HDF5 can store natively (name, dtype) with no meta.
    if serialize_meta is False:
        for col in table.itercols():
            for attr in ('unit', 'format', 'description', 'meta'):
                if getattr(col.info, attr, None) not in (None, {}):
                    warnings.warn(
                        "table contains column(s) with defined 'unit', 'format',"
                        " 'description', or 'meta' info attributes. These will"
                        " be dropped since serialize_meta=False.",
                        AstropyUserWarning)

    if existing_header is None:  # Just write the table and metadata
        # Write the table to the file
        if compression:
            if compression is True:
                compression = 'gzip'
            dset = output_group.create_dataset(name,
                                               data=table.as_array(),
                                               compression=compression,
                                               **create_dataset_kwargs)
        else:
            dset = output_group.create_dataset(name,
                                               data=table.as_array(),
                                               **create_dataset_kwargs)

        if serialize_meta:
            header_yaml = meta.get_yaml_from_table(table)

            header_encoded = [h.encode('utf-8') for h in header_yaml]
            output_group.create_dataset(meta_path(name), data=header_encoded)

        else:
            # Write the Table meta dict key:value pairs to the file as HDF5
            # attributes.  This works only for a limited set of scalar data types
            # like numbers, strings, etc., but not any complex types.  This path
            # also ignores column meta like unit or format.
            for key in table.meta:
                val = table.meta[key]
                try:
                    dset.attrs[key] = val
                except TypeError:
                    warnings.warn(
                        "Attribute `{}` of type {} cannot be written to "
                        "HDF5 files - skipping. (Consider specifying "
                        "serialize_meta=True to write all meta data)".format(
                            key, type(val)), AstropyUserWarning)

    else:  # We need to append the tables!
        try:
            # FIXME: do something with the merged metadata!
            metadata.merge(existing_header['meta'],
                           table.meta,
                           metadata_conflicts=metadata_conflicts)
        except metadata.MergeConflictError:
            raise metadata.MergeConflictError(
                "Cannot append table to existing file because "
                "the existing file table metadata and this "
                "table object's metadata do not match. If you "
                "want to ignore this issue, or change to a "
                "warning, set metadata_conflicts='silent' or 'warn'.")

        # Now compare datatype of this object and on disk
        this_header = get_header_from_yaml(get_yaml_from_table(table))

        if not _custom_tbl_dtype_compare(existing_header['datatype'],
                                         this_header['datatype']):
            raise ValueError(
                "Cannot append table to existing file because "
                "the existing file table datatype and this "
                "object's table datatype do not match. "
                f"{existing_header['datatype']} vs. {this_header['datatype']}")

        # If we got here, we can now try to append:
        current_size = len(output_group[name])
        output_group[name].resize((current_size + len(table), ))
        output_group[name][current_size:] = table.as_array()
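This looks like a modified version of astropy's write_table_hdf5: its append branch extends an existing dataset in place rather than adding a new one. A hedged usage sketch; the resize() at the end only succeeds if the first write created a resizable dataset, so maxshape is passed through **create_dataset_kwargs here as an assumption about the intended calling pattern.

from astropy.table import Table

t1 = Table({'x': [1.0, 2.0]})
write_table_hdf5(t1, 'log.hdf5', path='run', serialize_meta=True,
                 overwrite=True, maxshape=(None,))

t2 = Table({'x': [3.0, 4.0]})
write_table_hdf5(t2, 'log.hdf5', path='run', append=True,
                 serialize_meta=True)
# 'run' now holds 4 rows; the on-disk metadata header was merged with and
# checked for datatype compatibility against t2 before appending.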