Example #1
    @classmethod
    def from_arrow_schema(cls, parquet_dataset, omit_unsupported_fields=False):
        """
        Convert an Apache Arrow schema into a Unischema object. This is useful for datasets of scalars only,
        which need no special encoding/decoding. If the Arrow schema contains an unsupported type, an
        exception is raised.
        When the omit_unsupported_fields parameter is set to True, unsupported column types only emit
        warnings and the corresponding columns are skipped.

        We do not set the codec field on the generated fields since all parquet fields are supported
        out-of-the-box by pyarrow and we do not need to perform any custom decoding.

        :param parquet_dataset: :class:`pyarrow.parquet.ParquetDataset`
        :param omit_unsupported_fields: :class:`Boolean`
        :return: A :class:`Unischema` object.
        """
        meta = compat_get_metadata(parquet_dataset.pieces[0],
                                   parquet_dataset.fs.open)
        arrow_schema = meta.schema.to_arrow_schema()
        unischema_fields = []

        for partition in parquet_dataset.partitions:
            if (pa.types.is_binary(partition.dictionary.type) and six.PY2) or \
                    (pa.types.is_string(partition.dictionary.type) and six.PY3):
                numpy_dtype = np.str_
            elif pa.types.is_int64(partition.dictionary.type):
                numpy_dtype = np.int64
            else:
                raise RuntimeError((
                    'Expected partition type to be one of currently supported types: string or int64. '
                    'Got {}').format(partition.dictionary.type))

            unischema_fields.append(
                UnischemaField(partition.name, numpy_dtype, (), None, False))

        for column_name in arrow_schema.names:
            arrow_field = compat_schema_field(arrow_schema, column_name)
            field_type = arrow_field.type
            field_shape = ()
            if isinstance(field_type, ListType):
                if isinstance(field_type.value_type, ListType) or isinstance(
                        field_type.value_type, pyStructType):
                    warnings.warn(
                        '[ARROW-1644] Ignoring unsupported structure %r for field %r'
                        % (field_type, column_name))
                    continue
                field_shape = (None, )
            try:
                np_type = _numpy_and_codec_from_arrow_type(field_type)
            except ValueError:
                if omit_unsupported_fields:
                    warnings.warn(
                        'Column %r has an unsupported field %r. Ignoring...' %
                        (column_name, field_type))
                    continue
                else:
                    raise
            unischema_fields.append(
                UnischemaField(column_name, np_type, field_shape, None,
                               arrow_field.nullable))
        return Unischema('inferred_schema', unischema_fields)
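A minimal usage sketch for the method above, assuming it is exposed as the classmethod Unischema.from_arrow_schema in petastorm.unischema and that '/tmp/scalar_dataset' is a hypothetical path to a plain scalar parquet dataset:

import pyarrow.parquet as pq
from petastorm.unischema import Unischema

# Hypothetical dataset location; any directory of plain scalar parquet files would do.
dataset = pq.ParquetDataset('/tmp/scalar_dataset')

# Warn about (and skip) unsupported columns instead of raising.
inferred = Unischema.from_arrow_schema(dataset, omit_unsupported_fields=True)
print(inferred)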
Example #2
def _split_piece(piece, fs_open):
    # Split a file-level parquet piece into one piece per row group so that reading can be
    # scheduled at row-group granularity.
    metadata = compat_get_metadata(piece, fs_open)
    return [
        compat_make_parquet_piece(piece.path,
                                  fs_open,
                                  row_group=row_group,
                                  partition_keys=piece.partition_keys)
        for row_group in range(metadata.num_row_groups)
    ]
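A sketch of how _split_piece might be applied across a whole dataset to obtain one piece per row group; the dataset path and the flattening step are assumptions, not part of the original:

import itertools
import pyarrow.parquet as pq

dataset = pq.ParquetDataset('/tmp/scalar_dataset')  # hypothetical dataset location

# Expand every file-level piece into row-group-level pieces and flatten the result.
row_group_pieces = list(itertools.chain.from_iterable(
    _split_piece(piece, dataset.fs.open) for piece in dataset.pieces))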
Example #3
def add_to_dataset_metadata(dataset, key, value):
    """
    Adds a key and value to the parquet metadata file of a parquet dataset.
    :param dataset: (ParquetDataset) parquet dataset
    :param key:     (str) key of metadata entry
    :param value:   (str) value of metadata
    """
    if not isinstance(dataset.paths, str):
        raise ValueError(
            'Expected dataset.paths to be a single path, not a list of paths')

    metadata_file_path = dataset.paths.rstrip('/') + '/_metadata'
    common_metadata_file_path = dataset.paths.rstrip('/') + '/_common_metadata'
    common_metadata_file_crc_path = dataset.paths.rstrip(
        '/') + '/._common_metadata.crc'

    # If the metadata file already exists, add to it.
    # Otherwise fetch the schema from one of the existing parquet files in the dataset
    if dataset.fs.exists(common_metadata_file_path):
        with dataset.fs.open(common_metadata_file_path) as f:
            arrow_metadata = pyarrow.parquet.read_metadata(f)
    elif dataset.fs.exists(metadata_file_path):
        # If just the metadata file exists and not the common metadata file, copy the contents of
        # the metadata file to the common_metadata file for backwards compatibility
        with dataset.fs.open(metadata_file_path) as f:
            arrow_metadata = pyarrow.parquet.read_metadata(f)
    else:
        arrow_metadata = compat_get_metadata(dataset.pieces[0],
                                             dataset.fs.open)

    base_schema = arrow_metadata.schema.to_arrow_schema()

    # base_schema.metadata may be None (e.g. when the schema carries no key/value metadata yet),
    # so fall back to an empty dict before adding the new entry.
    metadata_dict = base_schema.metadata or dict()
    metadata_dict[key] = value
    schema = compat_with_metadata(base_schema, metadata_dict)

    with dataset.fs.open(common_metadata_file_path, 'wb') as metadata_file:
        pyarrow.parquet.write_metadata(schema, metadata_file)

    # We have just modified the _common_metadata file, but the filesystem implementation used by pyarrow does
    # not update the .crc value. Delete the .crc file to make sure there is no mismatch between the
    # _common_metadata content and its checksum.
    if isinstance(dataset.fs, LocalFileSystem) and dataset.fs.exists(
            common_metadata_file_crc_path):
        try:
            dataset.fs.rm(common_metadata_file_crc_path)
        except NotImplementedError:
            os.remove(common_metadata_file_crc_path)
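A usage sketch for add_to_dataset_metadata, assuming a single-directory dataset at the hypothetical path '/tmp/scalar_dataset'; the key and value below are made-up application-level metadata:

import pyarrow.parquet as pq

dataset = pq.ParquetDataset('/tmp/scalar_dataset')

# Store an application-specific entry in the dataset's _common_metadata file.
add_to_dataset_metadata(dataset, 'myapp.dataset_version', '42')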
Example #4
def test_asymetric_parquet_pieces(reader_factory, tmpdir):
    """Check that datasets with parquet files that all rows in datasets that have different number of rowgroups can
    be fully read """
    url = 'file://' + tmpdir.strpath

    ROWS_COUNT = 1000
    # Partitioning by id_div_700 forces an asymmetric split between partitions and, hopefully, gives us
    # files with different numbers of row groups.
    create_test_scalar_dataset(url, ROWS_COUNT, partition_by=['id_div_700'])

    # Verify that we have pieces with different numbers of row groups.
    dataset = pq.ParquetDataset(tmpdir.strpath)
    row_group_counts = set(compat_get_metadata(piece, dataset.fs.open).num_row_groups for piece in dataset.pieces)
    assert len(row_group_counts) > 1

    # Make sure we are not missing any rows.
    with reader_factory(url, schema_fields=['id']) as reader:
        row_ids_batched = [row.id for row in reader]
        actual_row_ids = list(itertools.chain(*row_ids_batched))

    assert ROWS_COUNT == len(actual_row_ids)
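The reader_factory fixture is not shown here. A possible stand-in, assuming petastorm's make_batch_reader, which yields batches whose columns are arrays (hence the chaining in the test):

import itertools
from petastorm import make_batch_reader

with make_batch_reader('file:///tmp/scalar_dataset', schema_fields=['id']) as reader:
    batched_ids = [batch.id for batch in reader]
    all_ids = list(itertools.chain(*batched_ids))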
Example #5
def split_piece(piece):
    # `dataset` is captured from the enclosing scope. Use the metadata of the piece itself
    # (not dataset.pieces[0]) so that files with different numbers of row groups are split correctly.
    metadata = compat_get_metadata(piece, dataset.fs.open)
    return [compat_make_parquet_piece(piece.path, dataset.fs.open,
                                      row_group=row_group,
                                      partition_keys=piece.partition_keys)
            for row_group in range(metadata.num_row_groups)]
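A sketch of how the nested helper above might be fanned out over all dataset pieces; the thread pool and the flattening step are assumptions, not part of the original:

import itertools
from concurrent.futures import ThreadPoolExecutor

# Produce one row-group-level piece per row group across the whole dataset.
with ThreadPoolExecutor(max_workers=10) as pool:
    rowgroup_pieces = list(itertools.chain.from_iterable(
        pool.map(split_piece, dataset.pieces)))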