Code example #1
    def test_filter_schema_fields_from_url(self):
        TestSchema = Unischema('TestSchema', [
            UnischemaField('int32', np.int32, (), None, False),
            UnischemaField('uint8', np.uint8, (), None, False),
            UnischemaField('uint16', np.uint16, (), None, False),
        ])

        assert match_unischema_fields(TestSchema,
                                      ['.*nt.*6']) == [TestSchema.uint16]
        assert match_unischema_fields(TestSchema, ['nomatch']) == []
        assert match_unischema_fields(TestSchema, ['.*']) == list(
            TestSchema.fields.values())
        assert match_unischema_fields(TestSchema, ['int32', 'uint8']) == [
            TestSchema.int32, TestSchema.uint8
        ]
Code example #2
    def convert_fields(self, unischema, field_list):
        """Convert all the fields in field_list into Unischema fields.
        field_list can contain unischema fields and strings (regular expressions)

        :param unischema: Unischema object
        :param field_list: A list of unischema fields or strings (regular expressions)
        :return: list of unischema fields
        """
        # Split the field_list parameter into regex pattern strings and UnischemaField objects
        regex_patterns = [f for f in field_list if isinstance(f, string_types)]
        # We cannot check the type against UnischemaField because of an artifact introduced by
        # pickling: unpickled UnischemaField instances are of type collections.UnischemaField,
        # while non-pickled ones are of type petastorm.unischema.UnischemaField.
        # Since UnischemaField is a tuple, we check against tuple, which is invariant to pickling.
        unischema_field_objects = [
            f for f in field_list if isinstance(f, tuple)
        ]

        if len(unischema_field_objects) + len(regex_patterns) != len(
                field_list):
            raise ValueError(
                '"Elements of fields"/"timestamp field" must be either a string (regular expressions) or'
                ' an instance of UnischemaField class.')

        converted_fields = unischema_field_objects + match_unischema_fields(
            unischema, regex_patterns)

        return converted_fields
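
A minimal usage sketch of the same split-and-match logic, written as a standalone snippet (no ``self``); the schema and field names are illustrative placeholders, and only ``Unischema``, ``UnischemaField`` and ``match_unischema_fields`` are assumed to be real petastorm APIs:

import numpy as np
from petastorm.unischema import Unischema, UnischemaField, match_unischema_fields

# Illustrative schema; field names are placeholders.
DemoSchema = Unischema('DemoSchema', [
    UnischemaField('id', np.int64, (), None, False),
    UnischemaField('value', np.float32, (), None, False),
])

# A mixed field list: one concrete UnischemaField plus one regex pattern string.
mixed = [DemoSchema.id, 'val.*']
regex_patterns = [f for f in mixed if isinstance(f, str)]    # stands in for the six.string_types check
field_objects = [f for f in mixed if isinstance(f, tuple)]   # UnischemaField is a (named)tuple
converted = field_objects + match_unischema_fields(DemoSchema, regex_patterns)
print([f.name for f in converted])  # expected: ['id', 'value']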
Code example #3
def test_match_unischema_fields_legacy_warning():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int32', np.int32, (), None, False),
        UnischemaField('uint8', np.uint8, (), None, False),
        UnischemaField('uint16', np.uint16, (), None, False),
    ])

    # Check that no warnings are shown if the legacy and the new way of filtering produce the same results.
    with pytest.warns(None) as unexpected_warnings:
        match_unischema_fields(TestSchema, ['uint8'])
    assert not unexpected_warnings

    # uint8 and uint16 would have been matched using the old method, but not the new one
    with pytest.warns(
            UserWarning,
            match=r'schema_fields behavior has changed.*uint16, uint8'):
        assert match_unischema_fields(TestSchema, ['uint']) == []
Code example #4
def copy_dataset(spark, source_url, target_url, field_regex, not_null_fields,
                 overwrite_output, partitions_count, row_group_size_mb):
    """Creates a copy of a dataset. A new dataset will optionally contain a subset of columns. Rows that have NULL
    values in fields defined by ``not_null_fields`` argument are filtered out.


    :param spark: An instance of ``SparkSession`` object
    :param source_url: A url of the dataset to be copied.
    :param target_url: A url specifying location of the target dataset.
    :param field_regex: A list of regex patterns. Only columns that match one of these patterns are copied to the new
      dataset.
    :param not_null_fields: A list of fields that must have non-NULL values in the target dataset.
    :param overwrite_output: If ``False`` and there is an existing path defined by ``target_url``, the operation will
      fail.
    :param partitions_count: If not ``None``, the dataset is repartitioned before write. Number of files in the target
      Parquet store is defined by this parameter.
    :param row_group_size_mb: The size of the rowgroup in the target dataset. Specified in megabytes.
    :return: None
    """
    schema = get_schema_from_dataset_url(source_url)

    fields = match_unischema_fields(schema, field_regex)

    if field_regex and not fields:
        field_names = list(schema.fields.keys())
        # Format the message explicitly: unlike logging calls, ValueError does not interpolate extra arguments.
        raise ValueError(
            'Regular expressions (%s) do not match any fields (%s)' %
            (str(field_regex), str(field_names)))

    if fields:
        subschema = schema.create_schema_view(fields)
    else:
        subschema = schema

    with materialize_dataset(spark, target_url, subschema, row_group_size_mb):
        data_frame = spark.read \
            .parquet(source_url)

        if fields:
            data_frame = data_frame.select(*[f.name for f in fields])

        if not_null_fields:
            not_null_condition = reduce(operator.__and__,
                                        (data_frame[f].isNotNull()
                                         for f in not_null_fields))
            data_frame = data_frame.filter(not_null_condition)

        if partitions_count:
            data_frame = data_frame.repartition(partitions_count)

        data_frame.write \
            .mode('overwrite' if overwrite_output else 'error') \
            .option('compression', 'none') \
            .parquet(target_url)
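
A hypothetical invocation of ``copy_dataset`` for reference; the URLs, column regexes and Spark settings below are placeholders, and the sketch assumes the function above is importable in the current module:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('copy-dataset-example').getOrCreate()
copy_dataset(
    spark,
    source_url='file:///tmp/source_dataset',      # placeholder source petastorm dataset
    target_url='file:///tmp/copied_dataset',      # placeholder output location
    field_regex=['id', 'image_.*'],               # copy only columns matching these patterns
    not_null_fields=['id'],                       # drop rows where 'id' is NULL
    overwrite_output=True,
    partitions_count=10,
    row_group_size_mb=256)
spark.stop()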
Code example #5
def reader_v2_throughput(dataset_url, field_regex=None, warmup_cycles_count=300, measure_cycles_count=1000,
                         pool_type=WorkerPoolType.THREAD, loaders_count=3, decoders_count=3,
                         read_method=ReadMethod.PYTHON, shuffling_queue_size=500, min_after_dequeue=400,
                         reader_extra_args=None, pyarrow_serialize=False, spawn_new_process=True):
    """Constructs a ReaderV2 instance and uses it to performs throughput measurements.

    The function will spawn a new process if ``spawn_new_process`` is set. This is needed to make memory footprint
    measurements accurate.

    :param dataset_url: A url of the dataset to be used for measurements.
    :param field_regex:  A list of regular expressions. Only fields that match one of the regex patterns will be used
      during the benchmark.
    :param warmup_cycles_count: Number of warmup cycles. During warmup cycles, no measurements are recorded.
    :param measure_cycles_count: Number of measurement cycles. Only time elapsed during measurement cycles is used
      in throughput calculations.
    :param pool_type: :class:`WorkerPoolType` enum value.
    :param loaders_count: Number of IO threads.
    :param decoders_count: Number of threads or processes used for decoding. ``pool_type`` parameter defines
      whether multiple processes or threads are used for parallel decoding.
    :param read_method:  An enum :class:`ReadMethod` that defines whether a :class:`petastorm.reader.Reader` will be
      used.
    :param shuffling_queue_size: Maximum number of elements in the shuffling queue.
    :param min_after_dequeue: Minimum number of elements in a shuffling queue before entries can be read from it.
    :param reader_extra_args: Extra arguments that would be passed to Reader constructor.
    :param pyarrow_serialize: When True, pyarrow.serialize library will be used for serializing decoded payloads.
    :param spawn_new_process: This function will respawn itself in a new process if the argument is True. Spawning
      a new process is needed to get an accurate memory footprint.

    :return: An instance of ``BenchmarkResult`` namedtuple with the results of the benchmark. The namedtuple has
      the following fields: `time_mean`, `samples_per_second`, `memory_info` and `cpu`
    """
    if not reader_extra_args:
        reader_extra_args = dict()

    if spawn_new_process:
        args = copy.deepcopy(locals())
        args['spawn_new_process'] = False
        executor = ProcessPoolExecutor(1)
        future = executor.submit(reader_v2_throughput, **args)
        return future.result()

    logger.info('Arguments: %s', locals())

    if 'schema_fields' not in reader_extra_args:
        unischema_fields = match_unischema_fields(get_schema_from_dataset_url(dataset_url), field_regex)
        reader_extra_args['schema_fields'] = unischema_fields

    logger.info('Fields used in the benchmark: %s', str(reader_extra_args['schema_fields']))

    decoder_pool_executor = _create_concurrent_executor(pool_type, decoders_count)

    with ReaderV2(dataset_url, num_epochs=None,
                  loader_pool=ThreadPoolExecutor(loaders_count),
                  decoder_pool=decoder_pool_executor,
                  shuffling_queue=RandomShufflingBuffer(shuffling_queue_size, min_after_dequeue),
                  **reader_extra_args) as reader:

        if read_method == ReadMethod.PYTHON:
            result = _time_warmup_and_work(reader, warmup_cycles_count, measure_cycles_count)
        elif read_method == ReadMethod.TF:
            result = _time_warmup_and_work_tf(reader, warmup_cycles_count, measure_cycles_count, 0, 0)
        else:
            raise RuntimeError('Unexpected read_method value: %s' % str(read_method))

    return result
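
A hypothetical call sketch for the benchmark above; the dataset URL and regex list are placeholders, and ``WorkerPoolType``/``ReadMethod`` are assumed to come from the same benchmark module that defines ``reader_v2_throughput``:

result = reader_v2_throughput(
    'file:///tmp/some_petastorm_dataset',       # placeholder dataset URL
    field_regex=['id', 'image_.*'],             # benchmark only the matching fields
    warmup_cycles_count=100,
    measure_cycles_count=500,
    pool_type=WorkerPoolType.THREAD,
    read_method=ReadMethod.PYTHON)
# BenchmarkResult fields per the docstring: time_mean, samples_per_second, memory_info, cpu
print('%.1f samples/sec (mean cycle time %.4f s)' % (result.samples_per_second, result.time_mean))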
Code example #6
def copy_dataset(spark,
                 source_url,
                 target_url,
                 field_regex,
                 not_null_fields,
                 overwrite_output,
                 partitions_count,
                 row_group_size_mb,
                 hdfs_driver='libhdfs3'):
    """
    Creates a copy of a dataset. The new dataset will optionally contain a subset of columns. Rows that have NULL
    values in the fields defined by the ``not_null_fields`` argument are filtered out.


    :param spark: An instance of ``SparkSession`` object
    :param source_url: A url of the dataset to be copied.
    :param target_url: A url specifying location of the target dataset.
    :param field_regex: A list of regex patterns. Only columns that match one of these patterns are copied to the new
      dataset.
    :param not_null_fields: A list of fields that must have non-NULL values in the target dataset.
    :param overwrite_output: If ``False`` and there is an existing path defined by ``target_url``, the operation will
      fail.
    :param partitions_count: If not ``None``, the dataset is repartitioned before write. Number of files in the target
      Parquet store is defined by this parameter.
    :param row_group_size_mb: The size of the rowgroup in the target dataset. Specified in megabytes.
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :return: None
    """
    schema = get_schema_from_dataset_url(source_url, hdfs_driver=hdfs_driver)

    fields = match_unischema_fields(schema, field_regex)

    if field_regex and not fields:
        field_names = list(schema.fields.keys())
        # Format the message explicitly: unlike logging calls, ValueError does not interpolate extra arguments.
        raise ValueError(
            'Regular expressions (%s) do not match any fields (%s)' %
            (str(field_regex), str(field_names)))

    if fields:
        subschema = schema.create_schema_view(fields)
    else:
        subschema = schema

    resolver = FilesystemResolver(
        target_url,
        spark.sparkContext._jsc.hadoopConfiguration(),
        hdfs_driver=hdfs_driver,
        user=spark.sparkContext.sparkUser())
    with materialize_dataset(spark,
                             target_url,
                             subschema,
                             row_group_size_mb,
                             filesystem_factory=resolver.filesystem_factory()):
        data_frame = spark.read \
            .parquet(source_url)

        if fields:
            data_frame = data_frame.select(*[f.name for f in fields])

        if not_null_fields:
            not_null_condition = reduce(operator.__and__,
                                        (data_frame[f].isNotNull()
                                         for f in not_null_fields))
            data_frame = data_frame.filter(not_null_condition)

        if partitions_count:
            data_frame = data_frame.repartition(partitions_count)

        data_frame.write \
            .mode('overwrite' if overwrite_output else 'error') \
            .option('compression', 'none') \
            .parquet(target_url)