def test_filter_schema_fields_from_url(self):
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int32', np.int32, (), None, False),
        UnischemaField('uint8', np.uint8, (), None, False),
        UnischemaField('uint16', np.uint16, (), None, False),
    ])

    assert match_unischema_fields(TestSchema, ['.*nt.*6']) == [TestSchema.uint16]
    assert match_unischema_fields(TestSchema, ['nomatch']) == []
    assert match_unischema_fields(TestSchema, ['.*']) == list(TestSchema.fields.values())
    assert match_unischema_fields(TestSchema, ['int32', 'uint8']) == [TestSchema.int32, TestSchema.uint8]
def convert_fields(self, unischema, field_list):
    """Converts all elements of ``field_list`` into Unischema fields.

    ``field_list`` may contain both UnischemaField objects and strings (regular expressions).

    :param unischema: Unischema object
    :param field_list: A list of UnischemaField objects or strings (regular expressions)
    :return: A list of UnischemaField objects
    """
    # Split the fields parameter into regex pattern strings and UnischemaField objects
    regex_patterns = [f for f in field_list if isinstance(f, string_types)]
    # We can not check the type against UnischemaField because of an artifact introduced by pickling:
    # depickled UnischemaField objects are of type collections.UnischemaField, while before depickling
    # they are of type petastorm.unischema.UnischemaField. Since UnischemaField is a tuple, we check
    # against tuple, which is invariant to pickling.
    unischema_field_objects = [f for f in field_list if isinstance(f, tuple)]

    if len(unischema_field_objects) + len(regex_patterns) != len(field_list):
        raise ValueError('Elements of "fields"/"timestamp field" must be either a string (a regular expression)'
                         ' or an instance of the UnischemaField class.')

    converted_fields = unischema_field_objects + match_unischema_fields(unischema, regex_patterns)
    return converted_fields
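# A minimal usage sketch for convert_fields, assuming a small schema like the one used in the tests
# above. `loader` is a hypothetical object that defines convert_fields (it is a method, hence `self`);
# the schema and the mixed field list below are illustrative, not part of the library.
import numpy as np
from petastorm.unischema import Unischema, UnischemaField

ExampleSchema = Unischema('ExampleSchema', [
    UnischemaField('int32', np.int32, (), None, False),
    UnischemaField('uint8', np.uint8, (), None, False),
    UnischemaField('uint16', np.uint16, (), None, False),
])

# Explicit UnischemaField objects pass through unchanged; strings are treated as regular expressions
# and resolved against the schema, so under these assumptions the result would be
# [ExampleSchema.int32, ExampleSchema.uint16]:
#
#   converted = loader.convert_fields(ExampleSchema, [ExampleSchema.int32, '.*nt.*6'])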
def test_match_unischema_fields_legacy_warning():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int32', np.int32, (), None, False),
        UnischemaField('uint8', np.uint8, (), None, False),
        UnischemaField('uint16', np.uint16, (), None, False),
    ])

    # Check that no warnings are shown if the legacy and the new way of filtering produce the same results.
    with pytest.warns(None) as unexpected_warnings:
        match_unischema_fields(TestSchema, ['uint8'])
    assert not unexpected_warnings

    # uint8 and uint16 would have been matched using the old method, but not the new one
    with pytest.warns(UserWarning, match=r'schema_fields behavior has changed.*uint16, uint8'):
        assert match_unischema_fields(TestSchema, ['uint']) == []
def copy_dataset(spark, source_url, target_url, field_regex, not_null_fields, overwrite_output, partitions_count,
                 row_group_size_mb):
    """Creates a copy of a dataset. The new dataset will optionally contain a subset of the columns.

    Rows that have NULL values in the fields listed in the ``not_null_fields`` argument are filtered out.

    :param spark: An instance of a ``SparkSession`` object
    :param source_url: A url of the dataset to be copied.
    :param target_url: A url specifying the location of the target dataset.
    :param field_regex: A list of regex patterns. Only columns that match one of these patterns are copied to the
      new dataset.
    :param not_null_fields: A list of fields that must have non-NULL values in the target dataset.
    :param overwrite_output: If ``False`` and there is an existing path defined by ``target_url``, the operation
      will fail.
    :param partitions_count: If not ``None``, the dataset is repartitioned before the write. The number of files in
      the target Parquet store is defined by this parameter.
    :param row_group_size_mb: The size of the rowgroup in the target dataset. Specified in megabytes.
    :return: None
    """
    schema = get_schema_from_dataset_url(source_url)

    fields = match_unischema_fields(schema, field_regex)

    if field_regex and not fields:
        field_names = list(schema.fields.keys())
        raise ValueError('Regular expressions (%s) do not match any fields (%s)'
                         % (str(field_regex), str(field_names)))

    if fields:
        subschema = schema.create_schema_view(fields)
    else:
        subschema = schema

    with materialize_dataset(spark, target_url, subschema, row_group_size_mb):
        data_frame = spark.read \
            .parquet(source_url)

        if fields:
            data_frame = data_frame.select(*[f.name for f in fields])

        if not_null_fields:
            not_null_condition = reduce(operator.__and__, (data_frame[f].isNotNull() for f in not_null_fields))
            data_frame = data_frame.filter(not_null_condition)

        if partitions_count:
            data_frame = data_frame.repartition(partitions_count)

        data_frame.write \
            .mode('overwrite' if overwrite_output else 'error') \
            .option('compression', 'none') \
            .parquet(target_url)
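# A hedged usage sketch for copy_dataset: copy only the columns matching 'id' or 'image.*' and drop rows
# where 'id' is NULL. The dataset URLs, column names and SparkSession setup below are placeholders for
# illustration, not values taken from the library.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('copy-dataset-example').getOrCreate()
copy_dataset(spark,
             source_url='hdfs:///path/to/source_dataset',
             target_url='hdfs:///path/to/target_dataset',
             field_regex=['id', 'image.*'],
             not_null_fields=['id'],
             overwrite_output=True,
             partitions_count=10,
             row_group_size_mb=256)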
def reader_v2_throughput(dataset_url, field_regex=None, warmup_cycles_count=300, measure_cycles_count=1000,
                         pool_type=WorkerPoolType.THREAD, loaders_count=3, decoders_count=3,
                         read_method=ReadMethod.PYTHON, shuffling_queue_size=500, min_after_dequeue=400,
                         reader_extra_args=None, pyarrow_serialize=False, spawn_new_process=True):
    """Constructs a ReaderV2 instance and uses it to perform throughput measurements.

    The function will spawn a new process if ``spawn_new_process`` is set. This is needed to make memory footprint
    measurements accurate.

    :param dataset_url: A url of the dataset to be used for the measurements.
    :param field_regex: A list of regular expressions. Only fields that match one of the regex patterns will be used
      during the benchmark.
    :param warmup_cycles_count: Number of warmup cycles. During warmup cycles no measurements are recorded.
    :param measure_cycles_count: Number of measurement cycles. Only time elapsed during measurement cycles is used
      in the throughput calculations.
    :param pool_type: A :class:`WorkerPoolType` enum value.
    :param loaders_count: Number of IO threads.
    :param decoders_count: Number of threads or processes used for decoding. The ``pool_type`` parameter defines
      whether multiple processes or threads are used for parallel decoding.
    :param read_method: A :class:`ReadMethod` enum value that defines whether a :class:`petastorm.reader.Reader`
      will be used.
    :param shuffling_queue_size: Maximum number of elements in the shuffling queue.
    :param min_after_dequeue: Minimum number of elements in the shuffling queue before entries can be read from it.
    :param reader_extra_args: Extra arguments that will be passed to the Reader constructor.
    :param pyarrow_serialize: When ``True``, the pyarrow.serialize library will be used for serializing decoded
      payloads.
    :param spawn_new_process: If ``True``, this function will respawn itself in a new process. Spawning a new
      process is needed to get an accurate memory footprint.
    :return: An instance of the ``BenchmarkResult`` namedtuple with the results of the benchmark. The namedtuple has
      the following fields: ``time_mean``, ``samples_per_second``, ``memory_info`` and ``cpu``.
    """
    if not reader_extra_args:
        reader_extra_args = dict()

    if spawn_new_process:
        args = copy.deepcopy(locals())
        args['spawn_new_process'] = False
        executor = ProcessPoolExecutor(1)
        future = executor.submit(reader_v2_throughput, **args)
        return future.result()

    logger.info('Arguments: %s', locals())

    if 'schema_fields' not in reader_extra_args:
        unischema_fields = match_unischema_fields(get_schema_from_dataset_url(dataset_url), field_regex)
        reader_extra_args['schema_fields'] = unischema_fields

    logger.info('Fields used in the benchmark: %s', str(reader_extra_args['schema_fields']))

    decoder_pool_executor = _create_concurrent_executor(pool_type, decoders_count)

    with ReaderV2(dataset_url, num_epochs=None,
                  loader_pool=ThreadPoolExecutor(loaders_count),
                  decoder_pool=decoder_pool_executor,
                  shuffling_queue=RandomShufflingBuffer(shuffling_queue_size, min_after_dequeue),
                  **reader_extra_args) as reader:
        if read_method == ReadMethod.PYTHON:
            result = _time_warmup_and_work(reader, warmup_cycles_count, measure_cycles_count)
        elif read_method == ReadMethod.TF:
            result = _time_warmup_and_work_tf(reader, warmup_cycles_count, measure_cycles_count, 0, 0)
        else:
            raise RuntimeError('Unexpected read_method value: %s' % str(read_method))

    return result
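# An illustrative call to reader_v2_throughput, restricting the benchmark to fields whose names start
# with 'image'. The dataset URL and the regex are placeholders; the remaining arguments simply override
# a few of the defaults documented above.
result = reader_v2_throughput('hdfs:///path/to/benchmark_dataset',
                              field_regex=['image.*'],
                              warmup_cycles_count=100,
                              measure_cycles_count=500,
                              pool_type=WorkerPoolType.THREAD,
                              read_method=ReadMethod.PYTHON)
print('Samples per second: %f' % result.samples_per_second)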
def copy_dataset(spark, source_url, target_url, field_regex, not_null_fields, overwrite_output, partitions_count,
                 row_group_size_mb, hdfs_driver='libhdfs3'):
    """Creates a copy of a dataset. The new dataset will optionally contain a subset of the columns.

    Rows that have NULL values in the fields listed in the ``not_null_fields`` argument are filtered out.

    :param spark: An instance of a ``SparkSession`` object
    :param source_url: A url of the dataset to be copied.
    :param target_url: A url specifying the location of the target dataset.
    :param field_regex: A list of regex patterns. Only columns that match one of these patterns are copied to the
      new dataset.
    :param not_null_fields: A list of fields that must have non-NULL values in the target dataset.
    :param overwrite_output: If ``False`` and there is an existing path defined by ``target_url``, the operation
      will fail.
    :param partitions_count: If not ``None``, the dataset is repartitioned before the write. The number of files in
      the target Parquet store is defined by this parameter.
    :param row_group_size_mb: The size of the rowgroup in the target dataset. Specified in megabytes.
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
      libhdfs (java through JNI) or libhdfs3 (C++).
    :return: None
    """
    schema = get_schema_from_dataset_url(source_url, hdfs_driver=hdfs_driver)

    fields = match_unischema_fields(schema, field_regex)

    if field_regex and not fields:
        field_names = list(schema.fields.keys())
        raise ValueError('Regular expressions (%s) do not match any fields (%s)'
                         % (str(field_regex), str(field_names)))

    if fields:
        subschema = schema.create_schema_view(fields)
    else:
        subschema = schema

    resolver = FilesystemResolver(target_url, spark.sparkContext._jsc.hadoopConfiguration(),
                                  hdfs_driver=hdfs_driver, user=spark.sparkContext.sparkUser())
    with materialize_dataset(spark, target_url, subschema, row_group_size_mb,
                             filesystem_factory=resolver.filesystem_factory()):
        data_frame = spark.read \
            .parquet(source_url)

        if fields:
            data_frame = data_frame.select(*[f.name for f in fields])

        if not_null_fields:
            not_null_condition = reduce(operator.__and__, (data_frame[f].isNotNull() for f in not_null_fields))
            data_frame = data_frame.filter(not_null_condition)

        if partitions_count:
            data_frame = data_frame.repartition(partitions_count)

        data_frame.write \
            .mode('overwrite' if overwrite_output else 'error') \
            .option('compression', 'none') \
            .parquet(target_url)
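# The same illustrative copy as in the earlier sketch, but pointing at an HDFS-backed dataset and
# selecting the libhdfs (JNI) driver instead of the default libhdfs3. All URLs and column names are
# placeholders.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('copy-dataset-hdfs-example').getOrCreate()
copy_dataset(spark,
             source_url='hdfs://namenode:8020/datasets/source',
             target_url='hdfs://namenode:8020/datasets/target',
             field_regex=['id', 'image.*'],
             not_null_fields=['id'],
             overwrite_output=True,
             partitions_count=None,
             row_group_size_mb=256,
             hdfs_driver='libhdfs')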