def compute_correlation_distribution(dataset_url, id_column, shuffle_options, num_corr_samples=100):
    """
    Compute the correlation distribution of a given shuffle_options on an existing dataset.
    Use this to compare two different shuffling options.
    It is encouraged to use a dataset generated by generate_shuffle_analysis_dataset for this analysis.

    :param dataset_url: Dataset url to compute correlation distribution of
    :param id_column: Column where an integer or string id can be found
    :param shuffle_options: shuffle options to test correlation against
    :param num_corr_samples: How many samples of the correlation to take to compute distribution
    :return: (mean, standard deviation) of computed distribution
    """
    # Read the dataset without any shuffling in order (need to use a dummy pool for this).
    with Reader(dataset_url, shuffle_options=ShuffleOptions(False), reader_pool=DummyPool()) as reader:
        unshuffled = [row[id_column] for row in reader]

    correlations = []
    for _ in range(num_corr_samples):
        with Reader(dataset_url, shuffle_options=shuffle_options) as reader:
            shuffled = [row[id_column] for row in reader]
            correlations.append(abs(np.corrcoef(unshuffled, shuffled)[0, 1]))

    mean = np.mean(correlations)
    std_dev = np.std(correlations)
    return mean, std_dev
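# A minimal usage sketch, not part of the library: it assumes a dataset produced with
# generate_shuffle_analysis_dataset and compares two hypothetical ShuffleOptions settings by the
# correlation of their output order with the unshuffled order. A lower mean absolute correlation
# indicates better shuffling. The function name and the chosen options are illustrative assumptions.
def compare_shuffle_options_example(dataset_url, id_column='id'):
    options_to_compare = {
        'row_groups_only': ShuffleOptions(True),
        'row_groups_with_drop_2': ShuffleOptions(True, 2),
    }
    results = {}
    for name, options in options_to_compare.items():
        mean, std_dev = compute_correlation_distribution(dataset_url, id_column, options,
                                                         num_corr_samples=20)
        results[name] = (mean, std_dev)
        print('%s: mean abs correlation %.4f (std %.4f)' % (name, mean, std_dev))
    return results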
def test_predicate_on_multiple_fields(synthetic_dataset, reader_factory):
    expected_values = {'id': 11, 'id2': 1}
    with reader_factory(synthetic_dataset.url, shuffle_options=ShuffleOptions(False),
                        predicate=EqualPredicate(expected_values)) as reader:
        actual = next(reader)
        assert actual.id == expected_values['id']
        assert actual.id2 == expected_values['id2']
def test_stable_pieces_order(synthetic_dataset, reader_factory):
    """Tests that the reader returns rows in the same order across repeated runs when shuffling is disabled"""
    RERUN_THE_TEST_COUNT = 20
    baseline_run = None
    for _ in range(RERUN_THE_TEST_COUNT):
        with reader_factory(synthetic_dataset.url, schema_fields=[TestSchema.id],
                            shuffle_options=ShuffleOptions(False)) as reader:
            this_run = [row.id for row in reader]
        if baseline_run:
            assert this_run == baseline_run
        baseline_run = this_run
def test_invalid_schema_field(synthetic_dataset, reader_factory):
    # Let's assume we are selecting columns using a schema which is different from the one
    # stored in the dataset. Would expect to get a reasonable error message
    BogusSchema = Unischema('BogusSchema', [
        UnischemaField('partition_key', np.string_, (), ScalarCodec(StringType()), False),
        UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False),
        UnischemaField('bogus_key', np.int32, (), ScalarCodec(ShortType()), False)])

    expected_values = {'bogus_key': 11, 'id': 1}
    with pytest.raises(ValueError) as e:
        reader_factory(synthetic_dataset.url, schema_fields=BogusSchema.fields.values(),
                       shuffle_options=ShuffleOptions(False),
                       predicate=EqualPredicate(expected_values))

    assert 'bogus_key' in str(e)
def test_predicate_with_invalid_fields(synthetic_dataset, reader_factory):
    """Try passing an invalid field name from a predicate to the reader. An error should be raised."""
    TEST_CASES = [
        {'invalid_field_name': 1},
        dict(),
        {'invalid_field_name': 1, 'id': 11},
        {'invalid_field_name': 1, 'invalid_field_name_2': 11}]

    for predicate_spec in TEST_CASES:
        with reader_factory(synthetic_dataset.url, shuffle_options=ShuffleOptions(False),
                            predicate=EqualPredicate(predicate_spec)) as reader:
            with pytest.raises(ValueError):
                next(reader)
def readout_all_ids(shuffle, drop_ratio):
    with reader_factory(synthetic_dataset.url,
                        shuffle_options=ShuffleOptions(shuffle, drop_ratio)) as reader:
        ids = [row.id for row in reader]
    return ids
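# A minimal illustration, not part of the original test suite, assuming it is defined in the same
# enclosing test scope as readout_all_ids (which closes over reader_factory and synthetic_dataset):
# reading twice with shuffling disabled and a drop ratio of 1 (no row dropping) should yield
# identical id sequences.
def example_unshuffled_reads_are_deterministic():
    first_readout = readout_all_ids(False, 1)
    second_readout = readout_all_ids(False, 1)
    assert first_readout == second_readout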
def __init__(self, dataset_url, schema_fields=None, shuffle=None, predicate=None, rowgroup_selector=None,
             num_epochs=1, sequence=None, training_partition=None, num_training_partitions=None,
             read_timeout_s=None, cache=None, loader_pool=None, decoder_pool=None, shuffling_queue=None,
             shuffle_options=None, pyarrow_filesystem=None):
    """Initializes a reader object.

    :param dataset_url: a filepath or a url to a parquet directory,
        e.g. 'hdfs://some_hdfs_cluster/user/yevgeni/parquet8', or '/tmp/mydataset'
    :param schema_fields: Either a list of unischema fields to subset, or None to read all fields.
        Alternatively, an NGram object, in which case the reader will return an NGram of the specified
        properties.
    :param predicate: instance of predicate object to filter rows to be returned by reader.
    :param rowgroup_selector: instance of row group selector object to select row groups to be read
    :param num_epochs: An epoch is a single pass over all samples in the dataset. Setting num_epochs to
        'None' will result in an infinite number of epochs.
    :param sequence: This is deprecated. To use sequence/ngram, please supply the argument in
        schema_fields instead.
    :param training_partition: An int denoting the partition number used for multi node training. Each node
        should pass in a unique partition number in the range [0, num_training_partitions).
        num_training_partitions must be supplied as well.
    :param num_training_partitions: An int denoting the number of training partitions (how many nodes are
        performing the multi node training)
    :param read_timeout_s: A numeric with the amount of time in seconds you would like to give a read
        before it times out and raises an EmptyResultError. Pass in None for an infinite timeout
    :param cache: An object conforming to `cache.CacheBase` interface. Before loading row groups from a
        parquet file the Reader will attempt to load these values from cache. Caching is useful when
        communication to the main data store is either slow or expensive and the local machine has large
        enough storage to store entire dataset (or a partition of a dataset if num_training_partitions is
        used). By default, the `NullCache` implementation is used.
    :param loader_pool: An instance of a concurrent.futures pool executor used for loading row groups.
        If None, a default ThreadPoolExecutor(5) will be used.
    :param decoder_pool: An instance of a concurrent.futures pool executor used for decoding.
        If None, a default ThreadPoolExecutor(5) will be used.
    :param shuffling_queue: An instance of a shuffling buffer used to shuffle decoded rows before they
        are placed on the output queue. If None, a no-op (non-shuffling) buffer is used.
    :param shuffle_options: ShuffleOptions object to describe how to shuffle dataset (supersedes the
        shuffle parameter). Defaults to shuffling row groups but not dropping rows based on partitions.
    :param shuffle: DEPRECATED boolean whether to shuffle the row group order.
        Use shuffle_row_groups in ShuffleOptions instead.
    :param pyarrow_filesystem: A pyarrow filesystem object to use when reading the dataset. If None,
        the filesystem is resolved from dataset_url.
    """
    # 1. Resolve dataset path (hdfs://, file://) and open the parquet storage (dataset)
    # 2. Get a list of all groups
    # 3. Filter rowgroups
    #    a. predicates
    #    b. row-group selector (our indexing mechanism)
    #    c. partition: used to get a subset of data for distributed training
    # 4. Launch a new thread running `worker_loop` function.
    if dataset_url is None or not isinstance(dataset_url, six.string_types):
        raise ValueError("""dataset_url must be a string""")

    if not (isinstance(schema_fields, collections.Iterable) or isinstance(schema_fields, NGram)
            or schema_fields is None):
        raise ValueError("""Fields must be either None, an iterable collection of Unischema fields or an NGram
        object.""")

    if sequence is not None:
        raise ValueError("""'sequence' argument of Reader object is deprecated. Please pass an NGram instance to
        'schema_fields' argument instead.""")

    self.ngram = schema_fields if isinstance(schema_fields, NGram) else None

    if self.ngram and not self.ngram.timestamp_overlap and shuffle_options.shuffle_row_drop_partitions > 1:
        raise NotImplementedError('Using timestamp_overlap=False is not implemented with'
                                  ' shuffle_options.shuffle_row_drop_partitions > 1')

    cache = cache or NullCache()

    dataset_url = dataset_url[:-1] if dataset_url[-1] == '/' else dataset_url

    if shuffle_options is None:
        if shuffle is None:
            shuffle = True
        else:
            logger.warning('shuffle option is deprecated. Please use shuffle_options instead')
        shuffle_options = ShuffleOptions(shuffle)

    # 1. Resolve dataset path (hdfs://, file://) and open the parquet storage (dataset)
    logger.debug('dataset_url: %s', dataset_url)

    if pyarrow_filesystem is not None:
        filesystem = pyarrow_filesystem
        dataset_path = urlparse(dataset_url).path
    else:
        resolver = FilesystemResolver(dataset_url)
        filesystem = resolver.filesystem()
        dataset_path = resolver.parsed_dataset_url().path

    self._dataset = pq.ParquetDataset(dataset_path, filesystem=filesystem, validate_schema=False)

    self._normalize_shuffle_options(shuffle_options, self._dataset)

    # Get a unischema stored in the dataset metadata.
    stored_schema = dataset_metadata.get_schema(self._dataset)

    # Make a schema view (a view is a Unischema containing only a subset of fields).
    # Will raise an exception if invalid schema fields are in schema_fields
    fields = schema_fields if isinstance(schema_fields, collections.Iterable) else None
    self.schema = stored_schema.create_schema_view(fields) if fields else stored_schema

    # 2. Get a list of all groups
    row_groups = dataset_metadata.load_row_groups(self._dataset)

    # 3. Filter rowgroups
    filtered_row_groups, worker_predicate = self._filter_row_groups(self._dataset, row_groups, predicate,
                                                                    rowgroup_selector, training_partition,
                                                                    num_training_partitions)

    epoch_items = self._apply_row_drop_partition(filtered_row_groups, shuffle_options)

    # 4. Launch a new thread running `worker_loop` function.
    epochs_iterator = lambda: epoch_generator(epoch_items, num_epochs, shuffle_options.shuffle_row_groups)

    self._results_queue = Queue(_OUTPUT_QUEUE_SIZE)

    loader = RowGroupLoader(dataset_url, self.schema, self.ngram, cache, worker_predicate)
    decoder = RowDecoder(self.schema, self.ngram)
    self._loader_pool = loader_pool or ThreadPoolExecutor(5)
    self._decoder_pool = decoder_pool or ThreadPoolExecutor(5)
    self._stop_flow_manager_event = threading.Event()
    self._diags = Counter()

    if not shuffling_queue:
        shuffling_queue = NoopShufflingBuffer()

    self._flow_manager_thread = threading.Thread(target=worker_loop,
                                                 args=(epochs_iterator, self._loader_pool, loader,
                                                       self._decoder_pool, decoder,
                                                       shuffling_queue, self._results_queue,
                                                       self._stop_flow_manager_event, self._diags))
    self._flow_manager_thread.daemon = True
    self._flow_manager_thread.start()

    self._read_timeout_s = read_timeout_s
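# A minimal usage sketch, not taken from the library docs, assuming this constructor belongs to a class
# exposed as Reader that supports the context-manager and iterator protocols (as the tests above use it).
# The dataset path and the ShuffleOptions arguments are illustrative assumptions: it reads one epoch of
# decoded rows with row-group shuffling enabled.
def example_read_dataset(dataset_url='file:///tmp/mydataset'):
    with Reader(dataset_url,
                shuffle_options=ShuffleOptions(True),
                num_epochs=1) as reader:
        for row in reader:
            # Each row exposes the dataset's Unischema fields as attributes (e.g. row.id).
            print(row)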