Code Example #1
File: dataset.py Project: zuston/tensornet
import os

# Imports for the TensorFlow internals used below (TF 1.x module layout).
from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import gen_io_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import string_ops


def list_files(datapath, days, match_pattern):
    """Builds a dataset of filenames matching `match_pattern` under each
    `datapath/day` directory, asserting that at least one file matches."""
    with ops.name_scope("list_files"):
        file_pattern = []
        for day in days:
            file_pattern.append(os.path.join(datapath, day, match_pattern))

        file_pattern = ops.convert_to_tensor(
            file_pattern, dtype=dtypes.string, name="file_pattern")
        matching_files = gen_io_ops.matching_files(file_pattern)

        # Raise an exception if `file_pattern` does not match any files.
        condition = math_ops.greater(array_ops.shape(matching_files)[0], 0,
                                     name="match_not_empty")

        message = math_ops.add(
            "No files matched pattern: ",
            string_ops.reduce_join(file_pattern, separator=", "), name="message")

        assert_not_empty = control_flow_ops.Assert(
            condition, [message], summarize=1, name="assert_not_empty")
        with ops.control_dependencies([assert_not_empty]):
            matching_files = array_ops.identity(matching_files)

        dataset = dataset_ops.Dataset.from_tensor_slices(matching_files)

        return dataset
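For context, a hypothetical call site for this helper might look like the following; the data path, day strings, and match pattern are illustrative assumptions, not taken from the zuston/tensornet project:

# Hypothetical usage; paths and day strings are illustrative only.
days = ["20240101", "20240102", "20240103"]
dataset = list_files("/data/train", days, "part-*.tfrecord")
# Each element is one matched filename; the Assert op in the graph makes
# the pipeline fail fast if no file matches any per-day pattern.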
Code Example #2
    def _count_num_records(self):
        """
        Counts the number of non-empty lines (the data samples) from the data_files. This function
        is called from get_size the first time.
        :return int: the number of non-empty lines in the data_files
        """
        # TODO in TF 1.3 use: dataset = Dataset.list_files(self.data_files_pattern).repeat(1)
        from tensorflow.python.ops import gen_io_ops

        dataset = Dataset.from_tensor_slices(
            gen_io_ops.matching_files(self.data_files_pattern)).repeat(1)

        files = self._read_files_once(dataset)
        with tf.Graph().as_default():
            dataset = self.dataset_class(files).repeat(1)
            samples = 0
            try:
                next_element = dataset.make_one_shot_iterator().get_next()
                with tf.Session() as sess:
                    while True:
                        sess.run(next_element)
                        samples += 1
            except tf.errors.OutOfRangeError:
                # get_next() raises OutOfRangeError once the dataset is exhausted
                pass
        return samples
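The same count can be written against the public `tf.data` API instead of the internal `gen_io_ops` module. The sketch below is an assumption-laden rewrite, not code from the original project: it presumes TF 2.x eager execution and a hypothetical glob pattern, and it filters empty lines explicitly to match the docstring above.

import tensorflow as tf

def count_non_empty_lines(pattern):
    # Deterministic file order; shuffling is irrelevant for counting.
    files = tf.data.Dataset.list_files(pattern, shuffle=False)
    lines = tf.data.TextLineDataset(files)
    # Keep only non-empty lines, as _count_num_records documents.
    non_empty = lines.filter(
        lambda line: tf.strings.length(tf.strings.strip(line)) > 0)
    # A single pass over the data, adding 1 per surviving line.
    return int(non_empty.reduce(tf.constant(0, tf.int64), lambda c, _: c + 1))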
Code Example #3
# These TF-internal imports mirror the modules used below;
# `stateless_shuffle_dataset` is assumed to be defined elsewhere in the project.
from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import gen_io_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import string_ops


def stateless_list_files(file_pattern, shuffle=None, seed=None):
    """A dataset of all files matching one or more glob patterns.

  Note that, if `shuffle` is not None, it will use a stateless shuffle
  implementation. Then the returned dataset supports the TF1 compatibility API
  `tf.data.make_one_shot_iterator()` in TF2.

  Example:
    >>> dataset = tf.stateless_list_files("some_file_pattern")

  Args:
    file_pattern: A string, a list of strings, or a `tf.Tensor` of string type
      (scalar or vector), representing the filename glob (i.e. shell wildcard)
      pattern(s) that will be matched.
    shuffle: (Optional.) If `True`, the file names will be shuffled randomly
      based on a stateless implementation. Defaults to `True`.
    seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the random
      seed that will be used to create the distribution. See
      `tf.random.set_seed` for behavior.

  Returns:
   Dataset: A `Dataset` of strings corresponding to file names.
  """
    with ops.name_scope("list_files"):
        if shuffle is None:
            shuffle = True
        file_pattern = ops.convert_to_tensor(file_pattern,
                                             dtype=dtypes.string,
                                             name="file_pattern")
        matching_files = gen_io_ops.matching_files(file_pattern)

        # Raise an exception if `file_pattern` does not match any files.
        condition = math_ops.greater(array_ops.shape(matching_files)[0],
                                     0,
                                     name="match_not_empty")
        message = math_ops.add("No files matched pattern: ",
                               strings.reduce_join(file_pattern,
                                                   separator=", "),
                               name="message")

        assert_not_empty = control_flow_ops.Assert(condition, [message],
                                                   summarize=1,
                                                   name="assert_not_empty")
        with ops.control_dependencies([assert_not_empty]):
            matching_files = array_ops.identity(matching_files)

        dataset = dataset_ops.Dataset.from_tensor_slices(matching_files)
        if shuffle:
            buffer_size = math_ops.maximum(
                array_ops.shape(matching_files, out_type=dtypes.int64)[0], 1)
            # Use stateless shuffled dataset
            dataset = dataset.apply(
                stateless_shuffle_dataset(buffer_size, seed=seed))
        return dataset
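A hypothetical usage sketch (the glob pattern and seed are assumptions, not from the original source): fixing `seed` makes the stateless shuffle reproducible across runs, which is exactly what allows the returned dataset to back a TF1-style one-shot iterator in TF2.

# Illustrative only; the glob pattern and seed are assumptions.
dataset = stateless_list_files("/data/*.tfrecord", shuffle=True, seed=42)
# The file order is shuffled, but identical on every run with seed=42.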
Code Example #4
File: dataset_ops.py Project: azrael417/tensorflow
  def list_files(file_pattern):
    """A dataset of all files matching a pattern.

    Example:
      If we had the following files on our filesystem:
        - /path/to/dir/a.txt
        - /path/to/dir/b.py
        - /path/to/dir/c.py
      If we pass "/path/to/dir/*.py" as the directory, the dataset would
      produce:
        - /path/to/dir/b.py
        - /path/to/dir/c.py

    Args:
      file_pattern: A string or scalar string `tf.Tensor`, representing
        the filename pattern that will be matched.

    Returns:
      A `Dataset` of strings corresponding to file names.
    """
    return Dataset.from_tensor_slices(
        gen_io_ops.matching_files(file_pattern))
Code Example #5
File: dataset_ops.py Project: DjangoPeng/tensorflow
  def list_files(file_pattern):
    """A dataset of all files matching a pattern.

    Example:
      If we had the following files on our filesystem:
        - /path/to/dir/a.txt
        - /path/to/dir/b.py
        - /path/to/dir/c.py
      If we pass "/path/to/dir/*.py" as the directory, the dataset would
      produce:
        - /path/to/dir/b.py
        - /path/to/dir/c.py

    Args:
      file_pattern: A string or scalar string `tf.Tensor`, representing
        the filename pattern that will be matched.

    Returns:
      A `Dataset` of strings corresponding to file names.
    """
    return Dataset.from_tensor_slices(gen_io_ops.matching_files(file_pattern))
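As the TODO comments in the other examples note, this method ships as `Dataset.list_files` from TF 1.3 onward, so the `gen_io_ops` workaround is only needed on older versions. A minimal sketch of the two equivalent calls (the path is hypothetical):

# TF >= 1.3: the built-in method.
dataset = Dataset.list_files("/path/to/dir/*.py")
# Pre-1.3 equivalent, as used elsewhere on this page.
dataset = Dataset.from_tensor_slices(
    gen_io_ops.matching_files("/path/to/dir/*.py"))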
Code Example #6
    def read(self, batch_size, num_epochs=1, shuffle=False, task_spec=None):
        """
        Reads the data and return a tuple of (inputs,outputs)
        :param batch_size: the batch size of the returned inputs/outputs
        :param num_epochs: the number of epochs to read the dataset
        :param shuffle: whether to shuffle the data or not
        :param task_spec: the task spec of the training. It helps to determine
        whether this is distributed training or not
        :return: The result of calling dataset.make_one_shot_iterator().get_next()
        """
        # create the dataset of files with the data
        # TODO in TF 1.3 use:  dataset = Dataset.list_files(self.data_files_pattern)
        from tensorflow.python.ops import gen_io_ops
        dataset = Dataset.from_tensor_slices(
            gen_io_ops.matching_files(self.data_files_pattern))
        if shuffle:
            # read one sample per file
            # TODO in TF 1.3 use:
            # dataset = dataset.interleave(self.dataset_class,
            #                              # number of readers the same as number of CPUs
            #                              cycle_length=multiprocessing.cpu_count() + 1,
            #                              # block size is 1 to get directly a flat map
            #                              block_length=1)
            files = self._read_files_once(dataset)
            import random
            random.shuffle(files)
            dataset = self.dataset_class(files)
        else:
            # reads files sequentially
            files = self._read_files_once(dataset)
            dataset = self.dataset_class(files)
        # set the number of epochs
        dataset = dataset.repeat(num_epochs)

        if task_spec and task_spec.num_workers > 1:
            # split the dataset in shards
            # TODO in TF 1.4 use: dataset = dataset.shard(task_spec.num_workers, task_spec.index)
            from tensorflow.python.ops import math_ops

            def filter_fn(elem_index, _):
                mod_result = math_ops.mod(elem_index, task_spec.num_workers)
                return math_ops.equal(mod_result, task_spec.index)

            dataset = dataset.enumerate().filter(filter_fn).map(
                lambda _, elem: elem)

        if shuffle:
            # shuffle the samples
            if self.shuffle_size is None:
                raise ValueError('shuffle_size has not been set')
            dataset = dataset.shuffle(buffer_size=self.shuffle_size)

        # process each example. We check the method is defined in the child class:
        if self._flat_map.__func__ not in TFDataSet.__dict__.values():
            dataset = dataset.flat_map(self._flat_map)
        if self._map.__func__ not in TFDataSet.__dict__.values():
            dataset = dataset.map(
                self._map,
                # use as many threads as CPUs + 1
                # TODO in TF 1.4 use: num_parallel_calls=multiprocessing.cpu_count() + 1,
                num_threads=multiprocessing.cpu_count() + 1,
                # buffer the data as CPUs * batch_size + minimum_size
                output_buffer_size=batch_size * multiprocessing.cpu_count() +
                self.min_queue_examples)
        if self.padded_shapes is not None:
            dataset = dataset.padded_batch(batch_size, self.padded_shapes,
                                           self.padded_values)
        else:
            dataset = dataset.batch(batch_size)
        return dataset.make_one_shot_iterator().get_next()
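The enumerate/filter/map chain above keeps exactly those elements whose index is congruent to this worker's index modulo the number of workers. As the TODO notes, TF 1.4 and later express the same split in one call; a minimal sketch, assuming the same `task_spec` fields:

# TF >= 1.4 equivalent of the enumerate/filter/map sharding above.
dataset = dataset.shard(task_spec.num_workers, task_spec.index)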