Code Example #1
File: pipeline.py Project: BillBezos/magentaVAE
def run_pipeline_serial(pipeline,
                        input_iterator,
                        output_dir,
                        output_file_base=None):
  if isinstance(pipeline.output_type, dict):
    for name, type_ in pipeline.output_type.items():
      if not hasattr(type_, 'SerializeToString'):
        raise ValueError(
            'Pipeline output "%s" does not have method SerializeToString. '
            'Output type = %s' % (name, pipeline.output_type))
  else:
    if not hasattr(pipeline.output_type, 'SerializeToString'):
      raise ValueError(
          'Pipeline output type %s does not have method SerializeToString.'
          % pipeline.output_type)

  if not tf.gfile.Exists(output_dir):
    tf.gfile.MakeDirs(output_dir)

  output_names = pipeline.output_type_as_dict.keys()

  if output_file_base is None:
    output_paths = [os.path.join(output_dir, name + '.tfrecord')
                    for name in output_names]
  else:
    output_paths = [os.path.join(output_dir,
                                 '%s_%s.tfrecord' % (output_file_base, name))
                    for name in output_names]

  writers = dict([(name, tf.python_io.TFRecordWriter(path))
                  for name, path in zip(output_names, output_paths)])

  total_inputs = 0
  total_outputs = 0
  stats = []
  for input_ in input_iterator:
    total_inputs += 1
    # Fork-specific addition: record the raw input via pipeline.mw before transforming.
    pipeline.mw.write(pipeline.mw.model_dir, 'input_', input_)
    for name, outputs in _guarantee_dict(pipeline.transform(input_),
                                         list(output_names)[0]).items():
      for output in outputs:
        #filename = 'asdf' + f'{int(random.random() * 100)}'
        #pipeline.mw.write(pipeline.mw.model_dir, filename, output)
        #pipeline.mw.write(pipeline.mw.model_dir, filename + 'z', output.SerializeToString())
        writers[name].write(output.SerializeToString())
      total_outputs += len(outputs)
    stats = statistics.merge_statistics(stats + pipeline.get_stats())
    if total_inputs % 500 == 0:
      tf.logging.info('Processed %d inputs so far. Produced %d outputs.',
                      total_inputs, total_outputs)
      statistics.log_statistics_list(stats, tf.logging.info)
  tf.logging.info('\n\nCompleted.\n')
  tf.logging.info('Processed %d inputs total. Produced %d outputs.',
                  total_inputs, total_outputs)
  statistics.log_statistics_list(stats, tf.logging.info)
Code Example #2
File: pipeline.py Project: timgates42/magenta
def load_pipeline(pipeline, input_iterator):
    """Runs a pipeline saving the output into memory.

  Use this instead of `run_pipeline_serial` to build a dataset on the fly
  without saving it to disk.

  Args:
    pipeline: A Pipeline instance.
    input_iterator: Iterates over the input data. Items returned by it are fed
        directly into the pipeline's `transform` method.

  Returns:
    The aggregated return values of pipeline.transform. Specifically a
    dictionary mapping dataset names to lists of objects. Each name acts
    as a bucket where outputs are aggregated.
  """
    aggregated_outputs = dict(
        (name, []) for name in pipeline.output_type_as_dict)
    total_inputs = 0
    total_outputs = 0
    stats = []
    for input_object in input_iterator:
        total_inputs += 1
        outputs = _guarantee_dict(pipeline.transform(input_object),
                                  list(aggregated_outputs.keys())[0])
        for name, output_list in outputs.items():
            aggregated_outputs[name].extend(output_list)
            total_outputs += len(output_list)
        stats = statistics.merge_statistics(stats + pipeline.get_stats())
        if total_inputs % 500 == 0:
            tf.logging.info('Processed %d inputs so far. Produced %d outputs.',
                            total_inputs, total_outputs)
            statistics.log_statistics_list(stats, tf.logging.info)
    tf.logging.info('\n\nCompleted.\n')
    tf.logging.info('Processed %d inputs total. Produced %d outputs.',
                    total_inputs, total_outputs)
    statistics.log_statistics_list(stats, tf.logging.info)
    return aggregated_outputs
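
As a usage illustration for `load_pipeline`, the sketch below feeds a toy pipeline whose `transform` wraps each input number in a `tf.train.Example` proto. `ToyPipeline` and the input values are hypothetical stand-ins for a real Magenta `Pipeline` subclass and data source, not part of the Magenta API; TensorFlow 1.x is assumed, matching the `tf.logging`/`tf.gfile` calls in the examples.

# Hypothetical usage sketch for load_pipeline; ToyPipeline is an
# illustrative stand-in, not part of the Magenta API (TF 1.x assumed).
import tensorflow as tf


class ToyPipeline(object):
  """Minimal object exposing the attributes load_pipeline touches."""

  output_type = tf.train.Example
  output_type_as_dict = {'examples': tf.train.Example}

  def transform(self, value):
    # Wrap each input float in a single-feature Example proto.
    feature = tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
    features = tf.train.Features(feature={'value': feature})
    return {'examples': [tf.train.Example(features=features)]}

  def get_stats(self):
    return []


dataset = load_pipeline(ToyPipeline(), iter([0.5, 1.5, 2.5]))
print(len(dataset['examples']))  # 3 aggregated Example protos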
Code Example #3
File: pipeline.py Project: danabo/magenta
def load_pipeline(pipeline, input_iterator):
  """Runs a pipeline saving the output into memory.

  Use this instead of `run_pipeline_serial` to build a dataset on the fly
  without saving it to disk.

  Args:
    pipeline: A Pipeline instance.
    input_iterator: Iterates over the input data. Items returned by it are fed
        directly into the pipeline's `transform` method.

  Returns:
    The aggregated return values of pipeline.transform. Specifically a
    dictionary mapping dataset names to lists of objects. Each name acts
    as a bucket where outputs are aggregated.
  """
  aggregated_outputs = dict(
      [(name, []) for name in pipeline.output_type_as_dict])
  total_inputs = 0
  total_outputs = 0
  stats = []
  for input_object in input_iterator:
    total_inputs += 1
    outputs = _guarantee_dict(pipeline.transform(input_object),
                              list(aggregated_outputs.keys())[0])
    for name, output_list in outputs.items():
      aggregated_outputs[name].extend(output_list)
      total_outputs += len(output_list)
    stats = statistics.merge_statistics(stats + pipeline.get_stats())
    if total_inputs % 500 == 0:
      tf.logging.info('Processed %d inputs so far. Produced %d outputs.',
                      total_inputs, total_outputs)
      statistics.log_statistics_list(stats, tf.logging.info)
  tf.logging.info('\n\nCompleted.\n')
  tf.logging.info('Processed %d inputs total. Produced %d outputs.',
                  total_inputs, total_outputs)
  statistics.log_statistics_list(stats, tf.logging.info)
  return aggregated_outputs
Code Example #4
File: pipeline.py Project: timgates42/magenta
def run_pipeline_serial(pipeline,
                        input_iterator,
                        output_dir,
                        output_file_base=None):
    """Runs the a pipeline on a data source and writes to a directory.

  Run the pipeline on each input from the iterator one at a time.
  A file will be written to `output_dir` for each dataset name specified
  by the pipeline. pipeline.transform is called on each input and the
  results are aggregated into their correct datasets.

  The output type or types given by `pipeline.output_type` must be protocol
  buffers or objects that have a SerializeToString method.

  Args:
    pipeline: A Pipeline instance. `pipeline.output_type` must be a protocol
        buffer or a dictionary mapping names to protocol buffers.
    input_iterator: Iterates over the input data. Items returned by it are fed
        directly into the pipeline's `transform` method.
    output_dir: Path to directory where datasets will be written. Each dataset
        is a file whose name contains the pipeline's dataset name. If the
        directory does not exist, it will be created.
    output_file_base: An optional string prefix for all datasets output by this
        run. The prefix will also be followed by an underscore.

  Raises:
    ValueError: If any of `pipeline`'s output types do not have a
        SerializeToString method.
  """
    if isinstance(pipeline.output_type, dict):
        for name, type_ in pipeline.output_type.items():
            if not hasattr(type_, 'SerializeToString'):
                raise ValueError(
                    'Pipeline output "%s" does not have method SerializeToString. '
                    'Output type = %s' % (name, pipeline.output_type))
    else:
        if not hasattr(pipeline.output_type, 'SerializeToString'):
            raise ValueError(
                'Pipeline output type %s does not have method SerializeToString.'
                % pipeline.output_type)

    if not tf.gfile.Exists(output_dir):
        tf.gfile.MakeDirs(output_dir)

    output_names = pipeline.output_type_as_dict.keys()

    if output_file_base is None:
        output_paths = [
            os.path.join(output_dir, name + '.tfrecord')
            for name in output_names
        ]
    else:
        output_paths = [
            os.path.join(output_dir,
                         '%s_%s.tfrecord' % (output_file_base, name))
            for name in output_names
        ]

    writers = dict((name, tf.python_io.TFRecordWriter(path))
                   for name, path in zip(output_names, output_paths))

    total_inputs = 0
    total_outputs = 0
    stats = []
    for input_ in input_iterator:
        total_inputs += 1
        for name, outputs in _guarantee_dict(pipeline.transform(input_),
                                             list(output_names)[0]).items():
            for output in outputs:  # pylint:disable=not-an-iterable
                writers[name].write(output.SerializeToString())
            total_outputs += len(outputs)
        stats = statistics.merge_statistics(stats + pipeline.get_stats())
        if total_inputs % 500 == 0:
            tf.logging.info('Processed %d inputs so far. Produced %d outputs.',
                            total_inputs, total_outputs)
            statistics.log_statistics_list(stats, tf.logging.info)
    tf.logging.info('\n\nCompleted.\n')
    tf.logging.info('Processed %d inputs total. Produced %d outputs.',
                    total_inputs, total_outputs)
    statistics.log_statistics_list(stats, tf.logging.info)
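
For orientation, a hypothetical invocation of `run_pipeline_serial` is sketched below, reusing the `ToyPipeline` stand-in from the `load_pipeline` sketch above. `tf.train.Example` provides `SerializeToString`, so the type check at the top of the function passes; the output directory and file prefix are illustrative.

# Hypothetical invocation (TF 1.x assumed); paths and names are illustrative.
run_pipeline_serial(
    ToyPipeline(),              # output_type has SerializeToString
    iter([0.5, 1.5, 2.5]),      # any iterator of inputs works
    '/tmp/toy_dataset',         # created if it does not exist
    output_file_base='toy')     # writes /tmp/toy_dataset/toy_examples.tfrecord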
Code Example #5
def run_pipeline_text(pipeline, input_iterator, output_dir):
    """Runs a pipeline graph saving output to disk as text.
     
    Run the pipeline on each input from the iterator one at a time.
    A file will be written to `output_dir` for each dataset name specified
    by the pipeline. pipeline.transform is called on each input and the
    results are aggregated into their correct datasets.

    The output type given by `pipeline.output_type` must be str.

    Args:
        pipeline: A Pipeline instance. `pipeline.output_type` must be a str.
        input_iterator: Iterates over the input data. Items returned by it are fed
            directly into the pipeline's `transform` method.
        output_dir: Path to directory where datasets will be written. Each dataset
            is a file whose name contains the pipeline's dataset name. If the
            directory does not exist, it will be created.
            
    Raises:
        ValueError: If any of `pipeline`'s output types is not str.
    """

    if isinstance(pipeline.output_type, dict):
        for name, type_ in pipeline.output_type.items():
            if type_ != str:
                raise ValueError('Pipeline "%s" must output %s type. '
                                 'Output type was %s' % (name, str, type_))
    else:
        if pipeline.output_type != str:
            raise ValueError('Pipeline must output %s type. '
                             'Output type was %s' %
                             (str, pipeline.output_type))

    aggregated_outputs = dict([(name, [])
                               for name in pipeline.output_type_as_dict])
    total_inputs = 0
    total_outputs = 0
    stats = []

    # Create the output directory if needed, as promised in the docstring.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    output_names = pipeline.output_type_as_dict.keys()
    output_paths = [
        os.path.join(output_dir, name + '.txt') for name in output_names
    ]

    for path in output_paths:
        if os.path.exists(path):
            raise FileExistsError(
                'File {} already exists. Please remove and try again.'.format(
                    path))

    writers = dict([(name, open(path, 'a'))
                    for name, path in zip(output_names, output_paths)])

    for input_object in input_iterator:
        total_inputs += 1

        for name, outputs in _guarantee_dict(pipeline.transform(input_object),
                                             list(output_names)[0]).items():

            for output in outputs:
                writers[name].write(output + '\n')

            # Also keep outputs in memory so the returned dict is populated.
            aggregated_outputs[name].extend(outputs)
            total_outputs += len(outputs)
        stats = statistics.merge_statistics(stats + pipeline.get_stats())
        if total_inputs % 5000 == 0:
            tf.logging.info('Processed %d inputs so far. Produced %d outputs.',
                            total_inputs, total_outputs)
            statistics.log_statistics_list(stats, tf.logging.info)
    tf.logging.info('\n\nCompleted.\n')
    tf.logging.info('Processed %d inputs total. Produced %d outputs.',
                    total_inputs, total_outputs)
    statistics.log_statistics_list(stats, tf.logging.info)
    return aggregated_outputs
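
A hypothetical call to `run_pipeline_text` might look like the sketch below; `LinePipeline` is an illustrative stand-in whose `output_type` is `str`, so the type check passes and each transformed line is appended to `<name>.txt` under the output directory.

# Hypothetical usage sketch for run_pipeline_text; LinePipeline is illustrative.
class LinePipeline(object):
    """Minimal stand-in whose outputs are plain strings."""

    output_type = str
    output_type_as_dict = {'lines': str}

    def transform(self, value):
        # Emit one upper-cased line per input string.
        return {'lines': [value.upper()]}

    def get_stats(self):
        return []


run_pipeline_text(LinePipeline(), iter(['abc', 'def']), '/tmp/toy_text')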
Code Example #6
File: pipeline.py Project: danabo/magenta
def run_pipeline_serial(pipeline,
                        input_iterator,
                        output_dir,
                        output_file_base=None):
  """Runs the a pipeline on a data source and writes to a directory.

  Run the pipeline on each input from the iterator one at a time.
  A file will be written to `output_dir` for each dataset name specified
  by the pipeline. pipeline.transform is called on each input and the
  results are aggregated into their correct datasets.

  The output type or types given by `pipeline.output_type` must be protocol
  buffers or objects that have a SerializeToString method.

  Args:
    pipeline: A Pipeline instance. `pipeline.output_type` must be a protocol
        buffer or a dictionary mapping names to protocol buffers.
    input_iterator: Iterates over the input data. Items returned by it are fed
        directly into the pipeline's `transform` method.
    output_dir: Path to directory where datasets will be written. Each dataset
        is a file whose name contains the pipeline's dataset name. If the
        directory does not exist, it will be created.
    output_file_base: An optional string prefix for all datasets output by this
        run. The prefix will also be followed by an underscore.

  Raises:
    ValueError: If any of `pipeline`'s output types do not have a
        SerializeToString method.
  """
  if isinstance(pipeline.output_type, dict):
    for name, type_ in pipeline.output_type.items():
      if not hasattr(type_, 'SerializeToString'):
        raise ValueError(
            'Pipeline output "%s" does not have method SerializeToString. '
            'Output type = %s' % (name, pipeline.output_type))
  else:
    if not hasattr(pipeline.output_type, 'SerializeToString'):
      raise ValueError(
          'Pipeline output type %s does not have method SerializeToString.'
          % pipeline.output_type)

  if not tf.gfile.Exists(output_dir):
    tf.gfile.MakeDirs(output_dir)

  output_names = pipeline.output_type_as_dict.keys()

  if output_file_base is None:
    output_paths = [os.path.join(output_dir, name + '.tfrecord')
                    for name in output_names]
  else:
    output_paths = [os.path.join(output_dir,
                                 '%s_%s.tfrecord' % (output_file_base, name))
                    for name in output_names]

  writers = dict([(name, tf.python_io.TFRecordWriter(path))
                  for name, path in zip(output_names, output_paths)])

  total_inputs = 0
  total_outputs = 0
  stats = []
  for input_ in input_iterator:
    total_inputs += 1
    for name, outputs in _guarantee_dict(pipeline.transform(input_),
                                         list(output_names)[0]).items():
      for output in outputs:
        writers[name].write(output.SerializeToString())
        total_outputs += 1
    stats = statistics.merge_statistics(stats + pipeline.get_stats())
    if total_inputs % 500 == 0:
      tf.logging.info('Processed %d inputs so far. Produced %d outputs.',
                      total_inputs, total_outputs)
      statistics.log_statistics_list(stats, tf.logging.info)
  tf.logging.info('\n\nCompleted.\n')
  tf.logging.info('Processed %d inputs total. Produced %d outputs.',
                  total_inputs, total_outputs)
  statistics.log_statistics_list(stats, tf.logging.info)
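
Finally, to inspect what `run_pipeline_serial` wrote, the records can be read back with `tf.python_io.tf_record_iterator` under TensorFlow 1.x; the path below is the illustrative one used in the sketches above.

# Hypothetical read-back of a TFRecord file written by run_pipeline_serial.
import tensorflow as tf

for serialized in tf.python_io.tf_record_iterator(
    '/tmp/toy_dataset/toy_examples.tfrecord'):
  example = tf.train.Example.FromString(serialized)  # parse the proto back
  print(example)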