def load_pipeline(pipeline, input_iterator):
  """Runs a pipeline, saving the output into memory.

  Use this instead of `run_pipeline_serial` to build a dataset on the fly
  without saving it to disk.

  Args:
    pipeline: A Pipeline instance.
    input_iterator: Iterates over the input data. Items returned by it are fed
        directly into the pipeline's `transform` method.

  Returns:
    The aggregated return values of pipeline.transform. Specifically a
    dictionary mapping dataset names to lists of objects. Each name acts as a
    bucket where outputs are aggregated.
  """
  aggregated_outputs = dict(
      (name, []) for name in pipeline.output_type_as_dict)
  total_inputs = 0
  total_outputs = 0
  stats = []
  for input_object in input_iterator:
    total_inputs += 1
    outputs = _guarantee_dict(pipeline.transform(input_object),
                              list(aggregated_outputs.keys())[0])
    for name, output_list in outputs.items():
      aggregated_outputs[name].extend(output_list)
      total_outputs += len(output_list)
    stats = statistics.merge_statistics(stats + pipeline.get_stats())
    if total_inputs % 500 == 0:
      tf.logging.info('Processed %d inputs so far. Produced %d outputs.',
                      total_inputs, total_outputs)
      statistics.log_statistics_list(stats, tf.logging.info)
  tf.logging.info('\n\nCompleted.\n')
  tf.logging.info('Processed %d inputs total. Produced %d outputs.',
                  total_inputs, total_outputs)
  statistics.log_statistics_list(stats, tf.logging.info)
  return aggregated_outputs
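

# Usage sketch (added for illustration; everything below is hypothetical and
# not part of this module's API). `_ExamplePassThroughPipeline` is a minimal
# duck-typed stand-in that exposes only the attributes the runner functions in
# this module actually read: `output_type`, `output_type_as_dict`, `transform`,
# and `get_stats`. A real pipeline would subclass the project's Pipeline base
# class instead.
class _ExamplePassThroughPipeline(object):
  """Hypothetical pipeline that emits each input unchanged."""

  def __init__(self, output_type, dataset_name='examples'):
    # `output_type` is a protocol buffer class for `run_pipeline_serial`, or
    # `str` for `run_pipeline_text`.
    self.output_type = output_type
    self.output_type_as_dict = {dataset_name: output_type}
    self._dataset_name = dataset_name

  def transform(self, input_object):
    # Outputs are keyed by dataset name, mirroring output_type_as_dict.
    return {self._dataset_name: [input_object]}

  def get_stats(self):
    # This toy pipeline produces no statistics.
    return []


# Building a dataset in memory with `load_pipeline` (inputs are illustrative):
#   outputs = load_pipeline(_ExamplePassThroughPipeline(MyProto),
#                           iter(my_protos))
#   examples = outputs['examples']  # outputs aggregated under 'examples'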


def run_pipeline_serial(pipeline,
                        input_iterator,
                        output_dir,
                        output_file_base=None):
  """Runs a pipeline on a data source and writes to a directory.

  Run the pipeline on each input from the iterator one at a time.
  A file will be written to `output_dir` for each dataset name specified
  by the pipeline. pipeline.transform is called on each input and the
  results are aggregated into their correct datasets.

  The output type or types given by `pipeline.output_type` must be protocol
  buffers or objects that have a SerializeToString method.

  Args:
    pipeline: A Pipeline instance. `pipeline.output_type` must be a protocol
        buffer or a dictionary mapping names to protocol buffers.
    input_iterator: Iterates over the input data. Items returned by it are fed
        directly into the pipeline's `transform` method.
    output_dir: Path to directory where datasets will be written. Each dataset
        is a file whose name contains the pipeline's dataset name. If the
        directory does not exist, it will be created.
    output_file_base: An optional string prefix for all datasets output by this
        run. The prefix will also be followed by an underscore.

  Raises:
    ValueError: If any of `pipeline`'s output types do not have a
        SerializeToString method.
  """
  if isinstance(pipeline.output_type, dict):
    for name, type_ in pipeline.output_type.items():
      if not hasattr(type_, 'SerializeToString'):
        raise ValueError(
            'Pipeline output "%s" does not have method SerializeToString. '
            'Output type = %s' % (name, pipeline.output_type))
  else:
    if not hasattr(pipeline.output_type, 'SerializeToString'):
      raise ValueError(
          'Pipeline output type %s does not have method SerializeToString.'
          % pipeline.output_type)

  if not tf.gfile.Exists(output_dir):
    tf.gfile.MakeDirs(output_dir)

  output_names = pipeline.output_type_as_dict.keys()

  if output_file_base is None:
    output_paths = [os.path.join(output_dir, name + '.tfrecord')
                    for name in output_names]
  else:
    output_paths = [os.path.join(output_dir,
                                 '%s_%s.tfrecord' % (output_file_base, name))
                    for name in output_names]

  writers = dict((name, tf.python_io.TFRecordWriter(path))
                 for name, path in zip(output_names, output_paths))

  total_inputs = 0
  total_outputs = 0
  stats = []
  for input_ in input_iterator:
    total_inputs += 1
    for name, outputs in _guarantee_dict(pipeline.transform(input_),
                                         list(output_names)[0]).items():
      for output in outputs:  # pylint:disable=not-an-iterable
        writers[name].write(output.SerializeToString())
      total_outputs += len(outputs)
    stats = statistics.merge_statistics(stats + pipeline.get_stats())
    if total_inputs % 500 == 0:
      tf.logging.info('Processed %d inputs so far. Produced %d outputs.',
                      total_inputs, total_outputs)
      statistics.log_statistics_list(stats, tf.logging.info)
  tf.logging.info('\n\nCompleted.\n')
  tf.logging.info('Processed %d inputs total. Produced %d outputs.',
                  total_inputs, total_outputs)
  statistics.log_statistics_list(stats, tf.logging.info)
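

# Usage sketch (hypothetical names; see the stand-in pipeline class defined
# after load_pipeline above). This writes one TFRecord file per dataset name,
# e.g. <output_dir>/training_examples.tfrecord when
# output_file_base='training':
#
#   run_pipeline_serial(
#       _ExamplePassThroughPipeline(MyProto),
#       iter(my_protos),              # any iterable of MyProto messages
#       output_dir='/tmp/pipeline_output',
#       output_file_base='training')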


def run_pipeline_text(pipeline, input_iterator, output_dir):
  """Runs a pipeline graph saving output to disk as text.

  Run the pipeline on each input from the iterator one at a time.
  A file will be written to `output_dir` for each dataset name specified
  by the pipeline. pipeline.transform is called on each input and the
  results are aggregated into their correct datasets.

  The output type given by `pipeline.output_type` must be str.

  Args:
    pipeline: A Pipeline instance. `pipeline.output_type` must be a str.
    input_iterator: Iterates over the input data. Items returned by it are fed
        directly into the pipeline's `transform` method.
    output_dir: Path to directory where datasets will be written. Each dataset
        is a file whose name contains the pipeline's dataset name. If the
        directory does not exist, it will be created.

  Raises:
    ValueError: If any of `pipeline`'s output types is not str.
  """
  if isinstance(pipeline.output_type, dict):
    for name, type_ in pipeline.output_type.items():
      if type_ != str:
        raise ValueError('Pipeline "%s" must output %s type. '
                         'Output type was %s' % (name, str, type_))
  else:
    if pipeline.output_type != str:
      raise ValueError('Pipeline must output %s type. '
                       'Output type was %s' % (str, pipeline.output_type))

  aggregated_outputs = dict(
      (name, []) for name in pipeline.output_type_as_dict)
  total_inputs = 0
  total_outputs = 0
  stats = []
  output_names = pipeline.output_type_as_dict.keys()

  output_paths = [os.path.join(output_dir, name + '.txt')
                  for name in output_names]
  for path in output_paths:
    if os.path.exists(path):
      raise FileExistsError(
          'File {} already exists. Please remove and try again.'.format(path))

  writers = dict((name, open(path, 'a'))
                 for name, path in zip(output_names, output_paths))
  for input_object in input_iterator:
    total_inputs += 1
    for name, outputs in _guarantee_dict(pipeline.transform(input_object),
                                         list(output_names)[0]).items():
      for output in outputs:
        writers[name].write(output + '\n')
      total_outputs += len(outputs)
    stats = statistics.merge_statistics(stats + pipeline.get_stats())
    if total_inputs % 5000 == 0:
      tf.logging.info('Processed %d inputs so far. Produced %d outputs.',
                      total_inputs, total_outputs)
      statistics.log_statistics_list(stats, tf.logging.info)
  tf.logging.info('\n\nCompleted.\n')
  tf.logging.info('Processed %d inputs total. Produced %d outputs.',
                  total_inputs, total_outputs)
  statistics.log_statistics_list(stats, tf.logging.info)
  return aggregated_outputs
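

# Usage sketch (hypothetical names). `run_pipeline_text` requires str outputs,
# so the stand-in pipeline defined above is parameterized with `str`; each
# output string becomes one line of <output_dir>/examples.txt:
#
#   run_pipeline_text(
#       _ExamplePassThroughPipeline(str),
#       iter(['line one', 'line two']),
#       output_dir='/tmp/pipeline_text_output')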