def main(argv):
  del argv
  absl.flags.mark_flags_as_required(['output_directory'])
  tf.io.gfile.makedirs(FLAGS.output_directory)

  splits = collections.defaultdict(list)
  for split in FLAGS.expected_splits.split(','):
    split_base = FLAGS.base + split
    wavs = tf.io.gfile.glob(split_base + FLAGS.wav_dir)
    midis = tf.io.gfile.glob(split_base + FLAGS.midi_dir)
    splits[split] = list(zip(wavs, midis))

  if sorted(splits.keys()) != sorted(FLAGS.expected_splits.split(',')):
    raise ValueError('Got unexpected set of splits: %s' % splits.keys())

  pipeline_options = beam.options.pipeline_options.PipelineOptions(
      FLAGS.pipeline_options)
  with beam.Pipeline(options=pipeline_options) as p:
    for split in splits:
      split_p = p | 'prepare_split_%s' % split >> beam.Create(splits[split])
      split_p |= 'shuffle_input_%s' % split >> beam.Reshuffle()
      split_p |= 'create_examples_%s' % split >> beam.ParDo(
          CreateExampleDoFn(FLAGS.base + split, FLAGS.add_wav_glob))
      split_p |= 'shuffle_output_%s' % split >> beam.Reshuffle()
      split_p |= 'write_%s' % split >> beam.io.WriteToTFRecord(
          os.path.join(FLAGS.output_directory, '%s.tfrecord' % split),
          coder=beam.coders.ProtoCoder(tf.train.Example),
          num_shards=FLAGS.num_shards)
def expand(self, pipeline):
  # The Reshuffles allow for better parallelism.
  return (pipeline
          | "create_shards" >> beam.Create(self.shards)
          | "shard_reshuffle" >> beam.Reshuffle()
          | "emit_examples" >> beam.FlatMap(self._emit_examples)
          | "example_reshuffle" >> beam.Reshuffle())
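# Hedged sketch (not from the original sources): a self-contained transform in
# the same Create -> Reshuffle -> FlatMap -> Reshuffle shape as the expand()
# above, with a hypothetical shard list and emitter, to make the "better
# parallelism" comment concrete. The Reshuffles stop Beam from fusing shard
# creation with example emission, so different shards can land on different
# workers.
import apache_beam as beam


class EmitShardedExamples(beam.PTransform):
  """Hypothetical transform: expands a small list of shards into examples."""

  def __init__(self, shards):
    self._shards = shards  # e.g. ['shard-00000', 'shard-00001'] (hypothetical)

  def _emit_examples(self, shard):
    # Stand-in for per-shard reading; yields a few examples per shard.
    for i in range(3):
      yield '%s/example-%d' % (shard, i)

  def expand(self, pipeline):
    return (pipeline
            | 'create_shards' >> beam.Create(self._shards)
            | 'shard_reshuffle' >> beam.Reshuffle()
            | 'emit_examples' >> beam.FlatMap(self._emit_examples)
            | 'example_reshuffle' >> beam.Reshuffle())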
def pipeline(): """Pipeline for dataset creation.""" pipeline_options = beam.options.pipeline_options.PipelineOptions( FLAGS.pipeline_options.split(',')) with beam.Pipeline(options=pipeline_options) as p: tf.flags.mark_flags_as_required(['output_directory']) splits = [ ('train', generate_sharded_filenames(FLAGS.train_tfrecord)), ('validation', generate_sharded_filenames(FLAGS.validation_tfrecord)), ('test', generate_sharded_filenames(FLAGS.test_tfrecord)), ] for split_name, split_tfrecord in splits: split_p = p | 'tfrecord_list_%s' % split_name >> beam.Create( split_tfrecord) split_p |= 'read_tfrecord_%s' % split_name >> ( beam.io.tfrecordio.ReadAllFromTFRecord( coder=beam.coders.ProtoCoder(tf.train.Example))) split_p |= 'shuffle_input_%s' % split_name >> beam.Reshuffle() split_p |= 'split_wav_%s' % split_name >> beam.ParDo( SplitWavDoFn(FLAGS.min_length, FLAGS.max_length, FLAGS.sample_rate, split_name, FLAGS.output_directory)) split_p |= 'shuffle_output_%s' % split_name >> beam.Reshuffle() split_p |= 'write_%s' % split_name >> beam.io.WriteToTFRecord( os.path.join(FLAGS.output_directory, '%s.tfrecord' % split_name), coder=beam.coders.ProtoCoder(tf.train.Example))
def main(argv):
  del argv  # Unused.
  p = beam.Pipeline()
  version_config = _get_version_config(FLAGS.fhir_version_config)

  keyed_bundles = (
      p
      | 'readBundles' >> beam.io.ReadFromTFRecord(
          FLAGS.input_filepattern,
          coder=beam.coders.ProtoCoder(resources_pb2.Bundle))
      | 'KeyBundlesByPatientId' >> beam.ParDo(
          bundle_to_seqex.KeyBundleByPatientIdFn()))
  event_labels = (
      p
      | 'readEventLabels' >> beam.io.ReadFromTFRecord(
          FLAGS.labels_filepattern,
          coder=beam.coders.ProtoCoder(google_extensions_pb2.EventLabel)))
  keyed_event_labels = bundle_to_seqex.CreateTriggerLabelsPairLists(
      event_labels)
  bundles_and_labels = bundle_to_seqex.CreateBundleAndLabels(
      keyed_bundles, keyed_event_labels)
  _ = (
      bundles_and_labels
      | 'Reshuffle1' >> beam.Reshuffle()
      | 'GenerateSeqex' >> beam.ParDo(
          bundle_to_seqex.BundleAndLabelsToSeqexDoFn(
              version_config=version_config, enable_attribution=False))
      | 'Reshuffle2' >> beam.Reshuffle()
      | 'WriteSeqex' >> beam.io.WriteToTFRecord(
          FLAGS.output_filepattern,
          coder=beam.coders.ProtoCoder(example_pb2.SequenceExample)))

  result = p.run()
  logging.info('Job result: %s', result)
def pipeline(config_map, dataset_config_map):
  """Pipeline for dataset creation."""
  tf.flags.mark_flags_as_required(['output_directory'])

  pipeline_options = beam.options.pipeline_options.PipelineOptions(
      FLAGS.pipeline_options.split(','))

  config = config_map[FLAGS.config]
  hparams = config.hparams
  hparams.parse(FLAGS.hparams)

  datasets = dataset_config_map[FLAGS.dataset_config]

  if tf.gfile.Exists(FLAGS.output_directory):
    raise ValueError(
        'Output directory %s already exists!' % FLAGS.output_directory)
  tf.gfile.MakeDirs(FLAGS.output_directory)
  with tf.gfile.Open(
      os.path.join(FLAGS.output_directory, 'config.txt'), 'w') as f:
    f.write('\n\n'.join([
        'min_length: {}'.format(FLAGS.min_length),
        'max_length: {}'.format(FLAGS.max_length),
        'sample_rate: {}'.format(FLAGS.sample_rate),
        'preprocess_examples: {}'.format(FLAGS.preprocess_examples),
        'preprocess_train_example_multiplier: {}'.format(
            FLAGS.preprocess_train_example_multiplier),
        'config: {}'.format(FLAGS.config),
        'hparams: {}'.format(hparams),
        'dataset_config: {}'.format(FLAGS.dataset_config),
        'datasets: {}'.format(datasets),
    ]))

  with beam.Pipeline(options=pipeline_options) as p:
    for dataset in datasets:
      split_p = p | 'tfrecord_list_%s' % dataset.name >> beam.Create(
          generate_sharded_filenames(dataset.path))
      split_p |= 'read_tfrecord_%s' % dataset.name >> (
          beam.io.tfrecordio.ReadAllFromTFRecord(
              coder=beam.coders.ProtoCoder(tf.train.Example)))
      split_p |= 'shuffle_input_%s' % dataset.name >> beam.Reshuffle()
      split_p |= 'split_wav_%s' % dataset.name >> beam.FlatMap(
          split_wav, FLAGS.min_length, FLAGS.max_length, FLAGS.sample_rate,
          FLAGS.output_directory, dataset.process_for_training,
          FLAGS.load_audio_with_librosa)
      if FLAGS.preprocess_examples:
        if dataset.process_for_training:
          mul_name = 'preprocess_multiply_%dx_%s' % (
              FLAGS.preprocess_train_example_multiplier, dataset.name)
          split_p |= mul_name >> beam.FlatMap(
              multiply_example, FLAGS.preprocess_train_example_multiplier)
        split_p |= 'preprocess_%s' % dataset.name >> beam.Map(
            preprocess_data, hparams, dataset.process_for_training)
      split_p |= 'shuffle_output_%s' % dataset.name >> beam.Reshuffle()
      split_p |= 'write_%s' % dataset.name >> beam.io.WriteToTFRecord(
          os.path.join(FLAGS.output_directory, '%s.tfrecord' % dataset.name),
          coder=beam.coders.ProtoCoder(tf.train.Example))
def expand(self, pbegin):
  assert isinstance(pbegin, pvalue.PBegin), (
      'Input to transform must be a PBegin but found %s' % pbegin)
  return (
      pbegin
      | 'Impulse' >> beam.Impulse()
      | 'GenerateKeys' >> beam.ParDo(
          StatefulLoadGenerator.GenerateKeys(self.num_keys, self.key_size))
      | 'Reshuffle' >> beam.Reshuffle()
      | 'GenerateLoad' >> beam.ParDo(
          StatefulLoadGenerator.GenerateLoad(
              self.num_records // self.num_keys, self.value_size))
      | 'Reshuffle2' >> beam.Reshuffle())
def expand(self, pcoll):
  return (pcoll
          | 'MatchAll' >> fileio.MatchAll()
          | beam.Reshuffle()
          | 'ReadEach' >> fileio.ReadMatches()
          | beam.FlatMap(
              lambda rfile: csv.DictReader(io.TextIOWrapper(rfile.open()))))
def pipeline(root):
  seq2seq_inputs = (
      root
      | "Read" >> beam.io.ReadFromText(input_pattern)
      | "ParseJSONL" >> beam.Map(json.loads)
      | "ToSeq2SeqInput" >> beam.ParDo(
          ToSeq2SeqInput(
              vocab_model_file=vocab_model_file,
              task=task,
              delimiter_type=delimiter_type,
              include_source=include_source,
              include_evidence=include_evidence,
              include_distractors=include_distractors,
              evidence_marker_type=evidence_marker_type,
              max_input_length=max_input_length,
              filter_no_diff=filter_no_diff)))
  _ = (
      seq2seq_inputs
      | "ReShuffle" >> beam.Reshuffle()
      | "ToExample" >> beam.Map(tf_utils.to_example)
      | "Write" >> beam.io.tfrecordio.WriteToTFRecord(
          output_pattern,
          coder=beam.coders.ProtoCoder(tf.train.Example),
          num_shards=10))
  if plot_lengths:
    _ = (
        seq2seq_inputs
        | "GetLengths" >> beam.ParDo(GetLengths(vocab_model_file))
        | "Combine" >> beam.CombineGlobally(combine_lengths)
        | "PlotHistogram" >> beam.Map(
            plot_histogram, path=output_pattern + ".hist.png"))
def pipeline(root): """Beam pipeline for preprocessing open images.""" assert FLAGS.input_file_pattern assert FLAGS.output_dir assert FLAGS.output_name assert FLAGS.num_shards assert FLAGS.kepid_whitelist # Read label whitelist. kepid_whitelist = [ int(kepid) for kepid in FLAGS.kepid_whitelist.split(",") ] logging.info("Read Kepid whitelist with %d labels", len(kepid_whitelist)) # Initialize DoFn. process_example = ProcessExampleDoFn(kepid_whitelist) # Create Pipeline. # pylint: disable=expression-not-assigned (root | "read_tfrecord" >> beam.io.tfrecordio.ReadFromTFRecord( FLAGS.input_file_pattern, coder=beam.coders.ProtoCoder(tf.train.Example)) | "process_examples" >> beam.ParDo(process_example) | "reshuffle" >> beam.Reshuffle() | "write_tfrecord" >> beam.io.tfrecordio.WriteToTFRecord( os.path.join(FLAGS.output_dir, FLAGS.output_name), coder=beam.coders.ProtoCoder(tf.train.Example), num_shards=FLAGS.num_shards))
def create_pipeline(pipeline,
                    image_directory,
                    input_annotations_file,
                    output_tfrecord_prefix=None,
                    num_images_per_shard=200,
                    keep_bboxes=True):
  """Creates a beam pipeline for producing a COCO-CameraTraps Image dataset.

  Args:
    pipeline: Initialized beam pipeline.
    image_directory: Path to image directory
    input_annotations_file: Path to a coco-cameratraps annotation file
    output_tfrecord_prefix: Absolute path for tfrecord outputs. Final files will
      be named {output_tfrecord_prefix}@N.
    num_images_per_shard: The number of images to store in each shard
    keep_bboxes: Whether to keep any bounding boxes that exist in the json file
  """
  data = load_json_data(input_annotations_file)

  num_shards = int(np.ceil(float(len(data['images'])) / num_images_per_shard))

  image_examples = (
      pipeline
      | ('CreateCollections') >> beam.Create(
          [im['id'] for im in data['images']])
      | ('ParseImage') >> beam.ParDo(
          ParseImage(
              image_directory,
              data['images'],
              data['annotations'],
              data['categories'],
              keep_bboxes=keep_bboxes)))
  _ = (
      image_examples
      | ('Reshuffle') >> beam.Reshuffle()
      | ('WriteTfImageExample') >> beam.io.tfrecordio.WriteToTFRecord(
          output_tfrecord_prefix,
          num_shards=num_shards,
          coder=beam.coders.ProtoCoder(tf.train.Example)))
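# Hedged usage sketch for create_pipeline above; all paths are hypothetical and
# the runner is the local DirectRunner.
import apache_beam as beam

with beam.Pipeline() as p:
  create_pipeline(
      pipeline=p,
      image_directory='./images',                      # hypothetical
      input_annotations_file='./annotations.json',     # hypothetical
      output_tfrecord_prefix='./camera_traps.record',  # hypothetical
      num_images_per_shard=200,
      keep_bboxes=True)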
def construct_pipeline(pipeline, input_tfrecord, output_tfrecord, model_dir,
                       top_k_embedding_count, bottom_k_embedding_count,
                       num_shards):
  """Returns a beam pipeline to run object detection inference.

  Args:
    pipeline: Initialized beam pipeline.
    input_tfrecord: A TFRecord of tf.train.Example protos containing images.
    output_tfrecord: A TFRecord of tf.train.Example protos that contain images
      in the input TFRecord and the detections from the model.
    model_dir: Path to `saved_model` to use for inference.
    top_k_embedding_count: The number of high-confidence embeddings to store.
    bottom_k_embedding_count: The number of low-confidence embeddings to store.
    num_shards: The number of output shards.
  """
  input_collection = (
      pipeline
      | 'ReadInputTFRecord' >> beam.io.tfrecordio.ReadFromTFRecord(
          input_tfrecord, coder=beam.coders.BytesCoder())
      | 'AddKeys' >> beam.Map(add_keys))
  output_collection = input_collection | 'ExtractEmbedding' >> beam.ParDo(
      GenerateEmbeddingDataFn(model_dir, top_k_embedding_count,
                              bottom_k_embedding_count))
  output_collection = output_collection | 'Reshuffle' >> beam.Reshuffle()
  _ = output_collection | 'DropKeys' >> beam.Map(
      drop_keys) | 'WritetoDisk' >> beam.io.tfrecordio.WriteToTFRecord(
          output_tfrecord,
          num_shards=num_shards,
          coder=beam.coders.ProtoCoder(tf.train.Example))
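# Hedged usage sketch for construct_pipeline above; the paths and shard count
# are hypothetical.
import apache_beam as beam

with beam.Pipeline() as p:
  construct_pipeline(
      pipeline=p,
      input_tfrecord='./images.record',       # hypothetical
      output_tfrecord='./embeddings.record',  # hypothetical
      model_dir='./saved_model',              # hypothetical
      top_k_embedding_count=1,
      bottom_k_embedding_count=1,
      num_shards=10)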
def expand(self, root):
  paths_pcoll = root | beam.Create([self.path])
  match = io.filesystems.FileSystems.match([self.path], limits=[1])[0]
  if not match.metadata_list:
    # TODO(BEAM-12031): This should be allowed for streaming pipelines if
    # user provides an explicit schema.
    raise FileNotFoundError(f"Found no files that match {self.path!r}")
  first_path = match.metadata_list[0].path
  with io.filesystems.FileSystems.open(first_path) as handle:
    if not self.binary:
      handle = TextIOWrapper(handle)
    if self.incremental:
      sample = next(
          self.reader(handle, *self.args, **dict(self.kwargs, chunksize=100)))
    else:
      sample = self.reader(handle, *self.args, **self.kwargs)

  pcoll = (
      paths_pcoll
      | fileio.MatchFiles(self.path)
      | beam.Reshuffle()
      | fileio.ReadMatches()
      | beam.ParDo(
          _ReadFromPandasDoFn(self.reader, self.args, self.kwargs, self.binary,
                              self.incremental, self.splitter)))
  from apache_beam.dataframe import convert
  return convert.to_dataframe(
      pcoll, proxy=_prefix_range_index_with(':', sample[:0]))
def main(_, runner=None):
  # must create before flags are used
  if runner is None:
    runner = runners.DirectRunner()

  tasks = []
  for problem in problems.PROBLEMS_BY_NAME.values():
    if (FLAGS.problem_filter and
        not re.search(FLAGS.problem_filter, problem.name)):
      continue
    if FLAGS.quick_run and problem.width * problem.height > 64**2:
      continue
    for seed in range(-1, FLAGS.num_seeds):
      if seed >= 0:
        tasks.append((problem.name, seed, 'cnn', 'lbfgs'))
        tasks.append((problem.name, seed, 'pixels', 'lbfgs'))
        tasks.append((problem.name, seed, 'pixels', 'oc'))
        tasks.append((problem.name, seed, 'pixels', 'mma'))

  if not tasks:
    raise RuntimeError('no tasks to run')

  pipeline = (
      beam.Create(tasks)
      | beam.Map(run_optimization)
      | beam.Reshuffle()  # don't fuse optimizations together
      | 'group seeds' >> beam.GroupByKey()
      | beam.Map(groupby_seeds)
      | 'group methods' >> beam.GroupByKey()
      | beam.Map(groupby_methods)
      | beam.combiners.ToList()
      | beam.Map(save_all_losses))
  runner.run(pipeline)
def test_reshuffle_window_fn_preserved(self):
  pipeline = TestPipeline()
  data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
  expected_windows = [
      TestWindowedValue(v, t, [w]) for (v, t, w) in [
          ((1, 1), 1.0, IntervalWindow(1.0, 3.0)),
          ((2, 1), 1.0, IntervalWindow(1.0, 3.0)),
          ((3, 1), 1.0, IntervalWindow(1.0, 3.0)),
          ((1, 2), 2.0, IntervalWindow(2.0, 4.0)),
          ((2, 2), 2.0, IntervalWindow(2.0, 4.0)),
          ((1, 4), 4.0, IntervalWindow(4.0, 6.0))]]
  expected_merged_windows = [
      TestWindowedValue(v, t, [w]) for (v, t, w) in [
          ((1, contains_in_any_order([2, 1])), 4.0, IntervalWindow(1.0, 4.0)),
          ((2, contains_in_any_order([2, 1])), 4.0, IntervalWindow(1.0, 4.0)),
          ((3, [1]), 3.0, IntervalWindow(1.0, 3.0)),
          ((1, [4]), 6.0, IntervalWindow(4.0, 6.0))]]
  before_reshuffle = (
      pipeline
      | 'start' >> beam.Create(data)
      | 'add_timestamp' >> beam.Map(lambda v: TimestampedValue(v, v[1]))
      | 'window' >> beam.WindowInto(Sessions(gap_size=2)))
  assert_that(
      before_reshuffle,
      equal_to(expected_windows),
      label='before_reshuffle',
      reify_windows=True)
  after_reshuffle = before_reshuffle | beam.Reshuffle()
  assert_that(
      after_reshuffle,
      equal_to(expected_windows),
      label='after_reshuffle',
      reify_windows=True)
  after_group = after_reshuffle | beam.GroupByKey()
  assert_that(
      after_group,
      equal_to(expected_merged_windows),
      label='after_group',
      reify_windows=True)
  pipeline.run()
def test_reshuffle_windows_unchanged(self):
  pipeline = TestPipeline()
  data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
  expected_data = [
      TestWindowedValue(v, t, [w]) for (v, t, w) in [
          ((1, [2, 1]), 4.0, IntervalWindow(1.0, 4.0)),
          ((2, [2, 1]), 4.0, IntervalWindow(1.0, 4.0)),
          ((3, [1]), 3.0, IntervalWindow(1.0, 3.0)),
          ((1, [4]), 6.0, IntervalWindow(4.0, 6.0))]
  ]
  before_reshuffle = (
      pipeline
      | 'start' >> beam.Create(data)
      | 'add_timestamp' >> beam.Map(
          lambda v: beam.window.TimestampedValue(v, v[1]))
      | 'window' >> beam.WindowInto(Sessions(gap_size=2))
      | 'group_by_key' >> beam.GroupByKey())
  assert_that(
      before_reshuffle,
      equal_to(expected_data),
      label='before_reshuffle',
      reify_windows=True)
  after_reshuffle = before_reshuffle | 'reshuffle' >> beam.Reshuffle()
  assert_that(
      after_reshuffle,
      equal_to(expected_data),
      label='after reshuffle',
      reify_windows=True)
  pipeline.run()
def main(unused_argv):
  # Validate flags and setup directories.
  utils.validate_flags(FLAGS.train_glob, FLAGS.eval_glob, FLAGS.test_glob,
                       FLAGS.output_file)

  # Generate experiment parameters based on flags.
  exp_params = utils.experiment_params(
      FLAGS.embedding_list,
      FLAGS.speaker_id_name,
      FLAGS.label_name,
      FLAGS.label_list,
      FLAGS.train_glob,
      FLAGS.eval_glob,
      FLAGS.test_glob,
      FLAGS.save_model_dir,
      FLAGS.save_predictions_dir,
      FLAGS.eval_metric,
  )

  # Make and run beam pipeline.
  beam_options = None

  logging.info('Starting to create flume pipeline...')
  with beam.Pipeline(beam_options) as root:
    _ = (
        root
        | 'MakeCollection' >> beam.Create(exp_params)
        | 'CalcScores' >> beam.Map(
            lambda d: (d, utils.train_and_get_score(**d)))
        | 'FormatText' >> beam.Map(utils.format_text_line)
        | 'Reshuffle' >> beam.Reshuffle()
        | 'WriteOutput' >> beam.io.WriteToText(FLAGS.output_file, num_shards=1))
def pipeline(root):
  _ = (
      root
      | 'Read JSONL' >> beam.io.ReadFromText(file_pattern=_INPUT_JSONL.value)
      | 'Run Episodes' >> beam.ParDo(RunT5EpisodeFn())
      | 'Reshard' >> beam.Reshuffle()
      | 'Save' >> beam.io.WriteToText(_OUTPUT_PATH.value))
def _build_pcollection(self, pipeline, files, web_dir, wiki_dir, answer):
  if isinstance(self.builder_config, BigBirdTriviaQAConfig):
    self.builder_config.validate()
    question_answers = preprocess.read_question_answers(files[0])
    return preprocess.make_pipeline(
        pipeline,
        question_answers=question_answers,
        answer=answer,
        max_num_tokens=self.builder_config.sequence_length,
        max_num_global_tokens=self.builder_config.global_sequence_length,
        stride=self.builder_config.stride,
        sentencepiece_model_path=self.builder_config.sentencepiece_model_path,
        wikipedia_dir=wiki_dir,
        web_dir=web_dir)

  parse_example_fn = functools.partial(parse_example,
                                       self.builder_config.exclude_context,
                                       web_dir, wiki_dir)
  return (pipeline
          | beam.Create(files)
          | beam.ParDo(ReadQuestions())
          | beam.Reshuffle()
          | beam.Map(parse_example_fn))
def expand(self, root):
  # TODO(robertwb): Handle streaming (with explicit schema).
  paths_pcoll = root | beam.Create([self.path])
  first = io.filesystems.FileSystems.match(
      [self.path], limits=[1])[0].metadata_list[0].path
  with io.filesystems.FileSystems.open(first) as handle:
    if not self.binary:
      handle = TextIOWrapper(handle)
    if self.incremental:
      sample = next(
          self.reader(handle, *self.args, **dict(self.kwargs, chunksize=100)))
    else:
      sample = self.reader(handle, *self.args, **self.kwargs)

  pcoll = (
      paths_pcoll
      | fileio.MatchFiles(self.path)
      | beam.Reshuffle()
      | fileio.ReadMatches()
      | beam.ParDo(
          _ReadFromPandasDoFn(self.reader, self.args, self.kwargs, self.binary,
                              self.incremental, self.splitter)))
  from apache_beam.dataframe import convert
  return convert.to_dataframe(
      pcoll, proxy=_prefix_range_index_with(':', sample[:0]))
def test_reshuffle_contents_unchanged(self):
  pipeline = TestPipeline()
  data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 3)]
  result = (pipeline | beam.Create(data) | beam.Reshuffle())
  assert_that(result, equal_to(data))
  pipeline.run()
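# Hedged, self-contained sketch of the pattern most of these pipelines use
# Reshuffle for: a fusion break between an expensive per-element step and the
# sink, so the costly work is redistributed across workers instead of being
# fused onto the single worker that ran Create. The work function and output
# prefix are hypothetical; runnable on the DirectRunner.
import apache_beam as beam


def expensive_work(x):
  # Stand-in for a costly per-element computation.
  return x * x


with beam.Pipeline() as p:
  _ = (p
       | beam.Create(range(100))
       | beam.Map(expensive_work)
       | beam.Reshuffle()  # break fusion before the write
       | beam.Map(str)
       | beam.io.WriteToText('./squares'))  # hypothetical output prefix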
def pipeline(root):
  if output_type == 'tf_example':
    coder = beam.coders.ProtoCoder(tf.train.Example)
  elif output_type == 'tf_sequence_example':
    coder = beam.coders.ProtoCoder(tf.train.SequenceExample)
  else:
    raise ValueError('Unsupported output type.')
  input_collection = (
      root
      | 'ReadInputTFRecord' >> beam.io.tfrecordio.ReadFromTFRecord(
          input_tfrecord, coder=beam.coders.BytesCoder()))
  rekey_collection = input_collection | 'RekeyExamples' >> beam.ParDo(
      ReKeyDataFn(sequence_key, time_horizon, reduce_image_size,
                  max_image_dimension))
  grouped_collection = (
      rekey_collection | 'GroupBySequenceKey' >> beam.GroupByKey())
  grouped_collection = (
      grouped_collection | 'ReshuffleGroups' >> beam.Reshuffle())
  ordered_collection = (
      grouped_collection | 'OrderByFrameNumber' >> beam.ParDo(
          SortGroupedDataFn(sequence_key, sorted_image_ids,
                            max_num_elements_in_context_features)))
  ordered_collection = (
      ordered_collection | 'ReshuffleSortedGroups' >> beam.Reshuffle())
  output_collection = (
      ordered_collection | 'AddContextToExamples' >> beam.ParDo(
          GenerateContextFn(
              sequence_key,
              add_context_features,
              image_ids_to_keep,
              keep_context_features_image_id_list=(
                  keep_context_features_image_id_list),
              subsample_context_features_rate=subsample_context_features_rate,
              keep_only_positives=keep_only_positives,
              keep_only_positives_gt=keep_only_positives_gt,
              context_features_score_threshold=(
                  context_features_score_threshold),
              max_num_elements_in_context_features=(
                  max_num_elements_in_context_features),
              output_type=output_type,
              max_clip_length=max_clip_length)))
  output_collection = (
      output_collection | 'ReshuffleExamples' >> beam.Reshuffle())
  _ = output_collection | 'WritetoDisk' >> beam.io.tfrecordio.WriteToTFRecord(
      output_tfrecord, num_shards=num_shards, coder=coder)
def pipeline(root): """Beam pipeline. Args: root: the root of the pipeline. """ _ = ( root | 'CreateTopologies' >> beam.Create( smu_utils_lib.generate_bond_topologies_from_csv( FLAGS.input_bond_topology_csv)) | 'Reshuffle1' >> beam.Reshuffle() | 'CheckInvariance' >> beam.FlatMap(check_smiles_permutation_invariance) | 'Reshuffle2' >> beam.Reshuffle() | 'CSVFormat' >> beam.Map(lambda vals: ','.join(str(x) for x in vals)) | 'WriteOutput' >> beam.io.WriteToText( FLAGS.output_csv, header='bt_id,smiles0,smiles1', num_shards=1))
def expand(self, pcoll):
  return (pcoll
          | beam.Map(t5.data.dict_to_tfexample)
          | beam.Reshuffle()
          | beam.io.tfrecordio.WriteToTFRecord(
              self._output_path,
              num_shards=self._num_shards,
              coder=beam.coders.ProtoCoder(tf.train.Example)))
def main(_):
  runner = beam.runners.DirectRunner()  # must create before flags are used
  pipeline = (
      beam.Create(tf.gfile.Glob(FLAGS.file_pattern))
      | beam.Reshuffle()
      | beam.Map(
          create_survival_netcdf,
          quantile=FLAGS.quantile,
          exact_path=FLAGS.exact_results_file))
  runner.run(pipeline)
def run(options):
  with beam.Pipeline(options=options) as p:
    (p
     | 'Read Avro' >> beam.io.ReadFromAvro(p.options.input, validate=False)
     | 're-shuffling' >> beam.Reshuffle()
     | 'Transformation' >> StackOverflowAvroDataTransform()
     | 'Write to file' >> beam.io.WriteToText(
         p.options.output,
         file_name_suffix=".csv",
         header=p.options.csv_header))
def run_split_pipeline(self, split_manager, elements, element_counter=None):
  with fn_api_runner.split_manager('Identity', split_manager):
    with self.create_pipeline() as p:
      res = (
          p
          | beam.Create(elements)
          | beam.Reshuffle()
          | 'Identity' >> beam.Map(lambda x: x)
          | beam.Map(lambda x: element_counter.increment() or x))
      assert_that(res, equal_to(elements))
def _pipeline(root):
  _ = (
      root
      | 'Read' >> beam.io.ReadFromText(screen_id_file)
      | 'ReadEpisodeProto' >> beam.ParDo(
          GenerateProto(input_dir, clean_tf_example, csv_label_file))
      | 'ReShuffle' >> beam.Reshuffle()  # workers may not parallelize without this
      | 'WriteResults' >> beam.io.WriteToTFRecord(
          output_path, coder=beam.coders.ProtoCoder(tf.train.Example)))
def prepare_single_tfrecord(input_audio_path,
                            source_id,
                            env_id,
                            output_tfrecord_path,
                            sample_rate=16000,
                            frame_rate=250,
                            window_secs=4,
                            hop_secs=1,
                            pipeline_options=''):
  """Prepares a TFRecord for use in training, evaluation, and prediction.

  Args:
    input_audio_path: Path to the audio file to include in the TFRecord.
    source_id: Source ID to attach to the loaded audio.
    env_id: Environment ID to attach to the loaded audio.
    output_tfrecord_path: The path to the output TFRecord.
    sample_rate: The sample rate to use for the audio.
    frame_rate: The frame rate to use for f0 and loudness features. If set to
      None, these features will not be computed.
    window_secs: The size of the sliding window (in seconds) to use to split
      the audio and features. If 0, they will not be split.
    hop_secs: The number of seconds to hop when computing the sliding windows.
    pipeline_options: An iterable of command line arguments to be used as
      options for the Beam Pipeline.
  """
  pipeline_options = beam.options.pipeline_options.PipelineOptions(
      pipeline_options)
  with beam.Pipeline(options=pipeline_options) as pipeline:
    examples = (
        pipeline
        | beam.Create([input_audio_path])
        | beam.Map(_load_audio, sample_rate, source_id, env_id))

    if frame_rate:
      examples = (
          examples
          | beam.Map(_add_f0_estimate, frame_rate)
          | beam.Map(_add_loudness, sample_rate, frame_rate))

    if window_secs:
      examples |= beam.FlatMap(
          _split_example, sample_rate, frame_rate, window_secs, hop_secs)

    _ = (
        examples
        | beam.Reshuffle()
        | beam.Map(_float_dict_to_tfexample)
        | beam.io.tfrecordio.WriteToTFRecord(
            output_tfrecord_path,
            num_shards=1,
            shard_name_template='',
            coder=beam.coders.ProtoCoder(tf.train.Example)))
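# Hedged usage sketch for prepare_single_tfrecord above; the audio path, IDs,
# and output path are hypothetical, and the pipeline runs on the local runner
# when no pipeline_options are given.
prepare_single_tfrecord(
    input_audio_path='./recording.wav',           # hypothetical
    source_id=0,                                  # hypothetical
    env_id=0,                                     # hypothetical
    output_tfrecord_path='./recording.tfrecord',  # hypothetical
    sample_rate=16000,
    frame_rate=250,
    window_secs=4,
    hop_secs=1)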
def _build_pcollection(self, pipeline, **kwargs):
  db = wiki_db.WikiDatabase.from_local(self._wiki_db_path)
  wikipedia_urls = db.get_wikipedia_urls()
  return (pipeline
          | 'LoadPages' >> beam.Create(wikipedia_urls)
          | 'Repartition' >> beam.Reshuffle()
          | 'ExtractSentences' >> beam.ParDo(
              ExtractSentences(
                  max_sentence_id=self._max_sentence_id,
                  wiki_db_path=self._wiki_db_path)))
def main(unused_argv):
  # Data prep setup.
  prep_params, input_filenames_list, output_filenames, run_data_prep = (
      _get_data_prep_params_from_flags())
  logging.info('beam_params: %s', prep_params)

  # Generate sklearn eval experiment parameters based on data prep flags.
  # Make (data_prep outputs / eval input filenames) globs.
  train_glob, eval_glob, test_glob = [f'{x}*' for x in output_filenames]
  sklearn_results_output_file = FLAGS.results_output_file
  exp_params = sklearn_utils.experiment_params(
      train_glob=train_glob,
      eval_glob=eval_glob,
      test_glob=test_glob,
      embedding_list=prep_params['embedding_names'],
      speaker_id_name=FLAGS.speaker_id_key,
      label_name=FLAGS.label_key,
      label_list=FLAGS.label_list,
      save_model_dir=FLAGS.save_model_dir,
      save_predictions_dir=FLAGS.save_predictions_dir,
      eval_metrics=FLAGS.eval_metrics,
  )
  logging.info('exp_params: %s', exp_params)

  # Make and run beam pipeline.
  beam_options = None

  if run_data_prep:
    input_filenames_list, output_filenames = _remove_existing_outputs(
        input_filenames_list, output_filenames)
    logging.info('Data prep on: %s, %s...', input_filenames_list,
                 output_filenames)
    with beam.Pipeline(beam_options) as root:
      for i, (input_filenames_or_glob, output_filename) in enumerate(
          zip(input_filenames_list, output_filenames)):
        utils.data_prep_pipeline(
            root=root,
            input_filenames_or_glob=input_filenames_or_glob,
            output_filename=output_filename,
            data_prep_behavior=FLAGS.data_prep_behavior,
            beam_params=prep_params,
            suffix=str(i))

  # Check that previous beam pipeline wrote outputs.
  sklearn_utils.validate_flags(train_glob, eval_glob, test_glob,
                               sklearn_results_output_file)

  logging.info('Eval sklearn...')
  with beam.Pipeline(beam_options) as root:
    _ = (
        root
        | 'MakeCollection' >> beam.Create(exp_params)
        | 'CalcScores' >> beam.Map(
            lambda d: (d, sklearn_utils.train_and_get_score(**d)))
        | 'FormatText' >> beam.Map(sklearn_utils.format_text_line)
        | 'Reshuffle' >> beam.Reshuffle()
        | 'WriteOutput' >> beam.io.WriteToText(
            sklearn_results_output_file, num_shards=1))