Code Example #1
def main(argv):
    del argv

    absl.flags.mark_flags_as_required(['output_directory'])

    tf.io.gfile.makedirs(FLAGS.output_directory)

    splits = collections.defaultdict(list)
    for split in FLAGS.expected_splits.split(','):
        split_base = FLAGS.base + split
        wavs = tf.io.gfile.glob(split_base + FLAGS.wav_dir)
        midis = tf.io.gfile.glob(split_base + FLAGS.midi_dir)
        splits[split] = list(zip(wavs, midis))

    if sorted(splits.keys()) != sorted(FLAGS.expected_splits.split(',')):
        raise ValueError('Got unexpected set of splits: %s' % splits.keys())

    pipeline_options = beam.options.pipeline_options.PipelineOptions(
        FLAGS.pipeline_options)
    with beam.Pipeline(options=pipeline_options) as p:
        for split in splits:
            split_p = p | 'prepare_split_%s' % split >> beam.Create(
                splits[split])
            split_p |= 'shuffle_input_%s' % split >> beam.Reshuffle()
            split_p |= 'create_examples_%s' % split >> beam.ParDo(
                CreateExampleDoFn(FLAGS.base + split, FLAGS.add_wav_glob))
            split_p |= 'shuffle_output_%s' % split >> beam.Reshuffle()
            split_p |= 'write_%s' % split >> beam.io.WriteToTFRecord(
                os.path.join(FLAGS.output_directory, '%s.tfrecord' % split),
                coder=beam.coders.ProtoCoder(tf.train.Example),
                num_shards=FLAGS.num_shards)
Code Example #2
 def expand(self, pipeline):
   # The Reshuffles allow for better parallelism.
   return (pipeline
           | "create_shards" >> beam.Create(self.shards)
           | "shard_reshuffle" >> beam.Reshuffle()
           | "emit_examples" >> beam.FlatMap(self._emit_examples)
           | "example_reshuffle" >> beam.Reshuffle())
Code Example #3
def pipeline():
  """Pipeline for dataset creation."""
  pipeline_options = beam.options.pipeline_options.PipelineOptions(
      FLAGS.pipeline_options.split(','))

  with beam.Pipeline(options=pipeline_options) as p:
    tf.flags.mark_flags_as_required(['output_directory'])

    splits = [
        ('train', generate_sharded_filenames(FLAGS.train_tfrecord)),
        ('validation', generate_sharded_filenames(FLAGS.validation_tfrecord)),
        ('test', generate_sharded_filenames(FLAGS.test_tfrecord)),
    ]

    for split_name, split_tfrecord in splits:
      split_p = p | 'tfrecord_list_%s' % split_name >> beam.Create(
          split_tfrecord)
      split_p |= 'read_tfrecord_%s' % split_name >> (
          beam.io.tfrecordio.ReadAllFromTFRecord(
              coder=beam.coders.ProtoCoder(tf.train.Example)))
      split_p |= 'shuffle_input_%s' % split_name >> beam.Reshuffle()
      split_p |= 'split_wav_%s' % split_name >> beam.ParDo(
          SplitWavDoFn(FLAGS.min_length, FLAGS.max_length, FLAGS.sample_rate,
                       split_name, FLAGS.output_directory))
      split_p |= 'shuffle_output_%s' % split_name >> beam.Reshuffle()
      split_p |= 'write_%s' % split_name >> beam.io.WriteToTFRecord(
          os.path.join(FLAGS.output_directory, '%s.tfrecord' % split_name),
          coder=beam.coders.ProtoCoder(tf.train.Example))
Code Example #4
File: bundle_to_seqex_main.py  Project: Shamvala/fhir
def main(argv):
  del argv  # Unused.
  p = beam.Pipeline()

  version_config = _get_version_config(FLAGS.fhir_version_config)

  keyed_bundles = (
      p
      | 'readBundles' >> beam.io.ReadFromTFRecord(
          FLAGS.input_filepattern,
          coder=beam.coders.ProtoCoder(resources_pb2.Bundle))
      | 'KeyBundlesByPatientId' >> beam.ParDo(
          bundle_to_seqex.KeyBundleByPatientIdFn()))
  event_labels = (
      p | 'readEventLabels' >> beam.io.ReadFromTFRecord(
          FLAGS.labels_filepattern,
          coder=beam.coders.ProtoCoder(google_extensions_pb2.EventLabel)))
  keyed_event_labels = bundle_to_seqex.CreateTriggerLabelsPairLists(
      event_labels)
  bundles_and_labels = bundle_to_seqex.CreateBundleAndLabels(
      keyed_bundles, keyed_event_labels)
  _ = (
      bundles_and_labels
      | 'Reshuffle1' >> beam.Reshuffle()
      | 'GenerateSeqex' >> beam.ParDo(
          bundle_to_seqex.BundleAndLabelsToSeqexDoFn(
              version_config=version_config, enable_attribution=False))
      | 'Reshuffle2' >> beam.Reshuffle()
      | 'WriteSeqex' >> beam.io.WriteToTFRecord(
          FLAGS.output_filepattern,
          coder=beam.coders.ProtoCoder(example_pb2.SequenceExample)))

  result = p.run()
  logging.info('Job result: %s', result)
Code Example #5
def pipeline(config_map, dataset_config_map):
    """Pipeline for dataset creation."""
    tf.flags.mark_flags_as_required(['output_directory'])

    pipeline_options = beam.options.pipeline_options.PipelineOptions(
        FLAGS.pipeline_options.split(','))

    config = config_map[FLAGS.config]
    hparams = config.hparams
    hparams.parse(FLAGS.hparams)

    datasets = dataset_config_map[FLAGS.dataset_config]

    if tf.gfile.Exists(FLAGS.output_directory):
        raise ValueError('Output directory %s already exists!' %
                         FLAGS.output_directory)
    tf.gfile.MakeDirs(FLAGS.output_directory)
    with tf.gfile.Open(os.path.join(FLAGS.output_directory, 'config.txt'),
                       'w') as f:
        f.write('\n\n'.join([
            'min_length: {}'.format(FLAGS.min_length),
            'max_length: {}'.format(FLAGS.max_length),
            'sample_rate: {}'.format(FLAGS.sample_rate),
            'preprocess_examples: {}'.format(FLAGS.preprocess_examples),
            'preprocess_train_example_multiplier: {}'.format(
                FLAGS.preprocess_train_example_multiplier),
            'config: {}'.format(FLAGS.config),
            'hparams: {}'.format(hparams),
            'dataset_config: {}'.format(FLAGS.dataset_config),
            'datasets: {}'.format(datasets),
        ]))

    with beam.Pipeline(options=pipeline_options) as p:
        for dataset in datasets:
            split_p = p | 'tfrecord_list_%s' % dataset.name >> beam.Create(
                generate_sharded_filenames(dataset.path))
            split_p |= 'read_tfrecord_%s' % dataset.name >> (
                beam.io.tfrecordio.ReadAllFromTFRecord(
                    coder=beam.coders.ProtoCoder(tf.train.Example)))
            split_p |= 'shuffle_input_%s' % dataset.name >> beam.Reshuffle()
            split_p |= 'split_wav_%s' % dataset.name >> beam.FlatMap(
                split_wav, FLAGS.min_length, FLAGS.max_length,
                FLAGS.sample_rate, FLAGS.output_directory,
                dataset.process_for_training, FLAGS.load_audio_with_librosa)
            if FLAGS.preprocess_examples:
                if dataset.process_for_training:
                    mul_name = 'preprocess_multiply_%dx_%s' % (
                        FLAGS.preprocess_train_example_multiplier,
                        dataset.name)
                    split_p |= mul_name >> beam.FlatMap(
                        multiply_example,
                        FLAGS.preprocess_train_example_multiplier)
                split_p |= 'preprocess_%s' % dataset.name >> beam.Map(
                    preprocess_data, hparams, dataset.process_for_training)
            split_p |= 'shuffle_output_%s' % dataset.name >> beam.Reshuffle()
            split_p |= 'write_%s' % dataset.name >> beam.io.WriteToTFRecord(
                os.path.join(FLAGS.output_directory,
                             '%s.tfrecord' % dataset.name),
                coder=beam.coders.ProtoCoder(tf.train.Example))
Code Example #6
File: pardo_test.py  Project: xhcom-ui/beam
 def expand(self, pbegin):
   assert isinstance(pbegin, pvalue.PBegin), (
       'Input to transform must be a PBegin but found %s' % pbegin)
   return (
       pbegin
       | 'Impulse' >> beam.Impulse()
       | 'GenerateKeys' >> beam.ParDo(
           StatefulLoadGenerator.GenerateKeys(self.num_keys, self.key_size))
       | 'Reshuffle' >> beam.Reshuffle()
       | 'GenerateLoad' >> beam.ParDo(
           StatefulLoadGenerator.GenerateLoad(
               self.num_records // self.num_keys, self.value_size))
       | 'Reshuffle2' >> beam.Reshuffle())
Code Example #7
 def expand(self, pcoll):
     return (pcoll
             | 'MatchAll' >> fileio.MatchAll()
             | beam.Reshuffle()
             | 'ReadEach' >> fileio.ReadMatches()
             | beam.FlatMap(lambda rfile: csv.DictReader(
                 io.TextIOWrapper(rfile.open()))))
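
Code Example #7 is again only an expand method. As a point of reference, a self-contained sketch that wraps the same CSV-reading steps in a named PTransform and applies it to a file pattern might look like the following; the class name ReadCsvRows and the glob are placeholders, not part of the original source.

import csv
import io

import apache_beam as beam
from apache_beam.io import fileio


class ReadCsvRows(beam.PTransform):
  """Matches a file pattern and emits one dict per CSV row."""

  def expand(self, pcoll):
    return (pcoll
            | 'MatchAll' >> fileio.MatchAll()
            | beam.Reshuffle()  # spread the matched files across workers
            | 'ReadEach' >> fileio.ReadMatches()
            | beam.FlatMap(lambda rfile: csv.DictReader(
                io.TextIOWrapper(rfile.open()))))


if __name__ == '__main__':
  with beam.Pipeline() as p:
    _ = (p
         | beam.Create(['/tmp/data/*.csv'])  # placeholder glob
         | ReadCsvRows()
         | beam.Map(print))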
Code Example #8
 def pipeline(root):
     seq2seq_inputs = (
         root
         | "Read" >> beam.io.ReadFromText(input_pattern)
         | "ParseJSONL" >> beam.Map(json.loads)
         | "ToSeq2SeqInput" >> beam.ParDo(
             ToSeq2SeqInput(vocab_model_file=vocab_model_file,
                            task=task,
                            delimiter_type=delimiter_type,
                            include_source=include_source,
                            include_evidence=include_evidence,
                            include_distractors=include_distractors,
                            evidence_marker_type=evidence_marker_type,
                            max_input_length=max_input_length,
                            filter_no_diff=filter_no_diff)))
     _ = (seq2seq_inputs
          | "ReShuffle" >> beam.Reshuffle()
          | "ToExample" >> beam.Map(tf_utils.to_example)
          | "Write" >> beam.io.tfrecordio.WriteToTFRecord(
              output_pattern,
              coder=beam.coders.ProtoCoder(tf.train.Example),
              num_shards=10))
     if plot_lengths:
         _ = (seq2seq_inputs
              | "GetLengths" >> beam.ParDo(GetLengths(vocab_model_file))
              | "Combine" >> beam.CombineGlobally(combine_lengths)
              | "PlotHistogram" >> beam.Map(
                  plot_histogram, path=output_pattern + ".hist.png"))
Code Example #9
    def pipeline(root):
        """Beam pipeline for preprocessing open images."""
        assert FLAGS.input_file_pattern
        assert FLAGS.output_dir
        assert FLAGS.output_name
        assert FLAGS.num_shards
        assert FLAGS.kepid_whitelist

        # Read label whitelist.
        kepid_whitelist = [
            int(kepid) for kepid in FLAGS.kepid_whitelist.split(",")
        ]
        logging.info("Read Kepid whitelist with %d labels",
                     len(kepid_whitelist))

        # Initialize DoFn.
        process_example = ProcessExampleDoFn(kepid_whitelist)

        # Create Pipeline.
        # pylint: disable=expression-not-assigned
        (root
         | "read_tfrecord" >> beam.io.tfrecordio.ReadFromTFRecord(
             FLAGS.input_file_pattern,
             coder=beam.coders.ProtoCoder(tf.train.Example))
         | "process_examples" >> beam.ParDo(process_example)
         | "reshuffle" >> beam.Reshuffle()
         | "write_tfrecord" >> beam.io.tfrecordio.WriteToTFRecord(
             os.path.join(FLAGS.output_dir, FLAGS.output_name),
             coder=beam.coders.ProtoCoder(tf.train.Example),
             num_shards=FLAGS.num_shards))
Code Example #10
def create_pipeline(pipeline,
                    image_directory,
                    input_annotations_file,
                    output_tfrecord_prefix=None,
                    num_images_per_shard=200,
                    keep_bboxes=True):
  """Creates a beam pipeline for producing a COCO-CameraTraps Image dataset.

  Args:
    pipeline: Initialized beam pipeline.
    image_directory: Path to image directory
    input_annotations_file: Path to a coco-cameratraps annotation file
    output_tfrecord_prefix: Absolute path for tfrecord outputs. Final files will
      be named {output_tfrecord_prefix}@N.
    num_images_per_shard: The number of images to store in each shard
    keep_bboxes: Whether to keep any bounding boxes that exist in the json file
  """

  data = load_json_data(input_annotations_file)

  num_shards = int(np.ceil(float(len(data['images']))/num_images_per_shard))

  image_examples = (
      pipeline | ('CreateCollections') >> beam.Create(
          [im['id'] for im in data['images']])
      | ('ParseImage') >> beam.ParDo(ParseImage(
          image_directory, data['images'], data['annotations'],
          data['categories'], keep_bboxes=keep_bboxes)))
  _ = (image_examples
       | ('Reshuffle') >> beam.Reshuffle()
       | ('WriteTfImageExample') >> beam.io.tfrecordio.WriteToTFRecord(
           output_tfrecord_prefix,
           num_shards=num_shards,
           coder=beam.coders.ProtoCoder(tf.train.Example)))
Code Example #11
def construct_pipeline(pipeline, input_tfrecord, output_tfrecord, model_dir,
                       top_k_embedding_count, bottom_k_embedding_count,
                       num_shards):
    """Returns a beam pipeline to run object detection inference.

  Args:
    pipeline: Initialized beam pipeline.
    input_tfrecord: A TFRecord of tf.train.Example protos containing images.
    output_tfrecord: A TFRecord of tf.train.Example protos that contain images
      in the input TFRecord and the detections from the model.
    model_dir: Path to `saved_model` to use for inference.
    top_k_embedding_count: The number of high-confidence embeddings to store.
    bottom_k_embedding_count: The number of low-confidence embeddings to store.
    num_shards: The number of output shards.
  """
    input_collection = (
        pipeline | 'ReadInputTFRecord' >> beam.io.tfrecordio.ReadFromTFRecord(
            input_tfrecord, coder=beam.coders.BytesCoder())
        | 'AddKeys' >> beam.Map(add_keys))
    output_collection = input_collection | 'ExtractEmbedding' >> beam.ParDo(
        GenerateEmbeddingDataFn(model_dir, top_k_embedding_count,
                                bottom_k_embedding_count))
    output_collection = output_collection | 'Reshuffle' >> beam.Reshuffle()
    _ = output_collection | 'DropKeys' >> beam.Map(
        drop_keys) | 'WritetoDisk' >> beam.io.tfrecordio.WriteToTFRecord(
            output_tfrecord,
            num_shards=num_shards,
            coder=beam.coders.ProtoCoder(tf.train.Example))
Code Example #12
    def expand(self, root):
        paths_pcoll = root | beam.Create([self.path])
        match = io.filesystems.FileSystems.match([self.path], limits=[1])[0]
        if not match.metadata_list:
            # TODO(BEAM-12031): This should be allowed for streaming pipelines if
            # user provides an explicit schema.
            raise FileNotFoundError(f"Found no files that match {self.path!r}")
        first_path = match.metadata_list[0].path
        with io.filesystems.FileSystems.open(first_path) as handle:
            if not self.binary:
                handle = TextIOWrapper(handle)
            if self.incremental:
                sample = next(
                    self.reader(handle, *self.args,
                                **dict(self.kwargs, chunksize=100)))
            else:
                sample = self.reader(handle, *self.args, **self.kwargs)

        pcoll = (paths_pcoll
                 | fileio.MatchFiles(self.path)
                 | beam.Reshuffle()
                 | fileio.ReadMatches()
                 | beam.ParDo(
                     _ReadFromPandasDoFn(self.reader, self.args, self.kwargs,
                                         self.binary, self.incremental,
                                         self.splitter)))
        from apache_beam.dataframe import convert
        return convert.to_dataframe(pcoll,
                                    proxy=_prefix_range_index_with(
                                        ':', sample[:0]))
Code Example #13
def main(_, runner=None):
    # must create before flags are used
    if runner is None:
        runner = runners.DirectRunner()

    tasks = []
    for problem in problems.PROBLEMS_BY_NAME.values():
        if (FLAGS.problem_filter
                and not re.search(FLAGS.problem_filter, problem.name)):
            continue

        if FLAGS.quick_run and problem.width * problem.height > 64**2:
            continue

        for seed in range(-1, FLAGS.num_seeds):
            if seed >= 0:
                tasks.append((problem.name, seed, 'cnn', 'lbfgs'))
            tasks.append((problem.name, seed, 'pixels', 'lbfgs'))
            tasks.append((problem.name, seed, 'pixels', 'oc'))
            tasks.append((problem.name, seed, 'pixels', 'mma'))

    if not tasks:
        raise RuntimeError('no tasks to run')

    pipeline = (
        beam.Create(tasks)
        | beam.Map(run_optimization)
        | beam.Reshuffle()  # don't fuse optimizations together
        | 'group seeds' >> beam.GroupByKey()
        | beam.Map(groupby_seeds)
        | 'group methods' >> beam.GroupByKey()
        | beam.Map(groupby_methods)
        | beam.combiners.ToList()
        | beam.Map(save_all_losses))
    runner.run(pipeline)
Code Example #14
 def test_reshuffle_window_fn_preserved(self):
   pipeline = TestPipeline()
   data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
   expected_windows = [TestWindowedValue(v, t, [w]) for (v, t, w) in [
       ((1, 1), 1.0, IntervalWindow(1.0, 3.0)),
       ((2, 1), 1.0, IntervalWindow(1.0, 3.0)),
       ((3, 1), 1.0, IntervalWindow(1.0, 3.0)),
       ((1, 2), 2.0, IntervalWindow(2.0, 4.0)),
       ((2, 2), 2.0, IntervalWindow(2.0, 4.0)),
       ((1, 4), 4.0, IntervalWindow(4.0, 6.0))]]
   expected_merged_windows = [TestWindowedValue(v, t, [w]) for (v, t, w) in [
       ((1, contains_in_any_order([2, 1])), 4.0, IntervalWindow(1.0, 4.0)),
       ((2, contains_in_any_order([2, 1])), 4.0, IntervalWindow(1.0, 4.0)),
       ((3, [1]), 3.0, IntervalWindow(1.0, 3.0)),
       ((1, [4]), 6.0, IntervalWindow(4.0, 6.0))]]
   before_reshuffle = (pipeline
                       | 'start' >> beam.Create(data)
                       | 'add_timestamp' >> beam.Map(
                           lambda v: TimestampedValue(v, v[1]))
                       | 'window' >> beam.WindowInto(Sessions(gap_size=2)))
   assert_that(before_reshuffle, equal_to(expected_windows),
               label='before_reshuffle', reify_windows=True)
   after_reshuffle = before_reshuffle | beam.Reshuffle()
   assert_that(after_reshuffle, equal_to(expected_windows),
               label='after_reshuffle', reify_windows=True)
   after_group = after_reshuffle | beam.GroupByKey()
   assert_that(after_group, equal_to(expected_merged_windows),
               label='after_group', reify_windows=True)
   pipeline.run()
Code Example #15
File: util_test.py  Project: zoyahav/beam
 def test_reshuffle_windows_unchanged(self):
     pipeline = TestPipeline()
     data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
      expected_data = [
          TestWindowedValue(v, t, [w]) for (v, t, w) in [
              ((1, [2, 1]), 4.0, IntervalWindow(1.0, 4.0)),
              ((2, [2, 1]), 4.0, IntervalWindow(1.0, 4.0)),
              ((3, [1]), 3.0, IntervalWindow(1.0, 3.0)),
              ((1, [4]), 6.0, IntervalWindow(4.0, 6.0))]
      ]
     before_reshuffle = (
         pipeline
         | 'start' >> beam.Create(data)
         | 'add_timestamp' >>
         beam.Map(lambda v: beam.window.TimestampedValue(v, v[1]))
         | 'window' >> beam.WindowInto(Sessions(gap_size=2))
         | 'group_by_key' >> beam.GroupByKey())
     assert_that(before_reshuffle,
                 equal_to(expected_data),
                 label='before_reshuffle',
                 reify_windows=True)
     after_reshuffle = (before_reshuffle | 'reshuffle' >> beam.Reshuffle())
     assert_that(after_reshuffle,
                 equal_to(expected_data),
                 label='after reshuffle',
                 reify_windows=True)
     pipeline.run()
Code Example #16
def main(unused_argv):
    # Validate flags and setup directories.
    utils.validate_flags(FLAGS.train_glob, FLAGS.eval_glob, FLAGS.test_glob,
                         FLAGS.output_file)

    # Generate experiment parameters based on flags.
    exp_params = utils.experiment_params(
        FLAGS.embedding_list,
        FLAGS.speaker_id_name,
        FLAGS.label_name,
        FLAGS.label_list,
        FLAGS.train_glob,
        FLAGS.eval_glob,
        FLAGS.test_glob,
        FLAGS.save_model_dir,
        FLAGS.save_predictions_dir,
        FLAGS.eval_metric,
    )

    # Make and run beam pipeline.
    beam_options = None

    logging.info('Starting to create flume pipeline...')
    with beam.Pipeline(beam_options) as root:
        _ = (root
             | 'MakeCollection' >> beam.Create(exp_params)
             | 'CalcScores' >> beam.Map(lambda d:
                                        (d, utils.train_and_get_score(**d)))
             | 'FormatText' >> beam.Map(utils.format_text_line)
             | 'Reshuffle' >> beam.Reshuffle()
             | 'WriteOutput' >> beam.io.WriteToText(FLAGS.output_file,
                                                    num_shards=1))
Code Example #17
def pipeline(root):
    _ = (root
         | 'Read JSONL' >> beam.io.ReadFromText(file_pattern=_INPUT_JSONL.value)
         | 'Run Episodes' >> beam.ParDo(RunT5EpisodeFn())
         | 'Reshard' >> beam.Reshuffle()
         | 'Save' >> beam.io.WriteToText(_OUTPUT_PATH.value))
Code Example #18
    def _build_pcollection(self, pipeline, files, web_dir, wiki_dir, answer):
        if isinstance(self.builder_config, BigBirdTriviaQAConfig):
            self.builder_config.validate()
            question_answers = preprocess.read_question_answers(files[0])
            return preprocess.make_pipeline(
                pipeline,
                question_answers=question_answers,
                answer=answer,
                max_num_tokens=self.builder_config.sequence_length,
                max_num_global_tokens=(
                    self.builder_config.global_sequence_length),
                stride=self.builder_config.stride,
                sentencepiece_model_path=(
                    self.builder_config.sentencepiece_model_path),
                wikipedia_dir=wiki_dir,
                web_dir=web_dir)

        parse_example_fn = functools.partial(
            parse_example, self.builder_config.exclude_context, web_dir,
            wiki_dir)
        return (pipeline
                | beam.Create(files)
                | beam.ParDo(ReadQuestions())
                | beam.Reshuffle()
                | beam.Map(parse_example_fn))
Code Example #19
    def expand(self, root):
        # TODO(robertwb): Handle streaming (with explicit schema).
        paths_pcoll = root | beam.Create([self.path])
        first = io.filesystems.FileSystems.match(
            [self.path], limits=[1])[0].metadata_list[0].path
        with io.filesystems.FileSystems.open(first) as handle:
            if not self.binary:
                handle = TextIOWrapper(handle)
            if self.incremental:
                sample = next(
                    self.reader(handle, *self.args,
                                **dict(self.kwargs, chunksize=100)))
            else:
                sample = self.reader(handle, *self.args, **self.kwargs)

        pcoll = (paths_pcoll
                 | fileio.MatchFiles(self.path)
                 | beam.Reshuffle()
                 | fileio.ReadMatches()
                 | beam.ParDo(
                     _ReadFromPandasDoFn(self.reader, self.args, self.kwargs,
                                         self.binary, self.incremental,
                                         self.splitter)))
        from apache_beam.dataframe import convert
        return convert.to_dataframe(pcoll,
                                    proxy=_prefix_range_index_with(
                                        ':', sample[:0]))
Code Example #20
 def test_reshuffle_contents_unchanged(self):
   pipeline = TestPipeline()
   data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 3)]
   result = (pipeline
             | beam.Create(data)
             | beam.Reshuffle())
   assert_that(result, equal_to(data))
   pipeline.run()
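
The test snippets in Code Examples #14, #15, and #20 omit their imports. Assuming the helpers come from the usual apache_beam.testing modules, a self-contained version of the simplest of them (Code Example #20) might look like this sketch:

import unittest

import apache_beam as beam
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to


class ReshuffleTest(unittest.TestCase):

  def test_reshuffle_contents_unchanged(self):
    data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 3)]
    with TestPipeline() as pipeline:
      result = pipeline | beam.Create(data) | beam.Reshuffle()
      # Reshuffle redistributes elements across workers but must not add,
      # drop, or modify them.
      assert_that(result, equal_to(data))


if __name__ == '__main__':
  unittest.main()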
Code Example #21
    def pipeline(root):
        if output_type == 'tf_example':
            coder = beam.coders.ProtoCoder(tf.train.Example)
        elif output_type == 'tf_sequence_example':
            coder = beam.coders.ProtoCoder(tf.train.SequenceExample)
        else:
            raise ValueError('Unsupported output type.')
        input_collection = (
            root | 'ReadInputTFRecord' >> beam.io.tfrecordio.ReadFromTFRecord(
                input_tfrecord, coder=beam.coders.BytesCoder()))
        rekey_collection = input_collection | 'RekeyExamples' >> beam.ParDo(
            ReKeyDataFn(sequence_key, time_horizon, reduce_image_size,
                        max_image_dimension))
        grouped_collection = (rekey_collection
                              | 'GroupBySequenceKey' >> beam.GroupByKey())
        grouped_collection = (grouped_collection
                              | 'ReshuffleGroups' >> beam.Reshuffle())
        ordered_collection = (
            grouped_collection | 'OrderByFrameNumber' >> beam.ParDo(
                SortGroupedDataFn(sequence_key, sorted_image_ids,
                                  max_num_elements_in_context_features)))
        ordered_collection = (ordered_collection
                              | 'ReshuffleSortedGroups' >> beam.Reshuffle())
        output_collection = (
            ordered_collection | 'AddContextToExamples' >> beam.ParDo(
                GenerateContextFn(
                    sequence_key,
                    add_context_features,
                    image_ids_to_keep,
                    keep_context_features_image_id_list=(
                        keep_context_features_image_id_list),
                    subsample_context_features_rate=(
                        subsample_context_features_rate),
                    keep_only_positives=keep_only_positives,
                    keep_only_positives_gt=keep_only_positives_gt,
                    context_features_score_threshold=(
                        context_features_score_threshold),
                    max_num_elements_in_context_features=(
                        max_num_elements_in_context_features),
                    output_type=output_type,
                    max_clip_length=max_clip_length)))

        output_collection = (output_collection
                             | 'ReshuffleExamples' >> beam.Reshuffle())
        _ = output_collection | 'WritetoDisk' >> beam.io.tfrecordio.WriteToTFRecord(
            output_tfrecord, num_shards=num_shards, coder=coder)
Code Example #22
def pipeline(root):
  """Beam pipeline.

  Args:
    root: the root of the pipeline.
  """
  _ = (
      root
      | 'CreateTopologies' >> beam.Create(
          smu_utils_lib.generate_bond_topologies_from_csv(
              FLAGS.input_bond_topology_csv))
      | 'Reshuffle1' >> beam.Reshuffle()
      | 'CheckInvariance' >> beam.FlatMap(check_smiles_permutation_invariance)
      | 'Reshuffle2' >> beam.Reshuffle()
      | 'CSVFormat' >> beam.Map(lambda vals: ','.join(str(x) for x in vals))
      | 'WriteOutput' >> beam.io.WriteToText(
          FLAGS.output_csv, header='bt_id,smiles0,smiles1', num_shards=1))
Code Example #23
 def expand(self, pcoll):
     return (pcoll
             | beam.Map(t5.data.dict_to_tfexample)
             | beam.Reshuffle()
             | beam.io.tfrecordio.WriteToTFRecord(
                 self._output_path,
                 num_shards=self._num_shards,
                 coder=beam.coders.ProtoCoder(tf.train.Example)))
Code Example #24
def main(_):
    runner = beam.runners.DirectRunner()  # must create before flags are used

    pipeline = (beam.Create(tf.gfile.Glob(FLAGS.file_pattern))
                | beam.Reshuffle()
                | beam.Map(create_survival_netcdf,
                           quantile=FLAGS.quantile,
                           exact_path=FLAGS.exact_results_file))
    runner.run(pipeline)
Code Example #25
def run(options):
    with beam.Pipeline(options=options) as p:
        (p
         | 'Read Avro' >> beam.io.ReadFromAvro(p.options.input, validate=False)
         | 're-shuffling' >> beam.Reshuffle()
         | 'Transformation' >> StackOverflowAvroDataTransform()
         | 'Write to file' >> beam.io.WriteToText(p.options.output,
                                                  file_name_suffix=".csv",
                                                  header=p.options.csv_header))
Code Example #26
File: fn_api_runner_test.py  Project: xsm110/beam
 def run_split_pipeline(self, split_manager, elements, element_counter=None):
   with fn_api_runner.split_manager('Identity', split_manager):
     with self.create_pipeline() as p:
       res = (p
              | beam.Create(elements)
              | beam.Reshuffle()
              | 'Identity' >> beam.Map(lambda x: x)
              | beam.Map(lambda x: element_counter.increment() or x))
       assert_that(res, equal_to(elements))
Code Example #27
 def _pipeline(root):
   _ = (
       root
       | 'Read' >> beam.io.ReadFromText(screen_id_file)
       | 'ReadEpisodeProto' >> beam.ParDo(
           GenerateProto(input_dir, clean_tf_example, csv_label_file))
        | 'ReShuffle' >> beam.Reshuffle()  # workers may not parallelize without this
       | 'WriteResults' >> beam.io.WriteToTFRecord(
           output_path, coder=beam.coders.ProtoCoder(tf.train.Example)))
Code Example #28
def prepare_single_tfrecord(
    input_audio_path,
    source_id,
    env_id,
    output_tfrecord_path,
    sample_rate=16000,
    frame_rate=250,
    window_secs=4,
    hop_secs=1,
    pipeline_options=''):
  """Prepares a TFRecord for use in training, evaluation, and prediction.

  Args:
    input_audio_path: Path to the audio file to include in the TFRecord.
    source_id: Source identifier passed to _load_audio for each example.
    env_id: Environment identifier passed to _load_audio for each example.
    output_tfrecord_path: Path of the output TFRecord (written as a single
      shard).
    sample_rate: The sample rate to use for the audio.
    frame_rate: The frame rate to use for f0 and loudness features.
      If set to None, these features will not be computed.
    window_secs: The size of the sliding window (in seconds) to use to
      split the audio and features. If 0, they will not be split.
    hop_secs: The number of seconds to hop when computing the sliding
      windows.
    pipeline_options: An iterable of command line arguments to be used as
      options for the Beam Pipeline.
  """
  pipeline_options = beam.options.pipeline_options.PipelineOptions(
      pipeline_options)
  with beam.Pipeline(options=pipeline_options) as pipeline:
    examples = (
        pipeline
        | beam.Create([input_audio_path])
        | beam.Map(_load_audio, sample_rate, source_id, env_id))

    if frame_rate:
      examples = (
          examples
          | beam.Map(_add_f0_estimate, frame_rate)
          | beam.Map(_add_loudness, sample_rate, frame_rate))

    if window_secs:
      examples |= beam.FlatMap(
          _split_example, sample_rate, frame_rate, window_secs, hop_secs)

    _ = (
        examples
        | beam.Reshuffle()
        | beam.Map(_float_dict_to_tfexample)
        | beam.io.tfrecordio.WriteToTFRecord(
            output_tfrecord_path,
            num_shards=1,
            shard_name_template='',
            coder=beam.coders.ProtoCoder(tf.train.Example))
    )
Code Example #29
 def _build_pcollection(self, pipeline, **kwargs):
     db = wiki_db.WikiDatabase.from_local(self._wiki_db_path)
     wikipedia_urls = db.get_wikipedia_urls()
     return (pipeline
             | 'LoadPages' >> beam.Create(wikipedia_urls)
             | 'Repartition' >> beam.Reshuffle()
             | 'ExtractSentences' >> beam.ParDo(
                 ExtractSentences(max_sentence_id=self._max_sentence_id,
                                  wiki_db_path=self._wiki_db_path)))
Code Example #30
def main(unused_argv):

    # Data prep setup.
    (prep_params, input_filenames_list, output_filenames,
     run_data_prep) = _get_data_prep_params_from_flags()
    logging.info('beam_params: %s', prep_params)

    # Generate sklearn eval experiment parameters based on data prep flags.
    # Make (data_prep outputs / eval input filenames) globs.
    train_glob, eval_glob, test_glob = [f'{x}*' for x in output_filenames]
    sklearn_results_output_file = FLAGS.results_output_file
    exp_params = sklearn_utils.experiment_params(
        train_glob=train_glob,
        eval_glob=eval_glob,
        test_glob=test_glob,
        embedding_list=prep_params['embedding_names'],
        speaker_id_name=FLAGS.speaker_id_key,
        label_name=FLAGS.label_key,
        label_list=FLAGS.label_list,
        save_model_dir=FLAGS.save_model_dir,
        save_predictions_dir=FLAGS.save_predictions_dir,
        eval_metrics=FLAGS.eval_metrics,
    )
    logging.info('exp_params: %s', exp_params)

    # Make and run beam pipeline.
    beam_options = None

    if run_data_prep:
        input_filenames_list, output_filenames = _remove_existing_outputs(
            input_filenames_list, output_filenames)
        logging.info('Data prep on: %s, %s...', input_filenames_list,
                     output_filenames)
        with beam.Pipeline(beam_options) as root:
            for i, (input_filenames_or_glob, output_filename) in enumerate(
                    zip(input_filenames_list, output_filenames)):
                utils.data_prep_pipeline(
                    root=root,
                    input_filenames_or_glob=input_filenames_or_glob,
                    output_filename=output_filename,
                    data_prep_behavior=FLAGS.data_prep_behavior,
                    beam_params=prep_params,
                    suffix=str(i))

    # Check that previous beam pipeline wrote outputs.
    sklearn_utils.validate_flags(train_glob, eval_glob, test_glob,
                                 sklearn_results_output_file)
    logging.info('Eval sklearn...')
    with beam.Pipeline(beam_options) as root:
        _ = (root
             | 'MakeCollection' >> beam.Create(exp_params)
             | 'CalcScores' >>
             beam.Map(lambda d: (d, sklearn_utils.train_and_get_score(**d)))
             | 'FormatText' >> beam.Map(sklearn_utils.format_text_line)
             | 'Reshuffle' >> beam.Reshuffle()
             | 'WriteOutput' >> beam.io.WriteToText(
                 sklearn_results_output_file, num_shards=1))