Example #1
def convert_and_save_tf_examples(features_and_labels,
                                 output_path,
                                 debug_output=False):
    """Beam PTransform taking features and labels and saving them as tf examples.

    Args:
      features_and_labels: PCollection of APData holding the features and labels.
      output_path: base folder for output, the function creates 3 sharded
        tfrecord files for the train, val and test sets.
      debug_output: whether to output debug information in addition to model
        inputs in the tf examples.
    """
    def _save_fn(pcoll, filename):
        _ = (pcoll
             | f"Reshuffle({filename})" >> beam.Reshuffle()
             | f"ToCSV({filename})" >>
             beam.Map(lambda x: "{},{}".format(x[1].subject_id, x[1].note_id))
             | f"SaveCSV({filename})" >> beam.io.WriteToText(
                 os.path.join(output_path, filename + ".csv"),
                 header="subject_id,note_id"))

        _ = (pcoll
             | f"ConvertToTFExamples({filename})" >> beam.Map(
                 convert_to_tf_examples, debug_output=debug_output)
             | f"SaveTFRecords({filename})" >> beam.io.WriteToTFRecord(
                 os.path.join(output_path, filename + ".tfrecord"),
                 coder=beam.coders.ProtoCoder(tf.train.Example)))

    train_set, val_set, test_set = (
        features_and_labels
        | "SplitTrainValTest" >> beam.Partition(
            lambda x, n_part: max(x[1].partition.value - 1, 0), 3))

    _save_fn(train_set, "train_set")
    _save_fn(val_set, "val_set")
    _save_fn(test_set, "test_set")

    # Partition by rating and augmentation status and save stratification.
    def _split_by_status(element, n_part):
        del n_part
        _, ap_data = element
        return 2 * (ap_data.is_rated) + bool(ap_data.augmentation_name)

    by_status = (train_set
                 | "SplitByStatus" >> beam.Partition(_split_by_status, 4))

    for i, rated_status in enumerate(["nonrated", "rated"]):
        for j, aug_status in enumerate(["nonaugmented", "augmented"]):
            stratus = f"train_{rated_status}_{aug_status}"
            _save_fn(by_status[i * 2 + j], stratus)
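The status index above packs the two booleans into a partition number in [0, 4). A minimal, self-contained sketch of the same mapping (FakeAPData is a hypothetical stand-in for APData, carrying only the two fields the split reads):

import collections

import apache_beam as beam
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to

# Hypothetical stand-in for APData with only the fields used by the split.
FakeAPData = collections.namedtuple("FakeAPData", ["is_rated", "augmentation_name"])

def split_by_status(element, n_part):
    del n_part  # always called with 4 partitions
    _, ap_data = element
    return 2 * ap_data.is_rated + bool(ap_data.augmentation_name)

with TestPipeline() as p:
    elements = [
        ("a", FakeAPData(is_rated=False, augmentation_name="")),      # -> 0
        ("b", FakeAPData(is_rated=False, augmentation_name="flip")),  # -> 1
        ("c", FakeAPData(is_rated=True, augmentation_name="")),       # -> 2
        ("d", FakeAPData(is_rated=True, augmentation_name="crop")),   # -> 3
    ]
    by_status = (p
                 | beam.Create(elements)
                 | beam.Partition(split_by_status, 4))
    for i, expected in enumerate(elements):
        assert_that(by_status[i], equal_to([expected]), label=f"partition_{i}")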
Example #2
    def expand(self, model_pipe):
        """Function that takes in a beam.PCollection of datastore models and
        returns a beam.PCollection of validation errors.

        Args:
            model_pipe: beam.PCollection. A collection of models.

        Returns:
            beam.PCollection. A collection of errors represented as
            key-value pairs.
        """
        not_deleted, deleted = (
            model_pipe
            | 'SplitByDeleted' >> beam.Partition(lambda m, _: int(m.deleted), 2)
        )

        deletion_errors = deleted | beam.ParDo(ValidateDeleted())

        time_field_validation_errors = (
            not_deleted | beam.ParDo(ValidateModelTimeFields()))

        model_id_validation_errors = (
            not_deleted
            | beam.ParDo(
                ValidateModelIdWithRegex(), self._get_model_id_regex())
        )

        return (
            (
                deletion_errors,
                time_field_validation_errors,
                model_id_validation_errors)
            | beam.Flatten())
Example #3
    def Do(self, input_dict, output_dict, exec_properties):
        """Takes an input data source and generates train and eval tf examples.

        Args:
          input_dict: Input dict from input key to a list of Artifacts. Depends
            on detailed example gen implementation.
          output_dict: Output dict from output key to a list of Artifacts.
            - examples: train and eval split of tf examples.
          exec_properties: A dict of execution properties. Depends on detailed
            example gen implementation.
            - output: JSON string of example_gen_pb2.Output instance, providing
              output configuration.

        Returns:
          None

        Raises:
          RuntimeError: if output split config is not specified.
        """
        self._log_startup(input_dict, output_dict, exec_properties)

        # Get output split information.
        output_config = example_gen_pb2.Output()
        json_format.Parse(exec_properties['output'], output_config)
        self._check_split_config(output_config.split_config)
        splits = output_config.split_config.splits
        # Calculate split buckets.
        buckets = []
        total_buckets = 0
        for split in splits:
            total_buckets += split.hash_buckets
            buckets.append(total_buckets)

        tf.logging.info('Generating examples.')
        with beam.Pipeline(argv=self._get_beam_pipeline_args()) as pipeline:
            input_to_example = self.GetInputSourceToExamplePTransform()
            example_splits = (
                pipeline
                | 'InputSourceToExample' >> input_to_example(
                    input_dict, exec_properties)
                # Returns deterministic string as partition is based on it.
                | 'SerializeDeterministically' >>
                beam.Map(lambda x: x.SerializeToString(deterministic=True))
                | 'SplitData' >> beam.Partition(_partition_fn, len(buckets),
                                                buckets))
            # TODO(jyzhao): make shuffle optional.
            # pylint: disable=expression-not-assigned
            for index, example_split in enumerate(example_splits):
                (example_split
                 | 'ShuffleSplit' + splits[index].name >>
                 beam.transforms.Reshuffle()
                 | 'OutputSplit' + splits[index].name >>
                 beam.io.WriteToTFRecord(os.path.join(
                     types.get_split_uri(output_dict['examples'],
                                         splits[index].name),
                     DEFAULT_FILE_NAME),
                                         file_name_suffix='.gz'))
            # pylint: enable=expression-not-assigned

        tf.logging.info('Examples generated.')
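_partition_fn is not shown above. A plausible sketch, assuming the element is the deterministically serialized example and buckets is the cumulative list built in Do() (the real example-gen implementation may differ):

import bisect
import zlib

def _partition_fn(serialized_example, num_partitions, buckets):
    """Maps a serialized example into one of the cumulative hash buckets."""
    del num_partitions  # the bucket list already encodes the split sizes
    # Use a stable hash so the assignment is deterministic across workers.
    bucket = zlib.crc32(serialized_example) % buckets[-1]
    # buckets is cumulative, e.g. [2, 5, 10]; bisect maps the hash value to the
    # first split whose cumulative total exceeds it.
    return bisect.bisect(buckets, bucket)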
def process_satellite_lines(
    lines: beam.pvalue.PCollection[Tuple[str, str]]
) -> Tuple[beam.pvalue.PCollection[Row], beam.pvalue.PCollection[Row]]:
    """Process both satellite and blockpage data files.

    Args:
      lines: input lines from all satellite files. Tuple[filename, line]

    Returns:
      post_processed_satellite: rows of satellite scan data
      blockpage_rows: rows of blockpage data
    """
    # PCollection[Tuple[filename,line]] x3
    tags, blockpages, lines = lines | beam.Partition(
        partition_satellite_input, NUM_SATELLITE_INPUT_PARTITIONS)

    # PCollection[Row]
    tagged_satellite = process_satellite_with_tags(lines, tags)
    # PCollection[Row]
    post_processed_satellite = post_processing_satellite(tagged_satellite)

    # PCollection[Row]
    blockpage_rows = process_satellite_blockpages(blockpages)

    return post_processed_satellite, blockpage_rows
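partition_satellite_input and NUM_SATELLITE_INPUT_PARTITIONS are defined elsewhere in the original module. A hedged sketch that is consistent with the expectations in the test_partition_satellite_input case shown in Example #8 below (tag files go to partition 0, blockpage files to 1, scan rows to 2):

from typing import Tuple

NUM_SATELLITE_INPUT_PARTITIONS = 3  # tags, blockpages, rows

_TAG_FILENAMES = ('resolvers.json', 'tagged_resolvers.json', 'tagged_answers.json')

def partition_satellite_input(line: Tuple[str, str], num_partitions: int) -> int:
    filename, _ = line
    if filename.endswith(_TAG_FILENAMES):
        return 0  # tag files
    if filename.endswith('blockpages.json'):
        return 1  # blockpage files
    return 2      # satellite scan rows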
Example #5
def model_multiple_pcollections_partition(contents, output_path):
    """Splitting a PCollection with Partition."""
    some_hash_fn = lambda s: ord(s[0])

    def get_percentile(i):
        """Assume i in [0,100)."""
        return i

    import apache_beam as beam
    with TestPipeline() as p:  # Use TestPipeline for testing.

        students = p | beam.Create(contents)

        # [START model_multiple_pcollections_partition]
        def partition_fn(student, num_partitions):
            return int(get_percentile(student) * num_partitions / 100)

        by_decile = students | beam.Partition(partition_fn, 10)
        # [END model_multiple_pcollections_partition]
        # [START model_multiple_pcollections_partition_40th]
        fortieth_percentile = by_decile[4]
        # [END model_multiple_pcollections_partition_40th]

        ([by_decile[d] for d in range(10) if d != 4] + [fortieth_percentile]
         | beam.Flatten()
         | beam.io.WriteToText(output_path))
def ReadImagesFromDisk(pipeline: beam.Pipeline,
                       base_path: Text) -> beam.pvalue.PCollection:
    """
    The Beam PTransform used to load a collection of images and metadata
    from a local file system or a remote cloud storage bucket.

    Args:
        pipeline (beam.Pipeline): Input beam.Pipeline object coming
         from a TFX Executor.
        base_path (Text): Base directory containing images and labels.
    """

    wildcard_qualifier = "*"

    # ingest all the files from the base path by supplying the wildcard
    file_pattern = os.path.join(base_path, wildcard_qualifier)

    allowed_ext = [".jpg", ".json", ".png", ".txt", ".jpeg"]

    images, label_file = (
        pipeline
        | fileio.MatchFiles(file_pattern)
        | fileio.ReadMatches()
        | beam.Map(read_file_content)
        | "FilterOutFiles" >> beam.Filter(lambda x: x[FILE_EXT] in allowed_ext)
        | "SplitLabelFile" >> beam.Partition(SplitByFileName, 2))

    # label_file is a PCollection holding a single dict; AsSingleton turns it
    # into a side input.
    label_dict = beam.pvalue.AsSingleton(label_file)
    ready_images = (
        images
        | "AddLabelAndMetadata" >> beam.Map(add_label_and_metadata, label_dict))

    return ready_images
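SplitByFileName and the helpers above (read_file_content, FILE_EXT, add_label_and_metadata) live elsewhere in the original repository. A heavily hedged sketch of the split, assuming each element is a dict produced by read_file_content and that the single label file can be recognized by name; both constants below are illustrative assumptions:

FILE_NAME = "file_name"          # hypothetical dict key set by read_file_content
LABEL_FILE_NAME = "labels.json"  # assumed name of the single label file

def SplitByFileName(element, num_partitions):
    """Routes the label file to partition 1 and every image to partition 0."""
    del num_partitions  # always called with 2
    return 1 if element[FILE_NAME].endswith(LABEL_FILE_NAME) else 0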
Example #7
def partition_lambda(test=None):
  # pylint: disable=line-too-long, expression-not-assigned
  # [START partition_lambda]
  import apache_beam as beam

  durations = ['annual', 'biennial', 'perennial']

  with beam.Pipeline() as pipeline:
    annuals, biennials, perennials = (
        pipeline
        | 'Gardening plants' >> beam.Create([
            {'icon': '🍓', 'name': 'Strawberry', 'duration': 'perennial'},
            {'icon': '🥕', 'name': 'Carrot', 'duration': 'biennial'},
            {'icon': '🍆', 'name': 'Eggplant', 'duration': 'perennial'},
            {'icon': '🍅', 'name': 'Tomato', 'duration': 'annual'},
            {'icon': '🥔', 'name': 'Potato', 'duration': 'perennial'},
        ])
        | 'Partition' >> beam.Partition(
            lambda plant, num_partitions: durations.index(plant['duration']),
            len(durations),
        )
    )

    annuals | 'Annuals' >> beam.Map(lambda x: print('annual: {}'.format(x)))
    biennials | 'Biennials' >> beam.Map(
        lambda x: print('biennial: {}'.format(x)))
    perennials | 'Perennials' >> beam.Map(
        lambda x: print('perennial: {}'.format(x)))
    # [END partition_lambda]
    # pylint: enable=line-too-long, expression-not-assigned
    if test:
      test(annuals, biennials, perennials)
Example #8
    def test_partition_satellite_input(self) -> None:  # pylint: disable=no-self-use
        """Test partitioning of Satellite input into tags, blockpages, and results."""
        data = [
            ("CP_Satellite-2020-09-02-12-00-01/resolvers.json", "tag"),
            ("CP_Satellite-2020-09-02-12-00-01/resolvers.json", "tag"),
            ("CP_Satellite-2020-09-02-12-00-01/tagged_resolvers.json", "tag"),
            ("CP_Satellite-2020-09-02-12-00-01/tagged_resolvers.json", "tag"),
            ("CP_Satellite-2020-09-02-12-00-01/tagged_answers.json", "tag"),
            ("CP_Satellite-2020-09-02-12-00-01/tagged_answers.json", "tag"),
            ("CP_Satellite-2021-09-02-12-00-01/blockpages.json", "blockpage"),
            ("CP_Satellite-2020-09-02-12-00-01/interference.json", "row"),
            ("CP_Satellite-2020-09-02-12-00-01/interference.json", "row")
        ]

        expected_tags = data[0:6]
        expected_blockpages = data[6:7]
        expected_rows = data[7:]

        with TestPipeline() as p:
            lines = p | 'create data' >> beam.Create(data)

            tags, blockpages, rows = lines | beam.Partition(
                satellite.partition_satellite_input, 3)

            beam_test_util.assert_that(tags,
                                       beam_test_util.equal_to(expected_tags),
                                       label='assert_that/tags')
            beam_test_util.assert_that(
                blockpages,
                beam_test_util.equal_to(expected_blockpages),
                label='assert_that/blockpages')
            beam_test_util.assert_that(rows,
                                       beam_test_util.equal_to(expected_rows),
                                       label='assert_that/rows')
def _generate_random_train_eval_examples_from_bq(base_query, base_sample_rate,
                                                 train_samples_fraction,
                                                 train_output_dir,
                                                 eval_output_dir,
                                                 pipeline_options):
    sampling_query = f'''
    SELECT * FROM (
        SELECT *, (ABS(FARM_FINGERPRINT(row_id)) / 0x7FFFFFFFFFFFFFFF) AS selection_chance
        FROM ({base_query})
    )
    WHERE selection_chance < {base_sample_rate}
    '''

    row_to_tf_example_converter = BigQueryToTFExampleConverter(sampling_query)

    def train_eval_partition_fn(row, n_partitions):
        return int(row['selection_chance'] > base_sample_rate *
                   train_samples_fraction)

    bookkeeping_columns = ['row_id', 'selection_chance']
    with beam.Pipeline(options=pipeline_options) as pipeline:
        all_samples = pipeline | 'QueryTable' >> beam.io.Read(
            beam.io.BigQuerySource(query=sampling_query,
                                   use_standard_sql=True))
        train_samples, eval_samples = all_samples | 'TrainEvalPartition' >> beam.Partition(
            train_eval_partition_fn, 2)
        train_samples | "WriteTrainDataset" >> WriteBigQueryRowsToTFRecord(
            row_to_tf_example_converter, train_output_dir, bookkeeping_columns)
        eval_samples | "WriteEvalDataset" >> WriteBigQueryRowsToTFRecord(
            row_to_tf_example_converter, eval_output_dir, bookkeeping_columns)
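A small sanity check of the split arithmetic above, with illustrative numbers (not from the original source): the query keeps rows whose selection_chance falls below base_sample_rate, and the partition function sends a row to eval only when it lands in the top (1 - train_samples_fraction) slice of that kept range.

import random

base_sample_rate = 0.1
train_samples_fraction = 0.8

# Simulate the kept rows: selection_chance uniform in [0, base_sample_rate).
kept = [random.random() * base_sample_rate for _ in range(100_000)]
train = sum(c <= base_sample_rate * train_samples_fraction for c in kept)
print(train / len(kept))  # ~0.8, i.e. train_samples_fraction of the kept rows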
Example #10
def partition_multiple_arguments(test=None):
  # pylint: disable=expression-not-assigned
  # [START partition_multiple_arguments]
  import apache_beam as beam
  import json

  def split_dataset(plant, num_partitions, ratio):
    assert num_partitions == len(ratio)
    bucket = sum(map(ord, json.dumps(plant))) % sum(ratio)
    total = 0
    for i, part in enumerate(ratio):
      total += part
      if bucket < total:
        return i
    return len(ratio) - 1

  with beam.Pipeline() as pipeline:
    train_dataset, test_dataset = (
        pipeline
        | 'Gardening plants' >> beam.Create([
            {'icon': '🍓', 'name': 'Strawberry', 'duration': 'perennial'},
            {'icon': '🥕', 'name': 'Carrot', 'duration': 'biennial'},
            {'icon': '🍆', 'name': 'Eggplant', 'duration': 'perennial'},
            {'icon': '🍅', 'name': 'Tomato', 'duration': 'annual'},
            {'icon': '🥔', 'name': 'Potato', 'duration': 'perennial'},
        ])
        | 'Partition' >> beam.Partition(split_dataset, 2, ratio=[8, 2])
    )

    train_dataset | 'Train' >> beam.Map(lambda x: print('train: {}'.format(x)))
    test_dataset | 'Test' >> beam.Map(lambda x: print('test: {}'.format(x)))
    # [END partition_multiple_arguments]
    # pylint: enable=expression-not-assigned
    if test:
      test(train_dataset, test_dataset)
Example #11
def run(argv=None, save_main_session=True):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='input', required=True)
    parser.add_argument('--output', dest='output', required=True)
    args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions()
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session

    p = beam.Pipeline(options=pipeline_options)

    # Input: '\t'.join((timestamp, ip, agent, url, referer))
    lines = p | 'Read' >> ReadFromText(args.input)
    requests = lines | 'Split' >> beam.Map(lambda l: l.split('\t'))
    shards = requests | 'Shard' >> beam.Partition(shard, NUM_SHARD)
    for i in range(NUM_SHARD):
        shards[i] | f'Write_{i}' >> WriteToText(f'{args.output}_shard{i}',
                                                num_shards=1)

    result = p.run()
    result.wait_until_finish()

    if (not hasattr(result, 'has_job')  # direct runner
            or result.has_job):  # not just a template creation
        query_result = result.metrics().query()
        for counter in query_result['counters']:
            logging.info(f'{counter.key.metric.name}: {counter.result}')
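shard and NUM_SHARD are defined elsewhere in the original module. A hedged sketch that shards each request by a stable hash of its IP field (index 1 of the tab-split line); the choice of field is an assumption:

import zlib

NUM_SHARD = 4  # illustrative value

def shard(request, num_partitions):
    ip = request[1]  # request = [timestamp, ip, agent, url, referer]
    return zlib.crc32(ip.encode('utf-8')) % num_partitions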
Example #12
def model_multiple_pcollections_flatten(contents, output_path):
    """Merging a PCollection with Flatten."""
    some_hash_fn = lambda s: ord(s[0])
    partition_fn = lambda element, partitions: some_hash_fn(element) % partitions
    import apache_beam as beam
    with TestPipeline() as p:  # Use TestPipeline for testing.

        # Partition into deciles
        partitioned = p | beam.Create(contents) | beam.Partition(
            partition_fn, 3)
        pcoll1 = partitioned[0]
        pcoll2 = partitioned[1]
        pcoll3 = partitioned[2]

        # Flatten them back into 1

        # A collection of PCollection objects can be represented simply
        # as a tuple (or list) of PCollections.
        # (The SDK for Python has no separate type to store multiple
        # PCollection objects, whether containing the same or different
        # types.)
        # [START model_multiple_pcollections_flatten]
        merged = (
            (pcoll1, pcoll2, pcoll3)
            # A list of tuples can be "piped" directly into a Flatten transform.
            | beam.Flatten())
        # [END model_multiple_pcollections_flatten]
        merged | beam.io.WriteToText(output_path)
Example #13
  def testEachPTransformCopiedOnce(self):
    with beam.Pipeline() as p:
      created = p | 'Create1' >> beam.Create([(1, 'a'), (2, 'b')])
      modified1 = (created
                   | 'Transform1' >> beam.Map(
                       lambda x: DeepCopyTest._CountingIdentityFn(
                           'Transform1', x)))
      partition_fn = lambda element, partitions: element[0] % partitions
      p1, p2 = (modified1
                | 'Partition' >> beam.Partition(partition_fn, 2))
      merged = (p1, p2) | 'Flatten1' >> beam.Flatten()
      modified2 = (merged
                   | 'Transform2' >> beam.Map(
                       lambda x: DeepCopyTest._CountingIdentityFn(
                           'Transform2', x)))

      copied = deep_copy.deep_copy(modified2)

      # Check that deep copy was performed.
      self.assertIsNot(copied.producer.inputs[0], modified2.producer.inputs[0])
      self.assertIsNot(copied.producer.inputs[0].producer.inputs[0],
                       modified2.producer.inputs[0].producer.inputs[0])
      self.assertIsNot(copied.producer.inputs[0].producer.inputs[1],
                       modified2.producer.inputs[0].producer.inputs[1])

    # Check counts of processed items.
    self.assertEqual(DeepCopyTest._counts['Transform1'], 4)
    self.assertEqual(DeepCopyTest._counts['Transform2'], 4)
Example #14
def configure_pipeline(p, dataset_train, dataset_eval, checkpoint_path, output_dir, job_id):
  source_train = _util.get_sources_from_dataset(p, dataset_train, 'train')
  labels_source = [source_train]
  if dataset_eval is not None:
    source_eval = _util.get_sources_from_dataset(p, dataset_eval, 'eval')
    labels_source.append(source_eval)

  labels = _labels_pipeline(labels_source)
  train_preprocessed = _transformation_pipeline(source_train, checkpoint_path, labels, 'train')
  if dataset_eval is not None:
    # explicit eval data.
    eval_preprocessed = _transformation_pipeline(source_eval, checkpoint_path, labels, 'eval')
  else:
    # Split train/eval.
    train_preprocessed, eval_preprocessed = (train_preprocessed |
                                             'Random Partition' >>
                                             beam.Partition(TrainEvalSplitPartitionFn(), 2))

  output_train_path = os.path.join(output_dir, job_id, 'train')
  output_eval_path = os.path.join(output_dir, job_id, 'eval')
  labels_file = os.path.join(output_dir, job_id, 'labels')
  labels_save = (labels |
                 'Write labels' >>
                 beam.io.textio.WriteToText(labels_file, shard_name_template=''))
  train_save = train_preprocessed | 'Save train to disk' >> SaveFeatures(output_train_path)
  eval_save = eval_preprocessed | 'Save eval to disk' >> SaveFeatures(output_eval_path)
  # Make sure we write "latest" file after train and eval data are successfully written.
  output_latest_file = os.path.join(output_dir, 'latest')
  ([eval_save, train_save, labels_save] | 'Wait for train eval saving' >> beam.Flatten() |
      'Fixed One' >> beam.transforms.combiners.Sample.FixedSizeGlobally(1) |
      beam.Map(lambda path: job_id) |
      'WriteLatest' >> beam.io.textio.WriteToText(output_latest_file, shard_name_template=''))
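TrainEvalSplitPartitionFn is defined elsewhere in the original codebase. A hedged sketch of a random train/eval splitter written as a beam.PartitionFn subclass; the 70/30 default ratio is an assumption:

import random

import apache_beam as beam

class TrainEvalSplitPartitionFn(beam.PartitionFn):
    """Randomly routes elements to partition 0 (train) or 1 (eval)."""

    def __init__(self, train_fraction=0.7):
        self._train_fraction = train_fraction

    def partition_for(self, element, num_partitions, *args, **kwargs):
        del element, args, kwargs  # the split is purely random
        return 0 if random.random() < self._train_fraction else 1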
    def expand(self, input_models):
        """Transforms a PCollection of models into validation errors.

        Args:
            input_models: beam.PCollection. A collection of models.

        Returns:
            beam.PCollection. A collection of errors represented as
            key-value pairs.
        """
        existing_models, deleted_models = (
            input_models
            | 'Split by deleted' >> beam.Partition(
                lambda model, unused_num_partitions: int(model.deleted), 2))
        deletion_errors = (
            deleted_models
            | 'Validate deleted models' >> beam.ParDo(ValidateDeletedModel()))
        timestamp_errors = (
            existing_models
            | 'Validate timestamps' >> beam.ParDo(ValidateModelTimestamps()))
        id_errors = (existing_models
                     | 'Validate id' >> beam.ParDo(ValidateModelIdWithRegex(),
                                                   self.get_model_id_regex()))

        error_pcolls = (deletion_errors, timestamp_errors, id_errors)
        return error_pcolls | beam.Flatten()
Example #16
def main():
    data = [
        ('a', 1, 100),
        ('b', 2, 100),
        ('c', 1, 100),
        ('d', 2, 100),
        ('e', 1, 100),
        ('f', 1, 100),
        ('g', 1, 100),
        ('h', 1, 100),
        ('i', 1, 100),
    ]
    with beam.Pipeline(options=pipeline_options.PipelineOptions()) as p:
        students = p | 'create_data1' >> beam.Create(data)

        def partition_fn(student, num_partitions):
            print(student)
            return ord(student[0]) % num_partitions

        by_decile = students | 'by_decile' >> beam.Partition(partition_fn, 5)
        path_output = os.path.join(PATH_TO_THIS_DIR, 'by_decile.txt')
        # beam.Partition returns a tuple of PCollections; flatten it back into
        # a single PCollection before writing.
        (by_decile
         | 'flatten' >> beam.Flatten()
         | 'write_flatten' >> beam.io.WriteToText(path_output,
                                                  file_name_suffix='.csv'))
Example #17
def run_pipeline(pipeline_args, known_args):
    """Splits images into separate directories using thresholds on randnum.

    Args:
      pipeline_args: arguments ingested by the Beam pipeline.
      known_args: additional arguments for this project, such as the storage
        bucket, source_image_dir, and dest_image_dir.

    Returns:
      None. Runs the Beam pipeline and copies output files to the split
      directories.
    """
    # Specify pipeline options
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    # Attach bucket prefix if running on cloud
    source_images_pattern = known_args.source_image_dir + '/*'
    dest_prefix = known_args.dest_image_dir + '/'
    if known_args.cloud:
        source_images_pattern = ('gs://' + known_args.storage_bucket + '/' +
                                 source_images_pattern)
        dest_prefix = ('gs://' + known_args.storage_bucket + '/' + dest_prefix)

    # Get output directories for split images
    split_names = known_args.split_names
    split_fractions = known_args.split_fractions
    dest_images_dirs = [dest_prefix + x + '/' for x in split_names]

    # Create output directories if they do not already exist (for local runs)
    for dest_images_dir in dest_images_dirs:
        if not FileSystems.exists(dest_images_dir):
            FileSystems.mkdirs(dest_images_dir)

    # Log information on source, destination, and split fractions
    split_log_list = [
        x[0] + '(' + str(x[1]) + ')' for x in zip(split_names, split_fractions)
    ]
    logging.info('Starting ' + ' | '.join(split_log_list) +
                 ' split from images with source file pattern ' +
                 source_images_pattern)
    logging.info('Destination parent directory: ' + dest_prefix)

    with beam.Pipeline(options=pipeline_options) as p:
        # Read files and partition pipelines
        split_pipelines = (
            p
            | 'read_images' >> beam.io.Read(
                LabeledImageFileReader(source_images_pattern))
            | 'split_images' >> beam.Partition(
                generate_split_fn(split_fractions), len(split_fractions)))

        # Write each pipeline to a corresponding output directory
        for partition, split_name_and_dest_dir in enumerate(
                zip(split_names, dest_images_dirs)):
            _ = (split_pipelines[partition]
                 | 'write_' + split_name_and_dest_dir[0] >> beam.Map(
                     write_to_directory, dst_dir=split_name_and_dest_dir[1]))

    logging.info('Done splitting image sets')
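generate_split_fn is defined elsewhere. A hedged sketch, assuming each element read by LabeledImageFileReader carries a randnum attribute in [0, 1), as the docstring's "thresholds on randnum" suggests:

import itertools

def generate_split_fn(split_fractions):
    """Returns a partition function that buckets elements by their randnum."""
    # Cumulative upper thresholds, e.g. [0.8, 0.1, 0.1] -> [0.8, 0.9, 1.0].
    thresholds = list(itertools.accumulate(split_fractions))

    def split_fn(element, num_partitions):
        for index, threshold in enumerate(thresholds):
            if element.randnum < threshold:
                return index
        return num_partitions - 1  # guard against floating point rounding

    return split_fn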
Example #18
    def expand(self, pvalue):
        frames = (pvalue.pipeline
                  | Read(self._source)
                  | beam.Partition(splitBadFiles, 2))
        chunks = (frames[1]
                  | beam.FlatMap(lambda e: [e])
                  | beam.CombinePerKey(combineTZ()))
        return chunks
Example #19
def configure_pipeline(p, opt):
    # Type: (apache_beam.Pipeline, apache_beam.PipelineOptions) -> None
    """Specify PCollection and transformations in pipeline."""
    # Create a map of study_uid to label.
    study_uid_to_label = tcia_utils.GetStudyUIDToLabelMap()

    # Create a map of study_uid -> path of images in GCS
    study_uid_to_image_path = _get_study_uid_to_image_path_map(opt.input_path)

    # Create a map of study_uid -> (GCS path, label)
    # Split dataset into training, validation and test.
    paths_and_labels = []
    dataset_size = len(study_uid_to_label)
    training_size = dataset_size * (100 - opt.testing_percentage -
                                    opt.validation_percentage) / 100
    validation_size = dataset_size * opt.validation_percentage / 100
    testing_size = dataset_size * opt.testing_percentage / 100
    logging.info('Number of images in training dataset: %s', training_size)
    logging.info('Number of images in validation dataset: %s', validation_size)
    logging.info('Number of images in testing dataset: %s', testing_size)

    count = 0
    for k, v in study_uid_to_label.items():
        if k not in study_uid_to_image_path:
            logging.warning('Could not find image with study_uid %s in GCS', k)
            continue
        if count < training_size:
            dataset = constants.TRAINING_DATASET
        elif count >= training_size and count < (training_size +
                                                 validation_size):
            dataset = constants.VALIDATION_DATASET
        else:
            dataset = constants.TESTING_DATASET
        count += 1
        paths_and_labels.append((dataset, study_uid_to_image_path[k], v))

    # Shuffle the input
    random.shuffle(paths_and_labels)
    parts = (p
             | 'Download Labels' >> beam.Create(paths_and_labels)
             | 'Preprocess Image' >> beam.ParDo(PreprocessImage())
             | 'Split into Training-Validation-Testing' >> beam.Partition(
                 _partition_fn, 3))

    # Branch into workflows that serialize training/validation/testing TFRecords.
    for idx, path_suffix in enumerate([
            constants.TRAINING_DATASET, constants.VALIDATION_DATASET,
            constants.TESTING_DATASET
    ]):
        _ = (parts[idx]
             | 'Serialize TFRecord ' + path_suffix >>
             beam.Map(lambda x: x.SerializeToString())
             | 'Save TFRecord to GCS ' + path_suffix >>
             beam.io.WriteToTFRecord(
                 os.path.join(opt.output_path, path_suffix),
                 file_name_suffix='.tfrecord'))
def main(argv=None):
    '''Run Preprocessing as a Dataflow pipeline.'''
    args = parse_arguments(sys.argv if argv is None else argv)
    if args.cloud:
        logging.info('Start running in the cloud')
        options = {
            'runner': 'DataflowRunner',
            'job_name': ('mlengine-boilerplate-{}'.format(
                datetime.datetime.now().strftime('%Y%m%d%H%M%S'))),
            'staging_location': os.path.join(args.output_dir, 'staging'),
            'temp_location': os.path.join(args.output_dir, 'tmp'),
            'project': args.project_id,
            'zone': 'europe-west1-d',
            'autoscaling_algorithm': 'THROUGHPUT_BASED',
            'save_main_session': True,
            'setup_file': './setup.py',
        }
        pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
        print(pipeline_options)
    else:
        pipeline_options = None

    train_coder = coders.ExampleProtoCoder(schema)

    p = beam.Pipeline(options=pipeline_options)

    examples = (p
                | 'ReadData' >> beam.io.ReadFromText(DATA_DIR + '/*',
                                                     skip_header_lines=1)
                | 'buildExamples' >>
                beam.FlatMap(lambda raw_input: buildExample(raw_input)))

    examples_split = examples | beam.Partition(partition_fn, 3)
    example_dict = {
        'train': examples_split[0],
        'validation': examples_split[1],
        'test': examples_split[2]
    }

    for part, examples in example_dict.items():
        _ = examples | part + '_writeExamples' >> tfrecordio.WriteToTFRecord(
            file_path_prefix=os.path.join(args.output_dir, part + '_examples'),
            compression_type=filesystem.CompressionTypes.GZIP,
            coder=train_coder,
            file_name_suffix='.gz')

    p.run()
def main(argv=None):
    """Run preprocessing as a Dataflow pipeline.

    Args:
        argv (list): list of arguments

    """
    logging.info('running main')
    args = parse_arguments(sys.argv if argv is None else argv)

    if args.cloud:
        pipeline_options = get_cloud_pipeline_options(args.project_id,
                                                      args.output_dir)
    else:
        pipeline_options = None

    pipeline = beam.Pipeline(options=pipeline_options)

    all_labels = (pipeline | 'ReadDictionary' >> beam.io.ReadFromText(
        'gs://cloud-ml-data/img/flower_photos/dict.txt',
        strip_trailing_newlines=True))

    examples = (pipeline
                | 'ReadData' >> beam.io.ReadFromText(
                    'gs://cloud-ml-data/img/flower_photos/train_set.csv',
                    strip_trailing_newlines=True)
                | 'Split' >> beam.FlatMap(select_files)
                | 'OneHotEncoding' >> beam.FlatMap(
                    one_hot_encoding, beam.pvalue.AsIter(all_labels))
                | 'ReadImage' >> beam.FlatMap(process_image)
                | 'BuildExamples' >> beam.FlatMap(build_example))

    examples_split = examples | beam.Partition(partition_fn, 3)

    example_dict = {
        'train': examples_split[0],
        'validation': examples_split[1],
        'test': examples_split[2]
    }

    train_coder = coders.ExampleProtoCoder(schema)

    for part, examples in example_dict.items():
        _ = (examples
             | part + '_writeExamples' >> beam.io.tfrecordio.WriteToTFRecord(
                 file_path_prefix=os.path.join(
                     args.output_dir, part + '_examples'),
                 compression_type=beam.io.filesystem.CompressionTypes.GZIP,
                 coder=train_coder,
                 file_name_suffix='.tfrecord.gz'))

    logging.info('running pipeline')

    pipeline.run().wait_until_finish()
Example #22
    def expand(self, pcoll):
        table_spec = bigquery.TableReference(
            projectId='iotpubsub-1536350750202',
            datasetId='baybenames',
            # tableId='relation_extraction_data'
            tableId='relation_data_sample')
        return (
            pcoll
            | 'Read input table' >> beam.io.Read(
                beam.io.BigQuerySource(table_spec))
            | 'Split words' >> beam.ParDo(SplitSentence_Updated_Table())
            | 'Split test and training data' >> beam.Partition(
                lambda element, _: 0 if randint(0, 100) < 80 else 1, 2))
def split_data(examples, train_fraction, eval_fraction):
    """Splits the data into train/eval/test."""
    def partition_fn(data, n_partition):
        random_value = random.random()
        if random_value < train_fraction:
            return 0
        if random_value < train_fraction + eval_fraction:
            return 1
        return 2

    examples_split = (examples
                      | 'SplitData' >> beam.Partition(partition_fn, 3))
    return examples_split
Example #24
def partition_lambda(test=None):
    # [START partition_lambda]
    import apache_beam as beam

    durations = ['annual', 'biennial', 'perennial']

    with beam.Pipeline() as pipeline:
        annuals, biennials, perennials = (
            pipeline
            | 'Gardening plants' >> beam.Create([
                {'icon': '🍓', 'name': 'Strawberry', 'duration': 'perennial'},
                {'icon': '🥕', 'name': 'Carrot', 'duration': 'biennial'},
                {'icon': '🍆', 'name': 'Eggplant', 'duration': 'perennial'},
                {'icon': '🍅', 'name': 'Tomato', 'duration': 'annual'},
                {'icon': '🥔', 'name': 'Potato', 'duration': 'perennial'},
            ])
            | 'Partition' >> beam.Partition(
                lambda plant, num_partitions: durations.index(plant['duration']),
                len(durations),
            ))
        _ = (annuals
             | 'Annuals' >> beam.Map(lambda x: print('annual: ' + str(x))))
        _ = (biennials
             | 'Biennials' >> beam.Map(lambda x: print('biennial: ' + str(x))))
        _ = (perennials
             | 'Perennials' >> beam.Map(lambda x: print('perennial: ' + str(x))))
        # [END partition_lambda]
        if test:
            test(annuals, biennials, perennials)
def preprocess(p, args):
    """Run preprocessing as pipeline."""
    train_eval_schema = _make_input_schema()

    train_eval_metadata = dataset_metadata.DatasetMetadata(
        schema=train_eval_schema)

    _ = (train_eval_metadata
         | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(
             os.path.join(args.output_dir, constants.RAW_METADATA_DIR),
             pipeline=p))

    train_eval_data = (p | 'ReadDataFromBQ' >> beam.io.Read(
        beam.io.BigQuerySource(query=_get_query('bigquery-public-data',
                                                'samples', 'gsod'),
                               use_standard_sql=True)))

    train_eval_data = train_eval_data | 'ValidateData' >> beam.ParDo(
        DataValidator())

    (transformed_train_eval_data,
     transformed_train_eval_metadata), transform_fn = (
         (train_eval_data, train_eval_metadata)
         | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset(
             get_preprocessing_fn()))

    _ = (transform_fn
         | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(args.output_dir))

    transformed_train_eval_coder = coders.ExampleProtoCoder(
        transformed_train_eval_metadata.schema)

    transformed_train_data, transformed_eval_data = (
        transformed_train_eval_data
        | 'Partition' >> beam.Partition(get_partition_fn(0.7), 2))

    _ = (transformed_train_data
         | 'SerializeTrainExamples' >> beam.Map(
             transformed_train_eval_coder.encode)
         | 'WriteTraining' >> beam.io.WriteToTFRecord(
             os.path.join(args.output_dir,
                          constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX),
             file_name_suffix=constants.DATA_FILE_SUFFIX))

    _ = (transformed_eval_data
         | 'SerializeEvalExamples' >> beam.Map(
             transformed_train_eval_coder.encode)
         | 'WriteEval' >> beam.io.WriteToTFRecord(
             os.path.join(args.output_dir,
                          constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX),
             file_name_suffix=constants.DATA_FILE_SUFFIX))
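get_partition_fn is defined elsewhere. A hedged sketch of a random two-way splitter matching the call get_partition_fn(0.7) above:

import random

def get_partition_fn(train_fraction):
    def partition_fn(element, num_partitions):
        del element, num_partitions  # purely random train/eval split
        return 0 if random.random() < train_fraction else 1
    return partition_fn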
Example #26
  def Do(self, input_dict,
         output_dict,
         exec_properties):
    """Takes a BigQuery SQL query and generates train and eval tf examples.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
      output_dict: Output dict from output key to a list of Artifacts.
        - examples: train and eval split of tf examples.
      exec_properties: A dict of execution properties.
        - query: BigQuery sql string.

    Returns:
      None

    Raises:
      RuntimeError: if query is missing in exec_properties.
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    training_tfrecord = types.get_split_uri(output_dict['examples'], 'train')
    eval_tfrecord = types.get_split_uri(output_dict['examples'], 'eval')

    if 'query' not in exec_properties:
      raise RuntimeError('Missing query.')
    query = exec_properties['query']

    tf.logging.info('Generating examples from BigQuery.')
    with beam.Pipeline(argv=self._get_beam_pipeline_args()) as pipeline:
      converter = _BigQueryConverter(query)
      example_splits = (
          pipeline
          | 'QueryTable' >> self._big_query_ptransform(query)
          | 'ToSerializedTFExample' >> beam.Map(
              converter.row_to_serialized_example)
          | 'SplitData' >> beam.Partition(_partition_fn, 2))
      # TODO(jyzhao): make shuffle optional.
      # pylint: disable=expression-not-assigned
      (example_splits[0]
       | 'ShuffleTrainSplit' >> beam.transforms.Reshuffle()
       | 'OutputTrainSplit' >> beam.io.WriteToTFRecord(
           os.path.join(training_tfrecord, DEFAULT_FILE_NAME),
           file_name_suffix='.gz'))
      (example_splits[1]
       | 'ShuffleEvalSplit' >> beam.transforms.Reshuffle()
       | 'OutputEvalSplit' >> beam.io.WriteToTFRecord(
           os.path.join(eval_tfrecord, DEFAULT_FILE_NAME),
           file_name_suffix='.gz'))
      # pylint: enable=expression-not-assigned
    tf.logging.info('Examples generated.')
Example #27
  def Do(self, input_dict,
         output_dict,
         exec_properties):
    """Takes input CSV data and generates train and eval tf examples.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - input-base: input dir that contains csv data. csv files must have
          header line.
      output_dict: Output dict from output key to a list of Artifacts.
        - examples: train and eval split of tf examples.
      exec_properties: A dict of execution properties.

    Returns:
      None
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    training_tfrecord = types.get_split_uri(output_dict['examples'], 'train')
    eval_tfrecord = types.get_split_uri(output_dict['examples'], 'eval')

    input_base = types.get_single_instance(input_dict['input-base'])
    input_base_uri = input_base.uri

    tf.logging.info('Generating examples.')

    raw_data = io_utils.get_only_uri_in_dir(input_base_uri)
    tf.logging.info('No split {}.'.format(raw_data))

    with beam.Pipeline(argv=self._get_beam_pipeline_args()) as pipeline:
      example_splits = (
          pipeline
          # pylint: disable=no-value-for-parameter
          | 'CsvToSerializedExample' >> _CsvToSerializedExample(raw_data)
          | 'SplitData' >> beam.Partition(_partition_fn, 2))
      # TODO(jyzhao): make shuffle optional.
      # pylint: disable=expression-not-assigned
      (example_splits[0]
       | 'ShuffleTrainSplit' >> beam.transforms.Reshuffle()
       | 'OutputTrainSplit' >> beam.io.WriteToTFRecord(
           os.path.join(training_tfrecord, DEFAULT_FILE_NAME),
           file_name_suffix='.gz'))
      (example_splits[1]
       | 'ShuffleEvalSplit' >> beam.transforms.Reshuffle()
       | 'OutputEvalSplit' >> beam.io.WriteToTFRecord(
           os.path.join(eval_tfrecord, DEFAULT_FILE_NAME),
           file_name_suffix='.gz'))
      # pylint: enable=expression-not-assigned

    tf.logging.info('Examples generated.')
Example #28
 def test_partition(self):
   p = TestPipeline()
   even, odd = (p
                | beam.Create([1, 2, 3])
                | 'even_odd' >> beam.Partition(lambda e, _: e % 2, 2))
   self.assertIsNotNone(even.element_type)
   self.assertIsNotNone(odd.element_type)
   res_even = (even
               | 'id_even' >> beam.ParDo(lambda e: [e]).with_input_types(int))
   res_odd = (odd
              | 'id_odd' >> beam.ParDo(lambda e: [e]).with_input_types(int))
   assert_that(res_even, equal_to([2]), label='even_check')
   assert_that(res_odd, equal_to([1, 3]), label='odd_check')
   p.run()
def run(
    raw_data_dir: str,
    raw_labels_dir: str,
    train_data_dir: str,
    eval_data_dir: str,
    train_eval_split: List[int],
    **beam_args: Any,
) -> str:

    labels = pd.concat([
        data_utils.read_labels(filename)
        for filename in tf.io.gfile.glob(f"{raw_labels_dir}/*.csv")
    ]).sort_values(by="start_time")

    beam_options = PipelineOptions(flags=[], **beam_args)
    pipeline = beam.Pipeline(options=beam_options)

    training_data, evaluation_data = (
        pipeline
        | "Data files" >> beam.Create([f"{raw_data_dir}/*.npz"])
        | "Expand pattern" >> beam.FlatMap(tf.io.gfile.glob)
        | "Reshuffle files" >> beam.Reshuffle()
        | "Read data" >> beam.Map(data_utils.read_data)
        | "Label data" >> beam.Map(data_utils.label_data, labels)
        | "Get training points" >> beam.FlatMap(
            data_utils.generate_training_points)
        | "Serialize TFRecords" >> beam.Map(trainer.serialize)
        | "Train-eval split" >> beam.Partition(
            lambda x, n: random.choices([0, 1], train_eval_split)[0], 2))

    (training_data
     | "Write train files" >> beam.io.WriteToTFRecord(
         f"{train_data_dir}/part",
         file_name_suffix=".tfrecords.gz",
         compression_type=beam.io.filesystems.CompressionTypes.GZIP,
     ))

    (evaluation_data
     | "Write eval files" >> beam.io.WriteToTFRecord(
         f"{eval_data_dir}/part",
         file_name_suffix=".tfrecords.gz",
         compression_type=beam.io.filesystems.CompressionTypes.GZIP,
     ))

    result = pipeline.run()
    logging.info(result)
    try:
        return result._job.id
    except Exception:
        return beam_args.get("job_name")
Example #30
def _split_data(examples,
                train_fraction=constants.TRAIN_SIZE,
                val_fraction=constants.VAL_SIZE):
    """Splits the data into train/validation/test."""
    def partition_fn(*_):
        random_value = np.random.random()
        if random_value < train_fraction:
            return 0
        if random_value < train_fraction + val_fraction:
            return 1
        return 2

    examples_split = examples | "SplitData" >> beam.Partition(partition_fn, 3)
    return zip([constants.TRAIN, constants.VAL, constants.TEST],
               examples_split)
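A hedged usage sketch (not part of the original source) showing how the (name, PCollection) pairs returned by _split_data might be consumed; output_dir and the upstream examples PCollection are assumed to exist:

import os

import apache_beam as beam

def write_splits(examples, output_dir):
    for name, split in _split_data(examples):
        _ = (split
             | f"Serialize_{name}" >> beam.Map(lambda x: x.SerializeToString())
             | f"Write_{name}" >> beam.io.WriteToTFRecord(
                 os.path.join(output_dir, name),
                 file_name_suffix=".tfrecord"))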