def convert_and_save_tf_examples(features_and_labels, output_path, debug_output=False): """Beam PTransform taking features and labels and saving them as tf examples. Args: features_and_labels: PCollection of APData holding the features and labels. output_path: base folder for output, the function creates 3 sharded tfrecord files for the train, val and test sets. debug_output: whether to output debug information in addition to model inputs in the tf examples. """ def _save_fn(pcoll, filename): _ = (pcoll | f"Reshuffle({filename})" >> beam.Reshuffle() | f"ToCSV({filename})" >> beam.Map(lambda x: "{},{}".format(x[1].subject_id, x[1].note_id)) | f"SaveCSV({filename})" >> beam.io.WriteToText( os.path.join(output_path, filename + ".csv"), header="subject_id,note_id")) _ = (pcoll | f"ConvertToTFExamples({filename})" >> beam.Map( convert_to_tf_examples, debug_output=debug_output) | f"SaveTFRecords({filename})" >> beam.io.WriteToTFRecord( os.path.join(output_path, filename + ".tfrecord"), coder=beam.coders.ProtoCoder(tf.train.Example))) train_set, val_set, test_set = ( features_and_labels | "SplitTrainValTest" >> beam.Partition( lambda x, n_part: max(x[1].partition.value - 1, 0), 3)) _save_fn(train_set, "train_set") _save_fn(val_set, "val_set") _save_fn(test_set, "test_set") # Partition by rating and augmentation status and save stratification. def _split_by_status(element, n_part): del n_part _, ap_data = element return 2 * (ap_data.is_rated) + bool(ap_data.augmentation_name) by_status = (train_set | "SplitByStatus" >> beam.Partition(_split_by_status, 4)) for i, rated_status in enumerate(["nonrated", "rated"]): for j, aug_status in enumerate(["nonaugmented", "augmented"]): stratus = f"train_{rated_status}_{aug_status}" _save_fn(by_status[i * 2 + j], stratus)
def expand(self, model_pipe): """Function that takes in a beam.PCollection of datastore models and returns a beam.PCollection of validation errors. Args: model_pipe: beam.PCollection. A collection of models. Returns: beam.PCollection. A collection of errors represented as key-value pairs. """ not_deleted, deleted = ( model_pipe | 'SplitByDeleted' >> beam.Partition(lambda m, _: int(m.deleted), 2) ) deletion_errors = deleted | beam.ParDo(ValidateDeleted()) time_field_validation_errors = ( not_deleted | beam.ParDo(ValidateModelTimeFields())) model_id_validation_errors = ( not_deleted | beam.ParDo( ValidateModelIdWithRegex(), self._get_model_id_regex()) ) return ( ( deletion_errors, time_field_validation_errors, model_id_validation_errors) | beam.Flatten())
def Do(self, input_dict, output_dict, exec_properties): """Take input data source and generates train and eval tf examples. Args: input_dict: Input dict from input key to a list of Artifacts. Depends on detailed example gen implementation. output_dict: Output dict from output key to a list of Artifacts. - examples: train and eval split of tf examples. exec_properties: A dict of execution properties. Depends on detailed example gen implementation. - output: JSON string of example_gen_pb2.Output instance, providing output configuration. Returns: None Raises: RuntimeError: if output split config is not specified. """ self._log_startup(input_dict, output_dict, exec_properties) # Get output split information. output_config = example_gen_pb2.Output() json_format.Parse(exec_properties['output'], output_config) self._check_split_config(output_config.split_config) splits = output_config.split_config.splits # Calculate split buckets. buckets = [] total_buckets = 0 for split in splits: total_buckets += split.hash_buckets buckets.append(total_buckets) tf.logging.info('Generating examples.') with beam.Pipeline(argv=self._get_beam_pipeline_args()) as pipeline: input_to_example = self.GetInputSourceToExamplePTransform() example_splits = ( pipeline | 'InputSourceToExample' >> input_to_example( input_dict, exec_properties) # Returns deterministic string as partition is based on it. | 'SerializeDeterministically' >> beam.Map(lambda x: x.SerializeToString(deterministic=True)) | 'SplitData' >> beam.Partition(_partition_fn, len(buckets), buckets)) # TODO(jyzhao): make shuffle optional. # pylint: disable=expression-not-assigned for index, example_split in enumerate(example_splits): (example_split | 'ShuffleSplit' + splits[index].name >> beam.transforms.Reshuffle() | 'OutputSplit' + splits[index].name >> beam.io.WriteToTFRecord(os.path.join( types.get_split_uri(output_dict['examples'], splits[index].name), DEFAULT_FILE_NAME), file_name_suffix='.gz')) # pylint: enable=expression-not-assigned tf.logging.info('Examples generated.')
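# `_partition_fn` is referenced above but defined elsewhere in the example-gen
# module. A minimal sketch of a hash-bucket partitioner compatible with the
# call `beam.Partition(_partition_fn, len(buckets), buckets)` is shown below;
# the hashing scheme is an assumption for illustration, not the actual TFX
# implementation.
import hashlib


def _partition_fn(serialized_example, num_partitions, buckets):
  """Assigns a serialized example to a split via cumulative hash buckets."""
  assert num_partitions == len(buckets)
  # Stable hash of the deterministic serialization, folded onto the total
  # number of hash buckets (the last entry of the cumulative list).
  bucket = int(hashlib.sha256(serialized_example).hexdigest(), 16) % buckets[-1]
  # Return the index of the first cumulative boundary above the bucket.
  for index, upper_bound in enumerate(buckets):
    if bucket < upper_bound:
      return index
  return len(buckets) - 1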
def process_satellite_lines( lines: beam.pvalue.PCollection[Tuple[str, str]] ) -> Tuple[beam.pvalue.PCollection[Row], beam.pvalue.PCollection[Row]]: """Process both satellite and blockpage data files. Args: lines: input lines from all satellite files. Tuple[filename, line] Returns: post_processed_satellite: rows of satellite scan data blockpage_rows: rows of blockpage data """ # PCollection[Tuple[filename,line]] x3 tags, blockpages, lines = lines | beam.Partition( partition_satellite_input, NUM_SATELLITE_INPUT_PARTITIONS) # PCollection[Row] tagged_satellite = process_satellite_with_tags(lines, tags) # PCollection[Row] post_processed_satellite = post_processing_satellite(tagged_satellite) # PCollection[Row] blockpage_rows = process_satellite_blockpages(blockpages) return post_processed_satellite, blockpage_rows
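# `partition_satellite_input` and NUM_SATELLITE_INPUT_PARTITIONS are imported
# from elsewhere in that project. The sketch below is an assumption about
# their shape, inferred from the unit test later in this collection: tag
# files, blockpage files, and everything else go to partitions 0, 1 and 2.
NUM_SATELLITE_INPUT_PARTITIONS = 3

_TAG_FILENAMES = ('resolvers.json', 'tagged_resolvers.json',
                  'tagged_answers.json')


def partition_satellite_input(element, num_partitions):
  """Routes a (filename, line) pair to the tag, blockpage or row partition."""
  assert num_partitions == NUM_SATELLITE_INPUT_PARTITIONS
  filename, _ = element
  if filename.endswith(_TAG_FILENAMES):
    return 0  # tags
  if filename.endswith('blockpages.json'):
    return 1  # blockpages
  return 2  # rows (e.g. interference.json)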
def model_multiple_pcollections_partition(contents, output_path):
  """Splitting a PCollection with Partition."""

  def get_percentile(i):
    """Assume i in [0,100)."""
    return i

  import apache_beam as beam
  with TestPipeline() as p:  # Use TestPipeline for testing.
    students = p | beam.Create(contents)

    # [START model_multiple_pcollections_partition]
    def partition_fn(student, num_partitions):
      return int(get_percentile(student) * num_partitions / 100)

    by_decile = students | beam.Partition(partition_fn, 10)
    # [END model_multiple_pcollections_partition]
    # [START model_multiple_pcollections_partition_40th]
    fortieth_percentile = by_decile[4]
    # [END model_multiple_pcollections_partition_40th]

    ([by_decile[d] for d in range(10) if d != 4] + [fortieth_percentile]
     | beam.Flatten()
     | beam.io.WriteToText(output_path))
def ReadImagesFromDisk(pipeline: beam.Pipeline, base_path: Text) -> beam.pvalue.PCollection: """ The Beam PTransform used to load a collection of images and metadata from a local file system or a remote cloud storage bucket. Args: pipeline (beam.Pipeline): Input beam.Pipeline object coming from a TFX Executor. base_path (Text): Base directory containing images and labels. """ wildcard_qualifier = "*" # ingest all the files from the base path by supplying the wildcard file_pattern = os.path.join(base_path, wildcard_qualifier) allowed_ext = [".jpg", ".json", ".png", ".txt", ".jpeg"] images, label_file = ( pipeline | fileio.MatchFiles(file_pattern) | fileio.ReadMatches() | beam.Map(read_file_content) | "FilterOutFiles" >> beam.Filter(lambda x: x[FILE_EXT] in allowed_ext) | "SplitLabelFile" >> beam.Partition(SplitByFileName, 2)) # label_file is actually a dict label_dict = beam.pvalue.AsSingleton(label_file) ready_images = ( images | "AddLabelAndMetadata" >> beam.Map(add_label_and_metadata, label_dict)) return ready_images
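# `read_file_content`, FILE_EXT and `SplitByFileName` live elsewhere in that
# repository. Purely as an illustration, a partition function compatible with
# `beam.Partition(SplitByFileName, 2)` might look like the sketch below;
# treating .json/.txt entries as the label file is an assumption.
FILE_EXT = "file_ext"  # Placeholder key; the real constant is defined upstream.
LABEL_FILE_EXTENSIONS = (".json", ".txt")


def SplitByFileName(element, num_partitions):
  """Returns 1 for label files and 0 for image files (two partitions)."""
  del num_partitions  # Always 2 in the pipeline above.
  return 1 if element[FILE_EXT] in LABEL_FILE_EXTENSIONS else 0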
def partition_lambda(test=None): # pylint: disable=line-too-long, expression-not-assigned # [START partition_lambda] import apache_beam as beam durations = ['annual', 'biennial', 'perennial'] with beam.Pipeline() as pipeline: annuals, biennials, perennials = ( pipeline | 'Gardening plants' >> beam.Create([ {'icon': '🍓', 'name': 'Strawberry', 'duration': 'perennial'}, {'icon': '🥕', 'name': 'Carrot', 'duration': 'biennial'}, {'icon': '🍆', 'name': 'Eggplant', 'duration': 'perennial'}, {'icon': '🍅', 'name': 'Tomato', 'duration': 'annual'}, {'icon': '🥔', 'name': 'Potato', 'duration': 'perennial'}, ]) | 'Partition' >> beam.Partition( lambda plant, num_partitions: durations.index(plant['duration']), len(durations), ) ) annuals | 'Annuals' >> beam.Map(lambda x: print('annual: {}'.format(x))) biennials | 'Biennials' >> beam.Map( lambda x: print('biennial: {}'.format(x))) perennials | 'Perennials' >> beam.Map( lambda x: print('perennial: {}'.format(x))) # [END partition_lambda] # pylint: enable=line-too-long, expression-not-assigned if test: test(annuals, biennials, perennials)
def test_partition_satellite_input(self) -> None: # pylint: disable=no-self-use """Test partitioning of Satellite input into tags, blockpages, and results.""" data = [ ("CP_Satellite-2020-09-02-12-00-01/resolvers.json", "tag"), ("CP_Satellite-2020-09-02-12-00-01/resolvers.json", "tag"), ("CP_Satellite-2020-09-02-12-00-01/tagged_resolvers.json", "tag"), ("CP_Satellite-2020-09-02-12-00-01/tagged_resolvers.json", "tag"), ("CP_Satellite-2020-09-02-12-00-01/tagged_answers.json", "tag"), ("CP_Satellite-2020-09-02-12-00-01/tagged_answers.json", "tag"), ("CP_Satellite-2021-09-02-12-00-01/blockpages.json", "blockpage"), ("CP_Satellite-2020-09-02-12-00-01/interference.json", "row"), ("CP_Satellite-2020-09-02-12-00-01/interference.json", "row") ] expected_tags = data[0:6] expected_blockpages = data[6:7] expected_rows = data[7:] with TestPipeline() as p: lines = p | 'create data' >> beam.Create(data) tags, blockpages, rows = lines | beam.Partition( satellite.partition_satellite_input, 3) beam_test_util.assert_that(tags, beam_test_util.equal_to(expected_tags), label='assert_that/tags') beam_test_util.assert_that( blockpages, beam_test_util.equal_to(expected_blockpages), label='assert_that/blockpages') beam_test_util.assert_that(rows, beam_test_util.equal_to(expected_rows), label='assert_that/rows')
def _generate_random_train_eval_examples_from_bq(base_query, base_sample_rate, train_samples_fraction, train_output_dir, eval_output_dir, pipeline_options): sampling_query = f''' SELECT * FROM ( SELECT *, (ABS(FARM_FINGERPRINT(row_id)) / 0x7FFFFFFFFFFFFFFF) AS selection_chance FROM ({base_query}) ) WHERE selection_chance < {base_sample_rate} ''' row_to_tf_example_converter = BigQueryToTFExampleConverter(sampling_query) def train_eval_partition_fn(row, n_partitions): return int(row['selection_chance'] > base_sample_rate * train_samples_fraction) bookkeeping_columns = ['row_id', 'selection_chance'] with beam.Pipeline(options=pipeline_options) as pipeline: all_samples = pipeline | 'QueryTable' >> beam.io.Read( beam.io.BigQuerySource(query=sampling_query, use_standard_sql=True)) train_samples, eval_samples = all_samples | 'TrainEvalPartition' >> beam.Partition( train_eval_partition_fn, 2) train_samples | "WriteTrainDataset" >> WriteBigQueryRowsToTFRecord( row_to_tf_example_converter, train_output_dir, bookkeeping_columns) eval_samples | "WriteEvalDataset" >> WriteBigQueryRowsToTFRecord( row_to_tf_example_converter, eval_output_dir, bookkeeping_columns)
def partition_multiple_arguments(test=None): # pylint: disable=expression-not-assigned # [START partition_multiple_arguments] import apache_beam as beam import json def split_dataset(plant, num_partitions, ratio): assert num_partitions == len(ratio) bucket = sum(map(ord, json.dumps(plant))) % sum(ratio) total = 0 for i, part in enumerate(ratio): total += part if bucket < total: return i return len(ratio) - 1 with beam.Pipeline() as pipeline: train_dataset, test_dataset = ( pipeline | 'Gardening plants' >> beam.Create([ {'icon': '🍓', 'name': 'Strawberry', 'duration': 'perennial'}, {'icon': '🥕', 'name': 'Carrot', 'duration': 'biennial'}, {'icon': '🍆', 'name': 'Eggplant', 'duration': 'perennial'}, {'icon': '🍅', 'name': 'Tomato', 'duration': 'annual'}, {'icon': '🥔', 'name': 'Potato', 'duration': 'perennial'}, ]) | 'Partition' >> beam.Partition(split_dataset, 2, ratio=[8, 2]) ) train_dataset | 'Train' >> beam.Map(lambda x: print('train: {}'.format(x))) test_dataset | 'Test' >> beam.Map(lambda x: print('test: {}'.format(x))) # [END partition_multiple_arguments] # pylint: enable=expression-not-assigned if test: test(train_dataset, test_dataset)
def run(argv=None, save_main_session=True): parser = argparse.ArgumentParser() parser.add_argument('--input', dest='input', required=True) parser.add_argument('--output', dest='output', required=True) args, pipeline_args = parser.parse_known_args(argv) pipeline_options = PipelineOptions() pipeline_options.view_as( SetupOptions).save_main_session = save_main_session p = beam.Pipeline(options=pipeline_options) # Input: '\t'.join((timestamp, ip, agent, url, referer)) lines = p | 'Read' >> ReadFromText(args.input) requests = lines | 'Split' >> beam.Map(lambda l: l.split('\t')) shards = requests | 'Shard' >> beam.Partition(shard, NUM_SHARD) for i in range(NUM_SHARD): shards[i] | f'Write_{i}' >> WriteToText(f'{args.output}_shard{i}', num_shards=1) result = p.run() result.wait_until_finish() if (not hasattr(result, 'has_job') # direct runner or result.has_job): # not just a template creation query_result = result.metrics().query() for result in query_result['counters']: logging.info(f'{result.key.metric.name}: {result.result}')
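# `shard` and NUM_SHARD are module-level definitions not shown above. A
# hypothetical version that buckets requests by a stable hash of the client IP
# (field 1 of the tab-split record) could look like this; the constant value
# and the choice of key are illustrative assumptions.
import zlib

NUM_SHARD = 4  # Illustrative value only.


def shard(request, num_partitions):
  """Buckets a (timestamp, ip, agent, url, referer) record by IP hash."""
  ip = request[1]
  # zlib.crc32 is stable across processes, unlike the built-in hash().
  return zlib.crc32(ip.encode('utf-8')) % num_partitions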
def model_multiple_pcollections_flatten(contents, output_path):
  """Merging a PCollection with Flatten."""
  some_hash_fn = lambda s: ord(s[0])
  partition_fn = lambda element, partitions: some_hash_fn(element) % partitions
  import apache_beam as beam
  with TestPipeline() as p:  # Use TestPipeline for testing.
    # Partition the input into three PCollections.
    partitioned = p | beam.Create(contents) | beam.Partition(partition_fn, 3)
    pcoll1 = partitioned[0]
    pcoll2 = partitioned[1]
    pcoll3 = partitioned[2]

    # Flatten them back into one PCollection.
    # A collection of PCollection objects can be represented simply
    # as a tuple (or list) of PCollections.
    # (The SDK for Python has no separate type to store multiple
    # PCollection objects, whether containing the same or different
    # types.)
    # [START model_multiple_pcollections_flatten]
    merged = (
        (pcoll1, pcoll2, pcoll3)
        # A list of tuples can be "piped" directly into a Flatten transform.
        | beam.Flatten())
    # [END model_multiple_pcollections_flatten]
    merged | beam.io.WriteToText(output_path)
def testEachPTransformCopiedOnce(self): with beam.Pipeline() as p: created = p | 'Create1' >> beam.Create([(1, 'a'), (2, 'b')]) modified1 = (created | 'Transform1' >> beam.Map( lambda x: DeepCopyTest._CountingIdentityFn( 'Transform1', x))) partition_fn = lambda element, partitions: element[0] % partitions p1, p2 = (modified1 | 'Partition' >> beam.Partition(partition_fn, 2)) merged = (p1, p2) | 'Flatten1' >> beam.Flatten() modified2 = (merged | 'Transform2' >> beam.Map( lambda x: DeepCopyTest._CountingIdentityFn( 'Transform2', x))) copied = deep_copy.deep_copy(modified2) # Check that deep copy was performed. self.assertIsNot(copied.producer.inputs[0], modified2.producer.inputs[0]) self.assertIsNot(copied.producer.inputs[0].producer.inputs[0], modified2.producer.inputs[0].producer.inputs[0]) self.assertIsNot(copied.producer.inputs[0].producer.inputs[1], modified2.producer.inputs[0].producer.inputs[1]) # Check counts of processed items. self.assertEqual(DeepCopyTest._counts['Transform1'], 4) self.assertEqual(DeepCopyTest._counts['Transform2'], 4)
def configure_pipeline(p, dataset_train, dataset_eval, checkpoint_path, output_dir, job_id): source_train = _util.get_sources_from_dataset(p, dataset_train, 'train') labels_source = [source_train] if dataset_eval is not None: source_eval = _util.get_sources_from_dataset(p, dataset_eval, 'eval') labels_source.append(source_eval) labels = _labels_pipeline(labels_source) train_preprocessed = _transformation_pipeline(source_train, checkpoint_path, labels, 'train') if dataset_eval is not None: # explicit eval data. eval_preprocessed = _transformation_pipeline(source_eval, checkpoint_path, labels, 'eval') else: # Split train/eval. train_preprocessed, eval_preprocessed = (train_preprocessed | 'Random Partition' >> beam.Partition(TrainEvalSplitPartitionFn(), 2)) output_train_path = os.path.join(output_dir, job_id, 'train') output_eval_path = os.path.join(output_dir, job_id, 'eval') labels_file = os.path.join(output_dir, job_id, 'labels') labels_save = (labels | 'Write labels' >> beam.io.textio.WriteToText(labels_file, shard_name_template='')) train_save = train_preprocessed | 'Save train to disk' >> SaveFeatures(output_train_path) eval_save = eval_preprocessed | 'Save eval to disk' >> SaveFeatures(output_eval_path) # Make sure we write "latest" file after train and eval data are successfully written. output_latest_file = os.path.join(output_dir, 'latest') ([eval_save, train_save, labels_save] | 'Wait for train eval saving' >> beam.Flatten() | 'Fixed One' >> beam.transforms.combiners.Sample.FixedSizeGlobally(1) | beam.Map(lambda path: job_id) | 'WriteLatest' >> beam.io.textio.WriteToText(output_latest_file, shard_name_template=''))
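# `TrainEvalSplitPartitionFn` comes from the surrounding package. For context,
# beam.Partition also accepts an instance of a beam.PartitionFn subclass; a
# hypothetical random 80/20 train/eval splitter written in that style (not the
# project's actual implementation) could be:
import random

import apache_beam as beam


class TrainEvalSplitPartitionFn(beam.PartitionFn):
  """Randomly routes elements to partition 0 (train) or 1 (eval)."""

  def __init__(self, train_fraction=0.8):
    super().__init__()
    self._train_fraction = train_fraction

  def partition_for(self, element, num_partitions, *args, **kwargs):
    del element, args, kwargs  # The split is purely random.
    assert num_partitions == 2
    return 0 if random.random() < self._train_fraction else 1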
def expand(self, input_models): """Transforms a PCollection of models into validation errors. Args: input_models: beam.PCollection. A collection of models. Returns: beam.PCollection. A collection of errors represented as key-value pairs. """ existing_models, deleted_models = ( input_models | 'Split by deleted' >> beam.Partition( lambda model, unused_num_partitions: int(model.deleted), 2)) deletion_errors = ( deleted_models | 'Validate deleted models' >> beam.ParDo(ValidateDeletedModel())) timestamp_errors = ( existing_models | 'Validate timestamps' >> beam.ParDo(ValidateModelTimestamps())) id_errors = (existing_models | 'Validate id' >> beam.ParDo(ValidateModelIdWithRegex(), self.get_model_id_regex())) error_pcolls = (deletion_errors, timestamp_errors, id_errors) return error_pcolls | beam.Flatten()
def main():
  data = [
      ('a', 1, 100),
      ('b', 2, 100),
      ('c', 1, 100),
      ('d', 2, 100),
      ('e', 1, 100),
      ('f', 1, 100),
      ('g', 1, 100),
      ('h', 1, 100),
      ('i', 1, 100),
  ]

  with beam.Pipeline(options=pipeline_options.PipelineOptions()) as p:
    students = p | 'create_data1' >> beam.Create(data)

    def partition_fn(student, num_partitions):
      print(student)
      return ord(student[0]) % num_partitions

    # beam.Partition returns a tuple of PCollections, one per partition.
    by_decile = students | 'by_decile' >> beam.Partition(partition_fn, 5)

    path_output = os.path.join(PATH_TO_THIS_DIR, 'by_decile.txt')
    # Flatten the partitions back into a single PCollection before writing;
    # WriteToText expects one PCollection, not a tuple of them.
    (by_decile
     | 'flatten' >> beam.Flatten()
     | 'write_flatten' >> beam.io.WriteToText(path_output,
                                              file_name_suffix='.csv'))
def run_pipeline(pipeline_args, known_args): """Splits images into separate directories using thresholds on randnum. Args: pipeline_args: arguments ingested by beam pipeline known_args: additional arguments for this project, such as the storage bucket, source_image_dir, and dest_image_dir. Returns: [nothing] - runs beam pipeline and copies output files to different dirs """ # Specify pipeline options pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True # Attach bucket prefix if running on cloud source_images_pattern = known_args.source_image_dir + '/*' dest_prefix = known_args.dest_image_dir + '/' if known_args.cloud: source_images_pattern = ('gs://' + known_args.storage_bucket + '/' + source_images_pattern) dest_prefix = ('gs://' + known_args.storage_bucket + '/' + dest_prefix) # Get output directories for split images split_names = known_args.split_names split_fractions = known_args.split_fractions dest_images_dirs = [dest_prefix + x + '/' for x in split_names] # Create output directories if they do not already exist (for local runs) for dest_images_dir in dest_images_dirs: if not FileSystems.exists(dest_images_dir): FileSystems.mkdirs(dest_images_dir) # Log information on source, destination, and split fractions split_log_list = [ x[0] + '(' + str(x[1]) + ')' for x in zip(split_names, split_fractions) ] logging.info('Starting ' + ' | '.join(split_log_list) + ' split from images with source file pattern ' + source_images_pattern) logging.info('Destination parent directory: ' + dest_prefix) with beam.Pipeline(options=pipeline_options) as p: # Read files and partition pipelines split_pipelines = ( p | 'read_images' >> beam.io.Read( LabeledImageFileReader(source_images_pattern)) | 'split_images' >> beam.Partition( generate_split_fn(split_fractions), len(split_fractions))) # Write each pipeline to a corresponding output directory for partition, split_name_and_dest_dir in enumerate( zip(split_names, dest_images_dirs)): _ = (split_pipelines[partition] | 'write_' + split_name_and_dest_dir[0] >> beam.Map( write_to_directory, dst_dir=split_name_and_dest_dir[1])) logging.info('Done splitting image sets')
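# `generate_split_fn` is defined elsewhere in that project. The docstring
# above ("thresholds on randnum") suggests a factory that closes over
# cumulative split fractions; a hypothetical version, assuming each element
# exposes a `randnum` value in [0, 1), might be:
def generate_split_fn(split_fractions):
  """Returns a partition function thresholding an element's randnum."""
  cumulative_thresholds = []
  total = 0.0
  for fraction in split_fractions:
    total += fraction
    cumulative_thresholds.append(total)

  def split_fn(element, num_partitions):
    assert num_partitions == len(cumulative_thresholds)
    for index, threshold in enumerate(cumulative_thresholds):
      if element.randnum < threshold:
        return index
    return len(cumulative_thresholds) - 1

  return split_fn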
def expand(self, pvalue): frames = (pvalue.pipeline | Read(self._source) | beam.Partition(splitBadFiles, 2)) chunks = (frames[1] | beam.FlatMap(lambda e: [e]) | beam.CombinePerKey(combineTZ())) return chunks
def configure_pipeline(p, opt): # Type: (apache_beam.Pipeline, apache_beam.PipelineOptions) -> None """Specify PCollection and transformations in pipeline.""" # Create a map of study_uid to label. study_uid_to_label = tcia_utils.GetStudyUIDToLabelMap() # Create a map of study_uid -> path of images in GCS study_uid_to_image_path = _get_study_uid_to_image_path_map(opt.input_path) # Create a map of study_uid -> (GCS path, label) # Split dataset into training, validation and test. paths_and_labels = [] dataset_size = len(study_uid_to_label) training_size = dataset_size * (100 - opt.testing_percentage - opt.validation_percentage) / 100 validation_size = dataset_size * opt.validation_percentage / 100 testing_size = dataset_size * opt.testing_percentage / 100 logging.info('Number of images in training dataset: %s', training_size) logging.info('Number of images in validation dataset: %s', validation_size) logging.info('Number of images in testing dataset: %s', testing_size) count = 0 for k, v in study_uid_to_label.items(): if k not in study_uid_to_image_path: logging.warning('Could not find image with study_uid %s in GCS', k) continue if count < training_size: dataset = constants.TRAINING_DATASET elif count >= training_size and count < (training_size + validation_size): dataset = constants.VALIDATION_DATASET else: dataset = constants.TESTING_DATASET count += 1 paths_and_labels.append((dataset, study_uid_to_image_path[k], v)) # Shuffle the input random.shuffle(paths_and_labels) parts = (p | 'Download Labels' >> beam.Create(paths_and_labels) | 'Preprocess Image' >> beam.ParDo(PreprocessImage()) | 'Split into Training-Validation-Testing' >> beam.Partition( _partition_fn, 3)) # Branch into workflows that serialize training/validation/testing TFRecords. for idx, path_suffix in enumerate([ constants.TRAINING_DATASET, constants.VALIDATION_DATASET, constants.TESTING_DATASET ]): _ = (parts[idx] | 'Serialize TFRecord ' + path_suffix >> beam.Map(lambda x: x.SerializeToString()) | 'Save TFRecord to GCS ' + path_suffix >> beam.io.WriteToTFRecord( os.path.join(opt.output_path, path_suffix), file_name_suffix='.tfrecord'))
def main(argv=None): '''Run Preprocessing as a Dataflow pipeline.''' args = parse_arguments(sys.argv if argv is None else argv) if args.cloud: logging.info('Start running in the cloud') options = { 'runner': 'DataflowRunner', 'job_name': ('mlengine-boilerplate-{}'.format( datetime.datetime.now().strftime('%Y%m%d%H%M%S'))), 'staging_location': os.path.join(args.output_dir, 'staging'), 'temp_location': os.path.join(args.output_dir, 'tmp'), 'project': args.project_id, 'zone': 'europe-west1-d', 'autoscaling_algorithm': 'THROUGHPUT_BASED', 'save_main_session': True, 'setup_file': './setup.py', } pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options) print(pipeline_options) else: pipeline_options = None train_coder = coders.ExampleProtoCoder(schema) p = beam.Pipeline(options=pipeline_options) examples = (p | 'ReadData' >> beam.io.ReadFromText(DATA_DIR + '/*', skip_header_lines=1) | 'buildExamples' >> beam.FlatMap(lambda raw_input: buildExample(raw_input))) examples_split = examples | beam.Partition(partition_fn, 3) example_dict = { 'train': examples_split[0], 'validation': examples_split[1], 'test': examples_split[2] } for part, examples in example_dict.items(): _ = examples | part + '_writeExamples' >> tfrecordio.WriteToTFRecord( file_path_prefix=os.path.join(args.output_dir, part + '_examples'), compression_type=filesystem.CompressionTypes.GZIP, coder=train_coder, file_name_suffix='.gz') p.run()
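# `partition_fn` above is defined elsewhere in that boilerplate (the same name
# appears in the flower-photos pipeline that follows). A plausible stand-in,
# in the spirit of the random split helpers later in this collection, with
# assumed 80/10/10 train/validation/test fractions:
import random


def partition_fn(example, num_partitions):
  """Randomly assigns an example to train (0), validation (1) or test (2)."""
  del example  # The split ignores the element contents.
  assert num_partitions == 3
  r = random.random()
  if r < 0.8:
    return 0
  if r < 0.9:
    return 1
  return 2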
def main(argv=None): """Run preprocessing as a Dataflow pipeline. Args: argv (list): list of arguments """ logging.info('running main') args = parse_arguments(sys.argv if argv is None else argv) if args.cloud: pipeline_options = get_cloud_pipeline_options(args.project_id, args.output_dir) else: pipeline_options = None pipeline = beam.Pipeline(options=pipeline_options) all_labels = (pipeline | 'ReadDictionary' >> beam.io.ReadFromText( 'gs://cloud-ml-data/img/flower_photos/dict.txt', strip_trailing_newlines=True)) examples = (pipeline | 'ReadData' >> beam.io.ReadFromText( 'gs://cloud-ml-data/img/flower_photos/train_set.csv', strip_trailing_newlines=True) | 'Split' >> beam.FlatMap(select_files) | 'OneHotEncoding' >> beam.FlatMap( one_hot_encoding, beam.pvalue.AsIter(all_labels)) | 'ReadImage' >> beam.FlatMap(process_image) | 'BuildExamples' >> beam.FlatMap(build_example)) examples_split = examples | beam.Partition(partition_fn, 3) example_dict = { 'train': examples_split[0], 'validation': examples_split[1], 'test': examples_split[2] } train_coder = coders.ExampleProtoCoder(schema) for part, examples in example_dict.items(): examples | part + '_writeExamples' >> \ beam.io.tfrecordio.WriteToTFRecord( file_path_prefix=os.path.join( args.output_dir, part + '_examples'), compression_type=beam.io.filesystem.CompressionTypes.GZIP, coder=train_coder, file_name_suffix='.tfrecord.gz') logging.info('running pipeline') pipeline.run().wait_until_finish()
def expand(self, pcoll):
  table_spec = bigquery.TableReference(
      projectId='iotpubsub-1536350750202',
      datasetId='baybenames',
      # tableId='relation_extraction_data'
      tableId='relation_data_sample')
  return (
      pcoll
      | 'Read input table' >> beam.io.Read(beam.io.BigQuerySource(table_spec))
      | 'Split words' >> beam.ParDo(SplitSentence_Updated_Table())
      | 'Split test and training data' >> beam.Partition(
          lambda element, _: 0 if randint(0, 100) < 80 else 1, 2))
def split_data(examples, train_fraction, eval_fraction): """Splits the data into train/eval/test.""" def partition_fn(data, n_partition): random_value = random.random() if random_value < train_fraction: return 0 if random_value < train_fraction + eval_fraction: return 1 return 2 examples_split = (examples | 'SplitData' >> beam.Partition(partition_fn, 3)) return examples_split
def partition_lambda(test=None): # [START partition_lambda] import apache_beam as beam durations = ['annual', 'biennial', 'perennial'] with beam.Pipeline() as pipeline: annuals, biennials, perennials = ( pipeline | 'Gardening plants' >> beam.Create([ { 'icon': '🍓', 'name': 'Strawberry', 'duration': 'perennial' }, { 'icon': '🥕', 'name': 'Carrot', 'duration': 'biennial' }, { 'icon': '🍆', 'name': 'Eggplant', 'duration': 'perennial' }, { 'icon': '🍅', 'name': 'Tomato', 'duration': 'annual' }, { 'icon': '🥔', 'name': 'Potato', 'duration': 'perennial' }, ]) | 'Partition' >> beam.Partition( lambda plant, num_partitions: durations.index(plant['duration'] ), len(durations), )) _ = (annuals | 'Annuals' >> beam.Map(lambda x: print('annual: ' + str(x)))) _ = (biennials | 'Biennials' >> beam.Map(lambda x: print('biennial: ' + str(x)))) _ = (perennials | 'Perennials' >> beam.Map(lambda x: print('perennial: ' + str(x)))) # [END partition_lambda] if test: test(annuals, biennials, perennials)
def preprocess(p, args): """Run preprocessing as pipeline.""" train_eval_schema = _make_input_schema() train_eval_metadata = dataset_metadata.DatasetMetadata( schema=train_eval_schema) _ = (train_eval_metadata | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(os.path.join( args.output_dir, constants.RAW_METADATA_DIR), pipeline=p)) train_eval_data = (p | 'ReadDataFromBQ' >> beam.io.Read( beam.io.BigQuerySource(query=_get_query('bigquery-public-data', 'samples', 'gsod'), use_standard_sql=True))) train_eval_data = train_eval_data | 'ValidateData' >> beam.ParDo( DataValidator()) (transformed_train_eval_data, transformed_train_eval_metadata), transform_fn = ( (train_eval_data, train_eval_metadata) | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset( get_preprocessing_fn())) _ = (transform_fn | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(args.output_dir)) transformed_train_eval_coder = coders.ExampleProtoCoder( transformed_train_eval_metadata.schema) transformed_train_data, transformed_eval_data = ( transformed_train_eval_data | 'Partition' >> beam.Partition(get_partition_fn(0.7), 2)) (transformed_train_data | 'SerializeTrainExamples' >> beam.Map(transformed_train_eval_coder.encode) | 'WriteTraining' >> beam.io.WriteToTFRecord(os.path.join( args.output_dir, constants.TRANSFORMED_TRAIN_DATA_FILE_PREFIX), file_name_suffix=constants.DATA_FILE_SUFFIX)) (transformed_eval_data | 'SerializeEvalExamples' >> beam.Map(transformed_train_eval_coder.encode) | 'WriteEval' >> beam.io.WriteToTFRecord(os.path.join( args.output_dir, constants.TRANSFORMED_EVAL_DATA_FILE_PREFIX), file_name_suffix=constants.DATA_FILE_SUFFIX))
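# `get_partition_fn` lives elsewhere in that codebase. A minimal sketch of a
# factory matching `beam.Partition(get_partition_fn(0.7), 2)` (roughly 70%
# train, 30% eval); the random split is an assumption for illustration:
import random


def get_partition_fn(train_fraction):
  """Returns a 2-way partition function: 0 for train, 1 for eval."""

  def partition_fn(element, num_partitions):
    del element
    assert num_partitions == 2
    return 0 if random.random() < train_fraction else 1

  return partition_fn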
def Do(self, input_dict, output_dict, exec_properties): """Take BigQuery sql and generates train and eval tf examples. Args: input_dict: Input dict from input key to a list of Artifacts. output_dict: Output dict from output key to a list of Artifacts. - examples: train and eval split of tf examples. exec_properties: A dict of execution properties. - query: BigQuery sql string. Returns: None Raises: RuntimeError: if query is missing in exec_properties. """ self._log_startup(input_dict, output_dict, exec_properties) training_tfrecord = types.get_split_uri(output_dict['examples'], 'train') eval_tfrecord = types.get_split_uri(output_dict['examples'], 'eval') if 'query' not in exec_properties: raise RuntimeError('Missing query.') query = exec_properties['query'] tf.logging.info('Generating examples from BigQuery.') with beam.Pipeline(argv=self._get_beam_pipeline_args()) as pipeline: converter = _BigQueryConverter(query) example_splits = ( pipeline | 'QueryTable' >> self._big_query_ptransform(query) | 'ToSerializedTFExample' >> beam.Map( converter.row_to_serialized_example) | 'SplitData' >> beam.Partition(_partition_fn, 2)) # TODO(jyzhao): make shuffle optional. # pylint: disable=expression-not-assigned (example_splits[0] | 'ShuffleTrainSplit' >> beam.transforms.Reshuffle() | 'OutputTrainSplit' >> beam.io.WriteToTFRecord( os.path.join(training_tfrecord, DEFAULT_FILE_NAME), file_name_suffix='.gz')) (example_splits[1] | 'ShuffleEvalSplit' >> beam.transforms.Reshuffle() | 'OutputEvalSplit' >> beam.io.WriteToTFRecord( os.path.join(eval_tfrecord, DEFAULT_FILE_NAME), file_name_suffix='.gz')) # pylint: enable=expression-not-assigned tf.logging.info('Examples generated.')
def Do(self, input_dict, output_dict, exec_properties): """Take input csv data and generates train and eval tf examples. Args: input_dict: Input dict from input key to a list of Artifacts. - input-base: input dir that contains csv data. csv files must have header line. output_dict: Output dict from output key to a list of Artifacts. - examples: train and eval split of tf examples. exec_properties: A dict of execution properties. Returns: None """ self._log_startup(input_dict, output_dict, exec_properties) training_tfrecord = types.get_split_uri(output_dict['examples'], 'train') eval_tfrecord = types.get_split_uri(output_dict['examples'], 'eval') input_base = types.get_single_instance(input_dict['input-base']) input_base_uri = input_base.uri tf.logging.info('Generating examples.') raw_data = io_utils.get_only_uri_in_dir(input_base_uri) tf.logging.info('No split {}.'.format(raw_data)) with beam.Pipeline(argv=self._get_beam_pipeline_args()) as pipeline: example_splits = ( pipeline # pylint: disable=no-value-for-parameter | 'CsvToSerializedExample' >> _CsvToSerializedExample(raw_data) | 'SplitData' >> beam.Partition(_partition_fn, 2)) # TODO(jyzhao): make shuffle optional. # pylint: disable=expression-not-assigned (example_splits[0] | 'ShuffleTrainSplit' >> beam.transforms.Reshuffle() | 'OutputTrainSplit' >> beam.io.WriteToTFRecord( os.path.join(training_tfrecord, DEFAULT_FILE_NAME), file_name_suffix='.gz')) (example_splits[1] | 'ShuffleEvalSplit' >> beam.transforms.Reshuffle() | 'OutputEvalSplit' >> beam.io.WriteToTFRecord( os.path.join(eval_tfrecord, DEFAULT_FILE_NAME), file_name_suffix='.gz')) # pylint: enable=expression-not-assigned tf.logging.info('Examples generated.')
def test_partition(self): p = TestPipeline() even, odd = (p | beam.Create([1, 2, 3]) | 'even_odd' >> beam.Partition(lambda e, _: e % 2, 2)) self.assertIsNotNone(even.element_type) self.assertIsNotNone(odd.element_type) res_even = (even | 'id_even' >> beam.ParDo(lambda e: [e]).with_input_types(int)) res_odd = (odd | 'id_odd' >> beam.ParDo(lambda e: [e]).with_input_types(int)) assert_that(res_even, equal_to([2]), label='even_check') assert_that(res_odd, equal_to([1, 3]), label='odd_check') p.run()
def run( raw_data_dir: str, raw_labels_dir: str, train_data_dir: str, eval_data_dir: str, train_eval_split: List[int], **beam_args: Any, ) -> str: labels = pd.concat([ data_utils.read_labels(filename) for filename in tf.io.gfile.glob(f"{raw_labels_dir}/*.csv") ]).sort_values(by="start_time") beam_options = PipelineOptions(flags=[], **beam_args) pipeline = beam.Pipeline(options=beam_options) training_data, evaluation_data = ( pipeline | "Data files" >> beam.Create([f"{raw_data_dir}/*.npz"]) | "Expand pattern" >> beam.FlatMap(tf.io.gfile.glob) | "Reshuffle files" >> beam.Reshuffle() | "Read data" >> beam.Map(data_utils.read_data) | "Label data" >> beam.Map(data_utils.label_data, labels) | "Get training points" >> beam.FlatMap( data_utils.generate_training_points) | "Serialize TFRecords" >> beam.Map(trainer.serialize) | "Train-eval split" >> beam.Partition( lambda x, n: random.choices([0, 1], train_eval_split)[0], 2)) (training_data | "Write train files" >> beam.io.WriteToTFRecord( f"{train_data_dir}/part", file_name_suffix=".tfrecords.gz", compression_type=beam.io.filesystems.CompressionTypes.GZIP, )) (evaluation_data | "Write eval files" >> beam.io.WriteToTFRecord( f"{eval_data_dir}/part", file_name_suffix=".tfrecords.gz", compression_type=beam.io.filesystems.CompressionTypes.GZIP, )) result = pipeline.run() logging.info(result) try: return result._job.id except Exception: return beam_args.get("job_name")
def _split_data(examples, train_fraction=constants.TRAIN_SIZE, val_fraction=constants.VAL_SIZE): """Splits the data into train/validation/test.""" def partition_fn(*_): random_value = np.random.random() if random_value < train_fraction: return 0 if random_value < train_fraction + val_fraction: return 1 return 2 examples_split = examples | "SplitData" >> beam.Partition(partition_fn, 3) return zip([constants.TRAIN, constants.VAL, constants.TEST], examples_split)