def test_end2end_auto_compression_unsharded(self):
    file_path_prefix = os.path.join(self._new_tempdir(), 'result')

    # Generate a TFRecord file.
    with beam.Pipeline(DirectRunner()) as p:
        expected_data = [self.create_inputs() for _ in range(0, 10)]
        _ = p | beam.Create(expected_data) | WriteToTFRecord(
            file_path_prefix + '.gz', shard_name_template='')

    # Read the file back and compare.
    with beam.Pipeline(DirectRunner()) as p:
        actual_data = p | ReadFromTFRecord(file_path_prefix + '.gz')
        beam.assert_that(actual_data, beam.equal_to(expected_data))
def test(self):
    p = TestPipeline(DirectRunner())
    test_user = {'account': {'id': 1}, 'country': 'Germany'}
    test_account_offer = {
        'account_id': 1,
        'account_offer_id': 2,
        'offer_id': 3,
    }
    test_offer = {'offer_id': 3, 'offer_name': 'offer name'}

    users = p | "Create users" >> Create([test_user])
    account_offers = p | "Create account offers" >> Create(
        [test_account_offer])
    offers = p | "Create offers" >> Create([test_offer])

    result = {
        'users': users,
        'account_offers': account_offers,
        'offers': offers
    } | OfferStatTransform()
    assert_that(result, self.assertSimple)
    p.run()
def test_process_gzip_auto(self):
    path = os.path.join(self._new_tempdir(), 'result.gz')
    self._write_file_gzip(path, FOO_BAR_RECORD_BASE64)
    with beam.Pipeline(DirectRunner()) as p:
        result = (p
                  | ReadFromTFRecord(
                      path, compression_type=fileio.CompressionTypes.AUTO))
        beam.assert_that(result, beam.equal_to(['foo', 'bar']))
def test_run_api(self):
    my_metric = Metrics.counter('namespace', 'my_metric')
    runner = DirectRunner()
    result = runner.run(
        beam.Create([1, 10, 100]) | beam.Map(lambda x: my_metric.inc(x)))
    result.wait_until_finish()
    # Use counters to assert the pipeline actually ran.
    my_metric_value = result.metrics().query()['counters'][0].committed
    self.assertEqual(my_metric_value, 111)
def test_direct_runner_metrics(self):

    class MyDoFn(beam.DoFn):
        def start_bundle(self):
            count = Metrics.counter(self.__class__, 'bundles')
            count.inc()

        def finish_bundle(self):
            count = Metrics.counter(self.__class__, 'finished_bundles')
            count.inc()

        def process(self, element):
            gauge = Metrics.gauge(self.__class__, 'latest_element')
            gauge.set(element)
            count = Metrics.counter(self.__class__, 'elements')
            count.inc()
            distro = Metrics.distribution(self.__class__, 'element_dist')
            distro.update(element)
            return [element]

    p = Pipeline(DirectRunner())
    pcoll = (p
             | beam.Create([1, 2, 3, 4, 5])
             | 'Do' >> beam.ParDo(MyDoFn()))
    assert_that(pcoll, equal_to([1, 2, 3, 4, 5]))
    result = p.run()
    result.wait_until_finish()
    metrics = result.metrics().query()
    namespace = '{}.{}'.format(MyDoFn.__module__, MyDoFn.__name__)

    hc.assert_that(
        metrics['counters'],
        hc.contains_inanyorder(
            MetricResult(
                MetricKey('Do', MetricName(namespace, 'elements')), 5, 5),
            MetricResult(
                MetricKey('Do', MetricName(namespace, 'bundles')), 1, 1),
            MetricResult(
                MetricKey('Do', MetricName(namespace, 'finished_bundles')),
                1, 1)))
    hc.assert_that(
        metrics['distributions'],
        hc.contains_inanyorder(
            MetricResult(
                MetricKey('Do', MetricName(namespace, 'element_dist')),
                DistributionResult(DistributionData(15, 5, 1, 5)),
                DistributionResult(DistributionData(15, 5, 1, 5)))))

    gauge_result = metrics['gauges'][0]
    hc.assert_that(
        gauge_result.key,
        hc.equal_to(MetricKey('Do', MetricName(namespace, 'latest_element'))))
    hc.assert_that(gauge_result.committed.value, hc.equal_to(5))
    hc.assert_that(gauge_result.attempted.value, hc.equal_to(5))
def test_end2end_example_proto(self):
    file_path_prefix = os.path.join(self._new_tempdir(), 'result')

    example = tf.train.Example()
    example.features.feature['int'].int64_list.value.extend(range(3))
    example.features.feature['bytes'].bytes_list.value.extend(
        [b'foo', b'bar'])

    with beam.Pipeline(DirectRunner()) as p:
        _ = p | beam.Create([example]) | WriteToTFRecord(
            file_path_prefix, coder=beam.coders.ProtoCoder(example.__class__))

    # Read the file back and compare.
    with beam.Pipeline(DirectRunner()) as p:
        actual_data = (p
                       | ReadFromTFRecord(
                           file_path_prefix + '-*',
                           coder=beam.coders.ProtoCoder(example.__class__)))
        beam.assert_that(actual_data, beam.equal_to([example]))
def test_process_gzip(self):
    path = os.path.join(self._new_tempdir(), 'result')
    self._write_file_gzip(path, FOO_BAR_RECORD_BASE64)
    with beam.Pipeline(DirectRunner()) as p:
        result = (p
                  | beam.Read(
                      _TFRecordSource(
                          path,
                          coder=coders.BytesCoder(),
                          compression_type=fileio.CompressionTypes.GZIP)))
        beam.assert_that(result, beam.equal_to(['foo', 'bar']))
def build_graph(self):
    # Move a percentage of the train data to `.PPGRAPH_EXT` files, used for
    # graph building.
    # num_lines = 0
    # for i in range(DATASET_NUM_SHARDS):
    #     _fname = '{}-{:05}-of-{:05}'.format(
    #         self.train_fname_out, i, self.config.DATASET_NUM_SHARDS)
    #     num_lines += sum(1 for _ in open(_fname))
    #     _fname_marked = '{}-{:05}-of-{:05}.{}'.format(
    #         self.train_fname_out, i, self.config.DATASET_NUM_SHARDS,
    #         PPGRAPH_EXT)
    #     shutil.move(_fname, _fname_marked)
    #     if num_lines >= self.config.PPGRAPH_MAX_SAMPLES:
    #         break

    # Set up the preprocessing pipeline for analyzing the dataset. The
    # analyze call is not combined with the transform call because we will
    # parallelize the transform call later. We had the issue that this
    # process runs on a single core and tends to cause OOM issues.
    pipeline = beam.Pipeline(runner=DirectRunner())

    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
        # TODO: maybe use only the train data (or a percentage of it) to
        # build the graph.
        raw_train_data = (
            pipeline
            | 'ReadTrainDataFile' >> textio.ReadFromText(
                'data/features*shard*', skip_header_lines=0)
            | 'DecodeTrainDataCSV' >> MapAndFilterErrors(
                tft_coders.CsvCoder(
                    self.data_formatter.get_ordered_columns(),
                    self.data_formatter.get_raw_data_metadata().schema)
                .decode))

        # Combine data and schema into a dataset tuple. Note that we already
        # used the schema to read the CSV data, but we also need it to
        # interpret raw_train_data. This is where vocabulary, scale_to_0_1,
        # sparse_to_dense, etc. get applied.
        transform_fn = (
            (raw_train_data, self.data_formatter.get_raw_data_metadata())
            | beam_impl.AnalyzeDataset(
                PreprocessingFunction().transform_to_tfrecord))

        # Write the SavedModel and metadata to two subdirectories of the
        # working dir, given by `transform_fn_io.TRANSFORM_FN_DIR` and
        # `transform_fn_io.TRANSFORMED_METADATA_DIR` respectively.
        _ = (transform_fn
             | 'WriteTransformGraph' >>
             transform_fn_io.WriteTransformFn(TARGET_DIR))  # working dir

    # Run the Beam preprocessing pipeline.
    st = time.time()
    result = pipeline.run()
    result.wait_until_finish()
    self.logger.info(
        'Transformation graph built and written in {:.2f} sec'.format(
            time.time() - st))
def test_direct_runner_metrics(self):
    from apache_beam.metrics.metric import Metrics

    class MyDoFn(beam.DoFn):
        def start_bundle(self):
            count = Metrics.counter(self.__class__, 'bundles')
            count.inc()

        def finish_bundle(self):
            count = Metrics.counter(self.__class__, 'finished_bundles')
            count.inc()

        def process(self, element):
            count = Metrics.counter(self.__class__, 'elements')
            count.inc()
            distro = Metrics.distribution(self.__class__, 'element_dist')
            distro.update(element)
            return [element]

    runner = DirectRunner()
    p = Pipeline(runner, options=PipelineOptions(self.default_properties))
    # pylint: disable=expression-not-assigned
    (p
     | ptransform.Create([1, 2, 3, 4, 5])
     | 'Do' >> beam.ParDo(MyDoFn()))
    result = p.run()
    result.wait_until_finish()
    metrics = result.metrics().query()
    namespace = '{}.{}'.format(MyDoFn.__module__, MyDoFn.__name__)

    hc.assert_that(
        metrics['counters'],
        hc.contains_inanyorder(
            MetricResult(
                MetricKey('Do', MetricName(namespace, 'elements')), 5, 5),
            MetricResult(
                MetricKey('Do', MetricName(namespace, 'bundles')), 1, 1),
            MetricResult(
                MetricKey('Do', MetricName(namespace, 'finished_bundles')),
                1, 1)))
    hc.assert_that(
        metrics['distributions'],
        hc.contains_inanyorder(
            MetricResult(
                MetricKey('Do', MetricName(namespace, 'element_dist')),
                DistributionResult(DistributionData(15, 5, 1, 5)),
                DistributionResult(DistributionData(15, 5, 1, 5)))))
def test_write_record_auto(self):
    file_path_prefix = os.path.join(self._new_tempdir(), 'result')
    with beam.Pipeline(DirectRunner()) as p:
        input_data = ['foo', 'bar']
        _ = p | beam.Create(input_data) | WriteToTFRecord(
            file_path_prefix, file_name_suffix='.gz')

    actual = []
    file_name = glob.glob(file_path_prefix + '-*.gz')[0]
    for r in tf.python_io.tf_record_iterator(
            file_name,
            options=tf.python_io.TFRecordOptions(
                tf.python_io.TFRecordCompressionType.GZIP)):
        actual.append(r)
    self.assertEqual(actual, input_data)
def _pipeline_runner():
    with beam.Pipeline(runner=DirectRunner()) as p:
        # Emit `size` keyed elements in batches of 100, advancing the
        # watermark by 100 after each batch.
        ts = TestStream().advance_watermark_to(0)
        all_elements = iter(range(size))
        watermark = 0
        while True:
            next_batch = list(itertools.islice(all_elements, 100))
            if not next_batch:
                break
            ts = ts.add_elements(
                [(i, random.randint(0, 1000)) for i in next_batch])
            watermark = watermark + 100
            ts = ts.advance_watermark_to(watermark)
        ts = ts.advance_watermark_to_infinity()

        input_pc = p | ts | WindowInto(FixedWindows(100))
        for i in range(NUM_PARALLEL_STAGES):
            _build_serial_stages(input_pc, NUM_SERIAL_STAGES, i)
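# `_build_serial_stages`, `size`, `NUM_PARALLEL_STAGES` and `NUM_SERIAL_STAGES`
# are not defined in the snippet above. A minimal sketch of a compatible
# `_build_serial_stages` (hypothetical, inferred from the call site; the
# original may differ): chain `num_serial_stages` pass-through stages onto
# `input_pc`, with labels made unique by the parallel-branch index.
import apache_beam as beam


def _build_serial_stages(input_pc, num_serial_stages, stage_index):
    pc = input_pc
    for i in range(num_serial_stages):
        # An identity Map leaves the data unchanged; each step just gives the
        # runner one more stage to schedule.
        pc = pc | 'Stage_{}_{}'.format(stage_index, i) >> beam.Map(lambda x: x)
    return pc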
def write_to_tfrecord(args):
    """This function is supposed to be called as a script."""
    # Decode arguments.
    (current_index, num_shards, train_split_fname_out, eval_split_fname_out,
     exp_log_data_file_train_tfrecord, exp_log_data_file_eval_tfrecord,
     working_dir, data_formatter_module_path) = args

    # num_shards = "32"
    current_index, num_shards = int(current_index), int(num_shards)

    split_train_file_pattern = '{}-{:05}-of-{:05}'.format(
        train_split_fname_out, current_index, num_shards) + '*'
    split_eval_file_pattern = '{}-{:05}-of-{:05}'.format(
        eval_split_fname_out, current_index, num_shards)

    log.info('exp_log_data_file_train_tfrecord {}'.format(
        exp_log_data_file_train_tfrecord))
    log.info('exp_log_data_file_eval_tfrecord {}'.format(
        exp_log_data_file_eval_tfrecord))
    log.info('split_train_file_pattern {}'.format(split_train_file_pattern))
    log.info('split_eval_file_pattern {}'.format(split_eval_file_pattern))

    data_formatter = import_from_uri(
        data_formatter_module_path).DataFormatter()

    # Set up the preprocessing pipeline.
    pipeline = beam.Pipeline(runner=DirectRunner())
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
        # Read raw data files: CSV format ordered according to the
        # `data_formatter`, then converted into a cleaned-up format.
        raw_train_data = (
            pipeline
            | 'ReadTrainDataFile' >> textio.ReadFromText(
                split_train_file_pattern, skip_header_lines=0)
            | 'DecodeTrainDataCSV' >> MapAndFilterErrors(
                tft_coders.CsvCoder(
                    data_formatter.get_features_and_targets(),
                    data_formatter.get_features_metadata().schema).decode))

        raw_eval_data = (
            pipeline
            | 'ReadEvalDataFile' >> textio.ReadFromText(
                split_eval_file_pattern, skip_header_lines=0)
            | 'DecodeEvalDataCSV' >> MapAndFilterErrors(
                tft_coders.CsvCoder(
                    data_formatter.get_features_and_targets(),
                    data_formatter.get_features_metadata().schema).decode))

        # Examples in tf.Example format (for model analysis purposes).
        # raw_feature_spec = data_formatter.RAW_DATA_METADATA.schema.as_feature_spec()
        # raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
        # coder = example_proto_coder.ExampleProtoCoder(raw_schema)
        #
        # _ = (
        #     raw_eval_data
        #     | 'ToSerializedTFExample' >> beam.Map(coder.encode)
        #     | 'WriteAnalysisTFRecord' >> tfrecordio.WriteToTFRecord(
        #         '{}-{:05}-of-{:05}'.format(analysis_fname_out, i, num_shards),
        #         shard_name_template='', num_shards=1))

        # Read the SavedModel and metadata back from the two subdirectories
        # of working_dir given by `transform_fn_io.TRANSFORM_FN_DIR` and
        # `transform_fn_io.TRANSFORMED_METADATA_DIR` respectively.
        transform_fn = (
            pipeline
            | 'ReadTransformGraph' >>
            transform_fn_io.ReadTransformFn(working_dir))

        # Apply the transformation `transform_fn` to the raw train dataset.
        (transformed_train_data, transformed_metadata) = (
            ((raw_train_data, data_formatter.get_features_metadata()),
             transform_fn)
            | 'TransformTrainData' >> beam_impl.TransformDataset())

        # Apply the transformation `transform_fn` to the raw eval dataset.
        (transformed_eval_data, transformed_metadata) = (
            ((raw_eval_data, data_formatter.get_features_metadata()),
             transform_fn)
            | 'TransformEvalData' >> beam_impl.TransformDataset())

        # The schema of the transformed data is used to build a coder for
        # TFRecord (TF's binary data format): a wrapper function used to
        # encode transformed data as tf.Example protos.
        transformed_data_coder = tft.coders.ExampleProtoCoder(
            transformed_metadata.schema)

        _ = (transformed_train_data
             | 'EncodeTrainDataTransform' >> MapAndFilterErrors(
                 transformed_data_coder.encode)
             | 'WriteTrainDataTFRecord' >> tfrecordio.WriteToTFRecord(
                 '{}-{:05}-of-{:05}'.format(
                     exp_log_data_file_train_tfrecord, current_index,
                     num_shards),
                 shard_name_template='', num_shards=1))

        _ = (transformed_eval_data
             | 'EncodeEvalDataTransform' >> MapAndFilterErrors(
                 transformed_data_coder.encode)
             | 'WriteEvalDataTFRecord' >> tfrecordio.WriteToTFRecord(
                 '{}-{:05}-of-{:05}'.format(
                     exp_log_data_file_eval_tfrecord, current_index,
                     num_shards),
                 shard_name_template='', num_shards=1))

    result = pipeline.run()
    result.wait_until_finish()
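# `MapAndFilterErrors` is used in the two preprocessing snippets above but is
# not defined there. Below is a minimal sketch, modeled on the helper of the
# same name in the tensorflow_transform census example (an assumption, not
# taken from this codebase): apply `fn` to each element and drop elements
# whose decoding raises, counting them in a Beam metric instead of failing
# the whole pipeline.
import apache_beam as beam
from apache_beam.metrics.metric import Metrics


class MapAndFilterErrors(beam.PTransform):
    """Like beam.Map, but silently drops elements where `fn` raises."""

    class _MapAndFilterErrorsDoFn(beam.DoFn):
        def __init__(self, fn):
            self._fn = fn
            self._bad_elements_counter = Metrics.counter(
                'preprocessing', 'bad_elements')

        def process(self, element):
            try:
                yield self._fn(element)
            except Exception:  # pylint: disable=broad-except
                # Count the failure instead of propagating it.
                self._bad_elements_counter.inc(1)

    def __init__(self, fn):
        self._fn = fn

    def expand(self, pcoll):
        return pcoll | beam.ParDo(self._MapAndFilterErrorsDoFn(self._fn))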
def run(argv=None, saveMainSession=False):
    logging.info("____starting____")

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input-raw-str",
        required=False,
        default="THE QUICK BROWN FOX JUMPS OVER THE LAZY DOG")
    my_pipeline_options = MyOptions(input_file="./samples/kinglear.txt")
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)

    try:
        shutil.rmtree(os.getcwd() + "/target", ignore_errors=True)
    except OSError as error:
        logging.error(error)
    else:
        with beam.Pipeline(runner=DirectRunner(),
                           options=pipeline_options) as p:
            """
            logging.debug("All options:\n%s", p.options.get_all_options())
            logging.debug("Known Cmd Args : \n"
                          "\t -input-raw-str: %s", known_args.input_raw_str)
            logging.debug("MyPipelineOptions : \n"
                          "\t -input-file: %s"
                          "\t -output-file: %s",
                          my_pipeline_options.get_all_options()["input_file"],
                          my_pipeline_options.get_all_options()["output_file"])
            """
            lines = p | "ReadInputFile" >> beam.io.ReadFromText(
                my_pipeline_options.input_file)

            """
            # ParDoFn: with DoFn
            lines_len_v1 = (lines
                            | "Mapping with ParDo Fn" >> beam.ParDo(ComputeWordLengthFn())
                            | "Write lines_len_v1" >> beam.io.WriteToText(
                                os.getcwd() + "/target/lines_len_v1.out",
                                file_name_suffix=".txt"))
            lines_len_v2 = (lines
                            | "Mapping with FlatMap Fn" >> beam.FlatMap(lambda word: [len(word)])
                            | "Write lines_len_v2" >> beam.io.WriteToText(
                                os.getcwd() + "/target/lines_len_v2.out",
                                file_name_suffix=".txt"))
            lines_len_v3 = (lines
                            | "Mapping with Map Fn" >> beam.Map(len)
                            | "Write lines_len_v3" >> beam.io.WriteToText(
                                os.getcwd() + "/target/lines_len_v3.out",
                                file_name_suffix=".txt"))

            # Filter
            non_empty_lines = (lines
                               | "Filter empty lines" >> beam.Filter(lambda x: len(x) > 0)
                               | "Write non empty lines" >> beam.io.WriteToText(
                                   os.getcwd() + "/target/non_empty_lines.out",
                                   file_name_suffix=".txt"))
            """

            # GroupByKey
            (lines
             | "Extract" >> beam.ParDo(ExtractWordsFn())
             | "Lower" >> beam.Map(lambda w: w.lower())
             | "PairWithOne" >> beam.Map(lambda w: (w, 1))
             | "GroupByKey" >> beam.GroupByKey()
             | "Count" >> beam.CombineValues(sum)
             | "WriteToFile" >> beam.io.WriteToText(
                 os.getcwd() + "/target/word_count.out",
                 file_name_suffix=".txt"))

            # CoGroupByKey
            emails_list = [
                ('amy', '*****@*****.**'),
                ('carl', '*****@*****.**'),
                ('julia', '*****@*****.**'),
                ('carl', '*****@*****.**'),
            ]
            phones_list = [
                ('amy', '111-222-3333'),
                ('james', '222-333-4444'),
                ('amy', '333-444-5555'),
                ('carl', '444-555-6666'),
            ]
            emails = p | 'CreateEmails' >> beam.Create(emails_list)
            phones = p | 'CreatePhones' >> beam.Create(phones_list)

            joined_result = ({
                "emails": emails,
                "phones": phones
            } | beam.CoGroupByKey())

            def join_person_info(person_infos):
                name, info = person_infos
                emails, phones = info["emails"], info["phones"]
                return f"{name} : {emails} - {phones}"

            (joined_result
             | "Show person info" >> beam.Map(join_person_info)
             | "Write infos to file" >> beam.io.WriteToText(
                 os.getcwd() + "/target/person_info",
                 file_name_suffix=".txt"))

            # CombineGlobally
            student_subjects_marks = [
                ("Joseph", "Maths", 83), ("Joseph", "Physics", 74),
                ("Joseph", "Chemistry", 91), ("Joseph", "Biology", 82),
                ("Jimmy", "Maths", 69), ("Jimmy", "Physics", 62),
                ("Jimmy", "Chemistry", 97), ("Jimmy", "Biology", 80),
                ("Tina", "Maths", 78), ("Tina", "Physics", 73),
                ("Tina", "Chemistry", 68), ("Tina", "Biology", 87),
                ("Thomas", "Maths", 87), ("Thomas", "Physics", 93),
                ("Thomas", "Chemistry", 91), ("Thomas", "Biology", 74),
                ("Cory", "Maths", 56), ("Cory", "Physics", 65),
                ("Cory", "Chemistry", 71), ("Cory", "Biology", 68),
                ("Jackeline", "Maths", 86), ("Jackeline", "Physics", 62),
                ("Jackeline", "Chemistry", 75), ("Jackeline", "Biology", 83),
                ("Juan", "Maths", 63), ("Juan", "Physics", 69),
                ("Juan", "Chemistry", 64), ("Juan", "Biology", 60),
            ]

            def print_row(row, *args):
                print("=" * 100)
                for v in args:
                    print(v)
                print(row)
                print("=" * 100)

            students_results = p | "CreateStudentResult" >> beam.Create(
                student_subjects_marks)
            (students_results
             # with_defaults(): return an empty PCollection if input is empty
             | beam.CombineGlobally(CombineAllMarks()).with_defaults()
             | "Show Result" >> beam.Map(print_row, "GlobalAverage"))

            # CombinePerKey
            (students_results
             | "Group per name" >> beam.Map(
                 lambda tuple: (tuple[0], (tuple[1], tuple[2])))
             | "Compute avg per student" >> beam.CombinePerKey(
                 CombineAllMarks(is_per_key=True))
             | "Show Result Per Key" >> beam.Map(print_row, "AveragePerStudent")
             # | "Write avg marks to file" >> beam.io.WriteToText(
             #     os.getcwd() + "/target/avg_mark_per_student",
             #     file_name_suffix=".txt")
             )

            # Flatten
            joseph_subjects_marks = p | "Create Joseph PCol" >> beam.Create(
                student_subjects_marks[:3])
            juan_subjects_marks = p | "Create Juan PCol" >> beam.Create(
                student_subjects_marks[-4:])
            ((joseph_subjects_marks, juan_subjects_marks)
             | beam.Flatten()
             | "Write Flattened to File" >> beam.io.WriteToText(
                 os.getcwd() + "/target/joseph_and_juan",
                 file_name_suffix=".txt"))

            # Partition
            def partition_fn(student, num_partitions):
                (_, subject, _) = student
                subjects = ('Maths', 'Physics', 'Chemistry', 'Biology')
                return subjects.index(subject)

            all_partitions = students_results | beam.Partition(
                partition_fn, 4)
            (all_partitions['0']
             # | "Show Maths students" >> beam.Map(print_row, "Math Student")
             | "Write Maths students to File" >> beam.io.WriteToText(
                 os.getcwd() + "/target/maths_students",
                 file_name_suffix=".txt"))

            # SideInput
            (lines
             | "SideInput : Extract words" >> beam.ParDo(ExtractWordsFn())
             | "Filter using length" >> beam.ParDo(
                 FilterWordsUsingLength(), lower_bound=2, upper_bound=5)
             | "Write small words" >> beam.io.WriteToText(
                 os.getcwd() + "/target/small_words",
                 file_name_suffix=".txt"))

            # SideOutput
            prefix = 'O'
            outputs = (
                lines
                | "SideOutput : Extract words" >> beam.ParDo(ExtractWordsFn())
                | "SideOutput : Filter using length" >> beam.ParDo(
                    ProcessWordsMultiOutputs(), upper_bound=5,
                    prefix=prefix).with_outputs(
                        'Short_Words', 'Long_Words', main='Start_With'))
            short_words = outputs.Short_Words
            long_words = outputs.Long_Words
            start_with = outputs.Start_With

            short_words | "SideOutput : Write short words" >> beam.io.WriteToText(
                os.getcwd() + "/target/side_output/short_words",
                file_name_suffix=".txt")
            long_words | "SideOutput : Write long words" >> beam.io.WriteToText(
                os.getcwd() + "/target/side_output/long_words",
                file_name_suffix=".txt")
            start_with | "SideOutput : Write words : start with" >> beam.io.WriteToText(
                os.getcwd() + f"/target/side_output/start_with_{prefix}",
                file_name_suffix=".txt")

            # PTransform
            (lines.apply(ComputeWordsTransform())  # <=> lines | ComputeWordsTransform()
             | "PTransform : Write words" >> beam.io.WriteToText(
                 os.getcwd() + "/target/ptransform_words",
                 file_name_suffix=".txt"))