Example #1
0
 def test_run_api(self):
     """Run a transform through ``DirectRunner.run`` and check its counter.

     Each element is added to a counter, so a committed total of 111
     (1 + 10 + 100) shows that the pipeline actually executed.
     """
     element_sum = Metrics.counter('namespace', 'my_metric')
     direct_runner = DirectRunner()
     transform = (
         beam.Create([1, 10, 100])
         | beam.Map(lambda x: element_sum.inc(x)))
     pipeline_result = direct_runner.run(transform)
     pipeline_result.wait_until_finish()
     # Use counters to assert the pipeline actually ran.
     counters = pipeline_result.metrics().query()['counters']
     self.assertEqual(counters[0].committed, 111)
Example #2
0
 def test_run_api(self):
   """Execute a Create/Map chain via ``runner.run`` and verify the counter.

   The first queried counter must report a committed value of 111, the sum
   of the three created elements.
   """
   my_counter = Metrics.counter('namespace', 'my_metric')
   runner = DirectRunner()
   source = beam.Create([1, 10, 100])
   count_elements = beam.Map(lambda value: my_counter.inc(value))
   run_result = runner.run(source | count_elements)
   run_result.wait_until_finish()
   # The counter value is the evidence that the pipeline really ran.
   first_counter = run_result.metrics().query()['counters'][0]
   self.assertEqual(first_counter.committed, 111)
Example #3
0
    def test_end2end_auto_compression_unsharded(self):
        """Round-trip records through a single unsharded ``.gz`` TFRecord file.

        Neither the writer nor the reader is given an explicit compression
        type; only the ``.gz`` suffix of the path is supplied.
        """
        gz_path = os.path.join(self._new_tempdir(), 'result') + '.gz'

        # Write ten generated records to one file (empty shard template).
        with beam.Pipeline(DirectRunner()) as write_pipeline:
            expected_data = [self.create_inputs() for _ in range(10)]
            records = write_pipeline | beam.Create(expected_data)
            _ = records | WriteToTFRecord(gz_path, shard_name_template='')

        # Read the same file back and compare with what was written.
        with beam.Pipeline(DirectRunner()) as read_pipeline:
            actual_data = read_pipeline | ReadFromTFRecord(gz_path)
            beam.assert_that(actual_data, beam.equal_to(expected_data))
    def test(self):
        """Feed one user, one account offer and one offer to ``OfferStatTransform``.

        The three single-element PCollections are passed as a dict keyed by
        ``users``, ``account_offers`` and ``offers``; the output is checked
        with ``self.assertSimple`` through ``assert_that``.
        """
        pipeline = TestPipeline(DirectRunner())

        user_row = {'account': {'id': 1}, 'country': 'Germany'}
        account_offer_row = {
            'account_id': 1,
            'account_offer_id': 2,
            'offer_id': 3,
        }
        offer_row = {'offer_id': 3, 'offer_name': 'offer name'}

        inputs = {
            'users': pipeline | "Create users" >> Create([user_row]),
            'account_offers': (
                pipeline
                | "Create account offers" >> Create([account_offer_row])),
            'offers': pipeline | "Create offers" >> Create([offer_row]),
        }
        result = inputs | OfferStatTransform()

        assert_that(result, self.assertSimple)

        pipeline.run()
Example #5
0
 def test_process_gzip_auto(self):
     """Read a ``.gz`` record file with ``CompressionTypes.AUTO``.

     The file is produced by ``self._write_file_gzip`` and the read must
     yield exactly ``'foo'`` and ``'bar'``.
     """
     gz_path = os.path.join(self._new_tempdir(), 'result.gz')
     self._write_file_gzip(gz_path, FOO_BAR_RECORD_BASE64)
     with beam.Pipeline(DirectRunner()) as pipeline:
         reader = ReadFromTFRecord(
             gz_path, compression_type=fileio.CompressionTypes.AUTO)
         records = pipeline | reader
         beam.assert_that(records, beam.equal_to(['foo', 'bar']))
Example #6
0
  def test_direct_runner_metrics(self):
    """Check counters, a distribution and a gauge after a DirectRunner run.

    ``MyDoFn`` updates metrics in ``start_bundle``, ``process`` and
    ``finish_bundle``. Once the pipeline over ``[1, 2, 3, 4, 5]`` has
    finished, the queried metrics are compared with the expected results.
    """

    class MyDoFn(beam.DoFn):
      def start_bundle(self):
        Metrics.counter(self.__class__, 'bundles').inc()

      def finish_bundle(self):
        Metrics.counter(self.__class__, 'finished_bundles').inc()

      def process(self, element):
        Metrics.gauge(self.__class__, 'latest_element').set(element)
        Metrics.counter(self.__class__, 'elements').inc()
        Metrics.distribution(self.__class__, 'element_dist').update(element)
        return [element]

    pipeline = Pipeline(DirectRunner())
    numbers = pipeline | beam.Create([1, 2, 3, 4, 5])
    pcoll = numbers | 'Do' >> beam.ParDo(MyDoFn())
    assert_that(pcoll, equal_to([1, 2, 3, 4, 5]))
    result = pipeline.run()
    result.wait_until_finish()
    metrics = result.metrics().query()
    namespace = '{}.{}'.format(MyDoFn.__module__, MyDoFn.__name__)

    def do_key(metric_name):
      # Every expected metric is keyed by the 'Do' step and this namespace.
      return MetricKey('Do', MetricName(namespace, metric_name))

    expected_counters = [
        MetricResult(do_key('elements'), 5, 5),
        MetricResult(do_key('bundles'), 1, 1),
        MetricResult(do_key('finished_bundles'), 1, 1),
    ]
    hc.assert_that(
        metrics['counters'],
        hc.contains_inanyorder(*expected_counters))

    expected_distribution = MetricResult(
        do_key('element_dist'),
        DistributionResult(DistributionData(15, 5, 1, 5)),
        DistributionResult(DistributionData(15, 5, 1, 5)))
    hc.assert_that(
        metrics['distributions'],
        hc.contains_inanyorder(expected_distribution))

    gauge_result = metrics['gauges'][0]
    hc.assert_that(gauge_result.key, hc.equal_to(do_key('latest_element')))
    hc.assert_that(gauge_result.committed.value, hc.equal_to(5))
    hc.assert_that(gauge_result.attempted.value, hc.equal_to(5))
Example #7
0
    def test_end2end_example_proto(self):
        """Round-trip a ``tf.train.Example`` through TFRecord with ``ProtoCoder``.

        The example is written under ``file_path_prefix`` and read back from
        the ``file_path_prefix + '-*'`` glob; the result must equal the input.
        """
        file_path_prefix = os.path.join(self._new_tempdir(), 'result')

        example = tf.train.Example()
        features = example.features.feature
        features['int'].int64_list.value.extend(range(3))
        features['bytes'].bytes_list.value.extend([b'foo', b'bar'])
        proto_type = example.__class__

        with beam.Pipeline(DirectRunner()) as write_pipeline:
            writer = WriteToTFRecord(
                file_path_prefix, coder=beam.coders.ProtoCoder(proto_type))
            _ = write_pipeline | beam.Create([example]) | writer

        # Read the file back and compare.
        with beam.Pipeline(DirectRunner()) as read_pipeline:
            reader = ReadFromTFRecord(
                file_path_prefix + '-*',
                coder=beam.coders.ProtoCoder(proto_type))
            actual_data = read_pipeline | reader
            beam.assert_that(actual_data, beam.equal_to([example]))
Example #8
0
 def test_process_gzip(self):
     """Read a record file via ``_TFRecordSource`` with explicit GZIP compression.

     The file is produced by ``self._write_file_gzip`` (its name has no
     ``.gz`` suffix) and the read must yield ``'foo'`` and ``'bar'``.
     """
     record_path = os.path.join(self._new_tempdir(), 'result')
     self._write_file_gzip(record_path, FOO_BAR_RECORD_BASE64)
     with beam.Pipeline(DirectRunner()) as pipeline:
         source = _TFRecordSource(
             record_path,
             coder=coders.BytesCoder(),
             compression_type=fileio.CompressionTypes.GZIP)
         records = pipeline | beam.Read(source)
         beam.assert_that(records, beam.equal_to(['foo', 'bar']))
Example #9
0
    def build_graph(self):
        """Analyze the training CSV data and write the resulting transform graph.

        A DirectRunner pipeline reads the ``data/features*shard*`` files,
        decodes each CSV line with the data formatter's columns and schema,
        runs ``AnalyzeDataset`` and writes the transform function to
        ``TARGET_DIR``. The elapsed run time is logged.
        """
        # NOTE: a previously disabled step moved a capped number of training
        # shards (up to PPGRAPH_MAX_SAMPLES lines) to `PPGRAPH_EXT`-marked
        # files so that only those were used for graph building.

        # Analysis is kept separate from the transform call because the
        # transform is parallelized later; this step runs on a single core
        # and tends to cause OOM issues.
        pipeline = beam.Pipeline(runner=DirectRunner())

        with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
            # todo: maybe only train data (or a percentage of it) should be
            # used to build the graph
            read_lines = textio.ReadFromText(
                'data/features' + '*' + 'shard' + '*', skip_header_lines=0)
            csv_coder = tft_coders.CsvCoder(
                self.data_formatter.get_ordered_columns(),
                self.data_formatter.get_raw_data_metadata().schema)
            raw_train_data = (
                pipeline
                | 'ReadTrainDataFile' >> read_lines
                | 'DecodeTrainDataCSV' >> MapAndFilterErrors(csv_coder.decode))

            # Pair the data with its metadata: the schema was already used to
            # decode the CSV, but the analyzer also needs it to interpret the
            # raw data (vocabulary, scale_to_0_1, sparse_to_dense, ...).
            raw_dataset = (
                raw_train_data, self.data_formatter.get_raw_data_metadata())
            analyzer = beam_impl.AnalyzeDataset(
                PreprocessingFunction().transform_to_tfrecord)
            transform_fn = raw_dataset | analyzer

            # SavedModel and metadata go to two subdirectories of the working
            # dir: `transform_fn_io.TRANSFORM_FN_DIR` and
            # `transform_fn_io.TRANSFORMED_METADATA_DIR`.
            _ = (transform_fn
                 | 'WriteTransformGraph' >>
                 transform_fn_io.WriteTransformFn(TARGET_DIR))

        # Run the Beam preprocessing pipeline and time it.
        started_at = time.time()
        pipeline.run().wait_until_finish()
        elapsed = time.time() - started_at
        self.logger.info(
            'Transformation graph built and written in {:.2f} sec'.format(
                elapsed))
Example #10
0
  def test_direct_runner_metrics(self):
    """Check the counters and distribution reported by a DirectRunner run.

    ``MyDoFn`` counts bundles, finished bundles and elements, and records
    each element in a distribution. After running over ``[1, 2, 3, 4, 5]``
    the queried metrics are compared with the expected results.
    """
    from apache_beam.metrics.metric import Metrics

    class MyDoFn(beam.DoFn):
      def start_bundle(self):
        Metrics.counter(self.__class__, 'bundles').inc()

      def finish_bundle(self):
        Metrics.counter(self.__class__, 'finished_bundles').inc()

      def process(self, element):
        Metrics.counter(self.__class__, 'elements').inc()
        Metrics.distribution(self.__class__, 'element_dist').update(element)
        return [element]

    direct_runner = DirectRunner()
    pipeline = Pipeline(direct_runner,
                        options=PipelineOptions(self.default_properties))
    numbers = pipeline | ptransform.Create([1, 2, 3, 4, 5])
    _ = numbers | 'Do' >> beam.ParDo(MyDoFn())
    result = pipeline.run()
    result.wait_until_finish()
    metrics = result.metrics().query()
    namespace = '{}.{}'.format(MyDoFn.__module__, MyDoFn.__name__)

    def do_key(metric_name):
      # Every expected metric is keyed by the 'Do' step and this namespace.
      return MetricKey('Do', MetricName(namespace, metric_name))

    expected_counters = [
        MetricResult(do_key('elements'), 5, 5),
        MetricResult(do_key('bundles'), 1, 1),
        MetricResult(do_key('finished_bundles'), 1, 1),
    ]
    hc.assert_that(
        metrics['counters'],
        hc.contains_inanyorder(*expected_counters))

    expected_distribution = MetricResult(
        do_key('element_dist'),
        DistributionResult(DistributionData(15, 5, 1, 5)),
        DistributionResult(DistributionData(15, 5, 1, 5)))
    hc.assert_that(
        metrics['distributions'],
        hc.contains_inanyorder(expected_distribution))
Example #11
0
    def test_write_record_auto(self):
        """Write records with a ``.gz`` suffix and read them back as GZIP.

        The first file matching ``<prefix>-*.gz`` is read with TensorFlow's
        record iterator using GZIP options and must contain the input.
        """
        file_path_prefix = os.path.join(self._new_tempdir(), 'result')
        with beam.Pipeline(DirectRunner()) as pipeline:
            input_data = ['foo', 'bar']
            writer = WriteToTFRecord(file_path_prefix, file_name_suffix='.gz')
            _ = pipeline | beam.Create(input_data) | writer

        written_file = glob.glob(file_path_prefix + '-*.gz')[0]
        gzip_options = tf.python_io.TFRecordOptions(
            tf.python_io.TFRecordCompressionType.GZIP)
        actual = list(
            tf.python_io.tf_record_iterator(
                written_file, options=gzip_options))
        self.assertEqual(actual, input_data)
Example #12
0
    def _pipeline_runner():
        """Stream ``size`` keyed random values through the parallel stages.

        Elements are added to a ``TestStream`` in batches of up to 100 and
        the watermark is advanced by 100 after each batch. The stream is
        windowed into fixed windows of 100 and handed to
        ``_build_serial_stages`` once per parallel stage.
        """
        with beam.Pipeline(runner=DirectRunner()) as pipeline:
            stream = TestStream().advance_watermark_to(0)
            remaining = iter(range(size))
            watermark = 0

            def take_batch():
                return list(itertools.islice(remaining, 100))

            # iter(callable, sentinel) stops at the first empty batch.
            for batch in iter(take_batch, []):
                keyed = [(i, random.randint(0, 1000)) for i in batch]
                stream = stream.add_elements(keyed)
                watermark += 100
                stream = stream.advance_watermark_to(watermark)
            stream = stream.advance_watermark_to_infinity()

            input_pc = pipeline | stream | WindowInto(FixedWindows(100))
            for stage_index in range(NUM_PARALLEL_STAGES):
                _build_serial_stages(input_pc, NUM_SERIAL_STAGES, stage_index)
Example #13
0
def write_to_tfrecord(args):
    """Transform one shard of train/eval CSV data and write it as TFRecords.

    This function is supposed to be called as a script.

    Args:
        args: Sequence of eight values, in order: current shard index,
            number of shards, train split file prefix, eval split file
            prefix, train TFRecord output prefix, eval TFRecord output
            prefix, working directory the transform function is read from,
            and the URI of the module providing ``DataFormatter``.
    """
    # Decode arguments
    (current_index, num_shards, train_split_fname_out, eval_split_fname_out,
     exp_log_data_file_train_tfrecord, exp_log_data_file_eval_tfrecord,
     working_dir, data_formatter_module_path) = args

    current_index = int(current_index)
    num_shards = int(num_shards)

    # Shared `<prefix>-<index>-of-<total>` naming for inputs and outputs.
    shard_template = '{}-{:05}-of-{:05}'
    split_train_file_pattern = shard_template.format(
        train_split_fname_out, current_index, num_shards) + '*'
    split_eval_file_pattern = shard_template.format(
        eval_split_fname_out, current_index, num_shards)

    log.info('exp_log_data_file_train_tfrecord {}'.format(
        exp_log_data_file_train_tfrecord))
    log.info('exp_log_data_file_eval_tfrecord {}'.format(
        exp_log_data_file_eval_tfrecord))
    log.info('split_train_file_pattern {}'.format(split_train_file_pattern))
    log.info('split_eval_file_pattern {}'.format(split_eval_file_pattern))

    formatter_module = import_from_uri(data_formatter_module_path)
    data_formatter = formatter_module.DataFormatter()

    # Set up the preprocessing pipeline.
    pipeline = beam.Pipeline(runner=DirectRunner())

    def read_and_decode(read_label, decode_label, file_pattern):
        # Read CSV lines and decode them using the formatter's column order
        # and feature schema.
        lines = pipeline | read_label >> textio.ReadFromText(
            file_pattern, skip_header_lines=0)
        csv_coder = tft_coders.CsvCoder(
            data_formatter.get_features_and_targets(),
            data_formatter.get_features_metadata().schema)
        return lines | decode_label >> MapAndFilterErrors(csv_coder.decode)

    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
        raw_train_data = read_and_decode(
            'ReadTrainDataFile', 'DecodeTrainDataCSV',
            split_train_file_pattern)
        raw_eval_data = read_and_decode(
            'ReadEvalDataFile', 'DecodeEvalDataCSV', split_eval_file_pattern)

        # NOTE: a disabled step used to additionally write the raw eval data
        # as serialized tf-examples (for model analysis purposes).

        # Load the previously written transform function from `working_dir`.
        transform_fn = (pipeline
                        | 'ReadTransformGraph' >>
                        transform_fn_io.ReadTransformFn(working_dir))

        # Apply `transform_fn` to the raw train dataset.
        transformed_train_data, _ = (
            ((raw_train_data, data_formatter.get_features_metadata()),
             transform_fn)
            | 'TransformTrainData' >> beam_impl.TransformDataset())

        # Apply `transform_fn` to the raw eval dataset; its metadata is the
        # one used for encoding below.
        transformed_eval_data, transformed_metadata = (
            ((raw_eval_data, data_formatter.get_features_metadata()),
             transform_fn)
            | 'TransformEvalData' >> beam_impl.TransformDataset())

        # The schema of the transformed data drives the coder that encodes
        # each transformed row for the TFRecord output.
        transformed_data_coder = tft.coders.ExampleProtoCoder(
            transformed_metadata.schema)

        def encode_and_write(dataset, encode_label, write_label, out_prefix):
            # Encode the transformed rows and write them to a single file
            # named after the current shard.
            out_path = shard_template.format(
                out_prefix, current_index, num_shards)
            _ = (dataset
                 | encode_label >> MapAndFilterErrors(
                     transformed_data_coder.encode)
                 | write_label >> tfrecordio.WriteToTFRecord(
                     out_path, shard_name_template='', num_shards=1))

        encode_and_write(
            transformed_train_data, 'EncodeTrainDataTransform',
            'WriteTrainDataTFRecord', exp_log_data_file_train_tfrecord)
        encode_and_write(
            transformed_eval_data, 'EncodeEvalDataTransform',
            'WriteEvalDataTFRecord', exp_log_data_file_eval_tfrecord)

    pipeline.run().wait_until_finish()
Example #14
0
def run(argv=None, saveMainSession=False):
    """Build and run a demo pipeline on the DirectRunner.

    The pipeline reads the text file named by ``MyOptions.input_file`` and
    demonstrates, in order: GroupByKey, CoGroupByKey, CombineGlobally,
    CombinePerKey, Flatten, Partition, extra ParDo keyword arguments,
    tagged outputs and a composite PTransform. Results are written under
    ``<cwd>/target`` (which is removed first) or printed.

    Args:
        argv: Command-line arguments. Arguments the local parser does not
            recognise are forwarded to ``PipelineOptions``.
        saveMainSession: Accepted but not used by this function.
    """
    logging.info("____starting____")
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-raw-str",
                        required=False,
                        default="THE QUICK BROWN FOX JUMPS OVER THE LAZY DOG")
    my_pipeline_options = MyOptions(input_file="./samples/kinglear.txt")
    # `known_args` is only referenced by the disabled debug logging below.
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)

    def target_path(relative_path):
        """Return the output location for ``relative_path`` under ./target."""
        return os.getcwd() + "/target/" + relative_path

    try:
        # Start from a clean output directory; errors are ignored by rmtree.
        shutil.rmtree(os.getcwd() + "/target", ignore_errors=True)
    except OSError as error:
        logging.error(error)
    else:
        with beam.Pipeline(runner=DirectRunner(),
                           options=pipeline_options) as p:
            # Disabled debug logging:
            # logging.debug("All options:\n%s", p.options.get_all_options())
            # logging.debug("Known Cmd Args : \n"
            #                 "\t -input-raw-str: %s",
            #                 known_args.input_raw_str)
            # logging.debug("MyPipelineOptions : \n"
            #                 "\t -input-file: %s"
            #                 "\t -output-file: %s",
            #                 my_pipeline_options.get_all_options()["input_file"],
            #                 my_pipeline_options.get_all_options()["output_file"]
            #                 )
            lines = p | "ReadInputFile" >> beam.io.ReadFromText(
                my_pipeline_options.input_file)

            # Disabled examples:
            # #ParDoFn: with DoFn
            # lines_len_v1 = lines | "Mapping with ParDo Fn" >> beam.ParDo(ComputeWordLengthFn()) | "Write lines_len_v1" >> beam.io.WriteToText(os.getcwd()+"/target/lines_len_v1.out", file_name_suffix=".txt")
            # lines_len_v2 = lines | "Mapping with FlatMap Fn" >> beam.FlatMap(lambda word: [len(word)]) | "Write lines_len_v2" >> beam.io.WriteToText(os.getcwd()+"/target/lines_len_v2.out", file_name_suffix=".txt")
            # lines_len_v3 = lines | "Mapping with Map Fn" >> beam.Map(len) | "Write lines_len_v3" >> beam.io.WriteToText(os.getcwd()+"/target/lines_len_v3.out", file_name_suffix=".txt")
            #
            # #Filter
            # non_empty_lines = lines | "Filter empty lines" >> beam.Filter(lambda x: len(x) > 0) | "Write non empty lines" >> beam.io.WriteToText(os.getcwd()+"/target/non_empty_lines.out", file_name_suffix=".txt")

            #GroupByKey
            (lines
             | "Extract" >> beam.ParDo(ExtractWordsFn())
             # Map, not ParDo: the lambda returns one lower-cased word, and
             # ParDo would treat that string as an iterable of outputs.
             | "Lower" >> beam.Map(lambda w: w.lower())
             | "PairWithOne" >> beam.Map(lambda w: (w, 1))
             | "GrouByKey" >> beam.GroupByKey()
             | "Count" >> beam.CombineValues(sum)
             | "WriteToFile" >> beam.io.WriteToText(
                 target_path("word_count.out"), file_name_suffix=".txt"))

            #CoGroupByKey
            emails_list = [
                ('amy', '*****@*****.**'),
                ('carl', '*****@*****.**'),
                ('julia', '*****@*****.**'),
                ('carl', '*****@*****.**'),
            ]
            phones_list = [
                ('amy', '111-222-3333'),
                ('james', '222-333-4444'),
                ('amy', '333-444-5555'),
                ('carl', '444-555-6666'),
            ]

            emails = p | 'CreateEmails' >> beam.Create(emails_list)
            phones = p | 'CreatePhones' >> beam.Create(phones_list)

            joined_result = ({
                "emails": emails,
                "phones": phones
            } | beam.CoGroupByKey())

            def join_person_info(person_infos):
                """Format one (name, {'emails': ..., 'phones': ...}) pair."""
                name, info = person_infos
                emails, phones = info["emails"], info["phones"]
                return f"{name} : {emails} - {phones}"

            (joined_result
             | "Show person info" >> beam.Map(join_person_info)
             | "Write infos to file" >> beam.io.WriteToText(
                 target_path("person_info"), file_name_suffix=".txt"))

            #CombineGlobally
            student_subjects_marks = [
                ("Joseph", "Maths", 83),
                ("Joseph", "Physics", 74),
                ("Joseph", "Chemistry", 91),
                ("Joseph", "Biology", 82),
                ("Jimmy", "Maths", 69),
                ("Jimmy", "Physics", 62),
                ("Jimmy", "Chemistry", 97),
                ("Jimmy", "Biology", 80),
                ("Tina", "Maths", 78),
                ("Tina", "Physics", 73),
                ("Tina", "Chemistry", 68),
                ("Tina", "Biology", 87),
                ("Thomas", "Maths", 87),
                ("Thomas", "Physics", 93),
                ("Thomas", "Chemistry", 91),
                ("Thomas", "Biology", 74),
                ("Cory", "Maths", 56),
                ("Cory", "Physics", 65),
                ("Cory", "Chemistry", 71),
                ("Cory", "Biology", 68),
                ("Jackeline", "Maths", 86),
                ("Jackeline", "Physics", 62),
                ("Jackeline", "Chemistry", 75),
                ("Jackeline", "Biology", 83),
                ("Juan", "Maths", 63),
                ("Juan", "Physics", 69),
                ("Juan", "Chemistry", 64),
                ("Juan", "Biology", 60),
            ]

            def print_row(row, *args):
                """Print ``row`` preceded by any extra labels, between rulers."""
                print("=" * 100)
                for v in args:
                    print(v)
                print(row)
                print("=" * 100)

            students_results = p | "CreateStudentResult" >> beam.Create(
                student_subjects_marks)

            # with_defaults() emits the combiner's default value when the
            # input is empty; without_defaults() would instead produce an
            # empty PCollection.
            (students_results
             | beam.CombineGlobally(CombineAllMarks()).with_defaults()
             | "Show Result" >> beam.Map(print_row, "GlobalAverage"))

            #CombinePerKey
            (students_results
             | "Group per name" >> beam.Map(
                 lambda row: (row[0], (row[1], row[2])))
             | "Compute avg per student" >> beam.CombinePerKey(
                 CombineAllMarks(is_per_key=True))
             |
             "Show Result Per Key" >> beam.Map(print_row, "AveragePerStudent")
             # | "Write avg marks to file" >> beam.io.WriteToText(os.getcwd()+"/target/avg_mark_per_student", file_name_suffix=".txt")
             )

            #Flatten
            joseph_subjects_marks = p | "Create Joseph PCol" >> beam.Create(
                student_subjects_marks[:3])
            juan_subjects_marks = p | "Create Juan PCol" >> beam.Create(
                student_subjects_marks[-4:])
            ((joseph_subjects_marks, juan_subjects_marks)
             | beam.Flatten()
             | "Write Flattened to File" >> beam.io.WriteToText(
                 target_path("joseph_and_juan"), file_name_suffix=".txt"))

            #Partition
            subjects = ('Maths', 'Physics', 'Chemistry', 'Biology')

            def partition_fn(student, num_partitions):
                """Return the partition index of the student's subject."""
                (_, subject, _) = student
                return subjects.index(subject)

            # Partition the PCollection, not the raw Python list, so that
            # the partitions belong to pipeline `p`.
            all_partitions = students_results | beam.Partition(
                partition_fn, 4)
            (all_partitions['0']
             # | "Show Maths students" >> beam.Map(print_row, "Math Student") )
             | "Write Maths students to File" >> beam.io.WriteToText(
                 target_path("maths_students"), file_name_suffix=".txt"))

            #SideInput
            # The bounds are passed to the DoFn as extra keyword arguments.
            (lines
             | "SideInput : Extract words" >> beam.ParDo(ExtractWordsFn())
             | "Filter using length" >> beam.ParDo(
                 FilterWordsUsingLength(), lower_bound=2, upper_bound=5)
             | "Write small words" >> beam.io.WriteToText(
                 target_path("small_words"), file_name_suffix=".txt"))

            #SideOutput
            prefix = 'O'
            outputs = (
                lines
                | "SideOutput : Extract words" >> beam.ParDo(ExtractWordsFn())
                | "SideOutput :  Filter using length" >> beam.ParDo(
                    ProcessWordsMultiOutputs(), upper_bound=5,
                    prefix=prefix).with_outputs(
                        'Short_Words', 'Long_Words', main='Start_With'))
            short_words = outputs.Short_Words
            long_words = outputs.Long_Words
            start_with = outputs.Start_With
            short_words | "SideOutput: Write short words" >> beam.io.WriteToText(
                target_path("side_output/short_words"),
                file_name_suffix=".txt")
            long_words | "SideOutput : Write long words" >> beam.io.WriteToText(
                target_path("side_output/long_words"),
                file_name_suffix=".txt")
            start_with | "SideOutput : Write words : start with" >> beam.io.WriteToText(
                target_path(f"side_output/start_with_{prefix}"),
                file_name_suffix=".txt")

            #PTransform
            (lines.apply(ComputeWordsTransform()
                         )  # <=> lines | ComputeWordsTransform()
             | "PTransform : Write words" >> beam.io.WriteToText(
                 target_path("ptransform_words"), file_name_suffix=".txt"))