def run_pipeline(in_file):
    import csv

    import apache_beam as beam
    from apache_beam.io.textio import ReadFromText
    from apache_beam.io.textio import WriteToText

    # Simple process for apache beam pipeline
    with beam.Pipeline(runner='DirectRunner') as p:
        #
        # Pipeline(0): Data ingestion
        #
        # "collections" will be a PCollection of the file's lines
        # Options
        #   file_pattern: File path to file
        #   skip_header_lines: First line will be skipped. Set to "1".
        # https://beam.apache.org/releases/pydoc/2.11.0/apache_beam.io.textio.html#apache_beam.io.textio.ReadFromText
        collections = p | 'ReadAirportInfo' >> ReadFromText(
            file_pattern=in_file[0],
            skip_header_lines=1)

        #
        # Pipeline(1): Create side input
        # The final PCollection will be used as a side input for the date/time conversion in the next transformation
        # 1. Parse each line and return the fields as a list. Use the csv module to remove any double quotes inside a field
        # 2. Filter out invalid fields
        # 3. Keep only "AIRPORT_SEQ_ID"(0), "LATITUDE"(21), "LONGITUDE"(26), and add the timezone for the corresponding coordinates
        #
        airports = (collections
                    | 'airports:Extract' >> beam.Map(lambda x: next(csv.reader([x], delimiter=',')))
                    | 'airports:Filter' >> beam.Filter(lambda x: x[21] and x[26])
                    | 'airports:Timezone' >> beam.Map(lambda x: (x[0], addtimezone(x[21], x[26]))))

        #
        # Pipeline(2): Correct timezone
        # 1. Read flight data
        # 2. Convert times into UTC
        flights = (p
                   | 'flights:read' >> ReadFromText(file_pattern=in_file[1], skip_header_lines=1)
                   | 'flights:tzcorr' >> beam.FlatMap(tz_correct, beam.pvalue.AsDict(airports)))

        # Write results to a file. Tuples are unpacked during the function call.
        # https://beam.apache.org/releases/pydoc/2.11.0/apache_beam.io.textio.html#apache_beam.io.textio.WriteToText
        (flights | 'flights:out' >> WriteToText(file_path_prefix='flights'))

        # Pipeline(3): Generate departed and arrived events
        # 1. Emit one event per departure and arrival
        events = flights | 'flights:events' >> beam.FlatMap(get_next_event)

        #
        # Pipeline(Final)
        #
        # Write results to a file. Tuples are unpacked during the function call.
        # https://beam.apache.org/releases/pydoc/2.11.0/apache_beam.io.textio.html#apache_beam.io.textio.WriteToText
        (events | 'event:out' >> WriteToText(file_path_prefix='events'))
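# The pipeline above references three helpers defined elsewhere in its source
# project: addtimezone, tz_correct, and get_next_event. As a hedged
# illustration only, here is a minimal sketch of what addtimezone could look
# like, assuming the third-party timezonefinder package; the real
# implementation may differ.
def addtimezone(lat, lon):
    try:
        import timezonefinder
        tf = timezonefinder.TimezoneFinder()
        # Return the coordinates together with their IANA timezone name.
        return (lat, lon, tf.timezone_at(lng=float(lon), lat=float(lat)))
    except ValueError:
        # Unparseable coordinates get a placeholder timezone.
        return (lat, lon, 'TIMEZONE')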
def test_read_gzip_empty_file(self):
    with TempDir() as tempdir:
        file_name = tempdir.create_temp_file()
        with TestPipeline() as pipeline:
            pcoll = pipeline | 'Read' >> ReadFromText(
                file_name, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
            assert_that(pcoll, equal_to([]))
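# For reference, the positional arguments used by the gzip tests in this
# section map onto ReadFromText(file_pattern, min_bundle_size,
# compression_type, strip_trailing_newlines, coder). Inside any of those
# tests, the keyword form below is equivalent and easier to scan:
#
#     pcoll = pipeline | 'Read' >> ReadFromText(
#         file_name,
#         min_bundle_size=0,
#         compression_type=CompressionTypes.GZIP,
#         strip_trailing_newlines=True,
#         coder=coders.StrUtf8Coder())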
def run_pipeline(in_file, out_file):
    # Simple process for apache beam pipeline
    with beam.Pipeline(runner='DirectRunner') as p:
        #
        # Pipeline(0): Data ingestion
        #
        # "collections" will be a PCollection of the file's lines
        # Options
        #   file_pattern: File path to file
        #   skip_header_lines: First line will be skipped. Set to "1".
        # https://beam.apache.org/releases/pydoc/2.11.0/apache_beam.io.textio.html#apache_beam.io.textio.ReadFromText
        collections = p | ReadFromText(file_pattern=in_file, skip_header_lines=1)

        #
        # Pipeline(n): Detailed Transformation
        # 1. Parse each line and return the fields as a list. Use the csv module to remove any double quotes inside a field
        # 2. Keep only "AIRPORT_SEQ_ID"(0), "LATITUDE"(21), "LONGITUDE"(26)
        #
        airports = (collections
                    | 'Extract_Into_Fields' >> beam.Map(lambda x: next(csv.reader([x], delimiter=',')))
                    | 'Set_Fields' >> beam.Map(lambda x: (x[0], (x[21], x[26]))))

        #
        # Pipeline(Final)
        #
        # Write results to a file. Tuples are unpacked during the function call.
        # Note: Python 3 dropped tuple parameters in lambdas, so the pair is
        # unpacked by indexing instead of "lambda (airport, data)".
        # https://beam.apache.org/releases/pydoc/2.11.0/apache_beam.io.textio.html#apache_beam.io.textio.WriteToText
        (airports
         | beam.Map(lambda kv: "{0},{1}".format(kv[0], ','.join(kv[1])))
         | WriteToText(file_path_prefix=out_file))
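# Hypothetical invocation of the pipeline above; the file names are
# placeholders for an airport-coordinates CSV and an output prefix.
run_pipeline(in_file='airports.csv', out_file='extracted_airports')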
def test_read_from_text_single_file_with_coder(self):
    file_name, expected_data = write_data(5)
    assert len(expected_data) == 5
    with TestPipeline() as pipeline:
        pcoll = pipeline | 'Read' >> ReadFromText(file_name, coder=DummyCoder())
        assert_that(pcoll, equal_to([record * 2 for record in expected_data]))
def test_read_gzip_empty_file(self):
    file_name = self._create_temp_file()
    pipeline = TestPipeline()
    pcoll = pipeline | 'Read' >> ReadFromText(
        file_name, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
    assert_that(pcoll, equal_to([]))
    pipeline.run()
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument("--file_location", dest="file_location", required=True)
    parser.add_argument("--schema_location", dest="schema_location", required=True)
    parser.add_argument("--output_table", dest="output_table", required=True)
    known_args, pipeline_args = parser.parse_known_args(argv)

    file_location = known_args.file_location
    schema_location = known_args.schema_location
    output_table = known_args.output_table

    cloud_storage_to_bq = CloudStorageToBigQuery()

    # Load the BigQuery schema from the location passed on the command line.
    with open(schema_location) as f:
        schema_dict = json.load(f)
    table_schema = json_to_schema(schema_dict)

    options = PipelineOptions(pipeline_args)
    p = beam.Pipeline(options=options)
    (p
     | "Read CSV file from cloud storage" >> ReadFromText(
         file_pattern=file_location, skip_header_lines=1)
     | 'CSV Row To BigQuery Row' >> beam.Map(
         lambda s: cloud_storage_to_bq.process_row(s, schema_dict))
     | "Write to BigQuery" >> bigquery.WriteToBigQuery(
         table=f"precis-digital-case-interview:part_1.{output_table}",
         schema=table_schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))
    p.run().wait_until_finish()
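# Hypothetical command-line invocation of run() above; the bucket path and
# table name are placeholders.
run(argv=[
    "--file_location", "gs://my-bucket/customers.csv",
    "--schema_location", "./schemas/customers.json",
    "--output_table", "customers",
])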
def get(self):
    """
    Flask view that triggers the execution of the pipeline
    """
    input_filename = 'data/input/titanic.txt'
    output_filename = 'data/output/titanic.txt'

    # project_id = os.environ['DATASTORE_PROJECT_ID']
    # credentials_file = os.environ['GOOGLE_APPLICATION_CREDENTIALS']
    # client = datastore.Client.from_service_account_json(credentials_file)

    options = PipelineOptions()
    gcloud_options = options.view_as(GoogleCloudOptions)
    # gcloud_options.project = project_id
    gcloud_options.job_name = 'test-job'

    # Dataflow runner
    runner = os.environ['DATAFLOW_RUNNER']
    options.view_as(StandardOptions).runner = runner

    with apache_beam.Pipeline(options=options) as p:
        rows = (p
                | ReadFromText(input_filename)
                | apache_beam.ParDo(Split()))

        survived = (rows
                    | apache_beam.ParDo(CollectSurvived())
                    | apache_beam.GroupByKey()
                    | apache_beam.ParDo(WriteToCSV())
                    | WriteToText(output_filename))

    return 'All Titanic survivors are written to data/output/titanic.txt-00000-of-00001'
def test_read_from_text_file_pattern(self):
    pattern, expected_data = write_pattern([5, 3, 12, 8, 8, 4])
    assert len(expected_data) == 40
    pipeline = TestPipeline()
    pcoll = pipeline | 'Read' >> ReadFromText(pattern)
    assert_that(pcoll, equal_to(expected_data))
    pipeline.run()
def main():
    args, pipeline_args = get_args()

    # PipelineOptions is used to configure the pipeline.
    # For example, you can set the pipeline runner to choose what will
    # execute the pipeline.
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:
        # Input data file -> TextIO.Read Transform -> PCollection(lines)
        lines = p | ReadFromText(args.input)

        counts = (
            lines
            | 'Split' >> (beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
                          .with_output_types(str))
            | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
            | 'GroupAndSum' >> beam.CombinePerKey(sum))

        # Format the counts into a PCollection of strings.
        def format_result(word_count):
            (word, count) = word_count
            return '%s: %s' % (word, count)

        output = counts | 'Format' >> beam.Map(format_result)

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | WriteToText(args.output)
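# A quick standalone check of the 'Split' tokenizer above: it keeps letters
# and apostrophes, so punctuation is dropped and contractions survive intact.
import re
assert re.findall(r'[A-Za-z\']+', "Don't panic!") == ["Don't", "panic"]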
def test_read_from_text_single_file(self):
    file_name, expected_data = write_data(5)
    assert len(expected_data) == 5
    pipeline = TestPipeline()
    pcoll = pipeline | 'Read' >> ReadFromText(file_name)
    assert_that(pcoll, equal_to(expected_data))
    pipeline.run()
def run(argv=None): """Run the beam pipeline.""" args, pipeline_args = _parse_args(argv) pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=pipeline_options) lines = p | "read qa files" >> ReadFromText(args.file_pattern) # The lines are not JSON, but the string representation of python # dictionary objects. Parse them with ast.literal_eval. json_objects = lines | "parsing dictionaries" >> beam.Map(ast.literal_eval) qa_tuples = json_objects | "create tuples" >> beam.FlatMap( partial( _create_tuples, min_words=args.min_words, max_words=args.max_words) ) # Remove duplicate examples. qa_tuples |= "key by QA" >> beam.Map(lambda v: (v[1:], v)) qa_tuples |= "group duplicates" >> beam.GroupByKey() qa_tuples |= "remove duplicates" >> beam.Map(lambda v: sorted(v[1])[0]) # Create the examples. examples = qa_tuples | "create examples" >> beam.Map( lambda args: _create_example(*args) ) examples = _shuffle_examples(examples) examples |= "split train and test" >> beam.ParDo( _TrainTestSplitFn(args.train_split) ).with_outputs(_TrainTestSplitFn.TEST_TAG, _TrainTestSplitFn.TRAIN_TAG) if args.dataset_format == _JSON_FORMAT: write_sink = WriteToText file_name_suffix = ".json" serialize_fn = json.dumps else: assert args.dataset_format == _TF_FORMAT write_sink = WriteToTFRecord file_name_suffix = ".tfrecord" serialize_fn = _features_to_serialized_tf_example for name, tag in [("train", _TrainTestSplitFn.TRAIN_TAG), ("test", _TrainTestSplitFn.TEST_TAG)]: serialized_examples = examples[tag] | ( "serialize {} examples".format(name) >> beam.Map(serialize_fn)) ( serialized_examples | ("write " + name) >> write_sink( os.path.join(args.output_dir, name), file_name_suffix=file_name_suffix, num_shards=args.num_shards_train, ) ) result = p.run() result.wait_until_finish()
def execute():
    with beam.Pipeline('DirectRunner') as p:
        (p
         | 'ReadFile' >> ReadFromText(file_pattern='./data/SMSSpamCollection')
         # RemoveDuplicates is called Distinct in newer Beam releases.
         | 'Deduplicate' >> beam.RemoveDuplicates()
         | 'Parse' >> beam.FlatMap(parse)
         | 'Write' >> beam.io.WriteToText('./data/Output.jsonl')
         )
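# The parse helper is not shown in the snippet above. A minimal sketch,
# assuming the UCI SMSSpamCollection layout of "label<TAB>message" and that
# the Write stage expects one JSON string per record; FlatMap lets malformed
# lines be dropped by yielding nothing.
import json

def parse(line):
    parts = line.split('\t', 1)
    if len(parts) == 2:
        yield json.dumps({'label': parts[0], 'text': parts[1]})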
def process(self, element):
    # ReadFromText is a PTransform and cannot be applied to a single element
    # inside a DoFn; read the referenced file eagerly instead.
    with apache_beam.io.filesystems.FileSystems.open(element['file']) as f:
        content = [line.decode('utf-8').rstrip('\n') for line in f]
    return [{'name': element['name'], 'content': content}]
def test_read_gzip_empty_file(self):
    file_name = tempfile.NamedTemporaryFile(
        delete=False, prefix=tempfile.template).name
    pipeline = TestPipeline()
    pcoll = pipeline | 'Read' >> ReadFromText(
        file_name, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
    assert_that(pcoll, equal_to([]))
    pipeline.run()
def run(p, args, aggregator_dict, cloud_logger=None):
    """Run the pipeline with the args and dataflow pipeline option."""
    # Create a PCollection for model directory.
    model_dir = p | "Create Model Directory" >> beam.Create([args.model_dir])

    input_file_format = args.input_file_format.lower()

    # Create one pcollection per input file or file pattern. And then flatten
    # them into one pcollection. The duplicated names need to be removed as the
    # file name is used to create unique labels for the PTransform.
    readers = []
    for pattern in list(
        set(args.input_file_patterns.split(FILE_LIST_SEPARATOR))):
        # Setup reader.
        #
        # TODO(user): Perhaps simplify the batch prediction code by using
        # CompressionTypes.AUTO.
        if input_file_format.startswith("tfrecord"):
            if input_file_format == "tfrecord_gzip":
                compression_type = CompressionTypes.GZIP
            else:
                assert input_file_format == "tfrecord"
                compression_type = CompressionTypes.UNCOMPRESSED
            reader = "READ_TFRECORD_FILES_%s" % pattern >> ReadFromTFRecord(
                pattern, compression_type=compression_type)
        else:
            assert input_file_format == "text"
            reader = "READ_TEXT_FILES_%s" % pattern >> ReadFromText(pattern)

        # Put the pcollections into a list and flatten later.
        readers.append(p | reader)

    # Setup the whole pipeline.
    results, errors = (readers
                       | beam.Flatten()
                       | "BATCH_PREDICTION" >> batch_prediction.BatchPredict(
                           beam.pvalue.AsSingleton(model_dir),
                           batch_size=args.batch_size,
                           aggregator_dict=aggregator_dict,
                           cloud_logger=cloud_logger))

    # Convert predictions to JSON and then write to output files.
    _ = (results
         | "TO_JSON" >> beam.Map(json.dumps)
         | "WRITE_PREDICTION_RESULTS" >> WriteToText(
             os.path.join(args.output_location,
                          OUTPUT_RESULTS_FILES_BASENAME_)))

    # Write prediction errors counts to output files.
    _ = (errors
         | "GROUP_BY_ERROR_TYPE" >> beam.combiners.Count.PerKey()
         | "WRITE_ERRORS" >> WriteToText(
             os.path.join(args.output_location,
                          OUTPUT_ERRORS_FILES_BASENAME_)))

    return p.run()
def get(self):
    """
    Flask view that triggers the execution of the pipeline
    """
    input_filename = 'input.txt'
    output_filename = 'output.txt'

    # project_id = os.environ['DATASTORE_PROJECT_ID']
    # credentials_file = os.environ['GOOGLE_APPLICATION_CREDENTIALS']
    # client = datastore.Client.from_service_account_json(credentials_file)

    options = PipelineOptions()
    gcloud_options = options.view_as(GoogleCloudOptions)
    # gcloud_options.project = project_id
    gcloud_options.job_name = 'test-job'

    # Dataflow runner
    runner = os.environ['DATAFLOW_RUNNER']
    options.view_as(StandardOptions).runner = runner

    with apache_beam.Pipeline(options=options) as p:
        rows = (
            p
            | ReadFromText(input_filename)
            | apache_beam.ParDo(Split())
        )

        timings = (
            rows
            | apache_beam.ParDo(CollectTimings())
            | "Grouping timings" >> apache_beam.GroupByKey()
            | "Calculating average" >> apache_beam.CombineValues(
                apache_beam.combiners.MeanCombineFn()
            )
        )

        users = (
            rows
            | apache_beam.ParDo(CollectUsers())
            | "Grouping users" >> apache_beam.GroupByKey()
            | "Counting users" >> apache_beam.CombineValues(
                apache_beam.combiners.CountCombineFn()
            )
        )

        to_be_joined = (
            {
                'timings': timings,
                'users': users
            }
            | apache_beam.CoGroupByKey()
            | apache_beam.ParDo(WriteToCSV())
            | WriteToText(output_filename)
        )

    return 'ok'
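# Split, CollectTimings, CollectUsers, and WriteToCSV are DoFns defined
# elsewhere in the source project. As a hedged sketch of the shape the rest
# of the pipeline implies, Split might parse a CSV line into a dict; the
# column layout below is a guess, not the project's actual format.
class Split(apache_beam.DoFn):
    def process(self, element):
        timestamp, user, duration = element.split(',')
        yield {
            'timestamp': timestamp,
            'user': user,
            'duration': float(duration),
        }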
def test_read_auto_bzip2(self):
    _, lines = write_data(15)
    with TempDir() as tempdir:
        file_name = tempdir.create_temp_file(suffix='.bz2')
        with bz2.BZ2File(file_name, 'wb') as f:
            f.write('\n'.join(lines).encode('utf-8'))
        with TestPipeline() as pipeline:
            pcoll = pipeline | 'Read' >> ReadFromText(file_name)
            assert_that(pcoll, equal_to(lines))
def test_read_auto_bzip2(self):
    _, lines = write_data(15)
    file_name = self._create_temp_file(suffix='.bz2')
    with bz2.BZ2File(file_name, 'wb') as f:
        # BZ2File expects bytes; encode the joined lines for Python 3.
        f.write('\n'.join(lines).encode('utf-8'))
    pipeline = TestPipeline()
    pcoll = pipeline | 'Read' >> ReadFromText(file_name)
    assert_that(pcoll, equal_to(lines))
    pipeline.run()
def test_read_auto_deflate(self):
    _, lines = write_data(15)
    with TempDir() as tempdir:
        file_name = tempdir.create_temp_file(suffix='.deflate')
        with open(file_name, 'wb') as f:
            f.write(zlib.compress('\n'.join(lines).encode('utf-8')))
        with TestPipeline() as pipeline:
            pcoll = pipeline | 'Read' >> ReadFromText(file_name)
            assert_that(pcoll, equal_to(lines))
def run(argv=None): """Run the beam pipeline.""" args, pipeline_args = _parse_args(argv) pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=pipeline_options) lines = p | "read qa files" >> ReadFromText(args.file_pattern) # The lines are not JSON, but the string representation of python # dictionary objects. Parse them with ast.literal_eval. json_objects = lines | "parsing dictionaries" >> beam.Map(ast.literal_eval) qa_tuples = json_objects | "create tuples" >> beam.FlatMap( partial( _create_tuples, min_words=args.min_words, max_words=args.max_words) ) # Remove duplicate examples. qa_tuples |= "key by QA" >> beam.Map(lambda v: (v[1:], v)) qa_tuples |= "group duplicates" >> beam.GroupByKey() qa_tuples |= "remove duplicates" >> beam.Map(lambda v: sorted(v[1])[0]) # Create the examples. serialized_examples = qa_tuples | "create examples" >> beam.Map( lambda args: create_example(*args).SerializeToString() ) serialized_examples = _shuffle_examples(serialized_examples) serialized_examples |= "split train and test" >> beam.ParDo( _TrainTestSplitFn(args.train_split) ).with_outputs(_TrainTestSplitFn.TEST_TAG, _TrainTestSplitFn.TRAIN_TAG) ( serialized_examples[_TrainTestSplitFn.TRAIN_TAG] | "write train" >> WriteToTFRecord( os.path.join(args.output_dir, "train"), file_name_suffix=".tfrecords", num_shards=args.num_shards_train, ) ) ( serialized_examples[_TrainTestSplitFn.TEST_TAG] | "write test" >> WriteToTFRecord( os.path.join(args.output_dir, "test"), file_name_suffix=".tfrecords", num_shards=args.num_shards_test, ) ) result = p.run() result.wait_until_finish()
def test_read_bzip2(self):
    _, lines = write_data(15)
    with TempDir() as tempdir:
        file_name = tempdir.create_temp_file()
        with bz2.BZ2File(file_name, 'wb') as f:
            f.write('\n'.join(lines).encode('utf-8'))
        with TestPipeline() as pipeline:
            pcoll = pipeline | 'Read' >> ReadFromText(
                file_name, compression_type=CompressionTypes.BZIP2)
            assert_that(pcoll, equal_to(lines))
def test_read_bzip2(self):
    _, lines = write_data(15)
    file_name = self._create_temp_file()
    with bz2.BZ2File(file_name, 'wb') as f:
        # BZ2File expects bytes; encode the joined lines for Python 3.
        f.write('\n'.join(lines).encode('utf-8'))
    pipeline = TestPipeline()
    pcoll = pipeline | 'Read' >> ReadFromText(
        file_name, compression_type=CompressionTypes.BZIP2)
    assert_that(pcoll, equal_to(lines))
    pipeline.run()
def test_read_gzip(self):
    _, lines = write_data(15)
    file_name = self._create_temp_file()
    with gzip.GzipFile(file_name, 'wb') as f:
        # GzipFile expects bytes; encode the joined lines for Python 3.
        f.write('\n'.join(lines).encode('utf-8'))
    pipeline = TestPipeline()
    pcoll = pipeline | 'Read' >> ReadFromText(
        file_name, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
    assert_that(pcoll, equal_to(lines))
    pipeline.run()
def test_read_gzip(self):
    _, lines = write_data(15)
    with TempDir() as tempdir:
        file_name = tempdir.create_temp_file()
        with gzip.GzipFile(file_name, 'wb') as f:
            f.write('\n'.join(lines).encode('utf-8'))
        with TestPipeline() as pipeline:
            pcoll = pipeline | 'Read' >> ReadFromText(
                file_name, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
            assert_that(pcoll, equal_to(lines))
def test_read_bzip2(self):
    _, lines = write_data(15)
    file_name = tempfile.NamedTemporaryFile(
        delete=False, prefix=tempfile.template).name
    with bz2.BZ2File(file_name, 'wb') as f:
        # BZ2File expects bytes; encode the joined lines for Python 3.
        f.write('\n'.join(lines).encode('utf-8'))
    pipeline = TestPipeline()
    pcoll = pipeline | 'Read' >> ReadFromText(
        file_name, compression_type=CompressionTypes.BZIP2)
    assert_that(pcoll, equal_to(lines))
    pipeline.run()
def test_read_gzip_large(self):
    _, lines = write_data(10000)
    file_name = tempfile.NamedTemporaryFile(
        delete=False, prefix=tempfile.template).name
    with gzip.GzipFile(file_name, 'wb') as f:
        # GzipFile expects bytes; encode the joined lines for Python 3.
        f.write('\n'.join(lines).encode('utf-8'))
    pipeline = TestPipeline()
    pcoll = pipeline | 'Read' >> ReadFromText(
        file_name, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
    assert_that(pcoll, equal_to(lines))
    pipeline.run()
def test_read_auto_gzip(self):
    _, lines = write_data(15)
    with TempDir() as tempdir:
        file_name = tempdir.create_temp_file(suffix='.gz')
        with gzip.GzipFile(file_name, 'wb') as f:
            f.write('\n'.join(lines).encode('utf-8'))
        pipeline = TestPipeline()
        pcoll = pipeline | 'Read' >> ReadFromText(file_name)
        assert_that(pcoll, equal_to(lines))
        pipeline.run()
def test_read_auto_gzip(self):
    _, lines = write_data(15)
    file_name = tempfile.NamedTemporaryFile(
        delete=False, prefix=tempfile.template, suffix='.gz').name
    with gzip.GzipFile(file_name, 'wb') as f:
        # GzipFile expects bytes; encode the joined lines for Python 3.
        f.write('\n'.join(lines).encode('utf-8'))
    pipeline = TestPipeline()
    pcoll = pipeline | 'Read' >> ReadFromText(file_name)
    assert_that(pcoll, equal_to(lines))
    pipeline.run()
def test_read_gzip_with_skip_lines(self):
    _, lines = write_data(15)
    with TempDir() as tempdir:
        file_name = tempdir.create_temp_file()
        with gzip.GzipFile(file_name, 'wb') as f:
            # GzipFile expects bytes; encode the joined lines for Python 3.
            f.write('\n'.join(lines).encode('utf-8'))
        pipeline = TestPipeline()
        pcoll = pipeline | 'Read' >> ReadFromText(
            file_name, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder(),
            skip_header_lines=2)
        assert_that(pcoll, equal_to(lines[2:]))
        pipeline.run()
def test_read_from_text_single_file_with_coder(self):
    class DummyCoder(coders.Coder):
        def encode(self, x):
            raise ValueError

        def decode(self, x):
            return (x * 2).decode('utf-8')

    file_name, expected_data = write_data(5)
    assert len(expected_data) == 5
    with TestPipeline() as pipeline:
        pcoll = pipeline | 'Read' >> ReadFromText(file_name, coder=DummyCoder())
        assert_that(pcoll, equal_to([record * 2 for record in expected_data]))
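# What the DummyCoder test above checks, restated standalone: ReadFromText
# passes each record's raw bytes to coder.decode(), so a line stored as
# b"abc" surfaces in the PCollection as "abcabc".
assert (b"abc" * 2).decode('utf-8') == "abcabc"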