def test_sink_transform_int96(self):
  with tempfile.NamedTemporaryFile() as dst:
    path = dst.name
    # pylint: disable=c-extension-no-member
    with self.assertRaises(pl.ArrowInvalid):
      with TestPipeline() as p:
        _ = p \
            | Create(self.RECORDS) \
            | WriteToParquet(
                path, self.SCHEMA96, num_shards=1, shard_name_template='')
def test_sink_transform_multiple_row_group(self):
  with tempfile.NamedTemporaryFile() as dst:
    path = dst.name
    with TestPipeline() as p:
      # writing 623200 bytes of data
      _ = p \
          | Create(self.RECORDS * 4000) \
          | WriteToParquet(
              path,
              self.SCHEMA,
              num_shards=1,
              codec='none',
              shard_name_template='',
              row_group_buffer_size=250000)
    self.assertEqual(pq.read_metadata(path).num_row_groups, 3)
def test_sink_transform_multiple_row_group(self):
  with TemporaryDirectory() as tmp_dirname:
    path = os.path.join(tmp_dirname, "tmp_filename")
    with TestPipeline() as p:
      # writing 623200 bytes of data
      _ = p \
          | Create(self.RECORDS * 4000) \
          | WriteToParquet(
              path,
              self.SCHEMA,
              num_shards=1,
              codec='none',
              shard_name_template='',
              row_group_buffer_size=250000)
    self.assertEqual(pq.read_metadata(path).num_row_groups, 3)
def test_sink_transform_int96(self):
  with tempfile.NamedTemporaryFile() as dst:
    path = dst.name
    # pylint: disable=c-extension-no-member
    with self.assertRaises(pl.ArrowInvalid):
      # Should throw an error "ArrowInvalid: Casting from timestamp[ns] to
      # timestamp[us] would lose data"
      with TestPipeline() as p:
        _ = p \
            | Create(self.RECORDS) \
            | WriteToParquet(
                path, self.SCHEMA96, num_shards=1, shard_name_template='')
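# A minimal sketch (an assumption, not shown in this excerpt) of the fixture
# the int96 tests above rely on: self.SCHEMA96 presumably declares a
# nanosecond-precision timestamp column. The Parquet writer casts timestamp[ns]
# down to timestamp[us] and pyarrow refuses to silently truncate, which is the
# ArrowInvalid these tests expect. Field names here are illustrative guesses.
import pyarrow as pa

SCHEMA96 = pa.schema([
    ('name', pa.binary()),
    ('last_updated', pa.timestamp('ns')),  # hypothetical timestamp[ns] column
    ('favorite_color', pa.binary()),
])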
def _generate_data(self, p, output_prefix, init_size, data_size):
  init_data = [x for x in range(init_size)]
  lines = (
      p
      | 'create' >> Create(init_data)
      | 'produce' >> ParDo(ProducerFn(data_size)))
  schema = pa.schema([('name', pa.binary()), ('number', pa.int64())])
  files = lines | 'write' >> WriteToParquet(
      output_prefix, schema, codec='snappy', file_name_suffix='.parquet')
  return files
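# ProducerFn is defined elsewhere. A rough sketch of what such a DoFn could
# look like, assuming it fans each seed element out into `data_size` records
# matching the ('name', 'number') schema above; the class body and generated
# values here are hypothetical.
import apache_beam as beam

class ProducerFn(beam.DoFn):
  def __init__(self, data_size):
    self._data_size = data_size

  def process(self, element):
    # Emit `data_size` records per seed element.
    for i in range(self._data_size):
      yield {'name': ('record_%d' % i).encode('utf-8'), 'number': i}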
def test_batched_read(self):
  with tempfile.NamedTemporaryFile() as dst:
    path = dst.name
    with TestPipeline() as p:
      _ = p \
          | Create(self.RECORDS, reshuffle=False) \
          | WriteToParquet(
              path, self.SCHEMA, num_shards=1, shard_name_template='')
    with TestPipeline() as p:
      readback = \
          p \
          | ReadFromParquetBatched(path)
      assert_that(readback, equal_to([self._records_as_arrow()]))
def test_write_display_data(self):
  file_name = 'some_parquet_sink'
  write = WriteToParquet(file_name, self.SCHEMA)
  dd = DisplayData.create_from(write)
  expected_items = [
      DisplayDataItemMatcher('codec', 'none'),
      DisplayDataItemMatcher('schema', str(self.SCHEMA)),
      DisplayDataItemMatcher('row_group_buffer_size', str(64 * 1024 * 1024)),
      DisplayDataItemMatcher(
          'file_pattern',
          'some_parquet_sink-%(shard_num)05d-of-%(num_shards)05d'),
      DisplayDataItemMatcher('compression', 'uncompressed'),
  ]
  hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
def test_batched_read(self):
  with TemporaryDirectory() as tmp_dirname:
    path = os.path.join(tmp_dirname, "tmp_filename")
    with TestPipeline() as p:
      _ = p \
          | Create(self.RECORDS, reshuffle=False) \
          | WriteToParquet(
              path, self.SCHEMA, num_shards=1, shard_name_template='')
    with TestPipeline() as p:
      readback = \
          p \
          | ReadFromParquetBatched(path)
      assert_that(readback, equal_to([self._records_as_arrow()]))
def test_sink_transform(self):
  with tempfile.NamedTemporaryFile() as dst:
    path = dst.name
    with TestPipeline() as p:
      _ = p \
          | Create(self.RECORDS) \
          | WriteToParquet(
              path, self.SCHEMA, num_shards=1, shard_name_template='')
    with TestPipeline() as p:
      # json used for stable sortability
      readback = \
          p \
          | ReadFromParquet(path) \
          | Map(json.dumps)
      assert_that(readback, equal_to([json.dumps(r) for r in self.RECORDS]))
def test_sink_transform(self):
  with TemporaryDirectory() as tmp_dirname:
    path = os.path.join(tmp_dirname, "tmp_filename")
    with TestPipeline() as p:
      _ = p \
          | Create(self.RECORDS) \
          | WriteToParquet(
              path, self.SCHEMA, num_shards=1, shard_name_template='')
    with TestPipeline() as p:
      # json used for stable sortability
      readback = \
          p \
          | ReadFromParquet(path) \
          | Map(json.dumps)
      assert_that(readback, equal_to([json.dumps(r) for r in self.RECORDS]))
def test_sink_transform_compressed(self, compression_type):
  if compression_type == 'lz4' and ARROW_MAJOR_VERSION == 1:
    self.skipTest(
        "Writing with LZ4 compression is not supported in pyarrow 1.x")
  with TemporaryDirectory() as tmp_dirname:
    path = os.path.join(tmp_dirname, "tmp_filename")
    with TestPipeline() as p:
      _ = p \
          | Create(self.RECORDS) \
          | WriteToParquet(
              path,
              self.SCHEMA,
              codec=compression_type,
              num_shards=1,
              shard_name_template='')
    with TestPipeline() as p:
      # json used for stable sortability
      readback = \
          p \
          | ReadFromParquet(path + '*') \
          | Map(json.dumps)
      assert_that(readback, equal_to([json.dumps(r) for r in self.RECORDS]))
def test_sink_transform_compliant_nested_type(self):
  if ARROW_MAJOR_VERSION < 4:
    self.skipTest(
        'Writing with compliant nested type is only '
        'supported in pyarrow 4.x and above')
  with TemporaryDirectory() as tmp_dirname:
    path = os.path.join(tmp_dirname, 'tmp_filename')
    with TestPipeline() as p:
      _ = p \
          | Create(self.RECORDS_NESTED) \
          | WriteToParquet(
              path,
              self.SCHEMA_NESTED,
              num_shards=1,
              shard_name_template='',
              use_compliant_nested_type=True)
    with TestPipeline() as p:
      # json used for stable sortability
      readback = \
          p \
          | ReadFromParquet(path) \
          | Map(json.dumps)
      assert_that(
          readback, equal_to([json.dumps(r) for r in self.RECORDS_NESTED]))
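# The nested fixtures are outside this excerpt. A sketch, assuming a single
# list-typed column, of what SCHEMA_NESTED / RECORDS_NESTED might look like;
# use_compliant_nested_type=True changes how the repeated list element is
# named inside the Parquet file rather than the data itself. Names and values
# here are assumptions.
import pyarrow as pa

SCHEMA_NESTED = pa.schema([
    ('name', pa.string()),
    ('scores', pa.list_(pa.int64())),  # hypothetical nested (list) column
])
RECORDS_NESTED = [
    {'name': 'Tom', 'scores': [1, 2, 3]},
    {'name': 'Ada', 'scores': [4, 5]},
]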
# Computes the schema for the parquet output by applying the column renames to
# the original schema.
def getSchema():
  df_schema = pyarrow.Schema.from_pandas(
      pd.read_parquet(user_options.schema_source.get()))
  for (key, value) in ast.literal_eval(
      user_options.rename_columns.get()).items():
    df_schema = df_schema.set(
        df_schema.get_field_index(key),
        pyarrow.field(value, df_schema.types[df_schema.get_field_index(key)]))
  return df_schema

# Reads the source parquet files and computes the dictionary with the mapping
# of the columns to rename.
map_rename_cols = (
    p
    | "Read for rename cols" >> ReadFromParquet(user_options.url_raw)
    | "Map rename cols" >> beam.Map(mapRenameCols)
    | "Rename cols to string" >> beam.Map(str)
    | "Deduplicate elements" >> beam.Distinct())

# Reads the data from the source files.
data = (p | "Read parquet for data" >> ReadFromParquet(user_options.url_raw))

# Applies the column-renaming function, receiving the result of the previous
# step as a dictionary (side input).
rename_data = (
    data
    | "Rename columns" >> beam.Map(
        reColumns, rename_cols=AsList(map_rename_cols)))

# Writes the data to the destination path, taking the schema from getSchema().
_ = (
    rename_data
    | "Write to storage TRN" >> WriteToParquet(
        user_options.url_trn, schema=getSchema(), file_name_suffix=".parquet"))

print("End Pipeline")
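# mapRenameCols and reColumns are defined elsewhere in that pipeline. A purely
# hypothetical sketch of what they might do, assuming the rename_columns
# option holds a dict literal such as "{'old_name': 'new_name'}":
def mapRenameCols(element):
  # One {old: new} mapping per element; the Distinct step above deduplicates
  # the stringified copies.
  return ast.literal_eval(user_options.rename_columns.get())

def reColumns(element, rename_cols):
  # rename_cols is the AsList side input: a list of stringified mappings.
  mapping = ast.literal_eval(rename_cols[0])
  return {mapping.get(key, key): value for key, value in element.items()}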
def run(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output file to write results to.')
  parser.add_argument(
      '--format',
      dest='format',
      default='text',
      help='Supported output file formats: %s.' % FORMATS)
  known_args, pipeline_args = parser.parse_known_args(argv)

  if known_args.format not in FORMATS:
    raise ValueError('--format should be one of: %s' % FORMATS)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Read the text file[pattern] into a PCollection.
  lines = p | 'read' >> ReadFromText(known_args.input)

  # Count the occurrences of each word.
  def count_ones(word_ones):
    (word, ones) = word_ones
    return (word, sum(ones))

  counts = (
      lines
      | 'split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
      | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
      | 'group' >> beam.GroupByKey()
      | 'count' >> beam.Map(count_ones))

  # Format the counts into a PCollection of strings.
  def format_text(word_count):
    (word, count) = word_count
    return '%s: %d' % (word, count)

  # Format the counts into a PCollection of dictionaries.
  def format_dict(word_count):
    (word, count) = word_count
    row = dict(zip(HEADER, [word, count]))
    return row

  if known_args.format == 'text':
    output = counts | 'format text' >> beam.Map(format_text)
    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'write text' >> WriteToText(known_args.output)
  elif known_args.format == 'avro':
    output = counts | 'format avro' >> beam.Map(format_dict)
    schema = avro.schema.parse(json.dumps(AVRO_SCHEMA))
    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'write avro' >> WriteToAvro(
        file_path_prefix=known_args.output, schema=schema, codec=DEFAULT_CODEC)
  else:
    output = counts | 'format parquet' >> beam.Map(format_dict)
    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'write parquet' >> WriteToParquet(
        file_path_prefix=known_args.output,
        schema=PARQUET_SCHEMA,
        codec=DEFAULT_CODEC)

  result = p.run()
  result.wait_until_finish()

  # Do not query metrics when creating a template which doesn't run
  if (not hasattr(result, 'has_job')  # direct runner
      or result.has_job):  # not just a template creation
    empty_lines_filter = MetricsFilter().with_name('empty_lines')
    query_result = result.metrics().query(empty_lines_filter)
    if query_result['counters']:
      empty_lines_counter = query_result['counters'][0]
      logging.info('number of empty lines: %d', empty_lines_counter.result)

    word_lengths_filter = MetricsFilter().with_name('word_len_dist')
    query_result = result.metrics().query(word_lengths_filter)
    if query_result['distributions']:
      word_lengths_dist = query_result['distributions'][0]
      logging.info('average word length: %d', word_lengths_dist.result.mean)
def run(known_args, pipeline_args):
  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  logging.getLogger().setLevel(logging.INFO)
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Read the text file[pattern] into a PCollection.
  lines = p | 'read' >> ReadFromText(known_args.input)

  # Count the occurrences of each word.
  def count_ones(word_ones):
    (word, ones) = word_ones
    return (word, sum(ones))

  counts = (
      lines
      | 'split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
      | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
      | 'group' >> beam.GroupByKey()
      | 'count' >> beam.Map(count_ones))

  # Format the counts into a PCollection of strings.
  def format_text(word_count):
    (word, count) = word_count
    return '%s: %d' % (word, count)

  # Format the counts into a PCollection of dictionaries.
  def format_dict(word_count):
    (word, count) = word_count
    row = dict(zip(HEADER, [word, count]))
    return row

  if known_args.format == 'text':
    output = counts | 'format text' >> beam.Map(format_text)
    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'write text' >> WriteToText(known_args.output)
  elif known_args.format == 'avro':
    output = counts | 'format avro' >> beam.Map(format_dict)
    schema = avro.schema.parse(json.dumps(AVRO_SCHEMA))
    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'write avro' >> WriteToAvro(
        file_path_prefix=known_args.output, schema=schema, codec=DEFAULT_CODEC)
  else:
    output = counts | 'format parquet' >> beam.Map(format_dict)
    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'write parquet' >> WriteToParquet(
        file_path_prefix=known_args.output,
        schema=PARQUET_SCHEMA,
        codec=DEFAULT_CODEC)

  result = p.run()
  result.wait_until_finish()

  # Do not query metrics when creating a template which doesn't run
  if (not hasattr(result, 'has_job')  # direct runner
      or result.has_job):  # not just a template creation
    empty_lines_filter = MetricsFilter().with_name('empty_lines')
    query_result = result.metrics().query(empty_lines_filter)
    if query_result['counters']:
      empty_lines_counter = query_result['counters'][0]
      logging.info('number of empty lines: %d', empty_lines_counter.result)

    word_lengths_filter = MetricsFilter().with_name('word_len_dist')
    query_result = result.metrics().query(word_lengths_filter)
    if query_result['distributions']:
      word_lengths_dist = query_result['distributions'][0]
      logging.info('average word length: %d', word_lengths_dist.result.mean)
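# Both run() variants above rely on module-level constants that are outside
# this excerpt. A plausible sketch of those definitions, assuming word/count
# output columns (all names and values here are assumptions, not the original
# code):
import pyarrow as pa

FORMATS = ['text', 'avro', 'parquet']
HEADER = ['word', 'count']
DEFAULT_CODEC = 'snappy'  # accepted by both the Avro and Parquet sinks
AVRO_SCHEMA = {
    'namespace': 'example.wordcount',
    'type': 'record',
    'name': 'WordCount',
    'fields': [
        {'name': 'word', 'type': 'string'},
        {'name': 'count', 'type': 'long'},
    ],
}
PARQUET_SCHEMA = pa.schema([
    ('word', pa.string()),
    ('count', pa.int64()),
])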