Example 1
 def test_sink_transform_int96(self):
   with tempfile.NamedTemporaryFile() as dst:
     path = dst.name
     # pylint: disable=c-extension-no-member
     with self.assertRaises(pl.ArrowInvalid):
       with TestPipeline() as p:
         _ = p \
         | Create(self.RECORDS) \
         | WriteToParquet(
             path, self.SCHEMA96, num_shards=1, shard_name_template='')
Example 2
 def test_sink_transform_multiple_row_group(self):
     with tempfile.NamedTemporaryFile() as dst:
         path = dst.name
         with TestPipeline() as p:
             # writing 623200 bytes of data
             _ = p \
             | Create(self.RECORDS * 4000) \
             | WriteToParquet(
                 path, self.SCHEMA, num_shards=1, codec='none',
                 shard_name_template='', row_group_buffer_size=250000)
         self.assertEqual(pq.read_metadata(path).num_row_groups, 3)
Example 3
 def test_sink_transform_multiple_row_group(self):
     with TemporaryDirectory() as tmp_dirname:
          path = os.path.join(tmp_dirname, "tmp_filename")
         with TestPipeline() as p:
             # writing 623200 bytes of data
             _ = p \
             | Create(self.RECORDS * 4000) \
             | WriteToParquet(
                 path, self.SCHEMA, num_shards=1, codec='none',
                 shard_name_template='', row_group_buffer_size=250000)
         self.assertEqual(pq.read_metadata(path).num_row_groups, 3)
Example 4
 def test_sink_transform_int96(self):
     with tempfile.NamedTemporaryFile() as dst:
         path = dst.name
         # pylint: disable=c-extension-no-member
         with self.assertRaises(pl.ArrowInvalid):
             # Should throw an error "ArrowInvalid: Casting from timestamp[ns] to
             # timestamp[us] would lose data"
             with TestPipeline() as p:
                 _ = p \
                 | Create(self.RECORDS) \
                 | WriteToParquet(
                     path, self.SCHEMA96, num_shards=1, shard_name_template='')
Example 5
    def _generate_data(self, p, output_prefix, init_size, data_size):
        init_data = [x for x in range(init_size)]

        lines = (p
                 | 'create' >> Create(init_data)
                 | 'produce' >> ParDo(ProducerFn(data_size)))

        schema = pa.schema([('name', pa.binary()), ('number', pa.int64())])

        files = lines | 'write' >> WriteToParquet(
            output_prefix, schema, codec='snappy', file_name_suffix='.parquet')

        return files
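
The helper above references a ProducerFn DoFn whose definition is not shown. A minimal sketch of what it might look like, assuming each seed element is expanded into data_size records matching the ('name', 'number') schema; the fan-out and field contents are assumptions, not the original test's implementation:

import os

import apache_beam as beam


class ProducerFn(beam.DoFn):
  """Hypothetical stand-in for the ProducerFn used by _generate_data above."""
  def __init__(self, data_size):
    # Assumed meaning: number of records to emit per seed element.
    self._data_size = data_size

  def process(self, element):
    for i in range(self._data_size):
      # 'name' is raw bytes to match pa.binary(); 'number' matches pa.int64().
      yield {'name': os.urandom(16), 'number': element * self._data_size + i}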
Example 6
 def test_batched_read(self):
     with tempfile.NamedTemporaryFile() as dst:
         path = dst.name
         with TestPipeline() as p:
             _ = p \
             | Create(self.RECORDS, reshuffle=False) \
             | WriteToParquet(
                 path, self.SCHEMA, num_shards=1, shard_name_template='')
         with TestPipeline() as p:
             # json used for stable sortability
             readback = \
                 p \
                 | ReadFromParquetBatched(path)
             assert_that(readback, equal_to([self._records_as_arrow()]))
Example 7
 def test_write_display_data(self):
   file_name = 'some_parquet_sink'
   write = WriteToParquet(file_name, self.SCHEMA)
   dd = DisplayData.create_from(write)
   expected_items = [
       DisplayDataItemMatcher('codec', 'none'),
       DisplayDataItemMatcher('schema', str(self.SCHEMA)),
       DisplayDataItemMatcher('row_group_buffer_size', str(64 * 1024 * 1024)),
       DisplayDataItemMatcher(
           'file_pattern',
           'some_parquet_sink-%(shard_num)05d-of-%(num_shards)05d'),
       DisplayDataItemMatcher('compression', 'uncompressed')
   ]
   hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example 8
 def test_batched_read(self):
     with TemporaryDirectory() as tmp_dirname:
          path = os.path.join(tmp_dirname, "tmp_filename")
         with TestPipeline() as p:
             _ = p \
             | Create(self.RECORDS, reshuffle=False) \
             | WriteToParquet(
                 path, self.SCHEMA, num_shards=1, shard_name_template='')
         with TestPipeline() as p:
             # json used for stable sortability
             readback = \
                 p \
                 | ReadFromParquetBatched(path)
             assert_that(readback, equal_to([self._records_as_arrow()]))
Example 9
 def test_sink_transform(self):
   with tempfile.NamedTemporaryFile() as dst:
     path = dst.name
     with TestPipeline() as p:
       _ = p \
       | Create(self.RECORDS) \
       | WriteToParquet(
           path, self.SCHEMA, num_shards=1, shard_name_template='')
     with TestPipeline() as p:
       # json used for stable sortability
       readback = \
           p \
           | ReadFromParquet(path) \
           | Map(json.dumps)
       assert_that(readback, equal_to([json.dumps(r) for r in self.RECORDS]))
Example 10
 def test_sink_transform(self):
   with TemporaryDirectory() as tmp_dirname:
      path = os.path.join(tmp_dirname, "tmp_filename")
     with TestPipeline() as p:
       _ = p \
       | Create(self.RECORDS) \
       | WriteToParquet(
           path, self.SCHEMA, num_shards=1, shard_name_template='')
     with TestPipeline() as p:
       # json used for stable sortability
       readback = \
           p \
           | ReadFromParquet(path) \
           | Map(json.dumps)
       assert_that(readback, equal_to([json.dumps(r) for r in self.RECORDS]))
Example 11
 def test_sink_transform_compressed(self, compression_type):
   if compression_type == 'lz4' and ARROW_MAJOR_VERSION == 1:
      self.skipTest(
         "Writing with LZ4 compression is not supported in "
         "pyarrow 1.x")
   with TemporaryDirectory() as tmp_dirname:
      path = os.path.join(tmp_dirname, "tmp_filename")
     with TestPipeline() as p:
       _ = p \
       | Create(self.RECORDS) \
       | WriteToParquet(
           path, self.SCHEMA, codec=compression_type,
           num_shards=1, shard_name_template='')
     with TestPipeline() as p:
       # json used for stable sortability
       readback = \
           p \
           | ReadFromParquet(path + '*') \
           | Map(json.dumps)
       assert_that(readback, equal_to([json.dumps(r) for r in self.RECORDS]))
Example 12
 def test_sink_transform_compliant_nested_type(self):
   if ARROW_MAJOR_VERSION < 4:
      self.skipTest(
         'Writing with compliant nested type is only '
         'supported in pyarrow 4.x and above')
   with TemporaryDirectory() as tmp_dirname:
      path = os.path.join(tmp_dirname, 'tmp_filename')
     with TestPipeline() as p:
       _ = p \
       | Create(self.RECORDS_NESTED) \
       | WriteToParquet(
           path, self.SCHEMA_NESTED, num_shards=1,
           shard_name_template='', use_compliant_nested_type=True)
     with TestPipeline() as p:
       # json used for stable sortability
       readback = \
           p \
           | ReadFromParquet(path) \
           | Map(json.dumps)
       assert_that(
           readback, equal_to([json.dumps(r) for r in self.RECORDS_NESTED]))
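
The two tests above gate on an ARROW_MAJOR_VERSION constant defined elsewhere in the test module. One plausible way to derive it from the installed pyarrow, shown here only as an assumption about how such a constant could be obtained:

import pyarrow as pa

# Major version of the installed pyarrow, e.g. 4 for "4.0.1".
ARROW_MAJOR_VERSION = int(pa.__version__.split('.')[0])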
Example 13
     # This function computes the schema of the parquet file to write, applying the column renames to the original schema
    def getSchema():
        df_schema = pyarrow.Schema.from_pandas(
            pd.read_parquet(user_options.schema_source.get()))
        for (key, value) in ast.literal_eval(
                user_options.rename_columns.get()).items():
            df_schema = df_schema.set(
                df_schema.get_field_index(key),
                pyarrow.field(value,
                              df_schema.types[df_schema.get_field_index(key)]))
        return df_schema

     # This reads the source parquet files and builds the dictionary mapping the columns to be renamed
    map_rename_cols = (
        p | "Read for rename cols" >> ReadFromParquet(user_options.url_raw)
        | "Map rename cols" >> beam.Map(mapRenameCols)
        | "Rename cols to string" >> beam.Map(str)
        | "Deduplicate elements" >> beam.Distinct())
     # This reads the data from the source files
    data = (p
            | "Read parquet for data" >> ReadFromParquet(user_options.url_raw))
     # This applies the column-renaming function and receives the result of the previous step as the mapping dictionary
    rename_data = (data | "Rename columns" >> beam.Map(
        reColumns, rename_cols=AsList(map_rename_cols)))
     # This writes the data to the destination path, obtaining the schema from the getSchema function
    _ = (rename_data | "Write to storage TRN" >> WriteToParquet(
        user_options.url_trn, schema=getSchema(), file_name_suffix=".parquet"))

print("End Pipeline")
Example 14
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    parser.add_argument('--format',
                        dest='format',
                        default='text',
                        help='Supported output file formats: %s.' % FORMATS)
    known_args, pipeline_args = parser.parse_known_args(argv)

    if known_args.format not in FORMATS:
        raise ValueError('--format should be one of: %s' % FORMATS)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    # Read the text file[pattern] into a PCollection.
    lines = p | 'read' >> ReadFromText(known_args.input)

    # Count the occurrences of each word.
    def count_ones(word_ones):
        (word, ones) = word_ones
        return (word, sum(ones))

    counts = (lines
              | 'split' >>
              (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
              | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
              | 'group' >> beam.GroupByKey()
              | 'count' >> beam.Map(count_ones))

    # Format the counts into a PCollection of strings.
    def format_text(word_count):
        (word, count) = word_count
        return '%s: %d' % (word, count)

    # Format the counts into a PCollection of dictionary strings.

    def format_dict(word_count):
        (word, count) = word_count
        row = dict(zip(HEADER, [word, count]))
        return row

    if known_args.format == 'text':
        output = counts | 'format text' >> beam.Map(format_text)

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | 'write text' >> WriteToText(known_args.output)
    elif known_args.format == 'avro':
        output = counts | 'format avro' >> beam.Map(format_dict)

        schema = avro.schema.parse(json.dumps(AVRO_SCHEMA))

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | 'write avro' >> WriteToAvro(
            file_path_prefix=known_args.output,
            schema=schema,
            codec=DEFAULT_CODEC)
    else:
        output = counts | 'format parquet' >> beam.Map(format_dict)

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | 'write parquet' >> WriteToParquet(
            file_path_prefix=known_args.output,
            schema=PARQUET_SCHEMA,
            codec=DEFAULT_CODEC)

    result = p.run()
    result.wait_until_finish()

    # Do not query metrics when creating a template which doesn't run
    if (not hasattr(result, 'has_job')  # direct runner
            or result.has_job):  # not just a template creation
        empty_lines_filter = MetricsFilter().with_name('empty_lines')
        query_result = result.metrics().query(empty_lines_filter)
        if query_result['counters']:
            empty_lines_counter = query_result['counters'][0]
            logging.info('number of empty lines: %d',
                         empty_lines_counter.result)

        word_lengths_filter = MetricsFilter().with_name('word_len_dist')
        query_result = result.metrics().query(word_lengths_filter)
        if query_result['distributions']:
            word_lengths_dist = query_result['distributions'][0]
            logging.info('average word length: %d',
                         word_lengths_dist.result.mean)
Example 15
def run(known_args, pipeline_args):
    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    logging.getLogger().setLevel(logging.INFO)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    # Read the text file[pattern] into a PCollection.
    lines = p | 'read' >> ReadFromText(known_args.input)

    # Count the occurrences of each word.
    def count_ones(word_ones):
        (word, ones) = word_ones
        return (word, sum(ones))

    counts = (lines | 'split' >>
              (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
              | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
              | 'group' >> beam.GroupByKey() | 'count' >> beam.Map(count_ones))

    # Format the counts into a PCollection of strings.
    def format_text(word_count):
        (word, count) = word_count
        return '%s: %d' % (word, count)

    # Format the counts into a PCollection of dictionary strings.

    def format_dict(word_count):
        (word, count) = word_count
        row = dict(zip(HEADER, [word, count]))
        return row

    if known_args.format == 'text':
        output = counts | 'format text' >> beam.Map(format_text)

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | 'write text' >> WriteToText(known_args.output)
    elif known_args.format == 'avro':
        output = counts | 'format avro' >> beam.Map(format_dict)

        schema = avro.schema.parse(json.dumps(AVRO_SCHEMA))

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | 'write avro' >> WriteToAvro(
            file_path_prefix=known_args.output,
            schema=schema,
            codec=DEFAULT_CODEC)
    else:
        output = counts | 'format parquet' >> beam.Map(format_dict)

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | 'write parquet' >> WriteToParquet(
            file_path_prefix=known_args.output,
            schema=PARQUET_SCHEMA,
            codec=DEFAULT_CODEC)

    result = p.run()
    result.wait_until_finish()

    # Do not query metrics when creating a template which doesn't run
    if (not hasattr(result, 'has_job')  # direct runner
            or result.has_job):  # not just a template creation
        empty_lines_filter = MetricsFilter().with_name('empty_lines')
        query_result = result.metrics().query(empty_lines_filter)
        if query_result['counters']:
            empty_lines_counter = query_result['counters'][0]
            logging.info('number of empty lines: %d',
                         empty_lines_counter.result)

        word_lengths_filter = MetricsFilter().with_name('word_len_dist')
        query_result = result.metrics().query(word_lengths_filter)
        if query_result['distributions']:
            word_lengths_dist = query_result['distributions'][0]
            logging.info('average word length: %d',
                         word_lengths_dist.result.mean)
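
Both run() variants above rely on module-level constants (FORMATS, HEADER, DEFAULT_CODEC, AVRO_SCHEMA, PARQUET_SCHEMA) and a WordExtractingDoFn defined elsewhere in the example. A sketch of plausible constant definitions, given purely as assumptions so the snippets read self-contained:

import pyarrow as pa

# Hypothetical values; the actual example module defines its own.
FORMATS = ['text', 'avro', 'parquet']
HEADER = ['word', 'count']
DEFAULT_CODEC = 'snappy'
PARQUET_SCHEMA = pa.schema([('word', pa.string()), ('count', pa.int64())])
AVRO_SCHEMA = {
    'namespace': 'example.wordcount',
    'type': 'record',
    'name': 'WordCount',
    'fields': [
        {'name': 'word', 'type': 'string'},
        {'name': 'count', 'type': 'int'},
    ],
}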