def test_read_display_data(self):
  file_name = 'some_parquet_source'
  read = \
      ReadFromParquet(
          file_name,
          validate=False)
  dd = DisplayData.create_from(read)

  expected_items = [
      DisplayDataItemMatcher('compression', 'auto'),
      DisplayDataItemMatcher('file_pattern', file_name)]
  hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
def test_sink_transform(self):
  with tempfile.NamedTemporaryFile() as dst:
    path = dst.name
    with TestPipeline() as p:
      _ = p \
          | Create(self.RECORDS) \
          | WriteToParquet(
              path, self.SCHEMA, num_shards=1, shard_name_template='')
    with TestPipeline() as p:
      # json used for stable sortability
      readback = \
          p \
          | ReadFromParquet(path) \
          | Map(json.dumps)
      assert_that(readback, equal_to([json.dumps(r) for r in self.RECORDS]))
def test_sink_transform(self):
  with TemporaryDirectory() as tmp_dirname:
    path = os.path.join(tmp_dirname, 'tmp_filename')
    with TestPipeline() as p:
      _ = p \
          | Create(self.RECORDS) \
          | WriteToParquet(
              path, self.SCHEMA, num_shards=1, shard_name_template='')
    with TestPipeline() as p:
      # json used for stable sortability
      readback = \
          p \
          | ReadFromParquet(path) \
          | Map(json.dumps)
      assert_that(readback, equal_to([json.dumps(r) for r in self.RECORDS]))
def test_sink_transform_compressed(self, compression_type):
  if compression_type == 'lz4' and ARROW_MAJOR_VERSION == 1:
    self.skipTest(
        "Writing with LZ4 compression is not supported in pyarrow 1.x")
  with TemporaryDirectory() as tmp_dirname:
    path = os.path.join(tmp_dirname, 'tmp_filename')
    with TestPipeline() as p:
      _ = p \
          | Create(self.RECORDS) \
          | WriteToParquet(
              path,
              self.SCHEMA,
              codec=compression_type,
              num_shards=1,
              shard_name_template='')
    with TestPipeline() as p:
      # json used for stable sortability
      readback = \
          p \
          | ReadFromParquet(path + '*') \
          | Map(json.dumps)
      assert_that(readback, equal_to([json.dumps(r) for r in self.RECORDS]))
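# The compressed-sink test above takes a compression_type argument, which
# suggests it is driven by a parameterized test decorator. A minimal sketch of
# how such a parameterization might look; the `parameterized` library and the
# codec list here are assumptions, not taken from the original suite.
import unittest

from parameterized import parameterized


class ParquetSinkCompressionTest(unittest.TestCase):

  @parameterized.expand(['snappy', 'gzip', 'brotli', 'lz4', 'zstd'])
  def test_sink_transform_compressed(self, compression_type):
    ...  # same body as the test shown above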
def test_sink_transform_compliant_nested_type(self):
  if ARROW_MAJOR_VERSION < 4:
    self.skipTest(
        'Writing with compliant nested type is only '
        'supported in pyarrow 4.x and above')
  with TemporaryDirectory() as tmp_dirname:
    path = os.path.join(tmp_dirname, 'tmp_filename')
    with TestPipeline() as p:
      _ = p \
          | Create(self.RECORDS_NESTED) \
          | WriteToParquet(
              path,
              self.SCHEMA_NESTED,
              num_shards=1,
              shard_name_template='',
              use_compliant_nested_type=True)
    with TestPipeline() as p:
      # json used for stable sortability
      readback = \
          p \
          | ReadFromParquet(path) \
          | Map(json.dumps)
      assert_that(
          readback, equal_to([json.dumps(r) for r in self.RECORDS_NESTED]))
# This function computes the schema of the parquet file to be written,
# applying the column renames to the original schema.
def getSchema():
    df_schema = pyarrow.Schema.from_pandas(
        pd.read_parquet(user_options.schema_source.get()))
    for (key, value) in ast.literal_eval(
            user_options.rename_columns.get()).items():
        df_schema = df_schema.set(
            df_schema.get_field_index(key),
            pyarrow.field(
                value, df_schema.types[df_schema.get_field_index(key)]))
    return df_schema


# Read the source parquet files and compute the dictionary with the mapping
# of the columns to be renamed.
map_rename_cols = (
    p
    | "Read for rename cols" >> ReadFromParquet(user_options.url_raw)
    | "Map rename cols" >> beam.Map(mapRenameCols)
    | "Rename cols to string" >> beam.Map(str)
    | "Deduplicate elements" >> beam.Distinct())

# Read the data from the source files.
data = (p | "Read parquet for data" >> ReadFromParquet(user_options.url_raw))

# Apply the column-rename function, receiving the rename mapping from the
# previous step as a side input.
rename_data = (
    data
    | "Rename columns" >> beam.Map(
        reColumns, rename_cols=AsList(map_rename_cols)))

# Write the data to the destination path, getting the schema from getSchema().
_ = (
    rename_data
    | "Write to storage TRN" >> WriteToParquet(
        user_options.url_trn,
        schema=getSchema(),
        file_name_suffix=".parquet"))

print("End Pipeline")
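# The pipeline above references two helpers, mapRenameCols and reColumns, that
# are not shown in the snippet. A possible sketch of what they could look like
# given how they are used (a per-record rename-mapping extractor and a
# per-record column renamer); both bodies are assumptions, not the original
# implementations.
import ast


def mapRenameCols(record):
    # Hypothetical: build an {old_name: new_name} mapping from the record's
    # keys, e.g. normalizing them to storage-friendly column names.
    return {k: k.strip().lower().replace(" ", "_") for k in record.keys()}


def reColumns(record, rename_cols):
    # Hypothetical: rename_cols arrives as an AsList side input of stringified
    # dicts (see "Rename cols to string" above); merge them and rename keys.
    mapping = {}
    for entry in rename_cols:
        mapping.update(ast.literal_eval(entry))
    return {mapping.get(k, k): v for k, v in record.items()}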
if "timePartitioning" in metadata_table_bq.keys( ) or "rangePartitioning" in metadata_table_bq.keys(): if "timePartitioning" in metadata_table_bq.keys(): print("time Partitioning") bq_param = metadata_table_bq["timePartitioning"] # bq_param = {"timePartitioning": {"type": metadata_table_bq["timePartitioning"]["type"],"field": metadata_table_bq["timePartitioning"]["field"]}} elif "rangePartitioning" in metadata_table_bq.keys(): print("range Partitioning") bq_param2 = metadata_table_bq["rangePartitioning"] bq_param = { "rangePartitioning": { "field": metadata_table_bq["rangePartitioning"]["field"], "range": metadata_table_bq["rangePartitioning"]["range"] } } data = p | "Read from storage HOM" >> ReadFromParquet( user_options.url_hom) print(bq_param2) print(bq_param) # data | beam.Map(print) data | "Write to BQ HOM" >> WriteToBigQuery( # table=user_options.table_id, table='yas-dev-sii-pid:sii_yas_de_hom.hom_cuentas_test', # project=metadata_table_bq['tableReference']['projectId'], additional_bq_parameters=bq_param, schema=metadata_table_bq['schema'], # method='FILE_LOADS', create_disposition='CREATE_IF_NEEDED', # create_disposition='CREATE_NEVER', write_disposition='WRITE_TRUNCATE', # write_disposition='WRITE_APPEND', # custom_gcs_temp_location=user_options.custom_bq_temp_loc.get()