from transforms.api import transform_df, incremental, Input, Output
from pyspark.sql import functions as F

# OUTPUT_DIR and INPUT_DIR are module-level constants defined elsewhere.


def generate_transform(system_source_tuples):
    transforms = []
    for system, source in system_source_tuples:

        @incremental(snapshot_inputs=['input_df'])
        @transform_df(
            Output("{path}/l31_{system}_{source}_test".format(
                path=OUTPUT_DIR, system=system, source=source)),
            input_df=Input("{path}/l31_{system}_{source}_all".format(
                path=INPUT_DIR, system=system, source=source)),
        )
        def checks_consolidated(input_df):
            # Aggregate record counts and key delivered-quantity measures per
            # source system, stamped with the run time so each incremental
            # append is traceable.
            return input_df \
                .groupBy(input_df.SOURCE_SYSTEM) \
                .agg(F.count(input_df.REF_INV_ID).alias('Count'),
                     F.sum(input_df.DLVD_QTY_WT).alias('DLVD_QTY_WT'),
                     F.sum(input_df.DLVD_QTY_VOL).alias('DLVD_QTY_VOL'),
                     F.sum(input_df.INVG_VAL).alias('INVG_VAL')) \
                .withColumn("Timestamp", F.current_timestamp()) \
                .withColumn('Dataset_Name', F.lit('l31_isp_iop_line_all'))

        transforms.append(checks_consolidated)
    return transforms
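# A minimal sketch of how a generator like generate_transform above is
# typically registered with a Foundry pipeline. The module path
# "myproject.datasets.checks" and the (system, source) tuples are assumptions
# for illustration, not part of the original code.
from transforms.api import Pipeline

from myproject.datasets import checks

my_pipeline = Pipeline()
my_pipeline.add_transforms(*checks.generate_transform([('isp', 'iop')]))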
from transforms.api import transform_df, Input, Output
from pyspark.sql import functions as F
from pyspark.sql.functions import col, row_number
from pyspark.sql.window import Window

# OUTPUT_DIR and INPUT_DIR are module-level constants defined elsewhere.


def generate_transform(country, source):

    @transform_df(
        Output("{path}/{country}/l3_{source}_{country}".format(
            path=OUTPUT_DIR, country=country, source=source)),
        input_df=Input("{path}/{country}/l2_{source}_{country}".format(
            path=INPUT_DIR, country=country, source=source)),
    )
    def l2_to_l3(input_df):
        df = input_df
        # Keep only the most recent record per INV_ID, ranked by
        # LAST_UPDT_DATE_TIME descending (the window ordering makes a prior
        # global sort unnecessary).
        window = Window.partitionBy(df['INV_ID']) \
                       .orderBy(df['LAST_UPDT_DATE_TIME'].desc())
        df = df.select('*', row_number().over(window).alias('row_number')) \
               .filter(col('row_number') == 1)
        # Keep only records that have not been logically deleted.
        delete_is_null = F.isnull(F.col("LOG_DEL_IND"))
        df = df.where(delete_is_null)
        return df

    return l2_to_l3
from transforms.api import transform_df, Output
from pyspark.sql.types import LongType


@transform_df(
    Output("@output_path/@name")
)
def function(ctx):
    # "@output_path", "@name", and "@ids" are template placeholders that are
    # substituted when this code template is instantiated.
    id_list = @ids
    df = ctx.spark_session.createDataFrame(id_list, LongType())
    # createDataFrame names the single column "value" by default; rename it.
    df = df.withColumnRenamed("value", "id")
    return df
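# For illustration, a hypothetical instantiation of the template above, with
# assumed values substituted for @output_path, @name, and @ids.
from transforms.api import transform_df, Output
from pyspark.sql.types import LongType


@transform_df(
    Output("/Project/reference/allowed_ids")
)
def function(ctx):
    id_list = [101, 102, 103]  # assumed sample ids
    df = ctx.spark_session.createDataFrame(id_list, LongType())
    df = df.withColumnRenamed("value", "id")
    return df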
# Script to perform daily checks on ISP Datasets on Delta Files
#
# ver  date      author         comment
# ===  ========  =============  ==================
# 1.0  20/02/20  Rachit Saxena  Initial version

from transforms.api import transform_df, incremental, Input, Output
from pyspark.sql import functions as F
from pyspark.sql.functions import col


@incremental(snapshot_inputs=['input_df'])
@transform_df(
    Output(
        "/BP/Downstream-AirBP-DD-Insight_Hub/5 Testing/LCP Daily Checks/lcpcostelement"
    ),
    input_df=Input(
        "/BP/Downstream-AirBP-DD-Insight_Hub/3 Publish/Insight Hub/LCP/l6_all_lcpcostelement"
    ),
)
def checks_consolidated(input_df):
    # Daily snapshot of record counts and gross profit (USD) by source system,
    # billing year/month, country, and plant, stamped with the run time.
    return input_df \
        .withColumn("TIMESTAMP", F.current_timestamp()) \
        .groupBy(input_df.source_system,
                 F.year(input_df.billing_date).alias('YEAR'),
                 F.month(input_df.billing_date).alias('MONTH'),
                 input_df.country,
                 input_df.plant,
                 col("TIMESTAMP")) \
        .agg(F.count(input_df.source_system).alias('COUNT'),
             F.sum(input_df.wf_cond_value_gc).alias('GP_USD'))
from transforms.api import transform, Input, Output
from act.utils import get_newest_payload
import tempfile
import shutil
import zipfile


@transform(
    processed=Output("/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/transform/00 - unzipped/unzipped_raw_data"),
    zip_file=Input("/UNITE/Data Ingestion & OMOP Mapping/raw_data/Zipped Datasets/site_411_act_raw_zips"),
)
def unzip(ctx, processed, zip_file):
    fs = zip_file.filesystem()
    files_df = fs.files(regex=r"(?i).*incoming.*\.zip")
    newest_file = get_newest_payload(files_df)

    # Create a temp file to pass to the zip library
    with tempfile.NamedTemporaryFile() as t:
        # Copy contents of the file from Foundry into the temp file
        with fs.open(newest_file, 'rb') as f:
            shutil.copyfileobj(f, t)
            t.flush()
        z = zipfile.ZipFile(t.name)
        # For each file in the zip, unzip it and add it to the output dataset
        for filename in z.namelist():
            with processed.filesystem().open(filename, 'wb') as out:
                input_file = z.open(filename)
                CHUNK_SIZE = 100 * 1024 * 1024  # Read and write 100 MB chunks
                data = input_file.read(CHUNK_SIZE)
                while data:
                    out.write(data)
                    data = input_file.read(CHUNK_SIZE)
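# act.utils.get_newest_payload is site-specific and not shown in this
# snapshot. A minimal sketch of what such a helper might do, assuming the
# standard (path, size, modified) columns that FileSystem.files() returns;
# this is an illustration, not the actual implementation.
from pyspark.sql import functions as F


def get_newest_payload(files_df):
    # Return the path of the most recently modified matching file, or None.
    newest = files_df.orderBy(F.col("modified").desc()).take(1)
    return newest[0].path if newest else None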
from transforms.api import transform_df, Input, Output, Check
from transforms import expectations as E
from pyspark.sql import functions as F
from pyspark.sql import types as T

# The snippet begins mid-dictionary; the imports, the opening of the dict,
# and the 'PATIENT_DIMENSION' key are reconstructed from the dataset paths.
domains = {
    'PATIENT_DIMENSION': Input(
        '/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/transform/01 - parsed/patient_dimension'
    ),
    'VISIT_DIMENSION': Input(
        '/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/transform/01 - parsed/visit_dimension'
    ),
}


@transform_df(
    Output(
        "/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/metadata/data_counts_check",
        checks=[
            Check(E.count().gt(0), 'Valid data counts file provided', on_error='WARN'),
            Check(E.col('delta_row_count').equals(0), 'Parsed row count equals loaded row count', on_error='WARN'),
        ]),
    site_counts=Input(
        "/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/metadata/data_counts_parsed"
    ),
    **domains)
def my_compute_function(ctx, site_counts, **domains):
    data = []
    for domain_name, domain_df in domains.items():
        row_count = domain_df.count()
        data.append((domain_name.lower(), row_count))

    # Assumed completion (the original is truncated here): compare the loaded
    # row counts against the site-provided counts and surface the difference
    # as 'delta_row_count', which the expectation above checks against zero.
    # site_counts is assumed to carry ('domain', 'row_count') columns.
    schema = T.StructType([
        T.StructField('domain', T.StringType(), True),
        T.StructField('loaded_row_count', T.LongType(), True),
    ])
    loaded_counts = ctx.spark_session.createDataFrame(data, schema)
    return loaded_counts.join(site_counts, on='domain', how='outer') \
        .withColumn('delta_row_count',
                    F.col('loaded_row_count') - F.col('row_count'))
from transforms.api import transform, Input, Output
from act.parsing import parse_input
from act.utils import get_site_id
import pyspark.sql.functions as F

domain = "concept_dimension"
regex = r"(?i).*/{domain}\.csv".format(domain=domain.upper())


@transform(
    processed=Output('/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/transform/01 - parsed/concept_dimension'),
    my_input=Input('/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/transform/00 - unzipped/unzipped_raw_data'),
    payload=Input("/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/transform/00 - unzipped/payload_filename"),
    site_id_df=Input('/UNITE/Data Ingestion & OMOP Mapping/raw_data/data partner id tables/Data Partner IDs - Site 411'),
    errors=Output('/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/transform/01 - parsed/errors/concept_dimension'),
)
def compute_function(ctx, my_input, payload, site_id_df, processed, errors):
    """
    By default, parse_input uses the csv-processing function found in
    parsing.py (default delimiter: "|"). Any rows that are formatted
    incorrectly, e.g. that do not have the appropriate number of fields for
    the given domain schema, are written to the 'errors' dataset.
    """
    site_id = get_site_id(site_id_df)
    processed_df = parse_input(ctx, my_input, errors, site_id, domain, regex)
    # Tag every row with the name of the payload currently being processed.
    payload_filename = payload.dataframe() \
        .where(F.col("newest_payload")).take(1)[0].payload
    processed_df = processed_df.withColumn("payload", F.lit(payload_filename))
    processed.write_dataframe(processed_df)
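# The docstring above summarizes parse_input's contract. A minimal sketch of a
# helper with the same shape, assuming a hypothetical DOMAIN_COLUMNS registry
# of expected column names per domain and a pipe-delimited format; the real
# act.parsing implementation is more involved (it also stamps the site id).
from pyspark.sql import types as T


def parse_input_sketch(ctx, my_input, errors, site_id, domain, regex, delimiter="|"):
    columns = DOMAIN_COLUMNS[domain]  # hypothetical schema registry
    good, bad = [], []
    fs = my_input.filesystem()
    for file_status in fs.files(regex=regex).collect():
        with fs.open(file_status.path) as fh:
            for line in fh:
                parts = line.rstrip("\n").split(delimiter)
                if len(parts) == len(columns):
                    good.append(parts)
                else:
                    # Malformed rows are routed to the errors dataset.
                    bad.append((line,))
    error_schema = T.StructType([T.StructField("raw_line", T.StringType(), True)])
    errors.write_dataframe(ctx.spark_session.createDataFrame(bad, error_schema))
    data_schema = T.StructType([T.StructField(c, T.StringType(), True) for c in columns])
    return ctx.spark_session.createDataFrame(good, data_schema)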
from transforms.api import transform, Input, Output, Check
from transforms import expectations as E
import csv
import tempfile
import shutil
from pyspark.sql import Row


@transform(
    processed=Output(
        "/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/metadata/control_map",
        checks=Check(E.count().gt(0), 'Valid CONTROL_MAP file provided by site', on_error='WARN')
    ),
    my_input=Input("/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/transform/00 - unzipped/unzipped_raw_data"),
)
def my_compute_function(ctx, my_input, processed):
    fs = my_input.filesystem()

    def process_file(file_status):
        with tempfile.NamedTemporaryFile() as t:
            # Copy contents of the file from Foundry into the temp file
            with fs.open(file_status.path, 'rb') as f_bytes:
                shutil.copyfileobj(f_bytes, t)
                t.flush()
            # Read the csv line by line, using csv.Sniffer to infer the delimiter
            with open(t.name, newline="") as f:
                try:
                    dialect = csv.Sniffer().sniff(f.read(1024))
                    f.seek(0)
                    r = csv.reader(f, delimiter=dialect.delimiter, escapechar="\\")
                    # Assumed completion (the original is truncated here):
                    # yield one Row per csv record.
                    for row in r:
                        yield Row(*row)
                except csv.Error:
                    # Sniffing fails on empty or malformed files; emit nothing.
                    return

    # Assumed completion: parse every CONTROL_MAP csv shipped in the payload
    # and write the collected rows (the file-name regex is an assumption).
    files_df = fs.files(regex=r"(?i).*control_map.*\.csv")
    rows = files_df.rdd.flatMap(process_file)
    processed.write_dataframe(ctx.spark_session.createDataFrame(rows))
from transforms.api import transform, Input, Output
import csv
import tempfile
import shutil
from pyspark.sql import Row


@transform(
    processed=Output(
        "/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/metadata/n3c_vocab_map"
    ),
    my_input=Input(
        "/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/transform/00 - unzipped/unzipped_raw_data"
    ),
)
def my_compute_function(ctx, my_input, processed):
    fs = my_input.filesystem()

    def process_file(file_status):
        with tempfile.NamedTemporaryFile() as t:
            # Copy contents of the file from Foundry into the temp file
            with fs.open(file_status.path, 'rb') as f_bytes:
                shutil.copyfileobj(f_bytes, t)
                t.flush()
            # Read the csv line by line, using csv.Sniffer to infer the delimiter
            with open(t.name, newline="") as f:
                try:
                    dialect = csv.Sniffer().sniff(f.read(1024))
                    f.seek(0)
                    r = csv.reader(f, delimiter=dialect.delimiter,
                                   escapechar="\\")
                    # Assumed completion (the original is truncated mid-call):
                    # yield one Row per csv record.
                    for row in r:
                        yield Row(*row)
                except csv.Error:
                    # Sniffing fails on empty or malformed files; emit nothing.
                    return

    # Assumed completion: parse every vocab-map csv shipped in the payload and
    # write the collected rows (the file-name regex is an assumption).
    files_df = fs.files(regex=r"(?i).*vocab.*\.csv")
    rows = files_df.rdd.flatMap(process_file)
    processed.write_dataframe(ctx.spark_session.createDataFrame(rows))
from transforms.api import transform, Input, Output, Check
from transforms import expectations as E
from pyspark.sql import Row
import csv
import tempfile
import shutil
from datetime import datetime
from pyspark.sql import functions as F
from act import omop_schemas


@transform(
    processed=Output(
        "/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/metadata/manifest",
        checks=Check(E.negate(E.col("SITE_NAME").equals("[No manifest provided]")),
                     'Valid manifest provided by site', on_error='WARN')),
    my_input=Input(
        "/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/transform/00 - unzipped/unzipped_raw_data"
    ),
    site_id_df=Input(
        '/UNITE/Data Ingestion & OMOP Mapping/raw_data/data partner id tables/Data Partner IDs - Site 411'
    ),
    data_partner_ids=Input(
        "/UNITE/Data Ingestion & OMOP Mapping/raw_data/data partner id tables/Data Partner IDs - ALL"
    ),
    omop_vocab=Input("/UNITE/OMOP Vocabularies/vocabulary"),
    control_map=Input(
        "/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/metadata/control_map"
    ))
def my_compute_function(ctx, my_input, site_id_df, data_partner_ids,
                        omop_vocab, control_map, processed):
    # Assumed completion (the original file is truncated after the decorator):
    # write the site's MANIFEST csv if one was shipped in the payload, and
    # fall back to a placeholder row otherwise, which trips the 'Valid
    # manifest provided by site' expectation above.
    fs = my_input.filesystem()
    manifest_files = fs.files(regex=r"(?i).*manifest.*\.csv")
    if manifest_files.count() == 0:
        processed.write_dataframe(ctx.spark_session.createDataFrame(
            [Row(SITE_NAME="[No manifest provided]")]))
        return
    newest = manifest_files.orderBy(F.col("modified").desc()).take(1)[0].path
    # Read the newest manifest with Spark's csv reader (assumes a header row).
    manifest_df = ctx.spark_session.read \
        .option("header", True).csv("{}/{}".format(fs.hadoop_path, newest))
    processed.write_dataframe(manifest_df)
from transforms.api import transform, Input, Output
from act.utils import get_newest_payload
from pyspark.sql import functions as F
import pyspark.sql.types as T
import os


@transform(
    payload_filename=Output("/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/transform/00 - unzipped/payload_filename"),
    zip_file=Input("/UNITE/Data Ingestion & OMOP Mapping/raw_data/Zipped Datasets/site_411_act_raw_zips"),
)
def unzip(ctx, payload_filename, zip_file):
    fs = zip_file.filesystem()
    files_df = fs.files(regex=r"(?i).*incoming.*\.zip")
    newest_file = get_newest_payload(files_df)
    # Flag the newest payload so downstream transforms can filter on it.
    files_df = files_df.withColumn(
        "newest_payload",
        F.when(F.col("path") == newest_file, F.lit(True)).otherwise(F.lit(False)))
    # Reduce each path to its basename for reporting.
    get_basename = F.udf(lambda x: os.path.basename(x), T.StringType())
    files_df = files_df.withColumn("payload", get_basename(F.col("path")))
    payload_filename.write_dataframe(files_df.select("payload", "newest_payload"))
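# The basename UDF above can be swapped for Spark built-ins, which avoids
# Python UDF serialization overhead; element_at and split are standard
# pyspark.sql functions. A minimal sketch, applicable to any DataFrame with a
# string 'path' column:
from pyspark.sql import functions as F


def with_basename(df):
    # The last path segment is the basename.
    return df.withColumn("payload", F.element_at(F.split(F.col("path"), "/"), -1))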
from transforms.api import transform_df, Input, Output
from pyspark.sql import types as T
from pyspark.sql import functions as F


@transform_df(
    Output(
        "/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/metadata/payload_status"
    ),
    parsed_df=Input(
        "/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/transform/00 - unzipped/payload_filename"
    ),
    passed_pipeline_df=Input(
        "/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/union_staging/person"
    ),
    site_id=Input(
        "/UNITE/Data Ingestion & OMOP Mapping/raw_data/data partner id tables/Data Partner IDs - Site 411"
    ))
def my_compute_function(ctx, parsed_df, passed_pipeline_df, site_id):
    # parsed_df tells us the payload currently being processed in the
    # transformation pipeline.
    parsed_payload = parsed_df.where(
        F.col("newest_payload")).take(1)[0].payload

    # passed_pipeline_df tells us the payload that has successfully made it
    # through the pipeline (but has not yet been released).
    passed_pipeline_payload = passed_pipeline_df.select(
        "payload").distinct().take(1)[0].payload

    data_partner_id = int(site_id.take(1)[0].data_partner_id)

    schema = T.StructType([
        T.StructField('parsed_payload', T.StringType(), True),
        T.StructField('unreleased_payload', T.StringType(), True),
        T.StructField('data_partner_id', T.IntegerType(), True),
    ])
    # Assumed completion (the original is truncated here): emit a single
    # status row, mapping the payload that passed the pipeline to
    # 'unreleased_payload'.
    return ctx.spark_session.createDataFrame(
        [(parsed_payload, passed_pipeline_payload, data_partner_id)], schema)