Code Example #1
from transforms.api import transform_df, incremental, Input, Output
from pyspark.sql import functions as F


def generate_transform(system_source_tuples):

    transforms = []

    for system_source_tuple in system_source_tuples:
        system, source = system_source_tuple

        @incremental(snapshot_inputs=['input_df'])
        @transform_df(
            Output("{path}/l31_{system}_{source}_test".format(path=OUTPUT_DIR,
                                                              system=system,
                                                              source=source)),
            input_df=Input("{path}/l31_{system}_{source}_all".format(
                path=INPUT_DIR, system=system, source=source)),
        )
        def checks_consolidated(input_df):
            return input_df \
                .groupBy(input_df.SOURCE_SYSTEM) \
                .agg(F.count(input_df.REF_INV_ID).alias('Count'),
                     F.sum(input_df.DLVD_QTY_WT).alias('DLVD_QTY_WT'),
                     F.sum(input_df.DLVD_QTY_VOL).alias('DLVD_QTY_VOL'),
                     F.sum(input_df.INVG_VAL).alias('INVG_VAL')) \
                .withColumn("Timestamp", F.current_timestamp()) \
                .withColumn('Dataset_Name', F.lit('l31_isp_iop_line_all'))

        transforms.append(checks_consolidated)

    return transforms
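
One way to register transforms returned from a generator like this is to add them to the repository's Pipeline explicitly. A minimal sketch of that registration, assuming a standard Foundry Pipeline object; the module path and the (system, source) pairs below are made up for illustration:

from transforms.api import Pipeline
from myproject.datasets.daily_checks import generate_transform  # hypothetical module path

# Hypothetical (system, source) pairs; the real list depends on the project.
SYSTEM_SOURCE_TUPLES = [("sap", "invoices"), ("jde", "invoices")]

my_pipeline = Pipeline()
my_pipeline.add_transforms(*generate_transform(SYSTEM_SOURCE_TUPLES))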
Code Example #2
File: ISP_IOP_HEADER.py Project: rachitcode/Palantir
from transforms.api import transform_df, Input, Output
from pyspark.sql import functions as F
from pyspark.sql.functions import col, row_number
from pyspark.sql.window import Window


def generate_transform(country, source):
    @transform_df(
        Output("{path}/{country}/l3_{source}_{country}".format(path=OUTPUT_DIR, country=country, source=source)),
        input_df=Input("{path}/{country}/l2_{source}_{country}".format(path=INPUT_DIR, country=country, source=source)),
    )
    def l2_to_l3(input_df):
        df = input_df
        df = df.sort(df.LAST_UPDT_DATE_TIME.desc())
        window = Window.partitionBy(df['INV_ID']).orderBy(df['LAST_UPDT_DATE_TIME'].desc())
        df = df.select('*', row_number().over(window).alias('row_number')).filter(col('row_number') == 1)
        delete_is_null = F.isnull(F.col("LOG_DEL_IND"))
        df = df.where(delete_is_null)
        return df

    return l2_to_l3
Code Example #3
from transforms.api import transform_df, Output
from pyspark.sql.types import StructType, StructField, LongType


@transform_df(
    Output("@output_path/@name")
)
def function(ctx):
    id_list = @ids
    df = ctx.spark_session.createDataFrame(id_list, LongType())
    df = df.withColumnRenamed("value", "id")
    return df
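
The @output_path, @name, and @ids tokens above are template placeholders rather than valid Python; they are presumably substituted when this transform file is generated. With hypothetical values filled in, the generated transform would look roughly like this:

from transforms.api import transform_df, Output
from pyspark.sql.types import LongType


@transform_df(
    Output("/Project/generated/id_list")  # hypothetical substituted output path
)
def function(ctx):
    id_list = [101, 102, 103]  # hypothetical substituted ids
    # Build a single-column DataFrame of longs; the inferred column is named "value"
    df = ctx.spark_session.createDataFrame(id_list, LongType())
    df = df.withColumnRenamed("value", "id")
    return df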
Code Example #4
#       Script to perform daily checks on ISP Datasets on Delta Files
#
#       ver   date     author           comment
#       ===   ======== ================ =========================================================
#       1.0 # 20/02/20 Rachit Saxena   Initial version
from transforms.api import transform_df, incremental, Input, Output
from pyspark.sql import functions as F
from pyspark.sql.functions import col


@incremental(snapshot_inputs=['input_df'])
@transform_df(
    Output(
        "/BP/Downstream-AirBP-DD-Insight_Hub/5 Testing/LCP Daily Checks/lcpcostelement"
    ),
    input_df=Input(
        "/BP/Downstream-AirBP-DD-Insight_Hub/3 Publish/Insight Hub/LCP/l6_all_lcpcostelement"
    ),
)
def checks_consolidated(input_df):
    return input_df \
        .withColumn("TIMESTAMP", F.current_timestamp()) \
        .groupBy(input_df.source_system,
                 F.year(input_df.billing_date).alias('YEAR'),
                 F.month(input_df.billing_date).alias('MONTH'),
                 input_df.country,
                 input_df.plant,
                 col("TIMESTAMP")) \
        .agg(F.count(input_df.source_system).alias('COUNT'),
             F.sum(input_df.wf_cond_value_gc).alias('GP_USD'))
Code Example #5
from transforms.api import transform, Input, Output
from act.utils import get_newest_payload
from pyspark.sql import functions as F
import tempfile
import shutil
import zipfile


@transform(
    processed=Output("/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/transform/00 - unzipped/unzipped_raw_data"),
    zip_file=Input("/UNITE/Data Ingestion & OMOP Mapping/raw_data/Zipped Datasets/site_411_act_raw_zips"),
)
def unzip(ctx, processed, zip_file):
    fs = zip_file.filesystem()
    files_df = fs.files(regex=r"(?i).*incoming.*\.zip")
    newest_file = get_newest_payload(files_df)

    # Create a temp file to pass to zip library
    with tempfile.NamedTemporaryFile() as t:
        # Copy contents of file from Foundry into temp file
        with fs.open(newest_file, 'rb') as f:
            shutil.copyfileobj(f, t)
            t.flush()

        z = zipfile.ZipFile(t.name)
        # For each file in the zip, unzip and add it to output dataset
        for filename in z.namelist():
            with processed.filesystem().open(filename, 'wb') as out:
                input_file = z.open(filename)
                CHUNK_SIZE = 100 * 1024 * 1024  # Read and write 100 MB chunks
                data = input_file.read(CHUNK_SIZE)
                while data:
                    out.write(data)
                    data = input_file.read(CHUNK_SIZE)
                input_file.close()
Code Example #6
from transforms.api import transform_df, Input, Output, Check
from transforms import expectations as E

# Each key in this dict becomes a keyword argument of the transform function below.
domains = {
    'PATIENT_DIMENSION':
    Input(
        '/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/transform/01 - parsed/patient_dimension'
    ),
    'VISIT_DIMENSION':
    Input(
        '/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/transform/01 - parsed/visit_dimension'
    ),
}


@transform_df(
    Output(
        "/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/metadata/data_counts_check",
        checks=[
            Check(E.count().gt(0),
                  'Valid data counts file provided',
                  on_error='WARN'),
            Check(E.col('delta_row_count').equals(0),
                  'Parsed row count equals loaded row count',
                  on_error='WARN'),
        ]),
    site_counts=Input(
        "/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/metadata/data_counts_parsed"
    ),
    **domains)
def my_compute_function(ctx, site_counts, **domains):

    data = []
    for domain_name, domain_df in domains.items():
        row_count = domain_df.count()
        data.append((domain_name.lower(), row_count))
Code Example #7
from transforms.api import transform, Input, Output
from act.parsing import parse_input
from act.utils import get_site_id
import pyspark.sql.functions as F

domain = "concept_dimension"
regex = r"(?i).*/{domain}\.csv".format(domain=domain.upper())


@transform(
    processed=Output('/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/transform/01 - parsed/concept_dimension'),
    my_input=Input('/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/transform/00 - unzipped/unzipped_raw_data'),
    payload=Input("/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/transform/00 - unzipped/payload_filename"),
    site_id_df=Input('/UNITE/Data Ingestion & OMOP Mapping/raw_data/data partner id tables/Data Partner IDs - Site 411'),
    errors=Output('/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/transform/01 - parsed/errors/concept_dimension'),
)
def compute_function(ctx, my_input, payload, site_id_df, processed, errors):
    """
    By default, the parse_input function will use a csv file processing function found in parsing.py
    (Default parameters: delimiter = "|")

    Any rows that are formatted incorrectly, e.g. rows that do not have the expected number of
    fields for the given domain schema, will be written to the 'errors' dataset.
    """

    site_id = get_site_id(site_id_df)
    processed_df = parse_input(ctx, my_input, errors, site_id, domain, regex)
    payload_filename = payload.dataframe().where(F.col("newest_payload") == True).take(1)[0].payload
    processed_df = processed_df.withColumn("payload", F.lit(payload_filename))
    processed.write_dataframe(processed_df)
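
parse_input itself comes from the project's act.parsing module and is not shown on this page. Purely to illustrate the behaviour the docstring above describes (pipe-delimited parsing, with malformed rows routed to the errors output), a hypothetical stand-in might look like the sketch below; the signature, the expected_columns parameter, and the output column names are assumptions, not the project's actual code:

import csv

from pyspark.sql import functions as F


def parse_input_sketch(ctx, my_input, errors, site_id, domain, regex,
                       expected_columns, delimiter="|"):
    """Illustrative stand-in for act.parsing.parse_input (not the real implementation)."""
    good_rows, bad_rows = [], []
    fs = my_input.filesystem()
    # Walk the raw files whose paths match the domain's regex
    for file_status in fs.ls(regex=regex):
        with fs.open(file_status.path, 'r') as f:
            for fields in csv.reader(f, delimiter=delimiter):
                if len(fields) == len(expected_columns):
                    good_rows.append(fields)
                else:
                    # Wrong field count for the domain schema -> route to the errors output
                    bad_rows.append((file_status.path, delimiter.join(fields)))

    # A real implementation would also handle empty inputs and typed schemas.
    errors.write_dataframe(
        ctx.spark_session.createDataFrame(bad_rows, ["source_file", "raw_row"]))
    parsed_df = ctx.spark_session.createDataFrame(good_rows, expected_columns)
    # Tag every parsed row with the submitting site's id
    return parsed_df.withColumn("data_partner_id", F.lit(site_id))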
Code Example #8
from transforms.api import transform, Input, Output, Check
from transforms import expectations as E
import csv
import tempfile
import shutil
from pyspark.sql import Row
from pyspark.sql import types as T


@transform(
    processed=Output(
        "/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/metadata/control_map",
        checks=Check(E.count().gt(0), 'Valid CONTROL_MAP file provided by site', on_error='WARN')
    ),
    my_input=Input("/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/transform/00 - unzipped/unzipped_raw_data"),
)
def my_compute_function(ctx, my_input, processed):

    def process_file(file_status):
        with tempfile.NamedTemporaryFile() as t:
            # Copy contents of file from Foundry into temp file
            with my_input.filesystem().open(file_status.path, 'rb') as f_bytes:
                shutil.copyfileobj(f_bytes, t)
                t.flush()

            # Read the csv, line by line, and use csv.Sniffer to infer the delimiter
            with open(t.name, newline="") as f:
                try:
                    dialect = csv.Sniffer().sniff(f.read(1024))
                    f.seek(0)
                    r = csv.reader(f, delimiter=dialect.delimiter, escapechar="\\")
Code Example #9
from transforms.api import transform, Input, Output, Check
from transforms import expectations as E
import csv
import tempfile
import shutil
from pyspark.sql import Row
from pyspark.sql import types as T


@transform(
    processed=Output(
        "/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/metadata/n3c_vocab_map"
    ),
    my_input=Input(
        "/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/transform/00 - unzipped/unzipped_raw_data"
    ),
)
def my_compute_function(ctx, my_input, processed):
    def process_file(file_status):
        with tempfile.NamedTemporaryFile() as t:
            # Copy contents of file from Foundry into temp file
            with my_input.filesystem().open(file_status.path, 'rb') as f_bytes:
                shutil.copyfileobj(f_bytes, t)
                t.flush()

            # Read the csv, line by line, and use csv.Sniffer to infer the delimiter
            with open(t.name, newline="") as f:
                try:
                    dialect = csv.Sniffer().sniff(f.read(1024))
                    f.seek(0)
                    r = csv.reader(f,
Code Example #10
from transforms.api import transform, Input, Output, Check
from transforms import expectations as E
from pyspark.sql import Row
import csv
import tempfile
import shutil
from datetime import datetime
from pyspark.sql import functions as F
from act import omop_schemas


@transform(
    processed=Output(
        "/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/metadata/manifest",
        checks=
        Check(E.negate(E.col("SITE_NAME").equals("[No manifest provided]")),
              'Valid manifest provided by site',
              on_error='WARN')),
    my_input=Input(
        "/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/transform/00 - unzipped/unzipped_raw_data"
    ),
    site_id_df=Input(
        '/UNITE/Data Ingestion & OMOP Mapping/raw_data/data partner id tables/Data Partner IDs - Site 411'
    ),
    data_partner_ids=Input(
        "/UNITE/Data Ingestion & OMOP Mapping/raw_data/data partner id tables/Data Partner IDs - ALL"
    ),
    omop_vocab=Input("/UNITE/OMOP Vocabularies/vocabulary"),
    control_map=Input(
        "/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/metadata/control_map"
    ))
Code Example #11
from transforms.api import transform, Input, Output
from act.utils import get_newest_payload
from pyspark.sql import functions as F
import pyspark.sql.types as T
import os


@transform(
    payload_filename=Output("/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/transform/00 - unzipped/payload_filename"),
    zip_file=Input("/UNITE/Data Ingestion & OMOP Mapping/raw_data/Zipped Datasets/site_411_act_raw_zips"),
)
def unzip(ctx, payload_filename, zip_file):
    fs = zip_file.filesystem()
    files_df = fs.files(regex=r"(?i).*incoming.*\.zip")
    newest_file = get_newest_payload(files_df)
    files_df = files_df.withColumn("newest_payload", F.when(F.col("path") == newest_file, F.lit(True)).otherwise(F.lit(False)))

    get_basename = F.udf(lambda x: os.path.basename(x), T.StringType())
    ctx.spark_session.udf.register("get_basename", get_basename)
    files_df = files_df.withColumn("payload", get_basename(F.col("path")))

    payload_filename.write_dataframe(files_df.select("payload", "newest_payload"))
Code Example #12
from transforms.api import transform_df, Input, Output
from pyspark.sql import types as T
from pyspark.sql import functions as F


@transform_df(
    Output(
        "/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/metadata/payload_status"
    ),
    parsed_df=Input(
        "/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/transform/00 - unzipped/payload_filename"
    ),
    passed_pipeline_df=Input(
        "/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/union_staging/person"
    ),
    site_id=Input(
        "/UNITE/Data Ingestion & OMOP Mapping/raw_data/data partner id tables/Data Partner IDs - Site 411"
    ))
def my_compute_function(ctx, parsed_df, passed_pipeline_df, site_id):
    # parsed_df tells us the payload currently being processed in the transformation pipeline
    parsed_payload = parsed_df.where(
        F.col("newest_payload") == True).take(1)[0].payload
    # passed_pipeline_df tells us the payload that has successfully made it through the pipeline
    passed_pipeline_payload = passed_pipeline_df.select(
        "payload").distinct().take(1)[0].payload
    data_partner_id = int(site_id.take(1)[0].data_partner_id)

    schema = T.StructType([
        T.StructField('parsed_payload', T.StringType(), True),
        T.StructField('unreleased_payload', T.StringType(), True),
        T.StructField('data_partner_id', T.IntegerType(), True)