Esempio n. 1
0
def join_q2_data(
    context,
    april_data: DataFrame,
    may_data: DataFrame,
    june_data: DataFrame,
    master_cord_data: DataFrame,
) -> DataFrame:
    """Stack the three monthly frames, subsample them, and join coordinate data.

    This is a generator. It yields, in order:

    1. An ``ExpectationResult`` labelled ``airport_ids_present`` that succeeds
       when every monthly frame has both ``DestAirportSeqID`` and
       ``OriginAirportSeqID`` columns; any gaps are listed in its metadata.
    2. An ``ExpectationResult`` labelled ``flight_data_same_shape`` that
       succeeds when the three monthly frames have equal sets of column names.
    3. An ``Output`` wrapping the joined frame with lower-cased column names.

    Neither expectation stops processing here; the join is attempted
    regardless of their outcome.

    Args:
        context: Execution context. Reads ``solid_config['subsample_pct']``
            (a percentage, divided by 100 to get the sampling fraction) and
            uses ``resources.spark`` to run the SQL join.
        april_data: Flight data for April.
        may_data: Flight data for May.
        june_data: Flight data for June.
        master_cord_data: Airport coordinate data, joined once for the
            destination airport and once for the origin airport.

    Side effects:
        Registers (or replaces) the temp views ``q2_data``, ``dest_cord_data``
        and ``origin_cord_data``.
    """
    monthly_frames = {'april': april_data, 'may': may_data, 'june': june_data}

    # One entry per (column, month) pair that is absent, ordered by column
    # first and month second.
    missing_things = [
        {'month': month_name, 'missing_column': column_name}
        for column_name in ('DestAirportSeqID', 'OriginAirportSeqID')
        for month_name, frame in monthly_frames.items()
        if column_name not in frame.columns
    ]
    missing_columns_entry = EventMetadataEntry.json(
        label='metadata', data={'missing_columns': missing_things}
    )

    yield ExpectationResult(
        success=not missing_things,
        label='airport_ids_present',
        description='Sequence IDs present in incoming monthly flight data.',
        metadata_entries=[missing_columns_entry],
    )

    # Column order is ignored here: only the sets of names are compared.
    april_cols, may_cols, june_cols = (
        set(frame.columns) for frame in monthly_frames.values()
    )
    columns_entry = EventMetadataEntry.json(
        label='metadata', data={'columns': april_data.columns}
    )

    yield ExpectationResult(
        success=(april_cols == may_cols and may_cols == june_cols),
        label='flight_data_same_shape',
        metadata_entries=[columns_entry],
    )

    # NOTE(review): DataFrame.union pairs columns by position, while the check
    # above only compares name sets -- confirm the column order is guaranteed
    # to be identical upstream.
    april_and_may = april_data.union(may_data)
    q2_data = april_and_may.union(june_data)

    sample_fraction = context.solid_config['subsample_pct'] / 100.0
    sampled_q2_data = q2_data.sample(withReplacement=False, fraction=sample_fraction)
    sampled_q2_data.createOrReplaceTempView('q2_data')

    # The same coordinate table is exposed twice under different prefixes so
    # that the destination and origin joins can coexist in one result.
    # (Presumably the helper prepends the prefix to every column name, given
    # the DEST_/ORIGIN_ column names used in the SQL below -- TODO confirm.)
    prefixed_views = (('DEST_', 'dest_cord_data'), ('ORIGIN_', 'origin_cord_data'))
    for column_prefix, view_name in prefixed_views:
        prefixed_cord_data = do_prefix_column_names(master_cord_data, column_prefix)
        prefixed_cord_data.createOrReplaceTempView(view_name)

    join_query = '''
        SELECT * FROM origin_cord_data
        LEFT JOIN (
            SELECT * FROM q2_data
            LEFT JOIN dest_cord_data ON
            q2_data.DestAirportSeqID = dest_cord_data.DEST_AIRPORT_SEQ_ID
        ) q2_dest_data
        ON origin_cord_data.ORIGIN_AIRPORT_SEQ_ID = q2_dest_data.OriginAirportSeqID
        '''
    full_data = context.resources.spark.sql(join_query)

    lowercased = rename_spark_dataframe_columns(full_data, lambda c: c.lower())
    yield Output(lowercased)
Esempio n. 2
0
def subsample_spark_dataset(context, data_frame: DataFrame) -> DataFrame:
    """Return a sample of *data_frame* drawn without replacement.

    Args:
        context: Execution context; ``solid_config['subsample_pct']`` is read
            as a percentage and divided by 100 to get the sampling fraction.
        data_frame: The frame to sample from.

    Returns:
        Whatever ``data_frame.sample`` returns for that fraction.
    """
    sample_fraction = context.solid_config['subsample_pct'] / 100.0
    return data_frame.sample(withReplacement=False, fraction=sample_fraction)