def join_q2_data(
    context,
    april_data: DataFrame,
    may_data: DataFrame,
    june_data: DataFrame,
    master_cord_data: DataFrame,
) -> DataFrame:
    """Combine the three monthly flight frames and join airport coordinates.

    This is a generator. It yields, in order:

    1. An ``ExpectationResult`` labelled ``airport_ids_present`` that succeeds
       when every monthly frame has both ``DestAirportSeqID`` and
       ``OriginAirportSeqID``; the offending (month, column) pairs are
       attached as JSON metadata.
    2. An ``ExpectationResult`` labelled ``flight_data_same_shape`` that
       succeeds when the three monthly frames have the same set of column
       names.
    3. An ``Output`` wrapping the joined frame, with column names lower-cased.

    Neither expectation stops execution here; the union and join run
    regardless of their outcome.

    Args:
        context: Execution context. Reads ``solid_config['subsample_pct']``
            (a percentage, divided by 100 to get the sampling fraction) and
            uses ``resources.spark`` to run the join query.
        april_data: Flight records for April.
        may_data: Flight records for May.
        june_data: Flight records for June.
        master_cord_data: Airport coordinate data; after prefixing, it is
            joined on ``DEST_AIRPORT_SEQ_ID`` and ``ORIGIN_AIRPORT_SEQ_ID``.

    Side effects:
        Registers (or replaces) the temp views ``q2_data``,
        ``dest_cord_data`` and ``origin_cord_data``.
    """
    monthly_frames = {'april': april_data, 'may': may_data, 'june': june_data}
    id_columns = ['DestAirportSeqID', 'OriginAirportSeqID']

    # Column-major scan: all months for the first ID column, then all months
    # for the second, so the metadata list is grouped by column.
    absent = [
        {'month': month_name, 'missing_column': column_name}
        for column_name in id_columns
        for month_name, frame in monthly_frames.items()
        if column_name not in frame.columns
    ]
    yield ExpectationResult(
        success=not absent,
        label='airport_ids_present',
        description='Sequence IDs present in incoming monthly flight data.',
        metadata_entries=[
            EventMetadataEntry.json(label='metadata', data={'missing_columns': absent})
        ],
    )

    columns_agree = (
        set(april_data.columns) == set(may_data.columns) == set(june_data.columns)
    )
    yield ExpectationResult(
        success=columns_agree,
        label='flight_data_same_shape',
        metadata_entries=[
            EventMetadataEntry.json(label='metadata', data={'columns': april_data.columns})
        ],
    )

    # NOTE(review): the check above compares column *sets*, while
    # DataFrame.union matches columns by position - confirm the monthly
    # frames always arrive with identical column order.
    quarter = april_data.union(may_data).union(june_data)

    keep_fraction = context.solid_config['subsample_pct'] / 100.0
    quarter.sample(
        withReplacement=False, fraction=keep_fraction
    ).createOrReplaceTempView('q2_data')

    # The same coordinate table is exposed twice, under different column
    # prefixes, so it can be joined once per end of the flight.
    for prefix, view_name in (('DEST_', 'dest_cord_data'), ('ORIGIN_', 'origin_cord_data')):
        do_prefix_column_names(master_cord_data, prefix).createOrReplaceTempView(view_name)

    # origin_cord_data is the left side of the outer join, so every origin
    # coordinate row is kept even when no sampled flight departs from it.
    # Adjacent literals concatenate to the exact single-line query text.
    join_query = (
        ' SELECT * FROM origin_cord_data LEFT JOIN ('
        ' SELECT * FROM q2_data LEFT JOIN dest_cord_data ON'
        ' q2_data.DestAirportSeqID = dest_cord_data.DEST_AIRPORT_SEQ_ID'
        ' ) q2_dest_data ON'
        ' origin_cord_data.ORIGIN_AIRPORT_SEQ_ID = q2_dest_data.OriginAirportSeqID '
    )
    joined = context.resources.spark.sql(join_query)

    yield Output(rename_spark_dataframe_columns(joined, lambda name: name.lower()))
def subsample_spark_dataset(context, data_frame: DataFrame) -> DataFrame:
    """Return a sample of ``data_frame`` drawn without replacement.

    Args:
        context: Execution context; ``solid_config['subsample_pct']`` is read
            as a percentage and divided by 100 to get the sampling fraction.
        data_frame: The frame to sample from.

    Returns:
        The result of ``data_frame.sample`` with the configured fraction.
    """
    keep_fraction = context.solid_config['subsample_pct'] / 100.0
    return data_frame.sample(withReplacement=False, fraction=keep_fraction)