def add_numeric_stats_task(
    dag: models.DAG, feature_vars: dag_utils.FeatureConfigListMapping,
    storage_vars: dag_utils.AirflowVarsConfig
) -> bigquery_operator.BigQueryExecuteQueryOperator:
  """Builds an Operator that generates numeric fact stats within a DAG.

  Args:
    dag: The dag that the task needs to be added to.
    feature_vars: The parsed config values from airflow feature object
      variable.
    storage_vars: The parsed config values from airflow storage variable.

  Returns:
    Operator used to build numeric stats within a DAG.
  """
  num_feats = dag_utils.get_features(feature_vars, 'type', 'Numeric')
  stats_dataset = (f'{storage_vars["bq_working_project"]}.'
                   f'{storage_vars["bq_working_dataset"]}')
  numeric_stats_sql = pipeline_utils.render_sql_from_template(
      'numeric_stats',
      fact_table=f'{stats_dataset}.facts',
      feature_columns=[
          f'\'{dag_utils.get_feature_name(x)}\'' for x in num_feats
      ])
  return bigquery_operator.BigQueryExecuteQueryOperator(
      task_id=_GENERATE_NUMERIC_STATS_TASK,
      sql=numeric_stats_sql,
      use_legacy_sql=False,
      destination_dataset_table=f'{stats_dataset}.num_facts_stats_table',
      create_disposition='CREATE_IF_NEEDED',
      write_disposition='WRITE_TRUNCATE',
      allow_large_results=True,
      dag=dag)
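

# Illustrative usage sketch (not part of the original module): wires the
# numeric stats task into an existing preprocessing DAG. The helper name and
# the upstream `generate_facts_task` argument are hypothetical; the stats query
# reads `<project>.<dataset>.facts`, so it must run after whichever task
# materialises that facts table.
def _example_attach_numeric_stats(
    dag: models.DAG, generate_facts_task: models.BaseOperator,
    feature_vars: dag_utils.FeatureConfigListMapping,
    storage_vars: dag_utils.AirflowVarsConfig) -> models.BaseOperator:
  numeric_stats_task = add_numeric_stats_task(dag, feature_vars, storage_vars)
  # Order the stats computation after the facts table has been written.
  generate_facts_task >> numeric_stats_task
  return numeric_stats_task

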
def _generate_batch_prediction_sql_template(template: str,
                                            bq_location: str) -> str:
  """Builds the batch_predict SQL as per Measurement Protocol.

  Args:
    template: SQL jinja template to load.
    bq_location: BigQuery dataset where the batch predictions are stored by
      AutoML.

  Returns:
    The rendered SQL string.
  """
  storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
  activation_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_PREDICTION_ACTIVATION_CONFIG)
  prediction_source_table = '{project}.{bigquery_location}.predictions'.format(
      project=storage_vars['bq_working_project'],
      bigquery_location=bq_location)
  sql_template = pipeline_utils.render_sql_from_template(
      template,
      source_table=prediction_source_table,
      num_segments=20,
      event_label=activation_vars['event_label'],
      event_action="'" + activation_vars['event_action'] + "'",
      event_category="'" + activation_vars['event_category'] + "'")
  return sql_template
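

# Illustrative usage sketch (assumptions, not part of the original module):
# renders the batch-prediction SQL for a hypothetical AutoML output dataset.
# The template name 'batch_predict' and the dataset id below are placeholders;
# real values come from the activation DAG config and the AutoML batch-predict
# job, which typically writes its results to a timestamped dataset.
def _example_render_batch_prediction_sql() -> str:
  example_bq_location = 'prediction_example_model_20210101_000000'
  return _generate_batch_prediction_sql_template('batch_predict',
                                                 example_bq_location)

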
def add_user_session_task(
    dag: models.DAG, task_id: str,
    output_type: blockbuster_constants.PreprocessingType,
    feature_vars: dag_utils.FeatureConfigListMapping,
    prediction_vars: dag_utils.AirflowVarsConfig,
    preprocess_vars: dag_utils.AirflowVarsConfig,
    storage_vars: dag_utils.AirflowVarsConfig,
    training_vars: dag_utils.AirflowVarsConfig) -> models.BaseOperator:
  """Builds the UserSessionPipeline Operator.

  Args:
    dag: The dag that the task needs to be added to.
    task_id: Id string for this specific task within the DAG.
    output_type: Indicates whether this pipeline is to be used for training or
      prediction.
    feature_vars: The parsed config values from airflow feature object
      variable.
    prediction_vars: The parsed config values from airflow prediction variable.
    preprocess_vars: The parsed config values from airflow preprocess variable.
    storage_vars: The parsed config values from airflow storage variable.
    training_vars: The parsed config values from airflow training variable.

  Returns:
    Operator to use within a DAG to run the User Session Pipeline on Dataflow.
  """
  # Select the GCS output path based on whether the pipeline is preparing
  # training or prediction data.
  if output_type == blockbuster_constants.PreprocessingType.TRAINING:
    output_path = f'{storage_vars["gcs_temp_path"]}/training'
  elif output_type == blockbuster_constants.PreprocessingType.PREDICTION:
    output_path = f'{storage_vars["gcs_temp_path"]}/prediction'
  template_file_directory = storage_vars['gcs_dataflow_path']
  sql_vars = get_user_session_sql_params(
      output_type,
      feature_vars,
      prediction_vars,
      preprocess_vars,
      storage_vars,
  )
  sql = pipeline_utils.render_sql_from_template('usersession_source',
                                                **sql_vars)
  return dataflow_operator.DataflowTemplatedJobStartOperator(
      task_id=task_id,
      template=f'{template_file_directory}/UserSessionPipeline',
      parameters={
          'inputBigQuerySQL': sql,
          'outputSessionsAvroPrefix': f'{output_path}/usersession-output/',
          'predictionFactName': training_vars['predictionFactName'],
          'predictionFactValues': training_vars['predictionFactValues']
      },
      dag=dag)
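

# Illustrative sketch (not part of the original module): building the user
# session task for the training stage. The task id below is a placeholder;
# the parsed config dicts are expected to come from the calling DAG-builder's
# Airflow Variables.
def _example_add_training_user_session(
    dag: models.DAG, feature_vars: dag_utils.FeatureConfigListMapping,
    prediction_vars: dag_utils.AirflowVarsConfig,
    preprocess_vars: dag_utils.AirflowVarsConfig,
    storage_vars: dag_utils.AirflowVarsConfig,
    training_vars: dag_utils.AirflowVarsConfig) -> models.BaseOperator:
  return add_user_session_task(
      dag, 'user_session_pipeline',
      blockbuster_constants.PreprocessingType.TRAINING, feature_vars,
      prediction_vars, preprocess_vars, storage_vars, training_vars)

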
def _add_prepare_automl_data_in_bq_task(
    dag: models.DAG, output_type: blockbuster_constants.PreprocessingType,
    prediction_vars: dag_utils.AirflowVarsConfig,
    storage_vars: dag_utils.AirflowVarsConfig
) -> bigquery_operator.BigQueryExecuteQueryOperator:
  """Adds a task to the DAG that writes the AutoML input data to BigQuery.

  Args:
    dag: The dag that the task needs to be added to.
    output_type: Indicates whether this pipeline is to be used for training or
      prediction.
    prediction_vars: The parsed config values from airflow prediction variable.
    storage_vars: The parsed config values from airflow storage variable.

  Returns:
    The configured BigQueryOperator task, added to the dag, that writes the
    input data for AutoML.
  """
  exclude_from_output = ['userId', 'RECENT_BB_id', 'RECENT_most_recent_lead']
  if output_type == blockbuster_constants.PreprocessingType.TRAINING:
    output_table = 'training'
    exclude_from_output.append('BB_id')
  elif output_type == blockbuster_constants.PreprocessingType.PREDICTION:
    output_table = 'prediction'
    exclude_from_output.append('MLDataSplit')
  features_table = dag_utils.construct_bq_table_path(
      storage_vars['bq_working_project'], storage_vars['bq_working_dataset'],
      f'ga_{output_table}_input')
  prepare_data_sql = pipeline_utils.render_sql_from_template(
      'prepare_data',
      features_table=features_table,
      exclude_from_output=exclude_from_output,
      inclusion_recency_days=prediction_vars['leads_submission_window'])
  output_dataset = dag_utils.construct_bq_table_path(
      storage_vars['bq_working_project'], storage_vars['bq_working_dataset'],
      f'automl_{output_table}_input')
  prepare_data_for_automl = bigquery_operator.BigQueryExecuteQueryOperator(
      task_id='prepare_data_for_automl',
      sql=prepare_data_sql,
      use_legacy_sql=False,
      destination_dataset_table=output_dataset,
      create_disposition='CREATE_IF_NEEDED',
      write_disposition='WRITE_TRUNCATE',
      allow_large_results=True,
      dag=dag,
  )
  return prepare_data_for_automl
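

# Illustrative sketch (not part of the original module): the prepare-data query
# reads the `ga_<stage>_input` table, so the task is ordered after a
# hypothetical `write_ga_input_task` that materialises that table.
def _example_chain_prepare_automl_data(
    dag: models.DAG, write_ga_input_task: models.BaseOperator,
    prediction_vars: dag_utils.AirflowVarsConfig,
    storage_vars: dag_utils.AirflowVarsConfig) -> models.BaseOperator:
  prepare_task = _add_prepare_automl_data_in_bq_task(
      dag, blockbuster_constants.PreprocessingType.TRAINING, prediction_vars,
      storage_vars)
  write_ga_input_task >> prepare_task
  return prepare_task

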
def create_dag(
    args: Dict[str, Union[Dict[str, Any], dt.datetime]],
    output_type: blockbuster_constants.PreprocessingType,
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
  """Generates a DAG that creates the source table from GA tables.

  Args:
    args: Arguments to provide to the operators as defaults.
    output_type: Which set of variables to load for preprocessing and
      prediction.
    parent_dag_name: If provided, the DAG is built as a SubDAG of this parent.

  Returns:
    The DAG object.
  """
  # Load params from Airflow Variables.
  bb_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
  preprocess_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_PREPROCESS_CONFIG)
  prediction_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_PREDICTION_ACTIVATION_CONFIG)
  storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
  dag_id = dag_utils.get_dag_id(DAG_NAME, parent_dag_name)
  dag = airflow_utils.initialize_airflow_dag(
      dag_id, None, blockbuster_constants.DEFAULT_DAG_RETRY,
      blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
      blockbuster_constants.DEFAULT_START_DAYS_AGO, **args)
  if output_type == blockbuster_constants.PreprocessingType.TRAINING:
    table_suffix = 'training'
    sql_vars = get_sql_params_for_training_stage(preprocess_vars)
  elif output_type == blockbuster_constants.PreprocessingType.PREDICTION:
    table_suffix = 'prediction'
    sql_vars = get_sql_params_for_prediction_stage(preprocess_vars,
                                                   prediction_vars)
  sql = pipeline_utils.render_sql_from_template('source_leads', **sql_vars)
  bq_working_project = storage_vars['bq_working_project']
  bq_working_dataset = storage_vars['bq_working_dataset']
  leads_table = get_leads_table(bq_working_project, bq_working_dataset,
                                table_suffix)
  gcp_region = bb_vars['gcp_region']
  add_prepare_source_data_task_to_dag(dag, sql, leads_table, gcp_region)
  return dag
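

# Illustrative sketch (not part of the original module): how a DAG definition
# file might instantiate the training-stage source DAG. The default-args dict
# below is a placeholder; real deployments pass their shared operator defaults.
def _example_create_training_source_dag() -> models.DAG:
  example_args = {'start_date': dt.datetime(2021, 1, 1)}
  return create_dag(example_args,
                    blockbuster_constants.PreprocessingType.TRAINING)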