def _generate_batch_prediction_sql_template(template: str,
                                            bq_location: str) -> str:
  """Builds the batch_predict SQL as per the Measurement Protocol.

  Args:
    template: SQL Jinja template to load.
    bq_location: Location where the batch predictions are stored by AutoML.

  Returns:
    The rendered SQL.
  """
  storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
  activation_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_PREDICTION_ACTIVATION_CONFIG)
  prediction_source_table = '{project}.{bigquery_location}.predictions'.format(
      project=storage_vars['bq_working_project'],
      bigquery_location=bq_location)
  sql_template = pipeline_utils.render_sql_from_template(
      template,
      source_table=prediction_source_table,
      num_segments=20,
      event_label=activation_vars['event_label'],
      event_action="'" + activation_vars['event_action'] + "'",
      event_category="'" + activation_vars['event_category'] + "'")
  return sql_template
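
# A minimal sketch of the rendering step this function delegates to
# `pipeline_utils.render_sql_from_template`, assuming that helper is a thin
# Jinja2 wrapper. The template text, helper name, and values below are
# illustrative only, not part of the actual pipeline.
_EXAMPLE_PREDICTION_TEMPLATE = """
SELECT
  NTILE({{ num_segments }}) OVER (ORDER BY predicted_score DESC) AS segment,
  {{ event_action }} AS event_action,
  {{ event_category }} AS event_category
FROM `{{ source_table }}`
"""


def _render_example_prediction_sql() -> str:
  """Renders the illustrative template the way the function above does."""
  import jinja2  # Local import: only needed for this illustrative helper.
  return jinja2.Template(_EXAMPLE_PREDICTION_TEMPLATE).render(
      source_table='my-project.my_dataset.predictions',  # Hypothetical table.
      num_segments=20,
      event_action="'click'",    # Note: values arrive pre-quoted above.
      event_category="'lead'")
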
def create_dag(
    args: Mapping[str, Any],
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
  """Generates a DAG that trains a new model using the training dataset.

  Args:
    args: Arguments to provide to the Airflow DAG object as defaults.
    parent_dag_name: If this is provided, this is a SubDAG.

  Returns:
    The DAG object.
  """
  dag = airflow_utils.initialize_airflow_dag(
      dag_id=dag_utils.get_dag_id(_DAG_NAME, parent_dag_name),
      schedule=None,
      retries=blockbuster_constants.DEFAULT_DAG_RETRY,
      retry_delay=blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
      start_days_ago=blockbuster_constants.DEFAULT_START_DAYS_AGO,
      **args)
  bb_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
  training_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_TRAINING_CONFIG)
  create_model_task = _add_create_model_task(dag, bb_vars, training_vars)
  update_airflow_variable_task = _add_update_airflow_variable_task(dag)
  helpers.chain(create_model_task, update_airflow_variable_task)
  return dag
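
# For reference: `helpers.chain(a, b, ...)` wires the given tasks
# sequentially, so the chain call in `create_dag` above is equivalent to
#   create_model_task.set_downstream(update_airflow_variable_task)
# matching the explicit set_downstream style used elsewhere in this codebase.
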
def create_training_preprocess_dag():
  """Creates the main DAG for the training preprocessing pipeline.

  Returns:
    Parent training DAG for preprocessing.
  """
  bb_storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
  bb_project_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
  args = {
      'start_date': airflow.utils.dates.days_ago(1),
      'dataflow_default_options': {
          'project': bb_project_vars['gcp_project_id'],
          'region': bb_project_vars['gcp_region'],
          'zone': bb_project_vars['gcp_zone'],
          'tempLocation': bb_storage_vars['gcs_temp_path'],
      },
  }
  main_dag = airflow_utils.initialize_airflow_dag(
      dag_id=_DAG_ID,
      schedule=None,
      retries=blockbuster_constants.DEFAULT_DAG_RETRY,
      retry_delay=blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
      start_days_ago=blockbuster_constants.DEFAULT_START_DAYS_AGO,
      **args)
  create_preprocess_subdag(main_dag, args)
  return main_dag
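
# Note: `dataflow_default_options` is presumably forwarded (via **args) to the
# Dataflow-based operators created inside the preprocessing subdag; the keys
# above mirror the standard Dataflow pipeline options (project, region, zone,
# and the GCS temp location).
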
def create_dag(
    args: Mapping[str, Any],
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
  """Generates a DAG that pushes data from Google Cloud Storage to GA.

  Args:
    args: Arguments to provide to the Airflow DAG object as defaults.
    parent_dag_name: If this is provided, this is a SubDAG.

  Returns:
    The DAG object.
  """
  dag = airflow_utils.initialize_airflow_dag(
      dag_id=dag_utils.get_dag_id(_DAG_NAME, parent_dag_name),
      schedule=None,
      retries=blockbuster_constants.DEFAULT_DAG_RETRY,
      retry_delay=blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
      start_days_ago=blockbuster_constants.DEFAULT_START_DAYS_AGO,
      **args)
  storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
  activation_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_ACTIVATION_CONFIG)
  bucket_uri = storage_vars['gcs_output_path']
  bq_dataset = storage_vars['bq_working_dataset']
  bq_table = 'monitoring'
  ga_tracking_id = activation_vars['ga_tracking_id']
  _add_storage_to_ga_task(dag, bucket_uri, ga_tracking_id, bq_dataset,
                          bq_table)
  return dag
def _store_final_results_to_bq(dag: models.DAG, task_id: str,
                               batch_predict_sql: str) -> models.BaseOperator:
  """Stores MP-compliant results in BigQuery before the GA transfer.

  Args:
    dag: The DAG to add this operator to.
    task_id: ID for this specific task within the DAG.
    batch_predict_sql: Custom query that picks records and adds the additional
      columns required by the Measurement Protocol.

  Returns:
    Operator to use within a DAG to store prediction results in BigQuery.
  """
  bb_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
  storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
  final_output_table = '{project}.{dataset}.final_output'.format(
      project=storage_vars['bq_working_project'],
      dataset=storage_vars['bq_working_dataset'])
  return bigquery_operator.BigQueryOperator(
      task_id=task_id,
      sql=batch_predict_sql,
      use_legacy_sql=False,
      destination_dataset_table=final_output_table,
      create_disposition='CREATE_IF_NEEDED',
      write_disposition='WRITE_TRUNCATE',
      allow_large_results=True,
      location=bb_vars['gcp_region'],
      dag=dag,
  )
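
# With write_disposition='WRITE_TRUNCATE' the `final_output` table is
# overwritten on every run, so each activation cycle only ever ships the most
# recent prediction results downstream to GA.
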
def create_dag(
    args: Mapping[Text, Any],
    parent_dag_name: Optional[Text] = None,
) -> models.DAG:
  """Generates a DAG that loads data into an AutoML Dataset.

  Args:
    args: Arguments to provide to the AutoML operators as defaults.
    parent_dag_name: If this value is provided, the newly created dag object
      is made a subdag of the parent dag.

  Returns:
    The DAG object.
  """
  # Load params from Variables.
  bb_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
  storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
  # Create dag.
  dag = airflow_utils.initialize_airflow_dag(
      dag_utils.get_dag_id(_DAG_NAME, parent_dag_name),
      None,  # schedule
      blockbuster_constants.DEFAULT_DAG_RETRY,
      blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
      blockbuster_constants.DEFAULT_START_DAYS_AGO,
      local_macros={
          'get_column_spec': _get_column_spec,
          'target': 'predictionLabel',
          'extract_object_id': automl_hook.AutoMLTablesHook.extract_object_id,
      },
      **args)
  dataset_creation_task = _add_dataset_creation_task(dag, bb_vars)
  dataset_id = (
      "{{ task_instance.xcom_pull('create_dataset_task', key='dataset_id') }}"
  )
  import_data_task = _add_import_data_task(dag, dataset_id, bb_vars,
                                           storage_vars)
  list_table_specs_task = _add_list_table_specs_task(dag, dataset_id, bb_vars)
  list_column_specs_task = _add_list_column_specs_task(dag, dataset_id,
                                                       bb_vars)
  update_dataset_task = _add_update_dataset_task(dag, bb_vars)
  update_airflow_variable_task = _add_update_airflow_variable_task(dag)
  helpers.chain(dataset_creation_task, import_data_task,
                list_table_specs_task, list_column_specs_task,
                update_dataset_task, update_airflow_variable_task)
  return dag
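
# The `dataset_id` above is a Jinja-templated string that Airflow resolves at
# task runtime by pulling the XCom value pushed by `create_dataset_task`. A
# minimal sketch of the equivalent pull from inside a PythonOperator callable
# (assuming provide_context=True in Airflow 1.x; this helper is hypothetical,
# not part of the module):
def _example_read_dataset_id(**context) -> str:
  """Returns the dataset_id that create_dataset_task pushed via XCom."""
  return context['task_instance'].xcom_pull(
      task_ids='create_dataset_task', key='dataset_id')
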
def _get_batch_predictions(dag: models.DAG,
                           task_id: str) -> models.BaseOperator:
  """Batch-predicts the GA leads using the pre-trained AutoML model.

  Args:
    dag: The DAG to add this operator to.
    task_id: ID for this specific task within the DAG.

  Returns:
    Operator to use within a DAG to run the batch prediction pipeline on
    AutoML.
  """
  storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
  model_id = airflow_utils.get_airflow_variable(
      blockbuster_constants.BLOCKBUSTER_PREDICTION_RECENT_MODEL)
  bq_input_path = 'bq://{project}.{dataset}.automl_prediction_input'.format(
      project=storage_vars['bq_working_project'],
      dataset=storage_vars['bq_working_dataset'])
  output_path = f'bq://{storage_vars["bq_working_project"]}'
  output_key = 'bq_output_dataset'
  task_batch_predict = (
      automl_tables_batch_prediction_operator
      .AutoMLTablesBatchPredictionOperator(
          task_id=task_id,
          model_id=model_id,
          input_path=bq_input_path,
          output_path=output_path,
          output_key=output_key,
          conn_id='google_cloud_default',
          dag=dag))
  return task_batch_predict
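
# Note: `model_id` is read from an Airflow Variable rather than hard-coded;
# it is presumably refreshed by the training DAG's
# update_airflow_variable_task, so batch prediction always targets the most
# recently trained model.
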
def _transfer_bigquery_to_gcs(dag, task_id) -> models.BaseOperator:
  """Transfers the final output table from BigQuery to GCS.

  Args:
    dag: The DAG to add this operator to.
    task_id: ID for this specific task within the DAG.

  Returns:
    Operator to use within a DAG to move the final records to GCS.
  """
  storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
  final_output_uri = '{path}/result-{timestamp}-*.json'.format(
      path=storage_vars['gcs_output_path'], timestamp=int(time.time()))
  final_output_table = '{project}.{dataset}.final_output'.format(
      project=storage_vars['bq_working_project'],
      dataset=storage_vars['bq_working_dataset'])
  return bigquery_to_gcs.BigQueryToCloudStorageOperator(
      task_id=task_id,
      source_project_dataset_table=final_output_table,
      destination_cloud_storage_uris=[final_output_uri],
      export_format='NEWLINE_DELIMITED_JSON',
      dag=dag)
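
# For example, with gcs_output_path set to 'gs://my-bucket/output' (a
# hypothetical bucket), the export URI rendered above expands to something
# like:
#   gs://my-bucket/output/result-1612345678-*.json
# where the number is the epoch timestamp and the '*' wildcard lets BigQuery
# shard large exports across multiple files.
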
def create_dag(
    args: Dict[str, Union[Dict[str, Any], dt.datetime]],
    output_type: blockbuster_constants.PreprocessingType,
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
  """Generates a DAG that creates the source table from GA tables.

  Args:
    args: Arguments to provide to the operators as defaults.
    output_type: Which set of variables to load for preprocessing and
      prediction.
    parent_dag_name: If this is provided, this is a SubDAG.

  Returns:
    The DAG object.
  """
  # Load params from Airflow Variables.
  bb_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
  preprocess_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_PREPROCESS_CONFIG)
  prediction_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_PREDICTION_ACTIVATION_CONFIG)
  storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
  dag_id = dag_utils.get_dag_id(DAG_NAME, parent_dag_name)
  dag = airflow_utils.initialize_airflow_dag(
      dag_id,
      None,
      blockbuster_constants.DEFAULT_DAG_RETRY,
      blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
      blockbuster_constants.DEFAULT_START_DAYS_AGO,
      **args)
  if output_type == blockbuster_constants.PreprocessingType.TRAINING:
    table_suffix = 'training'
    sql_vars = get_sql_params_for_training_stage(preprocess_vars)
  elif output_type == blockbuster_constants.PreprocessingType.PREDICTION:
    table_suffix = 'prediction'
    sql_vars = get_sql_params_for_prediction_stage(preprocess_vars,
                                                   prediction_vars)
  sql = pipeline_utils.render_sql_from_template('source_leads', **sql_vars)
  bq_working_project = storage_vars['bq_working_project']
  bq_working_dataset = storage_vars['bq_working_dataset']
  leads_table = get_leads_table(bq_working_project, bq_working_dataset,
                                table_suffix)
  gcp_region = bb_vars['gcp_region']
  add_prepare_source_data_task_to_dag(dag, sql, leads_table, gcp_region)
  return dag
def create_dag(
    args: Mapping[str, Any],
    output_type: blockbuster_constants.PreprocessingType,
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
  """Generates a DAG that preprocesses data.

  Args:
    args: Arguments to provide to the Operators as defaults.
    output_type: Which set of Variables to load for preprocessing.
    parent_dag_name: If this is provided, this is a SubDAG.

  Returns:
    The DAG object.
  """
  preprocess_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_PREPROCESS_CONFIG)
  prediction_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_PREDICTION_ACTIVATION_CONFIG)
  storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
  training_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_TRAINING_CONFIG)
  feature_options = dag_utils.get_feature_config_val(
      blockbuster_constants.BLOCKBUSTER_FEATURE_CONFIG)
  dag = airflow_utils.initialize_airflow_dag(
      dag_utils.get_dag_id(_DAG_NAME, parent_dag_name),
      None,
      blockbuster_constants.DEFAULT_DAG_RETRY,
      blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
      blockbuster_constants.DEFAULT_START_DAYS_AGO,
      **args)
  mlwp_sliding_window_pipeline_task = _add_mlwp_sliding_window_pipeline_task(
      dag, output_type, prediction_vars, preprocess_vars, storage_vars,
      training_vars)
  mlwp_generate_features_pipeline_task = (
      _add_mlwp_generate_features_pipeline_task(dag, output_type,
                                                feature_options,
                                                storage_vars))
  prepare_automl_data_in_bq_task = _add_prepare_automl_data_in_bq_task(
      dag, output_type, prediction_vars, storage_vars)
  helpers.chain(mlwp_sliding_window_pipeline_task,
                mlwp_generate_features_pipeline_task,
                prepare_automl_data_in_bq_task)
  return dag
def create_prediction_activate_dag():
  """Creates the main DAG for the prediction and activation pipeline.

  Returns:
    Parent prediction DAG for activation.
  """
  bb_storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
  bb_project_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
  args = {
      'start_date': airflow.utils.dates.days_ago(1),
      'dataflow_default_options': {
          'project': bb_project_vars['gcp_project_id'],
          'region': bb_project_vars['gcp_region'],
          'zone': bb_project_vars['gcp_zone'],
          'tempLocation': bb_storage_vars['gcs_temp_path'],
      },
  }
  main_dag = airflow_utils.initialize_airflow_dag(
      dag_id=_DAG_ID,
      schedule=None,
      retries=blockbuster_constants.DEFAULT_DAG_RETRY,
      retry_delay=blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
      start_days_ago=blockbuster_constants.DEFAULT_START_DAYS_AGO,
      **args)
  prepare_source_task = create_prepare_source_task(
      main_dag, args, prepare_source_dag.DAG_NAME)
  analyze_task = create_analyze_task(main_dag, args, 'analyze')
  preprocess_task = create_preprocess_task(main_dag, args, 'preprocess')
  predict_task = create_predict_task(main_dag, args, 'batch_predict')
  activate_task = create_activate_task(main_dag, args, 'activate_ga')
  clean_up_task = create_cleanup_task(main_dag, args, 'cleanup_gcs')
  # Create task dependency pipeline.
  prepare_source_task.set_downstream(analyze_task)
  analyze_task.set_downstream(preprocess_task)
  preprocess_task.set_downstream(predict_task)
  predict_task.set_downstream(activate_task)
  activate_task.set_downstream(clean_up_task)
  return main_dag
def create_train_model_dag() -> models.DAG:
  """Creates the main DAG for training the model.

  Returns:
    Parent training DAG.
  """
  bb_storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
  bb_project_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
  args = {
      'start_date': airflow.utils.dates.days_ago(1),
      'dataflow_default_options': {
          'project': bb_project_vars['gcp_project_id'],
          'region': bb_project_vars['gcp_region'],
          'zone': bb_project_vars['gcp_zone'],
          'tempLocation': bb_storage_vars['gcs_temp_path'],
      },
  }
  main_dag = airflow_utils.initialize_airflow_dag(
      dag_id=_DAG_ID,
      schedule=None,
      retries=blockbuster_constants.DEFAULT_DAG_RETRY,
      retry_delay=blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
      start_days_ago=blockbuster_constants.DEFAULT_START_DAYS_AGO,
      **args)
  load_data_subdag = subdag_operator.SubDagOperator(
      task_id=_LOAD_DATA_TASK_NAME,
      subdag=load_data_dag.create_dag(args, _DAG_ID),
      dag=main_dag)
  train_model_subdag = subdag_operator.SubDagOperator(
      task_id=_TRAIN_MODEL_TASK_NAME,
      subdag=train_model_dag.create_dag(args, _DAG_ID),
      dag=main_dag)
  helpers.chain(load_data_subdag, train_model_subdag)
  return main_dag
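
# Airflow's scheduler discovers DAGs through module-level globals, so a DAG
# definition file like this one typically ends with a registration line such
# as the sketch below (the exact line in this repo may differ):
#
#   dag = create_train_model_dag()
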
def create_dag(
    args: Mapping[str, Any],
    output_type: blockbuster_constants.PreprocessingType,
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
  """Generates a DAG that analyzes data before preprocessing.

  Args:
    args: Arguments to provide to the Operators as defaults.
    output_type: Which set of Variables to load for preprocessing.
    parent_dag_name: If this is provided, this is a SubDAG.

  Returns:
    The DAG object.
  """
  # Load params from Airflow Variables.
  preprocess_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_PREPROCESS_CONFIG)
  prediction_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_PREDICTION_ACTIVATION_CONFIG)
  storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
  training_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_TRAINING_CONFIG)
  feature_vars = dag_utils.get_feature_config_val(
      blockbuster_constants.BLOCKBUSTER_FEATURE_CONFIG)
  dag = airflow_utils.initialize_airflow_dag(
      dag_utils.get_dag_id(_DAG_NAME, parent_dag_name),
      None,
      blockbuster_constants.DEFAULT_DAG_RETRY,
      blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
      blockbuster_constants.DEFAULT_START_DAYS_AGO,
      **args)
  if output_type == blockbuster_constants.PreprocessingType.TRAINING:
    bucket_name, bucket_path = dag_utils.extract_bucket_parts(
        f'{storage_vars["gcs_temp_path"]}/training')
  else:
    bucket_name, bucket_path = dag_utils.extract_bucket_parts(
        f'{storage_vars["gcs_temp_path"]}/prediction')
  clean_temp_dir_task = gcs_delete_operator.GoogleCloudStorageDeleteOperator(
      task_id=_CLEAN_TEMP_DIR_TASK,
      bucket=bucket_name,
      directory=bucket_path,
      dag=dag)
  user_session_pipeline_task = add_user_session_task(
      dag, _USER_SESSION_TASK_ID, output_type, feature_vars, prediction_vars,
      preprocess_vars, storage_vars, training_vars)
  if output_type == blockbuster_constants.PreprocessingType.TRAINING:
    data_visualization_pipeline_task = add_data_visualization_task(
        dag, _DATA_VISUALISATION_TASK_ID, preprocess_vars, storage_vars)
    generate_categorical_stats_task = add_categorical_stats_task(
        dag, feature_vars, storage_vars)
    generate_numeric_stats_task = add_numeric_stats_task(
        dag, feature_vars, storage_vars)
    helpers.chain(
        clean_temp_dir_task, user_session_pipeline_task,
        data_visualization_pipeline_task,
        [generate_categorical_stats_task, generate_numeric_stats_task])
  else:
    helpers.chain(clean_temp_dir_task, user_session_pipeline_task)
  return dag
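
# Illustrative invocation for the training flavor of this DAG, assuming `args`
# is assembled the same way as in the parent DAG factories above:
#
#   dag = create_dag(args, blockbuster_constants.PreprocessingType.TRAINING)
#
# Passing PreprocessingType.PREDICTION instead skips the visualization and
# stats tasks, as the branch above shows.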