def create_training_prepare_source_dag() -> models.DAG:
  """Creates the pipeline for preparing source for training the model.

  Returns:
    Parent training DAG.
  """
  bb_project_vars = json.loads(models.Variable.get('bb_project'))
  bb_storage_vars = json.loads(models.Variable.get('bb_storage'))
  args = {
      'start_date': airflow.utils.dates.days_ago(1),
      'dataflow_default_options': {
          'project': bb_project_vars['gcp_project_id'],
          'region': bb_project_vars['gcp_region'],
          'zone': bb_project_vars['gcp_zone'],
          'tempLocation': bb_storage_vars['gcs_temp_path']
      },
  }

  main_dag = airflow_utils.initialize_airflow_dag(
      _DAG_ID, None, blockbuster_constants.DEFAULT_DAG_RETRY,
      blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
      blockbuster_constants.DEFAULT_START_DAYS_AGO, **args)

  create_prepare_source_subdag(main_dag, args)
  return main_dag
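A minimal usage sketch, assuming this factory lives in a module inside the Airflow DAGs folder: the scheduler only registers DAG objects bound to a module-level name, so the factory is typically invoked at import time.

# Sketch only: bind the DAG at module level so the Airflow scheduler picks it up.
dag = create_training_prepare_source_dag()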
Example #2
def create_dag(
    args: Mapping[str, Any],
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
    """Generates a DAG that trains a new model using the training dataset.

  Args:
    args: Arguments to provide to the Airflow DAG object as defaults.
    parent_dag_name: If this is provided, this is a SubDAG.

  Returns:
    The DAG object.
  """
    dag = airflow_utils.initialize_airflow_dag(
        dag_id=dag_utils.get_dag_id(_DAG_NAME, parent_dag_name),
        schedule=None,
        retries=blockbuster_constants.DEFAULT_DAG_RETRY,
        retry_delay=blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
        start_days_ago=blockbuster_constants.DEFAULT_START_DAYS_AGO,
        **args)

    bb_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
    training_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_TRAINING_CONFIG)

    create_model_task = _add_create_model_task(dag, bb_vars, training_vars)
    update_airflow_variable_task = _add_update_airflow_variable_task(dag)

    helpers.chain(create_model_task, update_airflow_variable_task)
    return dag
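The parent_dag_name argument matters because Airflow expects a SubDAG's dag_id to be '<parent_dag_id>.<task_id>'. A hedged sketch of what dag_utils.get_dag_id presumably does (the real helper may differ):

from typing import Optional

def get_dag_id(dag_name: str, parent_dag_name: Optional[str] = None) -> str:
  """Returns 'parent.child' when building a SubDAG, else the bare DAG name."""
  return f'{parent_dag_name}.{dag_name}' if parent_dag_name else dag_name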
Example #3
def create_training_preprocess_dag():
  """Creates the main dag for preprocess main dag.

  Returns:
    Parent training DAG for preprocessing.
  """
  bb_storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
  bb_project_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
  args = {
      'start_date': airflow.utils.dates.days_ago(1),
      'dataflow_default_options': {
          'project': bb_project_vars['gcp_project_id'],
          'region': bb_project_vars['gcp_region'],
          'zone': bb_project_vars['gcp_zone'],
          'tempLocation': bb_storage_vars['gcs_temp_path']
      },
  }

  main_dag = airflow_utils.initialize_airflow_dag(
      dag_id=_DAG_ID,
      schedule=None,
      retries=blockbuster_constants.DEFAULT_DAG_RETRY,
      retry_delay=blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
      start_days_ago=blockbuster_constants.DEFAULT_START_DAYS_AGO,
      **args)

  create_preprocess_subdag(main_dag, args)

  return main_dag
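Placing dataflow_default_options inside the DAG's default args means any Dataflow operator created under this DAG (or the preprocess subdag, which receives the same args) inherits the project, region, zone and temp location without repeating them. A hedged sketch of such an operator; the import path varies by Airflow version and the pipeline file path is hypothetical:

from airflow.contrib.operators import dataflow_operator

run_preprocess = dataflow_operator.DataFlowPythonOperator(
    task_id='run_preprocess_pipeline',
    py_file='/home/airflow/gcs/dags/preprocess_pipeline.py',  # hypothetical path
    dag=main_dag)  # project/region/zone/tempLocation come from the defaults above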
def create_dag(
    args: Mapping[str, Any],
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
  """Generates a DAG that pushes data from Google Cloud Storage to GA.

  Args:
    args: Arguments to provide to the Airflow DAG object as defaults.
    parent_dag_name: If this is provided, this is a SubDAG.

  Returns:
    The DAG object.
  """
  dag = airflow_utils.initialize_airflow_dag(
      dag_id=dag_utils.get_dag_id(_DAG_NAME, parent_dag_name),
      schedule=None,
      retries=blockbuster_constants.DEFAULT_DAG_RETRY,
      retry_delay=blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
      start_days_ago=blockbuster_constants.DEFAULT_START_DAYS_AGO, **args)

  storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)

  bucket_name, bucket_path = dag_utils.extract_bucket_parts(
      storage_vars['gcs_output_path'])
  _cleanup_storage_task(dag, bucket_name, bucket_path)

  return dag
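dag_utils.extract_bucket_parts presumably splits a 'gs://bucket/prefix' URI into its bucket name and object prefix; a minimal sketch of such a helper:

from typing import Tuple

def extract_bucket_parts(gcs_path: str) -> Tuple[str, str]:
  """Splits e.g. 'gs://my-bucket/blockbuster/output' into ('my-bucket', 'blockbuster/output')."""
  bucket, _, prefix = gcs_path.replace('gs://', '', 1).partition('/')
  return bucket, prefix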
Example #5
def create_dag(
    args: Mapping[Text, Any],
    parent_dag_name: Optional[Text] = None,
) -> models.DAG:
    """Generates a DAG that loads data into an AutoML Dataset.

  Args:
    args: Arguments to provide to the AutoML operators as defaults.
    parent_dag_name: If this value is provided, the newly created dag object is
      made a subdag of the parent dag.

  Returns:
    The DAG object.
  """
    # Load params from Variables.
    bb_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
    storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
    # Create dag.
    dag = airflow_utils.initialize_airflow_dag(
        dag_utils.get_dag_id(_DAG_NAME, parent_dag_name),
        None,  # schedule
        blockbuster_constants.DEFAULT_DAG_RETRY,
        blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
        blockbuster_constants.DEFAULT_START_DAYS_AGO,
        local_macros={
            'get_column_spec': _get_column_spec,
            'target': 'predictionLabel',
            'extract_object_id':
            automl_hook.AutoMLTablesHook.extract_object_id,
        },
        **args)
    dataset_creation_task = _add_dataset_creation_task(dag, bb_vars)
    dataset_id = (
        "{{ task_instance.xcom_pull('create_dataset_task', key='dataset_id') }}"
    )
    import_data_task = _add_import_data_task(dag, dataset_id, bb_vars,
                                             storage_vars)
    list_table_specs_task = _add_list_table_specs_task(dag, dataset_id,
                                                       bb_vars)
    list_column_specs_task = _add_list_column_specs_task(
        dag, dataset_id, bb_vars)
    update_dataset_task = _add_update_dataset_task(dag, bb_vars)
    update_airflow_variable_task = _add_update_airflow_variable_task(dag)
    helpers.chain(dataset_creation_task, import_data_task,
                  list_table_specs_task, list_column_specs_task,
                  update_dataset_task, update_airflow_variable_task)
    return dag
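The local_macros mapping is presumably forwarded to the DAG's user_defined_macros, making the registered helpers callable from Jinja templates in templated operator fields, just like the dataset_id pull above. An illustrative, non-authoritative templated value:

# Hypothetical templated field in a downstream task: the 'extract_object_id'
# macro registered above turns the AutoML resource dict pulled from XCom into
# its trailing object id.
dataset_object_id = (
    "{{ extract_object_id(task_instance.xcom_pull('create_dataset_task')) }}")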
def create_dag(
    args: Dict[str, Union[Dict[str, Any], dt.datetime]],
    output_type: blockbuster_constants.PreprocessingType,
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
    """Generates a DAG that create source table from GA tables.


  Args:
    args: Arguments to provide to the operators as defaults.
    output_type: Which set of variables to load for preprocessing and
      prediction.
    parent_dag_name: If this is provided, this is a SubDAG.

  Returns:
    The DAG object.
  """
    # Load params from Airflow Variables.
    bb_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
    preprocess_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_PREPROCESS_CONFIG)
    prediction_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_PREDICTION_ACTIVATION_CONFIG)
    storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
    dag_id = dag_utils.get_dag_id(DAG_NAME, parent_dag_name)
    dag = airflow_utils.initialize_airflow_dag(
        dag_id, None, blockbuster_constants.DEFAULT_DAG_RETRY,
        blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
        blockbuster_constants.DEFAULT_START_DAYS_AGO, **args)
    if output_type == blockbuster_constants.PreprocessingType.TRAINING:
        table_suffix = 'training'
        sql_vars = get_sql_params_for_training_stage(preprocess_vars)
    elif output_type == blockbuster_constants.PreprocessingType.PREDICTION:
        table_suffix = 'prediction'
        sql_vars = get_sql_params_for_prediction_stage(preprocess_vars,
                                                       prediction_vars)
    sql = pipeline_utils.render_sql_from_template('source_leads', **sql_vars)
    bq_working_project = storage_vars['bq_working_project']
    bq_working_dataset = storage_vars['bq_working_dataset']
    leads_table = get_leads_table(bq_working_project, bq_working_dataset,
                                  table_suffix)
    gcp_region = bb_vars['gcp_region']
    add_prepare_source_data_task_to_dag(dag, sql, leads_table, gcp_region)
    return dag
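pipeline_utils.render_sql_from_template presumably renders a named Jinja SQL template with the computed parameters; a hedged sketch of such a helper (the template directory and file naming are assumptions):

import jinja2

def render_sql_from_template(template_name: str, **sql_vars) -> str:
  """Renders e.g. 'templates/source_leads.sql' with the supplied parameters."""
  env = jinja2.Environment(loader=jinja2.FileSystemLoader('templates'))
  return env.get_template(f'{template_name}.sql').render(**sql_vars)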
Example #7
def create_dag(
    args: Mapping[str, Any],
    output_type: blockbuster_constants.PreprocessingType,
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
    """Generates a DAG that preprocesses data.

  Args:
    args: Arguments to provide to the Operators as defaults.
    output_type: Which set of Variables to load for preprocessing.
    parent_dag_name: If this is provided, this is a SubDAG.

  Returns:
    The DAG object.
  """
    preprocess_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_PREPROCESS_CONFIG)
    prediction_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_PREDICTION_ACTIVATION_CONFIG)
    storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
    training_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_TRAINING_CONFIG)
    feature_options = dag_utils.get_feature_config_val(
        blockbuster_constants.BLOCKBUSTER_FEATURE_CONFIG)

    dag = airflow_utils.initialize_airflow_dag(
        dag_utils.get_dag_id(_DAG_NAME, parent_dag_name), None,
        blockbuster_constants.DEFAULT_DAG_RETRY,
        blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
        blockbuster_constants.DEFAULT_START_DAYS_AGO, **args)

    mlwp_sliding_window_pipeline_task = _add_mlwp_sliding_window_pipeline_task(
        dag, output_type, prediction_vars, preprocess_vars, storage_vars,
        training_vars)
    mlwp_generate_features_pipeline_task = _add_mlwp_generate_features_pipeline_task(
        dag, output_type, feature_options, storage_vars)
    prepare_automl_data_in_bq_task = _add_prepare_automl_data_in_bq_task(
        dag, output_type, prediction_vars, storage_vars)
    helpers.chain(mlwp_sliding_window_pipeline_task,
                  mlwp_generate_features_pipeline_task,
                  prepare_automl_data_in_bq_task)

    return dag
Example #8
def create_batch_predict_dag(
    args: Dict[str, Union[Dict[str, Any], dt.datetime]],
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
  """Generates a DAG/SubDAG that predicts using an existing model.

  Args:
    args: Arguments to provide to the operators as defaults.
    parent_dag_name: If this is provided, this is a SubDAG.

  Returns:
    DAG

  Raises:
    KeyError: If recent_model_id Airflow Variable hasn't been set.
  """
  dag_id = dag_utils.get_dag_id(DAG_NAME, parent_dag_name)

  dag = airflow_utils.initialize_airflow_dag(
      dag_id=dag_id,
      schedule=None,
      retries=blockbuster_constants.DEFAULT_DAG_RETRY,
      retry_delay=blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
      start_days_ago=blockbuster_constants.DEFAULT_START_DAYS_AGO,
      local_macros={'extract_dataset_id': _extract_dataset_id}, **args)

  batch_predict_task = _get_batch_predictions(dag, 'batch_predict_task')
  automl_bq_location = (
      '{{ extract_dataset_id('  # macro to be expanded in task Jinja template
      'task_instance.xcom_pull('
      '"batch_predict_task", key="bq_output_dataset")) }}')
  batch_predict_sql = _generate_batch_prediction_sql_template(
      'batch_predict', automl_bq_location)

  get_output_data_task = _store_final_results_to_bq(dag, 'get_output_data',
                                                    batch_predict_sql)

  bq_to_gcs_task = _transfer_bigquery_to_gcs(dag, 'bq_to_gcs')

  batch_predict_task >> get_output_data_task >> bq_to_gcs_task

  return dag
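The extract_dataset_id macro presumably converts the BigQuery output URI that AutoML batch prediction writes to XCom (for example 'bq://my-project.prediction_123') into a 'project.dataset' location usable inside SQL; a minimal sketch:

def _extract_dataset_id(bq_output_uri: str) -> str:
  """Turns 'bq://my-project.prediction_123' into 'my-project.prediction_123'."""
  return bq_output_uri.replace('bq://', '', 1)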
Example #9
def create_prediction_activate_dag():
    """Creates the main dag for analyze main dag.

  Returns:
    Parent training DAG for analyzing.
  """
    bb_storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
    bb_project_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
    args = {
        'start_date': airflow.utils.dates.days_ago(1),
        'dataflow_default_options': {
            'project': bb_project_vars['gcp_project_id'],
            'region': bb_project_vars['gcp_region'],
            'zone': bb_project_vars['gcp_zone'],
            'tempLocation': bb_storage_vars['gcs_temp_path']
        },
    }

    main_dag = airflow_utils.initialize_airflow_dag(
        dag_id=_DAG_ID,
        schedule=None,
        retries=blockbuster_constants.DEFAULT_DAG_RETRY,
        retry_delay=blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
        start_days_ago=blockbuster_constants.DEFAULT_START_DAYS_AGO,
        **args)

    prepare_source_task = create_prepare_source_task(
        main_dag, args, prepare_source_dag.DAG_NAME)
    analyze_task = create_analyze_task(main_dag, args, 'analyze')
    preprocess_task = create_preprocess_task(main_dag, args, 'preprocess')
    predict_task = create_predict_task(main_dag, args, 'batch_predict')
    activate_task = create_activate_task(main_dag, args, 'activate_ga')
    clean_up_task = create_cleanup_task(main_dag, args, 'cleanup_gcs')

    # Create task dependency pipeline.
    prepare_source_task.set_downstream(analyze_task)
    analyze_task.set_downstream(preprocess_task)
    preprocess_task.set_downstream(predict_task)
    predict_task.set_downstream(activate_task)
    activate_task.set_downstream(clean_up_task)
    return main_dag
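The explicit set_downstream calls build a strictly linear pipeline; the same dependencies can be expressed with Airflow's bitshift syntax:

# Equivalent dependency wiring:
(prepare_source_task >> analyze_task >> preprocess_task >> predict_task
 >> activate_task >> clean_up_task)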
Example #10
def create_train_model_dag() -> models.DAG:
    """Creates the main dag for train model main dag.

  Returns:
    Parent training DAG.
  """
    bb_storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
    bb_project_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
    args = {
        'start_date': airflow.utils.dates.days_ago(1),
        'dataflow_default_options': {
            'project': bb_project_vars['gcp_project_id'],
            'region': bb_project_vars['gcp_region'],
            'zone': bb_project_vars['gcp_zone'],
            'tempLocation': bb_storage_vars['gcs_temp_path']
        },
    }

    main_dag = airflow_utils.initialize_airflow_dag(
        dag_id=_DAG_ID,
        schedule=None,
        retries=blockbuster_constants.DEFAULT_DAG_RETRY,
        retry_delay=blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
        start_days_ago=blockbuster_constants.DEFAULT_START_DAYS_AGO,
        **args)
    load_data_subdag = subdag_operator.SubDagOperator(
        task_id=_LOAD_DATA_TASK_NAME,
        subdag=load_data_dag.create_dag(args, _DAG_ID),
        dag=main_dag)
    train_model_subdag = subdag_operator.SubDagOperator(
        task_id=_TRAIN_MODEL_TASK_NAME,
        subdag=train_model_dag.create_dag(args, _DAG_ID),
        dag=main_dag)

    helpers.chain(load_data_subdag, train_model_subdag)
    return main_dag
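SubDagOperator requires each child DAG's dag_id to be '<parent_dag_id>.<task_id>', so the constants here are presumably aligned; an illustrative note:

# Illustrative only: load_data_dag.create_dag(args, _DAG_ID) is expected to
# return a DAG whose dag_id equals f'{_DAG_ID}.{_LOAD_DATA_TASK_NAME}', matching
# the SubDagOperator task_id above; otherwise Airflow raises an AirflowException.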
Example #11
def create_dag(
    args: Mapping[str, Any],
    output_type: blockbuster_constants.PreprocessingType,
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
    """Generates a DAG that analyzes data before preprocessing.

  Args:
    args: Arguments to provide to the Operators as defaults.
    output_type: Which set of Variables to load for preprocessing.
    parent_dag_name: If this is provided, this is a SubDAG.

  Returns:
    The DAG object.
  """
    # Load params from Airflow Variables
    preprocess_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_PREPROCESS_CONFIG)
    prediction_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_PREDICTION_ACTIVATION_CONFIG)
    storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
    training_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_TRAINING_CONFIG)
    feature_vars = dag_utils.get_feature_config_val(
        blockbuster_constants.BLOCKBUSTER_FEATURE_CONFIG)
    dag = airflow_utils.initialize_airflow_dag(
        dag_utils.get_dag_id(_DAG_NAME, parent_dag_name), None,
        blockbuster_constants.DEFAULT_DAG_RETRY,
        blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
        blockbuster_constants.DEFAULT_START_DAYS_AGO, **args)
    if output_type == blockbuster_constants.PreprocessingType.TRAINING:
        bucket_name, bucket_path = dag_utils.extract_bucket_parts(
            f'{storage_vars["gcs_temp_path"]}/training')
    else:
        bucket_name, bucket_path = dag_utils.extract_bucket_parts(
            f'{storage_vars["gcs_temp_path"]}/prediction')

    clean_temp_dir_task = gcs_delete_operator.GoogleCloudStorageDeleteOperator(
        task_id=_CLEAN_TEMP_DIR_TASK,
        bucket=bucket_name,
        directory=bucket_path,
        dag=dag)

    user_session_pipeline_task = add_user_session_task(
        dag, _USER_SESSION_TASK_ID, output_type, feature_vars, prediction_vars,
        preprocess_vars, storage_vars, training_vars)
    if output_type == blockbuster_constants.PreprocessingType.TRAINING:
        data_visualization_pipeline_task = add_data_visualization_task(
            dag, _DATA_VISUALISATION_TASK_ID, preprocess_vars, storage_vars)
        generate_categorical_stats_task = add_categorical_stats_task(
            dag, feature_vars, storage_vars)
        generate_numeric_stats_task = add_numeric_stats_task(
            dag, feature_vars, storage_vars)
        helpers.chain(
            clean_temp_dir_task, user_session_pipeline_task,
            data_visualization_pipeline_task,
            [generate_categorical_stats_task, generate_numeric_stats_task])
    else:
        helpers.chain(clean_temp_dir_task, user_session_pipeline_task)
    return dag
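helpers.chain with a list fans the pipeline out, so both stats tasks run in parallel after the visualization step; the equivalent bitshift wiring would be:

# Equivalent to the helpers.chain call in the training branch:
clean_temp_dir_task >> user_session_pipeline_task
user_session_pipeline_task >> data_visualization_pipeline_task
data_visualization_pipeline_task >> [generate_categorical_stats_task,
                                     generate_numeric_stats_task]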