def create_dag(
    args: Mapping[str, Any],
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
  """Generates a DAG that pushes data from Google Cloud Storage to GA.

  Args:
    args: Arguments to provide to the Airflow DAG object as defaults.
    parent_dag_name: If provided, the DAG is created as a SubDAG of the
      parent DAG.

  Returns:
    The DAG object.
  """
  dag = airflow_utils.initialize_airflow_dag(
      dag_id=dag_utils.get_dag_id(_DAG_NAME, parent_dag_name),
      schedule=None,
      retries=blockbuster_constants.DEFAULT_DAG_RETRY,
      retry_delay=blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
      start_days_ago=blockbuster_constants.DEFAULT_START_DAYS_AGO, **args)

  storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)

  bucket_name, bucket_path = dag_utils.extract_bucket_parts(
      storage_vars['gcs_output_path'])
  _cleanup_storage_task(dag, bucket_name, bucket_path)

  return dag
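The storage settings above come from a single Airflow Variable holding a JSON object. As a hedged sketch (the project's real helper may differ), airflow_utils.retrieve_airflow_variable_as_dict plausibly wraps Airflow's Variable.get with JSON deserialization:

from typing import Any, Dict

from airflow.models import Variable


def retrieve_airflow_variable_as_dict(key: str) -> Dict[str, Any]:
    # deserialize_json=True parses the stored JSON so callers can read
    # fields such as storage_vars['gcs_output_path'].
    value = Variable.get(key, deserialize_json=True)
    if not isinstance(value, dict):
        raise ValueError(f'Airflow Variable "{key}" is not a JSON object.')
    return value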
Example 2
def create_dag(
    args: Mapping[str, Any],
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
    """Generates a DAG that trains a new model using the training dataset.

  Args:
    args: Arguments to provide to the Airflow DAG object as defaults.
    parent_dag_name: If this is provided, this is a SubDAG.

  Returns:
    The DAG object.
  """
    dag = airflow_utils.initialize_airflow_dag(
        dag_id=dag_utils.get_dag_id(_DAG_NAME, parent_dag_name),
        schedule=None,
        retries=blockbuster_constants.DEFAULT_DAG_RETRY,
        retry_delay=blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
        start_days_ago=blockbuster_constants.DEFAULT_START_DAYS_AGO,
        **args)

    bb_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
    training_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_TRAINING_CONFIG)

    create_model_task = _add_create_model_task(dag, bb_vars, training_vars)
    update_airflow_variable_task = _add_update_airflow_variable_task(dag)

    helpers.chain(create_model_task, update_airflow_variable_task)
    return dag
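helpers.chain above is assumed to behave like Airflow's chain() utility, wiring the tasks sequentially (chain(a, b) has the same effect as a >> b). A minimal, self-contained illustration with stock operators; the DAG id and operator choice are illustrative only:

import pendulum
from airflow import DAG
from airflow.models.baseoperator import chain
from airflow.operators.empty import EmptyOperator

with DAG('chain_demo',
         start_date=pendulum.datetime(2024, 1, 1),
         schedule=None) as demo_dag:
    create_model = EmptyOperator(task_id='create_model_task')
    update_variable = EmptyOperator(task_id='update_airflow_variable_task')
    # Same effect as: create_model >> update_variable
    chain(create_model, update_variable)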
Example 3
def create_dag(
    args: Mapping[Text, Any],
    parent_dag_name: Optional[Text] = None,
) -> models.DAG:
    """Generates a DAG that loads data into an AutoML Dataset.

  Args:
    args: Arguments to provide to the AutoML operators as defaults.
    parent_dag_name: If this value is provided, the newly created dag object is
      made a subdag of the parent dag.

  Returns:
    The DAG object.
  """
    # Load params from Variables.
    bb_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
    storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
    # Create dag.
    dag = airflow_utils.initialize_airflow_dag(
        dag_utils.get_dag_id(_DAG_NAME, parent_dag_name),
        None,  # schedule
        blockbuster_constants.DEFAULT_DAG_RETRY,
        blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
        blockbuster_constants.DEFAULT_START_DAYS_AGO,
        local_macros={
            'get_column_spec': _get_column_spec,
            'target': 'predictionLabel',
            'extract_object_id':
            automl_hook.AutoMLTablesHook.extract_object_id,
        },
        **args)
    dataset_creation_task = _add_dataset_creation_task(dag, bb_vars)
    dataset_id = (
        "{{ task_instance.xcom_pull('create_dataset_task', key='dataset_id') }}"
    )
    import_data_task = _add_import_data_task(dag, dataset_id, bb_vars,
                                             storage_vars)
    list_table_specs_task = _add_list_table_specs_task(dag, dataset_id,
                                                       bb_vars)
    list_column_specs_task = _add_list_column_specs_task(
        dag, dataset_id, bb_vars)
    update_dataset_task = _add_update_dataset_task(dag, bb_vars)
    update_airflow_variable_task = _add_update_airflow_variable_task(dag)
    helpers.chain(dataset_creation_task, import_data_task,
                  list_table_specs_task, list_column_specs_task,
                  update_dataset_task, update_airflow_variable_task)
    return dag
Example 4
def create_dag(
    args: Dict[str, Union[Dict[str, Any], dt.datetime]],
    output_type: blockbuster_constants.PreprocessingType,
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
    """Generates a DAG that create source table from GA tables.


  Args:
    args: Arguments to provide to the operators as defaults.
    output_type: Which set of variables to load for preprocessing and
      prediction.
    parent_dag_name: If this is provided, this is a SubDAG.

  Returns:
    The DAG object.
  """
    # Load params from Airflow Variables.
    bb_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_GLOBAL_CONFIG)
    preprocess_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_PREPROCESS_CONFIG)
    prediction_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_PREDICTION_ACTIVATION_CONFIG)
    storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
    dag_id = dag_utils.get_dag_id(DAG_NAME, parent_dag_name)
    dag = airflow_utils.initialize_airflow_dag(
        dag_id, None, blockbuster_constants.DEFAULT_DAG_RETRY,
        blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
        blockbuster_constants.DEFAULT_START_DAYS_AGO, **args)
    if output_type == blockbuster_constants.PreprocessingType.TRAINING:
        table_suffix = 'training'
        sql_vars = get_sql_params_for_training_stage(preprocess_vars)
    elif output_type == blockbuster_constants.PreprocessingType.PREDICTION:
        table_suffix = 'prediction'
        sql_vars = get_sql_params_for_prediction_stage(preprocess_vars,
                                                       prediction_vars)
    sql = pipeline_utils.render_sql_from_template('source_leads', **sql_vars)
    bq_working_project = storage_vars['bq_working_project']
    bq_working_dataset = storage_vars['bq_working_dataset']
    leads_table = get_leads_table(bq_working_project, bq_working_dataset,
                                  table_suffix)
    gcp_region = bb_vars['gcp_region']
    add_prepare_source_data_task_to_dag(dag, sql, leads_table, gcp_region)
    return dag
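pipeline_utils.render_sql_from_template is not shown in this example; a plausible sketch, assuming it renders a named Jinja SQL template with keyword parameters (the template directory and file naming are assumptions):

import jinja2


def render_sql_from_template(template_name: str, **sql_params) -> str:
    env = jinja2.Environment(
        loader=jinja2.FileSystemLoader('templates'),
        # Fail fast if the caller forgets a parameter the template needs.
        undefined=jinja2.StrictUndefined)
    return env.get_template(f'{template_name}.sql').render(**sql_params)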
Example 5
def create_dag(
    args: Mapping[str, Any],
    output_type: blockbuster_constants.PreprocessingType,
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
    """Generates a DAG that preprocesses data.

  Args:
    args: Arguments to provide to the Operators as defaults.
    output_type: Which set of Variables to load for preprocessing
    parent_dag_name: If this is provided, this is a SubDAG.

  Returns:
    The DAG object.
  """
    preprocess_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_PREPROCESS_CONFIG)
    prediction_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_PREDICTION_ACTIVATION_CONFIG)
    storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
    training_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_TRAINING_CONFIG)
    feature_options = dag_utils.get_feature_config_val(
        blockbuster_constants.BLOCKBUSTER_FEATURE_CONFIG)

    dag = airflow_utils.initialize_airflow_dag(
        dag_utils.get_dag_id(_DAG_NAME, parent_dag_name), None,
        blockbuster_constants.DEFAULT_DAG_RETRY,
        blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
        blockbuster_constants.DEFAULT_START_DAYS_AGO, **args)

    mlwp_sliding_window_pipeline_task = _add_mlwp_sliding_window_pipeline_task(
        dag, output_type, prediction_vars, preprocess_vars, storage_vars,
        training_vars)
    mlwp_generate_features_pipeline_task = _add_mlwp_generate_features_pipeline_task(
        dag, output_type, feature_options, storage_vars)
    prepare_automl_data_in_bq_task = _add_prepare_automl_data_in_bq_task(
        dag, output_type, prediction_vars, storage_vars)
    helpers.chain(mlwp_sliding_window_pipeline_task,
                  mlwp_generate_features_pipeline_task,
                  prepare_automl_data_in_bq_task)

    return dag
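Each factory above accepts parent_dag_name and derives its dag_id through dag_utils.get_dag_id. A sketch of the likely naming rule, assuming it follows Airflow's SubDAG convention of '<parent_dag_id>.<child_name>' (the real helper may differ):

from typing import Optional


def get_dag_id(dag_name: str, parent_dag_name: Optional[str] = None) -> str:
    # SubDagOperator expects the child DAG id to be '<parent_dag_id>.<task_id>'.
    if parent_dag_name:
        return f'{parent_dag_name}.{dag_name}'
    return dag_name


assert get_dag_id('preprocess') == 'preprocess'
assert get_dag_id('preprocess', 'bb_training') == 'bb_training.preprocess'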
Example 6
def create_batch_predict_dag(
    args: Dict[str, Union[Dict[str, Any], dt.datetime]],
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
  """Generates a DAG/SubDAG that predicts using an existing model.

  Args:
    args: Arguments to provide to the operators as defaults.
    parent_dag_name: If provided, the DAG is created as a SubDAG of the
      parent DAG.

  Returns:
    DAG

  Raises:
    KeyError: If recent_model_id Airflow Variable hasn't been set.
  """
  dag_id = dag_utils.get_dag_id(DAG_NAME, parent_dag_name)

  dag = airflow_utils.initialize_airflow_dag(
      dag_id=dag_id,
      schedule=None,
      retries=blockbuster_constants.DEFAULT_DAG_RETRY,
      retry_delay=blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
      start_days_ago=blockbuster_constants.DEFAULT_START_DAYS_AGO,
      local_macros={'extract_dataset_id': _extract_dataset_id}, **args)

  batch_predict_task = _get_batch_predictions(dag, 'batch_predict_task')
  automl_bq_location = (
      '{{ extract_dataset_id('  # macro to be expanded in task Jinja template
      'task_instance.xcom_pull('
      '"batch_predict_task", key="bq_output_dataset")) }}')
  batch_predict_sql = _generate_batch_prediction_sql_template(
      'batch_predict', automl_bq_location)

  get_output_data_task = _store_final_results_to_bq(dag, 'get_output_data',
                                                    batch_predict_sql)

  bq_to_gcs_task = _transfer_bigquery_to_gcs(dag, 'bq_to_gcs')

  # `dag >> task` attaches the first task to the DAG and returns the task, so
  # the remaining `>>` calls chain the tasks in execution order.
  dag >> batch_predict_task >> get_output_data_task >> bq_to_gcs_task

  return dag
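automl_bq_location above is a Jinja string rendered inside the downstream task's templated fields: it pulls the BigQuery output dataset URI that the batch-predict task pushed to XCom and passes it through the extract_dataset_id macro registered via local_macros. A plausible sketch of such a macro; the URI format and parsing are assumptions:

def _extract_dataset_id(bq_output_dataset_uri: str) -> str:
    # AutoML batch prediction reports its BigQuery output as a URI like
    # 'bq://<project-id>.<dataset-id>'; keep only the dataset id.
    return bq_output_dataset_uri.rsplit('.', 1)[-1]


assert _extract_dataset_id('bq://my-proj.batch_out') == 'batch_out'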
Example 7
def create_dag(
    args: Mapping[str, Any],
    output_type: blockbuster_constants.PreprocessingType,
    parent_dag_name: Optional[str] = None,
) -> models.DAG:
    """Generates a DAG that analyzes data before preprocessing.

  Args:
    args: Arguments to provide to the Operators as defaults.
    output_type: Which set of Variables to load for preprocessing.
    parent_dag_name: If this is provided, this is a SubDAG.

  Returns:
    The DAG object.
  """
    # Load params from Airflow Variables
    preprocess_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_PREPROCESS_CONFIG)
    prediction_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_PREDICTION_ACTIVATION_CONFIG)
    storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
    training_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_TRAINING_CONFIG)
    feature_vars = dag_utils.get_feature_config_val(
        blockbuster_constants.BLOCKBUSTER_FEATURE_CONFIG)
    dag = airflow_utils.initialize_airflow_dag(
        dag_utils.get_dag_id(_DAG_NAME, parent_dag_name), None,
        blockbuster_constants.DEFAULT_DAG_RETRY,
        blockbuster_constants.DEFAULT_DAG_RETRY_DELAY_MINS,
        blockbuster_constants.DEFAULT_START_DAYS_AGO, **args)
    if output_type == blockbuster_constants.PreprocessingType.TRAINING:
        bucket_name, bucket_path = dag_utils.extract_bucket_parts(
            f'{storage_vars["gcs_temp_path"]}/training')
    else:
        bucket_name, bucket_path = dag_utils.extract_bucket_parts(
            f'{storage_vars["gcs_temp_path"]}/prediction')

    clean_temp_dir_task = gcs_delete_operator.GoogleCloudStorageDeleteOperator(
        task_id=_CLEAN_TEMP_DIR_TASK,
        bucket=bucket_name,
        directory=bucket_path,
        dag=dag)

    user_session_pipeline_task = add_user_session_task(
        dag, _USER_SESSION_TASK_ID, output_type, feature_vars, prediction_vars,
        preprocess_vars, storage_vars, training_vars)
    if output_type == blockbuster_constants.PreprocessingType.TRAINING:
        data_visualization_pipeline_task = add_data_visualization_task(
            dag, _DATA_VISUALISATION_TASK_ID, preprocess_vars, storage_vars)
        generate_categorical_stats_task = add_categorical_stats_task(
            dag, feature_vars, storage_vars)
        generate_numeric_stats_task = add_numeric_stats_task(
            dag, feature_vars, storage_vars)
        helpers.chain(
            clean_temp_dir_task, user_session_pipeline_task,
            data_visualization_pipeline_task,
            [generate_categorical_stats_task, generate_numeric_stats_task])
    else:
        helpers.chain(clean_temp_dir_task, user_session_pipeline_task)
    return dag
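A closing note on the training branch above: helpers.chain accepts a list, and, assuming Airflow chain() semantics, the visualization task fans out to both stats tasks (data_visualization_pipeline_task >> [generate_categorical_stats_task, generate_numeric_stats_task]), so the categorical and numeric stats pipelines can run in parallel.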