Example #1
def do_load_to_datastore(**kwargs):
    """ Saves the predictions results into Datastore. Because there is no way to
    directly load a CSV to Datastore, we use Apache Beam on Dataflow with
    templates gs://dataflow-templates/latest/GCS_Text_to_Datastore.
    https://cloud.google.com/dataflow/docs/templates/provided-templates#gcstexttodatastore
    """
    gcs_prediction_output = 'gs://{}/predictions/output'.format(
        COMPOSER_BUCKET_NAME)
    template = 'gs://dataflow-templates/latest/GCS_Text_to_Datastore'

    df_template_params = {
        'textReadPattern':
        '{}/prediction.results*'.format(gcs_prediction_output),
        'javascriptTextTransformGcsPath':
        'gs://{}/gcs_datastore_transform.js'.format(COMPOSER_BUCKET_NAME),
        'javascriptTextTransformFunctionName':
        'from_prediction_output_to_datastore_object',
        'datastoreWriteProjectId':
        PROJECT,
        'errorWritePath':
        'gs://{}/errors/serving_load'.format(COMPOSER_BUCKET_NAME)
    }

    dataflow_operator.DataflowTemplateOperator(
        task_id='gcs_predictions_df_transform',
        project_id=PROJECT,
        template=template,
        parameters=df_template_params,
        dag=dag).execute(kwargs)
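
# Hedged usage sketch (not from the source): do_load_to_datastore builds and
# runs the operator inline, so it would typically be scheduled through a
# PythonOperator; the task id and the presence of a module-level `dag` are
# assumptions.
from airflow.operators import python_operator

load_predictions_to_datastore = python_operator.PythonOperator(
    task_id='load_predictions_to_datastore',
    python_callable=do_load_to_datastore,
    provide_context=True,  # pass the Airflow context through as **kwargs
    dag=dag)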
Example #2

def _add_mlwp_generate_features_pipeline_task(
    dag: models.DAG,
    output_type: blockbuster_constants.PreprocessingType,
    feature_options: dag_utils.FeatureConfigListMapping,
    storage_vars: dag_utils.AirflowVarsConfig,
) -> dataflow_operator.DataflowTemplateOperator:
    """Adds the Generate Features Task of ML Windowing Pipeline to dag.

  Args:
    dag: The dag that the task needs to be added to.
    output_type: Indicates whether this pipeline is to be used for training or
      prediction.
    feature_options: The parsed config values from airflow feature object
      variable.
    storage_vars: The parsed config values from airflow storage variable.

  Returns:
    The configured Generate Features task that was added to the input dag.
  """
    if output_type == blockbuster_constants.PreprocessingType.TRAINING:
        output_table = 'training'
        training_mode = 'true'
        output_path = f'{storage_vars["gcs_temp_path"]}/training'
    elif output_type == blockbuster_constants.PreprocessingType.PREDICTION:
        output_table = 'prediction'
        training_mode = 'false'
        output_path = f'{storage_vars["gcs_temp_path"]}/prediction'
    template_file_directory = storage_vars['gcs_dataflow_path']
    step_4_output = (f'{storage_vars["bq_working_project"]}:'
                     f'{storage_vars["bq_working_dataset"]}.'
                     f'ga_{output_table}_input')

    # Always add the BB_id as a RECENT feature
    mod_features = list(feature_options['features'])
    mod_features.append({
        'fact': 'BB_id',
        'type': 'Categorical',
        'accumulators': 'recent'
    })
    feature_options_copy = dict(feature_options)
    feature_options_copy['features'] = mod_features

    return dataflow_operator.DataflowTemplateOperator(
        task_id='mlwp_step4',
        template=f'{template_file_directory}/GenerateFeaturesPipeline',
        parameters={
            **dag_utils.generate_feature_pipeline_parameters(feature_options_copy), 'windowedAvroLocation':
            f'{output_path}/windowing-output/*.avro',
            'featureDestinationTable':
            step_4_output,
            'trainMode':
            training_mode,
            'showEffectiveDateWeekOfYear':
            'false',
            'showEffectiveDateMonthOfYear':
            'false'
        },
        dag=dag)
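
# Hedged usage sketch (variable values are assumptions, not from the source):
# the helper above is typically called while assembling the preprocessing DAG,
# once per output type, with the parsed Airflow variable mappings.
generate_features_task = _add_mlwp_generate_features_pipeline_task(
    dag=dag,
    output_type=blockbuster_constants.PreprocessingType.TRAINING,
    feature_options=feature_options,
    storage_vars=storage_vars)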
Example #3
def add_data_visualization_task(
    dag: models.DAG, task_id: str,
    preprocess_vars: dag_utils.AirflowVarsConfig,
    storage_vars: dag_utils.AirflowVarsConfig
) -> dataflow_operator.DataflowTemplateOperator:
    """Builds the DataVisualizationPipeline Operator.

  Args:
    dag: The dag that the task needs to be added to.
    task_id: ID for this specific task within the DAG.
    preprocess_vars: The parsed config values from airflow preprocess variable.
    storage_vars: The parsed config values from airflow storage variable.

  Returns:
    Operator used to run the Data Visualization Pipeline on Dataflow.
  """
    template_file_directory = storage_vars['gcs_dataflow_path']

    p2_output_dataset = (f'{storage_vars["bq_working_project"]}:'
                         f'{storage_vars["bq_working_dataset"]}')

    proc_st_dt = datetime.datetime.strptime(str(preprocess_vars['start_date']),
                                            '%Y%m%d')
    proc_ed_dt = datetime.datetime.strptime(str(preprocess_vars['end_date']),
                                            '%Y%m%d')
    output_path = f'{storage_vars["gcs_temp_path"]}/training'
    lookback_days = int(preprocess_vars['lookback_days'])
    prediction_days = int(preprocess_vars['prediction_days'])

    return dataflow_operator.DataflowTemplateOperator(
        task_id=task_id,
        template=f'{template_file_directory}/DataVisualizationPipeline',
        parameters={
            'snapshotStartDate':
            get_date_str_from_date(proc_st_dt +
                                   datetime.timedelta(days=lookback_days),
                                   date_format='%d/%m/%Y'),
            'snapshotEndDate':
            get_date_str_from_date(proc_ed_dt -
                                   datetime.timedelta(days=prediction_days),
                                   date_format='%d/%m/%Y'),
            'inputAvroSessionsLocation':
            f'{output_path}/usersession-output/*.avro',
            'stopOnFirstPositiveLabel':
            str(preprocess_vars['stopOnFirstPositiveLabel']),
            'slideTimeInSeconds':
            str(preprocess_vars['slideTimeInSeconds']),
            'minimumLookaheadTimeInSeconds':
            str(preprocess_vars['minimumLookaheadTimeInSeconds']),
            'maximumLookaheadTimeInSeconds':
            str(preprocess_vars['maximumLookaheadTimeInSeconds']),
            'outputBigQueryUserActivityTable':
            f'{p2_output_dataset}.instance',
            'outputBigQueryFactsTable':
            f'{p2_output_dataset}.facts',
        },
        dag=dag)
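
# Worked example of the snapshot window above (illustrative values only): with
# start_date=20200101, end_date=20200331, lookback_days=30 and
# prediction_days=14, the operator receives
#   snapshotStartDate = 31/01/2020   (20200101 + 30 days)
#   snapshotEndDate   = 17/03/2020   (20200331 - 14 days)
# so only snapshots with a full lookback window and a full prediction window
# inside [start_date, end_date] are visualized.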
Example #4

def _add_mlwp_sliding_window_pipeline_task(
    dag: models.DAG, output_type: blockbuster_constants.PreprocessingType,
    prediction_vars: dag_utils.AirflowVarsConfig,
    preprocess_vars: dag_utils.AirflowVarsConfig,
    storage_vars: dag_utils.AirflowVarsConfig,
    training_vars: dag_utils.AirflowVarsConfig
) -> dataflow_operator.DataflowTemplateOperator:
    """Adds the Sliding Window Task of ML Windowing Pipeline to dag.

  Args:
    dag: The dag that the task needs to be added to.
    output_type: Indicates whether this pipeline is to be used for training or
      prediction.
    prediction_vars: The parsed config values from airflow prediction variable.
    preprocess_vars: The parsed config values from airflow preprocess variable.
    storage_vars: The parsed config values from airflow storage variable.
    training_vars: The parsed config values from airflow training variable.

  Returns:
    The configured Sliding Window task that was added to the input dag.
  """
    if output_type == blockbuster_constants.PreprocessingType.TRAINING:
        snapshot_start_dt, snapshot_end_dt, output_path = _get_sliding_window_pipeline_params_for_training(
            preprocess_vars, storage_vars, training_vars)
    elif output_type == blockbuster_constants.PreprocessingType.PREDICTION:
        snapshot_start_dt, snapshot_end_dt, output_path = _get_sliding_window_pipeline_params_for_prediction(
            prediction_vars, preprocess_vars, storage_vars)
    template_file_directory = storage_vars['gcs_dataflow_path']
    return dataflow_operator.DataflowTemplateOperator(
        task_id='mlwp_step3',
        template=f'{template_file_directory}/SlidingWindowPipeline',
        parameters={
            'snapshotStartDate':
            snapshot_start_dt,
            'snapshotEndDate':
            snapshot_end_dt,
            'inputAvroSessionsLocation':
            f'{output_path}/usersession-output/*.avro',
            'stopOnFirstPositiveLabel':
            str(preprocess_vars['stopOnFirstPositiveLabel']),
            'slideTimeInSeconds':
            str(preprocess_vars['slideTimeInSeconds']),
            'minimumLookaheadTimeInSeconds':
            str(preprocess_vars['minimumLookaheadTimeInSeconds']),
            'maximumLookaheadTimeInSeconds':
            str(preprocess_vars['maximumLookaheadTimeInSeconds']),
            'lookbackGapInSeconds':
            str(int(preprocess_vars['lookbackGapInDays']) * 86400),
            'windowTimeInSeconds':
            str(preprocess_vars['windowTimeInSeconds']),
            'outputSlidingWindowAvroPrefix':
            f'{output_path}/windowing-output/',
        },
        dag=dag)
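
# Hedged note (ordering inferred from the parameter values, not stated in the
# source): mlwp_step3 writes to {output_path}/windowing-output/ and mlwp_step4
# reads its windowedAvroLocation from the same prefix, so the two tasks would
# typically be chained, e.g.
#
#   sliding_window_task >> generate_features_task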
Example #5
def add_user_session_task(
        dag: models.DAG, task_id: str,
        output_type: blockbuster_constants.PreprocessingType,
        feature_vars: dag_utils.FeatureConfigListMapping,
        prediction_vars: dag_utils.AirflowVarsConfig,
        preprocess_vars: dag_utils.AirflowVarsConfig,
        storage_vars: dag_utils.AirflowVarsConfig,
        training_vars: dag_utils.AirflowVarsConfig) -> models.BaseOperator:
    """Builds the UserSessionPipeline Operator.

  Args:
    dag: The dag that the task needs to be added to.
    task_id: Id string for this specific task within the DAG.
    output_type: Indicates whether this pipeline is to be used for training or
      prediction.
    feature_vars: The parsed config values from airflow feature object variable.
    prediction_vars: The parsed config values from airflow prediction variable.
    preprocess_vars: The parsed config values from airflow preprocess variable.
    storage_vars: The parsed config values from airflow storage variable.
    training_vars: The parsed config values from airflow training variable.

  Returns:
    Operator to use within a DAG to run the User Session Pipeline on Dataflow.
  """
    # Pick the GCS output path based on whether this is a training or prediction run.
    if output_type == blockbuster_constants.PreprocessingType.TRAINING:
        output_path = f'{storage_vars["gcs_temp_path"]}/training'

    elif output_type == blockbuster_constants.PreprocessingType.PREDICTION:
        output_path = f'{storage_vars["gcs_temp_path"]}/prediction'

    template_file_directory = storage_vars['gcs_dataflow_path']
    sql_vars = get_user_session_sql_params(
        output_type,
        feature_vars,
        prediction_vars,
        preprocess_vars,
        storage_vars,
    )
    sql = pipeline_utils.render_sql_from_template('usersession_source',
                                                  **sql_vars)

    return dataflow_operator.DataflowTemplateOperator(
        task_id=task_id,
        template=f'{template_file_directory}/UserSessionPipeline',
        parameters={
            'inputBigQuerySQL': sql,
            'outputSessionsAvroPrefix': f'{output_path}/usersession-output/',
            'predictionFactName': training_vars['predictionFactName'],
            'predictionFactValues': training_vars['predictionFactValues']
        },
        dag=dag)
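
# Hedged sketch (not from the source): the *_vars mappings passed into
# add_user_session_task are parsed, JSON-valued Airflow Variables. With plain
# Airflow they could be loaded roughly as below; the variable names are
# assumptions, and the project's dag_utils parsing helpers are not shown here.
feature_vars = models.Variable.get('bb_features', deserialize_json=True)
prediction_vars = models.Variable.get('bb_prediction', deserialize_json=True)
preprocess_vars = models.Variable.get('bb_preprocess', deserialize_json=True)
storage_vars = models.Variable.get('bb_storage', deserialize_json=True)
training_vars = models.Variable.get('bb_training', deserialize_json=True)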
Example #6
                                       username=sftp_username,
                                       password=sftp_password,
                                       port=sftp_port)

    output_filename = 'output/report-%s-{{ run_id }}.csv' % advertiser_id

    process_elements = dataflow_operator.DataflowTemplateOperator(
        task_id='process_elements-%s' % advertiser_id,
        dataflow_default_options={
            'project': gcp_project,
            'zone': gcp_zone,
            'tempLocation': dataflow_staging,
        },
        parameters={
            'inputKeywordsFile':
            GCS_PATH_FORMAT % (gcs_bucket, REPORT_FILENAME),
            'outputKeywordsFile':
            GCS_PATH_FORMAT % (gcs_bucket, output_filename),
            'keywordColumnNames': output_file_header,
            'inputCustomDataFile': input_custom_data_file,
            'customDataColumnNames': custom_data_column_names,
            'advertiserId': advertiser_id
        },
        template=dataflow_template,
        gcp_conn_id=sa360_conn_id,
        dag=dag)
    download_file.set_downstream(process_elements)

    upload_to_sftp = gcs_to_sftp_operator.GCSToSFTPOperator(
        task_id='upload_to_sftp-%s' % advertiser_id,
        gcs_hook=gcs_hook,
        ssh_hook=connection_hook,
Example #7
                schedule_interval=None, default_args=DEFAULT_DAG_ARGS) as dag:
    # Args required for the Dataflow job.

    downloadminutes = python_operator.PythonOperator(task_id='downloadminutes',
                                                     python_callable=download_minutes,
                                                     op_args=[GCP_BUCKET, TARGET_EVENT],
                                                     provide_context=True)

    # Use Jinja templating to pull the XCom value returned by the downloadminutes task.
    job_args = {
        'input': "{{ task_instance.xcom_pull(task_ids='downloadminutes') }}",
        'output': OUTPUT_FILE_PATH
    }

    dataflow_task = dataflow_operator.DataflowTemplateOperator(
        template=DATAFLOW_MINUTES_TEMPLATE,
        task_id="processminutes",
        parameters=job_args)

    # Here we create two conditional tasks, one of which will be executed
    # based on whether the dataflow_task was a success or a failure.
    success_move_task = python_operator.PythonOperator(task_id='success-completion',
                                                       python_callable=task_completion,
                                                       op_args=[SUCCESS_TAG],
                                                       provide_context=True,
                                                       trigger_rule=TriggerRule.ALL_SUCCESS)

    failure_move_task = python_operator.PythonOperator(task_id='failure-completion',
                                                       python_callable=task_completion,
                                                       op_args=[FAILURE_TAG],
                                                       provide_context=True,
                                                       trigger_rule=TriggerRule.ALL_FAILED)
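
    # Hedged wiring sketch: the snippet stops before any dependencies are set;
    # the comments and trigger rules above imply roughly this ordering (an
    # assumption, not shown in the source).
    downloadminutes >> dataflow_task
    dataflow_task >> [success_move_task, failure_move_task]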
Example #8

default_dag_args = {
    'start_date': yesterday,
    # To email on failure or retry set 'email' arg to your email and enable
    # emailing here.
    'email_on_failure': False,
    'email_on_retry': False,
    # If a task fails, retry it once after waiting at least 5 minutes
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'dataflow_default_options': {
        'project': 'my-test-project-218908',
        'zone': 'europe-west1-d',
        'tempLocation': 'gs://staging-bucket-tes/staging'
    }
}

# [START bigquery_extracton_test]
with models.DAG(
        'dataflow_test',
        # Continue to run DAG once per day
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:
    # [END bigquery_extracton_test]

    first_dataflow = dataflow_operator.DataflowTemplateOperator(
        task_id='dataflow_test',
        template='gs://staging-bucket-tes/templates/PublisherDemo',
        gcp_conn_id='google_cloud_default',
        dag=dag)
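
    # Note: project, zone and tempLocation for this template run come from
    # 'dataflow_default_options' in default_dag_args, which the DAG passes to
    # the operator through default_args.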

first_dataflow
Example #9
]

# Process two days' worth of UTC-based data so it can be presented in Korea Standard Time.
for i in range(2):
    output_directory = '{}/data/log/rescuetime'.format(datalake_gs)
    # To store RescueTime data recorded in Korea time (+9:00) on a UTC basis, fetch two days of data in one run.
    load_rescuetime = bash_operator.BashOperator(
        task_id=('load_rescuetime-%s' % i),
        bash_command=
        'java -jar ${{AIRFLOW_HOME}}/dags/dd-importers-load-rescuetime.jar -user_id={} -api_key={} -input_begin_date={} -input_end_date={} -input_timezone=Asia/Seoul -output_date={} -output_timezone=UTC -output_directory={}  -output_filenameprefix={} -shard_size=3'
        .format(user_id, api_key, input_begin_dates[i], input_end_dates[i],
                input_begin_dates[i], output_directory,
                output_filename_prefixes[i]),
        dag=dag)

    create_rescuetime_bd = dataflow_operator.DataflowTemplateOperator(
        task_id=('create_rescuetime_bd-%s' % i),
        template='{}/templates/dd-etls-create-rescuetime'.format(dataflow_gs),
        parameters={
            'runner':
            'DataflowRunner',
            'inputFilePattern':
            '{}/data/log/rescuetime/{}Z-*'.format(datalake_gs, bd_dates[i]),
            'outputTable':
            '{}:dw_datadriver.rescuetime_tbl_bd_data${}'.format(
                project_id, bd_dates[i])
        },
        dag=dag,
        gcp_conn_id='gcp-airflow-service-account')
    create_rescuetime_bd.set_upstream(load_rescuetime)
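
    # Note: the outputTable value above uses BigQuery's partition decorator
    # ('<table>$YYYYMMDD'), so each loop iteration loads exactly one daily
    # partition of dw_datadriver.rescuetime_tbl_bd_data.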
Example #10
                description='A DAG triggered by an external Cloud Function',
                schedule_interval=None,
                default_args=DEFAULT_DAG_ARGS) as dag:

    # Build arguments for the Dataflow task. dag_run.conf provides access to
    # the input variables passed by the triggering Cloud Function.
    job_args = {
        'bigtableInstanceId': config['bt_instance'],
        'bigtableTableId': '{{ dag_run.conf["bigtable_id"] }}',
        'inputFile': '{{ dag_run.conf["input_file"] }}',
        'bigtableProjectId': config['gcp_project'],
    }

    # Main Dataflow task that will process and load the input csv file.
    dataflow_task = dataflow_operator.DataflowTemplateOperator(
        task_id='csv_to_bt',
        template=config['dataflow_template_location'],
        parameters=job_args)

    success_task = python_operator.PythonOperator(
        task_id='success-move-to-completion',
        python_callable=update_on_completion,
        op_args=[SUCCESS_TAG, FAILURE_TAG],
        provide_context=True,
        trigger_rule=TriggerRule.ALL_SUCCESS)

    failure_task = python_operator.PythonOperator(
        task_id='failure-move-to-completion',
        python_callable=update_on_completion,
        op_args=[FAILURE_TAG, SUCCESS_TAG],
        provide_context=True,
        trigger_rule=TriggerRule.ALL_FAILED)
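
    # Hedged wiring sketch: the source does not show the dependency setup; the
    # trigger rules imply roughly this ordering. The dag_run.conf payload sent
    # by the triggering Cloud Function would look like, for example,
    # {"bigtable_id": "my-table", "input_file": "gs://my-bucket/input.csv"}
    # (illustrative values only).
    dataflow_task >> [success_task, failure_task]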
Example #11

                filename_prefixes[i]),
        dag=dag)

    create_googlefitness_bd = dataflow_operator.DataflowTemplateOperator(
        task_id=('create_googlefitness_bd-%s' % i),
        template='{}/templates/dd-etls-create-googlefitness'.format(
            dataflow_gs),
        parameters={
            'runner':
            'DataflowRunner',
            'beginTime':
            begin_times[i],
            'endTime':
            end_times[i],
            'inputAggregatedDatasetsFilePattern':
            '{}/data/log/googlefitness/{}Z-*-aggregated-datasets-*'.format(
                datalake_gs, bd_dates[i]),
            'inputSessionsFilePattern':
            '{}/data/log/googlefitness/{}Z-*-sessions-*'.format(
                datalake_gs, bd_dates[i]),
            'outputAggregatedDatasetsTable':
            '{}:dw_datadriver.googlefitness_tbl_bd_aggregated_datasets${}'.
            format(project_id, bd_dates[i]),
            'outputSessionsTable':
            '{}:dw_datadriver.googlefitness_tbl_bd_sessions${}'.format(
                project_id, bd_dates[i])
        },
        dag=dag,
        gcp_conn_id='gcp-airflow-service-account')
    create_googlefitness_bd.set_upstream(load_googlefitness)