Example #1
    def test_execute(self, mock_hook):
        ignore_if_missing = True
        deletion_dataset_table = '{}.{}'.format(TEST_DATASET, TEST_TABLE_ID)

        operator = BigQueryTableDeleteOperator(
            task_id=TASK_ID,
            deletion_dataset_table=deletion_dataset_table,
            ignore_if_missing=ignore_if_missing)

        operator.execute(None)
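        # The delete should be delegated to the hook: get_conn() -> cursor() -> run_table_delete().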
        mock_hook.return_value \
            .get_conn.return_value \
            .cursor.return_value \
            .run_table_delete \
            .assert_called_once_with(
                deletion_dataset_table=deletion_dataset_table,
                ignore_if_missing=ignore_if_missing
            )
Example #2
    def test_execute(self, mock_hook):
        ignore_if_missing = True
        deletion_dataset_table = '{}.{}'.format(TEST_DATASET, TEST_TABLE_ID)

        operator = BigQueryTableDeleteOperator(
            task_id=TASK_ID,
            deletion_dataset_table=deletion_dataset_table,
            ignore_if_missing=ignore_if_missing
        )

        operator.execute(None)
        mock_hook.return_value \
            .get_conn.return_value \
            .cursor.return_value \
            .run_table_delete \
            .assert_called_once_with(
                deletion_dataset_table=deletion_dataset_table,
                ignore_if_missing=ignore_if_missing
            )
Example #3
    "--concurrency",
    "10",
    "--bucket",
    gcs_data_bucket,
    "--no-resume",
    "--prefix",
    objects_prefix,
    "--cluster-by",
    "crash_date",
]

# We remove the current date partition for idempotency.
remove_bq_table_partition = BigQueryTableDeleteOperator(
    task_id="remove_bq_table_partition",
    bigquery_conn_id=bq_gcp_conn_id,
    deletion_dataset_table="{}.{}${{{{ds_nodash}}}}".format(
        bq_dataset, bq_table_name),
    ignore_if_missing=True,
    dag=dag,
)

bq_load = GKEPodOperator(
    task_id="bigquery_load",
    gcp_conn_id=bq_gcp_conn_id,
    project_id=bq_connection.project_id,
    name="load-socorro-crash-parquet-to-bq",
    image=docker_image,
    arguments=gke_args,
    env_vars={
        "GOOGLE_CLOUD_PROJECT": "{{ var.value.gcp_shared_prod_project }}"
    },
    dag=dag,
Example #4
        schema_fields=[
            {
                "name": "emp_name",
                "type": "STRING",
                "mode": "REQUIRED"
            },
            {
                "name": "salary",
                "type": "INTEGER",
                "mode": "NULLABLE"
            },
        ],
    )

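    # Drop the table created above; deletion_dataset_table takes the '<dataset>.<table>' form.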
    delete_table = BigQueryTableDeleteOperator(
        task_id="delete-table",
        deletion_dataset_table="{}.test_table".format(DATASET_NAME))

    get_dataset = BigQueryGetDatasetOperator(task_id="get-dataset",
                                             dataset_id=DATASET_NAME)

    get_dataset_result = BashOperator(
        task_id="get-dataset-result",
        bash_command=
        "echo \"{{ task_instance.xcom_pull('get-dataset')['id'] }}\"",
    )

    patch_dataset = BigQueryPatchDatasetOperator(
        task_id="patch-dataset",
        dataset_id=DATASET_NAME,
        dataset_resource={
Example #5
def load_to_bigquery(parent_dag_name=None,
                     default_args=None,
                     dataset_s3_bucket=None,
                     aws_conn_id=None,
                     dataset=None,
                     dataset_version=None,
                     gke_cluster_name=None,
                     date_submission_col='submission_date_s3',
                     ds_type='ds_nodash',
                     dag_name='load_to_bigquery',
                     gke_location='us-central1-a',
                     gke_namespace='default',
                     docker_image='docker.io/mozilla/parquet2bigquery:20191017', # noqa
                     reprocess=False,
                     p2b_concurrency='10',
                     p2b_resume=False,
                     p2b_table_alias=None,
                     objects_prefix=None,
                     spark_gs_dataset_location=None,
                     bigquery_dataset='telemetry',
                     dataset_gcs_bucket='moz-fx-data-derived-datasets-parquet',
                     gcp_conn_id='google_cloud_derived_datasets',
                     cluster_by=(),
                     drop=(),
                     rename={},
                     replace=()):

    """ Load Parquet data into BigQuery. Used with SubDagOperator.

    We use S3ToGoogleCloudStorageTransferOperator to create a GCS Transfer
    Service job to transfer the AWS S3 parquet data into a GCS Bucket.
    Once that is completed, we launch a Kubernetes pod on an existing GKE
    cluster using the GKEPodOperator.

    :param str parent_dag_name:            parent dag name
    :param dict default_args:              dag configuration
    :param str dataset_s3_bucket:          source S3 Bucket
    :param str dataset_gcs_bucket:         destination GCS Bucket
    :param str aws_conn_id:                airflow connection id for S3 access
    :param str gcp_conn_id:                airflow connection id for GCP access
    :param str dataset:                    dataset name
    :param str dataset_version:            dataset version
    :param str date_submission_col:        dataset date submission column
    :param str ds_type:                    dataset format (ds or ds_nodash)
    :param str gke_location:               GKE cluster zone
    :param str gke_namespace:              GKE cluster namespace
    :param str docker_image:               docker image to use for GKE pod operations # noqa
    :param str bigquery_dataset:           bigquery load destination dataset
    :param str p2b_concurrency:            number of processes for parquet2bigquery load
    :param str p2b_table_alias:            override p2b table name with alias
    :param bool p2b_resume:                allow resume support. Defaults to False
    :param bool reprocess:                 enable dataset reprocessing. Defaults to False
    :param str objects_prefix:             custom objects_prefix to override defaults
    :param str spark_gs_dataset_location:  custom spark dataset load location to override defaults
    :param List[str] cluster_by:           top level fields to cluster by when creating destination table
    :param List[str] drop:                 top level fields to exclude from destination table
    :param Dict[str, str] rename:          top level fields to rename in destination table
    :param List[str] replace:              top level field replacement expressions

    :return: airflow.models.DAG
    """

    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    _dag_name = '{}.{}'.format(parent_dag_name, dag_name)

    if objects_prefix:
        _objects_prefix = objects_prefix
    else:
        _objects_prefix = '{}/{}/{}={{{{{}}}}}'.format(dataset,
                                                       dataset_version,
                                                       date_submission_col,
                                                       ds_type)
    gcs_buckets = {
        'transfer': dataset_gcs_bucket,
        'load': dataset_gcs_bucket,
    }

    gcstj_object_conditions = {
        'includePrefixes': _objects_prefix
    }

    gcstj_transfer_options = {
        'deleteObjectsUniqueInSink': True
    }

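    # Base arguments for the parquet2bigquery container: destination dataset, load concurrency, and source GCS bucket.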
    gke_args = [
        '-d', bigquery_dataset,
        '-c', p2b_concurrency,
        '-b', gcs_buckets['load'],
        ]

    if not p2b_resume:
        gke_args += ['-R']

    if p2b_table_alias:
        gke_args += ['-a', p2b_table_alias]

    if reprocess:
        reprocess_objects_prefix = _objects_prefix.replace('_nodash', '')
        gcs_buckets['transfer'] += '-tmp'
        gke_args += ['-p', reprocess_objects_prefix]

    else:
        gke_args += ['-p', _objects_prefix]

    if cluster_by:
        gke_args += ['--cluster-by'] + cluster_by

    if drop:
        gke_args += ['--drop'] + drop

    if rename:
        gke_args += ['--rename'] + [k + "=" + v for k, v in rename.items()]

    if replace:
        gke_args += ['--replace'] + replace

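    # Destination table name: the explicit alias if given, otherwise the normalized '<dataset>_<dataset_version>'.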
    bq_table_name = p2b_table_alias or normalize_table_id('_'.join([dataset,
                                                                   dataset_version]))

    with models.DAG(_dag_name, default_args=default_args) as dag:
        if dataset_s3_bucket is not None:
            s3_to_gcs = S3ToGoogleCloudStorageTransferOperator(
                task_id='s3_to_gcs',
                s3_bucket=dataset_s3_bucket,
                gcs_bucket=gcs_buckets['transfer'],
                description=_objects_prefix,
                aws_conn_id=aws_conn_id,
                gcp_conn_id=gcp_conn_id,
                project_id=connection.project_id,
                object_conditions=gcstj_object_conditions,
                transfer_options=gcstj_transfer_options,
            )
        else:
            s3_to_gcs = DummyOperator(task_id='no_s3_to_gcs')

        reprocess = SubDagOperator(
            subdag=reprocess_parquet(
                _dag_name,
                default_args,
                reprocess,
                gcp_conn_id,
                gcs_buckets,
                _objects_prefix,
                date_submission_col,
                dataset,
                dataset_version,
                gs_dataset_location=spark_gs_dataset_location),
            task_id='reprocess_parquet')

        remove_bq_table = BigQueryTableDeleteOperator(
            task_id='remove_bq_table',
            bigquery_conn_id=gcp_conn_id,
            deletion_dataset_table='{}.{}${{{{ds_nodash}}}}'.format(bigquery_dataset, bq_table_name), # noqa
            ignore_if_missing=True,
        )

        bulk_load = GKEPodOperator(
            task_id='bigquery_load',
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            location=gke_location,
            cluster_name=gke_cluster_name,
            name=_dag_name.replace('_', '-'),
            namespace=gke_namespace,
            image=docker_image,
            arguments=gke_args,
            )

        s3_to_gcs >> reprocess >> remove_bq_table >> bulk_load

        return dag
Example #6
def build_dag():
    """Build DAG."""
    dag = DAG('btc_to_neo4j',
              schedule_interval='@daily',
              default_args=DEFAULT_ARGS,
              catchup=True)

    # NOTE: It is important to keep the elements of this list in this order since it is required later when loading data
    blockchain_elements = [
        'blocks', 'txns', 'outputs', 'output_addresses', 'inputs'
    ]
    load_dependency = None

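    # For each element: materialize a daily BigQuery table, export it to GCS as CSV, load it into Neo4j, then drop the daily table.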
    for element in blockchain_elements:
        table = 'crypto_bitcoin.{element}'.format(
            element=element) + '_{{ds_nodash}}'
        bigquery_to_daily_table_task = BigQueryOperator(
            task_id='{element}_to_daily_table'.format(element=element),
            sql='bigquery/{element}.sql'.format(element=element),
            destination_dataset_table=table,
            write_disposition='WRITE_TRUNCATE',
            use_legacy_sql=False,
            dag=dag)

        filename = '{element}/{element}-*.csv'.format(element=element)
        destination_pattern = 'gs://{bucket}'.format(bucket=BUCKET) + \
                              '/neo4j_import/{{macros.ds_format(ds, "%Y-%m-%d", "%Y/%m/%d")}}/' + filename

        table_to_bucket_task = BigQueryToCloudStorageOperator(
            task_id='{element}_table_to_bucket'.format(element=element),
            source_project_dataset_table=table,
            destination_cloud_storage_uris=[destination_pattern],
            export_format='csv',
            field_delimiter=',',
            print_header=True,
            dag=dag)

        load_into_neo4j_task = PythonOperator(
            task_id="load_{element}_into_neo4j".format(element=element),
            python_callable=load_into_neo4j,
            provide_context=True,
            op_kwargs={'element': element},
            pool='neo4j_slot',
            dag=dag)

        # NOTE: timestamps in blocks are not strictly incremental and since we query by dates it could happen
        # that we need to backfill some relations.
        # See: https://bitcoin.stackexchange.com/questions/67618/difference-between-time-and-mediantime-in-getblock
        if element == 'blocks':
            backfill_blocks_in_neo4j_task = PythonOperator(
                task_id="backfill_blocks_in_neo4j",
                python_callable=backfill_blocks_in_neo4j,
                provide_context=True,
                pool='neo4j_slot',
                dag=dag)
            load_into_neo4j_task >> backfill_blocks_in_neo4j_task

        delete_aux_table = BigQueryTableDeleteOperator(
            task_id='delete_{element}_table'.format(element=element),
            deletion_dataset_table=table,
            dag=dag)

        bigquery_to_daily_table_task >> table_to_bucket_task >> load_into_neo4j_task
        table_to_bucket_task >> delete_aux_table

        # Make sure that we load data into Neo4j in the right order
        if load_dependency is not None:
            load_dependency >> load_into_neo4j_task

        load_dependency = load_into_neo4j_task

    return dag
Example #7
        project_id=SQL_PROJECT,
        instance='servicedat-cal-mysql',
        body={
            "importContext": {
                "kind": "sql#importContext",
                "fileType": 'CSV',
                "uri": '{}/cloudSQLexport_temp.csv'.format(DIR_TMP),
                "database": DATABASE,
                "csvImportOptions": {
                    "table": TABLE
                }
            }
        },
        api_version='v1beta4',
        gcp_conn_id='cloudsql_pipeline')

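    # The temporary BigQuery table only needs to outlive the CSV export, so its deletion depends on create_tmp_csv rather than on the Cloud SQL import (see the dependencies below).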
    delete_tmp_table = BigQueryTableDeleteOperator(
        task_id='delete_tmp_table',
        deletion_dataset_table='{}.Temporal.cloudSQLexport_tmp'.format(
            BQ_PROJECT),
        bigquery_conn_id=cfg.bigquery_conn_id)

    delete_tmp_csv = BashOperator(
        task_id='delete_tmp_csv',
        bash_command='gsutil rm {}/cloudSQLexport_temp.csv'.format(DIR_TMP))

    # Dependencies between tasks
    create_tmp_table >> create_tmp_csv >> import_to_csql
    create_tmp_csv >> delete_tmp_table
    import_to_csql >> delete_tmp_csv
Example #8
        'connectionProperties':
        '{{ (var.json|attr("config-{}".format(run_id)))["dataflow"]["parameters"]["connectionProperties"] }}',
        'username':
        '******',
        'password':
        '******',
    },
    dag=dag)

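# Merge the staged rows into the target table, then drop the staging table and clean up this run's configuration.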
bq_merge = BigQueryOperator(
    task_id='bq_merge',
    sql=
    '{{(var.json|attr("config-{}".format(run_id)))["bigquery"]["merge_query"]}}',
    use_legacy_sql=False,
    write_disposition='WRITE_APPEND',
    create_disposition='CREATE_IF_NEEDED',
    dag=dag)

bq_delete_staging = BigQueryTableDeleteOperator(
    task_id='bq_delete_staging',
    deletion_dataset_table=
    '{{(var.json|attr("config-{}".format(run_id)))["bigquery"]["staging_table"]}}',
    dag=dag)

delete_config = PythonOperator(task_id='delete_config',
                               provide_context=True,
                               python_callable=cleanup_config,
                               dag=dag)

load_config >> bq_create_staging >> stage_data >> bq_merge >> bq_delete_staging >> delete_config
Example #9
ORDER BY avg_rating DESC
""",
    write_disposition="WRITE_TRUNCATE",
    create_disposition="CREATE_IF_NEEDED",
    bigquery_conn_id="gcp",
    dag=dag,
)

extract_top_ratings = BigQueryToCloudStorageOperator(
    task_id="extract_top_ratings",
    source_project_dataset_table=(os.environ["GCP_PROJECT"] + ":" +
                                  os.environ["BIGQUERY_DATASET"] + "." +
                                  "rating_results_{{ ds_nodash }}"),
    destination_cloud_storage_uris=("gs://" + os.environ["RESULT_BUCKET"] +
                                    "/{{ ds_nodash }}.csv"),
    export_format="CSV",
    bigquery_conn_id="gcp",
    dag=dag,
)

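# The per-run results table is dropped after its contents have been exported to GCS.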
delete_result_table = BigQueryTableDeleteOperator(
    task_id="delete_result_table",
    deletion_dataset_table=(os.environ["GCP_PROJECT"] + ":" +
                            os.environ["BIGQUERY_DATASET"] + "." +
                            "rating_results_{{ ds_nodash }}"),
    bigquery_conn_id="gcp",
    dag=dag,
)

upload_ratings_to_gcs >> import_in_bigquery >> query_top_ratings >> extract_top_ratings >> delete_result_table
Example #10
    create_disposition='CREATE_IF_NEEDED',
    use_legacy_sql=False,
    task_id='analytics_award_golden_globe',
    dag=dag)

analytics_award_saga = BigQueryOperator(
    sql=SqlQueries.analytics_award_saga_insert,
    destination_dataset_table=award_table,
    write_disposition='WRITE_APPEND',
    create_disposition='CREATE_IF_NEEDED',
    use_legacy_sql=False,
    task_id='analytics_award_saga',
    dag=dag)

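# Drop the awards table; ignore_if_missing prevents a failure if the table does not exist.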
drop_awards = BigQueryTableDeleteOperator(deletion_dataset_table=award_table,
                                          ignore_if_missing=True,
                                          task_id='deletion_dataset_table',
                                          dag=dag)

###########################
# Validation tasks
###########################
validate_non_empty_movie = BigQueryCheckOperator(
    dag=dag,
    task_id='validate_non_empty_movie',
    sql=SqlQueries.validate_non_empty_movie,
    use_legacy_sql=False)

validate_non_empty_person = BigQueryCheckOperator(
    dag=dag,
    task_id='validate_non_empty_person',
    sql=SqlQueries.validate_non_empty_person,
Example #11
export_gcs_to_s3 = GoogleCloudStorageToS3Operator(
    dag=dag,
    task_id="cp_gcs_to_s3",
    dest_verify=True,
    google_cloud_storage_conn_id=google_conn_id,
    bucket=gcs_bucket,
    dest_aws_conn_id='local_s3',
    dest_s3_key=redshift_s3_bucket)

load_redshift = S3ToRedshiftTransfer(dag=dag,
                                     task_id="redshift_load",
                                     redshift_conn_id=redshift_conn_id,
                                     s3_file=s3_output_file,
                                     schema='public',
                                     table='ga360_sessions',
                                     iam_role=redshift_iam_role,
                                     copy_options=[
                                         'CSV', 'IGNOREHEADER 1', 'GZIP',
                                         """DATEFORMAT AS 'YYYYMMDD'"""
                                     ])

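# The temporary BigQuery export table can be removed as soon as the GCS extract is done; its deletion runs in parallel with the copy to S3 (see the dependencies below).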
delete_tmp_table = BigQueryTableDeleteOperator(
    dag=dag,
    task_id="delete_tmp_table",
    bigquery_conn_id=google_conn_id,
    deletion_dataset_table=target_table)

prepare_ga360 >> extract_ga360_to_gcs
extract_ga360_to_gcs >> [export_gcs_to_s3, delete_tmp_table]
export_gcs_to_s3 >> load_redshift