def test_execute(self, mock_hook):
    ignore_if_missing = True
    deletion_dataset_table = '{}.{}'.format(TEST_DATASET, TEST_TABLE_ID)

    operator = BigQueryTableDeleteOperator(
        task_id=TASK_ID,
        deletion_dataset_table=deletion_dataset_table,
        ignore_if_missing=ignore_if_missing)

    operator.execute(None)
    mock_hook.return_value \
        .get_conn.return_value \
        .cursor.return_value \
        .run_table_delete \
        .assert_called_once_with(
            deletion_dataset_table=deletion_dataset_table,
            ignore_if_missing=ignore_if_missing
        )
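# The test above receives mock_hook from a patch decorator that is not shown in
# the snippet. A minimal sketch of the harness it assumes follows; the class name
# and the patch target are assumptions based on the Airflow 1.10 contrib layout,
# not taken from the snippet itself.
import unittest
from unittest import mock

from airflow.contrib.operators.bigquery_table_delete_operator import BigQueryTableDeleteOperator


class BigQueryTableDeleteOperatorTest(unittest.TestCase):
    # Patch the hook where the operator imports it, so no real BigQuery call is made.
    @mock.patch('airflow.contrib.operators.bigquery_table_delete_operator.BigQueryHook')
    def test_execute(self, mock_hook):
        ...  # test body as shown above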
"--concurrency", "10", "--bucket", gcs_data_bucket, "--no-resume", "--prefix", objects_prefix, "--cluster-by", "crash_date", ] # We remove the current date partition for idempotency. remove_bq_table_partition = BigQueryTableDeleteOperator( task_id="remove_bq_table_partition", bigquery_conn_id=bq_gcp_conn_id, deletion_dataset_table="{}.{}${{{{ds_nodash}}}}".format( bq_dataset, bq_table_name), ignore_if_missing=True, dag=dag, ) bq_load = GKEPodOperator( task_id="bigquery_load", gcp_conn_id=bq_gcp_conn_id, project_id=bq_connection.project_id, name="load-socorro-crash-parquet-to-bq", image=docker_image, arguments=gke_args, env_vars={ "GOOGLE_CLOUD_PROJECT": "{{ var.value.gcp_shared_prod_project }}" }, dag=dag,
    schema_fields=[
        {"name": "emp_name", "type": "STRING", "mode": "REQUIRED"},
        {"name": "salary", "type": "INTEGER", "mode": "NULLABLE"},
    ],
)

delete_table = BigQueryTableDeleteOperator(
    task_id="delete-table",
    deletion_dataset_table="{}.test_table".format(DATASET_NAME))

get_dataset = BigQueryGetDatasetOperator(
    task_id="get-dataset", dataset_id=DATASET_NAME)

get_dataset_result = BashOperator(
    task_id="get-dataset-result",
    bash_command="echo \"{{ task_instance.xcom_pull('get-dataset')['id'] }}\"",
)

patch_dataset = BigQueryPatchDatasetOperator(
    task_id="patch-dataset",
    dataset_id=DATASET_NAME,
    dataset_resource={
def load_to_bigquery(parent_dag_name=None,
                     default_args=None,
                     dataset_s3_bucket=None,
                     aws_conn_id=None,
                     dataset=None,
                     dataset_version=None,
                     gke_cluster_name=None,
                     date_submission_col='submission_date_s3',
                     ds_type='ds_nodash',
                     dag_name='load_to_bigquery',
                     gke_location='us-central1-a',
                     gke_namespace='default',
                     docker_image='docker.io/mozilla/parquet2bigquery:20191017',  # noqa
                     reprocess=False,
                     p2b_concurrency='10',
                     p2b_resume=False,
                     p2b_table_alias=None,
                     objects_prefix=None,
                     spark_gs_dataset_location=None,
                     bigquery_dataset='telemetry',
                     dataset_gcs_bucket='moz-fx-data-derived-datasets-parquet',
                     gcp_conn_id='google_cloud_derived_datasets',
                     cluster_by=(),
                     drop=(),
                     rename={},
                     replace=()):
    """
    Load Parquet data into BigQuery. Used with SubDagOperator.

    We use S3ToGoogleCloudStorageTransferOperator to create a GCS Transfer
    Service job to transfer the AWS S3 parquet data into a GCS Bucket.
    Once that is completed we launch a Kubernetes pod on an existing GKE
    cluster using the GKEPodOperator.

    :param str parent_dag_name: parent dag name
    :param dict default_args: dag configuration
    :param str dataset_s3_bucket: source S3 Bucket
    :param str dataset_gcs_bucket: destination GCS Bucket
    :param str aws_conn_id: airflow connection id for S3 access
    :param str gcp_conn_id: airflow connection id for GCP access
    :param str dataset: dataset name
    :param str dataset_version: dataset version
    :param str date_submission_col: dataset date submission column
    :param str ds_type: dataset format (ds or ds_nodash)
    :param str gke_location: GKE cluster zone
    :param str gke_namespace: GKE cluster namespace
    :param str docker_image: docker image to use for GKE pod operations  # noqa
    :param str bigquery_dataset: bigquery load destination dataset
    :param str p2b_concurrency: number of processes for parquet2bigquery load
    :param str p2b_table_alias: override p2b table name with alias
    :param bool p2b_resume: allow resume support, defaults to False
    :param bool reprocess: enable dataset reprocessing, defaults to False
    :param str objects_prefix: custom objects_prefix to override defaults
    :param str spark_gs_dataset_location: custom spark dataset load location to override defaults
    :param List[str] cluster_by: top level fields to cluster by when creating destination table
    :param List[str] drop: top level fields to exclude from destination table
    :param Dict[str, str] rename: top level fields to rename in destination table
    :param List[str] replace: top level field replacement expressions

    :return: airflow.models.DAG
    """

    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    _dag_name = '{}.{}'.format(parent_dag_name, dag_name)

    if objects_prefix:
        _objects_prefix = objects_prefix
    else:
        _objects_prefix = '{}/{}/{}={{{{{}}}}}'.format(dataset,
                                                       dataset_version,
                                                       date_submission_col,
                                                       ds_type)

    gcs_buckets = {
        'transfer': dataset_gcs_bucket,
        'load': dataset_gcs_bucket,
    }

    gcstj_object_conditions = {
        'includePrefixes': _objects_prefix
    }

    gcstj_transfer_options = {
        'deleteObjectsUniqueInSink': True
    }

    gke_args = [
        '-d', bigquery_dataset,
        '-c', p2b_concurrency,
        '-b', gcs_buckets['load'],
    ]

    if not p2b_resume:
        gke_args += ['-R']

    if p2b_table_alias:
        gke_args += ['-a', p2b_table_alias]

    if reprocess:
        reprocess_objects_prefix = _objects_prefix.replace('_nodash', '')
        gcs_buckets['transfer'] += '-tmp'
        gke_args += ['-p', reprocess_objects_prefix]
    else:
        gke_args += ['-p', _objects_prefix]

    if cluster_by:
        gke_args += ['--cluster-by'] + cluster_by

    if drop:
        gke_args += ['--drop'] + drop

    if rename:
        gke_args += ['--rename'] + [k + "=" + v for k, v in rename.items()]

    if replace:
        gke_args += ['--replace'] + replace

    bq_table_name = p2b_table_alias or normalize_table_id('_'.join([dataset, dataset_version]))

    with models.DAG(_dag_name, default_args=default_args) as dag:
        if dataset_s3_bucket is not None:
            s3_to_gcs = S3ToGoogleCloudStorageTransferOperator(
                task_id='s3_to_gcs',
                s3_bucket=dataset_s3_bucket,
                gcs_bucket=gcs_buckets['transfer'],
                description=_objects_prefix,
                aws_conn_id=aws_conn_id,
                gcp_conn_id=gcp_conn_id,
                project_id=connection.project_id,
                object_conditions=gcstj_object_conditions,
                transfer_options=gcstj_transfer_options,
            )
        else:
            s3_to_gcs = DummyOperator(task_id='no_s3_to_gcs')

        reprocess = SubDagOperator(
            subdag=reprocess_parquet(
                _dag_name,
                default_args,
                reprocess,
                gcp_conn_id,
                gcs_buckets,
                _objects_prefix,
                date_submission_col,
                dataset,
                dataset_version,
                gs_dataset_location=spark_gs_dataset_location),
            task_id='reprocess_parquet')

        remove_bq_table = BigQueryTableDeleteOperator(
            task_id='remove_bq_table',
            bigquery_conn_id=gcp_conn_id,
            deletion_dataset_table='{}.{}${{{{ds_nodash}}}}'.format(bigquery_dataset, bq_table_name),  # noqa
            ignore_if_missing=True,
        )

        bulk_load = GKEPodOperator(
            task_id='bigquery_load',
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            location=gke_location,
            cluster_name=gke_cluster_name,
            name=_dag_name.replace('_', '-'),
            namespace=gke_namespace,
            image=docker_image,
            arguments=gke_args,
        )

        s3_to_gcs >> reprocess >> remove_bq_table >> bulk_load

        return dag
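# The docstring above says this helper is used with SubDagOperator: the returned
# DAG becomes a sub-DAG of a parent DAG. A minimal sketch of that wiring follows;
# the parent DAG object, its id, and all bucket/connection/dataset values here are
# hypothetical placeholders, not values from the original source. Note that
# parent_dag_name must match the parent DAG's dag_id and task_id must match the
# helper's dag_name argument (default 'load_to_bigquery') for the sub-DAG id to
# resolve correctly.
load_subdag = SubDagOperator(
    subdag=load_to_bigquery(
        parent_dag_name='example_parent_dag',   # assumed: equals parent_dag.dag_id
        default_args=default_args,              # assumed: shared default_args dict
        dataset_s3_bucket='example-s3-bucket',  # assumed source bucket
        aws_conn_id='aws_prod_iam_s3',          # assumed connection id
        dataset='main_summary',                 # assumed dataset name
        dataset_version='v4',                   # assumed dataset version
        gke_cluster_name='bq-load-gke-1',       # assumed GKE cluster name
    ),
    task_id='load_to_bigquery',                 # must equal the helper's dag_name
    dag=parent_dag,                             # assumed parent DAG object
)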
def build_dag():
    """Build DAG."""
    dag = DAG('btc_to_neo4j',
              schedule_interval='@daily',
              default_args=DEFAULT_ARGS,
              catchup=True)

    # NOTE: It is important to keep the elements of this list in this order,
    # since it is required later when loading data.
    blockchain_elements = ['blocks', 'txns', 'outputs', 'output_addresses', 'inputs']
    load_dependency = None

    for element in blockchain_elements:
        table = 'crypto_bitcoin.{element}'.format(element=element) + '_{{ds_nodash}}'

        bigquery_to_daily_table_task = BigQueryOperator(
            task_id='{element}_to_daily_table'.format(element=element),
            sql='bigquery/{element}.sql'.format(element=element),
            destination_dataset_table=table,
            write_disposition='WRITE_TRUNCATE',
            use_legacy_sql=False,
            dag=dag)

        filename = '{element}/{element}-*.csv'.format(element=element)
        destination_pattern = 'gs://{bucket}'.format(bucket=BUCKET) + \
            '/neo4j_import/{{macros.ds_format(ds, "%Y-%m-%d", "%Y/%m/%d")}}/' + filename

        table_to_bucket_task = BigQueryToCloudStorageOperator(
            task_id='{element}_table_to_bucket'.format(element=element),
            source_project_dataset_table=table,
            destination_cloud_storage_uris=[destination_pattern],
            export_format='csv',
            field_delimiter=',',
            print_header=True,
            dag=dag)

        load_into_neo4j_task = PythonOperator(
            task_id="load_{element}_into_neo4j".format(element=element),
            python_callable=load_into_neo4j,
            provide_context=True,
            op_kwargs={'element': element},
            pool='neo4j_slot',
            dag=dag)

        # NOTE: timestamps in blocks are not strictly incremental, and since we query
        # by dates it could happen that we need to backfill some relations.
        # See: https://bitcoin.stackexchange.com/questions/67618/difference-between-time-and-mediantime-in-getblock
        if element == 'blocks':
            backfill_blocks_in_neo4j_task = PythonOperator(
                task_id="backfill_blocks_in_neo4j",
                python_callable=backfill_blocks_in_neo4j,
                provide_context=True,
                pool='neo4j_slot',
                dag=dag)
            load_into_neo4j_task >> backfill_blocks_in_neo4j_task

        delete_aux_table = BigQueryTableDeleteOperator(
            task_id='delete_{element}_table'.format(element=element),
            deletion_dataset_table=table,
            dag=dag)

        bigquery_to_daily_table_task >> table_to_bucket_task >> load_into_neo4j_task
        table_to_bucket_task >> delete_aux_table

        # Make sure that we load data into Neo4j in the right order
        if load_dependency is not None:
            load_dependency >> load_into_neo4j_task
        load_dependency = load_into_neo4j_task

    return dag
    project_id=SQL_PROJECT,
    instance='servicedat-cal-mysql',
    body={
        "importContext": {
            "kind": "sql#importContext",
            "fileType": 'CSV',
            "uri": '{}/cloudSQLexport_temp.csv'.format(DIR_TMP),
            "database": DATABASE,
            "csvImportOptions": {
                "table": TABLE
            }
        }
    },
    api_version='v1beta4',
    gcp_conn_id='cloudsql_pipeline')

delete_tmp_table = BigQueryTableDeleteOperator(
    task_id='delete_tmp_table',
    deletion_dataset_table='{}.Temporal.cloudSQLexport_tmp'.format(BQ_PROJECT),
    bigquery_conn_id=cfg.bigquery_conn_id)

delete_tmp_csv = BashOperator(
    task_id='delete_tmp_csv',
    bash_command='gsutil rm {}/cloudSQLexport_temp.csv'.format(DIR_TMP))

# Dependencies between tasks
create_tmp_table >> create_tmp_csv >> import_to_csql
create_tmp_csv >> delete_tmp_table
import_to_csql >> delete_tmp_csv
        'connectionProperties': '{{ (var.json|attr("config-{}".format(run_id)))["dataflow"]["parameters"]["connectionProperties"] }}',
        'username': '******',
        'password': '******',
    },
    dag=dag)

bq_merge = BigQueryOperator(
    task_id='bq_merge',
    sql='{{(var.json|attr("config-{}".format(run_id)))["bigquery"]["merge_query"]}}',
    use_legacy_sql=False,
    write_disposition='WRITE_APPEND',
    create_disposition='CREATE_IF_NEEDED',
    dag=dag)

bq_delete_staging = BigQueryTableDeleteOperator(
    task_id='bq_delete_staging',
    deletion_dataset_table='{{(var.json|attr("config-{}".format(run_id)))["bigquery"]["staging_table"]}}',
    dag=dag)

delete_config = PythonOperator(
    task_id='delete_config',
    provide_context=True,
    python_callable=cleanup_config,
    dag=dag)

load_config >> bq_create_staging >> stage_data >> bq_merge >> bq_delete_staging >> delete_config
        ORDER BY avg_rating DESC
    """,
    write_disposition="WRITE_TRUNCATE",
    create_disposition="CREATE_IF_NEEDED",
    bigquery_conn_id="gcp",
    dag=dag,
)

extract_top_ratings = BigQueryToCloudStorageOperator(
    task_id="extract_top_ratings",
    source_project_dataset_table=(os.environ["GCP_PROJECT"] + ":" +
                                  os.environ["BIGQUERY_DATASET"] + "." +
                                  "rating_results_{{ ds_nodash }}"),
    # destination_cloud_storage_uris expects a list of URIs, so the single
    # destination path is wrapped in a list here.
    destination_cloud_storage_uris=["gs://" + os.environ["RESULT_BUCKET"] +
                                    "/{{ ds_nodash }}.csv"],
    export_format="CSV",
    bigquery_conn_id="gcp",
    dag=dag,
)

delete_result_table = BigQueryTableDeleteOperator(
    task_id="delete_result_table",
    deletion_dataset_table=(os.environ["GCP_PROJECT"] + ":" +
                            os.environ["BIGQUERY_DATASET"] + "." +
                            "rating_results_{{ ds_nodash }}"),
    bigquery_conn_id="gcp",
    dag=dag,
)

upload_ratings_to_gcs >> import_in_bigquery >> query_top_ratings >> extract_top_ratings >> delete_result_table
    create_disposition='CREATE_IF_NEEDED',
    use_legacy_sql=False,
    task_id='analytics_award_golden_globe',
    dag=dag)

analytics_award_saga = BigQueryOperator(
    sql=SqlQueries.analytics_award_saga_insert,
    destination_dataset_table=award_table,
    write_disposition='WRITE_APPEND',
    create_disposition='CREATE_IF_NEEDED',
    use_legacy_sql=False,
    task_id='analytics_award_saga',
    dag=dag)

drop_awards = BigQueryTableDeleteOperator(
    deletion_dataset_table=award_table,
    ignore_if_missing=True,
    task_id='deletion_dataset_table',
    dag=dag)

###########################
# Validation tasks
###########################

validate_non_empty_movie = BigQueryCheckOperator(
    dag=dag,
    task_id='validate_non_empty_movie',
    sql=SqlQueries.validate_non_empty_movie,
    use_legacy_sql=False)

validate_non_empty_person = BigQueryCheckOperator(
    dag=dag,
    task_id='validate_non_empty_person',
    sql=SqlQueries.validate_non_empty_person,
export_gcs_to_s3 = GoogleCloudStorageToS3Operator(
    dag=dag,
    task_id="cp_gcs_to_s3",
    dest_verify=True,
    google_cloud_storage_conn_id=google_conn_id,
    bucket=gcs_bucket,
    dest_aws_conn_id='local_s3',
    dest_s3_key=redshift_s3_bucket)

load_redshift = S3ToRedshiftTransfer(
    dag=dag,
    task_id="redshift_load",
    redshift_conn_id=redshift_conn_id,
    s3_file=s3_output_file,
    schema='public',
    table='ga360_sessions',
    iam_role=redshift_iam_role,
    copy_options=[
        'CSV',
        'IGNOREHEADER 1',
        'GZIP',
        """DATEFORMAT AS 'YYYYMMDD'""",
    ])

delete_tmp_table = BigQueryTableDeleteOperator(
    dag=dag,
    task_id="delete_tmp_table",
    bigquery_conn_id=google_conn_id,
    deletion_dataset_table=target_table)

prepare_ga360 >> extract_ga360_to_gcs
extract_ga360_to_gcs >> [export_gcs_to_s3, delete_tmp_table]
export_gcs_to_s3 >> load_redshift