Example #1
def dag_preprocess_tables(
    dag_id,
    schedule_interval,
    start_date,
    target_project_id,
    target_dataset_id,
    table_config,
    table_partition,
):
    dag = DAG(dag_id=dag_id,
              schedule_interval=schedule_interval,
              start_date=start_date)

    for table in table_config:

        start_check_tables_task = DummyOperator(
            task_id='%s-%s' % ("start_check_tables_task", table["name"]),
            dag=dag)

        check_if_table_exist = BranchPythonOperator(
            task_id='%s-%s' % (table["name"], "check_if_table_exist"),
            python_callable=if_tbl_exists,
            op_kwargs={
                'dataset': target_dataset_id,
                'project': target_project_id,
                'table_name': table["name"]
            },
            dag=dag)

        table_exists = DummyOperator(task_id='%s-%s' %
                                     (table["name"], "table_exists"),
                                     dag=dag)

        table_does_not_exist = DummyOperator(
            task_id='%s-%s' % (table["name"], "table_does_not_exist"), dag=dag)

        # [start: create the table if it does not exist]
        create_if_not_exists = BigQueryCreateEmptyTableOperator(
            task_id='%s-%s' % (table["name"], "create_if_not_exists"),
            project_id=target_project_id,
            dataset_id=target_dataset_id,
            table_id=table["name"],
            gcs_schema_object=table["schema_gcs_location"],
            time_partitioning=table_partition,
            trigger_rule=TriggerRule.ALL_SUCCESS,
            dag=dag)

        end_check_tables_task = DummyOperator(
            task_id='%s-%s' % ("end_check_tables_task", table["name"]),
            trigger_rule='none_failed_or_skipped',
            dag=dag)

        start_check_tables_task >> check_if_table_exist >> [
            table_does_not_exist, table_exists
        ]
        table_does_not_exist >> create_if_not_exists >> end_check_tables_task
        table_exists >> end_check_tables_task

    return dag
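
The `if_tbl_exists` callable wired into the `BranchPythonOperator` is not shown in this example. A minimal sketch of what it might look like, assuming the Airflow 1.10 contrib import path and that it checks the table through `BigQueryHook.table_exists` and returns the task_id of the branch to follow (the connection id below is an assumption):

from airflow.contrib.hooks.bigquery_hook import BigQueryHook


def if_tbl_exists(dataset, project, table_name, **kwargs):
    # Ask BigQuery whether the table exists and return the task_id of the
    # branch to follow; the names mirror the task-id pattern used above.
    hook = BigQueryHook(bigquery_conn_id='bigquery_default')  # assumed connection id
    if hook.table_exists(project_id=project, dataset_id=dataset, table_id=table_name):
        return '%s-%s' % (table_name, "table_exists")
    return '%s-%s' % (table_name, "table_does_not_exist")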
Example #2
    def test_execute(self, mock_hook):
        operator = BigQueryCreateEmptyTableOperator(task_id=TASK_ID,
                                                    dataset_id=TEST_DATASET,
                                                    project_id=TEST_PROJECT_ID,
                                                    table_id=TEST_TABLE_ID)

        operator.execute(None)
        mock_hook.return_value \
            .get_conn() \
            .cursor() \
            .create_empty_table \
            .assert_called_once_with(
                dataset_id=TEST_DATASET,
                project_id=TEST_PROJECT_ID,
                table_id=TEST_TABLE_ID,
                schema_fields=None,
                time_partitioning={}
            )
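
The constants and the patch decorator that supply `mock_hook` are omitted above; a hedged reconstruction of the surrounding test scaffolding (the constant values and the patched module path are assumptions and differ between Airflow versions):

import unittest
from unittest import mock

TASK_ID = 'test-bq-create-empty-table-operator'
TEST_DATASET = 'test-dataset'
TEST_PROJECT_ID = 'test-project'
TEST_TABLE_ID = 'test-table-id'


class TestBigQueryCreateEmptyTableOperator(unittest.TestCase):

    # Patch BigQueryHook where the operator module imports it, so the
    # operator's execute() talks to the mock instead of the real service.
    @mock.patch('airflow.contrib.operators.bigquery_operator.BigQueryHook')
    def test_execute(self, mock_hook):
        ...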
Example #4
    def __init__(self, project_id, dataset_id, table_id, schema_fields, *args,
                 **kwargs):
        BigQueryDropTableOperator.__init__(self,
                                           project_id=project_id,
                                           dataset_id=dataset_id,
                                           table_id=table_id,
                                           *args,
                                           **kwargs)

        BigQueryCreateEmptyTableOperator.__init__(
            self,
            task_id='drop_create_{}'.format(table_id),
            project_id=project_id,
            dataset_id=dataset_id,
            table_id=table_id,
            schema_fields=schema_fields,
            *args,
            **kwargs)
Example #5
}

dag = DAG(
    dag_id='create_and_insert_table',
    default_args=default_args,
    description='DAG to create empty table, dataset provided',
    schedule_interval=timedelta(days=1),
    catchup=False,
)

create_table_task = BigQueryCreateEmptyTableOperator(
    project_id="dark-furnace-298806",
    dataset_id="shtest",
    task_id="create_empty_table",
    table_id="new_service_table",
    bigquery_conn_id="bigquery_default",
    google_cloud_storage_conn_id="google_cloud_default",
    dag=dag,
)

insert_task = BigQueryOperator(
    task_id="insert_into_table",
    bql='''
    select * from
    `bigquery-public-data.austin_311.311_service_requests`
    limit 5
    ''',
    use_legacy_sql=False,
    destination_dataset_table="dark-furnace-298806:shtest.new_service_table",
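
The `insert_task` call above is cut off at the example boundary; a hedged guess at how it might end (the write disposition and the final dependency are assumptions, not recovered from the original):

    # ... continuation of the truncated BigQueryOperator call above
    write_disposition="WRITE_TRUNCATE",  # assumption: overwrite on each run
    dag=dag,
)

# Assumed wiring: create the empty table before running the insert query.
create_table_task >> insert_task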
Example #6
        task_id="create-dataset", dataset_id=DATASET_NAME)

    create_dataset_with_location = BigQueryCreateEmptyDatasetOperator(
        task_id="create_dataset_with_location",
        dataset_id=LOCATION_DATASET_NAME,
        location=BQ_LOCATION)

    create_table = BigQueryCreateEmptyTableOperator(
        task_id="create-table",
        dataset_id=DATASET_NAME,
        table_id="test_table",
        schema_fields=[
            {
                "name": "emp_name",
                "type": "STRING",
                "mode": "REQUIRED"
            },
            {
                "name": "salary",
                "type": "INTEGER",
                "mode": "NULLABLE"
            },
        ],
    )

    create_table_with_location = BigQueryCreateEmptyTableOperator(
        task_id="create_table_with_location",
        dataset_id=LOCATION_DATASET_NAME,
        table_id="test_table",
        schema_fields=[
            {
Example #7
# Create table
# CreateTable = BigQueryCreateEmptyTableOperator(
#     task_id='BigQueryCreateEmptyTableOperator_task',
#     dataset_id='ODS',
#     table_id='Employees',
#     project_id='internal-gcp-project',
#     gcs_schema_object='gs://schema-bucket/employee_schema.json',
#     bigquery_conn_id='airflow-service-account',
#     google_cloud_storage_conn_id='airflow-service-account'
# )

CreateTable = BigQueryCreateEmptyTableOperator(
    task_id='BigQueryCreateEmptyTableOperator_task',
    dataset_id='Covid',
    table_id='GoogleTrend_test',
    project_id='covidproject-278521',
    gcs_schema_object='gs://testcovidlinh/googletrend_schema.json',
    # schema_fields=googletrend_schema,
    bigquery_conn_id=gcp_conn_id,
    google_cloud_storage_conn_id=gcp_conn_id)

# Loading data from GCS to BigQuery
gcs_to_bigquery = GoogleCloudStorageToBigQueryOperator(
    task_id='GCS_to_BigQuery',
    dag=dag,
    bucket='testcovidlinh',
    source_objects=source_objects,
    # schema_object   = "/tmp/covidStatSchema.json",
    schema_object='googletrend_schema.json',
    # schema_fields   = googletrend_schema,
    source_format='CSV',
# task2: create a new covid_19 partition table.
task2 = BigQueryCreateEmptyTableOperator(task_id='bq_create_new_table',
                                         dag=dag,
                                         dataset_id=BQ_DATASET,
                                         table_id=BQ_TABLE,
                                         project_id=BQ_PROJECT,
                                         bigquery_conn_id=BQ_CONN_ID,
                                         schema_fields=[{
                                             "name": "DateStr",
                                             "type": "DATE",
                                             "mode": "REQUIRED"
                                         }, {
                                             "name": "State",
                                             "type": "STRING",
                                             "mode": "REQUIRED"
                                         }, {
                                             "name": "Count",
                                             "type": "INTEGER",
                                             "mode": "REQUIRED"
                                         }, {
                                             "name": "Status",
                                             "type": "STRING",
                                             "mode": "REQUIRED"
                                         }],
                                         time_partitioning={
                                             "type": "DAY",
                                             "field": "DateStr",
                                             "expiration_ms": "5184000000"
                                         }
                                         # partition expiration 60 days
                                         )
dag = DAG('incremental_ingestion',
          catchup=False,
          default_args=default_args,
          schedule_interval=dt.timedelta(days=1))

load_config = PythonOperator(task_id='load_config',
                             provide_context=True,
                             python_callable=fetch_config,
                             dag=dag)

bq_create_staging = BigQueryCreateEmptyTableOperator(
    task_id='bq_create_staging',
    project_id=
    '{{(var.json|attr("config-{}".format(run_id)))["bigquery"]["project_id"]}}',
    dataset_id=
    '{{(var.json|attr("config-{}".format(run_id)))["bigquery"]["dataset_id"]}}',
    table_id=
    '{{(var.json|attr("config-{}".format(run_id)))["bigquery"]["staging_table_id"]}}',
    gcs_schema_object=
    '{{(var.json|attr("config-{}".format(run_id)))["bigquery"]["schema_file"]}}',
    dag=dag)
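
# The templated fields above read a JSON Airflow Variable named
# "config-<run_id>", which the load_config task is presumably responsible for
# writing. The fetch_config callable is not shown; a minimal sketch (the
# variable layout below is an assumption inferred from the template paths):
def fetch_config(**context):
    from airflow.models import Variable

    config = {
        "bigquery": {
            "project_id": "...",           # placeholder values
            "dataset_id": "...",
            "staging_table_id": "...",
            "schema_file": "...",
        },
        "dataflow": {
            "template": "...",
            "options": {"project": "...", "region": "..."},
        },
    }
    # Stored as JSON so the templates can read it via var.json.
    Variable.set("config-{}".format(context["run_id"]), config, serialize_json=True)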

stage_data = DataflowTemplateOperator(
    task_id='stage_data',
    template=
    '{{ (var.json|attr("config-{}".format(run_id)))["dataflow"]["template"] }}',
    dataflow_default_options={
        'project':
        '{{ (var.json|attr("config-{}".format(run_id)))["dataflow"]["options"]["project"] }}',
        'region':
        '{{ (var.json|attr("config-{}".format(run_id)))["dataflow"]["options"]["region"] }}',
    task_id='create_football_matches_dataset',
    project_id=project_id,
    dataset_id=dataset_id,
    bigquery_conn_id=gcp_conn,
    dag=dag
)

create_games_table = BigQueryCreateEmptyTableOperator(
    task_id="create_games_table",
    project_id=project_id,
    dataset_id=dataset_id,
    bigquery_conn_id=gcp_conn,
    table_id="games",
    schema_fields=[{"name": "date", "type": "TIMESTAMP", "mode": "REQUIRED"},
                   {"name": "team_1", "type": "STRING", "mode": "REQUIRED"},
                   {"name": "team_2", "type": "STRING", "mode": "REQUIRED"},
                   {"name": "team_1_score", "type": "INTEGER", "mode": "REQUIRED"},
                   {"name": "team_2_score", "type": "INTEGER", "mode": "REQUIRED"},
                   {"name": "home_team", "type": "STRING", "mode": "NULLABLE"},
                   {"name": "tournament", "type": "STRING", "mode": "NULLABLE"},
                   {"name": "country", "type": "STRING", "mode": "NULLABLE"},
                   {"name": "city", "type": "STRING", "mode": "NULLABLE"},
                   ],
    dag=dag
)

etl_games_job_code_path = 'gs://' + gcs_football_bucket + '/spark_jobs/etl_games_to_bigquery.py'
submit_etl_games_spark_job = DataProcPySparkOperator(
    task_id='submit_etl_games_spark_job',
    main=etl_games_job_code_path,
    cluster_name=cluster_name,
    job_name='etl_games_to_bigquery',
Example #11
    def execute(self, context):
        BigQueryDropTableOperator.execute(self, context)

        BigQueryCreateEmptyTableOperator.execute(self, context)
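
Examples #4 and #11 appear to belong to the same custom operator: `__init__` chains both parent constructors and `execute` drops the table before recreating it. A sketch of how the full class might be declared (the class name is made up here, and `BigQueryDropTableOperator` is not part of stock Airflow, so it is assumed to be a project-local operator):

class BigQueryDropCreateTableOperator(BigQueryDropTableOperator,
                                      BigQueryCreateEmptyTableOperator):
    """Drop a BigQuery table if it exists, then recreate it empty."""

    def __init__(self, project_id, dataset_id, table_id, schema_fields, *args,
                 **kwargs):
        BigQueryDropTableOperator.__init__(self,
                                           project_id=project_id,
                                           dataset_id=dataset_id,
                                           table_id=table_id,
                                           *args,
                                           **kwargs)
        BigQueryCreateEmptyTableOperator.__init__(
            self,
            task_id='drop_create_{}'.format(table_id),
            project_id=project_id,
            dataset_id=dataset_id,
            table_id=table_id,
            schema_fields=schema_fields,
            *args,
            **kwargs)

    def execute(self, context):
        # Drop first, then create the empty table with the given schema.
        BigQueryDropTableOperator.execute(self, context)
        BigQueryCreateEmptyTableOperator.execute(self, context)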