Example #1
def bqimport(ts, dag, tablename, schedule, **kwargs):
    pg_conn = config[schedule][tablename][1]
    table_schema = tablename.split('.')[1]
    table_name = tablename.split('.')[2]
    export_datetime = ts
    sync_interval = schedule
    bqload = GoogleCloudStorageToBigQueryOperator(
        task_id='func_bqload_{}'.format(table_name),
        bucket='prod-data-sync-bucket',
        destination_project_dataset_table=table_schema + '.tbl_' + table_name,
        create_disposition='CREATE_IF_NEEDED',
        source_format='csv',
        field_delimiter='|',
        autodetect=False,
        schema_object='json_schema/' + table_schema + '_' + table_name +
        '.json',
        source_objects=[
            'mergeload/data/' + sync_interval + '/' + ts + '/' + table_schema +
            '_' + table_name + '_' + pg_conn + '/*.csv'
        ],
        quote_character='"',
        allow_quoted_newlines=True,
        allow_jagged_rows=True,
        write_disposition='WRITE_APPEND',
        bigquery_conn_id='bigquery_default',
        google_cloud_storage_conn_id='google_cloud_storage_default',
        skip_leading_rows=1,
        dag=dag)
    bqload.execute(None)
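
The bqimport function above indexes into a config mapping that the snippet does not show. A purely illustrative sketch of the shape it implies (schedule key, then fully qualified table name, with a Postgres connection id at position 1); all names here are invented:

config = {
    'hourly': {
        # index 1 of the tuple is the Postgres connection id read by bqimport();
        # everything else in this mapping is an assumption for illustration.
        'myproject.mydataset.mytable': ('some_other_setting', 'pg_conn_id_1'),
    },
}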
Example #2
def build_gcs_to_bq_task(dag, data_type):
    '''
    Creates a task to load a file from Google Cloud Storage into BigQuery. 
    The name of the file being loaded is retrieved through Airflow's Xcom.
    Data types should be: 'accounts', 'ledgers', 'offers', 'operations', 'trades', 'transactions', or 'trustlines'.
    Parameters:
        dag - parent dag that the task will be attached to 
        data_type - type of the data being uploaded; should be string
    Returns:
        the newly created task
    '''

    bucket_name = Variable.get('gcs_exported_data_bucket_name')
    project_name = Variable.get('bq_project')
    dataset_name = Variable.get('bq_dataset')
    table_ids = Variable.get('table_ids', deserialize_json=True)
    prev_task_id = f'load_{data_type}_to_gcs'
    schema_fields = read_local_schema(data_type)
    return GoogleCloudStorageToBigQueryOperator(
        task_id=f'send_{data_type}_to_bq',
        google_cloud_storage_conn_id='google_cloud_platform_connection',
        bigquery_conn_id='google_cloud_platform_connection',
        bucket=bucket_name,
        schema_fields=schema_fields,
        autodetect=False,
        source_format='NEWLINE_DELIMITED_JSON',
        source_objects=[
            "{{ task_instance.xcom_pull(task_ids='" + prev_task_id + "') }}"
        ],
        destination_project_dataset_table=
        f'{project_name}.{dataset_name}.{table_ids[data_type]}',
        write_disposition='WRITE_APPEND',
        create_disposition='CREATE_IF_NEEDED',
        dag=dag,
    )
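
The example relies on a read_local_schema helper that is not shown. A minimal sketch of what it might look like, assuming the schemas are stored as JSON files next to the DAG file (the schemas/ directory and file naming are assumptions):

import json
import os

def read_local_schema(data_type):
    # Load the BigQuery schema fields for a data type from a local JSON file.
    # The schemas/ directory and <data_type>_schema.json naming are assumed here.
    schema_path = os.path.join(os.path.dirname(__file__), 'schemas', f'{data_type}_schema.json')
    with open(schema_path) as schema_file:
        return json.load(schema_file)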
Example #3
def gc_tasks(name, schema, next_task=DummyOperator(task_id="Done")):
    bq_staging = f"{{{{ var.value.gc_project_id }}}}.{{{{ var.value.bq_dataset_source }}}}.{name}"
    bq_warehouse = f"{{{{ var.value.gc_project_id }}}}.{{{{ var.value.bq_dataset_target }}}}.{name}"

    t1 = GoogleCloudStorageToBigQueryOperator(
        task_id=f"staging_{name}",
        bucket="{{var.value.gcs_bucket}}",
        source_objects=[f"{name}*"],
        destination_project_dataset_table=bq_staging,
        write_disposition="WRITE_TRUNCATE",
        schema_fields=schema,
        skip_leading_rows=1,
    )

    t2 = BigQueryOperator(
        task_id=f"merge_{name}_into_warehouse",
        sql=_create_merge_sql(bq_staging, bq_warehouse, schema),
        use_legacy_sql=False,
    )

    t3 = GoogleCloudStorageToGoogleCloudStorageOperator(
        task_id=f"move_{name}_to_processed",
        source_bucket="{{var.value.gcs_bucket}}",
        source_object=f"{name}*",
        destination_bucket="{{var.value.gcs_bucket}}",
        destination_object=f"processed/{name}",
        move_object=True,
    )

    t1 >> t2 >> t3 >> next_task

    return t1
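
The _create_merge_sql helper is referenced above but not shown. A hypothetical sketch of such a builder, assuming the first field in the schema list is the merge key (the original does not say how the key is chosen):

def _create_merge_sql(staging_table, warehouse_table, schema):
    # Build a BigQuery MERGE from the staging table into the warehouse table.
    # Assumption: the first schema field is the unique merge key.
    columns = [field["name"] for field in schema]
    key = columns[0]
    update_set = ", ".join(f"T.{c} = S.{c}" for c in columns)
    insert_cols = ", ".join(columns)
    insert_vals = ", ".join(f"S.{c}" for c in columns)
    return f"""
        MERGE `{warehouse_table}` T
        USING `{staging_table}` S
        ON T.{key} = S.{key}
        WHEN MATCHED THEN UPDATE SET {update_set}
        WHEN NOT MATCHED THEN INSERT ({insert_cols}) VALUES ({insert_vals})
    """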
Example #4
def move_from_gcs_to_bq(**context):
    gcs_hook = GoogleCloudStorageHook()
    file_list = get_bucket_file_names(gcs_hook)
    destination_table = '{}.raw.raw'.format(env.BQ_PROJECT)

    for file in file_list:
        gcs_to_bq_operator = GoogleCloudStorageToBigQueryOperator(
            task_id='temp',
            bigquery_conn_id='google_cloud_default',
            bucket=env.GCS_BUCKET,
            source_objects=[file],
            destination_project_dataset_table=destination_table,
            write_disposition='WRITE_APPEND',
            schema_fields=raw_schema,
            skip_leading_rows=1)
        gcs_to_bq_operator.execute(context)
        gcs_hook.delete(bucket=env.GCS_BUCKET, object=file)
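
get_bucket_file_names is assumed above but not defined. A minimal sketch under the assumption that it lists objects in the landing bucket via GoogleCloudStorageHook.list and keeps only CSV files (the csv-only filter is a guess):

def get_bucket_file_names(gcs_hook):
    # List object names in the landing bucket; keep only CSV files (assumption).
    objects = gcs_hook.list(bucket=env.GCS_BUCKET) or []
    return [name for name in objects if name.endswith('.csv')]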
Example #5
    def test_execute_explicit_project(self, bq_hook):
        operator = GoogleCloudStorageToBigQueryOperator(
            task_id=TASK_ID,
            bucket=TEST_BUCKET,
            source_objects=TEST_SOURCE_OBJECTS,
            destination_project_dataset_table=TEST_EXPLICIT_DEST,
            max_id_key=MAX_ID_KEY)

        # using non-legacy SQL
        bq_hook.return_value.get_conn.return_value.cursor.return_value.use_legacy_sql = False

        operator.execute(None)

        bq_hook.return_value \
            .get_conn.return_value \
            .cursor.return_value \
            .execute \
            .assert_called_once_with("SELECT MAX(id) FROM `test-project.dataset.table`")
Example #6
def gen_import_table_task(table_config):
    import_task = GoogleCloudStorageToBigQueryOperator(
        task_id='{}_to_bigquery'.format(table_config.params['export_table']),
        bucket=table_config.params['export_bucket'],
        source_objects=["cloudsql_to_bigquery/{}/{}*".format(table_config.params['export_database'],
                                                             table_config.params['export_table'])],
        destination_project_dataset_table="{}.{}.{}".format(table_config.params['gcp_project'],
                                                            table_config.params['stage_dataset'],
                                                            table_config.params['stage_table']),
        schema_object="cloudsql_to_bigquery/schema/{}/schema_raw".format(table_config.params['export_table']),
        write_disposition='WRITE_TRUNCATE',
        source_format="NEWLINE_DELIMITED_JSON",
        dag=dag)

    import_task.doc_md = """\
        #### Import table from storage to bigquery
        task documentation    
        """
    return import_task
Example #7
def createTaskHelper(table):
    return GoogleCloudStorageToBigQueryOperator(
        task_id='create_staging_{0}'.format(table),
        field_delimiter='|',
        schema_object='schema/{0}.json'.format(table),
        source_objects=['{0}/{1}.dat'.format('dimension', table.lower())],
        bucket='da304-staging',
        destination_project_dataset_table="{0}.{1}".format(BQ_STAGING_DATASET_NAME, table),
        external_table=True,
        dag=dag_daily)
Example #8
def createTaskHelper(table):
    return GoogleCloudStorageToBigQueryOperator(
        task_id='create_staging_{0}'.format(table),
        skip_leading_rows=1,
        field_delimiter='|',
        schema_object='schema/{0}.json'.format(table),
        source_objects=['{0}/{1}/{2}.csv'.format('facts', job_run_date, table)],
        bucket='da304-staging',
        destination_project_dataset_table="{0}.{1}".format(BQ_STAGING_DATASET_NAME, table),
        external_table=True,
        dag=dag_daily)
Example #9
def Import_From_File(proj, dset, tab, bucket, path, infile):

    return GoogleCloudStorageToBigQueryOperator(
        task_id='import_to_' + tab,
        bucket=bucket,
        source_objects=[path + '/' + infile],
        field_delimiter=',',
        skip_leading_rows=1,
        allow_quoted_newlines=True,
        destination_project_dataset_table=proj + '.' + dset + '.' + tab,
        create_disposition='CREATE_NEVER',
        write_disposition='WRITE_TRUNCATE',
        schema_object='schema/' + dset + '/' + tab + '.json',
        autodetect=False,
        trigger_rule='none_failed')
Example #10
def construct_gcs_to_bq_operator(
    task_id: str, source_objects: List[str], schema_fields: List[Dict],
    destination_project_dataset_table: str
) -> GoogleCloudStorageToBigQueryOperator:
    return GoogleCloudStorageToBigQueryOperator(
        task_id=task_id,
        bucket=GCS_BUCKET,
        source_objects=source_objects,
        schema_fields=schema_fields,
        field_delimiter=get_delimiter(source_objects),
        destination_project_dataset_table=destination_project_dataset_table,
        write_disposition='WRITE_TRUNCATE',
        autodetect=False,
        bigquery_conn_id=BIG_QUERY_CONN_ID,
        google_cloud_storage_conn_id=GOOGLE_CLOUD_DEFAULT,
        ignore_unknown_values=False)
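
The get_delimiter helper used above is not included. A hypothetical version that infers the field delimiter from the first source object's extension (tab for .tsv, comma otherwise), reusing the typing.List import already present in the snippet:

def get_delimiter(source_objects: List[str]) -> str:
    # Assumption: the delimiter is inferred from the file extension.
    return '\t' if source_objects and source_objects[0].endswith('.tsv') else ','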
Example #11
    "type": "STRING",
    "name": "malepopulation",
    "mode": "NULLABLE",
}, {
    "type": "STRING",
    "name": "totalpopulation",
    "mode": "NULLABLE",
}, {
    "type": "String",
    "name": "year",
    "mode": "NULLABLE",
}]

from_gcs_to_bigQuery = GoogleCloudStorageToBigQueryOperator(
    task_id='import_data',
    # bucket takes only the bucket name; the data/ prefix belongs in source_objects
    bucket='us-central1-airflows-0e76080f-bucket',
    source_objects=['data/*.csv'],
    destination_project_dataset_table='yueairflow.Jason_Yue.population',
    source_format='CSV',
    field_delimiter=',',
    max_bad_records=10,
    skip_leading_rows=1,
    schema_fields=schema,
    create_disposition='CREATE_IF_NEEDED',
    write_disposition='WRITE_TRUNCATE',
    dag=dag,
    bigquery_conn_id='google_cloud_default',
    trigger_rule='all_success')

read_json_2013 >> write_csv_gcs >> from_gcs_to_bigQuery
Example #12
        bash_command=
        "bash /home/airflow/gcs/plugins/sqoop-incremental-imports.sh ephemeral-spark-cluster-{{ds_nodash}}",
        dag=dag)

    hdfs_to_gcs = BashOperator(
        task_id="hdfs_to_gcs",
        bash_command=
        "gcloud compute ssh ephemeral-spark-cluster-{{ds_nodash}}-m --zone='asia-southeast2-a' -- -T 'hadoop distcp /incremental_buckets/*.avro gs://bigdata-etl-2_flights/sqoop_output/'",
        dag=dag)

    bq_load_flight_delays = GoogleCloudStorageToBigQueryOperator(
        task_id="bq_load_flight_delays",
        bucket="bigdata-etl-2_flights",
        source_objects=["sqoop_output/part.20190515_*.avro"],
        destination_project_dataset_table=PROJECT_ID +
        ".data_flights.flights_delays",
        autodetect=True,
        source_format="AVRO",
        create_disposition="CREATE_IF_NEEDED",
        skip_leading_rows=0,
        write_disposition="WRITE_APPEND",
        max_bad_records=0)

    # delete_cluster = DataprocClusterDeleteOperator(
    #     task_id='delete_dataproc_cluster',
    #     cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
    #     region='asia-east1',
    #     trigger_rule=TriggerRule.ALL_DONE
    # )

    create_cluster.dag = dag
Example #13
tasks = Variable.get('moat_tasks', deserialize_json=True)
logging.info("Loaded Tasks")

for tile, campaigns in tasks.items():
    tile = int(tile)  ##TODO: tile type on variable load
    start_task = DummyOperator(task_id='Start_{}'.format(tile),
                               retries=0,
                               dag=dag)

    store_task = GoogleCloudStorageToBigQueryOperator(
        task_id="Store_{}".format(tile),
        bucket='rtf_staging',
        source_objects=['{}/*'.format(tile)],
        destination_project_dataset_table=
        'essence-analytics-dwh:RTF_DWH_Moat.{}_'.format(tile) +
        '{{ ds_nodash }}',
        schema_fields=moat_schemas.get(tile),
        source_format='NEWLINE_DELIMITED_JSON',
        create_disposition='CREATE_IF_NEEDED',
        write_disposition='WRITE_TRUNCATE',
        autodetect=False,
        dag=dag)

    for campaign in campaigns:
        level_filter, dimensions = campaign  #unpack tuple

        filter_value = [*level_filter.values()][0]

        request_task = Moat_To_GCS(
            task_id="Req_{}_{}".format(tile, filter_value),
            brand_id=tile,
Example #14
            'additional-cloud-sql-instances': INSTANCE_NAME
        },
        image_version='1.5')

    submit_sqoop = BashOperator(
        task_id="sqoop_full_table_import",
        bash_command=
        'bash /home/airflow/gcs/plugins/sqoop_simple_table_imports_for_airflow.sh ephemeral-spark-cluster-{{ds_nodash}}'
    )

    bq_load_flight_delays = GoogleCloudStorageToBigQueryOperator(
        task_id="bq_load_flight_delays",
        bucket="spark-etl-1",
        source_objects=["sqoop-output/flights/*.avro"],
        destination_project_dataset_table=
        "bigdata-etl-20201027.data_analysis.flight_delays",
        autodetect=True,
        source_format="AVRO",
        create_disposition="CREATE_IF_NEEDED",
        skip_leading_rows=0,
        write_disposition="WRITE_APPEND",
        max_bad_records=0)

    delete_cluster = DataprocClusterDeleteOperator(
        task_id="delete_dataproc_cluster",
        cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
        region="us-central1",
        trigger_rule=TriggerRule.ALL_DONE)

    create_cluster.dag = dag
    create_cluster.set_downstream(submit_sqoop)
    submit_sqoop.set_downstream(bq_load_flight_delays)
Example #15
    def __new__(
        cls,
        parent_id,
        gcs_dirs_xcom,
        dst_dir,
        filename,
        schema_fields,
        table_name,
        task_id,
        dag,
    ):
        from airflow.utils.dates import days_ago

        args = {
            "start_date": days_ago(2),
        }

        bucket = get_bucket().replace("gs://", "", 1)
        full_table_name = format_table_name(table_name, is_staging=True)

        subdag = DAG(dag_id=f"{parent_id}.{task_id}", default_args=args)

        column_names = [schema["name"] for schema in schema_fields]

        # by convention, preface task names with dag_id
        op_col_select = PythonTaskflowOperator(
            task_id="select_cols",
            python_callable=_keep_columns,
            # note that this input should have form schedule/{execution_date}/...
            taskflow={
                "gcs_dirs": {
                    "dag_id": parent_id,
                    "task_ids": gcs_dirs_xcom
                }
            },
            op_kwargs={
                "dst_dir": dst_dir,
                "filename": filename,
                "required_cols": [],
                "optional_cols": column_names,
            },
            dag=subdag,
        )

        op_stage_bq = GoogleCloudStorageToBigQueryOperator(
            task_id="stage_bigquery",
            bucket=bucket,
            # note that we can't really pull a list out of xcom without subclassing
            # operators, so we rely on knowing that the task passing in
            # gcs_dirs_xcom data is using schedule/{execution_date}
            source_objects=[
                "schedule/{{execution_date}}/*/%s/%s" % (dst_dir, filename)
            ],
            schema_fields=schema_fields,
            destination_project_dataset_table=full_table_name,
            create_disposition="CREATE_IF_NEEDED",
            write_disposition="WRITE_TRUNCATE",
            # _keep_columns function includes headers in output
            skip_leading_rows=1,
            dag=subdag,
        )

        op_col_select >> op_stage_bq

        return SubDagOperator(subdag=subdag, dag=dag, task_id=task_id)
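
The snippet above calls helpers such as get_bucket and format_table_name that are not shown. A hypothetical sketch of format_table_name, assuming the project and dataset names live in Airflow Variables (the Variable names here are invented):

from airflow.models import Variable

def format_table_name(table_name, is_staging=False):
    # Hypothetical helper: the dataset choice and Variable names are assumptions.
    dataset = Variable.get("staging_dataset" if is_staging else "warehouse_dataset")
    project = Variable.get("gcp_project")
    return "{}.{}.{}".format(project, dataset, table_name)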
Example #16
        arguments=[
            "--gcs_path_raw={{ dag_run.conf['raw_path'] }}",
            "--gcs_path_transformed=gs://{{ var.value.gcs_bucket}}" +
            "/{{ dag_run.conf['transformed_path'] }}"
        ])

    # Load the transformed files to a BigQuery table.
    bq_load = GoogleCloudStorageToBigQueryOperator(
        task_id='GCS_to_BigQuery',
        bucket='{{ var.value.gcs_bucket }}',
        # Wildcard for objects created by spark job to be written to BigQuery
        # Reads the relative path to the objects transformed by the spark job
        # from the POST message.
        source_objects=["{{ dag_run.conf['transformed_path'] }}/part-*"],
        destination_project_dataset_table='{{ var.value.bq_output_table }}',
        schema_fields=None,
        # Relative gcs path to schema file.
        schema_object='schemas/nyc-tlc-yellow.json',
        # Note that our spark job does json -> csv conversion.
        source_format='CSV',
        create_disposition='CREATE_IF_NEEDED',
        skip_leading_rows=0,
        write_disposition='WRITE_TRUNCATE',  # If the table exists, overwrite it
        max_bad_records=0)

    # Delete the Cloud Dataproc cluster.
    delete_cluster = DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        # Obviously needs to match the name of cluster created in the prior two
        # Operators.
        cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
Example #17
         catchup=False,
         default_args=default_arguments,
         user_defined_macros={'project': PROJECT_ID}) as dag:

    list_files = PythonOperator(
        task_id='list_files',
        python_callable=list_objects,
        op_kwargs={'bucket': LANDING_BUCKET},
    )

    load_data = GoogleCloudStorageToBigQueryOperator(
        task_id='load_data',
        bucket=LANDING_BUCKET,
        source_objects=['*'],
        source_format='CSV',
        skip_leading_rows=1,
        destination_project_dataset_table='{{ project }}.analytics.history',
        create_disposition='CREATE_IF_NEEDED',
        write_disposition='WRITE_APPEND',
        bigquery_conn_id='google_cloud_default',
        google_cloud_storage_conn_id='google_cloud_default')
    query = '''
    SELECT * except(rank)
    FROM (
        SELECT
            *,
            ROW_NUMBER() OVER (
                PARTITION BY vehicle_id ORDER BY DATETIME(date, TIME(hour, minute, 0))
            ) as rank
        FROM {{ project }}.analytics.history) as latest
    WHERE rank = 1;
Example #18
[t4_US,t4_CA,...] >> t5_gamma >> t6_dummy


'''

try : 
	t5_prime_tableCheck=BigQueryCheckOperator(task_id='checkForTable',
		sql="SELECT COUNT(*) FROM `{}.{}.{}`".format(project_id,dataset_id,table_id),
		bigquery_conn_id=bigquery_conn_id,
		use_legacy_sql=False,
		dag=dag)
	t5_prime_tableCheck.set_upstream(writeToGCS_task)
	storageToBQ_task = GoogleCloudStorageToBigQueryOperator(task_id='uploadtoBQ_{}'.format(datetime.now().strftime('%Y%m%d_%H%M')),
	bucket=bucket_name,
	source_objects=tempGCS_dir_paths,
	destination_project_dataset_table="{}.{}.{}".format(project_id,dataset_id,table_id),
	schema_fields=schema,
	write_disposition='WRITE_TRUNCATE',
	dag=dag)
	storageToBQ_task.set_upstream(t5_prime_tableCheck)
	dummy_task.set_upstream(storageToBQ_task)
except Exception as e : 
	print("BigQueryCheck error = {}".format(e))
	t5_gamma_tableCreate = BigQueryCreateExternalTableOperator(task_id='CreateBQtable',
		bucket=bucket_name,
		source_objects=tempGCS_dir_paths,
		destination_project_dataset_table="{}.{}.{}".format(project_id,dataset_id,table_id),
		schema_fields=schema,
		dag=dag)
	dummy_task.set_upstream(t5_gamma_tableCreate)
Example #19
)

comp_aggregate = DataProcPySparkOperator(
    task_id='compute_aggregates',
    main=
    'gs://europe-west1-training-airfl-159310f1-bucket/other/build_statistics_simple.py',
    cluster_name='analyse-pricing-{{ ds }}',
    arguments=["{{ ds }}"],
    dag=dag,
)

del_cluster = DataprocClusterDeleteOperator(
    task_id="delete_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id='airflowbolcom-20165e4959a78c1d',
    trigger_rule=TriggerRule.ALL_DONE,
    dag=dag,
)

TABLE_THANG = 'airflowbolcom-20165e4959a78c1d:yolo.piet${{ ds_nodash }}'
into_BQ = GoogleCloudStorageToBigQueryOperator(
    task_id="stuff_to_BQ",
    bucket='airflow_training_bucket',
    source_objects=['average_prices/transfer_date={{ ds }}/*.parquet'],
    destination_project_dataset_table=TABLE_THANG,
    source_format='PARQUET',
    write_disposition='WRITE_TRUNCATE',
    dag=dag)

get_data >> create_cluster >> comp_aggregate >> del_cluster >> into_BQ
Example #20
# =======================
# == load to bigquery ===
# =======================

bq_load_user = GoogleCloudStorageToBigQueryOperator(
    task_id='bq_load_user',
    bucket='exampleBucket',
    source_objects=["obj_folder/*"],
    source_format='NEWLINE_DELIMITED_JSON',
    destination_project_dataset_table='exampleSchema.exampleTable0',
    schema_fields=[
        {
            'name': 'user_id',
            'type': 'STRING',
            'mode': 'NULLABLE'
        },
        {
            'name': 'first_timestamp',
            'type': 'STRING',
            'mode': 'NULLABLE'
        },
    ],
    create_disposition='CREATE_IF_NEEDED',
    write_disposition='WRITE_TRUNCATE',
    dag=dag)

bq_load_agg = GoogleCloudStorageToBigQueryOperator(
    task_id='bq_load_agg',
    bucket='exampleBucket',
    source_objects=["obj_folder/*"],
    )

Example #21
    submit_pyspark = DataProcPySparkOperator(
        task_id = "run_pyspark_etl",
        main = PYSPARK_JOB,
        cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
        region="asia-southeast2"
    )

    bq_load_series_1 = GoogleCloudStorageToBigQueryOperator(

        task_id = "bq_load_series_1",
        bucket='bigdata-etl-2_flights',
        source_objects=["series_data_output/"+current_date+"_datamart_1/*.json"],
        destination_project_dataset_table= f'{PROJECT_ID}:qoala_test.series_datamart_1',
        autodetect = True,
        source_format="NEWLINE_DELIMITED_JSON",
        create_disposition="CREATE_IF_NEEDED",
        skip_leading_rows=0,
        write_disposition="WRITE_APPEND",
        max_bad_records=0,
    )

    bq_load_series_2 = GoogleCloudStorageToBigQueryOperator(

        task_id = "bq_load_series_2",
        bucket='bigdata-etl-2_flights',
        source_objects=["series_data_output/"+current_date+"_datamart_2/*.json"],
        destination_project_dataset_table= f'{PROJECT_ID}:qoala_test.series_datamart_2',
        autodetect = True,
        source_format="NEWLINE_DELIMITED_JSON",
Example #22
synthGCStoBQSync = GoogleCloudStorageToBigQueryOperator(
    task_id='gcs_to_bq',
    bucket='synth-info',
    source_objects=['synthinfo.csv'],
    schema_fields=[{
        "mode": "NULLABLE",
        "name": "address",
        "type": "STRING"
    }, {
        "mode": "NULLABLE",
        "name": "collateralToken",
        "type": "STRING"
    }, {
        "mode": "NULLABLE",
        "name": "syntheticToken",
        "type": "STRING"
    }, {
        "mode": "NULLABLE",
        "name": "totalPositionCollateral",
        "type": "FLOAT"
    }, {
        "mode": "NULLABLE",
        "name": "totalTokensOutstanding",
        "type": "FLOAT"
    }, {
        "mode": "NULLABLE",
        "name": "timestamp",
        "type": "TIMESTAMP"
    }, {
        "mode": "NULLABLE",
        "name": "collateralTokenName",
        "type": "STRING"
    }, {
        "mode": "NULLABLE",
        "name": "syntheticTokenName",
        "type": "STRING"
    }],
    destination_project_dataset_table='uma.SynthInfo',
    skip_leading_rows=1,
    write_disposition="WRITE_APPEND",
    dag=dag)
Example #23
        '/firearms_dataflow.py'),
    dag=dag)

beam_cleanup = BashOperator(task_id='firearms_beam_cleanup',
                            bash_command=airflow_utils.beam_cleanup_statement(
                                '{}_firearm_seizures'.format(
                                    os.environ['GCS_PREFIX'])),
                            dag=dag)

bq_insert_temp = GoogleCloudStorageToBigQueryOperator(
    task_id='firearms_bq_insert',
    destination_project_dataset_table='{}:firearm_seizures.seizures_temp'.
    format(os.environ['GCP_PROJECT']),
    bucket='{}_firearm_seizures'.format(os.environ['GCS_PREFIX']),
    source_objects=[
        "avro_output/{}/{}/{}/*.avro".format(dt.strftime('%Y'),
                                             dt.strftime('%m').lower(),
                                             dt.strftime("%Y-%m-%d"))
    ],
    write_disposition='WRITE_APPEND',
    source_format='AVRO',
    time_partitioning={'type': 'DAY'},
    dag=dag)

bq_geo_join = BigQueryOperator(
    task_id='firearms_bq_geojoin',
    sql=build_revgeo_query('firearm_seizures', 'seizures_temp'),
    use_legacy_sql=False,
    destination_dataset_table='{}:firearm_seizures.seizures'.format(
        os.environ['GCP_PROJECT']),
    write_disposition='WRITE_APPEND',
    time_partitioning={'type': 'DAY'},
Example #24
task_1 = GoogleCloudStorageToBigQueryOperator(
    task_id='load_price_quote_to_bigquery',
    bucket='bv-data',
    source_objects=['data/price_quote.csv'],
    destination_project_dataset_table='{}.{}.price_quotes'.format(project,dsIngestion),
    write_disposition='WRITE_TRUNCATE',
    skip_leading_rows=1,
    autodetect=False,
    schema_fields=[
    {
        "name": "tube_assembly_id",
        "type": "STRING",
        "mode": "NULLABLE"
    },
    {
        "name": "supplier",
        "type": "STRING",
        "mode": "NULLABLE"
    },
    {
        "name": "quote_date",
        "type": "DATE",
        "mode": "NULLABLE"
    },
    {
        "name": "annual_usage",
        "type": "INTEGER",
        "mode": "NULLABLE"
    },
    {
        "name": "min_order_quantity",
        "type": "INTEGER",
        "mode": "NULLABLE"
    },
    {
        "name": "bracket_pricing",
        "type": "BOOLEAN",
        "mode": "NULLABLE"
    },
    {
        "name": "quantity",
        "type": "INTEGER",
        "mode": "NULLABLE"
    },
    {
        "name": "cost",
        "type": "FLOAT",
        "mode": "NULLABLE"
    }],
    dag=dag)
Example #25

dag = DAG(dag_nombre, default_args=default_dag_args, schedule_interval=schedule )

start = DummyOperator(task_id='start', dag=dag)



# Read the file names (in this ad hoc case there are 3), iterate over them, and load each one into BigQuery
for i, table_name in enumerate(params_bq_schema):
    gcs_to_bq = GoogleCloudStorageToBigQueryOperator( 
                        task_id = 'gcs_to_bq-{}'.format(table_name),
                        bucket  = params['BUCKET_ID'],
                        allow_quoted_newlines  = True, # Allows reading quoted values ("")
                        skip_leading_rows = 1, # Skip the header row
                        source_objects = ['data/{}.csv'.format(table_name)],
                        destination_project_dataset_table = '{}:{}.{}'.format(params['GCP_PROJECT_ID'],params['BQ_DATASET_ID'],table_name),
                        schema_fields =  params_bq_schema[table_name],
                        write_disposition = 'WRITE_TRUNCATE',
                        dag = dag
                )
    list_gcs_to_bq.append(gcs_to_bq)



# Read the query from the Airflow Variables to obtain the final aggregated dataset and store it in BigQuery
execute_bq_sql = BigQueryOperator(
                        task_id='execute_bq_sql',
                        sql= query_sql,
                        use_legacy_sql=False,
                        destination_dataset_table=bq_recent_questions_table_id,
        "input":
        "gs://{bucket}/airflow-training-data/land_registry_price_paid_uk/{{{{ ds }}}}/"
        .format(bucket=bucket_name),
        "table":
        "first_result_table",
        "dataset":
        "airflow_dataset",
    },
    dag=dag,
)

results_to_bigquery = GoogleCloudStorageToBigQueryOperator(
    task_id='results_to_bigquery',
    bucket=bucket_name,
    source_objects=['airflow-training-data/results/{{ ds }}/part-*'],
    destination_project_dataset_table='airflow_dataset.results_{{ ds_nodash }}',
    source_format='PARQUET',
    # schema_fields=schemas.gsob(),
    create_disposition='CREATE_IF_NEEDED',
    write_disposition='WRITE_TRUNCATE',
    google_cloud_storage_conn_id='google_cloud_storage_default',
    bigquery_conn_id='bigquery_default',
    autodetect=True,
    dag=dag,
)

[pgsl_to_gcs, http_to_gcs
 ] >> dataproc_create_cluster >> compute_aggregates >> [
     dataproc_delete_cluster, load_into_bigquery, results_to_bigquery
 ]
Example #27
t2 = GoogleCloudStorageToBigQueryOperator(
    task_id='gcs_to_bq',
    bucket='example-datasets',
    source_objects=['game_data_condensed.csv'],
    destination_project_dataset_table='{0}.gcp_example_table'.format(
        BQ_DATASET_NAME),
    schema_fields=[
        {
            'name': 'name',
            'type': 'string',
            'mode': 'nullable'
        },
        {
            'name': 'team',
            'type': 'string',
            'mode': 'nullable'
        },
        {
            'name': 'total_score',
            'type': 'integer',
            'mode': 'nullable'
        },
        {
            'name': 'timestamp',
            'type': 'integer',
            'mode': 'nullable'
        },
        {
            'name': 'window_start',
            'type': 'string',
            'mode': 'nullable'
        },
    ],
    write_disposition='WRITE_TRUNCATE')
Example #28
        region="asia-east1",
        zone="asia-east1-a")

    submit_pyspark = DataProcPySparkOperator(
        task_id="run_pyspark_etl",
        main=PYSPARK_JOB,
        cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
        region="asia-east1")

    bq_load_delays_by_distance = GoogleCloudStorageToBigQueryOperator(
        task_id="bq_load_avg_delays_by_distance",
        bucket='enr1qu319-data-engineer-1',
        source_objects=[
            "flights_data_output/" + current_date + "_distance_category/part-*"
        ],
        destination_project_dataset_table=PROJECT_ID +
        ".data_analysis.avg_delays_by_distance",
        autodetect=True,
        source_format="NEWLINE_DELIMITED_JSON",
        create_disposition="CREATE_IF_NEEDED",
        skip_leading_rows=0,
        write_disposition="WRITE_APPEND",
        max_bad_records=0)

    bq_load_delays_by_flight_nums = GoogleCloudStorageToBigQueryOperator(
        task_id="bq_load_delays_by_flight_nums",
        bucket='enr1qu319-data-engineer-1',
        source_objects=[
            "flights_data_output/" + current_date + "_flight_nums/part-*"
        ],
        destination_project_dataset_table=PROJECT_ID +
        ".data_analysis.avg_delays_by_flight_nums",
Example #29
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

    load_parquet_bqt = GoogleCloudStorageToBigQueryOperator(
        task_id='load_parquet_bqt',
        bucket='dphivedb',
        source_objects=['HQL/PARQUET/*'],
        schema_fields=None,
        schema_object=None,
        source_format='parquet',
        destination_project_dataset_table='bqdataset.test3',
        bigquery_conn_id='bigquery_default',
        google_cloud_storage_conn_id='google_cloud_default',
        write_disposition='WRITE_TRUNCATE',
        autodetect=True,
        delegate_to=None,
        udf_config=None,
        use_legacy_sql=True,
        maximum_billing_tier=None,
        maximum_bytes_billed=None,
        create_disposition='CREATE_IF_NEEDED',
        schema_update_options=(),
        query_params=None,
        labels=None,
        priority='INTERACTIVE',
        time_partitioning=None,
        api_resource_configs=None,
        cluster_fields=None,
        location=None,
        encryption_configuration=None,
        dag=dag)

    rm_par_file = BashOperator(
Example #30
        ],
    )

    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        project_id=PROJECT_ID,
        task_id="delete_dataproc_cluster",
        cluster_name="gcp-data-platform",
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
    )

    gcs_to_bigquery = GoogleCloudStorageToBigQueryOperator(
        task_id="gcs_to_bigquery",
        bucket=OUTPUT_BUCKET,
        source_objects=['{{ ds_format(ds, "%Y/%m/%d") }}/*.parquet'],
        destination_project_dataset_table=
        "{project_id}.events.events{{ ds_nodash }}".format(
            project_id=PROJECT_ID),
        source_format="PARQUET",
        create_disposition="CREATE_IF_NEEDED",
        write_disposition="WRITE_TRUNCATE",
    )

    explore_visits_by_hour = BigQueryOperator(
        task_id="explore_visits_by_hour",
        sql="""
   SELECT FORMAT_DATETIME("%F %H:00:00", DATETIME(TIMESTAMP_SECONDS(CAST(timestamp AS INT64)))) AS ts,
          COUNT(1) AS num_visits
     FROM events.events
    WHERE url = '/explore'
 GROUP BY ts
 ORDER BY ts ASC