Example #1
def _transfer_bigquery_to_gcs(dag, task_id) -> models.BaseOperator:
    """Builds the operator that transfers the final output table to GCS.

    Args:
      dag: The DAG to add this operator to.
      task_id: ID for this specific task within the DAG.

    Returns:
      Operator to use within a DAG to run the pipeline for moving records to GCS.
    """
    storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)

    final_output_uri = '{path}/result-{timestamp}-*.json'.format(
        path=storage_vars['gcs_output_path'], timestamp=int(time.time()))

    final_output_table = '{project}.{dataset}.final_output'.format(
        project=storage_vars['bq_working_project'],
        dataset=storage_vars['bq_working_dataset'])

    return bigquery_to_gcs.BigQueryToCloudStorageOperator(
        task_id=task_id,
        source_project_dataset_table=final_output_table,
        destination_cloud_storage_uris=[final_output_uri],
        export_format='NEWLINE_DELIMITED_JSON',
        dag=dag)
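A minimal usage sketch (not part of the original example) showing how the helper above could be attached to a DAG inside the same module; the dag_id, schedule, and task_id below are assumptions:

import datetime

import airflow
from airflow import models

# Hypothetical DAG that the helper's operator is registered on.
dag = models.DAG(
    dag_id='blockbuster_transfer_to_gcs',  # assumed name
    schedule_interval=datetime.timedelta(days=1),
    start_date=airflow.utils.dates.days_ago(1))

# The helper builds the export operator and attaches it to this DAG.
transfer_to_gcs = _transfer_bigquery_to_gcs(dag, task_id='bq_to_gcs')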
Example #2

import datetime

import airflow
from airflow import DAG
from airflow.contrib.operators import bigquery_operator
from airflow.contrib.operators import bigquery_to_gcs
from airflow.utils import trigger_rule

default_dag_args = {
    'start_date': airflow.utils.dates.days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
}

output_file = 'gs://southamerica-east1-cassio-a-77e1beeb-bucket/data/address.csv'
# Replace the bucket in output_file with your own bucket and path
# (see the Variable-based sketch after this example).
with DAG(dag_id='demo_bq_dag',
         schedule_interval=datetime.timedelta(days=1),
         default_args=default_dag_args) as dag:

    bq_airflow_commits_query = bigquery_operator.BigQueryOperator(
        task_id='bq_airflow_commits_query',
        bql="""    SELECT Address
         FROM [airflow-studies:Address.Add]
          """)

    export_commits_to_gcs = bigquery_to_gcs.BigQueryToCloudStorageOperator(
        task_id='export_airflow_commits_to_gcs',
        source_project_dataset_table='airflow-studies:Address.Add',
        destination_cloud_storage_uris=[output_file],
        export_format='CSV')

    bq_airflow_commits_query >> export_commits_to_gcs
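A sketch (not from the original) of reading the bucket from an Airflow Variable instead of hard-coding it, as the comment above suggests; the Variable name 'gcs_export_bucket' and its default are assumptions:

from airflow import models

# Hypothetical: resolve the export bucket at DAG-parse time, with a fallback default.
export_bucket = models.Variable.get('gcs_export_bucket', default_var='my-example-bucket')
output_file = 'gs://{}/data/address.csv'.format(export_bucket)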
Example #3
    # Get the table list from master file
    all_records = read_table_list(table_list_file_path)
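    # read_table_list is defined elsewhere in the original file. A minimal
    # sketch of what it is assumed to do (parse a CSV master file with
    # 'table_source' and 'table_dest' columns into a list of dicts):
    #
    #   import csv
    #
    #   def read_table_list(table_list_file_path):
    #       with open(table_list_file_path) as table_list_file:
    #           return list(csv.DictReader(table_list_file))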

    # Loop over each record in the 'all_records' python list to build up
    # Airflow tasks
    for record in all_records:
        logger.info('Generating tasks to transfer table: {}'.format(record))

        table_source = record['table_source']
        table_dest = record['table_dest']

        BQ_to_GCS = bigquery_to_gcs.BigQueryToCloudStorageOperator(
            # Replace ":" with valid character for Airflow task
            task_id='{}_BQ_to_GCS'.format(table_source.replace(":", "_")),
            source_project_dataset_table=table_source,
            destination_cloud_storage_uris=[
                '{}-*.avro'.format('gs://' + source_bucket + '/' +
                                   table_source)
            ],
            export_format='AVRO')

        GCS_to_GCS = gcs_to_gcs.GoogleCloudStorageToGoogleCloudStorageOperator(
            # Replace ":" with valid character for Airflow task
            task_id='{}_GCS_to_GCS'.format(table_source.replace(":", "_")),
            source_bucket=source_bucket,
            source_object='{}-*.avro'.format(table_source),
            destination_bucket=dest_bucket,
            # destination_object='{}-*.avro'.format(table_dest)
        )

        GCS_to_BQ = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
Example #4
    # query_device_opsbrowser_snapshot = bigquery_operator.BigQueryOperator(
    #     task_id='query_device_opsbrowser_snapshot',
    #     bql=pathlib.Path(galileo.DAGS_DIR+"/bq_scripts_doi/dta_sql_device_opsbrowser_daily_snapshot_full_doi").read_text(), use_legacy_sql=False)

    # query_device_opsbrowser_delta_snapshot = bigquery_operator.BigQueryOperator(
    #     task_id='query_device_opsbrowser_delta_snapshot',
    #     bql=pathlib.Path(galileo.DAGS_DIR + "/bq_scripts_doi/dta_sql_device_opsbrowser_daily_snapshot_delta_doi").read_text(), use_legacy_sql=False)
    # ============================================================================================================
    # Export datasets
    # pageviews snapshot
    export_bq_to_gcs_json = bigquery_to_gcs.BigQueryToCloudStorageOperator(
        task_id='export_bq_to_gcs_json',
        source_project_dataset_table=
        "{{params.project_id}}.dta_customers.pageviews_daily_snapshot_delta_doi",
        params={'project_id': project_id},
        destination_cloud_storage_uris=[
            "gs://%s/data/analytics/json/%s.json" %
            (models.Variable.get('AIRFLOW_BUCKET',
                                 'us-east1-dta-airflow-b3415db4-bucket'),
             'pgviews_daily_snapshot_doi')
        ],
        export_format='NEWLINE_DELIMITED_JSON')

    export_bq_to_gcs_csv = bigquery_to_gcs.BigQueryToCloudStorageOperator(
        task_id='export_bq_to_gcs_csv',
        source_project_dataset_table=
        "{{params.project_id}}.dta_customers.pageviews_daily_snapshot_delta_doi",
        params={'project_id': project_id},
        destination_cloud_storage_uris=[
            "gs://%s/data/analytics/csv/%s.csv" %
            (models.Variable.get('AIRFLOW_BUCKET',
                                 'us-east1-dta-airflow-b3415db4-bucket'),
Example #5
        sql="""
        SELECT owner_display_name, title, view_count
        FROM `bigquery-public-data.stackoverflow.posts_questions`
        WHERE creation_date < CAST('{max_date}' AS TIMESTAMP)
            AND creation_date >= CAST('{min_date}' AS TIMESTAMP)
        ORDER BY view_count DESC
        LIMIT 100
        """.format(max_date=max_query_date, min_date=min_query_date),
        use_legacy_sql=False,
        destination_dataset_table=bq_recent_questions_table_id)
    # [END composer_bigquery]

    # Export query result to Cloud Storage.
    export_questions_to_gcs = bigquery_to_gcs.BigQueryToCloudStorageOperator(
        task_id='export_recent_questions_to_gcs',
        source_project_dataset_table=bq_recent_questions_table_id,
        destination_cloud_storage_uris=[output_file],
        export_format='CSV')

    # Perform most popular question query.
    bq_most_popular_query = bigquery_operator.BigQueryOperator(
        task_id='bq_most_popular_question_query',
        sql="""
        SELECT title, view_count
        FROM `{table}`
        ORDER BY view_count DESC
        LIMIT 1
        """.format(table=bq_recent_questions_table_id),
        use_legacy_sql=False,
        destination_dataset_table=bq_most_popular_table_id)
Example #6
                           """
    query_webcrawl_ga_index = bigquery_operator.BigQueryOperator(
        task_id='query_webcrawl_ga_index',
        bql=query,
        use_legacy_sql=False,
        params={
            'project_id': project_id,
            'temp_table': temp_table,
        })
    export_webcrawl_ga_index_to_gcs = bigquery_to_gcs.BigQueryToCloudStorageOperator(
        task_id='export_webcrawl_ga_index_to_gcs',
        source_project_dataset_table=
        "{{params.project_id}}.tmp.{{params.temp_table}}_{{ ts_nodash }}",
        params={
            'project_id': project_id,
            'temp_table': temp_table
        },
        destination_cloud_storage_uris=[
            "gs://%s/data/%s.csv" % (models.Variable.get(
                'AIRFLOW_BUCKET',
                'us-east1-dta-airflow-b3415db4-bucket'), temp_table)
        ],
        export_format='CSV')
    query_webcrawl_ga_index >> export_webcrawl_ga_index_to_gcs

    compare_webcrawl_ga_accounts = python_operator.PythonOperator(
        task_id='compare_webcrawl_ga_with_accounts',
        python_callable=compare_webcrawl_ga_with_accounts)
    generate_account_index >> compare_webcrawl_ga_accounts
    export_webcrawl_ga_index_to_gcs >> compare_webcrawl_ga_accounts

if __name__ == '__main__':
Example #7
    query_device_opsbrowser_snapshot_intraday = bigquery_operator.BigQueryOperator(
        task_id='query_device_opsbrowser_snapshot_intraday',
        bql=pathlib.Path(
            galileo.DAGS_DIR +
            "/bq_scripts_doi/bq_scripts_delta/dta_sql_device_opsbrowser_snapshot_delta_intraday_doi"
        ).read_text(),
        use_legacy_sql=False)
    # ===================================================================================================
    # Export datasets
    # pageviews snapshot
    export_bq_to_gcs_json_pgviews = bigquery_to_gcs.BigQueryToCloudStorageOperator(
        task_id='export_bq_to_gcs_json_pgviews',
        source_project_dataset_table=
        "{{params.project_id}}.dta_customers.pageviews_daily_snapshot_intraday_doi",
        params={'project_id': project_id},
        destination_cloud_storage_uris=[
            "gs://%s/data/analytics/delta_snapshot/%s.json" %
            (models.Variable.get('AIRFLOW_BUCKET',
                                 'us-east1-dta-airflow-b3415db4-bucket'),
             'pgviews_daily_snapshot_doi_' + str(date.today()) + '_intraday')
        ],
        export_format='NEWLINE_DELIMITED_JSON')

    export_bq_to_gcs_csv_pgviews = bigquery_to_gcs.BigQueryToCloudStorageOperator(
        task_id='export_bq_to_gcs_csv_pgviews',
        source_project_dataset_table=
        "{{params.project_id}}.dta_customers.pageviews_daily_snapshot_intraday_doi",
        params={'project_id': project_id},
        destination_cloud_storage_uris=[
            "gs://%s/data/analytics/delta_snapshot/%s.csv" %
            (models.Variable.get('AIRFLOW_BUCKET',
                                 'us-east1-dta-airflow-b3415db4-bucket'),
Example #8
        schedule_interval=datetime.timedelta(weeks=4),
        default_args=default_dag_args) as dag: 

    # Query today's data.
    bq_fetch_todays_data = bigquery_operator.BigQueryOperator(
        task_id='bq_fetch_todays_data',
        bql="""
          SELECT *
          FROM `camel-154800.chicago_historical_congestion_data.Aug2_Larger_dataset_view`
          WHERE FULL_DATE = CURRENT_DATE("America/Chicago")
          ORDER BY HH_hour DESC
        """,
        use_legacy_sql=False,
        destination_dataset_table=bq_temp_composer_dataset)
    # [END composer_bigquery]

    # Export query result to Cloud Storage.
    export_output_to_gcs = bigquery_to_gcs.BigQueryToCloudStorageOperator(
        task_id='export_output_to_gcs',
        source_project_dataset_table=bq_temp_composer_dataset,
        destination_cloud_storage_uris=[output_file],
        export_format='CSV')



    # TODO: delete the temporary BigQuery table after the export
    # (a sketch follows the dependency line below).
    bq_fetch_todays_data >> export_output_to_gcs
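    # Sketch only, not part of the original example: the cleanup noted above
    # could drop the temporary table once the export has finished. Assumes
    # "from airflow.contrib.operators import bigquery_table_delete_operator"
    # at the top of the file.
    delete_bq_temp_table = bigquery_table_delete_operator.BigQueryTableDeleteOperator(
        task_id='delete_bq_temp_table',
        deletion_dataset_table=bq_temp_composer_dataset,
        ignore_if_missing=True)

    export_output_to_gcs >> delete_bq_temp_table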
Example #9

    create_disposition="CREATE_IF_NEEDED",
    write_disposition="WRITE_TRUNCATE",
    dag=dag
)

# TODO: Currently all data steps run whether BTYD or DNN is used. It would be
# better to run only one branch or the other based on 'model_type'.
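# Sketch only, not part of the original DAG: the TODO above could be handled
# with a BranchPythonOperator keyed on the model type. MODEL_TYPE, the branch
# task_ids, and the python_operator import are assumptions.
def _choose_model_branch():
    """Return the task_id of the data-preparation branch to run."""
    return 'prepare_btyd' if MODEL_TYPE == 'btyd' else 'prepare_dnn'

model_branch = python_operator.BranchPythonOperator(
    task_id='model_branch',
    python_callable=_choose_model_branch,
    dag=dag)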
data_btyd_location = ['gs://{}/{}'.format(LOCATION_TRAINING_DATA, 'btyd.csv')]
data_train_locations = ['gs://{}/{}'.format(LOCATION_TRAINING_DATA, 'train.csv')]
data_eval_locations = ['gs://{}/{}'.format(LOCATION_TRAINING_DATA, 'eval.csv')]
data_test_locations = ['gs://{}/{}'.format(LOCATION_TRAINING_DATA, 'test.csv')]

t5a = bigquery_to_gcs.BigQueryToCloudStorageOperator(
    task_id='bq_dnn_train_to_gcs',
    source_project_dataset_table="{}.{}.{}".format(PROJECT, DATASET, 'dnn_train'),
    destination_cloud_storage_uris=data_train_locations,
    print_header=False,
    dag=dag
)

t5b = bigquery_to_gcs.BigQueryToCloudStorageOperator(
    task_id='bq_dnn_eval_to_gcs',
    source_project_dataset_table="{}.{}.{}".format(PROJECT, DATASET, 'dnn_eval'),
    destination_cloud_storage_uris=data_eval_locations,
    print_header=False,
    dag=dag
)

t5c = bigquery_to_gcs.BigQueryToCloudStorageOperator(
    task_id='bq_dnn_test_to_gcs',
    source_project_dataset_table="{}.{}.{}".format(PROJECT, DATASET, 'dnn_test'),
Example #10

}

output_file = 'gs://us-east1-example-environmen-3bb7fd7a-bucket/data/product.csv'

dag = DAG(
    'bq_export_dag',
    catchup=False,
    default_args=default_args,
    schedule_interval="@once",
)

start_task = dummy_operator.DummyOperator(
    task_id='start',
    default_args=default_args,
    dag=dag,
)

bq_query_task = bigquery_operator.BigQueryOperator(
    task_id='bq_query',
    bql='''SELECT * FROM [hd-personalization-dev.temp.Prod] ''',
    dag=dag)

export_to_gcs_task = bigquery_to_gcs.BigQueryToCloudStorageOperator(
    task_id='export_to_gcs',
    source_project_dataset_table='hd-personalization-dev.temp.Prod',
    destination_cloud_storage_uris=[output_file],
    export_format='CSV',
    dag=dag)

start_task >> bq_query_task >> export_to_gcs_task
Example #11

            WHERE
              _TABLE_SUFFIX BETWEEN '{{params.start}}' AND '{{params.end}}'
            AND       REGEXP_CONTAINS(page.pagePath, r"{{ params.search_param }}=(.*?)(?:&|$)")
            group by query
            order by count(*) desc
                       """

        query_internalsearch = bigquery_operator.BigQueryOperator(
            task_id='query_internalsearch_' + galileo.domain_slug(d['domain']),
            bql=query, use_legacy_sql=False, params={
                'project_id': project_id,
                'view_id': d['view_id'],
                'start': start,
                'end': end,
                'temp_table': temp_table,
                'domain': d['domain'],
                'search_param': d['search_param']
            })
        export_internalsearch_to_gcs = bigquery_to_gcs.BigQueryToCloudStorageOperator(
            task_id='export_internalsearch_to_gcs_' + galileo.domain_slug(d['domain']),
            source_project_dataset_table="{{params.project_id}}.tmp.{{params.temp_table}}_{{ ts_nodash }}",
            params={
                'project_id': project_id,
                'temp_table': temp_table
            },
            destination_cloud_storage_uris=[
                "gs://%s/data/searchqueries/%s.csv" % (
                models.Variable.get('AIRFLOW_BUCKET', 'us-east1-dta-airflow-b3415db4-bucket'), temp_table)],
            export_format='CSV')
        query_internalsearch >> export_internalsearch_to_gcs
Example #12
        task_id='query_device_brand_12months_daily_snapshot',
        bql=pathlib.Path(
            galileo.DAGS_DIR +
            "/bq_scripts_ursa_major/dta_sql_devicebrand_snapshot_12months_daily_doi"
        ).read_text(),
        use_legacy_sql=False)
    # ============================================================================================================
    # ============================================================================================================
    # Export datasets
    # total unique visitors 90 days snapshot
    export_bq_to_gcs_json_uniquevisitors = bigquery_to_gcs.BigQueryToCloudStorageOperator(
        task_id='export_bq_to_gcs_json_uniquevisitors',
        source_project_dataset_table=
        "{{params.project_id}}.dta_project_ursa_major.unique_visitors_90days_daily_delta_doi",
        params={'project_id': project_id},
        destination_cloud_storage_uris=[
            "gs://%s/data/analytics/project_ursa_major/%s.json" %
            (models.Variable.get('AIRFLOW_BUCKET',
                                 'us-east1-dta-airflow-b3415db4-bucket'),
             'uniquevisitors_90days_daily_snapshot_doi')
        ],
        export_format='NEWLINE_DELIMITED_JSON')

    export_bq_to_gcs_csv_uniquevisitors = bigquery_to_gcs.BigQueryToCloudStorageOperator(
        task_id='export_bq_to_gcs_csv_uniquevisitors',
        source_project_dataset_table=
        "{{params.project_id}}.dta_project_ursa_major.unique_visitors_90days_daily_delta_doi",
        params={'project_id': project_id},
        destination_cloud_storage_uris=[
            "gs://%s/data/analytics/project_ursa_major/%s.csv" %
            (models.Variable.get('AIRFLOW_BUCKET',
                                 'us-east1-dta-airflow-b3415db4-bucket'),
Example #13

        task_id='query_digital_services_counter',
        bql=pathlib.Path(
            galileo.DAGS_DIR +
            "/bq_scripts_ursa_major/dta_sql_websites_count_doi").read_text(),
        use_legacy_sql=False)

    # ============================================================================================================
    # ============================================================================================================
    # Export datasets
    # total unique visitors 90 days snapshot
    export_bq_to_gcs_json_counter = bigquery_to_gcs.BigQueryToCloudStorageOperator(
        task_id='export_bq_to_gcs_json_counter',
        source_project_dataset_table=
        "{{params.project_id}}.dta_project_ursa_major.ga_monthly_websites_counter",
        params={'project_id': project_id},
        destination_cloud_storage_uris=[
            "gs://%s/data/analytics/project_ursa_major/%s.json" %
            (models.Variable.get('AIRFLOW_BUCKET',
                                 'us-east1-dta-airflow-b3415db4-bucket'),
             'websites_agencies_monthly_counter')
        ],
        export_format='NEWLINE_DELIMITED_JSON')

    export_bq_to_gcs_csv_counter = bigquery_to_gcs.BigQueryToCloudStorageOperator(
        task_id='export_bq_to_gcs_csv_counter',
        source_project_dataset_table=
        "{{params.project_id}}.dta_project_ursa_major.ga_monthly_websites_counter",
        params={'project_id': project_id},
        destination_cloud_storage_uris=[
            "gs://%s/data/analytics/project_ursa_major/%s.csv" %
            (models.Variable.get('AIRFLOW_BUCKET',
                                 'us-east1-dta-airflow-b3415db4-bucket'),
Example #14
                                                      UNNEST(hits) as hits
        WHERE
        hits.type="PAGE"
        ORDER BY
        fullVisitorId,
        visitId,
        visitNumber,
        hitNumber
    """

    query_benchmark = bigquery_operator.BigQueryOperator(
        task_id='query_benchmark',
        bql=query,
        use_legacy_sql=False,
        params={
            'project_id': project_id,
            'view_id': view_id,
            'timestamp': timestamp,
            'temp_table': temp_table
        })
    export_benchmark_to_gcs = bigquery_to_gcs.BigQueryToCloudStorageOperator(
        task_id='export_benchmark_to_gcs',
        source_project_dataset_table="%s.tmp.%s" % (project_id, temp_table),
        destination_cloud_storage_uris=[
            "gs://%s/data/%s.csv" % (models.Variable.get(
                'AIRFLOW_BUCKET',
                'us-east1-dta-airflow-b3415db4-bucket'), temp_table)
        ],
        export_format='CSV')
    query_benchmark >> export_benchmark_to_gcs
Example #15

            ,Security_Type_Child
            ,Security_Type_Parent
            ,Sector
            ,Trade_Date
            -- Account Information
            ,Account_Type
            ,Account_Name
            -- Financial Advisor Information
            ,FA_Code
            ,FA_Name
            -- Department
            ,Dept_Name
            ,Dept_City
            ,Dept_State""",
    use_legacy_sql=True,
    destination_dataset_table='gcp_flex_etl.trade_agg',
    write_disposition='WRITE_TRUNCATE',
    create_disposition='CREATE_IF_NEEDED',
    allow_large_results=True,
    dag=dag)

## Export trade_agg table to GCS
export_trade_agg_table = bigquery_to_gcs.BigQueryToCloudStorageOperator(
    task_id='export_trade_agg_table',
    source_project_dataset_table='gcp_flex_etl.trade_agg',
    destination_cloud_storage_uris=[current_bucket],
    export_format='CSV',
    dag=dag)

start >> archive_old_data >> load_account_current >> load_trade_agg >> export_trade_agg_table >> end
start >> archive_old_data >> load_financial_advisor_current >> load_trade_agg >> export_trade_agg_table >> end
Example #16

        use_legacy_sql=False)

    query_pageviews_snapshot_delta = bigquery_operator.BigQueryOperator(
        task_id='query_pageviews_snapshot_delta',
        bql=pathlib.Path(
            galileo.DAGS_DIR +
            "/bq_scripts/dta_sql_pgvw_daily_snapshot_incremental").read_text(),
        use_legacy_sql=False)

    # Export data from BQ to GCS
    export_bq_to_gcs_json = bigquery_to_gcs.BigQueryToCloudStorageOperator(
        task_id='export_bq_to_gcs_json',
        source_project_dataset_table=
        "{{params.project_id}}.dta_customers.pageviews_daily_snapshot_increment",
        params={'project_id': project_id},
        destination_cloud_storage_uris=[
            "gs://%s/data/analytics/%s.json" % (models.Variable.get(
                'AIRFLOW_BUCKET', 'us-east1-dta-airflow-b3415db4-bucket'),
                                                'pgviews_daily_snapshot')
        ],
        export_format='NEWLINE_DELIMITED_JSON')

    export_bq_to_gcs_csv = bigquery_to_gcs.BigQueryToCloudStorageOperator(
        task_id='export_bq_to_gcs_csv',
        source_project_dataset_table=
        "{{params.project_id}}.dta_customers.pageviews_daily_snapshot_increment",
        params={'project_id': project_id},
        destination_cloud_storage_uris=[
            "gs://%s/data/analytics/%s.csv" % (models.Variable.get(
                'AIRFLOW_BUCKET', 'us-east1-dta-airflow-b3415db4-bucket'),
                                               'pgviews_daily_snapshot')