def test_execute(self, mock_hook):
        source_project_dataset_table = '{}.{}'.format(
            TEST_DATASET, TEST_TABLE_ID)
        destination_cloud_storage_uris = ['gs://some-bucket/some-file.txt']
        compression = 'NONE'
        export_format = 'CSV'
        field_delimiter = ','
        print_header = True
        labels = {'k1': 'v1'}

        operator = BigQueryToCloudStorageOperator(
            task_id=TASK_ID,
            source_project_dataset_table=source_project_dataset_table,
            destination_cloud_storage_uris=destination_cloud_storage_uris,
            compression=compression,
            export_format=export_format,
            field_delimiter=field_delimiter,
            print_header=print_header,
            labels=labels
        )

        operator.execute(None)
        mock_hook.return_value \
            .get_conn.return_value \
            .cursor.return_value \
            .run_extract \
            .assert_called_once_with(
                source_project_dataset_table=source_project_dataset_table,
                destination_cloud_storage_uris=destination_cloud_storage_uris,
                compression=compression,
                export_format=export_format,
                field_delimiter=field_delimiter,
                print_header=print_header,
                labels=labels
            )
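# A minimal sketch of the scaffolding this test method assumes. The module path
# being patched, the test class name, and the constant values are assumptions,
# not taken from the snippet above.
import unittest
from unittest import mock

from airflow.contrib.operators.bigquery_to_gcs import BigQueryToCloudStorageOperator

TASK_ID = 'test-bq-to-gcs-operator'          # assumed value
TEST_DATASET = 'test-dataset'                # assumed value
TEST_TABLE_ID = 'test-table-id'              # assumed value


class BigQueryToCloudStorageOperatorTest(unittest.TestCase):

    # Patch the hook where the operator module imports it, so execute() uses the mock
    @mock.patch('airflow.contrib.operators.bigquery_to_gcs.BigQueryHook')
    def test_execute(self, mock_hook):
        ...  # body as in the example above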
Example #3
def extract_channel_subdag(
    parent_dag_name,
    child_dag_name,
    default_args,
    schedule_interval,
    dataset_id,
    channel,
):
    dag = DAG(
        dag_id="{}.{}".format(parent_dag_name, child_dag_name),
        default_args=default_args,
        schedule_interval=schedule_interval,
    )

    bq_extract_table = "glam_extract_firefox_{}_v1".format(channel)
    etl_query = bigquery_etl_query(
        task_id="glam_client_probe_counts_{}_extract".format(channel),
        destination_table=bq_extract_table,
        dataset_id=dataset_id,
        project_id=project_id,
        owner="*****@*****.**",
        email=[
            "*****@*****.**",
            "*****@*****.**",
            "*****@*****.**",
        ],
        date_partition_parameter=None,
        arguments=("--replace", ),
        sql_file_path=
        "sql/moz-fx-data-shared-prod/{}/glam_client_probe_counts_extract_v1/query.sql"
        .format(dataset_id),
        parameters=("channel:STRING:{}".format(channel), ),
        dag=dag,
    )

    gcs_delete = GoogleCloudStorageDeleteOperator(
        task_id="glam_gcs_delete_old_{}_extracts".format(channel),
        bucket_name=glam_bucket,
        prefix="aggs-desktop-{}".format(channel),
        google_cloud_storage_conn_id=gcp_conn.gcp_conn_id,
        dag=dag,
    )

    gcs_destination = "gs://{bucket}/aggs-desktop-{channel}-*.csv".format(
        bucket=glam_bucket, channel=channel)
    bq2gcs = BigQueryToCloudStorageOperator(
        task_id="glam_extract_{}_to_csv".format(channel),
        source_project_dataset_table="{}.{}.{}".format(project_id, dataset_id,
                                                       bq_extract_table),
        destination_cloud_storage_uris=gcs_destination,
        bigquery_conn_id=gcp_conn.gcp_conn_id,
        export_format="CSV",
        print_header=False,
        dag=dag,
    )

    etl_query >> gcs_delete >> bq2gcs

    return dag
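# A hedged usage sketch: attaching the subdag returned above to a parent DAG via
# SubDagOperator. The parent DAG object, its dag_id ("glam"), default_args,
# schedule, dataset, and channel values are all assumptions.
from airflow.operators.subdag_operator import SubDagOperator

extract_nightly = SubDagOperator(
    task_id="extract_nightly",
    subdag=extract_channel_subdag(
        parent_dag_name="glam",            # assumed to match the parent dag_id
        child_dag_name="extract_nightly",  # must match the SubDagOperator task_id
        default_args=default_args,         # assumed to exist in the parent DAG file
        schedule_interval="0 2 * * *",     # assumed schedule
        dataset_id="glam_etl",             # assumed dataset
        channel="nightly",
    ),
    dag=dag,  # the parent DAG, assumed to have dag_id="glam"
)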
Example #4
def extract_channel_subdag(
    parent_dag_name,
    child_dag_name,
    default_args,
    schedule_interval,
    dataset_id,
    channel,
):
    dag = DAG(
        dag_id="{}.{}".format(parent_dag_name, child_dag_name),
        default_args=default_args,
        schedule_interval=schedule_interval,
    )

    bq_extract_table = "glam_client_probe_counts_{}_extract_v1".format(channel)
    glam_client_probe_counts_extract = bigquery_etl_query(
        task_id="glam_client_probe_counts_{}_extract".format(channel),
        destination_table=bq_extract_table,
        dataset_id=dataset_id,
        project_id=project_id,
        owner="*****@*****.**",
        email=[
            "*****@*****.**",
            "*****@*****.**",
            "*****@*****.**",
        ],
        date_partition_parameter=None,
        arguments=("--replace", ),
        dag=dag,
    )

    glam_gcs_delete_old_extracts = GoogleCloudStorageDeleteOperator(
        task_id="glam_gcs_delete_old_{}_extracts".format(channel),
        bucket_name=glam_bucket,
        prefix="extract-desktop-{}".format(channel),
        google_cloud_storage_conn_id=gcp_conn.gcp_conn_id,
        dag=dag,
    )

    gcs_destination = "gs://{}/extract-desktop-{}-*.csv".format(
        glam_bucket, channel)
    glam_extract_to_csv = BigQueryToCloudStorageOperator(
        task_id="glam_extract_{}_to_csv".format(channel),
        source_project_dataset_table="{}.{}.{}".format(project_id, dataset_id,
                                                       bq_extract_table),
        destination_cloud_storage_uris=gcs_destination,
        bigquery_conn_id=gcp_conn.gcp_conn_id,
        export_format="CSV",
        print_header=False,
        dag=dag,
    )

    glam_client_probe_counts_extract >> glam_gcs_delete_old_extracts >> glam_extract_to_csv

    return dag
Example #5
def extract_user_counts(parent_dag_name, child_dag_name, default_args,
                        schedule_interval, dataset_id):

    dag = DAG(
        dag_id="{}.{}".format(parent_dag_name, child_dag_name),
        default_args=default_args,
        schedule_interval=schedule_interval,
    )

    bq_extract_table = "glam_user_counts_extract_v1"
    etl_query = bigquery_etl_query(
        task_id="glam_user_counts_extract",
        destination_table=bq_extract_table,
        dataset_id=dataset_id,
        project_id=project_id,
        owner="*****@*****.**",
        email=[
            "*****@*****.**",
            "*****@*****.**",
            "*****@*****.**",
        ],
        date_partition_parameter=None,
        arguments=("--replace", ),
        dag=dag,
    )

    gcs_delete = GoogleCloudStorageDeleteOperator(
        task_id="glam_gcs_delete_count_extracts",
        bucket_name=glam_bucket,
        prefix="glam-extract-firefox-counts",
        google_cloud_storage_conn_id=gcp_conn.gcp_conn_id,
        dag=dag,
    )

    gcs_destination = "gs://{}/glam-extract-firefox-counts.csv".format(
        glam_bucket)
    bq2gcs = BigQueryToCloudStorageOperator(
        task_id="glam_extract_user_counts_to_csv",
        source_project_dataset_table="{}.{}.{}".format(project_id, dataset_id,
                                                       bq_extract_table),
        destination_cloud_storage_uris=gcs_destination,
        bigquery_conn_id=gcp_conn.gcp_conn_id,
        export_format="CSV",
        print_header=False,
        dag=dag,
    )

    etl_query >> gcs_delete >> bq2gcs

    return dag
Example #6
def get_export_table_operator(table_name, dag=None):
    """Get templated BigQueryToCloudStorageOperator.

    Args:
        table_name (string): Name of the table to export.
        dag (airflow.models.DAG): DAG used by context_manager. e.g. `with get_dag() as dag: get_export_table_operator(..., dag=dag)`. Defaults to None.

    Returns:
        airflow.contrib.operators.bigquery_to_gcs.BigQueryToCloudStorageOperator

    """
    if dag is None:
        logger.warning(
            'No DAG context was found. The operator may not be associated with any DAG and may not appear in the Web UI'
        )

    date_descriptor = '{{ ds_nodash }}'
    table_name_with_date_descriptor = \
        '{table_name}{date_descriptor}'.format(
            table_name=table_name,
            date_descriptor=date_descriptor)

    return BigQueryToCloudStorageOperator(
        dag=dag or models._CONTEXT_MANAGER_DAG,
        task_id='{experiment_name}.{table_name}.export'.format(
            experiment_name=get_config('experiment_name'),
            table_name=table_name),
        source_project_dataset_table=
        '{gcp_project_name}.{database_name}.{table_name}'.format(
            gcp_project_name=get_config('gcp_project_name'),
            database_name='%s_database' % get_config('experiment_name'),
            table_name=table_name_with_date_descriptor),
        # TODO: support data larger than 1 GB
        # https://cloud.google.com/bigquery/exporting-data-from-bigquery#exportingmultiple
        destination_cloud_storage_uris=[
            'gs://{bucket_name}/{experiment_name}/exported_tables/'
            '{table_name}/{date_descriptor}/'
            'out.csv.gzip'.format(
                bucket_name=get_config('bucket_name'),
                experiment_name=get_config('experiment_name'),
                date_descriptor=date_descriptor,
                table_name=table_name)
        ],
        compression="GZIP")
Example #7
                          sql=queries.write_weight_data_to_bq,
                          destination_dataset_table=cfg.BQ_TABLE_WEIGHT,
                          create_disposition='CREATE_IF_NEEDED',
                          write_disposition='WRITE_TRUNCATE',
                          use_legacy_sql=False)

    t4 = GoogleCloudStorageToBigQueryOperator(
        task_id='write_route_data_to_bq',
        bucket=cfg.BUCKET,
        source_objects=[cfg.SOURCE_OBJECT],
        field_delimiter=';',
        destination_project_dataset_table=cfg.BQ_TABLE_ROUTE,
        create_disposition='CREATE_IF_NEEDED',
        write_disposition='WRITE_TRUNCATE',
        skip_leading_rows=1)

    t5 = BigQueryOperator(task_id='prepare_and_merge_data',
                          sql=queries.prepare_and_merge_data,
                          use_legacy_sql=False,
                          destination_dataset_table=cfg.BQ_TABLE_MERGE,
                          create_disposition='CREATE_IF_NEEDED',
                          write_disposition='WRITE_TRUNCATE')

    t6 = BigQueryToCloudStorageOperator(
        task_id='export_results_to_gcs',
        source_project_dataset_table=cfg.BQ_TABLE_MERGE,
        destination_cloud_storage_uris=cfg.DESTINATION_URI,
        export_format='CSV')

    t1 >> t2 >> [t3, t4] >> t5 >> t6
Example #8
def export_to_amplitude(
        parent_dag_name,
        dag_name,
        default_args,
        project,
        dataset,
        table_or_view,
        s3_prefix,
        gcs_bucket='moz-fx-data-derived-datasets-amplitude-export',
        gcp_conn_id='google_cloud_derived_datasets',
        amplitude_s3_conn='amplitude_s3_conn',
        amplitude_s3_bucket='com-amplitude-vacuum-mozilla-vacuum-wup'):

    """Export a bigquery table or view to Amplitude.

    This uses the BigQueryToCloudStorage operator to export the
    partition to GCS, then pushes that data to S3. It operates
    on a temporary table that is dropped after the job is finished.

    :param str parent_dag_name: Parent dag name
    :param str dag_name: This dag's name (appended to parent_dag_name)
    :param str default_args: DAG configuration
    :param str project: BigQuery project containing the table to be exported
    :param str dataset: BigQuery dataset
    :param str table_or_view: Table or view name
    :param str gcs_bucket: The bucket the data will be exported to
    :param str gcp_conn_id: GCP connection ID
    :param str amplitude_s3_conn: S3 connection ID
    :param str amplitude_s3_bucket: The bucket to export data to
    :param str s3_prefix: The prefix for the s3 objects
    """

    environment = environ['DEPLOY_ENVIRONMENT']
    _dag_name = '{}.{}'.format(parent_dag_name, dag_name)

    with models.DAG(_dag_name, default_args=default_args) as dag:
        # For now, we assume the view is already updated
        # See https://github.com/mozilla/bigquery-etl/issues/218

        exec_date = '{{ ds }}'

        # Check that we have data for this date
        check_sql = (
            'SELECT COUNT(*) '
            'FROM `{}.{}.{}` '
            'WHERE DATE(submission_timestamp) = "{}"'
        ).format(project, dataset, table_or_view, exec_date)
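        # Note: exec_date is the literal string '{{ ds }}', so the .format() call above
        # leaves the Jinja placeholder inside check_sql; it is rendered at run time,
        # assuming the sensor templates its sql field. For a hypothetical
        # project/dataset/table and a 2021-01-01 run it would render roughly as:
        #   SELECT COUNT(*) FROM `my-project.telemetry.events`
        #   WHERE DATE(submission_timestamp) = "2021-01-01"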

        wait_for_data = BigQuerySQLSensorOperator(
                task_id='wait_for_data',
                sql=check_sql,
                bigquery_conn_id=gcp_conn_id,
                use_legacy_sql=False
        )

        # Create the table with yesterday's data
        project_id = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id
        temp_table_name = table_or_view + '_{{ ds_nodash }}'
        fully_qualified_table_name = '{}.{}.{}'.format(project_id, dataset, temp_table_name)

        sql = (
            'SELECT * EXCEPT (submission_timestamp) '
            'FROM `{}.{}.{}` '
            'WHERE DATE(submission_timestamp) = "{}"'
        ).format(project, dataset, table_or_view, exec_date)

        create_table = BigQueryOperator(
            task_id='create_temporary_table',
            sql=sql,
            destination_dataset_table=fully_qualified_table_name,
            bigquery_conn_id=gcp_conn_id,
            use_legacy_sql=False
        )

        directory = '/'.join((environment, s3_prefix, '{{ ds_nodash }}'))
        extension = '.tsv.gz'

        # Export from bq to gcs
        # Docs: https://github.com/apache/airflow/blob/master/airflow/contrib/operators/bigquery_to_gcs.py#L28 # noqa: E501
        gcs_uri = 'gs://{}/{}/*{}'.format(gcs_bucket, directory, extension)
        table_extract = BigQueryToCloudStorageOperator(
            task_id='bq_to_gcs',
            source_project_dataset_table=fully_qualified_table_name,
            destination_cloud_storage_uris=[gcs_uri],
            bigquery_conn_id=gcp_conn_id,
            compression='GZIP',
            export_format='CSV',
            field_delimiter='\t',
            print_header=True
        )

        # Push the data to S3
        # Docs: https://github.com/apache/airflow/blob/master/airflow/contrib/operators/gcs_to_s3.py#L29 # noqa: E501
        s3_push = GoogleCloudStorageToS3Operator(
            task_id='gcs_to_s3',
            bucket=gcs_bucket,
            prefix=directory,
            delimiter=extension,
            google_cloud_storage_conn_id=gcp_conn_id,
            dest_aws_conn_id=amplitude_s3_conn,
            dest_s3_key='s3://{}/'.format(amplitude_s3_bucket),
            replace=True
        )

        # Drop the temporary table
        table_drop = BigQueryOperator(
            task_id='drop_temp_table',
            sql='DROP TABLE `{}`'.format(fully_qualified_table_name),
            bigquery_conn_id=gcp_conn_id,
            use_legacy_sql=False
        )

        # Delete the GCS data
        data_delete = GoogleCloudStorageDeleteOperator(
            task_id='delete_gcs_data',
            bucket_name=gcs_bucket,
            prefix=directory,
            gcp_conn_id=gcp_conn_id
        )

        wait_for_data >> create_table >> table_extract >> s3_push
        s3_push >> table_drop
        s3_push >> data_delete

        return dag
Example #9
    project_id = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id

    gcs_bucket = 'moz-fx-data-derived-datasets-amplitude-export'
    directory = environment + '/fenix/{{ ds_nodash }}/'
    extension = '.tsv.gz'

    # Export from bq to gcs
    # Docs: https://github.com/apache/airflow/blob/master/airflow/contrib/operators/bigquery_to_gcs.py#L28
    table_extract = BigQueryToCloudStorageOperator(
        task_id='bq_to_gcs',
        source_project_dataset_table='{project}.telemetry.{table}'.format(
            project=project_id, table=table_name),
        destination_cloud_storage_uris=[
            'gs://{bucket}/{dir}*{ext}'.format(bucket=gcs_bucket,
                                               dir=directory,
                                               ext=extension)
        ],
        bigquery_conn_id=gcp_conn_id,
        compression='GZIP',
        export_format='CSV',
        field_delimiter='\t',
        print_header=True)

    # Push the data to S3
    # Docs: https://github.com/apache/airflow/blob/master/airflow/contrib/operators/gcs_to_s3.py#L29
    s3_push = GoogleCloudStorageToS3Operator(
        task_id='gcs_to_s3',
        bucket=gcs_bucket,
        prefix=directory,
        delimiter=extension,
        google_cloud_storage_conn_id=gcp_conn_id,
        write_disposition="WRITE_TRUNCATE",  # specify to truncate on writes
        #ADD YOUR CODE HERE
    )

    train_files = BUCKET + "/chicago_taxi/data/train/"
    valid_files = BUCKET + "/chicago_taxi/data/valid/"

    # Tasks to export the results of the previous BigQueryOperators to
    # Cloud Storage to stage for later AI Platform Training job.

    bq_export_train_csv_op = BigQueryToCloudStorageOperator(
        task_id="bq_export_gcs_train_csv_task",
        source_project_dataset_table="{}.{}_train_data"
                .format(DESTINATION_DATASET, model.replace(".", "_")),
        destination_cloud_storage_uris=[train_files +
                                        "{}/train-*.csv"
                                        .format(model.replace(".", "_"))],
        export_format="CSV",
        print_header=False,
        dag=dag
    )

    bq_export_valid_csv_op = BigQueryToCloudStorageOperator(
        task_id="bq_export_gcs_valid_csv_task",
        source_project_dataset_table="{}.{}_valid_data"
                .format(DESTINATION_DATASET, model.replace(".", "_")),
        destination_cloud_storage_uris=[valid_files +
                                        "{}/valid-*.csv"
                                        .format(model.replace(".", "_"))],
        export_format="CSV",
        print_header=False,
Example #11
# Reference: https://airflow.apache.org/integration.html#bigqueryoperator
t1 = BigQueryOperator(
    task_id='bq_rec_training_data',
    bql=bql,
    destination_dataset_table='%s.recommendation_events' % DATASET,
    write_disposition='WRITE_TRUNCATE',  # specify to truncate on writes
    dag=dag)

# BigQuery training data export to GCS

# TODO: Fill in the missing operator name for task #2 which
# takes a BigQuery dataset and table as input and exports it to GCS as a CSV
training_file = BUCKET + '/data/recommendation_events.csv'
t2 = BigQueryToCloudStorageOperator(
    task_id='bq_export_op',
    source_project_dataset_table='%s.recommendation_events' % DATASET,
    destination_cloud_storage_uris=[training_file],
    export_format='CSV',
    dag=dag)

# ML Engine training job

job_id = 'recserve_{0}'.format(datetime.datetime.now().strftime('%Y%m%d%H%M'))
job_dir = BUCKET + '/jobs/' + job_id
output_dir = BUCKET
training_args = [
    '--job-dir', job_dir, '--train-files', training_file, '--output-dir',
    output_dir, '--data-type', 'web_views', '--use-optimized'
]

# TODO: Fill in the missing operator name for task #3 which will
# start a new training job to Cloud ML Engine
Example #12
    with TemporaryDirectory(prefix='airflowtmp') as tmp_dir:
        avro_dir = tmp_dir + "/avro"
        data_path = tmp_dir + "/data.dat"

        convert_to_array = BigQueryOperator(
            task_id="convert_to_array",
            sql=bq_sql,
            destination_dataset_table=bq_destination_table,
            write_disposition="WRITE_TRUNCATE",
            use_legacy_sql=False,
            dag=dag)
        cleanup_gcs = BashOperator(task_id="cleanup_gcs",
                                   bash_command=f"gsutil -m rm {output_uri}")
        export_questions_to_gcs = BigQueryToCloudStorageOperator(
            task_id="export_array_to_gcs",
            source_project_dataset_table=bq_destination_table,
            destination_cloud_storage_uris=[output_uri],
            export_format="Avro")
        download_avro = BashOperator(
            task_id="download_avro",
            bash_command=f"gsutil -m cp {output_uri} avro_dir")
        count = BigQueryCountOperator(task_id="get_count",
                                      dataset="consumption_test",
                                      table="tmp_out_table",
                                      dag=dag)
        cmd = (venv_python + " " + dag_dir + "/src/convert_to_memmap.py" +
               ' --rows {{ ti.xcom_pull(task_ids="get_count") }}' +
               " --target-path " + avro_dir + " --target-attr values" +
               " --output-path " + data_path)
        convert = BashOperator(task_id="convert_avro", bash_command=cmd)
Example #13
WHERE DATE(timestamp) <= DATE({{ ds }})
GROUP BY movieid
ORDER BY avg_rating DESC
""",
    write_disposition="WRITE_TRUNCATE",
    create_disposition="CREATE_IF_NEEDED",
    bigquery_conn_id="gcp",
    dag=dag,
)

extract_top_ratings = BigQueryToCloudStorageOperator(
    task_id="extract_top_ratings",
    source_project_dataset_table=(os.environ["GCP_PROJECT"] + ":" +
                                  os.environ["BIGQUERY_DATASET"] + "." +
                                  "rating_results_{{ ds_nodash }}"),
    destination_cloud_storage_uris=("gs://" + os.environ["RESULT_BUCKET"] +
                                    "/{{ ds_nodash }}.csv"),
    export_format="CSV",
    bigquery_conn_id="gcp",
    dag=dag,
)

delete_result_table = BigQueryTableDeleteOperator(
    task_id="delete_result_table",
    deletion_dataset_table=(os.environ["GCP_PROJECT"] + ":" +
                            os.environ["BIGQUERY_DATASET"] + "." +
                            "rating_results_{{ ds_nodash }}"),
    bigquery_conn_id="gcp",
    dag=dag,
)
Example #14
         schedule_interval=timedelta(days=1),
         default_args=default_args) as dag:
    bq_extract_one_day = BigQueryOperator(
        task_id='bq_extract_one_day',
        bql='gcp_smoke/gsob_extract_day.sql',
        destination_dataset_table=
        '{{var.value.gcq_dataset}}.gsod_partition{{ ds_nodash }}',
        write_disposition='WRITE_TRUNCATE',
        bigquery_conn_id='gcp_smoke',
        use_legacy_sql=False)

    bq2gcp_avro = BigQueryToCloudStorageOperator(
        task_id='bq2gcp_avro',
        source_project_dataset_table=
        '{{var.value.gcq_dataset}}.gsod_partition{{ ds_nodash }}',
        destination_cloud_storage_uris=[
            'gs://{{var.value.gcs_bucket}}/{{var.value.gcs_root}}/gcp_smoke_bq/bq_to_gcp_avro/{{ ds_nodash }}/part-*.avro'
        ],
        export_format='AVRO',
        bigquery_conn_id='gcp_smoke',
    )

    bq2gcp_override = BigQueryToCloudStorageOperator(
        task_id='bq2gcp_override',
        source_project_dataset_table=
        '{{var.value.gcq_dataset}}.gsod_partition{{ ds_nodash }}',
        destination_cloud_storage_uris=[
            'gs://{{var.value.gcs_bucket}}/{{var.value.gcs_root}}/gcp_smoke_bq/bq_to_gcp_avro/99999999/part-*.avro'
        ],
        export_format='AVRO',
        bigquery_conn_id='gcp_smoke',
    )
Example #15
        use_legacy_sql=False,
        write_disposition="WRITE_TRUNCATE",
    )

    directory = "/".join((GCS_PREFIX, "{{ ds_nodash }}"))
    extension = ".csv"

    # Export from bq to gcs
    # Docs: https://github.com/apache/airflow/blob/master/airflow/contrib/operators/bigquery_to_gcs.py#L28 # noqa: E501
    gcs_uri = "gs://{}/{}/*{}".format(GCS_BUCKET, directory, extension)

    table_extract = BigQueryToCloudStorageOperator(
        task_id="bq_to_gcs",
        source_project_dataset_table=fully_qualified_table_name,
        destination_cloud_storage_uris=[gcs_uri],
        bigquery_conn_id=gcp_conn_id,
        compression="GZIP",
        export_format="CSV",
        field_delimiter=",",
        print_header=False,
    )

    # Drop the temporary table
    table_drop = BigQueryOperator(
        task_id="drop_temp_table",
        sql="DROP TABLE `{}`".format(fully_qualified_table_name),
        bigquery_conn_id=gcp_conn_id,
        use_legacy_sql=False,
    )

    create_table >> table_extract >> table_drop
Example #16
    date_partition_parameter=None,
    arguments=('--replace', ),
    dag=dag)

glam_gcs_delete_old_extracts = GoogleCloudStorageDeleteOperator(
    task_id="glam_gcs_delete_old_extracts",
    bucket_name=glam_bucket,
    prefix="extract-",
    google_cloud_storage_conn_id=gcp_conn.gcp_conn_id,
    dag=dag)

gcs_destination = "{}/extract-*.csv".format(glam_bucket)
glam_extract_to_csv = BigQueryToCloudStorageOperator(
    task_id="glam_extract_to_csv",
    source_project_dataset_table="glam_client_probe_counts_extract_v1",
    destination_cloud_storage_uris=gcs_destination,
    export_format="CSV",
    print_header=False,
    dag=dag)

wait_for_main_ping >> latest_versions

latest_versions >> clients_daily_scalar_aggregates
clients_daily_scalar_aggregates >> clients_daily_keyed_scalar_aggregates
clients_daily_scalar_aggregates >> clients_daily_keyed_boolean_aggregates
clients_daily_keyed_boolean_aggregates >> clients_scalar_aggregates
clients_daily_keyed_scalar_aggregates >> clients_scalar_aggregates
clients_scalar_aggregates >> clients_scalar_bucket_counts
clients_scalar_aggregates >> scalar_percentiles

latest_versions >> clients_daily_histogram_aggregates
Example #17
def build_dag():
    """Build DAG."""
    dag = DAG('btc_to_neo4j',
              schedule_interval='@daily',
              default_args=DEFAULT_ARGS,
              catchup=True)

    # NOTE: It is important to keep the elements of this list in this order, since that order is relied on later when loading the data
    blockchain_elements = [
        'blocks', 'txns', 'outputs', 'output_addresses', 'inputs'
    ]
    load_dependency = None

    for element in blockchain_elements:
        table = 'crypto_bitcoin.{element}'.format(
            element=element) + '_{{ds_nodash}}'
        bigquery_to_daily_table_task = BigQueryOperator(
            task_id='{element}_to_daily_table'.format(element=element),
            sql='bigquery/{element}.sql'.format(element=element),
            destination_dataset_table=table,
            write_disposition='WRITE_TRUNCATE',
            use_legacy_sql=False,
            dag=dag)

        filename = '{element}/{element}-*.csv'.format(element=element)
        destination_pattern = 'gs://{bucket}'.format(bucket=BUCKET) + \
                              '/neo4j_import/{{macros.ds_format(ds, "%Y-%m-%d", "%Y/%m/%d")}}/' + filename

        table_to_bucket_task = BigQueryToCloudStorageOperator(
            task_id='{element}_table_to_bucket'.format(element=element),
            source_project_dataset_table=table,
            destination_cloud_storage_uris=[destination_pattern],
            export_format='csv',
            field_delimiter=',',
            print_header=True,
            dag=dag)

        load_into_neo4j_task = PythonOperator(
            task_id="load_{element}_into_neo4j".format(element=element),
            python_callable=load_into_neo4j,
            provide_context=True,
            op_kwargs={'element': element},
            pool='neo4j_slot',
            dag=dag)

        # NOTE: timestamps in blocks are not strictly incremental and since we query by dates it could happen
        # that we need to backfill some relations.
        # See: https://bitcoin.stackexchange.com/questions/67618/difference-between-time-and-mediantime-in-getblock
        if element == 'blocks':
            backfill_blocks_in_neo4j_task = PythonOperator(
                task_id="backfill_blocks_in_neo4j",
                python_callable=backfill_blocks_in_neo4j,
                provide_context=True,
                pool='neo4j_slot',
                dag=dag)
            load_into_neo4j_task >> backfill_blocks_in_neo4j_task

        delete_aux_table = BigQueryTableDeleteOperator(
            task_id='delete_{element}_table'.format(element=element),
            deletion_dataset_table=table,
            dag=dag)

        bigquery_to_daily_table_task >> table_to_bucket_task >> load_into_neo4j_task
        table_to_bucket_task >> delete_aux_table

        # Make sure that we load data in Neo4J in right order
        if load_dependency is not None:
            load_dependency >> load_into_neo4j_task

        load_dependency = load_into_neo4j_task

    return dag
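# A hedged usage sketch: calling the factory at module level so the Airflow
# scheduler discovers the DAG when it parses this file.
dag = build_dag()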
Example #18
    create_tmp_table = BigQueryOperator(
        task_id='create_tmp_table',
        sql=QUERY,
        use_legacy_sql=False,
        destination_dataset_table='{}.Temporal.cloudSQLexport_tmp'.format(
            cfg.PROJECT_ID),
        write_disposition='WRITE_TRUNCATE',
        create_disposition='CREATE_IF_NEEDED',
        bigquery_conn_id=cfg.bigquery_conn_id)

    create_tmp_csv = BigQueryToCloudStorageOperator(
        task_id='create_tmp_csv',
        source_project_dataset_table='{}.Temporal.cloudSQLexport_tmp'.format(
            BQ_PROJECT),
        destination_cloud_storage_uris='{}/cloudSQLexport_temp.csv'.format(
            DIR_TMP),
        export_format='CSV',
        print_header=False,
        bigquery_conn_id=cfg.bigquery_conn_id)

    import_to_csql = CloudSqlInstanceImportOperator(
        task_id='import_to_csql',
        project_id=SQL_PROJECT,
        instance='servicedat-cal-mysql',
        body={
            "importContext": {
                "kind": "sql#importContext",
                "fileType": 'CSV',
                "uri": '{}/cloudSQLexport_temp.csv'.format(DIR_TMP),
                "database": DATABASE,
Example #19
yesterday = datetime.combine(datetime.today() - timedelta(1),
                             datetime.min.time())

default_args = {
    'start_date': yesterday,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}


def get_composer_gcs_bucket():
    return os.environ['GCS_BUCKET']


with DAG(dag_id='21_bigquery_operators',
         catchup=False,
         schedule_interval=timedelta(days=1),
         default_args=default_args,
         user_defined_macros={
             'get_composer_gcs_bucket': get_composer_gcs_bucket,
         }) as dag:

    bq_to_gcs = BigQueryToCloudStorageOperator(
        task_id='bq_to_gcs',
        source_project_dataset_table='bigquery-public-data:samples.gsod',
        destination_cloud_storage_uris=[
            'gs://{{ get_composer_gcs_bucket() }}/data/21_bigquery_operators/{{ execution_date }}/gsod-*.csv'
        ])
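    # For illustration only: with an assumed GCS_BUCKET of "us-central1-demo-bucket"
    # and an execution_date of 2021-01-01T00:00:00+00:00, the templated URI above
    # would render to something like:
    #   gs://us-central1-demo-bucket/data/21_bigquery_operators/2021-01-01T00:00:00+00:00/gsod-*.csv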
Example #20
	SELECT user
	, store_id
	, CASE WHEN store_id IS NOT NULL THEN 1 ELSE NULL END AS rating
	FROM transactions
	JOIN stores
	ON transactions.store_id=stores.id
	WHERE user IN (SELECT user FROM users)
	AND store_id IN (SELECT id FROM stores)
	GROUP BY user, store_id
	''')

t2 = BigQueryToCloudStorageOperator(
	task_id='bq_to_gcs_store_recs_train',
	source_project_dataset_table='data_science.store_recs_train',
	destination_cloud_storage_uris=['gs://store_recs/development/training/{{ ds }}/store_recs_train_*.json'],
	export_format='NEWLINE_DELIMITED_JSON',
	delegate_to='*****@*****.**',
	dag=dag
	)

t3 = DataprocClusterCreateOperator(
	task_id='create_cluster',
	project_id='my_project',
	cluster_name='cluster-1',
	num_workers=25,
	storage_bucket='data-science',
	init_actions_uris=['gs://initialization-actions/create-my-cluster.sh'],
	properties= {"spark:spark.executor.instances": "74", 
				 "spark:spark.yarn.executor.memoryOverhead": "2048",
				 "spark:spark.yarn.driver.memoryOverhead": "3072",
				 "spark:spark.executor.cores": "5",
Example #21
    # detected in the Cloud Storage bucket.
    'start_date': yesterday,
    # To email on failure or retry set 'email' arg to your email and enable
    # emailing here.
    'email_on_failure': False,
    'email_on_retry': False,
    # If a task fails, retry it once after waiting at least 5 minutes
    'retries': 0,
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': models.Variable.get('gcp_project')
}

with DAG(
        dag_id='table_snapshot',
        # Continue to run DAG once per day
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:

    start = DummyOperator(task_id='start')

    end = DummyOperator(task_id='end')

    bq_table_snapshot = BigQueryToCloudStorageOperator(
        task_id='bq_table_snapshot',
        source_project_dataset_table='MyProjectID.DATA_LAKE.table_name',
        destination_cloud_storage_uris=[destination_cloud_storage_uris1],
        compression='Snappy',
        export_format='Avro',
        bigquery_conn_id='bigquery_default')

start >> bq_table_snapshot >> end
Example #22
               'project': project_id,
               'region': 'europe-west1',
               'staging_location': 'gs://' + project_id + '-dataflow/staging/',
               'temp_location': 'gs://' + project_id + '-dataflow/temp/'
               }
}

# Define a DAG (directed acyclic graph) of tasks.
# Any task you create within the context manager is automatically added to the
# DAG object.
with models.DAG(
        'hourly_data_transfer',
        schedule_interval='0 * * * *',
        default_args=default_dag_args) as dag:

    run_dataflow = DataFlowPythonOperator(
        py_file='gs://' + project_id + '-dataflow/pipelines/batch_pipeline.py',
        task_id='run_Dataflow_from_BQ_to_Datastore',
        dataflow_default_options=default_dag_args['dataflow_default_options']
    )

    bq_to_gcs = BigQueryToCloudStorageOperator(
        task_id='export_stream_data_from_BQ',
        source_project_dataset_table='my_dataset.stream_data',
        destination_cloud_storage_uris=['gs://' + project_id + '-data-export/stream_data.csv'],
        export_format='CSV')

    # Define DAG dependencies.
    run_dataflow
    bq_to_gcs
Example #23
def export_to_parquet(
    table,
    destination_table=None,
    static_partitions=[],
    arguments=[],
    use_storage_api=False,
    dag_name="export_to_parquet",
    parent_dag_name=None,
    default_args=None,
    aws_conn_id="aws_dev_iam_s3",
    gcp_conn_id="google_cloud_derived_datasets",
    dataproc_zone="us-central1-a",
    dataproc_storage_bucket="moz-fx-data-derived-datasets-parquet",
    num_workers=2,
    num_preemptible_workers=0,
    gcs_output_bucket="moz-fx-data-derived-datasets-parquet",
    s3_output_bucket="telemetry-parquet",
):

    """ Export a BigQuery table to Parquet.

    https://github.com/mozilla/bigquery-etl/blob/master/script/pyspark/export_to_parquet.py

    :param str table:                             [Required] BigQuery table name
    :param Optional[str] destination_table:       Output table name, defaults to table,
                                                  will have r'_v[0-9]+$' replaced with
                                                  r'/v[0-9]+'
    :param List[str] arguments:                   Additional pyspark arguments
    :param bool use_storage_api:                  Whether to read from the BigQuery
                                                  Storage API or an AVRO export
    :param str dag_name:                          Name of DAG
    :param Optional[str] parent_dag_name:         Parent DAG name
    :param Optional[Dict[str, Any]] default_args: DAG configuration
    :param str gcp_conn_id:                       Airflow connection id for GCP access
    :param str dataproc_storage_bucket:           Dataproc staging GCS bucket
    :param str dataproc_zone:                     GCP zone to launch dataproc clusters
    :param int num_preemptible_workers:           Number of Dataproc preemptible workers

    :return: airflow.models.DAG
    """

    # remove the dataset prefix and partition suffix from table
    table_id = table.rsplit(".", 1)[-1]
    unqualified_table, _, partition_id = table_id.partition("$")
    # limit cluster name to 35 characters plus suffix of -export-YYYYMMDD (51 total)
    cluster_name = unqualified_table.replace("_", "-")
    if len(cluster_name) > 35:
        # preserve version when truncating cluster name to 35 characters
        prefix, version = re.match(r"(.*?)(-v[0-9]+)?$", cluster_name).groups("")
        cluster_name = prefix[:35 - len(version)] + version
    cluster_name += "-export-{{ ds_nodash }}"

    dag_prefix = parent_dag_name + "." if parent_dag_name else ""
    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    if destination_table is None:
        destination_table = unqualified_table
    # separate version using "/" instead of "_"
    export_prefix = re.sub(r"_(v[0-9]+)$", r"/\1", destination_table) + "/"
    if static_partitions:
        export_prefix += "/".join(static_partitions) + "/"
    avro_prefix = "avro/" + export_prefix
    if not static_partitions and partition_id:
        avro_prefix += "partition_id=" + partition_id + "/"
    avro_path = "gs://" + gcs_output_bucket + "/" + avro_prefix + "*.avro"

    with models.DAG(dag_id=dag_prefix + dag_name, default_args=default_args) as dag:

        create_dataproc_cluster = DataprocClusterCreateOperator(
            task_id="create_dataproc_cluster",
            cluster_name=cluster_name,
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            num_workers=num_workers,
            image_version="1.4",
            storage_bucket=dataproc_storage_bucket,
            zone=dataproc_zone,
            master_machine_type="n1-standard-8",
            worker_machine_type="n1-standard-8",
            num_preemptible_workers=num_preemptible_workers,
            init_actions_uris=[
                "gs://dataproc-initialization-actions/python/pip-install.sh",
            ],
            metadata={"PIP_PACKAGES": "google-cloud-bigquery==1.20.0"},
        )

        run_dataproc_pyspark = DataProcPySparkOperator(
            task_id="run_dataproc_pyspark",
            cluster_name=cluster_name,
            dataproc_pyspark_jars=[
                "gs://spark-lib/bigquery/spark-bigquery-latest.jar"
            ],
            dataproc_pyspark_properties={
                "spark.jars.packages": "org.apache.spark:spark-avro_2.11:2.4.4",
            },
            main="https://raw.githubusercontent.com/mozilla/bigquery-etl/master"
            "/script/pyspark/export_to_parquet.py",
            arguments=[table]
            + [
                "--" + key + "=" + value
                for key, value in {
                    "avro-path": (not use_storage_api) and avro_path,
                    "destination": "gs://" + gcs_output_bucket,
                    "destination-table": destination_table,
                }.items()
                if value
            ]
            + (["--static-partitions"] if static_partitions else [])
            + static_partitions
            + arguments,
            gcp_conn_id=gcp_conn_id,
        )

        gcs_to_s3 = DataProcHadoopOperatorWithAws(
            task_id="gcs_to_s3",
            main_jar="file:///usr/lib/hadoop-mapreduce/hadoop-distcp.jar",
            arguments=[
                "-update",
                "-delete",
                "gs://{}/{}".format(gcs_output_bucket, export_prefix),
                "s3a://{}/{}".format(s3_output_bucket, export_prefix),
            ],
            cluster_name=cluster_name,
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            aws_conn_id=aws_conn_id,
        )

        delete_dataproc_cluster = DataprocClusterDeleteOperator(
            task_id="delete_dataproc_cluster",
            cluster_name=cluster_name,
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
        )

        if not use_storage_api:
            avro_export = BigQueryToCloudStorageOperator(
                task_id="avro_export",
                source_project_dataset_table=table,
                destination_cloud_storage_uris=avro_path,
                compression=None,
                export_format="AVRO",
                bigquery_conn_id=gcp_conn_id,
            )
            avro_delete = GoogleCloudStorageDeleteOperator(
                task_id="avro_delete",
                bucket_name=gcs_output_bucket,
                prefix=avro_prefix,
                gcp_conn_id=gcp_conn_id,
                trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
            )
            avro_export >> run_dataproc_pyspark >> avro_delete

        create_dataproc_cluster >> run_dataproc_pyspark >> gcs_to_s3
        gcs_to_s3 >> delete_dataproc_cluster

        return dag
Example #24
def preprocess_tasks(model, dag, PROJECT_ID, BUCKET, DATA_DIR):
  # Constants
  # Specify your source BigQuery project, dataset, and table names
  SOURCE_BQ_PROJECT = "nyc-tlc"
  SOURCE_DATASET_TABLE_NAMES = "yellow.trips,green.trips_2014,green.trips_2015".split(",")

  # Specify your destination BigQuery dataset
  DESTINATION_DATASET = "taxifare"
  
  # BigQuery data query
  bql="""
  SELECT
    (tolls_amount + fare_amount) AS fare_amount,
    EXTRACT(DAYOFWEEK FROM pickup_datetime) * 1.0 AS dayofweek,
    EXTRACT(HOUR FROM pickup_datetime) * 1.0 AS hourofday,
    pickup_longitude AS pickuplon,
    pickup_latitude AS pickuplat,
    dropoff_longitude AS dropofflon,
    dropoff_latitude AS dropofflat,
    passenger_count*1.0 AS passengers,
    CONCAT(CAST(pickup_datetime AS STRING), CAST(pickup_longitude AS STRING), CAST(pickup_latitude AS STRING), CAST(dropoff_latitude AS STRING), CAST(dropoff_longitude AS STRING)) AS key
  FROM
    `{0}.{1}`
  WHERE
    trip_distance > 0
    AND fare_amount >= 2.5
    AND pickup_longitude > -78
    AND pickup_longitude < -70
    AND dropoff_longitude > -78
    AND dropoff_longitude < -70
    AND pickup_latitude > 37
    AND pickup_latitude < 45
    AND dropoff_latitude > 37
    AND dropoff_latitude < 45
    AND passenger_count > 0
    AND rand() < 0.00001
  """

  bql = bql.format(SOURCE_BQ_PROJECT, model)

  bql_train = "SELECT * EXCEPT (key) FROM({0}) WHERE MOD(ABS(FARM_FINGERPRINT(key)), 5) < 4".format(bql)
  bql_eval = "SELECT * EXCEPT (key) FROM({0}) WHERE MOD(ABS(FARM_FINGERPRINT(key)), 5) = 4".format(bql)

  # Complete the BigQueryOperator task to truncate the table if it already exists before writing
  # Reference: https://airflow.apache.org/integration.html#bigqueryoperator
  bq_train_data_op = BigQueryOperator(
      task_id="bq_train_data_{}_task".format(model.replace(".","_")),
      bql=bql_train,
      destination_dataset_table="{}.{}_train_data".format(DESTINATION_DATASET, model.replace(".","_")),
      write_disposition="WRITE_TRUNCATE", # specify to truncate on writes
      use_legacy_sql=False,
      dag=dag
  )

  bq_eval_data_op = BigQueryOperator(
      task_id="bq_eval_data_{}_task".format(model.replace(".","_")),
      bql=bql_eval,
      destination_dataset_table="{}.{}_eval_data".format(DESTINATION_DATASET, model.replace(".","_")),
      write_disposition="WRITE_TRUNCATE", # specify to truncate on writes
      use_legacy_sql=False,
      dag=dag
  )

  sql = """
  SELECT
    COUNT(*)
  FROM
    [{0}:{1}.{2}]
  """

  # Check to make sure that the data tables won't be empty
  bq_check_train_data_op = BigQueryCheckOperator(
      task_id="bq_check_train_data_{}_task".format(model.replace(".","_")),
      sql=sql.format(PROJECT_ID, DESTINATION_DATASET, model.replace(".","_") + "_train_data"),
      dag=dag
  )

  bq_check_eval_data_op = BigQueryCheckOperator(
      task_id="bq_check_eval_data_{}_task".format(model.replace(".","_")),
      sql=sql.format(PROJECT_ID, DESTINATION_DATASET, model.replace(".","_") + "_eval_data"),
      dag=dag
  )

  # BigQuery training data export to GCS
  bash_remove_old_data_op = BashOperator(
      task_id="bash_remove_old_data_{}_task".format(model.replace(".","_")),
      bash_command="if gsutil ls {0}/taxifare/data/{1} 2> /dev/null; then gsutil -m rm -rf {0}/taxifare/data/{1}/*; else true; fi".format(BUCKET, model.replace(".","_")),
      dag=dag
  )

  # Takes a BigQuery dataset and table as input and exports it to GCS as a CSV
  bq_export_gcs_train_csv_op = BigQueryToCloudStorageOperator(
      task_id="bq_export_gcs_train_csv_{}_task".format(model.replace(".","_")),
      source_project_dataset_table="{}.{}_train_data".format(DESTINATION_DATASET, model.replace(".","_")),
      destination_cloud_storage_uris=[DATA_DIR + "{}/train-*.csv".format(model.replace(".","_"))],
      export_format="CSV",
      print_header=False,
      dag=dag
  )

  bq_export_gcs_eval_csv_op = BigQueryToCloudStorageOperator(
      task_id="bq_export_gcs_eval_csv_{}_task".format(model.replace(".","_")),
      source_project_dataset_table="{}.{}_eval_data".format(DESTINATION_DATASET, model.replace(".","_")),
      destination_cloud_storage_uris=[DATA_DIR + "{}/eval-*.csv".format(model.replace(".","_"))],
      export_format="CSV",
      print_header=False,
      dag=dag
  )
  
  return (bq_train_data_op,
          bq_eval_data_op,
          bq_check_train_data_op,
          bq_check_eval_data_op,
          bash_remove_old_data_op,
          bq_export_gcs_train_csv_op,
          bq_export_gcs_eval_csv_op)
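# A hedged usage sketch: unpacking the returned operators and wiring them into a
# DAG. The model name, the dag/PROJECT_ID/BUCKET/DATA_DIR values, and the exact
# dependency order are assumptions.
(bq_train_data_op, bq_eval_data_op,
 bq_check_train_data_op, bq_check_eval_data_op,
 bash_remove_old_data_op,
 bq_export_gcs_train_csv_op, bq_export_gcs_eval_csv_op) = preprocess_tasks(
    "yellow.trips", dag, PROJECT_ID, BUCKET, DATA_DIR)

bq_train_data_op >> bq_check_train_data_op >> bash_remove_old_data_op
bq_eval_data_op >> bq_check_eval_data_op >> bash_remove_old_data_op
bash_remove_old_data_op >> [bq_export_gcs_train_csv_op, bq_export_gcs_eval_csv_op]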
Example #25
import pendulum
from function import consts, utils

# DAG settings
DAG_NAME = 'dag_bq_to_gcs'
default_args = {
    'owner': 'terunrun',
    'depends_on_past': False,
    'start_date': datetime(2021, 2, 22, 0, 0, tzinfo=pendulum.timezone('Asia/Tokyo')),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
}
dag = DAG(DAG_NAME, schedule_interval=None, default_args=default_args, catchup=False)

# Task settings
bq_to_gcs = BigQueryToCloudStorageOperator(
    task_id='bq_to_gcs',
    # Source dataset.table to export
    source_project_dataset_table=consts.WARM_DATASET + '.' + consts.DATA_NAME + '_' + '{{ (execution_date + macros.timedelta(hours=+9)).strftime("%Y%m%d") }}',
    # Destination object name (a URI, so written in gs:// form)
    destination_cloud_storage_uris='gs://' + consts.TABLE_EXPORT_BUCKET + '/' + consts.FOLDER_NAME + '/' + consts.DATA_NAME + '_' + '{{ (execution_date + macros.timedelta(hours=+9)).strftime("%Y%m%d") }}',
    # Compression format
    compression='GZIP',
    # Export format
    export_format='CSV',
    # Whether to include a header row in the exported file
    print_header=False,
    dag=dag,
)
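# For illustration only (assumed values): for a run whose execution_date is
# 2021-02-21T15:00:00+00:00, the JST-shifted template
# {{ (execution_date + macros.timedelta(hours=+9)).strftime("%Y%m%d") }}
# renders as 20210222, so the task exports <DATA_NAME>_20210222 to
# gs://<TABLE_EXPORT_BUCKET>/<FOLDER_NAME>/<DATA_NAME>_20210222.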
Example #26
                                        and sessions.clientId = userId.clientId)
                      select userId as distinct_id, clientId, visitId, visitStartTime, date, fullVisitorId, hit.hitNumber, hit.page.pagePath, hit.appInfo.screenName , hit.page.hostname, hit.page.pageTitle, hit.type as hit_type
                      from data cross join unnest(hits) as hit"""

prepare_ga360 = BigQueryOperator(
    dag=dag,
    task_id='bq_unnest_table',
    bigquery_conn_id=google_conn_id,
    use_legacy_sql=False,
    sql=bq_extract_query.format(source_table=source_table),
    destination_dataset_table=target_table)

extract_ga360_to_gcs = BigQueryToCloudStorageOperator(
    dag=dag,
    task_id='export_recent_yesterday_to_gcs',
    bigquery_conn_id=google_conn_id,
    source_project_dataset_table=target_table,
    destination_cloud_storage_uris=[gcs_output_file],
    compression='GZIP',
    export_format='CSV')

export_gcs_to_s3 = GoogleCloudStorageToS3Operator(
    dag=dag,
    task_id="cp_gcs_to_s3",
    dest_verify=True,
    google_cloud_storage_conn_id=google_conn_id,
    bucket=gcs_bucket,
    dest_aws_conn_id='local_s3',
    dest_s3_key=redshift_s3_bucket)

load_redshift = S3ToRedshiftTransfer(dag=dag,
                                     task_id="redshift_load",
Example #27
    bash_remove_old_data_op = BashOperator(
        task_id="bash_remove_old_data_{}_task".format(model.replace(".", "_")),
        bash_command=
        "if gsutil ls {0}/taxifare/data/{1} 2> /dev/null; then gsutil -m rm -rf {0}/taxifare/data/{1}/*; else true; fi"
        .format(BUCKET, model.replace(".", "_")),
        dag=dag)

    # Takes a BigQuery dataset and table as input and exports it to GCS as a CSV
    train_files = BUCKET + "/taxifare/data/"

    bq_export_gcs_train_csv_op = BigQueryToCloudStorageOperator(
        task_id="bq_export_gcs_train_csv_{}_task".format(
            model.replace(".", "_")),
        source_project_dataset_table="{}.{}_train_data".format(
            DESTINATION_DATASET, model.replace(".", "_")),
        destination_cloud_storage_uris=[
            train_files + "{}/train-*.csv".format(model.replace(".", "_"))
        ],
        export_format="CSV",
        print_header=False,
        dag=dag)

    eval_files = BUCKET + "/taxifare/data/"

    bq_export_gcs_eval_csv_op = BigQueryToCloudStorageOperator(
        task_id="bq_export_gcs_eval_csv_{}_task".format(model.replace(
            ".", "_")),
        source_project_dataset_table="{}.{}_eval_data".format(
            DESTINATION_DATASET, model.replace(".", "_")),
        destination_cloud_storage_uris=[
            eval_files + "{}/eval-*.csv".format(model.replace(".", "_"))
Example #28
        task_id='create_temporary_table',
        sql=sql,
        destination_dataset_table=fully_qualified_tmp_table,
        bigquery_conn_id=gcp_conn_id,
        use_legacy_sql=False
    )

    fully_qualified_table_name = '{}.{}.{}'.format(project, dataset, table)
    gcs_bucket = 'moz-fx-data-prod-analysis'
    incline_prefix = 'incline/executive_dash/{{ds}}/data.ndjson'
    gcs_uri = 'gs://{bucket}/{prefix}'.format(bucket=gcs_bucket, prefix=incline_prefix)

    table_extract = BigQueryToCloudStorageOperator(
        task_id='extract_as_latest',
        source_project_dataset_table=fully_qualified_table_name,
        destination_cloud_storage_uris=[gcs_uri],
        bigquery_conn_id=gcp_conn_id,
        export_format='JSON',
        compression='GZIP'
    )

    # Drop the temporary table
    table_drop = BigQueryOperator(
        task_id='drop_temp_table',
        sql='DROP TABLE `{}`'.format(fully_qualified_tmp_table),
        bigquery_conn_id=gcp_conn_id,
        use_legacy_sql=False
    )

    wait_for_copy_deduplicate >> \
        migrated_clients >> \
        exec_dash >> \
Example #29
            DATASET_NAME),
        use_legacy_sql=False,
    )

    copy_from_selected_data = BigQueryToBigQueryOperator(
        task_id="copy-from-selected-data",
        source_project_dataset_tables="{}.selected_data_from_external_table".
        format(DATASET_NAME),
        destination_project_dataset_table=
        "{}.copy_of_selected_data_from_external_table".format(DATASET_NAME),
    )

    bigquery_to_gcs = BigQueryToCloudStorageOperator(
        task_id="bigquery-to-gcs",
        source_project_dataset_table="{}.selected_data_from_external_table".
        format(DATASET_NAME),
        destination_cloud_storage_uris=[
            "gs://{}/export-bigquery.csv".format(DATA_EXPORT_BUCKET_NAME)
        ],
    )

    create_dataset = BigQueryCreateEmptyDatasetOperator(
        task_id="create-dataset", dataset_id=DATASET_NAME)

    create_dataset_with_location = BigQueryCreateEmptyDatasetOperator(
        task_id="create_dataset_with_location",
        dataset_id=LOCATION_DATASET_NAME,
        location=BQ_LOCATION)

    create_table = BigQueryCreateEmptyTableOperator(
        task_id="create-table",
        dataset_id=DATASET_NAME,
Example #30
    'end_date': datetime(2020, 9, 30),
    'depends_on_past': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=10)
}

dag = DAG(dag_id="my-bq-task",
          default_args=default_args,
          schedule_interval='@daily')
filepath = "gs://backup-data-2020/" + 'out_' + datetime.now().strftime(
    "%d_%b_%Y:%H:%M:%S.%f") + '.csv'

t1 = BigQueryCheckOperator(
    bigquery_conn_id="my-bq-conn",
    task_id="check_for_table",
    dag=dag,
    use_legacy_sql=False,
    sql="""select count(*) from `tech-289406.sample.covid`""")

t2 = BigQueryToCloudStorageOperator(
    task_id="dataset_to_gcs",
    source_project_dataset_table="sample.covid",
    destination_cloud_storage_uris=[filepath],
    export_format="CSV",
    field_delimiter=',',
    bigquery_conn_id="my-bq-conn",
    dag=dag,
)

t1 >> t2
Example #31
execute_bq_sql = BigQueryOperator(
                        task_id='execute_bq_sql',
                        sql= query_sql,
                        use_legacy_sql=False,
                        destination_dataset_table=bq_recent_questions_table_id,
                        create_disposition='CREATE_IF_NEEDED',
                        write_disposition='WRITE_TRUNCATE',
                        dag = dag
                )

# Export the result of the temporary table to GCS

export_data_groupby = BigQueryToCloudStorageOperator(
                        task_id='export_table_temp_to_gcs',
                        source_project_dataset_table= bq_recent_questions_table_id,
                        destination_cloud_storage_uris='gs://{}/data/archivo_final_agrupado.csv'.format(params['BUCKET_ID']),
                        export_format='CSV',
                        dag = dag
                )

# The table created with the previous dataset is deleted since it is temporary
delete_bq_dataset = BashOperator(
                    task_id='delete_table_temp',
                    bash_command='bq rm -f %s' % bq_recent_questions_table_id,
                    trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
                    dag = dag

                )

# Upload the files from the final grouping, together with the original files