    task2 = CsvToStorage(
        task_id='save_imoveis'
    )

    task3 = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
        task_id='cloud_to_bigquery',
        bucket='aluguel-data-scraper',
        source_format='CSV',
        source_objects=['alugueis.csv'],
        destination_project_dataset_table='alugueis.alugueis',
        schema_fields=[
            {'name': 'Valor', 'type': 'INTEGER', 'mode': 'NULLABLE'},
            {'name': 'TipoAluguel', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'Iptu', 'type': 'INTEGER', 'mode': 'NULLABLE'},
            {'name': 'Area', 'type': 'INTEGER', 'mode': 'NULLABLE'},
            {'name': 'Quartos', 'type': 'INTEGER', 'mode': 'NULLABLE'},
            {'name': 'Banheiros', 'type': 'INTEGER', 'mode': 'NULLABLE'},
            {'name': 'Vagas', 'type': 'INTEGER', 'mode': 'NULLABLE'},
            {'name': 'Cidade', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'Endereco', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'TipoNegociacao', 'type': 'STRING', 'mode': 'NULLABLE'},
        ],
        write_disposition='WRITE_TRUNCATE',
        skip_leading_rows=1,
        autodetect=True,
        dag=dag
    )

    # execution flow
    task1
    task2.set_upstream([task1])
        ##notice trigger_rule="all_done"
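        ## with "all_done" this task runs once every upstream task has finished, even if some of them failed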
        bash_run_report_remotly_cmd = 'gcloud beta compute --project myProjectName ssh scheduler2 --internal-ip --zone us-central1-a --command "sudo -u omid python3 /home/omid/gam_data_transfer/report_example_using_service_account_with_date_range.py --start ' + temp_date + " --end " + temp_date + '"'
        run_report_remotly = BashOperator(
            task_id='run_report_remotly_' + temp_date,
            retries=0,
            bash_command=bash_run_report_remotly_cmd,
            trigger_rule="all_done")

        start.set_downstream(bash_cleanup)
        bash_cleanup.set_downstream(run_report_remotly)
        run_report_remotly.set_downstream(wait)

    ##notice trigger_rule="all_done"
    run_gsutil_mv = BashOperator(task_id='bash_gsutil_mv_cmd',
                                 retries=0,
                                 bash_command=bash_gsutil_mv_cmd,
                                 trigger_rule="all_done")

    load_to_bq_from_gcs = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
        task_id='load_to_bq_from_gcs',
        source_objects=['*'],
        skip_leading_rows=1,
        create_disposition='CREATE_NEVER',
        write_disposition='WRITE_TRUNCATE',  # overwrite any existing table contents
        bucket='myBucket/google/gam/example_report',
        destination_project_dataset_table=
        'DATA_LAKE_GOOGLE_US.example_report_partitioned')

wait >> run_gsutil_mv >> load_to_bq_from_gcs >> end
Example #3
project_dm = 'dmgcp-ingestion-poc'
location = 'US'
bq_connection_id = 'bigquery_default'

default_dag_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(0),
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
    'project_id': project_dm,
    'schedule_interval': None
}

dag = DAG(dag_id='test_wrt_3', default_args=default_dag_args)

load_csv = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
    task_id='gcs_to_bq_example',
    bucket='ebcidc-to-bq-testing-us-central',
    source_objects=['csv/wrt/wrt_output.csv'],
    destination_project_dataset_table='airflow_test.wrt_3',
    # schema_object should be an object path relative to `bucket`
    schema_object=
    'gs://kubernetes-staging-85897c950b/Dags/json/wrt-00000-of-00001.json',
    write_disposition='WRITE_TRUNCATE',
    dag=dag,
    bigquery_conn_id=bq_connection_id,
    google_cloud_storage_conn_id=bq_connection_id)

load_csv
        table_dest = record['table_dest']

        BQ_to_GCS = bigquery_to_gcs.BigQueryToCloudStorageOperator(
            # Replace ":" with valid character for Airflow task
            task_id='{}_BQ_to_GCS'.format(table_source.replace(":", "_")),
            source_project_dataset_table=table_source,
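            # a "-*" wildcard URI lets BigQuery shard large exports across multiple Avro files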
            destination_cloud_storage_uris=[
                '{}-*.avro'.format('gs://' + gcs_bucket + '/' + table_source)
            ],
            export_format='AVRO')

        # GCS_to_GCS = gcs_to_gcs.GoogleCloudStorageToGoogleCloudStorageOperator(
        #    # Replace ":" with valid character for Airflow task
        #    task_id='{}_GCS_to_GCS'.format(table_source.replace(":", "_")),
        #    source_bucket=source_bucket,
        #    source_object='{}-*.avro'.format(table_source),
        #    destination_bucket=dest_bucket,
        #    # destination_object='{}-*.avro'.format(table_dest)
        # )

        GCS_to_BQ = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
            # Replace ":" with valid character for Airflow task
            task_id='{}_GCS_to_BQ'.format(table_dest.replace(":", "_")),
            bucket=gcs_bucket,
            source_objects=['{}-*.avro'.format(table_source)],
            destination_project_dataset_table=table_dest,
            source_format='AVRO',
            write_disposition='WRITE_TRUNCATE')

        start >> BQ_to_GCS >> GCS_to_BQ >> end
}

dag = models.DAG('build_train_deploy', default_args=default_dag_args)
#[END dag_build_train_deploy]

# instantiate Google Cloud base hook to get credentials and create automl clients
gcp_hook = GoogleCloudBaseHook(conn_id='google_cloud_default')
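# note: _get_credentials() below is a private hook helper that returns the connection's google-auth credentials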
automl_client = AutoMlClient(credentials=gcp_hook._get_credentials())

# Loads the database dump from Cloud Storage to BigQuery
t1 = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
    task_id="db_dump_to_bigquery",
    bucket=COMPOSER_BUCKET_NAME,
    source_objects=[DB_DUMP_FILENAME],
    schema_object="schema_source.json",
    source_format="CSV",
    skip_leading_rows=1,
    destination_project_dataset_table="{}.{}.{}".format(
        PROJECT, DATASET, 'data_source'),
    create_disposition="CREATE_IF_NEEDED",
    write_disposition="WRITE_TRUNCATE",
    dag=dag)

# Clean the data from BigQuery to BigQuery
t2 = bigquery_operator.BigQueryOperator(
    task_id='bq_from_source_to_clean',
    bql='{}/common/clean.sql'.format(GCS_SQL),
    use_legacy_sql=False,
    allow_large_results=True,
    destination_dataset_table="{}.{}.{}".format(PROJECT, DATASET,
                                                'data_cleaned'),
    create_disposition="CREATE_IF_NEEDED",
Example #6
default_dag_args = {
    'start_date': datetime.datetime(2018, 1, 1),
}

with models.DAG('Airflow', default_args=default_dag_args) as dag:

    def cleaning():
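        # pandas resolves gs:// paths through gcsfs, which must be available in the worker environment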
        df = pd.read_csv("gs://heena_dawani/P9-ConsoleGames.csv")
        df["Platform"].fillna("N/A", inplace=True)
        df["NA_Sales"].fillna(0, inplace=True)
        df["EU_Sales"].fillna(0, inplace=True)
        df["JP_Sales"].fillna(0, inplace=True)
        df["Other_Sales"].fillna(0, inplace=True)
        new_data = df.dropna(axis=0, how='any')
        new_data.to_csv("gs://heena_dawani/games.csv")

    Cleaning = python_operator.PythonOperator(task_id='Cleaning',
                                              python_callable=cleaning)

    GCS_to_BQ = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
        task_id='GCS_to_BQ',
        bucket='heena_dawani',
        skip_leading_rows=1,
        autodetect=True,
        source_objects=["games.csv"],
        destination_project_dataset_table='heena_dawani.heena',
        source_format='CSV',
        write_disposition='WRITE_TRUNCATE')
    Cleaning >> GCS_to_BQ
Example #7
    'retry_delay': datetime.timedelta(minutes=5),
}

# --------------------------------------------------------------------------------
# Main DAG
# --------------------------------------------------------------------------------

# Define a DAG (directed acyclic graph) of tasks.
# Any task you create within the context manager is automatically added to the
# DAG object.
with models.DAG('composer_sample_bq_copy_across_locations',
                default_args=default_args,
                schedule_interval=None) as dag:

    start = dummy_operator.DummyOperator(task_id='start',
                                         trigger_rule='all_success')

    end = dummy_operator.DummyOperator(task_id='end',
                                       trigger_rule='all_success')

    GCS_to_BQ = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
        # Replace ":" with valid character for Airflow task
        task_id='copy_csv_from_storage_to_bq',
        bucket='baketto1',
        source_objects=['avocado.csv'],
        destination_project_dataset_table=
        'micro-store-218714.avocadoDataset.avocado5',
        source_format='CSV',
        write_disposition='WRITE_TRUNCATE')

    start >> GCS_to_BQ >> end
Example #8
    dst=
    'data/spinnup/fraud_detection_v01/year={{ macros.ds_format(ds, \'%Y-%m-%d\', \'%Y\') }}/month={{ macros.ds_format(ds, \'%Y-%m-%d\', \'%Y%m\') }}/day={{ macros.ds_format(ds, \'%Y-%m-%d\', \'%Y%m%d\') }}/v01_results{{ds_nodash}}.csv',
    bucket='{{ var.value.project_bucket }}',
    conn_id='google_cloud_default',
    src='%s/v01_results.csv' % (DIR),
    dag=dag)

# Load results from GCS to BQ
load_to_bq = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
    task_id='load_to_bq',
    bucket='{{ var.value.project_bucket }}',
    source_objects=[
        'data/spinnup/fraud_detection_v01/year={{ macros.ds_format(ds, \'%Y-%m-%d\', \'%Y\') }}/month={{ macros.ds_format(ds, \'%Y-%m-%d\', \'%Y%m\') }}/day={{ macros.ds_format(ds, \'%Y-%m-%d\', \'%Y%m%d\') }}/v01_results{{ds_nodash}}.csv'
    ],
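    # the "$YYYYMMDD" table decorator loads into a single daily partition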
    destination_project_dataset_table=
    'umg-data-science.detect_fraud_spinnup.v01_results${{ ds_nodash }}',
    schema_object=
    'builds/flow/schemas/spinnup/spinnup_fraud_detection_v01_schema.json',
    source_format='CSV',
    field_delimiter=',',
    bigquery_conn_id='bigquery_default',
    skip_leading_rows=0,
    write_disposition='WRITE_TRUNCATE',
    dag=dag)

# Delete directory from airflow instance
delete_directory = BashOperator(task_id='delete_directory_from_instance',
                                bash_command='rm -rf {}'.format(DIR),
                                dag=dag)

# dag structure
spinnup_partition_sensor >> generate_dataset >> mkdir >> download_script >> generate_results >> upload_to_gcs >> load_to_bq >> delete_directory
Example #9
    MOVE_SFTP_FILE_TO_GCS = sftp_operator.SFTPOperator(
        task_id="move_sftp_file_to_gcs",
        ssh_conn_id=SSH_CONN_ID,
        google_cloud_storage_conn_id=GOOGLE_CLOUD_STORAGE_CONN_ID,
        local_filepath=LOCAL_FILEPATH,
        remote_filepath=REMOTE_FILEPATH,
        operation=OPERATION,
        confirm=True)

    MOVE_GCS_FILE_TO_BQ = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
        task_id="move_gcs_file_to_bq",
        bucket=BUCKET,
        source_objects=[SOURCE_OBJECTS],
        destination_project_dataset_table="{{0}_temp}".format(
            DESTINATION_PROJECT_DATASET_TABLE_MAIN),
        skip_leading_rows=1,
        autodetect=True,
        source_format=SOURCE_FORMAT,
        create_disposition=CREATE_DISPOSITION,
        write_disposition='WRITE_APPEND',
        google_cloud_storage_conn_id=GOOGLE_CLOUD_STORAGE_CONN_ID,
        bigquery_conn_id=BIGQUERY_CONN_ID)

    BQ_TO_BQ_MERGING = bigquery_operator.BigQueryOperator(
        task_id="gbq_merging_tables",
        sql=SQL,
        use_legacy_sql=False,
        destination_project_dataset_table=
        DESTINATION_PROJECT_DATASET_TABLE_MAIN,
        write_disposition='WRITE_EMPTY',
        create_disposition=CREATE_DISPOSITION,
Example #10
        jsonl_file_names_format = src_nodes_ways_relations_gcs_dir + '{}_{}.jsonl'

        for element_and_schema in elements_and_schemas:
            element, schema = element_and_schema
            task_id = element + '_json_to_bq_{}_{}'.format(
                index + 1, addt_mn_gke_pool_num_nodes)
            source_object = jsonl_file_names_format.format(element, index)
            destination_dataset_table = '{}.history_{}'.format(
                bq_dataset_to_export, element)

            json_to_bq_task = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
                task_id=task_id,
                bucket=src_nodes_ways_relations_gcs_bucket,
                source_objects=[source_object],
                source_format='NEWLINE_DELIMITED_JSON',
                destination_project_dataset_table=destination_dataset_table,
                schema_fields=schema,
                write_disposition='WRITE_APPEND',
                max_bad_records=max_bad_records_for_bq_export,
                retries=5,
                dag=dag)
            json_to_bq_tasks.append(json_to_bq_task)
            remove_json_task = gcs_delete_operator.GoogleCloudStorageDeleteOperator(
                task_id='remove_json-{}-{}-{}'.format(
                    element, index + 1, addt_mn_gke_pool_num_nodes),
                bucket_name=src_nodes_ways_relations_gcs_bucket,
                objects=[source_object])
            json_to_bq_task.set_downstream(remove_json_task)

            update_result_shard_timestamp_task = python_operator.PythonOperator(
                task_id='update-result-shard-timestamp-{}-{}-{}'.format(
Example #11
        bucket=source_bucket,
        ts_func=context['dag'].context['execution_date'],
        object=
        "{{ task_instance.xcom_pull(task_ids='get_file', key='file_name') }}",
        provide_context=True)

    fork = BranchPythonOperator(task_id='fork',
                                python_callable=branch,
                                provide_context=True)

    sink_bq = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
        task_id='sink_bq',
        bucket=source_bucket,
        source_objects=[
            "{{ task_instance.xcom_pull(task_ids='get_file', key='file_name') }}"
        ],  # pulls file_name
        destination_project_dataset_table='omniture.omniture_raw_2017_2027',
        schema_fields=bq_schema,
        compression=compression_type,
        field_delimiter=field_delimiter_type,
        write_disposition=write_disposition_type)

    move_sink_file = gcs_to_gcs.GoogleCloudStorageToGoogleCloudStorageOperator(
        task_id='move_sink_file',
        source_bucket=source_bucket,
        source_object=
        "{{ task_instance.xcom_pull(task_ids='get_file', key='file_name') }}",
        destination_bucket=del_bucket,
        destination_object='processed_sink_files/' +
        "{{ task_instance.xcom_pull(task_ids='get_file', key='file_name') }}",
        move_object=True,
from builtins import range
from datetime import timedelta
from airflow.models import DAG
from airflow.utils.dates import days_ago
from airflow.contrib.operators import gcs_to_bq

args = {
    'owner': 'Airflow',
    'start_date': days_ago(2),
}

dag = DAG(dag_id='Big_Data_Demystified_gcs_to_bq_example',
          default_args=args,
          schedule_interval='0 0 * * *',
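          # cron schedule: once a day at midnight (UTC)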
          dagrun_timeout=timedelta(minutes=60),
          tags=['example'])

gcs2bq = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
    task_id='Big_Data_Demystified_gcs2bq',
    bucket='MyBucket',
    source_objects=['*'],
    destination_project_dataset_table='MyDataSet.MyTable',
    skip_leading_rows=1,
    dag=dag,
)
    # To email on failure or retry set 'email' arg to your email and enable
    # emailing here.
    'email_on_failure': False,
    'email_on_retry': False,
    # If a task fails, retry it once after waiting at least 5 minutes
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': models.Variable.get('gcp_project')
}

with models.DAG(
        'android_reviews_load_to_bigQuery',
        # Continue to run DAG once per day
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:

    # load the Android reviews from the GCS bucket into BigQuery
    logging.info('Loading Android reviews from GCS to BigQuery')
    load_to_bq_from_gcs = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
        task_id='load_to_bq_from_gcs',
        source_objects=['*'],
        skip_leading_rows=1,
        write_disposition='WRITE_TRUNCATE',  # overwrite any existing table contents
        bucket=DST_BUCKET_UTF8,
        destination_project_dataset_table='DATA.Replica_android_review')

# Define DAG dependencies.
#create_dataproc_cluster >> run_dataproc_hadoop >> delete_dataproc_cluster

load_to_bq_from_gcs
Example #14
                                    provide_context=True,
                                    dag=dag)


t3 = file_to_gcs.FileToGoogleCloudStorageOperator(task_id="upload_raw_data",
                                                  src="user.csv",
                                                  dst="data/user.csv",
                                                  retry_delay=5,
                                                  retries=1,
                                                  bucket="asia-northeast1-example-env-c50e72d7-bucket",
                                                  dag=dag)

t4 = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(task_id="load_into_bq",
                                                    bucket="asia-northeast1-example-env-c50e72d7-bucket",
                                                    source_objects=["data/user.csv"],
                                                    schema_fields=[
                                                        {'name': 'active', 'type': 'BOOL'},
                                                        {'name': 'email', 'type': 'STRING'},
                                                        {'name': 'employee_number', 'type': 'STRING'},
                                                        {'name': 'name', 'type': 'STRING'},
                                                        {'name': 'location', 'type': 'STRING'},
                                                        {'name': 'sys_tags', 'type': 'STRING'}
                                                    ],
                                                    destination_project_dataset_table="composer_demo.user",
                                                    write_disposition='WRITE_TRUNCATE',
                                                    autodetect=True,
                                                    dag=dag)


t1 >> t2 >> t3 >> t4
        bash_api_call_GET_DESKTOP_TRAFFIC = BashOperator(
            task_id='bash_api_call_GET_DESKTOP_TRAFFIC' +
            single_date.strftime("%Y%m%d"),
            bash_command=bash_cmd)

        bash_cmd2 = """gsutil mv /tmp/file_""" + single_date.strftime(
            "%Y%m%d") + '.json gs://data_lake/similar_web_desktop_traffic/'
        bash_gsutil_mv_files_to_ingestion = BashOperator(
            task_id='bash_gsutil_mv_files_to_ingestion' +
            single_date.strftime("%Y%m%d"),
            bash_command=bash_cmd2)
        #bash_cmd="""ls"""
        #bash_api_call_GET_DESKTOP_TRAFFIC = BashOperator(task_id='bash_opr_'+str(item),bash_command=bash_cmd)
        start.set_downstream(bash_api_call_GET_DESKTOP_TRAFFIC)
        bash_api_call_GET_DESKTOP_TRAFFIC.set_downstream(
            bash_gsutil_mv_files_to_ingestion)
        bash_gsutil_mv_files_to_ingestion.set_downstream(wait)

    load_to_bg_GET_DESKTOP_TRAFFIC = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
        task_id='load_to_bg_GET_DESKTOP_TRAFFIC',
        source_objects=['*'],
        write_disposition='WRITE_TRUNCATE',  # overwrite any existing table contents
        create_disposition='CREATE_IF_NEEDED',
        bucket=DST_BUCKET,
        destination_project_dataset_table=dst_table,
        autodetect=True)

    end = DummyOperator(task_id='end')

wait >> load_to_bg_GET_DESKTOP_TRAFFIC >> end
Example #16
    create_test_dataset = bash_operator.BashOperator(
        task_id='create_airflow_test_dataset',
        bash_command='bq mk airflow_test',
        dag=dag)

    # [START howto_operator_gcs_to_bq]
    load_csv = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
        task_id='gcs_to_bq_example',
        bucket='cloud-samples-data',
        source_objects=['bigquery/us-states/us-states.csv'],
        destination_project_dataset_table='airflow_test.gcs_to_bq_table',
        schema_fields=[
            {
                'name': 'name',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'post_abbr',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
        ],
        write_disposition='WRITE_TRUNCATE',
        dag=dag)
    # [END howto_operator_gcs_to_bq]

    delete_test_dataset = bash_operator.BashOperator(
        task_id='delete_airflow_test_dataset',
        bash_command='bq rm -rf airflow_test',
        dag=dag)
move_users_data_to_staging = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
    task_id='move_users_data_to_staging',
    bucket='demo-sample',
    source_objects=['so_users.json'],
    write_disposition='WRITE_TRUNCATE',
    schema_fields=[{
        'name': 'id',
        'type': 'INTEGER',
        'mode': 'REQUIRED'
    }, {
        'name': 'display_name',
        'type': 'STRING',
        'mode': 'NULLABLE'
    }, {
        'name': 'age',
        'type': 'STRING',
        'mode': 'NULLABLE'
    }, {
        'name': 'reputation',
        'type': 'INTEGER',
        'mode': 'NULLABLE'
    }, {
        'name': 'up_votes',
        'type': 'INTEGER',
        'mode': 'NULLABLE'
    }, {
        'name': 'down_votes',
        'type': 'INTEGER',
        'mode': 'NULLABLE'
    }, {
        'name': 'views',
        'type': 'INTEGER',
        'mode': 'NULLABLE'
    }],
    skip_leading_rows=1,
    ignore_unknown_values=True,
    allow_jagged_rows=True,
    create_disposition='CREATE_IF_NEEDED',
    source_format='NEWLINE_DELIMITED_JSON',
    retries=2,
    retry_delay=timedelta(seconds=15),
    destination_project_dataset_table='demo_temp.stackoverflow_users',
    dag=dag)