Example 1
    def test_description_external_table(self, bq_hook):

        operator = GCSToBigQueryOperator(
            task_id=TASK_ID,
            bucket=TEST_BUCKET,
            source_objects=TEST_SOURCE_OBJECTS,
            destination_project_dataset_table=TEST_EXPLICIT_DEST,
            description=DESCRIPTION,
            external_table=True,
        )

        operator.execute(None)
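        # external_table=True makes the operator define a BigQuery external table over
        # the GCS objects (via create_external_table) instead of running a load job.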
        # fmt: off
        bq_hook.return_value.get_conn.return_value.cursor.return_value.create_external_table. \
            assert_called_once_with(
                external_project_dataset_table=mock.ANY,
                schema_fields=mock.ANY,
                source_uris=mock.ANY,
                source_format=mock.ANY,
                compression=mock.ANY,
                skip_leading_rows=mock.ANY,
                field_delimiter=mock.ANY,
                max_bad_records=mock.ANY,
                quote_character=mock.ANY,
                ignore_unknown_values=mock.ANY,
                allow_quoted_newlines=mock.ANY,
                allow_jagged_rows=mock.ANY,
                encoding=mock.ANY,
                src_fmt_configs=mock.ANY,
                encryption_configuration=mock.ANY,
                labels=mock.ANY,
                description=DESCRIPTION,
            )
Example 2
    def test_execute_explicit_project(self, bq_hook):
        operator = GCSToBigQueryOperator(
            task_id=TASK_ID,
            bucket=TEST_BUCKET,
            source_objects=TEST_SOURCE_OBJECTS,
            destination_project_dataset_table=TEST_EXPLICIT_DEST,
            max_id_key=MAX_ID_KEY,
        )

        # using non-legacy SQL
        bq_hook.return_value.get_conn.return_value.cursor.return_value.use_legacy_sql = False

        operator.execute(None)

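        # With max_id_key set, the operator runs SELECT MAX(<key>) on the destination
        # table after the load; use_legacy_sql=False yields standard-SQL backtick quoting.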
        bq_hook.return_value.get_conn.return_value.cursor.return_value.execute.assert_called_once_with(
            "SELECT MAX(id) FROM `test-project.dataset.table`")
Example 3
def bs_customer_invoice_chinook_dag():
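    # Extract customer/invoice data from the Chinook SQLite database, stage it in GCS
    # as a headerless CSV, then load it into BigQuery with an explicit schema.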
    @task()
    def extract_transform():
        conn = sqlite3.connect(f"{DATA_PATH}/chinook.db")
        with open(f"{BASE_PATH}/sql/chinook.sql", "r") as query:
            df = pd.read_sql(query.read(), conn)
        df.to_csv(OUT_PATH, index=False, header=False)  # write the CSV without an index column or header row

    start = DummyOperator(task_id='start')
    end = DummyOperator(task_id='end')
    extracted_transformed_data = extract_transform()

    stored_data_gcs = LocalFilesystemToGCSOperator(
        task_id="store_to_gcs",
        gcp_conn_id=GOOGLE_CLOUD_CONN_ID,
        src=OUT_PATH,
        dst=GCS_OBJECT_NAME,
        bucket=BUCKET_NAME
    )

    loaded_data_bigquery = GCSToBigQueryOperator(
        task_id='load_to_bigquery',
        bigquery_conn_id=GOOGLE_CLOUD_CONN_ID,
        bucket=BUCKET_NAME,
        source_objects=[GCS_OBJECT_NAME],
        destination_project_dataset_table=f"{DATASET_ID}.{BIGQUERY_TABLE_NAME}",
        schema_fields=[ #based on https://cloud.google.com/bigquery/docs/schemas
            {'name': 'customer_id', 'type': 'INT64', 'mode': 'REQUIRED'},
            {'name': 'full_name', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'company', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'address', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'city', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'state', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'country', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'postal_code', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'phone', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'fax', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'email', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'invoice_id', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'invoice_date', 'type': 'DATE', 'mode': 'NULLABLE'},
            {'name': 'billing_address', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'billing_city', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'billing_state', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'billing_country', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'billing_postal_code', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'total', 'type': 'FLOAT64', 'mode': 'NULLABLE'},
        ], 
        autodetect=False,
        write_disposition='WRITE_TRUNCATE',  # overwrite the table data if the table already exists
    )

    start >> extracted_transformed_data
    extracted_transformed_data >> stored_data_gcs
    stored_data_gcs >> loaded_data_bigquery
    loaded_data_bigquery >> end
Example 4
    def test_labels(self, bq_hook):

        operator = GCSToBigQueryOperator(
            task_id=TASK_ID,
            bucket=TEST_BUCKET,
            source_objects=TEST_SOURCE_OBJECTS,
            destination_project_dataset_table=TEST_EXPLICIT_DEST,
            labels=LABELS,
        )

        operator.execute(None)

        bq_hook.return_value.get_conn.return_value.cursor.return_value.run_load.assert_called_once_with(
            destination_project_dataset_table=mock.ANY,
            schema_fields=mock.ANY,
            source_uris=mock.ANY,
            source_format=mock.ANY,
            autodetect=mock.ANY,
            create_disposition=mock.ANY,
            skip_leading_rows=mock.ANY,
            write_disposition=mock.ANY,
            field_delimiter=mock.ANY,
            max_bad_records=mock.ANY,
            quote_character=mock.ANY,
            ignore_unknown_values=mock.ANY,
            allow_quoted_newlines=mock.ANY,
            allow_jagged_rows=mock.ANY,
            encoding=mock.ANY,
            schema_update_options=mock.ANY,
            src_fmt_configs=mock.ANY,
            time_partitioning=mock.ANY,
            cluster_fields=mock.ANY,
            encryption_configuration=mock.ANY,
            labels=LABELS,
            description=mock.ANY,
        )
Example 5
def bs_file1000_dag():
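    # Read file_1000.xls, derive full_name/gender/date, rename and reorder the columns,
    # write a headerless CSV, upload it to GCS, and load it into BigQuery.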
    @task()
    def extract_transform():
        df = pd.read_excel(f"{DATA_PATH}/file_1000.xls",
                           index_col=0).reset_index(drop=True)
        df = df.drop(columns='First Name.1')
        df['full_name'] = df['First Name'] + " " + df['Last Name']
        df['gender'] = df['Gender'].apply(lambda row: 'M'
                                          if row == 'Male' else 'F')
        df['date'] = pd.to_datetime(df['Date'],
                                    format='%d/%m/%Y',
                                    errors='coerce')
        df = df.drop(columns=['Date', 'Gender'])
        df.columns = [
            'first_name', 'last_name', 'country', 'age', 'id', 'full_name',
            'gender', 'date'
        ]
        df = df.reindex(columns=[
            'id', 'first_name', 'last_name', 'full_name', 'date', 'age',
            'gender', 'country'
        ]).reset_index(drop=True)

        df.to_csv(OUT_PATH, index=False, header=False)

    start = DummyOperator(task_id='start')
    end = DummyOperator(task_id='end')
    extract_task = extract_transform()

    stored_data_gcs = LocalFilesystemToGCSOperator(
        task_id="store_to_gcs",
        gcp_conn_id=GOOGLE_CLOUD_CONN_ID,
        src=OUT_PATH,
        dst=GCS_OBJECT_NAME,
        bucket=BUCKET_NAME)

    loaded_data_bigquery = GCSToBigQueryOperator(
        task_id='load_to_bigquery',
        bigquery_conn_id=GOOGLE_CLOUD_CONN_ID,
        bucket=BUCKET_NAME,
        source_objects=[GCS_OBJECT_NAME],
        destination_project_dataset_table=f"{DATASET_ID}.{BIGQUERY_TABLE_NAME}",
        schema_fields=[  #based on https://cloud.google.com/bigquery/docs/schemas
            {
                'name': 'id',
                'type': 'INT64',
                'mode': 'REQUIRED'
            },
            {
                'name': 'first_name',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'last_name',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'full_name',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'date',
                'type': 'DATE',
                'mode': 'NULLABLE'
            },
            {
                'name': 'age',
                'type': 'INT64',
                'mode': 'NULLABLE'
            },
            {
                'name': 'gender',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'country',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
        ],
        autodetect=False,
        write_disposition='WRITE_TRUNCATE',  # overwrite the table data if the table already exists
    )

    start >> extract_task
    extract_task >> stored_data_gcs
    stored_data_gcs >> loaded_data_bigquery
    loaded_data_bigquery >> end
    # [START howto_operator_facebook_ads_to_gcs]
    run_operator = FacebookAdsReportToGcsOperator(
        task_id='run_fetch_data',
        owner='airflow',
        bucket_name=GCS_BUCKET,
        parameters=PARAMETERS,
        fields=FIELDS,
        gcp_conn_id=GCS_CONN_ID,
        object_name=GCS_OBJ_PATH,
    )
    # [END howto_operator_facebook_ads_to_gcs]

    load_csv = GCSToBigQueryOperator(
        task_id='gcs_to_bq_example',
        bucket=GCS_BUCKET,
        source_objects=[GCS_OBJ_PATH],
        destination_project_dataset_table=f"{DATASET_NAME}.{TABLE_NAME}",
        write_disposition='WRITE_TRUNCATE',
    )

    read_data_from_gcs_many_chunks = BigQueryInsertJobOperator(
        task_id="read_data_from_gcs_many_chunks",
        configuration={
            "query": {
                "query": f"SELECT COUNT(*) FROM `{GCP_PROJECT_ID}.{DATASET_NAME}.{TABLE_NAME}`",
                "useLegacySql": False,
            }
        },
    )

    delete_bucket = GCSDeleteBucketOperator(
local_pq_to_gcs = LocalFilesystemToGCSOperator(
    task_id='local_pq_to_gcs',
    src=func_param['source_transform'],
    dst=func_param['dest_blob_transform'],
    bucket=func_param['bucket_name'],
    gcp_conn_id='google_cloud_default',
    trigger_rule='all_done',
    dag=dag)

load_gcs_pq_to_bq = GCSToBigQueryOperator(
    task_id='load_gcs_pq_to_bq',
    bucket=func_param['bucket_name'],
    source_objects=[func_param['dest_blob_transform']],  # source_objects expects a list of GCS object names
    destination_project_dataset_table='project_four_airflow.sales',
    source_format='PARQUET',
    write_disposition='WRITE_APPEND',
    google_cloud_storage_conn_id='google_cloud_default',
    trigger_rule='all_done',
    dag=dag)

bigquery_is_up_to_date = DummyOperator(task_id='bigquery_is_up_to_date')
csv_file_not_exist = DummyOperator(task_id='csv_file_not_exist')
csv_file_exist = DummyOperator(task_id='csv_file_exist')
check_mysql = DummyOperator(task_id='check_mysql')
updating_dataset = DummyOperator(task_id='updating_dataset')

check_data >> check_mysql >> check_dataset
check_data >> csv_file_not_exist >> download_zip >> load_to_mysql >> check_dataset
check_data >> csv_file_exist >> load_to_mysql >> check_dataset
    # [START howto_google_display_video_deletequery_report_operator]
    delete_report = GoogleDisplayVideo360DeleteReportOperator(report_id=report_id, task_id="delete_report")
    # [END howto_google_display_video_deletequery_report_operator]

    create_report >> run_report >> wait_for_report >> get_report >> delete_report

with models.DAG(
    "example_display_video_misc",
    schedule_interval=None,  # Override to match your needs
    start_date=dates.days_ago(1),
) as dag2:
    # [START howto_google_display_video_upload_multiple_entity_read_files_to_big_query]
    upload_erf_to_bq = GCSToBigQueryOperator(
        task_id='upload_erf_to_bq',
        bucket=BUCKET,
        source_objects=ERF_SOURCE_OBJECT,
        destination_project_dataset_table=f"{BQ_DATA_SET}.gcs_to_bq_table",
        write_disposition='WRITE_TRUNCATE',
    )
    # [END howto_google_display_video_upload_multiple_entity_read_files_to_big_query]

    # [START howto_google_display_video_download_line_items_operator]
    download_line_items = GoogleDisplayVideo360DownloadLineItemsOperator(
        task_id="download_line_items",
        request_body=DOWNLOAD_LINE_ITEMS_REQUEST,
        bucket_name=BUCKET,
        object_name=OBJECT_NAME,
        gzip=False,
    )
    # [END howto_google_display_video_download_line_items_operator]
Example 9
def bs_disaster_dag():
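    # Clean the text/location columns of disaster_data.csv (collapse whitespace, strip
    # most punctuation), write a headerless CSV, upload it to GCS, and load it into BigQuery.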
    @task()
    def extract_transform():
        df = pd.read_csv(f"{DATA_PATH}/disaster_data.csv")
        columns = ['text', 'location']
        for column in columns:
            df[column] = df[column].str.replace(r'\s{2,}', ' ', regex=True)
            df[column] = df[column].str.replace(r"[^a-zA-Z0-9\,]",
                                                ' ',
                                                regex=True)

        df.to_csv(OUT_PATH, index=False, header=False)

    start = DummyOperator(task_id='start')
    end = DummyOperator(task_id='end')
    extract_transform_task = extract_transform()

    stored_data_gcs = LocalFilesystemToGCSOperator(
        task_id="store_to_gcs",
        gcp_conn_id=GOOGLE_CLOUD_CONN_ID,
        src=OUT_PATH,
        dst=GCS_OBJECT_NAME,
        bucket=BUCKET_NAME)

    loaded_data_bigquery = GCSToBigQueryOperator(
        task_id='load_to_bigquery',
        bigquery_conn_id=GOOGLE_CLOUD_CONN_ID,
        bucket=BUCKET_NAME,
        source_objects=[GCS_OBJECT_NAME],
        destination_project_dataset_table=f"{DATASET_ID}.{BIGQUERY_TABLE_NAME}",
        schema_fields=[  #based on https://cloud.google.com/bigquery/docs/schemas
            {
                'name': 'id',
                'type': 'INT64',
                'mode': 'REQUIRED'
            },
            {
                'name': 'keyword',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'location',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'text',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'target',
                'type': 'INT64',
                'mode': 'NULLABLE'
            },
        ],
        autodetect=False,
        write_disposition='WRITE_TRUNCATE',  # overwrite the table data if the table already exists
    )

    start >> extract_transform_task
    extract_transform_task >> stored_data_gcs
    stored_data_gcs >> loaded_data_bigquery
    loaded_data_bigquery >> end
Example 10
def bs_reviews_dag():
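    # Extract every local file whose name starts with "reviews" (CSV or Excel), merge the
    # frames via JSON passed through XCom, then stage the combined CSV in GCS and load it into BigQuery.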
    @task()
    def merge_reviews(reviews: list):
        df_merge = pd.concat([pd.read_json(review) for review in reviews],
                             ignore_index=True)
        print(df_merge)
        df_merge.to_csv(OUT_PATH, index=False, header=False)

    @task()
    def extract_reviews(filename):
        print(filename)
        file_path = f"{DATA_PATH}/{filename}"
        if 'csv' in filename:
            df = pd.read_csv(file_path)
        else:
            df = pd.read_excel(file_path)
        print(df)
        return df.to_json()

    start = DummyOperator(task_id='start')
    end = DummyOperator(task_id='end')

    filenames = os.listdir(DATA_PATH)
    filtered_filename = list(
        filter(lambda filename: re.match(r"(^reviews)", filename), filenames))

    extracted_list = []
    for filename in filtered_filename:
        extracted_list.append(extract_reviews(filename))

    merged = merge_reviews(extracted_list)

    stored_data_gcs = LocalFilesystemToGCSOperator(
        task_id="store_to_gcs",
        gcp_conn_id=GOOGLE_CLOUD_CONN_ID,
        src=OUT_PATH,
        dst=GCS_OBJECT_NAME,
        bucket=BUCKET_NAME)

    loaded_data_bigquery = GCSToBigQueryOperator(
        task_id='load_to_bigquery',
        bigquery_conn_id=GOOGLE_CLOUD_CONN_ID,
        bucket=BUCKET_NAME,
        source_objects=[GCS_OBJECT_NAME],
        destination_project_dataset_table=f"{DATASET_ID}.{BIGQUERY_TABLE_NAME}",
        schema_fields=[  #based on https://cloud.google.com/bigquery/docs/schemas
            {
                'name': 'listing_id',
                'type': 'INT64',
                'mode': 'NULLABLE'
            },
            {
                'name': 'id',
                'type': 'INT64',
                'mode': 'REQUIRED'
            },
            {
                'name': 'date',
                'type': 'DATE',
                'mode': 'NULLABLE'
            },
            {
                'name': 'reviewer_id',
                'type': 'INT64',
                'mode': 'NULLABLE'
            },
            {
                'name': 'reviewer_name',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'comments',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
        ],
        autodetect=False,
        allow_quoted_newlines=True,
        write_disposition='WRITE_TRUNCATE',  # overwrite the table data if the table already exists
    )

    start >> extracted_list >> merged
    merged >> stored_data_gcs
    stored_data_gcs >> loaded_data_bigquery >> end
Example 11
load_users = GCSToBigQueryOperator(
    task_id='gcs_to_bigquery_users',
    bucket='airflow-postgres',
    source_objects=['users.csv'],
    destination_project_dataset_table=f"{DATASET_NAME}.{TABLE_NAME}",
    schema_fields=[
        {
            'name': 'created_at',
            'type': 'TIMESTAMP',
            'mode': 'REQUIRED'
        },
        {
            'name': 'first_name',
            'type': 'STRING',
            'mode': 'REQUIRED'
        },
        {
            'name': 'id',
            'type': 'STRING',
            'mode': 'REQUIRED'
        },
        {
            'name': 'last_name',
            'type': 'STRING',
            'mode': 'REQUIRED'
        },
        {
            'name': 'updated_at',
            'type': 'TIMESTAMP',
            'mode': 'REQUIRED'
        },
    ],
    write_disposition='WRITE_TRUNCATE',
    source_format='CSV',
    encoding='UTF-8',
    dag=dag,
)
Example 12
def bs_tweets_dag():
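    # Flatten tweet_data.json into two headerless CSVs (tweets and their embedded user
    # objects), upload both to GCS, and load them into separate BigQuery tables.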
    @task()
    def extract_transform_tweets():
      df = pd.read_json(f"{DATA_PATH}/tweet_data.json", lines=True)
      df['created_at'] = df['created_at'].dt.tz_convert(None)
      columns = ['text', 'source']
      for column in columns:
        df[column] = df[column].str.replace(r"[^a-zA-Z0-9\,#@]", ' ', regex=True)
        df[column] = df[column].str.replace(r"\s{2,}", ' ', regex=True)

      drop_columns = [
          'extended_entities', 'contributors', 'entities', 'retweeted_status',
          'user', 'in_reply_to_user_id_str', 'in_reply_to_status_id_str'
      ]
      filtered_columns = [col for col in df.columns if col not in drop_columns]
      df_filtered = df[filtered_columns]
      df_filtered.to_csv(OUT_TWEETS_PATH, index=False, header=False)
      
    @task()
    def extract_transform_tweets_user():
      df = pd.read_json(f"{DATA_PATH}/tweet_data.json", lines=True)
      users = [ {**row['user'], 'tweet_id': row['id']} for _, row in df.iterrows() ]

      df_users = pd.DataFrame(users)
      df_users['created_at'] = pd.to_datetime(df_users['created_at'], 
                                              format='%a %b %d %H:%M:%S %z %Y'
                                              ).dt.tz_convert(None)
      other_columns = [col for col in df_users.columns if col not in ('id', 'tweet_id')]
      df_users = df_users.reindex(columns=['id', 'tweet_id', *other_columns])

      df_users.to_csv(OUT_TWEETS_USER_PATH, index=False, header=False)

    start = DummyOperator(task_id='start')
    end = DummyOperator(task_id='end')
    
    et_tweets = extract_transform_tweets()
    et_tweets_user = extract_transform_tweets_user()

    stored_tweets_data_gcs = LocalFilesystemToGCSOperator(
        task_id="store_tweets_to_gcs",
        gcp_conn_id=GOOGLE_CLOUD_CONN_ID,
        src=OUT_TWEETS_PATH,
        dst=GCS_OBJECT_TWEETS_NAME,
        bucket=BUCKET_NAME
    )

    stored_tweets_user_data_gcs = LocalFilesystemToGCSOperator(
        task_id="store_tweets_user_to_gcs",
        gcp_conn_id=GOOGLE_CLOUD_CONN_ID,
        src=OUT_TWEETS_USER_PATH,
        dst=GCS_OBJECT_TWEETS_USER_NAME,
        bucket=BUCKET_NAME
    )

    loaded_tweets_data_bigquery = GCSToBigQueryOperator(
        task_id='load_tweets_to_bigquery',
        bigquery_conn_id=GOOGLE_CLOUD_CONN_ID,
        bucket=BUCKET_NAME,
        source_objects=[GCS_OBJECT_TWEETS_NAME],
        destination_project_dataset_table=f"{DATASET_ID}.{BIGQUERY_TWEETS_TABLE_NAME}",
        schema_fields=[ #based on https://cloud.google.com/bigquery/docs/schemas
            {'name': 'truncated', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'text', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'is_quote_status', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'in_reply_to_status_id', 'type': 'FLOAT64', 'mode': 'NULLABLE'},
            {'name': 'in_reply_to_user_id', 'type': 'FLOAT64', 'mode': 'NULLABLE'},
            {'name': 'id', 'type': 'INT64', 'mode': 'REQUIRED'},            
            {'name': 'favorite_count', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'retweeted', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'coordinates', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'source', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'in_reply_to_screen_name', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'id_str', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'retweet_count', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'metadata', 'type': 'STRING', 'mode': 'NULLABLE'},            
            {'name': 'favorited', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'geo', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'lang', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'created_at', 'type': 'DATETIME', 'mode': 'NULLABLE'},
            {'name': 'place', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'quoted_status_id', 'type': 'FLOAT64', 'mode': 'NULLABLE'},
            {'name': 'quoted_status', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'possibly_sensitive', 'type': 'FLOAT64', 'mode': 'NULLABLE'},
            {'name': 'quoted_status_id_str', 'type': 'FLOAT64', 'mode': 'NULLABLE'},
        ], 
        autodetect=False,
        write_disposition='WRITE_TRUNCATE',  # overwrite the table data if the table already exists
    )

    loaded_tweets_user_data_bigquery = GCSToBigQueryOperator(
        task_id='load_tweets_user_to_bigquery',
        bigquery_conn_id=GOOGLE_CLOUD_CONN_ID,
        bucket=BUCKET_NAME,
        source_objects=[GCS_OBJECT_TWEETS_USER_NAME],
        destination_project_dataset_table=f"{DATASET_ID}.{BIGQUERY_TWEETS_USER_TABLE_NAME}",
        schema_fields=[
            {'name': 'id', 'type': 'INT64', 'mode': 'REQUIRED'},
            {'name': 'tweet_id', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'follow_request_sent', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'has_extended_profile', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'profile_use_background_image', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'verified', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'translator_type', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'profile_text_color', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'profile_image_url_https', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'profile_sidebar_fill_color', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'entities', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'followers_count', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'protected', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'location', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'default_profile_image', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'id_str', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'lang', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'utc_offset', 'type': 'FLOAT64', 'mode': 'NULLABLE'},
            {'name': 'statuses_count', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'description', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'friends_count', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'profile_background_image_url_https', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'profile_link_color', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'profile_image_url', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'following', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'geo_enabled', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'profile_background_color', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'profile_banner_url', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'profile_background_image_url', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'screen_name', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'is_translation_enabled', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'profile_background_tile', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'favourites_count', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'notifications', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'url', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'created_at', 'type': 'DATETIME', 'mode': 'NULLABLE'},
            {'name': 'contributors_enabled', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'time_zone', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'profile_sidebar_border_color', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'default_profile', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'is_translator', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'listed_count', 'type': 'INT64', 'mode': 'NULLABLE'},           
        ], 
        autodetect=False,
        allow_quoted_newlines=True,
        write_disposition='WRITE_TRUNCATE',
    )


    start >> [et_tweets, et_tweets_user]
    et_tweets >> stored_tweets_data_gcs
    et_tweets_user >> stored_tweets_user_data_gcs
    stored_tweets_data_gcs >> loaded_tweets_data_bigquery >> end
    stored_tweets_user_data_gcs >> loaded_tweets_user_data_bigquery >> end
Example 13
def bs_database_sqlite_dag():
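    # Run sql/database_sqlite.sql against database.sqlite, write the result as a headerless
    # CSV, upload it to GCS, and load it into BigQuery.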
    @task()
    def extract_transform():
        conn = sqlite3.connect(f"{DATA_PATH}/database.sqlite")
        with open(f"{BASE_PATH}/sql/database_sqlite.sql", "r") as query:
            df = pd.read_sql(query.read(), conn)
        df.to_csv(OUT_PATH, index=False, header=False)  # write the CSV without an index column or header row

    start = DummyOperator(task_id='start')
    end = DummyOperator(task_id='end')
    extracted_transformed_data = extract_transform()

    stored_data_gcs = LocalFilesystemToGCSOperator(
        task_id="store_to_gcs",
        gcp_conn_id=GOOGLE_CLOUD_CONN_ID,
        src=OUT_PATH,
        dst='extract_transform_database_sqlite.csv',
        bucket=BUCKET_NAME)

    loaded_data_bigquery = GCSToBigQueryOperator(
        task_id='load_to_bigquery',
        bigquery_conn_id=GOOGLE_CLOUD_CONN_ID,
        bucket=BUCKET_NAME,
        source_objects=[GCS_OBJECT_NAME],
        destination_project_dataset_table=f"{DATASET_ID}.{BIGQUERY_TABLE_NAME}",
        schema_fields=[  #based on https://cloud.google.com/bigquery/docs/schemas
            {
                'name': 'reviewid',
                'type': 'INT64',
                'mode': 'REQUIRED'
            },
            {
                'name': 'title',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'artist',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'url',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'score',
                'type': 'FLOAT64',
                'mode': 'NULLABLE'
            },
            {
                'name': 'best_new_music',
                'type': 'INTEGER',
                'mode': 'NULLABLE'
            },
            {
                'name': 'author',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'author_type',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'pub_date',
                'type': 'DATE',
                'mode': 'NULLABLE'
            },
            {
                'name': 'pub_weekday',
                'type': 'INT64',
                'mode': 'NULLABLE'
            },
            {
                'name': 'pub_day',
                'type': 'INT64',
                'mode': 'NULLABLE'
            },
            {
                'name': 'pub_month',
                'type': 'INT64',
                'mode': 'NULLABLE'
            },
            {
                'name': 'pub_year',
                'type': 'INT64',
                'mode': 'NULLABLE'
            },
            {
                'name': 'concat_genre',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'concat_label',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'concat_year',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
        ],
        autodetect=False,
        write_disposition='WRITE_TRUNCATE',  # overwrite the table data if the table already exists
    )

    start >> extracted_transformed_data
    extracted_transformed_data >> stored_data_gcs
    stored_data_gcs >> loaded_data_bigquery
    loaded_data_bigquery >> end
Example 14
    start = DummyOperator(task_id='start')

    conn_task = PythonOperator(
        task_id='conn-task',
        python_callable=get_conn,
        op_kwargs={'my_conn_id': 'smtp_default'},
    )

    var_task = PythonOperator(task_id='var-task',
                              python_callable=get_var,
                              op_kwargs={'var_name': 'hello'})

    bq_load_task = GCSToBigQueryOperator(
        task_id='bqload',
        bucket=Variable.get('bucket_secret'),
        source_objects=['astro_word_count.csv'],
        destination_project_dataset_table=Variable.get('bqtable_secret'),
        skip_leading_rows=1,
        schema_fields=[
            {
                'name': 'word',
                'type': 'STRING'
            },
            {
                'name': 'count',
                'type': 'INT64'
            },
        ],
        write_disposition='WRITE_TRUNCATE')

    start >> [conn_task, var_task, bq_load_task]
Example 15
import_in_bigquery = GCSToBigQueryOperator(
    task_id="import_in_bigquery",
    bucket=os.environ["RATINGS_BUCKET"],
    source_objects=[
        "ratings/{{ execution_date.year }}/{{ execution_date.strftime('%m') }}.csv"
    ],
    source_format="CSV",
    create_disposition="CREATE_IF_NEEDED",
    write_disposition="WRITE_TRUNCATE",
    bigquery_conn_id="gcp",
    skip_leading_rows=1,
    schema_fields=[
        {
            "name": "userId",
            "type": "INTEGER"
        },
        {
            "name": "movieId",
            "type": "INTEGER"
        },
        {
            "name": "rating",
            "type": "FLOAT"
        },
        {
            "name": "timestamp",
            "type": "TIMESTAMP"
        },
    ],
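    # "ratings${{ ds_nodash }}" appends BigQuery's partition decorator, so each run
    # truncates and reloads only the daily partition for the execution date.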
    destination_project_dataset_table=(os.environ["GCP_PROJECT"] + ":" +
                                       os.environ["BIGQUERY_DATASET"] + "." +
                                       "ratings${{ ds_nodash }}"),
    dag=dag,
)
Example 16
        loadGcsToBq = GCSToBigQueryOperator(
          task_id='gcstobq_jagged',
          bucket=SOURCE_BUCKET,
          source_objects=[SRC1],
          destination_project_dataset_table=f"{DATASET_NAME}.{TABLE_NAME}",
          create_disposition='CREATE_IF_NEEDED',
          source_format='CSV',
          write_disposition='WRITE_TRUNCATE',
          allow_jagged_rows=True,
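          # allow_jagged_rows accepts CSV rows that omit trailing optional columns;
          # the missing values are loaded as NULL.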
          skip_leading_rows=1,
          schema_fields=[
            {"mode": "NULLABLE", "name": "year", "type": "INTEGER"},
            {"mode": "NULLABLE", "name": "score", "type": "INTEGER"},
            {"mode": "NULLABLE", "name": "title", "type": "STRING"},
            {"mode": "NULLABLE", "name": "tbd1", "type": "STRING"},
            {"mode": "NULLABLE", "name": "tbd2", "type": "STRING"},
          ]
        )
Example 17
        task_id='create_stage_reviews_table',
        dataset_id=GCP_BQ_DATASET_STAGE,
        table_id=GCP_BQ_TABLE_REVIEWS,
        project_id=GCP_PROJECT_ID,
        bigquery_conn_id='gr_bigquery_conn',
        schema_fields=stage.reviews_schema)

    load_stage_data = DummyOperator(task_id='load_stage_data')

    load_stage_users_data = GCSToBigQueryOperator(
        task_id='load_stage_users_data',
        bucket=os.environ['GCP_GCS_BUCKET_PROCESSED'],
        source_objects=['{{ run_id }}/user/part-*'],
        #source_objects=['test/user/part-*'],
        destination_project_dataset_table=
        f"{os.environ['GCP_PROJECT_ID']}.{os.environ['GCP_BQ_DATASET_STAGE']}.{os.environ['GCP_BQ_TABLE_USERS']}",
        schema_fields=users_schema,
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
        create_disposition=bigquery.CreateDisposition.CREATE_NEVER,
        write_disposition=bigquery.WriteDisposition.WRITE_EMPTY,
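        # CREATE_NEVER expects the staging table to already exist, and WRITE_EMPTY fails
        # the load if that table is not empty.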
        skip_leading_rows=1,
        bigquery_conn_id='gr_bigquery_conn',
        google_cloud_storage_conn_id='gr_storage_conn')

    load_stage_authors_data = GCSToBigQueryOperator(
        task_id='load_stage_authors_data',
        bucket=os.environ['GCP_GCS_BUCKET_PROCESSED'],
        source_objects=['{{ run_id }}/author/part-*'],
        # source_objects=['test/author/part-*'],
        destination_project_dataset_table=
        f"{os.environ['GCP_PROJECT_ID']}.{os.environ['GCP_BQ_DATASET_STAGE']}.{os.environ['GCP_BQ_TABLE_AUTHORS']}",
        schema_fields=authors_schema,
Example 18
    start_date=datetime(2021, 1, 1),
    catchup=False,
    schedule_interval='@once',
    tags=['example'],
) as dag:
    create_test_dataset = BigQueryCreateEmptyDatasetOperator(
        task_id='create_airflow_test_dataset', dataset_id=DATASET_NAME
    )

    # [START howto_operator_gcs_to_bigquery]
    load_csv = GCSToBigQueryOperator(
        task_id='gcs_to_bigquery_example',
        bucket='cloud-samples-data',
        source_objects=['bigquery/us-states/us-states.csv'],
        destination_project_dataset_table=f"{DATASET_NAME}.{TABLE_NAME}",
        schema_fields=[
            {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'post_abbr', 'type': 'STRING', 'mode': 'NULLABLE'},
        ],
        write_disposition='WRITE_TRUNCATE',
    )
    # [END howto_operator_gcs_to_bigquery]

    delete_test_dataset = BigQueryDeleteDatasetOperator(
        task_id='delete_airflow_test_dataset',
        dataset_id=DATASET_NAME,
        delete_contents=True,
    )

    create_test_dataset >> load_csv >> delete_test_dataset
Example 19
        "query": {
            "query": "{% include 'create_bad_row_table.sql' %}",
            "use_legacy_sql": False,
        }
    },
    dag=dag,
)

load_data_from_gsc_to_bigquery = GCSToBigQueryOperator(
    task_id="load_data_from_gsc_to_bigquery",
    bucket="{{ var.json.env.bucket }}",
    source_objects=[
        'data/{{execution_date.strftime("%Y") }}/{{execution_date.strftime("%m") }}/{{execution_date.strftime("%d") }}*'
    ],
    destination_project_dataset_table=f"{stg_dataset_name}.{stg_table_name}",
    skip_leading_rows=1,
    source_format="CSV",
    create_disposition="CREATE_IF_NEEDED",
    write_disposition="WRITE_APPEND",
    schema_update_options=["ALLOW_FIELD_RELAXATION", "ALLOW_FIELD_ADDITION"],
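    # ALLOW_FIELD_ADDITION / ALLOW_FIELD_RELAXATION let this appending load add new columns
    # or relax REQUIRED columns to NULLABLE in the destination schema.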
    autodetect=True,
    dag=dag,
)

quality_check = DataQualityOperator(
    task_id="quality_check",
    provide_context=True,
    gcp_conn_id="google_cloud_default",
    sql='{% include "quality_check.sql" %}',
    table_list=[
        "{{ var.json.env.project }}.{{ var.json.env.stg }}.{{ var.json.env.raw_data }}"