def test_execute(self, mock_hook):
     mock_instance = mock_hook.return_value
     operator = LocalFilesystemToGCSOperator(task_id='gcs_to_file_sensor',
                                             dag=self.dag,
                                             **self._config)
     operator.execute(None)
     mock_instance.upload.assert_called_once_with(
         bucket_name=self._config['bucket'],
         filename=self._config['src'],
         gzip=self._config['gzip'],
         mime_type=self._config['mime_type'],
         object_name=self._config['dst'])
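# The test methods above and below come from a unittest-style test class for
# LocalFilesystemToGCSOperator; the class header, patch decorator and setUp they
# rely on are not shown. A minimal sketch of that scaffolding might look like
# the following -- the patch target, DAG id and config values are assumptions,
# not taken from the snippets themselves.
import datetime
import os
from glob import glob
from unittest import TestCase, mock

import pytest

from airflow.models.dag import DAG
from airflow.providers.google.cloud.transfers.local_to_gcs import LocalFilesystemToGCSOperator


@mock.patch('airflow.providers.google.cloud.transfers.local_to_gcs.GCSHook')
class TestFileToGcsOperator(TestCase):
    def setUp(self):
        self.dag = DAG(
            'test_dag_id',
            default_args={'owner': 'airflow', 'start_date': datetime.datetime(2020, 1, 1)},
        )
        self.testfile1 = '/tmp/fake1.csv'
        self.testfiles = [self.testfile1, '/tmp/fake2.csv']
        # some of the examples keep src/dst inside _config, others pass them
        # explicitly to the operator and use a _config without those two keys
        self._config = {
            'src': self.testfile1,
            'dst': 'test/test1.csv',
            'bucket': 'dummy',
            'mime_type': 'application/octet-stream',
            'gzip': False,
        }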
# Example #2
 def test_execute_negative(self, mock_hook):
     mock_instance = mock_hook.return_value
     operator = LocalFilesystemToGCSOperator(
         task_id='gcs_to_file_sensor',
         dag=self.dag,
         src='/tmp/fake*.csv',
         dst='test/test1.csv',
         **self._config,
     )
     print(glob('/tmp/fake*.csv'))
     with pytest.raises(ValueError):
         operator.execute(None)
     mock_instance.assert_not_called()
 def test_init(self):
     operator = LocalFilesystemToGCSOperator(task_id='file_to_gcs_operator',
                                             dag=self.dag,
                                             **self._config)
     self.assertEqual(operator.src, self._config['src'])
     self.assertEqual(operator.dst, self._config['dst'])
     self.assertEqual(operator.bucket, self._config['bucket'])
     self.assertEqual(operator.mime_type, self._config['mime_type'])
     self.assertEqual(operator.gzip, self._config['gzip'])
 def test_execute_wildcard(self, mock_hook):
     mock_instance = mock_hook.return_value
     operator = LocalFilesystemToGCSOperator(
         task_id='gcs_to_file_sensor', dag=self.dag, src='/tmp/fake*.csv', dst='test/', **self._config
     )
     operator.execute(None)
     object_names = ['test/' + os.path.basename(fp) for fp in glob('/tmp/fake*.csv')]
     files_objects = zip(glob('/tmp/fake*.csv'), object_names)
     calls = [
         mock.call(
             bucket_name=self._config['bucket'],
             filename=filepath,
             gzip=self._config['gzip'],
             mime_type=self._config['mime_type'],
             object_name=object_name,
         )
         for filepath, object_name in files_objects
     ]
     mock_instance.upload.assert_has_calls(calls)
def bs_customer_invoice_chinook_dag():
    @task()
    def extract_transform():
        conn = sqlite3.connect(f"{DATA_PATH}/chinook.db")
        with open(f"{BASE_PATH}/sql/chinook.sql", "r") as query:
            df = pd.read_sql(query.read(), conn)
        df.to_csv(OUT_PATH, index=False, header=False)  # write without the index column or header row

    start = DummyOperator(task_id='start')
    end = DummyOperator(task_id='end')
    extracted_transformed_data = extract_transform()

    stored_data_gcs = LocalFilesystemToGCSOperator(
        task_id="store_to_gcs",
        gcp_conn_id=GOOGLE_CLOUD_CONN_ID,
        src=OUT_PATH,
        dst=GCS_OBJECT_NAME,
        bucket=BUCKET_NAME
    )

    loaded_data_bigquery = GCSToBigQueryOperator(
        task_id='load_to_bigquery',
        bigquery_conn_id=GOOGLE_CLOUD_CONN_ID,
        bucket=BUCKET_NAME,
        source_objects=[GCS_OBJECT_NAME],
        destination_project_dataset_table=f"{DATASET_ID}.{BIGQUERY_TABLE_NAME}",
        schema_fields=[  # based on https://cloud.google.com/bigquery/docs/schemas
            {'name': 'customer_id', 'type': 'INT64', 'mode': 'REQUIRED'},
            {'name': 'full_name', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'company', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'address', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'city', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'state', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'country', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'postal_code', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'phone', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'fax', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'email', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'invoice_id', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'invoice_date', 'type': 'DATE', 'mode': 'NULLABLE'},
            {'name': 'billing_address', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'billing_city', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'billing_state', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'billing_country', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'billing_postal_code', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'total', 'type': 'FLOAT64', 'mode': 'NULLABLE'},
        ], 
        autodetect=False,
        write_disposition='WRITE_TRUNCATE',  # if the table already exists, overwrite its data
    )

    start >> extracted_transformed_data
    extracted_transformed_data >> stored_data_gcs
    stored_data_gcs >> loaded_data_bigquery
    loaded_data_bigquery >> end
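# The bs_*_dag functions in this listing are TaskFlow DAG factories; the module
# level constants and the @dag decorator they depend on were stripped by the
# source. A rough sketch of that surrounding boilerplate, with placeholder
# values only, could look like this:
import sqlite3
from datetime import datetime

import pandas as pd

from airflow.decorators import dag, task
from airflow.operators.dummy import DummyOperator
from airflow.providers.google.cloud.transfers.local_to_gcs import LocalFilesystemToGCSOperator
from airflow.providers.google.cloud.transfers.gcs_to_bigquery import GCSToBigQueryOperator

BASE_PATH = "/opt/airflow/dags"                     # placeholder
DATA_PATH = f"{BASE_PATH}/data"                     # placeholder
OUT_PATH = "/tmp/extract_transform_result.csv"      # placeholder
GCS_OBJECT_NAME = "extract_transform_result.csv"    # placeholder
GOOGLE_CLOUD_CONN_ID = "google_cloud_default"       # placeholder
BUCKET_NAME = "my-staging-bucket"                   # placeholder
DATASET_ID = "staging"                              # placeholder
BIGQUERY_TABLE_NAME = "customer_invoice_chinook"    # placeholder


@dag(schedule_interval='@daily', start_date=datetime(2021, 1, 1), catchup=False)  # assumed schedule
def bs_customer_invoice_chinook_dag():
    ...  # body as shown above


bs_customer_invoice_chinook_etl = bs_customer_invoice_chinook_dag()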
 def test_execute_multiple(self, mock_hook):
     mock_instance = mock_hook.return_value
     operator = LocalFilesystemToGCSOperator(
         task_id='gcs_to_file_sensor', dag=self.dag, src=self.testfiles, dst='test/', **self._config
     )
     operator.execute(None)
     files_objects = zip(
         self.testfiles, ['test/' + os.path.basename(testfile) for testfile in self.testfiles]
     )
     calls = [
         mock.call(
             bucket_name=self._config['bucket'],
             filename=filepath,
             gzip=self._config['gzip'],
             mime_type=self._config['mime_type'],
             object_name=object_name,
         )
         for filepath, object_name in files_objects
     ]
     mock_instance.upload.assert_has_calls(calls)
# Example #7
 def test_init(self):
     operator = LocalFilesystemToGCSOperator(
         task_id='file_to_gcs_operator',
         dag=self.dag,
         src=self.testfile1,
         dst='test/test1.csv',
         **self._config,
     )
     self.assertEqual(operator.src, self.testfile1)
     self.assertEqual(operator.dst, 'test/test1.csv')
     self.assertEqual(operator.bucket, self._config['bucket'])
     self.assertEqual(operator.mime_type, self._config['mime_type'])
     self.assertEqual(operator.gzip, self._config['gzip'])
# Example #8
 def test_init(self):
     operator = LocalFilesystemToGCSOperator(
         task_id='file_to_gcs_operator',
         dag=self.dag,
         src=self.testfile1,
         dst='test/test1.csv',
         **self._config,
     )
     assert operator.src == self.testfile1
     assert operator.dst == 'test/test1.csv'
     assert operator.bucket == self._config['bucket']
     assert operator.mime_type == self._config['mime_type']
     assert operator.gzip == self._config['gzip']
# Example #9
def bs_database_sqlite_dag():
    @task()
    def extract_transform():
        conn = sqlite3.connect(f"{DATA_PATH}/database.sqlite")
        with open(f"{BASE_PATH}/sql/database_sqlite.sql", "r") as query:
            df = pd.read_sql(query.read(), conn)
        df.to_csv(OUT_PATH, index=False, header=False)  # write without the index column or header row

    start = DummyOperator(task_id='start')
    end = DummyOperator(task_id='end')
    extracted_transformed_data = extract_transform()

    stored_data_gcs = LocalFilesystemToGCSOperator(
        task_id="store_to_gcs",
        gcp_conn_id=GOOGLE_CLOUD_CONN_ID,
        src=OUT_PATH,
        dst='extract_transform_database_sqlite.csv',
        bucket=BUCKET_NAME)

    loaded_data_bigquery = GCSToBigQueryOperator(
        task_id='load_to_bigquery',
        bigquery_conn_id=GOOGLE_CLOUD_CONN_ID,
        bucket=BUCKET_NAME,
        source_objects=[GCS_OBJECT_NAME],
        destination_project_dataset_table=f"{DATASET_ID}.{BIGQUERY_TABLE_NAME}",
        schema_fields=[  # based on https://cloud.google.com/bigquery/docs/schemas
            {'name': 'reviewid', 'type': 'INT64', 'mode': 'REQUIRED'},
            {'name': 'title', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'artist', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'url', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'score', 'type': 'FLOAT64', 'mode': 'NULLABLE'},
            {'name': 'best_new_music', 'type': 'INTEGER', 'mode': 'NULLABLE'},
            {'name': 'author', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'author_type', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'pub_date', 'type': 'DATE', 'mode': 'NULLABLE'},
            {'name': 'pub_weekday', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'pub_day', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'pub_month', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'pub_year', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'concat_genre', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'concat_label', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'concat_year', 'type': 'STRING', 'mode': 'NULLABLE'},
        ],
        autodetect=False,
        write_disposition='WRITE_TRUNCATE',  # if the table already exists, overwrite its data
    )

    start >> extracted_transformed_data
    extracted_transformed_data >> stored_data_gcs
    stored_data_gcs >> loaded_data_bigquery
    loaded_data_bigquery >> end
# Example #10
fork = DummyOperator(task_id='fork', trigger_rule='one_success', dag=dag)
join = DummyOperator(task_id='join', trigger_rule='one_success', dag=dag)

t_downloadlogtocloud = BashOperator(task_id="download_state_file",
                                    bash_command=downloadlogscript,
                                    dag=dag)

t_removefile = BashOperator(
    task_id='remove_temp_file',
    bash_command=removetempfile,
    dag=dag,
)
## change this to suit your setting
t_analytics = LocalFilesystemToGCSOperator(task_id="uploadtostorage",
                                           src=destination_file,
                                           dst=gcsdir,
                                           bucket=GCS_BUCKET,
                                           gcp_conn_id=GCS_CONN_ID,
                                           dag=dag)
## change this to suit your setting
t_sendresult = SimpleHttpOperator(task_id='sendnotification',
                                  method='POST',
                                  http_conn_id='notificationserver',
                                  endpoint='api/logUpdate',
                                  data=json.dumps({"source_file":
                                                   source_file}),
                                  headers={"Content-Type": "application/json"},
                                  dag=dag)
'''
the dependencies among tasks
'''
t_downloadlogtocloud >> t_analytics
check_dataset = BranchPythonOperator(task_id='check_dataset',
                                     python_callable=check_dataset,
                                     dag=dag,
                                     trigger_rule='none_failed_or_skipped')

extract_mysql_to_local_pq = PythonOperator(task_id='extract_mysql_to_local_pq',
                                           python_callable=mysql_to_pq,
                                           op_kwargs=func_param,
                                           trigger_rule='all_done',
                                           dag=dag)

local_pq_to_gcs = LocalFilesystemToGCSOperator(
    task_id='local_pq_to_gcs',
    src=func_param['source_transform'],
    dst=func_param['dest_blob_transform'],
    bucket=func_param['bucket_name'],
    gcp_conn_id='google_cloud_default',
    trigger_rule='all_done',
    dag=dag)

load_gcs_pq_to_bq = GCSToBigQueryOperator(
    task_id='load_gcs_pq_to_bq',
    bucket=func_param['bucket_name'],
    source_objects=[func_param['dest_blob_transform']],  # takes a list of objects by default
    destination_project_dataset_table='project_four_airflow.sales',
    source_format='PARQUET',
    write_disposition='WRITE_APPEND',
    google_cloud_storage_conn_id='google_cloud_default',
    trigger_rule='all_done',
    dag=dag)
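# The MySQL -> Parquet -> GCS -> BigQuery tasks above read their paths from a
# shared func_param dict that is defined elsewhere; only the key names are
# visible in the snippet. A plausible shape, with placeholder values, is:
func_param = {
    'bucket_name': 'my-staging-bucket',                           # placeholder
    'source_transform': '/tmp/sales_transformed.parquet',         # placeholder
    'dest_blob_transform': 'staging/sales_transformed.parquet',   # placeholder
}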
# Example #12
def bs_file1000_dag():
    @task()
    def extract_transform():
        df = pd.read_excel(f"{DATA_PATH}/file_1000.xls",
                           index_col=0).reset_index(drop=True)
        df = df.drop(columns='First Name.1')
        df['full_name'] = df['First Name'] + " " + df['Last Name']
        df['gender'] = df['Gender'].apply(lambda row: 'M'
                                          if row == 'Male' else 'F')
        df['date'] = pd.to_datetime(df['Date'],
                                    format='%d/%m/%Y',
                                    errors='coerce')
        df = df.drop(columns=['Date', 'Gender'])
        df.columns = [
            'first_name', 'last_name', 'country', 'age', 'id', 'full_name',
            'gender', 'date'
        ]
        df = df.reindex(columns=[
            'id', 'first_name', 'last_name', 'full_name', 'date', 'age',
            'gender', 'country'
        ]).reset_index(drop=True)

        df.to_csv(OUT_PATH, index=False, header=False)

    start = DummyOperator(task_id='start')
    end = DummyOperator(task_id='end')
    extract_task = extract_transform()

    stored_data_gcs = LocalFilesystemToGCSOperator(
        task_id="store_to_gcs",
        gcp_conn_id=GOOGLE_CLOUD_CONN_ID,
        src=OUT_PATH,
        dst=GCS_OBJECT_NAME,
        bucket=BUCKET_NAME)

    loaded_data_bigquery = GCSToBigQueryOperator(
        task_id='load_to_bigquery',
        bigquery_conn_id=GOOGLE_CLOUD_CONN_ID,
        bucket=BUCKET_NAME,
        source_objects=[GCS_OBJECT_NAME],
        destination_project_dataset_table=f"{DATASET_ID}.{BIGQUERY_TABLE_NAME}",
        schema_fields=[  # based on https://cloud.google.com/bigquery/docs/schemas
            {'name': 'id', 'type': 'INT64', 'mode': 'REQUIRED'},
            {'name': 'first_name', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'last_name', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'full_name', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'date', 'type': 'DATE', 'mode': 'NULLABLE'},
            {'name': 'age', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'gender', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'country', 'type': 'STRING', 'mode': 'NULLABLE'},
        ],
        autodetect=False,
        write_disposition='WRITE_TRUNCATE',  # if the table already exists, overwrite its data
    )

    start >> extract_task
    extract_task >> stored_data_gcs
    stored_data_gcs >> loaded_data_bigquery
    loaded_data_bigquery >> end
# Example #13
def bs_disaster_dag():
    @task()
    def extract_transform():
        df = pd.read_csv(f"{DATA_PATH}/disaster_data.csv")
        columns = ['text', 'location']
        for column in columns:
            df[column] = df[column].str.replace(r'\s{2,}', ' ', regex=True)
            df[column] = df[column].str.replace(r"[^a-zA-Z0-9\,]",
                                                ' ',
                                                regex=True)

        df.to_csv(OUT_PATH, index=False, header=False)

    start = DummyOperator(task_id='start')
    end = DummyOperator(task_id='end')
    extract_transform_task = extract_transform()

    stored_data_gcs = LocalFilesystemToGCSOperator(
        task_id="store_to_gcs",
        gcp_conn_id=GOOGLE_CLOUD_CONN_ID,
        src=OUT_PATH,
        dst=GCS_OBJECT_NAME,
        bucket=BUCKET_NAME)

    loaded_data_bigquery = GCSToBigQueryOperator(
        task_id='load_to_bigquery',
        bigquery_conn_id=GOOGLE_CLOUD_CONN_ID,
        bucket=BUCKET_NAME,
        source_objects=[GCS_OBJECT_NAME],
        destination_project_dataset_table=f"{DATASET_ID}.{BIGQUERY_TABLE_NAME}",
        schema_fields=[  # based on https://cloud.google.com/bigquery/docs/schemas
            {'name': 'id', 'type': 'INT64', 'mode': 'REQUIRED'},
            {'name': 'keyword', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'location', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'text', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'target', 'type': 'INT64', 'mode': 'NULLABLE'},
        ],
        autodetect=False,
        write_disposition='WRITE_TRUNCATE',  # if the table already exists, overwrite its data
    )

    start >> extract_transform_task
    extract_transform_task >> stored_data_gcs
    stored_data_gcs >> loaded_data_bigquery
    loaded_data_bigquery >> end
# Example #14
    create_bucket1 = GCSCreateBucketOperator(
        task_id="create_bucket1", bucket_name=BUCKET_1, project_id=PROJECT_ID
    )

    create_bucket2 = GCSCreateBucketOperator(
        task_id="create_bucket2", bucket_name=BUCKET_2, project_id=PROJECT_ID
    )

    list_buckets = GCSListObjectsOperator(task_id="list_buckets", bucket=BUCKET_1)

    list_buckets_result = BashOperator(
        task_id="list_buckets_result", bash_command="echo \"{{ task_instance.xcom_pull('list_buckets') }}\"",
    )

    upload_file = LocalFilesystemToGCSOperator(
        task_id="upload_file", src=PATH_TO_UPLOAD_FILE, dst=BUCKET_FILE_LOCATION, bucket=BUCKET_1,
    )

    transform_file = GCSFileTransformOperator(
        task_id="transform_file",
        source_bucket=BUCKET_1,
        source_object=BUCKET_FILE_LOCATION,
        transform_script=["python", PATH_TO_TRANSFORM_SCRIPT],
    )
    # [START howto_operator_gcs_bucket_create_acl_entry_task]
    gcs_bucket_create_acl_entry_task = GCSBucketCreateAclEntryOperator(
        bucket=BUCKET_1,
        entity=GCS_ACL_ENTITY,
        role=GCS_ACL_BUCKET_ROLE,
        task_id="gcs_bucket_create_acl_entry_task",
    )
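# The create_bucket/list/upload/transform tasks above come from an example DAG
# that defines its buckets and paths elsewhere and builds the tasks inside a
# DAG context manager. A sketch of that missing setup, with assumed variable
# names and DAG parameters, could be:
import os

from airflow import models
from airflow.utils.dates import days_ago

PROJECT_ID = os.environ.get("GCP_PROJECT_ID", "example-project")        # placeholder
BUCKET_1 = os.environ.get("GCP_GCS_BUCKET_1", "example-gcs-bucket-1")   # placeholder
BUCKET_2 = os.environ.get("GCP_GCS_BUCKET_2", "example-gcs-bucket-2")   # placeholder
PATH_TO_UPLOAD_FILE = "/tmp/example-upload.txt"                         # placeholder
PATH_TO_TRANSFORM_SCRIPT = "/opt/scripts/transform.py"                  # placeholder
BUCKET_FILE_LOCATION = "data/example-upload.txt"                        # placeholder
GCS_ACL_ENTITY = "allUsers"                                             # placeholder
GCS_ACL_BUCKET_ROLE = "READER"                                          # placeholder

with models.DAG(
    "example_gcs",
    start_date=days_ago(1),
    schedule_interval=None,
    tags=["example"],
) as dag:
    ...  # tasks as shown above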
def bs_reviews_dag():
    @task()
    def merge_reviews(reviews: list):
        df_merge = pd.concat([pd.read_json(review) for review in reviews],
                             ignore_index=True)
        print(df_merge)
        df_merge.to_csv(OUT_PATH, index=False, header=False)

    @task()
    def extract_reviews(filename):
        print(filename)
        file_path = f"{DATA_PATH}/{filename}"
        if 'csv' in filename:
            df = pd.read_csv(file_path)
        else:
            df = pd.read_excel(file_path)
        print(df)
        return df.to_json()

    start = DummyOperator(task_id='start')
    end = DummyOperator(task_id='end')

    filenames = os.listdir(DATA_PATH)
    filtered_filename = list(
        filter(lambda filename: re.match(r"(^reviews)", filename), filenames))

    extracted_list = []
    for filename in filtered_filename:
        extracted_list.append(extract_reviews(filename))

    merged = merge_reviews(extracted_list)

    stored_data_gcs = LocalFilesystemToGCSOperator(
        task_id="store_to_gcs",
        gcp_conn_id=GOOGLE_CLOUD_CONN_ID,
        src=OUT_PATH,
        dst=GCS_OBJECT_NAME,
        bucket=BUCKET_NAME)

    loaded_data_bigquery = GCSToBigQueryOperator(
        task_id='load_to_bigquery',
        bigquery_conn_id=GOOGLE_CLOUD_CONN_ID,
        bucket=BUCKET_NAME,
        source_objects=[GCS_OBJECT_NAME],
        destination_project_dataset_table=f"{DATASET_ID}.{BIGQUERY_TABLE_NAME}",
        schema_fields=[  # based on https://cloud.google.com/bigquery/docs/schemas
            {'name': 'listing_id', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'id', 'type': 'INT64', 'mode': 'REQUIRED'},
            {'name': 'date', 'type': 'DATE', 'mode': 'NULLABLE'},
            {'name': 'reviewer_id', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'reviewer_name', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'comments', 'type': 'STRING', 'mode': 'NULLABLE'},
        ],
        autodetect=False,
        allow_quoted_newlines=True,
        write_disposition='WRITE_TRUNCATE',  # if the table already exists, overwrite its data
    )

    start >> extracted_list >> merged
    merged >> stored_data_gcs
    stored_data_gcs >> loaded_data_bigquery >> end
def bs_tweets_dag():
    @task()
    def extract_transform_tweets():
        df = pd.read_json(f"{DATA_PATH}/tweet_data.json", lines=True)
        df['created_at'] = df['created_at'].dt.tz_convert(None)
        for column in ['text', 'source']:
            df[column] = df[column].str.replace(r"[^a-zA-Z0-9\,#@]", ' ', regex=True)
            df[column] = df[column].str.replace(r"\s{2,}", ' ', regex=True)

        # drop nested / unused columns before writing the CSV
        excluded_columns = {
            'extended_entities', 'contributors', 'entities', 'retweeted_status',
            'user', 'in_reply_to_user_id_str', 'in_reply_to_status_id_str',
        }
        df_filtered = df[[col for col in df.columns if col not in excluded_columns]]
        df_filtered.to_csv(OUT_TWEETS_PATH, index=False, header=False)
      
    @task()
    def extract_transform_tweets_user():
        df = pd.read_json(f"{DATA_PATH}/tweet_data.json", lines=True)
        users = [{**row['user'], 'tweet_id': row['id']} for _, row in df.iterrows()]

        df_users = pd.DataFrame(users)
        df_users['created_at'] = pd.to_datetime(
            df_users['created_at'], format='%a %b %d %H:%M:%S %z %Y'
        ).dt.tz_convert(None)
        # put id and tweet_id first, keep the remaining columns in their original order
        other_columns = [col for col in df_users.columns if col not in ('id', 'tweet_id')]
        df_users = df_users.reindex(columns=['id', 'tweet_id', *other_columns])

        df_users.to_csv(OUT_TWEETS_USER_PATH, index=False, header=False)

    start = DummyOperator(task_id='start')
    end = DummyOperator(task_id='end')
    
    et_tweets = extract_transform_tweets()
    et_tweets_user = extract_transform_tweets_user()

    stored_tweets_data_gcs = LocalFilesystemToGCSOperator(
        task_id="store_tweets_to_gcs",
        gcp_conn_id=GOOGLE_CLOUD_CONN_ID,
        src=OUT_TWEETS_PATH,
        dst=GCS_OBJECT_TWEETS_NAME,
        bucket=BUCKET_NAME
    )

    stored_tweets_user_data_gcs = LocalFilesystemToGCSOperator(
        task_id="store_tweets_user_to_gcs",
        gcp_conn_id=GOOGLE_CLOUD_CONN_ID,
        src=OUT_TWEETS_USER_PATH,
        dst=GCS_OBJECT_TWEETS_USER_NAME,
        bucket=BUCKET_NAME
    )

    loaded_tweets_data_bigquery = GCSToBigQueryOperator(
        task_id='load_tweets_to_bigquery',
        bigquery_conn_id=GOOGLE_CLOUD_CONN_ID,
        bucket=BUCKET_NAME,
        source_objects=[GCS_OBJECT_TWEETS_NAME],
        destination_project_dataset_table=f"{DATASET_ID}.{BIGQUERY_TWEETS_TABLE_NAME}",
        schema_fields=[  # based on https://cloud.google.com/bigquery/docs/schemas
            {'name': 'truncated', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'text', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'is_quote_status', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'in_reply_to_status_id', 'type': 'FLOAT64', 'mode': 'NULLABLE'},
            {'name': 'in_reply_to_user_id', 'type': 'FLOAT64', 'mode': 'NULLABLE'},
            {'name': 'id', 'type': 'INT64', 'mode': 'REQUIRED'},            
            {'name': 'favorite_count', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'retweeted', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'coordinates', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'source', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'in_reply_to_screen_name', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'id_str', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'retweet_count', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'metadata', 'type': 'STRING', 'mode': 'NULLABLE'},            
            {'name': 'favorited', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'geo', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'lang', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'created_at', 'type': 'DATETIME', 'mode': 'NULLABLE'},
            {'name': 'place', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'quoted_status_id', 'type': 'FLOAT64', 'mode': 'NULLABLE'},
            {'name': 'quoted_status', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'possibly_sensitive', 'type': 'FLOAT64', 'mode': 'NULLABLE'},
            {'name': 'quoted_status_id_str', 'type': 'FLOAT64', 'mode': 'NULLABLE'},
        ], 
        autodetect=False,
        write_disposition='WRITE_TRUNCATE',  # if the table already exists, overwrite its data
    )

    loaded_tweets_user_data_bigquery = GCSToBigQueryOperator(
        task_id='load_tweets_user_to_bigquery',
        bigquery_conn_id=GOOGLE_CLOUD_CONN_ID,
        bucket=BUCKET_NAME,
        source_objects=[GCS_OBJECT_TWEETS_USER_NAME],
        destination_project_dataset_table=f"{DATASET_ID}.{BIGQUERY_TWEETS_USER_TABLE_NAME}",
        schema_fields=[
            {'name': 'id', 'type': 'INT64', 'mode': 'REQUIRED'},
            {'name': 'tweet_id', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'follow_request_sent', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'has_extended_profile', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'profile_use_background_image', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'verified', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'translator_type', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'profile_text_color', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'profile_image_url_https', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'profile_sidebar_fill_color', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'entities', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'followers_count', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'protected', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'location', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'default_profile_image', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'id_str', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'lang', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'utc_offset', 'type': 'FLOAT64', 'mode': 'NULLABLE'},
            {'name': 'statuses_count', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'description', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'friends_count', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'profile_background_image_url_https', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'profile_link_color', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'profile_image_url', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'following', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'geo_enabled', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'profile_background_color', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'profile_banner_url', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'profile_background_image_url', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'screen_name', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'is_translation_enabled', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'profile_background_tile', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'favourites_count', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'notifications', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'url', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'created_at', 'type': 'DATETIME', 'mode': 'NULLABLE'},
            {'name': 'contributors_enabled', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'time_zone', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'profile_sidebar_border_color', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'default_profile', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'is_translator', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'listed_count', 'type': 'INT64', 'mode': 'NULLABLE'},           
        ], 
        autodetect=False,
        allow_quoted_newlines=True,
        write_disposition='WRITE_TRUNCATE',
    )


    start >> [et_tweets, et_tweets_user]
    et_tweets >> stored_tweets_data_gcs
    et_tweets_user >> stored_tweets_user_data_gcs
    stored_tweets_data_gcs >> loaded_tweets_data_bigquery >> end
    stored_tweets_user_data_gcs >> loaded_tweets_user_data_bigquery >> end
# Example #17
                                    })

    pg_poc_pull = PythonOperator(
        task_id='PG_TO_FILE',
        python_callable=postgres_to_file,
        op_kwargs={
            'conn_id': 'spacecadets_postgres',
            'sql': pg_movies_dirs,
            'filename': pg_csv_filename
        },
    )

    upload_pg_file = LocalFilesystemToGCSOperator(
        task_id="PG_UPLOAD_FILE",
        src=pg_csv_filename,
        dst=GCS_FILENAME.format('movies_directors', pg_base_filename),
        bucket=BUCKET,
    )

    upload_mysql_file = LocalFilesystemToGCSOperator(
        task_id="MYSQL_UPLOAD_FILE",
        src=mysql_csv_filename,
        dst=GCS_FILENAME.format('movies_directors', mysql_base_filename),
        bucket=BUCKET,
    )

    # t1, t2 and t3 are examples of tasks created by instantiating operators
    print_date = BashOperator(
        task_id='print_date',
        bash_command='date',
    )
                            python_callable=get_initial_id,
                            op_kwargs=func_param,
                            dag=dag)

check_data = BranchPythonOperator(task_id='check_data',
                                python_callable=check_data,
                                dag=dag)

get_data_from_mongodb = PythonOperator(task_id='get_data_from_mongodb',
                                        python_callable=extract_mongodb,
                                        op_kwargs=func_param,
                                        dag=dag)

load_to_staging = LocalFilesystemToGCSOperator(task_id='load_to_staging',
                                            src=func_param['source'],
                                            dst=func_param['dest_blob'],
                                            bucket=func_param['bucket_name'],
                                            gcp_conn_id='google_cloud_default',
                                            dag=dag)

# update_bigquery_fact_table=BranchPythonOperator(task_id='update_bigquery_fact_table',
#                                             python_callable=table_existence,
#                                             dag=dag)

transform_tripdata = PythonOperator(task_id='transform_tripdata',  # pulls XCom from extract_json
                                    python_callable=transform_tripdata,
                                    op_kwargs=func_param,
                                    dag=dag)

local_parquet_to_gcs = LocalFilesystemToGCSOperator(task_id='local_parquet_to_gcs',
                                                    src=func_param['source_transform'],
                                                    dst=func_param['dest_blob_transform'],
                                                    bucket=func_param['bucket_name'],
                                                    gcp_conn_id='google_cloud_default',
                                                    dag=dag)


get_weather_json = PythonOperator(task_id='get_weather_json',
                                  python_callable=get_weather_json,
                                  op_kwargs=func_param,
                                  trigger_rule='all_done',
                                  dag=dag)

load_to_staging = LocalFilesystemToGCSOperator(
    task_id='load_to_staging',
    src=func_param['source_weather'],
    dst=func_param['dest_blob'],
    bucket=func_param['bucket_name'],
    gcp_conn_id='google_cloud_default',
    trigger_rule='all_done',
    dag=dag)

check_dataset = BranchPythonOperator(task_id='check_dataset',
                                     python_callable=table_existence,
                                     trigger_rule='all_done',
                                     dag=dag)

transform_raw_json = PythonOperator(task_id='transform_raw_json',
                                    python_callable=transform_json_data,
                                    op_kwargs=func_param,
                                    trigger_rule='none_failed_or_skipped',
                                    dag=dag)