Example #1
import logging
import os

from airflow.contrib.operators.file_to_gcs import FileToGoogleCloudStorageOperator


def upload_files(**context):
    # list_local_files() and env come from the surrounding project (not shown here)
    full_paths, names_only = list_local_files()
    for src, dst in zip(full_paths, names_only):
        move_to_gcs = FileToGoogleCloudStorageOperator(task_id='to_gcs',
                                                       src=src,
                                                       dst=dst,
                                                       bucket=env.GCS_BUCKET)
        logging.info('uploading file %s', dst)
        move_to_gcs.execute(context)
        os.remove(src)
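Example #1 builds a throwaway operator per file and calls its execute() by hand. For reference, the same upload can be done directly against the contrib GCS hook that the operator (and the test methods further below) delegate to. This is a minimal sketch, not part of the original example; the hook import path and the upload() keyword names are assumed to match the Airflow 1.10 contrib API.

import logging
import os

from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook


def upload_files_via_hook(full_paths, names_only, bucket,
                          gcp_conn_id='google_cloud_default'):
    """Upload each local file to GCS with the hook, then delete the local copy."""
    hook = GoogleCloudStorageHook(google_cloud_storage_conn_id=gcp_conn_id)
    for src, dst in zip(full_paths, names_only):
        logging.info('uploading file %s', dst)
        # upload(bucket, object, filename, ...) -- assumed 1.10 contrib signature
        hook.upload(bucket=bucket, object=dst, filename=src)
        os.remove(src)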
 def test_execute(self, mock_hook):
     mock_instance = mock_hook.return_value
     operator = FileToGoogleCloudStorageOperator(
         task_id='gcs_to_file_sensor', dag=self.dag, **self._config)
     operator.execute(None)
     mock_instance.upload.assert_called_once_with(
         bucket_name=self._config['bucket'],
         filename=self._config['src'],
         gzip=self._config['gzip'],
         mime_type=self._config['mime_type'],
         object_name=self._config['dst'])
 def test_execute(self, mock_hook):
     mock_instance = mock_hook.return_value
     operator = FileToGoogleCloudStorageOperator(
         task_id='gcs_to_file_sensor',
         dag=self.dag,
         **self._config
     )
     operator.execute(None)
     mock_instance.upload.assert_called_once_with(
         bucket=self._config['bucket'],
         filename=self._config['src'],
         gzip=self._config['gzip'],
         mime_type=self._config['mime_type'],
         object=self._config['dst']
     )
 def test_init(self):
     operator = FileToGoogleCloudStorageOperator(
         task_id='file_to_gcs_operator', dag=self.dag, **self._config)
     self.assertEqual(operator.src, self._config['src'])
     self.assertEqual(operator.dst, self._config['dst'])
     self.assertEqual(operator.bucket, self._config['bucket'])
     self.assertEqual(operator.mime_type, self._config['mime_type'])
     self.assertEqual(operator.gzip, self._config['gzip'])
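The test_execute and test_init methods above arrive without their test class or the mock.patch decorator that supplies mock_hook. A sketch of that scaffolding follows; the patch target and the _config fixture values are assumptions, not the original test code.

import datetime
import unittest
from unittest import mock

from airflow import DAG
from airflow.contrib.operators.file_to_gcs import FileToGoogleCloudStorageOperator


class TestFileToGoogleCloudStorageOperator(unittest.TestCase):
    # placeholder fixture values, not the original test data
    _config = {
        'src': '/tmp/fake_file.csv',
        'dst': 'fake_file.csv',
        'bucket': 'dummy-bucket',
        'mime_type': 'application/octet-stream',
        'gzip': False,
    }

    def setUp(self):
        self.dag = DAG('test_dag',
                       default_args={'start_date': datetime.datetime(2019, 1, 1)})

    # Patching the hook where the operator module imports it is what turns the
    # second test argument into mock_hook; the exact target path is an assumption.
    @mock.patch('airflow.contrib.operators.file_to_gcs.GoogleCloudStorageHook')
    def test_execute(self, mock_hook):
        operator = FileToGoogleCloudStorageOperator(
            task_id='file_to_gcs', dag=self.dag, **self._config)
        operator.execute(None)
        mock_hook.return_value.upload.assert_called_once()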
Example #5
    # build the task ids for this endpoint
    get_endpoints_task_id = "get_{0}_endpoint".format(endpoint_name)
    file_to_gcs_task_id = "{0}_to_gcs".format(endpoint_name)

    # 'cc' endpoints get an index suffix so their task ids stay unique
    if endpoint_name == 'cc':
        get_endpoints_task_id = "get_{0}_{1}_endpoint".format(endpoint_name, i)
        file_to_gcs_task_id = "{0}_{1}_to_gcs".format(endpoint_name, i)

    t3 = PythonOperator(task_id=get_endpoints_task_id,
                        python_callable=get_endpoints,
                        op_args=[e, SAVE_PATH, BASE_URL, MAXPAGESIZE],
                        dag=dag)

    t4 = FileToGoogleCloudStorageOperator(
        task_id=file_to_gcs_task_id,
        google_cloud_storage_conn_id='gcs_silo',
        bucket="ps_mirror",  # "{{ var.value.gcs_ps_mirror }}"
        src="{{ task_instance.xcom_pull(task_ids='" + get_endpoints_task_id +
            "', key='file_path') }}",
        dst="powerschool/" + endpoint_name +
            "/{{ task_instance.xcom_pull(task_ids='" + get_endpoints_task_id +
            "', key='file_name') }}",
        dag=dag)

    t3.set_upstream(t2)
    t3.set_downstream(t4)

t1 >> t2
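The templated src and dst above pull key='file_path' and key='file_name' from XCom, so get_endpoints has to push those keys itself. The original callable is not shown; below is a hypothetical sketch of the pushing side (the function body, the file-naming scheme, and the Airflow 1.x requirement that the task be created with provide_context=True are all assumptions).

import os


def get_endpoints(endpoint, save_path, base_url, max_page_size, **context):
    """Hypothetical: fetch one endpoint, save it locally, publish its path via XCom."""
    # ... call the API and write the payload under save_path (omitted) ...
    file_name = '{0}.json'.format(endpoint)          # assumed naming scheme
    file_path = os.path.join(save_path, file_name)

    ti = context['task_instance']
    ti.xcom_push(key='file_path', value=file_path)   # pulled by the src template
    ti.xcom_push(key='file_name', value=file_name)   # pulled by the dst template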
from airflow import DAG
from datetime import datetime, timedelta
from airflow.operators.python_operator import PythonOperator
from airflow.contrib.operators.file_to_gcs import FileToGoogleCloudStorageOperator

from comic_dm5 import scrap_dm5

default_args = {
    'owner': 'bangyuwen',
    'depends_on_past': False,
    'start_date': datetime(2018, 8, 4),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

with DAG('comic_scrap',
         schedule_interval='0 16 * * *',
         default_args=default_args) as dag:
    t1 = PythonOperator(task_id='scrap_dm5', python_callable=scrap_dm5.run)
    t2 = FileToGoogleCloudStorageOperator(
        src='/tmp/scrap_dm5/{{ yesterday_ds_nodash }}.csv',
        dst='/scrap_dm5/{{ yesterday_ds_nodash }}.csv',
        bucket='scrap-comic',
        google_cloud_storage_conn_id='google_cloud_default',
        mime_type='text/plain',
        task_id='upload_to_GCS')
    t1 >> t2
Example #7
import datetime

from airflow import models
from airflow.contrib.operators.file_to_gcs import FileToGoogleCloudStorageOperator
from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator

default_dag_args = {
    'start_date': datetime.datetime(2019, 1, 14),
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=4),
    'project_id': models.Variable.get('project_id'),
}

with models.DAG('gcp_sample_dag',
                schedule_interval=None,
                default_args=default_dag_args) as dag:

    local_to_GCS_task = FileToGoogleCloudStorageOperator(
        task_id='local_to_GCS',
        src=models.Variable.get('local_src'),
        dst=models.Variable.get('gcs_dst'),
        bucket=models.Variable.get('gcs_bucket'),
        google_cloud_storage_conn_id='google_cloud_storage_default',
    )

    gcs_to_bq_task = GoogleCloudStorageToBigQueryOperator(
        task_id='gcs_to_bq',
        bucket=models.Variable.get('gcs_bucket'),
        source_objects=['data/gcpug_demo_data.json'],
        source_format='NEWLINE_DELIMITED_JSON',
        destination_project_dataset_table='gcpug_shonan.cloud_composer_demo')

    local_to_GCS_task >> gcs_to_bq_task
    download_yesterdays_csv = PythonOperator(
        task_id="download_yesterdays_csv",
        python_callable=download_csv_raw,  # selenium script
    )

    clean_csv_before_upload = PythonOperator(
        task_id="clean_csv_before_upload",
        python_callable=csv_cleaner,
        do_xcom_push=True,
    )

    upload_file_to_gcs = FileToGoogleCloudStorageOperator(
        task_id="upload_file_to_gcs",
        src=f"{DATA_DOWNLOAD_FILEPATH}" +
        """{{ ti.xcom_pull(task_ids='clean_csv_before_upload') }}""",
        dst="""{{ ti.xcom_pull(task_ids='clean_csv_before_upload') }}""",
        bucket=HYDRO_DATA_LANDING_BUCKET,
        google_cloud_storage_conn_id=GOOGLE_CLOUD_STORAGE_CONN_ID,
        mime_type="parquet",
    )

    # schema for bigquery table
    schema_fields = [
        {
            "name": "interval_start_date_time",
            "type": "TIMESTAMP",
            "mode": "REQUIRED"
        },
        {
            "name": "net_consumption_kwh",
            "type": "FLOAT",
Example #9
get_geolite_task = PythonOperator(
    task_id='GetGeolite',
    python_callable=get_geolite,
    provide_context=True,
    op_kwargs={
        'remote_path':
        "https://geolite.maxmind.com/download/geoip/database/GeoLite2-Country-CSV.zip",
        'local_file_name': "GeoLite2-Country-CSV.zip"
    },
    dag=dag)

upload_geoip_to_gcs = FileToGoogleCloudStorageOperator(
    task_id='GeoIPToGCSOperator',
    src=
    "/tmp/geolite_{{ ds_nodash }}/GeoLite2-Country-CSV/GeoLite2-Country-Blocks-IPv4.csv",
    dst=
    "geolite/{{ execution_date.strftime('y=%Y/m=%m/d=%d') }}/GeoLite2-Country-Blocks-IPv4.csv",
    bucket="cloudiq-jay-dev-01-long-term-storage",
    mime_type='text/csv',
    google_cloud_storage_conn_id='google_cloud_default',
    dag=dag)

upload_geolocation_to_gcs = FileToGoogleCloudStorageOperator(
    task_id='GeoLocationToGCSOperator',
    src=
    "/tmp/geolite_{{ ds_nodash }}/GeoLite2-Country-CSV/GeoLite2-Country-Locations-en.csv",
    dst=
    "geolite/{{ execution_date.strftime('y=%Y/m=%m/d=%d') }}/GeoLite2-Country-Locations-en.csv",
    bucket="cloudiq-jay-dev-01-long-term-storage",
    mime_type='text/csv',
    google_cloud_storage_conn_id='google_cloud_default',
    dag=dag)
        reader = csv.reader(file, delimiter=' ')
        data = list(reader)
        mysql.insert_rows('PROCESSED_ORDER', data, target_fields=columns)


with DAG(dag_id='31_scenario',
         catchup=False,
         schedule_interval=timedelta(days=1),
         default_args=default_args,
         user_defined_macros={
             'get_composer_gcs_bucket': get_composer_gcs_bucket,
         }) as dag:

    copy_file_to_gcs = FileToGoogleCloudStorageOperator(
        task_id='copy_file_to_gcs',
        bucket='{{ get_composer_gcs_bucket() }}',
        dst='data/31_scenario/{{ execution_date }}/order.json',
        src='/home/airflow/gcs/data/31_scenario/31_order.json',
        mime_type='application/json')

    dataflow_process = DataFlowPythonOperator(
        task_id='dataflow-process',
        py_file='/home/airflow/gcs/dags/31_dataflow.py',
        options={
            'input':
            'gs://{{ get_composer_gcs_bucket() }}/data/31_scenario/{{ execution_date }}/order.json',
            'output':
            'gs://{{ get_composer_gcs_bucket() }}/data/31_scenario/{{ execution_date }}/order.csv'
        },
        dataflow_default_options={
            'project': 'YOUR-PROJECT-HERE',
            "staging_location":
Example #11
    execution_dates = list((datetime_range(start=start_date, end=end_date)))

    for i, ex_date in enumerate(execution_dates):

        ed = ex_date.strftime('%Y-%m-%d')

        ep_template = {'sdt': ed}

        get_endpoints_task_id = "get_{0}_dl_endpoint_{1}".format(
            endpoint_name, ed)
        file_to_gcs_task_id = "{0}_{1}_to_gcs".format(endpoint_name, ed)

        t2 = PythonOperator(task_id=get_endpoints_task_id,
                            python_callable=get_endpoint_with_dates,
                            op_args=[SAVE_PATH, BASE_URL, API_KEYS],
                            templates_dict=ep_template)

        t3 = FileToGoogleCloudStorageOperator(
            task_id=file_to_gcs_task_id,
            google_cloud_storage_conn_id='gcs_silo',
            bucket="deanslist",
            src="{{ task_instance.xcom_pull(task_ids='" +
                get_endpoints_task_id + "', key='dl_file_path') }}",
            dst="TEST/" + endpoint_name +
                "/{{ task_instance.xcom_pull(task_ids='" + get_endpoints_task_id +
                "', key='dl_file_name') }}",
            dag=dag)

        t2.set_upstream(t1)
        t2.set_downstream(t3)
Example #12
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 2,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('site-visits-dag',
          default_args=default_args,
          schedule_interval='@daily')

#t1 = file_sensor(
#    task_id='local_file',
#    fs_conn_id='fs_default',
#    file_path='/Users/raghu/git-hub/demo/incoming/site-visits.json',
#    dag=dag)
t1 = DummyOperator(task_id='op1', dag=dag)
t2 = PythonOperator(task_id='python_task',
                    python_callable=convert_json_to_csv.main,
                    dag=dag)
t3 = FileToGoogleCloudStorageOperator(
    task_id='fileToGCS',
    src='/usr/local/demo/outgoing/site-visits.csv',
    dst='site-visits',
    bucket='springml-demo',
    google_cloud_storage_conn_id='google_cloud_default',
    dag=dag)

t3.set_upstream(t2)
t2.set_upstream(t1)
Example #13
        # Operator for parsing weather data
        templated_kwargs = dict(source=historical_target,
                                target=weather_target)

        parse_weather_op = PythonOperator(task_id='parse_weather_{}'.format(city),
                                          python_callable=parse_weather_adapter,
                                          provide_context=True,
                                          templates_dict=templated_kwargs)

        # Operator for parsing temperature data
        templated_kwargs = dict(source=historical_target,
                                target=temperature_target)

        parse_temperature_op = PythonOperator(task_id='parse_temperature_{}'.format(city),
                                              python_callable=parse_temperature_adapter,
                                              provide_context=True,
                                              templates_dict=templated_kwargs)

        parsing_tasks = [parse_weather_op, parse_temperature_op]
        fetch_weather_data >> parsing_tasks

        for parsing_task in parsing_tasks:
            target = parsing_task.templates_dict['target']
            filetogcs = FileToGoogleCloudStorageOperator(task_id='{}_to_gcs'.format(parsing_task.task_id),
                                                         src=target,
                                                         dst=target,
                                                         bucket='airflow_training_weather_data',
                                                         mime_type='text/csv')
            parsing_task >> filetogcs
            filetogcs >> drop_files
Example #14
yesterday = datetime.combine(datetime.today() - timedelta(1),
                             datetime.min.time())

default_args = {
    'start_date': yesterday,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}


def get_composer_gcs_bucket():
    return os.environ['GCS_BUCKET']


with DAG(dag_id='20_gcs_operators',
         catchup=False,
         schedule_interval=timedelta(days=1),
         default_args=default_args,
         user_defined_macros={
             'get_composer_gcs_bucket': get_composer_gcs_bucket,
         }) as dag:

    copy_file = FileToGoogleCloudStorageOperator(
        task_id='copy_file',
        bucket='{{ get_composer_gcs_bucket() }}',
        dst='data/20_gcs_operators/{{ execution_date }}/20_gcs_operators.py',
        src='/home/airflow/gcs/dags/20_gcs_operators.py',
        mime_type='text/x-python-script')
Example #15
    metadata={'PIP_PACKAGES': 'praw==6.5.1'},
    num_workers=2,
    num_masters=1,
    image_version='preview',
    master_machine_type='n1-standard-2',
    worker_machine_type='n1-standard-2',
    worker_disk_size=50,
    master_disk_size=50,
    region=region,
    storage_bucket=gcs_netflix_bucket,
    dag=dag)

upload_netflix_catalog_job_to_gcs = FileToGoogleCloudStorageOperator(
    task_id="upload_netflix_catalog_job_to_gcs",
    src="/airflow/dags/spark-scripts/clean_netflix_catalog.py",
    dst="spark-jobs/clean_netflix_catalog.py",
    bucket=gcs_netflix_bucket,
    google_cloud_storage_conn_id=gcp_conn,
    dag=dag)

upload_reddit_comments_job_to_gcs = FileToGoogleCloudStorageOperator(
    task_id="upload_reddit_comments_job_to_gcs",
    src="/airflow/dags/spark-scripts/consume_reddit_comments.py",
    dst="spark-jobs/consume_reddit_comments.py",
    bucket=gcs_netflix_bucket,
    google_cloud_storage_conn_id=gcp_conn,
    dag=dag)

upload_populate_shows_table_job_to_gcs = FileToGoogleCloudStorageOperator(
    task_id="upload_populate_shows_table_job_to_gcs",
    src="/airflow/dags/spark-scripts/populate_shows_table.py",
from airflow.contrib.operators.bigquery_to_gcs import BigQueryToCloudStorageOperator
from airflow.contrib.operators.file_to_gcs import FileToGoogleCloudStorageOperator
from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator
from airflow.models import DAG

dag = DAG(
    "gcp_movie_ranking",
    start_date=datetime.datetime(1995, 1, 1),
    schedule_interval="@monthly",
)

upload_ratings_to_gcs = FileToGoogleCloudStorageOperator(
    task_id="upload_ratings_to_gcs",
    src=
    "/data/{{ execution_date.year }}/{{ execution_date.strftime('%m') }}.csv",
    bucket=os.environ["RATINGS_BUCKET"],
    dst=
    "ratings/{{ execution_date.year }}/{{ execution_date.strftime('%m') }}.csv",
    dag=dag,
)

import_in_bigquery = GoogleCloudStorageToBigQueryOperator(
    task_id="import_in_bigquery",
    bucket=os.environ["RATINGS_BUCKET"],
    source_objects=[
        "ratings/{{ execution_date.year }}/{{ execution_date.strftime('%m') }}.csv"
    ],
    source_format="CSV",
    create_disposition="CREATE_IF_NEEDED",
    write_disposition="WRITE_TRUNCATE",
    bigquery_conn_id="gcp",
        python_callable=create_json,
        provide_context=True
    )

    create_xdr = PythonOperator(
        task_id='create_xdr',
        python_callable=create_xdr,
        provide_context=True
    )

    push = {}
    for k, v in {'json': 'kre_payout_leaderboard.json',
                 'csv': 'payout.csv',
                 'xlsx': 'payout.xlsx',
                 'xdr0': 'xdr0.xdr',
                 'xdr1': 'xdr1.xdr'}.items():
        push[k] = FileToGoogleCloudStorageOperator(
            task_id=k,
            src=write_directory + '/{{ ds }}/' + v,
            dst=gcs_directory + '{{ ds }}/' + v,
            bucket='kin-kre-payouts',
            google_cloud_storage_conn_id='google_cloud_default',
        )

    get_total_payout_information >> create_csv 
    get_total_payout_information_by_date >> create_csv
    get_spender_information >> create_csv
    get_holding_information >> create_csv
    get_buying_information >> create_csv
    get_prior_information >> create_csv

    create_csv >> create_json

    create_csv >> create_xdr 
Example #18
          start_date=datetime(2020, 1, 1),
          catchup=False)

dummy_operator = DummyOperator(task_id='dummy_task', retries=3, dag=dag)

download_operator = GoogleCloudStorageDownloadOperator(
    task_id='downloader',
    bucket='fictizia',
    object='breast-cancer-wisconsin.data',
    google_cloud_storage_conn_id='google_cloud_default',
    filename=GLOBAL_PATH,
    dag=dag)

load_operator = FileToGoogleCloudStorageOperator(
    task_id='uploader',
    bucket='fictizia',
    src=GLOBAL_OUTPUT_PATH,
    dst='my_file.json',
    google_cloud_storage_conn_id='google_cloud_default',
    dag=dag)

producer_operator = PythonOperator(task_id='producer',
                                   python_callable=produce_data,
                                   dag=dag)
consumer_operator = PythonOperator(task_id='consumer',
                                   python_callable=consume_data,
                                   dag=dag)

download_operator >> producer_operator
download_operator >> consumer_operator >> load_operator
Example #19
Writing temporary files from the local filesystem to Cloud Storage.

t3_CA >> t4_CA uploads the decompressed output to GCS.
'''

writeToGCS_task, tempGCS_dir_paths = [], []
current_time = datetime.today().strftime("%Y%m%d_%H_%M")
temp_prefix = "temp/{}".format(current_time)                      # object prefix inside the bucket
tempGCS_dir_temp = "gs://{}/{}".format(bucket_name, temp_prefix)  # full URI kept for downstream tasks

for cc, input_t4, t3 in zip(sample_cc, filePaths_t3, decompressionTask):
    current_time = datetime.today().strftime("%Y%m%d_%H_%M")
    GCS_dir_archive = "archive/{}/{}".format(cc, current_time)
    tempGCS_object = "{}/{}".format(temp_prefix, cc)
    tempGCS_filepath = os.path.join(tempGCS_dir_temp, cc)
    # dst is an object path within the bucket; bucket itself is a required argument
    t4 = FileToGoogleCloudStorageOperator(task_id='uploadToGCS_{}'.format(cc),
                                          src=input_t4,
                                          dst=tempGCS_object,
                                          bucket=bucket_name,
                                          google_cloud_storage_conn_id=storage_connection_id,
                                          gzip=False,
                                          dag=dag)
    t4_archive = FileToGoogleCloudStorageOperator(task_id='uploadToGCS_archive_{}'.format(cc),
                                                  src=input_t4,
                                                  dst=GCS_dir_archive,
                                                  bucket=bucket_name,
                                                  google_cloud_storage_conn_id=storage_connection_id,
                                                  gzip=True,
                                                  dag=dag)
    t4.set_upstream(t3)
    t4_archive.set_upstream(t3)
    writeToGCS_task.append(t4)
    tempGCS_dir_paths.append(tempGCS_filepath)


Example #20
    get_membership = BigQueryToFeatherOperator(
        task_id="get_membership",
        sql=membership_qry,
        destination_file="{0}/membership.feather".format(SAVE_PATH))

    get_terms = BigQueryToFeatherOperator(
        task_id="get_terms",
        sql=terms_qry,
        destination_file="{0}/terms.feather".format(SAVE_PATH))

    create_att_student = CreateAttendStudentOperator(
        task_id="create_att_student", dirpath=SAVE_PATH)

    attend_student_to_gcs = FileToGoogleCloudStorageOperator(
        task_id="attend_student_to_gcs",
        google_cloud_storage_conn_id='gcs_silo',
        bucket="idea_attendance",
        src="{0}/attend_student.feather".format(SAVE_PATH),
        dst="attend_student.feather")
    # by school by grade
    create_group_by_school_grade = CreateGroupedAttendance(
        task_id="ada_by_date_school_grade",
        dirpath=SAVE_PATH,
        file_name="attend_date_school_grade.feather",
        grouping_vars=['date', 'school_abbrev', 'grade_level'])

    create_ytd_by_school_by_grade = GroupedADAToGroupedYTDOperator(
        task_id="create_ytd_by_school_grade",
        dirpath=SAVE_PATH,
        in_file_name="attend_date_school_grade.feather",
        out_file_name="ada_weekly_school_grade.feather",
        grouping_vars=['date', 'school_abbrev', 'grade_level'])
    task_id='verify_tokens',
    python_callable=utils.verify_access_token,
    dag=dag,
)

get_sleep = PythonOperator(
    task_id='get_sleep',
    python_callable=utils.fetch_sleep,
    provide_context=True,
    dag=dag,
)
upload_sleep = FileToGoogleCloudStorageOperator(
    task_id='upload_sleep_to_gcs',
    src='{}/{}-sleep.json'.format(Variable.get('LOCAL_STAGING'), '{{ds}}'),
    dst='{{ds}}/sleep.json',
    bucket=GCS_BUCKET,
    google_cloud_storage_conn_id='sleep-gcp',
    on_success_callback=utils.remove_temp('sleep'),
    dag=dag,
)
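upload_sleep passes utils.remove_temp('sleep') rather than a bare function, so remove_temp must be a factory that returns the actual on_success_callback. The utils module is not shown above; this is a hypothetical sketch of such a factory, reusing the '{LOCAL_STAGING}/{ds}-<name>.json' layout seen in the src templates.

import os

from airflow.models import Variable


def remove_temp(name):
    """Hypothetical factory: build an on_success_callback that deletes the staged file."""
    def _cleanup(context):
        path = '{}/{}-{}.json'.format(Variable.get('LOCAL_STAGING'),
                                      context['ds'], name)
        if os.path.exists(path):
            os.remove(path)
    return _cleanup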

get_weather = PythonOperator(
    task_id='get_weather',
    python_callable=utils.fetch_weather,
    op_args=[Variable.get('WEATHERBIT_KEY')],
    provide_context=True,
    dag=dag,
)
upload_weather = FileToGoogleCloudStorageOperator(
    task_id='upload_weather_to_gcs',
    src='{}/{}-weather.json'.format(Variable.get('LOCAL_STAGING'), '{{ds}}'),
    worker_disk_size=50,
    master_disk_size=50,
    region=region,
    storage_bucket=gcs_football_bucket,
    dag=dag
)

international_football_dataset_to_datalake = InternationalFootballDataSetToDataLake(
    task_id="international_football_dataset_to_datalake",
    name="martj42/international-football-results-from-1872-to-2017",
    destination_path="/airflow/datasources/catalog/csv", dag=dag
)

upload_cleaning_spark_job_to_gcs = FileToGoogleCloudStorageOperator(task_id='upload_cleaning_spark_job_to_gcs',
                                                                    src='/airflow/dags/spark-jobs/football_dataset_cleaner.py',
                                                                    dst='spark_jobs/football_dataset_cleaner.py',
                                                                    bucket=gcs_football_bucket,
                                                                    google_cloud_storage_conn_id=gcp_conn,
                                                                    dag=dag)

upload_etl_games_spark_job_to_gcs = FileToGoogleCloudStorageOperator(task_id='upload_etl_games_spark_job_to_gcs',
                                                                     src='/airflow/dags/spark-jobs/etl_games_to_bigquery.py',
                                                                     dst='spark_jobs/etl_games_to_bigquery.py',
                                                                     bucket=gcs_football_bucket,
                                                                     google_cloud_storage_conn_id=gcp_conn,
                                                                     dag=dag)

cleaning_job_code_path = 'gs://' + gcs_football_bucket + '/spark_jobs/football_dataset_cleaner.py'
submit_cleaning_spark_job = DataProcPySparkOperator(
    task_id='submit_cleaning_spark_job',
    main=cleaning_job_code_path,
    cluster_name=cluster_name,

with models.DAG(
        'ejercicio_4',
        description='Ejercicio 4 - clase 2',
        default_args=default_args,
        schedule_interval=None,
) as dag:

    augmentation_operator = PythonOperator(task_id='augmentation',
                                           python_callable=data_augmentation)

    count = 1
    operators = list()

    for base, dirs, files in os.walk(IMAGE_OUTPUT_PATH):
        for file in files:
            load_operator = FileToGoogleCloudStorageOperator(
                task_id='upload_file_' + str(count),
                bucket='fictizia',
                src=os.path.join(base, file),  # base from os.walk may lack a trailing separator
                dst='images_2/' + file,
                google_cloud_storage_conn_id='google_cloud_default',
                dag=dag)
            operators.append(load_operator)
            count += 1

    augmentation_operator >> operators[0]

    for i in range(1, len(operators)):
        operators[i - 1] >> operators[i]
Example #24
    dl_tasks = list(map(lambda t: t[0], tuple_tasks_filenames))
    filenames = list(map(lambda t: t[1], tuple_tasks_filenames))

    save_filenames_task = PythonOperator(task_id="create_filenames_csv",
                                         python_callable=save_to_csv,
                                         op_kwargs={
                                             'filenames': filenames,
                                             'path': filenames_path
                                         },
                                         dag=dag)

    copy_filenames_to_gs = FileToGoogleCloudStorageOperator(
        task_id=f"copy_{ggi_files_to_process}_to_gs",
        src=filenames_path,
        dst=ggi_files_to_process,
        bucket=OUTPUT_BUCKET,
        task_concurrency=50,
        dag=dag)

    dataproc_task = DataProcPySparkOperator(
        task_id="process_write_to_bigquery",
        cluster_name=CLUSTER_NAME,
        main=PYSPARK_MAIN_PATH,
        arguments=[f"gs://{OUTPUT_BUCKET}", ggi_files_to_process],
        pyfiles=[PYSPARK_ARCHIVE_PATH],
        dataproc_pyspark_jars=[
            'gs://spark-lib/bigquery/spark-bigquery-with-dependencies_2.12-0.16.0.jar'
        ],
        region='us-central1',
        retries=0,