def upload_files(**context):
    full_paths, names_only = list_local_files()
    for i in range(len(full_paths)):
        move_to_gcs = FileToGoogleCloudStorageOperator(
            task_id='to_gcs',
            src=full_paths[i],
            dst=names_only[i],
            bucket=env.GCS_BUCKET)
        logging.info('uploading file ' + names_only[i])
        move_to_gcs.execute(context)
        os.remove(full_paths[i])

def test_execute(self, mock_hook):
    mock_instance = mock_hook.return_value
    operator = FileToGoogleCloudStorageOperator(
        task_id='gcs_to_file_sensor',
        dag=self.dag,
        **self._config)
    operator.execute(None)
    mock_instance.upload.assert_called_once_with(
        bucket_name=self._config['bucket'],
        filename=self._config['src'],
        gzip=self._config['gzip'],
        mime_type=self._config['mime_type'],
        object_name=self._config['dst'])

def test_execute(self, mock_hook):
    mock_instance = mock_hook.return_value
    operator = FileToGoogleCloudStorageOperator(
        task_id='gcs_to_file_sensor',
        dag=self.dag,
        **self._config
    )
    operator.execute(None)
    mock_instance.upload.assert_called_once_with(
        bucket=self._config['bucket'],
        filename=self._config['src'],
        gzip=self._config['gzip'],
        mime_type=self._config['mime_type'],
        object=self._config['dst']
    )

def test_init(self):
    operator = FileToGoogleCloudStorageOperator(
        task_id='file_to_gcs_operator',
        dag=self.dag,
        **self._config)
    self.assertEqual(operator.src, self._config['src'])
    self.assertEqual(operator.dst, self._config['dst'])
    self.assertEqual(operator.bucket, self._config['bucket'])
    self.assertEqual(operator.mime_type, self._config['mime_type'])
    self.assertEqual(operator.gzip, self._config['gzip'])

# get dict
get_endpoints_task_id = "get_{0}_endpoint".format(endpoint_name)
file_to_gcs_task_id = "{0}_to_gcs".format(endpoint_name)

# cc exceptions
if endpoint_name == 'cc':
    get_endpoints_task_id = "get_{0}_{1}_endpoint".format(endpoint_name, i)
    file_to_gcs_task_id = "{0}_{1}_to_gcs".format(endpoint_name, i)

t3 = PythonOperator(task_id=get_endpoints_task_id,
                    python_callable=get_endpoints,
                    op_args=[e, SAVE_PATH, BASE_URL, MAXPAGESIZE],
                    dag=dag)

t4 = FileToGoogleCloudStorageOperator(
    task_id=file_to_gcs_task_id,
    google_cloud_storage_conn_id='gcs_silo',
    bucket="ps_mirror",  # "{{ var.value.gcs_ps_mirror }}"
    src="{{ task_instance.xcom_pull(task_ids='" + get_endpoints_task_id +
        "', key='file_path') }}",
    dst="powerschool/" + endpoint_name +
        "/{{ task_instance.xcom_pull(task_ids='" + get_endpoints_task_id +
        "', key='file_name') }}",
    dag=dag)

t3.set_upstream(t2)
t3.set_downstream(t4)
t1 >> t2

from airflow import DAG
from datetime import datetime, timedelta
from airflow.operators.python_operator import PythonOperator
from airflow.contrib.operators.file_to_gcs import FileToGoogleCloudStorageOperator

from comic_dm5 import scrap_dm5

default_args = {
    'owner': 'bangyuwen',
    'depends_on_past': False,
    'start_date': datetime(2018, 8, 4),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

with DAG('comic_scrap',
         schedule_interval='0 16 * * *',
         default_args=default_args) as dag:
    t1 = PythonOperator(task_id='scrap_dm5', python_callable=scrap_dm5.run)
    t2 = FileToGoogleCloudStorageOperator(
        src='/tmp/scrap_dm5/{{ yesterday_ds_nodash }}.csv',
        dst='/scrap_dm5/{{ yesterday_ds_nodash }}.csv',
        bucket='scrap-comic',
        google_cloud_storage_conn_id='google_cloud_default',
        mime_type='text/plain',
        task_id='upload_to_GCS')
    t1 >> t2

import datetime

from airflow import models
from airflow.contrib.operators.file_to_gcs import FileToGoogleCloudStorageOperator
from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator

default_dag_args = {
    'start_date': datetime.datetime(2019, 1, 14),
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=4),
    'project_id': models.Variable.get('project_id'),
}

with models.DAG('gcp_sample_dag',
                schedule_interval=None,
                default_args=default_dag_args) as dag:
    local_to_GCS_task = FileToGoogleCloudStorageOperator(
        task_id='local_to_GCS',
        src=models.Variable.get('local_src'),
        dst=models.Variable.get('gcs_dst'),
        bucket=models.Variable.get('gcs_bucket'),
        google_cloud_storage_conn_id='google_cloud_storage_default',
    )
    gcs_to_bq_task = GoogleCloudStorageToBigQueryOperator(
        task_id='gcs_to_bq',
        bucket=models.Variable.get('gcs_bucket'),
        source_objects=['data/gcpug_demo_data.json'],
        source_format='NEWLINE_DELIMITED_JSON',
        destination_project_dataset_table='gcpug_shonan.cloud_composer_demo')

    local_to_GCS_task >> gcs_to_bq_task

download_yesterdays_csv = PythonOperator(
    task_id="download_yesterdays_csv",
    python_callable=download_csv_raw,  # selenium script
)

clean_csv_before_upload = PythonOperator(
    task_id="clean_csv_before_upload",
    python_callable=csv_cleaner,
    do_xcom_push=True,
)

upload_file_to_gcs = FileToGoogleCloudStorageOperator(
    task_id="upload_file_to_gcs",
    src=f"{DATA_DOWNLOAD_FILEPATH}"
        + """{{ ti.xcom_pull(task_ids='clean_csv_before_upload') }}""",
    dst="""{{ ti.xcom_pull(task_ids='clean_csv_before_upload') }}""",
    bucket=HYDRO_DATA_LANDING_BUCKET,
    google_cloud_storage_conn_id=GOOGLE_CLOUD_STORAGE_CONN_ID,
    mime_type="parquet",
)

# schema for bigquery table
schema_fields = [
    {
        "name": "interval_start_date_time",
        "type": "TIMESTAMP",
        "mode": "REQUIRED"
    },
    {
        "name": "net_consumption_kwh",
        "type": "FLOAT",

get_geolite_task = PythonOperator(
    task_id='GetGeolite',
    python_callable=get_geolite,
    provide_context=True,
    op_kwargs={
        'remote_path': "https://geolite.maxmind.com/download/geoip/database/GeoLite2-Country-CSV.zip",
        'local_file_name': "GeoLite2-Country-CSV.zip"
    },
    dag=dag)

upload_geoip_to_gcs = FileToGoogleCloudStorageOperator(
    task_id='GeoIPToGCSOperator',
    src="/tmp/geolite_{{ ds_nodash }}/GeoLite2-Country-CSV/GeoLite2-Country-Blocks-IPv4.csv",
    dst="geolite/{{ execution_date.strftime('y=%Y/m=%m/d=%d') }}/GeoLite2-Country-Blocks-IPv4.csv",
    bucket="cloudiq-jay-dev-01-long-term-storage",
    mime_type='text/csv',
    google_cloud_storage_conn_id='google_cloud_default',
    dag=dag)

upload_geolocation_to_gcs = FileToGoogleCloudStorageOperator(
    task_id='GeoLocationToGCSOperator',
    src="/tmp/geolite_{{ ds_nodash }}/GeoLite2-Country-CSV/GeoLite2-Country-Locations-en.csv",
    dst="geolite/{{ execution_date.strftime('y=%Y/m=%m/d=%d') }}/GeoLite2-Country-Locations-en.csv",
    bucket="cloudiq-jay-dev-01-long-term-storage",
    mime_type='text/csv',
    google_cloud_storage_conn_id='google_cloud_default',
    dag=dag)

reader = csv.reader(file, delimiter=' ')
data = list(reader)
mysql.insert_rows('PROCESSED_ORDER', data, target_fields=columns)


with DAG(dag_id='31_scenario',
         catchup=False,
         schedule_interval=timedelta(days=1),
         default_args=default_args,
         user_defined_macros={
             'get_composer_gcs_bucket': get_composer_gcs_bucket,
         }) as dag:
    copy_file_to_gcs = FileToGoogleCloudStorageOperator(
        task_id='copy_file_to_gcs',
        bucket='{{ get_composer_gcs_bucket() }}',
        dst='data/31_scenario/{{ execution_date }}/order.json',
        src='/home/airflow/gcs/data/31_scenario/31_order.json',
        mime_type='application/json')

    dataflow_process = DataFlowPythonOperator(
        task_id='dataflow-process',
        py_file='/home/airflow/gcs/dags/31_dataflow.py',
        options={
            'input': 'gs://{{ get_composer_gcs_bucket() }}/data/31_scenario/{{ execution_date }}/order.json',
            'output': 'gs://{{ get_composer_gcs_bucket() }}/data/31_scenario/{{ execution_date }}/order.csv'
        },
        dataflow_default_options={
            'project': 'YOUR-PROJECT-HERE',
            "staging_location":

execution_dates = list(datetime_range(start=start_date, end=end_date))

for i, ex_date in enumerate(execution_dates):
    ed = ex_date.strftime('%Y-%m-%d')
    ep_template = {'sdt': ed}

    get_endpoints_task_id = "get_{0}_dl_endpoint_{1}".format(endpoint_name, ed)
    file_to_gcs_task_id = "{0}_{1}_to_gcs".format(endpoint_name, ed)

    t2 = PythonOperator(task_id=get_endpoints_task_id,
                        python_callable=get_endpoint_with_dates,
                        op_args=[SAVE_PATH, BASE_URL, API_KEYS],
                        templates_dict=ep_template)

    t3 = FileToGoogleCloudStorageOperator(
        task_id=file_to_gcs_task_id,
        google_cloud_storage_conn_id='gcs_silo',
        bucket="deanslist",
        src="{{ task_instance.xcom_pull(task_ids='" + get_endpoints_task_id +
            "', key='dl_file_path') }}",
        dst="TEST/" + endpoint_name +
            "/{{ task_instance.xcom_pull(task_ids='" + get_endpoints_task_id +
            "', key='dl_file_name') }}",
        dag=dag)

    t2.set_upstream(t1)
    t2.set_downstream(t3)

    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 2,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('site-visits-dag', default_args=default_args, schedule_interval='@daily')

# t1 = file_sensor(
#     task_id='local_file',
#     fs_conn_id='fs_default',
#     file_path='/Users/raghu/git-hub/demo/incoming/site-visits.json',
#     dag=dag)

t1 = DummyOperator(task_id='op1', dag=dag)

t2 = PythonOperator(task_id='python_task',
                    python_callable=convert_json_to_csv.main,
                    dag=dag)

t3 = FileToGoogleCloudStorageOperator(
    task_id='fileToGCS',
    src='/usr/local/demo/outgoing/site-visits.csv',
    dst='site-visits',
    bucket='springml-demo',
    google_cloud_storage_conn_id='google_cloud_default',
    dag=dag)

t3.set_upstream(t2)
t2.set_upstream(t1)

# Operator for parsing weather data
templated_kwargs = dict(source=historical_target, target=weather_target)
parse_weather_op = PythonOperator(task_id='parse_weather_{}'.format(city),
                                  python_callable=parse_weather_adapter,
                                  provide_context=True,
                                  templates_dict=templated_kwargs)

# Operator for parsing temperature data
templated_kwargs = dict(source=historical_target, target=temperature_target)
parse_temperature_op = PythonOperator(task_id='parse_temperature_{}'.format(city),
                                      python_callable=parse_temperature_adapter,
                                      provide_context=True,
                                      templates_dict=templated_kwargs)

parsing_tasks = [parse_weather_op, parse_temperature_op]
fetch_weather_data >> parsing_tasks

for parsing_task in parsing_tasks:
    target = parsing_task.templates_dict['target']
    filetogcs = FileToGoogleCloudStorageOperator(
        task_id='{}_to_gcs'.format(parsing_task.task_id),
        src=target,
        dst=target,
        bucket='airflow_training_weather_data',
        mime_type='text/csv')
    parsing_task >> filetogcs
    filetogcs >> drop_files

yesterday = datetime.combine(datetime.today() - timedelta(1),
                             datetime.min.time())

default_args = {
    'start_date': yesterday,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}


def get_composer_gcs_bucket():
    return os.environ['GCS_BUCKET']


with DAG(dag_id='20_gcs_operators',
         catchup=False,
         schedule_interval=timedelta(days=1),
         default_args=default_args,
         user_defined_macros={
             'get_composer_gcs_bucket': get_composer_gcs_bucket,
         }) as dag:
    copy_file = FileToGoogleCloudStorageOperator(
        task_id='copy_file',
        bucket='{{ get_composer_gcs_bucket() }}',
        dst='data/20_gcs_operators/{{ execution_date }}/20_gcs_operators.py',
        src='/home/airflow/gcs/dags/20_gcs_operators.py',
        mime_type='text/x-python-script')

    metadata={'PIP_PACKAGES': 'praw==6.5.1'},
    num_workers=2,
    num_masters=1,
    image_version='preview',
    master_machine_type='n1-standard-2',
    worker_machine_type='n1-standard-2',
    worker_disk_size=50,
    master_disk_size=50,
    region=region,
    storage_bucket=gcs_netflix_bucket,
    dag=dag)

upload_netflix_catalog_job_to_gcs = FileToGoogleCloudStorageOperator(
    task_id="upload_netflix_catalog_job_to_gcs",
    src="/airflow/dags/spark-scripts/clean_netflix_catalog.py",
    dst="spark-jobs/clean_netflix_catalog.py",
    bucket=gcs_netflix_bucket,
    google_cloud_storage_conn_id=gcp_conn,
    dag=dag)

upload_reddit_comments_job_to_gcs = FileToGoogleCloudStorageOperator(
    task_id="upload_reddit_comments_job_to_gcs",
    src="/airflow/dags/spark-scripts/consume_reddit_comments.py",
    dst="spark-jobs/consume_reddit_comments.py",
    bucket=gcs_netflix_bucket,
    google_cloud_storage_conn_id=gcp_conn,
    dag=dag)

upload_populate_shows_table_job_to_gcs = FileToGoogleCloudStorageOperator(
    task_id="upload_populate_shows_table_job_to_gcs",
    src="/airflow/dags/spark-scripts/populate_shows_table.py",

import datetime
import os

from airflow.contrib.operators.bigquery_to_gcs import BigQueryToCloudStorageOperator
from airflow.contrib.operators.file_to_gcs import FileToGoogleCloudStorageOperator
from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator
from airflow.models import DAG

dag = DAG(
    "gcp_movie_ranking",
    start_date=datetime.datetime(1995, 1, 1),
    schedule_interval="@monthly",
)

upload_ratings_to_gcs = FileToGoogleCloudStorageOperator(
    task_id="upload_ratings_to_gcs",
    src="/data/{{ execution_date.year }}/{{ execution_date.strftime('%m') }}.csv",
    bucket=os.environ["RATINGS_BUCKET"],
    dst="ratings/{{ execution_date.year }}/{{ execution_date.strftime('%m') }}.csv",
    dag=dag,
)

import_in_bigquery = GoogleCloudStorageToBigQueryOperator(
    task_id="import_in_bigquery",
    bucket=os.environ["RATINGS_BUCKET"],
    source_objects=[
        "ratings/{{ execution_date.year }}/{{ execution_date.strftime('%m') }}.csv"
    ],
    source_format="CSV",
    create_disposition="CREATE_IF_NEEDED",
    write_disposition="WRITE_TRUNCATE",
    bigquery_conn_id="gcp",

    python_callable=create_json,
    provide_context=True
)

create_xdr = PythonOperator(
    task_id='create_xdr',
    python_callable=create_xdr,
    provide_context=True
)

push = {}
for k, v in {'json': 'kre_payout_leaderboard.json',
             'csv': 'payout.csv',
             'xlsx': 'payout.xlsx',
             'xdr0': 'xdr0.xdr',
             'xdr1': 'xdr1.xdr'}.items():
    push[k] = FileToGoogleCloudStorageOperator(
        task_id=k,
        src=write_directory + '/{{ ds }}/' + v,
        dst=gcs_directory + '{{ ds }}/' + v,
        bucket='kin-kre-payouts',
        google_cloud_storage_conn_id='google_cloud_default',
    )

get_total_payout_information >> create_csv
get_total_payout_information_by_date >> create_csv
get_spender_information >> create_csv
get_holding_information >> create_csv
get_buying_information >> create_csv
get_prior_information >> create_csv

create_csv >> create_json
create_csv >> create_xdr

    start_date=datetime(2020, 1, 1),
    catchup=False)

dummy_operator = DummyOperator(task_id='dummy_task', retries=3, dag=dag)

download_operator = GoogleCloudStorageDownloadOperator(
    task_id='downloader',
    bucket='fictizia',
    object='breast-cancer-wisconsin.data',
    google_cloud_storage_conn_id='google_cloud_default',
    filename=GLOBAL_PATH,
    dag=dag)

load_operator = FileToGoogleCloudStorageOperator(
    task_id='uploader',
    bucket='fictizia',
    src=GLOBAL_OUTPUT_PATH,
    dst='my_file.json',
    google_cloud_storage_conn_id='google_cloud_default',
    dag=dag)

producer_operator = PythonOperator(task_id='producer', python_callable=produce_data, dag=dag)
consumer_operator = PythonOperator(task_id='consumer', python_callable=consume_data, dag=dag)

download_operator >> producer_operator
download_operator >> consumer_operator >> load_operator

Writing temporarily from FileSystem to Cloud Storage
t3_CA >> t4_CA in GCS
'''
writeToGCS_task, tempGCS_dir_paths = [], []
current_time = datetime.today().strftime("%Y%m%d_%H_%M")
tempGCS_dir_temp = "gs://{}/temp/{}".format(bucket_name, current_time)

for cc, input_t4, t3 in zip(sample_cc, filePaths_t3, decompressionTask):
    current_time = datetime.today().strftime("%Y%m%d_%H_%M")
    GCS_dir_archive = "gs://{}/archive/{}/{}".format(bucket_name, cc, current_time)
    tempGCS_filepath = os.path.join(tempGCS_dir_temp, cc)

    t4 = FileToGoogleCloudStorageOperator(
        task_id='uploadToGCS_{}'.format(cc),
        src=input_t4,
        dst=tempGCS_filepath,
        google_cloud_storage_conn_id=storage_connection_id,
        gzip=False,
        dag=dag)

    t4_archive = FileToGoogleCloudStorageOperator(
        task_id='uploadToGCS_archive_{}'.format(cc),
        src=input_t4,
        dst=GCS_dir_archive,
        google_cloud_storage_conn_id=storage_connection_id,
        gzip=True,
        dag=dag)

    t4.set_upstream(t3)
    t4_archive.set_upstream(t3)
    writeToGCS_task.append(t4)
    tempGCS_dir_paths.append(tempGCS_filepath)

get_membership = BigQueryToFeatherOperator(
    task_id="get_membership",
    sql=membership_qry,
    destination_file="{0}/membership.feather".format(SAVE_PATH))

get_terms = BigQueryToFeatherOperator(
    task_id="get_terms",
    sql=terms_qry,
    destination_file="{0}/terms.feather".format(SAVE_PATH))

create_att_student = CreateAttendStudentOperator(
    task_id="create_att_student",
    dirpath=SAVE_PATH)

attend_student_to_gcs = FileToGoogleCloudStorageOperator(
    task_id="attend_student_to_gcs",
    google_cloud_storage_conn_id='gcs_silo',
    bucket="idea_attendance",
    src="{0}/attend_student.feather".format(SAVE_PATH),
    dst="attend_student.feather")

# by school by grade
create_group_by_school_grade = CreateGroupedAttendance(
    task_id="ada_by_date_school_grade",
    dirpath=SAVE_PATH,
    file_name="attend_date_school_grade.feather",
    grouping_vars=['date', 'school_abbrev', 'grade_level'])

create_ytd_by_school_by_grade = GroupedADAToGroupedYTDOperator(
    task_id="create_ytd_by_school_grade",
    dirpath=SAVE_PATH,
    in_file_name="attend_date_school_grade.feather",
    out_file_name="ada_weekly_school_grade.feather",
    grouping_vars=['date', 'school_abbrev', 'grade_level'])

    task_id='verify_tokens',
    python_callable=utils.verify_access_token,
    dag=dag,
)

get_sleep = PythonOperator(
    task_id='get_sleep',
    python_callable=utils.fetch_sleep,
    provide_context=True,
    dag=dag,
)

upload_sleep = FileToGoogleCloudStorageOperator(
    task_id='upload_sleep_to_gcs',
    src='{}/{}-sleep.json'.format(Variable.get('LOCAL_STAGING'), '{{ds}}'),
    dst='{{ds}}/sleep.json',
    bucket=GCS_BUCKET,
    google_cloud_storage_conn_id='sleep-gcp',
    on_success_callback=utils.remove_temp('sleep'),
    dag=dag,
)

get_weather = PythonOperator(
    task_id='get_weather',
    python_callable=utils.fetch_weather,
    op_args=[Variable.get('WEATHERBIT_KEY')],
    provide_context=True,
    dag=dag,
)

upload_weather = FileToGoogleCloudStorageOperator(
    task_id='upload_weather_to_gcs',
    src='{}/{}-weather.json'.format(Variable.get('LOCAL_STAGING'), '{{ds}}'),

    worker_disk_size=50,
    master_disk_size=50,
    region=region,
    storage_bucket=gcs_football_bucket,
    dag=dag
)

international_football_dataset_to_datalake = InternationalFootballDataSetToDataLake(
    task_id="international_football_dataset_to_datalake",
    name="martj42/international-football-results-from-1872-to-2017",
    destination_path="/airflow/datasources/catalog/csv",
    dag=dag
)

upload_cleaning_spark_job_to_gcs = FileToGoogleCloudStorageOperator(
    task_id='upload_cleaning_spark_job_to_gcs',
    src='/airflow/dags/spark-jobs/football_dataset_cleaner.py',
    dst='spark_jobs/football_dataset_cleaner.py',
    bucket=gcs_football_bucket,
    google_cloud_storage_conn_id=gcp_conn,
    dag=dag)

upload_etl_games_spark_job_to_gcs = FileToGoogleCloudStorageOperator(
    task_id='upload_etl_games_spark_job_to_gcs',
    src='/airflow/dags/spark-jobs/etl_games_to_bigquery.py',
    dst='spark_jobs/etl_games_to_bigquery.py',
    bucket=gcs_football_bucket,
    google_cloud_storage_conn_id=gcp_conn,
    dag=dag)

cleaning_job_code_path = 'gs://' + gcs_football_bucket + '/spark_jobs/football_dataset_cleaner.py'

submit_cleaning_spark_job = DataProcPySparkOperator(
    task_id='submit_cleaning_spark_job',
    main=cleaning_job_code_path,
    cluster_name=cluster_name,

with models.DAG(
        'ejercicio_4',
        description='Ejercicio 4 - clase 2',
        default_args=default_args,
        schedule_interval=None,
) as dag:
    augmentation_operator = PythonOperator(task_id='augmentation',
                                           python_callable=data_augmentation)

    count = 1
    operators = list()
    for base, dirs, files in os.walk(IMAGE_OUTPUT_PATH):
        for file in files:
            load_operator = FileToGoogleCloudStorageOperator(
                task_id='upload_file_' + str(count),
                bucket='fictizia',
                src=os.path.join(base, file),
                dst='images_2/' + file,
                google_cloud_storage_conn_id='google_cloud_default',
                dag=dag)
            operators.append(load_operator)
            count += 1

    augmentation_operator >> operators[0]
    for i in range(1, len(operators)):
        operators[i - 1] >> operators[i]

dl_tasks = list(map(lambda t: t[0], tuple_tasks_filenames))
filenames = list(map(lambda t: t[1], tuple_tasks_filenames))

save_filenames_task = PythonOperator(task_id="create_filenames_csv",
                                     python_callable=save_to_csv,
                                     op_kwargs={
                                         'filenames': filenames,
                                         'path': filenames_path
                                     },
                                     dag=dag)

copy_filenames_to_gs = FileToGoogleCloudStorageOperator(
    task_id=f"copy_{ggi_files_to_process}_to_gs",
    src=filenames_path,
    dst=ggi_files_to_process,
    bucket=OUTPUT_BUCKET,
    task_concurrency=50,
    dag=dag)

dataproc_task = DataProcPySparkOperator(
    task_id="process_write_to_bigquery",
    cluster_name=CLUSTER_NAME,
    main=PYSPARK_MAIN_PATH,
    arguments=[f"gs://{OUTPUT_BUCKET}", ggi_files_to_process],
    pyfiles=[PYSPARK_ARCHIVE_PATH],
    dataproc_pyspark_jars=[
        'gs://spark-lib/bigquery/spark-bigquery-with-dependencies_2.12-0.16.0.jar'
    ],
    region='us-central1',
    retries=0,