def test_exec_success(self, gcs_hook_mock_class, pg_hook_mock_class):
    """Test the execute function in case where the run is successful."""
    op = PostgresToGoogleCloudStorageOperator(
        task_id=TASK_ID,
        postgres_conn_id=POSTGRES_CONN_ID,
        sql=SQL,
        bucket=BUCKET,
        filename=FILENAME)

    pg_hook_mock = pg_hook_mock_class.return_value
    pg_hook_mock.get_conn().cursor().__iter__.return_value = iter(ROWS)
    pg_hook_mock.get_conn().cursor().description = CURSOR_DESCRIPTION

    gcs_hook_mock = gcs_hook_mock_class.return_value

    def _assert_upload(bucket, obj, tmp_filename, content_type):
        self.assertEqual(BUCKET, bucket)
        self.assertEqual(FILENAME.format(0), obj)
        self.assertEqual('application/json', content_type)
        with open(tmp_filename, 'rb') as f:
            self.assertEqual(b''.join(NDJSON_LINES), f.read())

    gcs_hook_mock.upload.side_effect = _assert_upload

    op.execute(None)

    pg_hook_mock_class.assert_called_once_with(postgres_conn_id=POSTGRES_CONN_ID)
    pg_hook_mock.get_conn().cursor().execute.assert_called_once_with(SQL, None)
def test_schema_file(self, gcs_hook_mock_class, pg_hook_mock_class):
    """Test writing schema files."""
    pg_hook_mock = pg_hook_mock_class.return_value
    pg_hook_mock.get_conn().cursor().__iter__.return_value = iter(ROWS)
    pg_hook_mock.get_conn().cursor().description = CURSOR_DESCRIPTION

    gcs_hook_mock = gcs_hook_mock_class.return_value

    def _assert_upload(bucket, obj, tmp_filename, content_type):
        if obj == SCHEMA_FILENAME:
            with open(tmp_filename, 'rb') as f:
                self.assertEqual(SCHEMA_JSON, f.read())

    gcs_hook_mock.upload.side_effect = _assert_upload

    op = PostgresToGoogleCloudStorageOperator(
        task_id=TASK_ID,
        sql=SQL,
        bucket=BUCKET,
        filename=FILENAME,
        schema_filename=SCHEMA_FILENAME)
    op.execute(None)

    # once for the file and once for the schema
    self.assertEqual(2, gcs_hook_mock.upload.call_count)
def test_file_splitting(self, gcs_hook_mock_class, pg_hook_mock_class):
    """Test that ndjson is split by approx_max_file_size_bytes param."""
    pg_hook_mock = pg_hook_mock_class.return_value
    pg_hook_mock.get_conn().cursor().__iter__.return_value = iter(ROWS)
    pg_hook_mock.get_conn().cursor().description = CURSOR_DESCRIPTION

    gcs_hook_mock = gcs_hook_mock_class.return_value
    expected_upload = {
        FILENAME.format(0): b''.join(NDJSON_LINES[:2]),
        FILENAME.format(1): NDJSON_LINES[2],
    }

    def _assert_upload(bucket, obj, tmp_filename, content_type):
        self.assertEqual(BUCKET, bucket)
        self.assertEqual('application/json', content_type)
        with open(tmp_filename, 'rb') as f:
            self.assertEqual(expected_upload[obj], f.read())

    gcs_hook_mock.upload.side_effect = _assert_upload

    op = PostgresToGoogleCloudStorageOperator(
        task_id=TASK_ID,
        sql=SQL,
        bucket=BUCKET,
        filename=FILENAME,
        approx_max_file_size_bytes=len(expected_upload[FILENAME.format(0)]))
    op.execute(None)
def test_empty_query(self, gcs_hook_mock_class):
    """If the sql returns no rows, we should not upload any files"""
    gcs_hook_mock = gcs_hook_mock_class.return_value

    op = PostgresToGoogleCloudStorageOperator(
        task_id=TASK_ID,
        sql='SELECT * FROM postgres_to_gcs_operator_empty',
        bucket=BUCKET,
        filename=FILENAME)
    op.execute(None)

    assert not gcs_hook_mock.upload.called, 'No data means no files in the bucket'
def test_exec_success(self, gcs_hook_mock_class):
    """Test the execute function in case where the run is successful."""
    op = PostgresToGoogleCloudStorageOperator(
        task_id=TASK_ID,
        postgres_conn_id=POSTGRES_CONN_ID,
        sql=SQL,
        bucket=BUCKET,
        filename=FILENAME)

    gcs_hook_mock = gcs_hook_mock_class.return_value

    def _assert_upload(bucket, obj, tmp_filename, content_type):
        self.assertEqual(BUCKET, bucket)
        self.assertEqual(FILENAME.format(0), obj)
        self.assertEqual('application/json', content_type)
        with open(tmp_filename, 'rb') as f:
            self.assertEqual(b''.join(NDJSON_LINES), f.read())

    gcs_hook_mock.upload.side_effect = _assert_upload

    op.execute(None)
def test_schema_file(self, gcs_hook_mock_class):
    """Test writing schema files."""
    gcs_hook_mock = gcs_hook_mock_class.return_value

    def _assert_upload(bucket, obj, tmp_filename, mime_type, gzip):  # pylint: disable=unused-argument
        if obj == SCHEMA_FILENAME:
            with open(tmp_filename, 'rb') as f:
                self.assertEqual(SCHEMA_JSON, f.read())

    gcs_hook_mock.upload.side_effect = _assert_upload

    op = PostgresToGoogleCloudStorageOperator(
        task_id=TASK_ID,
        sql=SQL,
        bucket=BUCKET,
        filename=FILENAME,
        schema_filename=SCHEMA_FILENAME)
    op.execute(None)

    # once for the file and once for the schema
    self.assertEqual(2, gcs_hook_mock.upload.call_count)
def test_file_splitting(self, gcs_hook_mock_class):
    """Test that ndjson is split by approx_max_file_size_bytes param."""
    gcs_hook_mock = gcs_hook_mock_class.return_value
    expected_upload = {
        FILENAME.format(0): b''.join(NDJSON_LINES[:2]),
        FILENAME.format(1): NDJSON_LINES[2],
    }

    def _assert_upload(bucket, obj, tmp_filename, content_type):
        self.assertEqual(BUCKET, bucket)
        self.assertEqual('application/json', content_type)
        with open(tmp_filename, 'rb') as f:
            self.assertEqual(expected_upload[obj], f.read())

    gcs_hook_mock.upload.side_effect = _assert_upload

    op = PostgresToGoogleCloudStorageOperator(
        task_id=TASK_ID,
        sql=SQL,
        bucket=BUCKET,
        filename=FILENAME,
        approx_max_file_size_bytes=len(expected_upload[FILENAME.format(0)]))
    op.execute(None)
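The test methods above assume module-level fixtures (TASK_ID, SQL, BUCKET, FILENAME, ROWS, NDJSON_LINES, etc.) and mock.patch decorators that the listing does not show. A minimal sketch of that scaffolding is below; the constants, patch targets, and row data here are illustrative assumptions, not taken from the snippets, and the real test module defines its own values:

import unittest
from unittest import mock

from airflow.contrib.operators.postgres_to_gcs_operator import (
    PostgresToGoogleCloudStorageOperator,
)

# Illustrative fixture values (assumed, not from the original test module).
TASK_ID = 'test-postgres-to-gcs'
POSTGRES_CONN_ID = 'postgres_default'
SQL = 'SELECT * FROM test_table'
BUCKET = 'gs_bucket'
FILENAME = 'test_{}.ndjson'
SCHEMA_FILENAME = 'schema_test.json'
ROWS = [
    ('mock_row_content_1', 42),
    ('mock_row_content_2', 43),
    ('mock_row_content_3', 44),
]
CURSOR_DESCRIPTION = (('some_str', 0), ('some_num', 1005))
NDJSON_LINES = [
    b'{"some_num": 42, "some_str": "mock_row_content_1"}\n',
    b'{"some_num": 43, "some_str": "mock_row_content_2"}\n',
    b'{"some_num": 44, "some_str": "mock_row_content_3"}\n',
]
SCHEMA_JSON = (
    b'[{"mode": "NULLABLE", "name": "some_str", "type": "STRING"}, '
    b'{"mode": "NULLABLE", "name": "some_num", "type": "INTEGER"}]'
)


class TestPostgresToGoogleCloudStorageOperator(unittest.TestCase):
    # The hooks are patched where the operator module imports them; decorators
    # apply bottom-up, so the GCS hook mock arrives as the first extra argument.
    @mock.patch('airflow.contrib.operators.postgres_to_gcs_operator.PostgresHook')
    @mock.patch('airflow.contrib.operators.postgres_to_gcs_operator.GoogleCloudStorageHook')
    def test_exec_success(self, gcs_hook_mock_class, pg_hook_mock_class):
        ...  # body as in the examples above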
import airflow
from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.contrib.operators.postgres_to_gcs_operator import PostgresToGoogleCloudStorageOperator
from datetime import timedelta, datetime

args = {
    "owner": "godatadriven",
    "start_date": airflow.utils.dates.days_ago(10),
}

dag = DAG(
    dag_id='postgres',
    default_args=args,
    description="DAG for using " + __name__,
    schedule_interval=timedelta(hours=2, minutes=30),
)

query = 'SELECT * FROM land_registry_price_paid_uk LIMIT 10'

with dag:
    pg_to_gcs = PostgresToGoogleCloudStorageOperator(
        task_id='pg_to_gcs',
        sql=query,
        bucket='airflow-postgres-1234',
        filename='output-{{ ds }}',
        google_cloud_storage_conn_id='google_cloud_storage_default')

    end = BashOperator(
        task_id='end',
        bash_command='echo "That\'s it folks!"',
    )

    pg_to_gcs >> end
with DAG(dag_id="usecase", default_args=args, schedule_interval="@daily") as dag: check_date = ShortCircuitOperator( task_id="check_if_before_end_of_last_year", python_callable=check_date, provide_context=True, ) psql_to_gcs = PostgresToGoogleCloudStorageOperator( task_id="postgres_to_gcs", sql= "SELECT * FROM land_registry_price_paid_uk WHERE transfer_date = '{{ ds }}'", bucket="airflow-training-data-land-registry-krisgeus", filename= "usecase/land_registry_price_paid_uk/{{ ds }}/properties_{}.json", postgres_conn_id="airflow-training-postgres", ) #https://api.exchangeratesapi.io/history?start_at=2018-01-01&end_at=2018-01-02&symbols=EUR&base=GBP http_to_gcs = HttpToGcsOperator( task_id="get_currency_" + currency, method="GET", endpoint= f"/history?start_at={{{{ ds }}}}&end_at={{{{ tomorrow_ds }}}}&base=GBP&symbols={currency}", http_conn_id="currency-http", gcs_conn_id="google_cloud_storage_default", gcs_path=f"usecase/currency/{{{{ ds }}}}-{currency}.json", gcs_bucket=f"{bucket_name}",
dag = DAG(
    dag_id="exercise4",
    default_args=args,
    schedule_interval="@daily",
)

start = DummyOperator(
    task_id="start",
    dag=dag,
)

get_data = PostgresToGoogleCloudStorageOperator(
    task_id="get_data",
    sql="SELECT * FROM land_registry_price_paid_uk WHERE transfer_date = '{{ ds }}'",
    bucket="airflow-training-data2",
    filename="{{ ds }}/properties_{}.json",
    postgres_conn_id="airflow_exercise4",
    dag=dag,
)

get_exchange_rate = HttpToGcsOperator(
    task_id="get_exchange",
    endpoint="/history?start_at={{yesterday_ds}}&end_at={{ds}}&symbols=EUR&base=GBP",
    gcs_bucket="airflow-training-data2",
    gcs_path="currency/{{ ds }}-" + currency + ".json",
    http_conn_id="airflow-training-currency-http",
    gcs_conn_id="google_cloud_default",
    dag=dag,
)
from datetime import timedelta, datetime

import pendulum
import airflow
from airflow.models import DAG
from airflow.contrib.operators.postgres_to_gcs_operator import (
    PostgresToGoogleCloudStorageOperator,
)

default_args = {"owner": "hudson", "email": "*****@*****.**"}

with airflow.DAG(
    dag_id="test_dag_8",
    start_date=datetime(2019, 9, 28),
    schedule_interval=None,
    catchup=False,
    default_args=default_args,
) as dag:
    sql = "select * from land_registry_price_paid_uk"
    bucket_name = "airflow-training-data-hudson"

    op = PostgresToGoogleCloudStorageOperator(
        task_id="land_registry_price_paid_uk",
        postgres_conn_id="trainingdb",
        sql=sql,
        bucket=bucket_name,
        filename="land_registry_price_paid_uk_{}.json",
    )
from datetime import timedelta

import airflow
from airflow.models import DAG
from airflow.contrib.operators.postgres_to_gcs_operator import PostgresToGoogleCloudStorageOperator
# from airflow.operators.postgres_to_gcs_operator import
# from airflow.providers.google.cloud.operators.postgres_to_gcs_operator import Postg

args = {
    'owner': 'JDreijer',
    'start_date': airflow.utils.dates.days_ago(2),
}

dag = DAG(
    dag_id='exercise5',
    default_args=args,
    schedule_interval=None,
    dagrun_timeout=timedelta(minutes=60),
)

t1 = PostgresToGoogleCloudStorageOperator(
    task_id='from_pg_to_gcs',
    postgres_conn_id='postgres_ex',
    sql='select * from land_registry_price_paid_uk LIMIT 5',
    bucket='test_bucket_airflow',
    filename='postgres_dump.csv',
    google_cloud_storage_conn_id='google_cloud_storage_default',
    dag=dag)

t1

# task_start = DummyOperator(task_id='start_task', dag=dag)
# task_start >> t1
from datetime import timedelta

import airflow
from airflow.contrib.operators.postgres_to_gcs_operator import PostgresToGoogleCloudStorageOperator
from airflow.models import DAG
from airflow.utils.trigger_rule import TriggerRule
from pendulum import Pendulum

args = {
    'owner': 'Anton Kostyliev',
    'start_date': airflow.utils.dates.days_ago(2),
}

dag = DAG(
    dag_id='exercise_4',
    default_args=args,
    schedule_interval=None,
    dagrun_timeout=timedelta(minutes=60),
)

with dag as dag:
    copy_task = PostgresToGoogleCloudStorageOperator(
        task_id='psql_to_gcp',
        postgres_conn_id='gdd',
        sql='SELECT * FROM land_registry_price_paid_uk WHERE postcode = \'TQ1 1RY\'',
        bucket='gdd-tetetetetetetete',
        filename='mydata')
TABLE_NAME = 'user_log'
DATASET_NAME = 'dw_bluepi'

# current date and time
TODAY = date.today().strftime('%Y-%m-%d')
HOURS = strftime("%H%M%S", gmtime())

# Load data from source
ingest_data = PostgresToGoogleCloudStorageOperator(
    task_id="ingest_data",
    dag=dag,
    sql=f"SELECT id, user_id, action, CAST(status AS int), \
        to_char(created_at,'YYYY-MM-DD HH24:MI:SS') AS created_at, \
        to_char(updated_at,'YYYY-MM-DD HH24:MI:SS') AS updated_at \
        FROM {TABLE_NAME}",
    export_format="csv",
    field_delimiter="|",
    filename=f"{DESTINATION_DIRECTORY}/{DATABASE}/dt={TODAY}/{TABLE_NAME}-{TODAY}.csv",
    bucket=DESTINATION_BUCKET,
    retries=2,
    postgres_conn_id="postgres_crm",
    google_cloud_storage_conn_id="bluepi_gcp_connection",
)

# Transform data, then upload it into BigQuery (data warehouse)
load_tranform_data = PythonOperator(
    task_id='load_tranform_data',
    dag=dag,
    retries=1,
    python_callable=tranform_data)

# Validate data on BigQuery
from airflow import models
from airflow.contrib.operators.postgres_to_gcs_operator import PostgresToGoogleCloudStorageOperator
from datetime import datetime, timedelta

default_args = {
    'owner': 'AirFlow',
    'start_date': datetime(2015, 6, 1),
    'retry_delay': timedelta(minutes=5)
}

GCS_BUCKET = 'seth_sucks'
SQL_QUERY = "SELECT * FROM land_registry_price_paid_uk LIMIT 1000"

with models.DAG('GetLandRegistryPrice', default_args=default_args) as dag:
    upload_data = PostgresToGoogleCloudStorageOperator(
        postgres_conn_id='postgres',
        task_id='PostgresToCloud',
        sql=SQL_QUERY,
        filename="land_registry_price_paid_uk.txt",
        bucket=GCS_BUCKET,
        dag=dag)
        # (fragment of a custom HTTP-to-GCS operator's execute(): write the
        # response body to a local file, then upload it to the bucket)
        local_filename = '/tmp/' + self.filename
        f = open(local_filename, "w")
        f.write(response.text)
        f.close()
        hook.upload(bucket=self.bucket,
                    object=self.filename,
                    filename=local_filename,
                    mime_type='application/json')


default_args = {"owner": "bas", "start_date": airflow.utils.dates.days_ago(3)}

dag = DAG(dag_id="case", default_args=default_args, schedule_interval="0 0 * * *")

pgsl_to_gcs = PostgresToGoogleCloudStorageOperator(
    task_id="postgres_to_gcs",
    sql="SELECT * FROM land_registry_price_paid_uk WHERE transfer_date = '{{ ds }}'",
    bucket="bvb-data",
    filename="daily_load_{{ ds }}",
    postgres_conn_id="post-conn",
    dag=dag)

http_to_gcs = HTTPToCloudStorageOperator(
    task_id="exchange_rate_to_gcs",
    endpoint='airflow-training-transform-valutas?date={{ ds }}&to=EUR',
    bucket="bvb-data",
    filename="exchange_rate_{{ ds }}",
    dag=dag)

dataproc_create_cluster = DataprocClusterCreateOperator(
    task_id="dataproc_create",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id='airflowbolcom-may2829-aaadbb22',
    num_workers=2,
    zone="europe-west4-a",
    dag=dag)
from airflow.contrib.operators.postgres_to_gcs_operator import (
    PostgresToGoogleCloudStorageOperator)
import airflow
from airflow.models import DAG

args = {
    'owner': 'Mike',
    'start_date': airflow.utils.dates.days_ago(2),
}

dag = DAG(dag_id='exercis5', default_args=args, schedule_interval=None)

t1 = PostgresToGoogleCloudStorageOperator(
    task_id="postgres_to_gcs",
    sql="SELECT * FROM land_registry_price_paid_uk "
        "WHERE transfer_date between (now() - '2 weeks'::interval) and (now() - '1 week'::interval)",
    bucket="postgres_export_mfennemore",
    filename="ex_{{ execution_date }}",
    postgres_conn_id="postgres_default",
    google_cloud_storage_conn_id="google_cloud_storage_default",
    dag=dag,
)
    schedule_interval='@daily',
)

# clear_bucket = GoogleCloudStorageDeleteOperator(
#     task_id='clear_bucket',
#     bucket_name="airflow_exercise_4",
#     prefix="exercise4_",
#     dag=dag,
# )

postgres_to_gcs = PostgresToGoogleCloudStorageOperator(
    task_id='postgres_to_gcs',
    postgres_conn_id='postgres_gdd',
    sql="""select *
           from public.land_registry_price_paid_uk
           where transfer_date = '{{ ds }}'
           ;""",
    bucket="airflow_exercise_4",
    filename="exercise4_{{ ds_nodash }}_{}.json",
    provide_context=True,
    dag=dag,
)

postgres_to_gcs

# select *
# from public.land_registry_price_paid_uk
# where transfer_date = '2003-12-17'
# ;
        tmp_file_handle.write(response.content)
        tmp_file_handle.flush()

        hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.gcs_conn_id)
        hook.upload(
            bucket=self.gcs_bucket,
            object=self.gcs_path,
            filename=tmp_file_handle.name,
        )


with DAG(**arguments) as dag:
    postgres_to_gcs = PostgresToGoogleCloudStorageOperator(
        task_id='postgres_to_gcs',
        sql="SELECT * FROM land_registry_price_paid_uk WHERE transfer_date = '2019-11-27'",
        bucket='output_bucket_for_airflow',
        filename='prices-{{ ds }}.json',
        postgres_conn_id='postgres_default',
        google_cloud_storage_conn_id='google_cloud_storage_default')

    http_to_gcs = HttpToGcsOperator(
        task_id='http_to_gcs',
        endpoint='history?start_at={{ yesterday_ds }}&end_at={{ ds }}&symbols=EUR&base=GBP',
        gcs_bucket='output_bucket_for_airflow',
        gcs_path='exchange-rates-{{ ds }}.json',
        method="GET",
        http_conn_id="http_default",
        gcs_conn_id="google_cloud_default")

    create_cluster = DataprocClusterCreateOperator(
        task_id='create_cluster',
        project_id='afspfeb3-28e3a1b32a56613ef127e',
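Several of these DAGs use a custom HttpToGcsOperator that is not part of Airflow itself; only fragments of its execute method appear in the listings above. A minimal sketch of such an operator follows, assuming the constructor arguments seen in the examples (endpoint, method, http_conn_id, gcs_conn_id, gcs_bucket, gcs_path); the training repos' own implementations may differ:

import tempfile

from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
from airflow.hooks.http_hook import HttpHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class HttpToGcsOperator(BaseOperator):
    """Call an HTTP endpoint and write the response body to a GCS object."""

    template_fields = ('endpoint', 'gcs_path')

    @apply_defaults
    def __init__(self, endpoint, gcs_bucket, gcs_path, method='GET',
                 http_conn_id='http_default', gcs_conn_id='google_cloud_default',
                 *args, **kwargs):
        super(HttpToGcsOperator, self).__init__(*args, **kwargs)
        self.endpoint = endpoint
        self.method = method
        self.http_conn_id = http_conn_id
        self.gcs_conn_id = gcs_conn_id
        self.gcs_bucket = gcs_bucket
        self.gcs_path = gcs_path

    def execute(self, context):
        # Fetch the response, spool it to a temporary file, and hand that
        # file to the GCS hook for upload (mirrors the fragments above).
        http = HttpHook(method=self.method, http_conn_id=self.http_conn_id)
        response = http.run(self.endpoint)

        with tempfile.NamedTemporaryFile() as tmp_file_handle:
            tmp_file_handle.write(response.content)
            tmp_file_handle.flush()

            gcs = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.gcs_conn_id)
            gcs.upload(
                bucket=self.gcs_bucket,
                object=self.gcs_path,
                filename=tmp_file_handle.name,
                mime_type='application/json')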
args = { "owner": "Miha", "start_date": datetime(2019, 9, 20), } dag = DAG( dag_id="exercise4", default_args=args, schedule_interval="0 0 * * *", ) pgsl_to_gcs = PostgresToGoogleCloudStorageOperator( task_id="transfer_data_from_postgres", sql= "SELECT * FROM land_registry_price_paid_uk WHERE transfer_date = '{{ ds }}'", bucket="airflow-training-data-1983", filename="land_registry/{{ ds }}/properties_{}.json", postgres_conn_id="GoogleCloudSQL-miha", dag=dag, ) currency = "EUR" transfer_currency = HttpToGcsOperator( task_id="get_currency_" + currency, method="GET", endpoint= "history?start_at={{yesterday_ds}}&end_at={{ds}}&symbols=EUR&base=GBP", http_conn_id="airflow-training-currency-http", gcs_path="currency/{{ ds }}-" + currency + ".json", gcs_bucket="airflow-training-data-1983",
args = {
    'owner': 'Airflow',
    'start_date': airflow.utils.dates.days_ago(2),
}

with DAG(
    dag_id='dag_exercise-postgres',
    default_args=args,
    schedule_interval='@daily',
) as dag:
    storeinbucket = PostgresToGoogleCloudStorageOperator(
        postgres_conn_id='exercise-postgres',
        sql='select * from land_registry_price_paid_uk limit 10',
        bucket='example_postgresstogcp',
        filename='data.json',
        schema_filename='schema.json',
        task_id='storeinbucket',
    )

    storagetoBQtable = GoogleCloudStorageToBigQueryOperator(
        bucket='example_postgresstogcp',
        source_objects=['data.json'],
        schema_object='schema.json',
        destination_project_dataset_table='airflowbolcomdec-e4e4712278627.datafrompostgres.tabletest',
        source_format="NEWLINE_DELIMITED_JSON",
        write_disposition='WRITE_TRUNCATE',
        time_partitioning={'field': 'transfer_date'},
        task_id='storagetoBQtable',
    )
from datetime import timedelta

# noinspection PyPackageRequirements
import airflow
# noinspection PyPackageRequirements
from airflow import DAG
from airflow.contrib.operators.postgres_to_gcs_operator import PostgresToGoogleCloudStorageOperator

args = {
    'owner': 'Airflow',
    'start_date': airflow.utils.dates.days_ago(9),
}

with DAG(dag_id='exercise8',
         default_args=args,
         schedule_interval=timedelta(hours=2.5)) as dag:
    PostgresToGoogleCloudStorageOperator(
        task_id='postges',
        postgres_conn_id='gddconnection',
        google_cloud_storage_conn_id="google_cloud_storage_default",
        sql="select * from land_registry_price_paid_uk "
            "where transfer_date = '{{ execution_date.strftime('%Y-%m-%d') }}'",
        bucket='nice_bucket',
        filename='{{execution_date}}')
from datetime import timedelta

import airflow
from airflow.models import DAG
from airflow.contrib.operators.postgres_to_gcs_operator import (
    PostgresToGoogleCloudStorageOperator,
)

args = {
    "owner": "Airflow",
    "start_date": airflow.utils.dates.days_ago(2),
}

dag = DAG(
    dag_id="hook_exercise",
    default_args=args,
    schedule_interval=None,
    dagrun_timeout=timedelta(minutes=60),
)

go = PostgresToGoogleCloudStorageOperator(
    postgres_conn_id="postgres_training_id",
    sql="SELECT transfer_date FROM land_registry_price_paid_uk LIMIT 30",
    bucket="airflow_training_bucket",
    filename="tst{}.json",
    task_id="postgres_hook",
    dag=dag,
)

go
    }
    return branches[context["execution_date"].weekday()]


# noinspection PyUnresolvedReferences
args = {
    'owner': 'Airflow',
    'start_date': airflow.utils.dates.days_ago(5),
    'dagrun_timeout': timedelta(minutes=60)
}

dag = DAG(
    dag_id='example10',
    default_args=args,
    # schedule_interval='0 0 * * *'                      ## every day
    schedule_interval='@daily'                           ## every day
    # schedule_interval='45 13 * * MON,WED,FRI'          ## Mon/Wed/Fri at 13:45
    # schedule_interval=timedelta(hours=2, minutes=30)   ## every 2.5 hours; awkward to express in cron
)

get_data = PostgresToGoogleCloudStorageOperator(
    task_id='get_data',
    filename='{{ds}}/gdk_data{}.json',
    bucket='gkokotanekov_airflow_training',
    postgres_conn_id='gddconnection',
    google_cloud_storage_conn_id='google_cloud_storage_default',
    # sql="select * FROM land_registry_price_paid_uk WHERE transfer_date = '{{ execution_date.strftime('%Y-%m-%d') }}'",
    sql="select * FROM land_registry_price_paid_uk WHERE transfer_date = '{{ ds }}'",
    dag=dag)
# Demo
from datetime import datetime
from datetime import timedelta

from airflow import DAG
from airflow.contrib.operators.postgres_to_gcs_operator import PostgresToGoogleCloudStorageOperator

DESTINATION_BUCKET = 'sc-flow-dev-data'
DESTINATION_DIRECTORY = "transferred"

dag_params = {
    'dag_id': 'PostgresToCloudStorageExample',
    'start_date': datetime(2020, 7, 7),
    'schedule_interval': timedelta(days=1),
}

with DAG(**dag_params) as dag:
    move_results = PostgresToGoogleCloudStorageOperator(
        task_id="move_results",
        bucket=DESTINATION_BUCKET,
        filename=DESTINATION_DIRECTORY + "/{{ execution_date }}" + "/{}.json",
        sql='''SELECT *, due_date::date - effective_date::date as calc
               FROM loan_data;''',
        retries=3,
        postgres_conn_id="postgres_poc")
t1 = PythonOperator(
    task_id="set_filename",
    python_callable=setfile,
    provide_context=True,
    dag=dag,
)

t2 = PostgresToGoogleCloudStorageOperator(
    task_id="postgre_to_gcs",
    sql="select * from covid",
    filename="{{ task_instance.xcom_pull(key='filename') }}",
    postgres_conn_id="postgres_conn",
    google_cloud_storage_conn_id="gcp_conn",
    bucket="img_objects_to_audio",
    dag=dag,
)

t3 = GoogleCloudStorageToBigQueryOperator(
    task_id="gcs_to_bigq",
    bucket="img_objects_to_audio",
    source_objects=["{{ ti.xcom_pull(key='filename') }}"],
    destination_project_dataset_table="tech-289406.daily_data.{}".format(
        "covid_" + datetime.now().strftime("%d_%b_%Y_%H_%M_%S_%f")),
    source_format="NEWLINE_DELIMITED_JSON",
    bigquery_conn_id="gcp_conn",
    google_cloud_storage_conn_id="gcp_conn",
    dag=dag,
)
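The python_callable=setfile referenced above is not shown in the snippet. A minimal sketch of what such a callable might look like, assuming it only needs to push an object name to XCom under the key 'filename' (the name format is an illustrative guess):

from datetime import datetime

def setfile(**context):
    # Hypothetical helper: build a timestamped object name and push it to XCom
    # so the downstream tasks can fetch it with xcom_pull(key='filename').
    name = "covid_{}.json".format(datetime.now().strftime("%d_%b_%Y_%H_%M_%S_%f"))
    context["ti"].xcom_push(key="filename", value=name)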
    'owner': 'Airflow',
    'start_date': datetime(2019, 1, 1),
    'end_date': datetime(2019, 11, 28),
    'project_id': Variable.get('gcp_project'),
}

with DAG(dag_id='exercise_5_real_estate',
         default_args=default_args,
         schedule_interval='@daily',
         dagrun_timeout=timedelta(minutes=60)) as dag:

    # Fetch land registry prices for current day
    land_registry_prices_to_gcs = PostgresToGoogleCloudStorageOperator(
        task_id='land_registry_prices_to_gcs',
        sql="select * from land_registry_price_paid_uk "
            "WHERE transfer_date = '{{ ds }}';",
        bucket='gdd_airflow_npage_properties',
        filename='land_registry_price_paid_uk-{{ ds }}.json',
        postgres_conn_id='postgres_gdd',
        google_cloud_storage_conn_id='google_cloud_storage_default',
    )

    # Fetch exchange rate (average) from previous day until current day, and store
    # the result in GCS
    exchange_rates_to_gcs = HttpToGcsOperator(
        task_id='exchange_rates_to_gcs',
        base_url='https://api.exchangeratesapi.io',
        endpoint='history',
        data={
            'start_at': '{{ yesterday_ds }}',
            'end_at': '{{ ds }}',
            'symbols': 'EUR',
args = {
    'owner': 'Airflow',
    'start_date': timezone.datetime(2019, 12, 1),
}

# def _connect_to_postgres(**context):
#     postgres = PostgresToGoogleCloudStorageOperator(task_id='postgres',
#                                                     postgres_conn_id='postgres_default')
#     postgres.query()

with DAG(
    dag_id='connect_to_postgres_dag',
    default_args=args,
    schedule_interval='@daily',
    dagrun_timeout=timedelta(minutes=60),
) as dag:
    PostgresToGoogleCloudStorageOperator(
        task_id='postgres',
        sql="SELECT * FROM land_registry_price_paid_uk WHERE transfer_date::date = '{{ds}}'::date LIMIT 1000",
        filename="output-{{ds}}.csv",
        bucket="land_data_training_jjac_airflow",
        postgres_conn_id='postgres_default')

    # print_weekday = PythonOperator(task_id='weekday_dag',
    #                                python_callable=_connect_to_postgres,
    #                                provide_context=True)
from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator

project_id = "airflowpublic-20631bd75d741210"
bucket_name = "airflow-training-data"
currency = "EUR"

args = {"owner": "Kris", "start_date": datetime.datetime(2019, 9, 20)}

dag = DAG(dag_id="usecase", default_args=args, schedule_interval="0 0 * * *")

pgsl_to_gcs = PostgresToGoogleCloudStorageOperator(
    task_id="postgres_to_gcs",
    sql="SELECT * FROM land_registry_price_paid_uk WHERE transfer_date = '{{ ds }}'",
    bucket="{bucket}".format(bucket=bucket_name),
    filename="airflow-training-data/land_registry_price_paid_uk/{{ ds }}/properties_{}.json",
    postgres_conn_id="airflow-training-postgres",
    google_cloud_storage_conn_id="google_cloud_storage_default",
    dag=dag,
)

# https://europe-west2-gdd-airflow-training.cloudfunctions.net/function-1?date=2018-01-01&to=usd
http_to_gcs = HttpToGcsOperator(
    task_id="get_currency_" + currency,
    method="GET",
    endpoint="history?start_at={{yesterday_ds}}&end_at={{ds}}&symbols=EUR&base=GBP",
    http_conn_id="airflow-training-currency-http",
    gcs_conn_id="google_cloud_storage_default",
    gcs_path="airflow-training-data/currency/{{ ds }}-" + currency + ".json",
from operators.http_to_gcs_operator import HttpToGcsOperator

args = {
    'owner': 'Airflow',
    'start_date': airflow.utils.dates.days_ago(2),
}

sql = '''SELECT *
FROM land_registry_price_paid_uk
WHERE transfer_date = '{{ds}}' '''

http_endpoint = 'history?start_at={{yesterday_ds}}&end_at={{ds}}&symbols=EUR&base=GBP'

with DAG(
    dag_id='dag_exercise_connection',
    default_args=args,
    schedule_interval=None,
) as dag:
    query_table = PostgresToGoogleCloudStorageOperator(
        task_id="query_table",
        sql=sql,
        postgres_conn_id="postgres_training",
        bucket='dag-exercise-connection',
        filename="test_table/ {{ds}}.json")

    fetch_exchange_rate = HttpToGcsOperator(
        task_id="fetch_exchange_rate",
        http_conn_id="http_exchange",
        endpoint=http_endpoint,
        gcs_bucket="dag-exercise-connection",
        gcs_path="exchange_rate")

    [query_table, fetch_exchange_rate]
# specific language governing permissions and limitations
# under the License.
from datetime import datetime, timedelta

from airflow.models import DAG
from airflow.contrib.operators.postgres_to_gcs_operator import PostgresToGoogleCloudStorageOperator

get_postgres_data_query = """
SELECT *
FROM land_registry_price_paid_uk
WHERE transfer_date = '{{ execution_date.date() }}';
"""

args = {
    "owner": "airflow",
    "start_date": datetime(2019, 12, 5),
}

with DAG(dag_id='operator_hook_task_dag',
         default_args=args,
         schedule_interval='@daily') as dag:
    postgres_to_gcp = PostgresToGoogleCloudStorageOperator(
        task_id='postgres_to_gcp',
        postgres_conn_id='postgres_training',
        sql=get_postgres_data_query,
        bucket="airflow_training_ccb",
        export_format="csv",
        filename="land_registry_price_paid_uk/{{ execution_date.date() }}.csv")
from airflow.models import DAG
from airflow.contrib.operators.postgres_to_gcs_operator import PostgresToGoogleCloudStorageOperator
from airflow_training.operators.http_to_gcs_operator import HttpToGcsOperator
from airflow.operators.dummy_operator import DummyOperator
from datetime import datetime

dag = DAG(
    dag_id="exercise4",
    default_args={
        "owner": "jonathan",
        "start_date": datetime(2019, 9, 20),
    },
    schedule_interval="0 0 * * *")

postgres_to_google_cloud = PostgresToGoogleCloudStorageOperator(
    task_id="postgres_to_google_cloud",
    sql="SELECT * FROM land_registry_price_paid_uk WHERE transfer_date = '{{ ds }}'",
    bucket="airflow-training-data-jalves",
    filename="{{ ds }}/properties_{}.json",
    postgres_conn_id="airflow-training",
    dag=dag)

http_to_google_cloud = HttpToGcsOperator(
    task_id="http_to_google_cloud",
    method="GET",
    endpoint="/history?start_at=2019-09-20&end_at=2019-09-27&symbols=EUR&base=GBP",
    http_conn_id="airflow-training-http",
    gcs_bucket="airflow-training-data-jalves",
    gcs_path="exchange.json",
    dag=dag)

dummy = DummyOperator(task_id="dummy", dag=dag)
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Example DAG demonstrating the usage of the PostgresToGoogleCloudStorageOperator."""
import datetime
from datetime import timedelta

from airflow.contrib.operators.postgres_to_gcs_operator import PostgresToGoogleCloudStorageOperator
from airflow.models import DAG

arguments = {
    'dag_id': 'exercise5',
    'default_args': {
        'owner': 'Costas',
        'start_date': datetime.datetime.today() - timedelta(seconds=1)
    },
    'schedule_interval': None
}

with DAG(**arguments) as dag:
    postgres_to_gcs = PostgresToGoogleCloudStorageOperator(
        task_id='postgres_to_gcs',
        sql='SELECT * FROM land_registry_price_paid_uk LIMIT 10',
        bucket='output_bucket_for_airflow',
        filename='output_file',
        postgres_conn_id='postgres_default',
        google_cloud_storage_conn_id='google_cloud_storage_default')
    'start_date': datetime(2019, 11, 26),
    'end_date': datetime(2019, 11, 28),
    'schedule_interval': '@daily',
    'depends_on_past': False
}

dag = DAG(
    dag_id='sparkdag',
    default_args=args,
)

t1 = PostgresToGoogleCloudStorageOperator(
    task_id='from_pg_to_gcs',
    postgres_conn_id='postgres_default',
    sql='select * from land_registry_price_paid_uk WHERE transfer_date = \'{{ ds }}\'',
    bucket='spark_bucket_jd',
    filename='postgres-transfers-{{ ds }}.csv',
    google_cloud_storage_conn_id='google_cloud_storage_default',
    dag=dag
)

t2 = HttpToGcsOperator(
    task_id='from_api_to_gcs',
    endpoint='history?start_at={{ yesterday_ds }}&end_at={{ ds }}&symbols=EUR&base=GBP',
    gcs_path='api-exchangerate-{{ ds }}.json',
    gcs_bucket='spark_bucket_jd',
    http_conn_id='exchange_rates_api',
    dag=dag
)

t3 = DataprocClusterCreateOperator(