# [START load_function]
def load(**kwargs):
    ti = kwargs['ti']
    total_value_string = ti.xcom_pull(task_ids='transform', key='total_order_value')
    total_order_value = json.loads(total_value_string)

    print(total_order_value)
# [END load_function]

# [START main_flow]
extract_task = PythonOperator(
    task_id='extract',
    python_callable=extract,
)
extract_task.doc_md = """\
#### Extract task
A simple Extract task to get data ready for the rest of the data pipeline.
In this case, getting data is simulated by reading from a hardcoded JSON string.
This data is then put into xcom, so that it can be processed by the next task.
"""

transform_task = PythonOperator(
    task_id='transform',
    python_callable=transform,
)
transform_task.doc_md = """\
#### Transform task
A simple Transform task which takes in the collection of order data from xcom
and computes the total order value.
This computed value is then put into xcom, so that it can be processed by the next task.
"""
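# A minimal sketch (not part of the original file) of the `transform` callable
# that pairs with `load` above: it pulls the order data that `extract` pushed
# to XCom (the key 'order_data' is an assumption), sums the order values, and
# pushes the JSON-encoded total under the 'total_order_value' key that `load`
# pulls.
def transform(**kwargs):
    ti = kwargs['ti']
    extract_data_string = ti.xcom_pull(task_ids='extract', key='order_data')
    order_data_dict = json.loads(extract_data_string)

    total_order_value = sum(order_data_dict.values())
    total_value_dict = {"total_order_value": total_order_value}

    ti.xcom_push('total_order_value', json.dumps(total_value_dict))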
    dag_id,
    default_args=default_args(),
    schedule_interval="10 * * * *",
    start_date=datetime(2021, 1, 1, tzinfo=pendulum.timezone("Asia/Tokyo")),
) as dag:
    dag.doc_md = __doc__

    start = DummyOperator(task_id="start")

    a = PythonOperator(
        task_id="a",
        params={},
        python_callable=task_sample,
    )
    a.doc_md = task_sample.__doc__

    b = BranchPythonOperator(
        task_id="b",
        params={},
        python_callable=task_branch,
    )

    c = DummyOperator(task_id="c")
    d = DummyOperator(task_id="d")

    e = ShortCircuitOperator(
        task_id="e",
        params={},
        trigger_rule="none_failed",
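# Hypothetical sketch of the `task_branch` callable wired into the
# BranchPythonOperator "b" above (its real body is not shown in this
# fragment). A branch callable returns the task_id, or list of task_ids,
# of the downstream task(s) to follow; every other directly downstream
# task is skipped.
def task_branch(**kwargs):
    # Follow "c" on even days, "d" otherwise (placeholder condition).
    return "c" if kwargs["execution_date"].day % 2 == 0 else "d"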
        echo There are $NUM_TO_PROCESS files to process.
        test $NUM_TO_PROCESS -gt 0
        '''),
        params={'product': product},
    )

    # Thanks https://stackoverflow.com/questions/48580341/how-to-add-manual-tasks-in-an-apache-airflow-dag
    manual_sign_off = PythonOperator(
        task_id=f"manual_sign_off_{product}",
        python_callable=task_to_fail,
        retries=1,
        retry_delay=TIMEOUT,
    )
    manual_sign_off.doc_md = dedent("""
        ## Instructions
        Perform some manual checks that the number of COGs to be generated seems to be about right.
        You can also do spot checks that files don't already exist in S3.
        Once you're happy, mark this job as **Success** for the DAG to continue running.
    """)

    submit_task_id = f'submit_cog_convert_job_{product}'
    submit_bulk_cog_convert = SSHOperator(
        task_id=submit_task_id,
        command=dedent(COMMON + """
            cd {{work_dir}}
            mkdir out
            qsub <<EOF
            #!/bin/bash
            #PBS -l wd,walltime=5:00:00,mem=190GB,ncpus=48,jobfs=1GB
            #PBS -P {{params.project}}
            #PBS -q {{params.queue}}
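# The manual sign-off pattern referenced above (see the linked Stack Overflow
# answer) hinges on a callable that always fails, so the task stays red and
# keeps retrying until a human marks it Success in the UI. A minimal sketch of
# such a `task_to_fail` (the original body is not shown in this fragment):
from airflow.exceptions import AirflowException


def task_to_fail():
    raise AirflowException(
        "Please perform the manual checks described in the task docs, "
        "then mark this task as Success to let the DAG continue."
    )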
    df.to_sql(f'clean_{tablename}', con, if_exists='replace', index=False)


for table in tables:
    clean_data = PythonOperator(
        task_id=f'clean_data_{table}',
        python_callable=clean_data_df,
        op_kwargs={'tablename': table},
        dag=dag,
    )
    load_data >> clean_data

# [START documentation]
dag.doc_md = __doc__

load_data.doc_md = """\
#### Load Data
This task loads data from the csv files in the data directory (set as an
environment variable DATA_DIR) into the database Airflow creates.
"""

read_data.doc_md = """\
#### Read Data
This task does nothing. It demonstrates how to use the SQLite operator.
"""

clean_data.doc_md = """\
#### Clean Data
This task removes a column with pandas. It demonstrates how to alter data
and write it back into the same table.
"""
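# A hedged reconstruction of `clean_data_df`, whose last line
# (`df.to_sql(...)`) opens this fragment; the connection setup and the choice
# of column to drop are assumptions based on the "Clean Data" doc_md above.
def clean_data_df(tablename, **kwargs):
    import sqlite3

    import pandas as pd

    con = sqlite3.connect('airflow.db')                    # assumed database file
    df = pd.read_sql(f'SELECT * FROM {tablename}', con)    # read the raw table
    df = df.drop(columns=[df.columns[-1]])                 # drop one column (placeholder choice)
    df.to_sql(f'clean_{tablename}', con, if_exists='replace', index=False)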
re_parse_authors_data = PythonOperator(
    task_id='re_parse_authors',
    dag=dag,
    provide_context=True,
    python_callable=helpers.load_authors,
    op_kwargs={
        'aws_credentials_id': 'aws_credentials',
        'redshift_connection_id': 'redshift',
        's3_credentials_id': 's3_credentials',
        'region': 'us-east-1',
        'bucket': 'arxiv-etl',
        'file_name': 'staging/authors/authors-parsed.json'
    },
)

re_parse_authors_data.doc_md = """
# Parses data from S3 locally and re-formats it to easily work with Redshift COPY, then saves it back to S3
"""

stage_authors_to_redshift = StageFromS3ToRedshiftOperator(
    task_id='stage_authors',
    dag=dag,
    provide_context=True,
    table="staging.authors",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="arxiv-etl",
    s3_key="staging/authors/authors_parsed.csv",
    region="us-east-1",
    file_type="csv")

stage_authors_to_redshift.doc_md = """
    dag=math_dag
)

t2 = PythonOperator(
    task_id="subtraction_task",
    python_callable=sub_nos,
    depends_on_past=False,
    retries=3,
    dag=math_dag
)

square_task = PythonOperator(
    task_id="square_task",
    python_callable=square_no,
    depends_on_past=True,
    retries=3,
    dag=math_dag
)

math_dag.doc_md = __doc__

t1.doc_md = """\
#### Addition Task Documentation
A simple task to add two numbers
![miztiik-success-green](https://img.shields.io/badge/Miztiik:Automation:Airflow:Level-300-blue)
"""

# Configure Task Dependencies
t1 >> t2
t1 >> square_task
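# The "Addition Task Documentation" above describes t1 as a simple task that
# adds two numbers. The callable behind t1 is not shown in this fragment, so
# `add_nos` below is an assumed name and a minimal sketch only; its return
# value is pushed to XCom by the PythonOperator by default, so downstream
# tasks could reuse it.
def add_nos(a=5, b=3, **kwargs):
    result = a + b
    print(f"Adding {a} + {b} = {result}")
    return result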
    task_id='get_listings',
    python_callable=get_listings,
    dag=dag,
)

t2 = PythonOperator(
    task_id='send_email',
    provide_context=True,
    python_callable=send_email,
    dag=dag,
)

# noinspection PyStatementEffect
t1 >> t2

# Documentation
dag.doc_md = f"""
#### DAG Documentation
{dag.description}
"""

t1.doc_md = """
#### Task Documentation
Retrieves and stores Zoopla data
"""

t2.doc_md = """
#### Task Documentation
Sends email notification when new data is available
"""
    dag=dag,
)

t2 = PythonOperator(
    task_id='Fetch_Data_and_Create_CSV',
    python_callable=task2,
    retries=3,
    dag=dag,
)

t3 = PythonOperator(
    task_id='Upload_Big_Query',
    python_callable=task3,
    retries=3,
    dag=dag,
)

dag.doc_md = __doc__

t1.doc_md = """\
#### Task 1 : Install Requirements.
Install requirements present in requirements.txt
"""

t2.doc_md = """\
#### Task 2 : Fetch data from API & create a local csv.
The API provides the change in Covid-19 cases state-wise every day
"""

t1 >> t2 >> t3
    file.columns = file.columns.map(
        lambda x: x.replace('(', '').replace(')', ''))  # strip parenthesis characters from the column names
    engine = PostgresHook(
        postgres_conn_id='postgres_local').get_sqlalchemy_engine()
    file.to_sql('airflow_stg_mining_po', con=engine, index=True,
                if_exists='replace', schema='beeline')


# read the file and write it into a temporary staging table in the target database
process_file = PythonOperator(task_id='process_file',
                              provide_context=True,
                              python_callable=process_xls_file)

process_file.doc_md = """\
#### Task Documentation
You can document your task using the attributes `doc_md` (markdown),
`doc` (plain text), `doc_rst`, `doc_json`, `doc_yaml` which gets
rendered in the UI's Task Instance Details page.
![img](http://montcs.bloomu.edu/~bobmon/Semesters/2012-01/491/import%20soul.png)
"""

# update the target table
update_target_table = PostgresOperator(task_id='update_target_table',
                                       sql='''
                                           insert into beeline.airflow_mining_po
                                           select * from beeline.airflow_stg_mining_po
                                           on conflict do nothing;
                                       ''',
                                       postgres_conn_id='postgres_local',
                             python_callable=createlog,
                             dag=dag)

ExtracttoDF = PythonOperator(task_id='sqlite_to_df',
                             python_callable=getdf,
                             dag=dag)

LoadTask = PythonOperator(task_id='Destinationdb',
                          python_callable=createdb,
                          dag=dag)

UpsertTask = PythonOperator(task_id='Destinationdb_Upsert',
                            python_callable=updatedb,
                            dag=dag)

dag.doc_md = __doc__

ExtracttoDF.doc_md = """\
Extract data from source DB
"""

templated_command = """
{% for i in range(5) %}
    echo "{{ ds }}"
    echo "{{ macros.ds_add(ds, 7) }}"
    echo "{{ params.my_param }}"
{% endfor %}
"""

[ExtracttoDF, CreateLog] >> LoadTask >> UpsertTask
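# `templated_command` above is a Jinja template, but the operator that runs it
# is not included in this fragment. A hedged sketch of how it would typically
# be attached to a BashOperator, supplying the `params.my_param` value the
# template references (the task_id and param value are placeholders):
from airflow.operators.bash import BashOperator  # Airflow 2.x import path

TemplatedEcho = BashOperator(task_id='templated_echo',
                             bash_command=templated_command,
                             params={'my_param': 'my_value'},
                             dag=dag)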
""" message_task(SQL_CONN_STRING, KEY_WORDS, FREQ) with DAG( 'create_postgres_db', description="Creates Postgres DB for tweets if it doesn't already exist", schedule_interval="@once", default_args=default_args ) as create_pgdb_dag: create_db = PythonOperator( task_id='create_db', python_callable=create_postgres_db, dag=create_pgdb_dag ) create_db.doc_md = """\ #### CREATE PGDB Creates a database in Postgres for the transformed tweet data, \ if one does not already exist """ create_db with DAG( 'tweetl_dag', description='Performs ETL round and triggers slackbot', schedule_interval=timedelta(seconds=FREQ), catchup=False, default_args=default_args ) as tweetl_dag: