Example #1
task_load_source_data = PythonOperator(
    task_id="load_source_data",
    python_callable=load_source_data,
    dag=dag,
)

task_validate_source_data_load = GreatExpectationsOperator(
    task_id="validate_source_data_load",
    checkpoint_name="source_data_load.chk",
    dag=dag,
    data_context_root_dir=ge_root_dir,
)

task_run_dbt_dag = DbtRunOperator(task_id="run_dbt_dag", dag=dag)

task_validate_analytical_output = GreatExpectationsOperator(
    task_id="validate_analytical_output",
    checkpoint_name="analytical_output.chk",
    dag=dag,
    data_context_root_dir=ge_root_dir,
)

task_publish = PythonOperator(task_id="publish",
                              python_callable=publish_to_prod,
                              dag=dag)

task_validate_source_data.set_downstream(task_load_source_data)
task_load_source_data.set_downstream(task_validate_source_data_load)
task_validate_source_data_load.set_downstream(task_run_dbt_dag)
task_run_dbt_dag.set_downstream(task_validate_analytical_output)
task_validate_analytical_output.set_downstream(task_publish)
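
The chain of set_downstream calls above can be written more compactly with Airflow's bitshift dependency syntax; an equivalent expression of the same ordering:

(task_validate_source_data >> task_load_source_data >> task_validate_source_data_load
 >> task_run_dbt_dag >> task_validate_analytical_output >> task_publish)
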
Example #2
from datetime import timedelta

from airflow.models import DAG
from airflow.operators.python import PythonOperator
from airflow.utils.timezone import datetime

DEFAULT_DATE = datetime(2016, 1, 1)
default_args = dict(
    start_date=DEFAULT_DATE,
    owner='airflow')


def fail():
    raise ValueError('Expected failure.')


def success(ti=None, *args, **kwargs):
    if ti.execution_date != DEFAULT_DATE + timedelta(days=1):
        fail()


# DAG tests that tasks ignore all dependencies

dag1 = DAG(dag_id='test_run_ignores_all_dependencies',
           default_args=dict(depends_on_past=True, **default_args))
dag1_task1 = PythonOperator(
    task_id='test_run_dependency_task',
    python_callable=fail,
    dag=dag1)
dag1_task2 = PythonOperator(
    task_id='test_run_dependent_task',
    python_callable=success,
    dag=dag1)
dag1_task1.set_downstream(dag1_task2)
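
This pair of tasks only makes sense when the dependent task is executed with every dependency check disabled, as the comment above notes. A minimal sketch of doing that programmatically, assuming an initialized Airflow metadata database (on newer Airflow versions the TaskInstance constructor takes a run_id instead of an execution_date):

from airflow.models import TaskInstance

# Run the downstream task directly, bypassing depends_on_past and the failed upstream task.
ti = TaskInstance(task=dag1_task2, execution_date=DEFAULT_DATE + timedelta(days=1))
ti.run(ignore_all_deps=True)
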
Example #3
tune_model_task = SageMakerTuningOperator(
    task_id='model_tuning',
    dag=dag,
    config=tuner_config,
    # aws_conn_id='airflow-sagemaker',
    wait_for_completion=True,
    check_interval=30)

# launch sagemaker batch transform job and wait until it completes
batch_transform_task = SageMakerTransformOperator(
    task_id='predicting',
    dag=dag,
    config=transform_config,
    # aws_conn_id='airflow-sagemaker',
    wait_for_completion=True,
    check_interval=30,
    trigger_rule=TriggerRule.ONE_SUCCESS)

cleanup_task = DummyOperator(task_id='cleaning_up', dag=dag)

# set the dependencies between tasks

init.set_downstream(preprocess_task)
preprocess_task.set_downstream(prepare_task)
prepare_task.set_downstream(branching)
branching.set_downstream(tune_model_task)
branching.set_downstream(train_model_task)
tune_model_task.set_downstream(batch_transform_task)
train_model_task.set_downstream(batch_transform_task)
batch_transform_task.set_downstream(cleanup_task)
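
The branching task referenced above is not shown in the excerpt; a hypothetical sketch of it as a BranchPythonOperator (the 'model_training' task_id and the decision logic are placeholders, not from the original DAG). Because only one branch runs, batch_transform_task uses TriggerRule.ONE_SUCCESS so it fires as soon as either upstream path succeeds.

from airflow.operators.python import BranchPythonOperator


def choose_training_path(**context):
    # Hypothetical decision: tune hyperparameters only when explicitly requested.
    if context['params'].get('tune_hyperparameters'):
        return 'model_tuning'    # task_id of tune_model_task above
    return 'model_training'      # placeholder task_id for train_model_task


branching = BranchPythonOperator(
    task_id='branching',
    python_callable=choose_training_path,
    dag=dag)
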
Example #4
default_args = {
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 2,
    'retry_delay': timedelta(minutes=5),
    'start_date': datetime(2020, 9, 24)
}

with DAG('marketvol',
         default_args=default_args,
         description='A simple DAG',
         schedule_interval='0 18 * * 1-5') as dag:

    t1 = PythonOperator(task_id='Tesla_Stonk_Data',
                        python_callable=stonk,
                        op_args=['TSLA'])

    t2 = PythonOperator(task_id='Apple_Stonk_Data',
                        python_callable=stonk,
                        op_args=['AAPL'])

    t3 = BashOperator(task_id='Tesla_Bash_Command',
                      bash_command='mv *_data.csv tmp/data/2020/2020-09-24')

    t4 = BashOperator(task_id='Apple_Bash_Command',
                      bash_command='mv *_data.csv tmp/data/2020/2020-09-24')

    t5 = PythonOperator(task_id='Run_query_on_downloaded_data',
                        python_callable=python_command)
t1.set_downstream(t3)
t2.set_downstream(t4)
[t3, t4] >> t5
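
The stonk callable passed to t1 and t2 is not shown in the excerpt; a hypothetical sketch of it, assuming yfinance as the data source (the original download logic is not visible), writing the <SYMBOL>_data.csv file that the mv commands above pick up:

import yfinance as yf


def stonk(symbol):
    # Download one day of minute-level quotes and dump them for the bash tasks to move.
    df = yf.download(symbol, period='1d', interval='1m')
    df.to_csv(f'{symbol}_data.csv')
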
Example #5
            logging.info("Performing Delete...")
            # using bulk delete
            query.delete(synchronize_session=False)
            session.commit()
            logging.info("Finished Performing Delete")
        else:
            logging.warning("You've opted to skip deleting the db entries. "
                            "Set ENABLE_DELETE to True to delete entries!!!")

        logging.info("Finished Running Cleanup Process")

    except ProgrammingError as e:
        logging.error(e)
        logging.error(
            str(airflow_db_model) + " is not present in the metadata. "
            "Skipping...")


for db_object in DATABASE_OBJECTS:

    cleanup_op = PythonOperator(task_id="cleanup_" +
                                str(db_object["airflow_db_model"].__name__),
                                python_callable=cleanup_function,
                                params=db_object,
                                provide_context=True,
                                dag=dag)

    print_configuration.set_downstream(cleanup_op)

# [END composer_metadb_cleanup]
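
For context, each entry of DATABASE_OBJECTS is handed to cleanup_function through params; a hypothetical shape of one entry, based on the airflow_db_model key used above (the other key names are assumptions):

from airflow.models import DagRun

DATABASE_OBJECTS = [
    {
        "airflow_db_model": DagRun,                  # SQLAlchemy model whose rows get purged
        "age_check_column": DagRun.execution_date,   # column compared against the retention cutoff
        "keep_last": False,                          # whether to always keep the most recent row
    },
]
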
Example #6
    db_connection.init_query('DROP TABLE IF EXISTS Sales')

    db_connection.init_query(query=('''CREATE TABLE Sales(
            Region varchar(256),
            Country varchar(256),
            `Item Type` varchar(20),
            `Sales Channel` varchar(20),
            `Order Priority` varchar(20),
            `Order Date` DATE NOT NULL,
            `Order ID` int(20) NOT NULL,
            `Ship Date` DATE NOT NULL,
            `Units Sold` int(20) NOT NULL,
            `Unit Price` int(20) NOT NULL,
            `Unit Cost` int(20) NOT NULL,
            `Total Revenue` int(20) NOT NULL,
            `Total Cost` int(20) NOT NULL,
            `Total Profit` int(20) NOT NULL)'''))

    data.csv_load_to_db(filename, destination_folder, db_connection)


run_extract = PythonOperator(task_id='extract',
                             python_callable=extract,
                             dag=dag)

run_transform_load = PythonOperator(task_id='transform_and_load',
                                    python_callable=transform_load,
                                    dag=dag)

run_extract.set_downstream(run_transform_load)
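
The data.csv_load_to_db helper called above is not shown; a hypothetical sketch of it that reuses the same db_connection.init_query helper and bulk-loads the extracted CSV into the Sales table via MySQL's LOAD DATA (the loading mechanism is an assumption):

def csv_load_to_db(filename, destination_folder, db_connection):
    # Bulk-load the CSV into the Sales table; assumes local_infile is enabled on the server.
    db_connection.init_query(
        f"LOAD DATA LOCAL INFILE '{destination_folder}/{filename}' "
        "INTO TABLE Sales "
        "FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '\"' "
        "IGNORE 1 LINES")
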