def get_alphanumeric_task_id(a_string):
    # isalnum = a_string.isalnum()
    # print('Is String Alphanumeric :', isalnum)
    # Keep only alphanumeric characters so the result is a valid task_id fragment.
    alphanumeric_filter = filter(str.isalnum, a_string)
    alphanumeric_string = "".join(alphanumeric_filter)
    # Replace "/" from the file path (the filter above already strips slashes,
    # so this is effectively a safety net).
    return alphanumeric_string.replace("/", "__")


with models.DAG(
        'import_ingestion',
        # Run this DAG only once
        schedule_interval='@once',
        default_args=default_dag_args) as dag:

    start = DummyOperator(task_id='start')

    wait = DummyOperator(task_id='wait', trigger_rule="all_done")

    end = DummyOperator(task_id='end', trigger_rule="all_done")

    for blob in blobs:
        #print(blob.name)
        print_file = BashOperator(task_id='print_file_' +
                                  get_alphanumeric_task_id(blob.name),
                                  bash_command='echo "hello {}"'.format(blob.name),
                                  dag=dag)
        start.set_downstream(print_file)
        print_file.set_downstream(wait)

wait >> end
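
# Note: `blobs` is referenced above but never defined in this snippet; it has to be
# available at DAG-parse time. A minimal, hedged sketch of how such a listing is
# typically obtained (the google-cloud-storage client usage and the bucket name are
# assumptions, not part of the original example):
from google.cloud import storage

gcs_client = storage.Client()
blobs = gcs_client.list_blobs('my-ingestion-bucket')  # hypothetical bucket name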
Example #2
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime.utcnow(),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('kubernetes_hello_world',
          default_args=default_args,
          schedule_interval=timedelta(minutes=10))

start = DummyOperator(task_id='start', dag=dag)

passing = KubernetesPodOperator(namespace='default',
                                image="python:3.6",
                                cmds=["python", "-c"],
                                arguments=["print('hello world')"],
                                labels={"foo": "bar"},
                                name="passing-test",
                                task_id="passing-task",
                                get_logs=True,
                                dag=dag)

failing = KubernetesPodOperator(namespace='default',
                                image="ubuntu:16.04",
                                cmds=["python", "-c"],
                                arguments=["print('hello world')"],
Example #3
import airflow
from datetime import timedelta

from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator

args = {"owner": "Airflow", "start_date": airflow.utils.dates.days_ago(10)}

dag = DAG(
    dag_id="third_exercise",
    default_args=args,
    schedule_interval="@daily",
    dagrun_timeout=timedelta(minutes=60),
)


def print_date(execution_date, **kwargs):
    print("The execution_date is: {}".format(execution_date))


print_execution_time = PythonOperator(
    task_id="task1", dag=dag, python_callable=print_date, provide_context=True
)
the_end = DummyOperator(task_id="the_end", dag=dag)

for seconds in {1, 5, 10}:
    print_execution_time >> BashOperator(
        task_id="sleep_{}".format(seconds),
        bash_command="sleep {}".format(seconds),
        dag=dag,
    ) >> the_end
Example #4
import airflow
from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator

args = {
    'owner': 'Airflow',
    'start_date': airflow.utils.dates.days_ago(9),
}

with DAG(dag_id='exercise3',
         default_args=args,
         # 13:45 on Sunday, Tuesday and Thursday (Airflow expects a 5-field cron, not Quartz)
         schedule_interval='45 13 * * SUN,TUE,THU') as dag:
    task1 = DummyOperator(task_id='task1')
    task2 = DummyOperator(task_id='task2')
    task3 = DummyOperator(task_id='task3')
    task4 = DummyOperator(task_id='task4')
    task5 = DummyOperator(task_id='task5')
    task1 >> task2 >> [task3, task4] >> task5
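
# A quick, hedged sanity check of the 5-field expression used above. croniter ships
# with Airflow; this block is purely illustrative and not part of the original example:
from croniter import croniter

croniter('45 13 * * SUN,TUE,THU')  # raises CroniterBadCronError if the expression is invalid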
Example #5
from datetime import datetime
from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator


def print_hello():
    return "Hello world!"


dag = DAG(
    "hello_world",
    description="Simple tutorial DAG",
    schedule_interval="0 12 * * *",
    start_date=datetime(2017, 3, 20),
    catchup=False,
)

dummy_operator = DummyOperator(task_id="dummy_task", retries=3, dag=dag)

hello_operator = PythonOperator(
    task_id="hello_task", python_callable=print_hello, dag=dag
)

dummy_operator >> hello_operator
Example #6
    'task_types': 'TaskType.INGEST'
}

args = {
    'start_date': datetime.utcnow(),
    'provide_context': True,
    'owner': 'airflow',
}

auth_conn = HttpHook.get_connection('test_netrc')
http_conn = HttpHook('GET', 'test_netrc')
redis_hook = RedisHook(redis_conn_id='redis_default')

dag = DAG(dag_id='vlass_execute', default_args=args, schedule_interval=None)

start_task = DummyOperator(task_id='start_task', dag=dag)
end_task = DummyOperator(task_id='end_task', dag=dag)


# provide_context in default_args above must be True to get the kwargs values
def get_file_names(**kwargs):
    prev_date = kwargs['prev_execution_date']
    next_date = kwargs['next_execution_date']
    redis_conn = redis_hook.get_conn()
    redis_keys = redis_conn.keys('vlass_*')
    results = []

    for r in redis_keys:
        # redis returns keys as bytes; decode before parsing the timestamp suffix
        key_datetime = datetime.strptime(r.decode('utf-8')[6:], '%Y_%m_%d_%H_%M_%S')
        if prev_date < key_datetime < next_date:
            results.append(redis_conn.get(r).decode('utf-8').split()[1:])
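
# The snippet ends before get_file_names is attached to the DAG. A hedged sketch of the
# usual wiring (the PythonOperator import path is the Airflow 1.x one, and the task_id
# and ordering below are assumptions rather than the original code):
from airflow.operators.python_operator import PythonOperator

get_file_names_task = PythonOperator(
    task_id='get_file_names',
    python_callable=get_file_names,
    dag=dag,
)
start_task >> get_file_names_task >> end_task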
Example #7
    cwl_workflow1 = os.path.join(pipeline_name, 'pipeline.cwl')
    cwl_workflow2 = os.path.join('portal-containers', 'ome-tiff-offsets.cwl')
    cwl_workflow3 = os.path.join('portal-containers', 'sprm-to-json.cwl')

    def build_dataset_name(**kwargs):
        return '{}__{}__{}'.format(
            dag.dag_id, kwargs['dag_run'].conf['parent_submission_id'],
            pipeline_name)

#     prepare_cwl1 = PythonOperator(
#         python_callable=utils.clone_or_update_pipeline,
#         task_id='prepare_cwl1',
#         op_kwargs={'pipeline_name': cwl_workflow1}
#     )

    prepare_cwl1 = DummyOperator(task_id='prepare_cwl1')

    def build_cwltool_cmd1(**kwargs):
        ctx = kwargs['dag_run'].conf
        run_id = kwargs['run_id']
        tmpdir = utils.get_tmp_dir_path(run_id)
        print('tmpdir: ', tmpdir)
        data_dir = ctx['parent_lz_path']
        print('data_dir: ', data_dir)
        cwltool_dir = get_cwltool_bin_path()

        command = [
            'env',
            'PATH=%s:%s' % (cwltool_dir, os.environ['PATH']),
            'cwltool',
            os.fspath(PIPELINE_BASE_DIR / cwl_workflow1),
Example #8
    def test_not_skipping_external(self):
        latest_task = LatestOnlyOperator(task_id='latest', dag=self.dag)
        downstream_task = DummyOperator(task_id='downstream', dag=self.dag)
        downstream_task2 = DummyOperator(task_id='downstream_2', dag=self.dag)

        downstream_task.set_upstream(latest_task)
        downstream_task2.set_upstream(downstream_task)

        self.dag.create_dagrun(
            run_id="manual__1",
            start_date=timezone.utcnow(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING,
            external_trigger=True,
        )

        self.dag.create_dagrun(
            run_id="manual__2",
            start_date=timezone.utcnow(),
            execution_date=timezone.datetime(2016, 1, 1, 12),
            state=State.RUNNING,
            external_trigger=True,
        )

        self.dag.create_dagrun(
            run_id="manual__3",
            start_date=timezone.utcnow(),
            execution_date=END_DATE,
            state=State.RUNNING,
            external_trigger=True,
        )

        latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task2.run(start_date=DEFAULT_DATE, end_date=END_DATE)

        latest_instances = get_task_instances('latest')
        exec_date_to_latest_state = {
            ti.execution_date: ti.state
            for ti in latest_instances
        }
        self.assertEqual(
            {
                timezone.datetime(2016, 1, 1): 'success',
                timezone.datetime(2016, 1, 1, 12): 'success',
                timezone.datetime(2016, 1, 2): 'success'
            }, exec_date_to_latest_state)

        downstream_instances = get_task_instances('downstream')
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state
            for ti in downstream_instances
        }
        self.assertEqual(
            {
                timezone.datetime(2016, 1, 1): 'success',
                timezone.datetime(2016, 1, 1, 12): 'success',
                timezone.datetime(2016, 1, 2): 'success'
            }, exec_date_to_downstream_state)

        downstream_instances = get_task_instances('downstream_2')
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state
            for ti in downstream_instances
        }
        self.assertEqual(
            {
                timezone.datetime(2016, 1, 1): 'success',
                timezone.datetime(2016, 1, 1, 12): 'success',
                timezone.datetime(2016, 1, 2): 'success'
            }, exec_date_to_downstream_state)
Example #9
    def test_skipping_non_latest(self):
        latest_task = LatestOnlyOperator(task_id='latest', dag=self.dag)
        downstream_task = DummyOperator(task_id='downstream', dag=self.dag)
        downstream_task2 = DummyOperator(task_id='downstream_2', dag=self.dag)
        downstream_task3 = DummyOperator(task_id='downstream_3',
                                         trigger_rule=TriggerRule.NONE_FAILED,
                                         dag=self.dag)

        downstream_task.set_upstream(latest_task)
        downstream_task2.set_upstream(downstream_task)
        downstream_task3.set_upstream(downstream_task)

        self.dag.create_dagrun(
            run_id="scheduled__1",
            start_date=timezone.utcnow(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING,
        )

        self.dag.create_dagrun(
            run_id="scheduled__2",
            start_date=timezone.utcnow(),
            execution_date=timezone.datetime(2016, 1, 1, 12),
            state=State.RUNNING,
        )

        self.dag.create_dagrun(
            run_id="scheduled__3",
            start_date=timezone.utcnow(),
            execution_date=END_DATE,
            state=State.RUNNING,
        )

        latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task2.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task3.run(start_date=DEFAULT_DATE, end_date=END_DATE)

        latest_instances = get_task_instances('latest')
        exec_date_to_latest_state = {
            ti.execution_date: ti.state
            for ti in latest_instances
        }
        self.assertEqual(
            {
                timezone.datetime(2016, 1, 1): 'success',
                timezone.datetime(2016, 1, 1, 12): 'success',
                timezone.datetime(2016, 1, 2): 'success'
            }, exec_date_to_latest_state)

        downstream_instances = get_task_instances('downstream')
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state
            for ti in downstream_instances
        }
        self.assertEqual(
            {
                timezone.datetime(2016, 1, 1): 'skipped',
                timezone.datetime(2016, 1, 1, 12): 'skipped',
                timezone.datetime(2016, 1, 2): 'success'
            }, exec_date_to_downstream_state)

        downstream_instances = get_task_instances('downstream_2')
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state
            for ti in downstream_instances
        }
        self.assertEqual(
            {
                timezone.datetime(2016, 1, 1): None,
                timezone.datetime(2016, 1, 1, 12): None,
                timezone.datetime(2016, 1, 2): 'success'
            }, exec_date_to_downstream_state)

        downstream_instances = get_task_instances('downstream_3')
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state
            for ti in downstream_instances
        }
        self.assertEqual(
            {
                timezone.datetime(2016, 1, 1): 'success',
                timezone.datetime(2016, 1, 1, 12): 'success',
                timezone.datetime(2016, 1, 2): 'success'
            }, exec_date_to_downstream_state)
Example #10
    def test_lineage(self, _get_backend):
        backend = mock.Mock()
        send_mock = mock.Mock()
        backend.send_lineage = send_mock

        _get_backend.return_value = backend

        dag = DAG(dag_id='test_prepare_lineage', start_date=DEFAULT_DATE)

        f1 = File("/tmp/does_not_exist_1")
        f2 = File("/tmp/does_not_exist_2")
        f3 = File("/tmp/does_not_exist_3")

        with dag:
            op1 = DummyOperator(task_id='leave1',
                                inlets={"datasets": [
                                    f1,
                                ]},
                                outlets={"datasets": [
                                    f2,
                                ]})
            op2 = DummyOperator(task_id='leave2')
            op3 = DummyOperator(task_id='upstream_level_1',
                                inlets={"auto": True},
                                outlets={"datasets": [
                                    f3,
                                ]})
            op4 = DummyOperator(task_id='upstream_level_2')
            op5 = DummyOperator(
                task_id='upstream_level_3',
                inlets={"task_ids": ["leave1", "upstream_level_1"]})

            op1.set_downstream(op3)
            op2.set_downstream(op3)
            op3.set_downstream(op4)
            op4.set_downstream(op5)

        ctx1 = {"ti": TI(task=op1, execution_date=DEFAULT_DATE)}
        ctx2 = {"ti": TI(task=op2, execution_date=DEFAULT_DATE)}
        ctx3 = {"ti": TI(task=op3, execution_date=DEFAULT_DATE)}
        ctx5 = {"ti": TI(task=op5, execution_date=DEFAULT_DATE)}

        func = mock.Mock()
        func.__name__ = 'foo'

        # prepare with manual inlets and outlets
        prep = prepare_lineage(func)
        prep(op1, ctx1)

        self.assertEqual(len(op1.inlets), 1)
        self.assertEqual(op1.inlets[0], f1)

        self.assertEqual(len(op1.outlets), 1)
        self.assertEqual(op1.outlets[0], f2)

        # post process with no backend
        post = apply_lineage(func)
        post(op1, ctx1)
        self.assertEqual(send_mock.call_count, 1)
        send_mock.reset_mock()

        prep(op2, ctx2)
        self.assertEqual(len(op2.inlets), 0)
        post(op2, ctx2)
        self.assertEqual(send_mock.call_count, 1)
        send_mock.reset_mock()

        prep(op3, ctx3)
        self.assertEqual(len(op3.inlets), 1)
        self.assertEqual(op3.inlets[0].qualified_name, f2.qualified_name)
        post(op3, ctx3)
        self.assertEqual(send_mock.call_count, 1)
        send_mock.reset_mock()

        # skip 4

        prep(op5, ctx5)
        self.assertEqual(len(op5.inlets), 2)
        post(op5, ctx5)
        self.assertEqual(send_mock.call_count, 1)
        send_mock.reset_mock()
Example #11
import airflow
from airflow.models import DAG
from datetime import datetime, timedelta
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import BranchPythonOperator

args = {
    'owner': 'airflow',
    'start_date': datetime(2019, 1, 14),
}
# fetch the current hour
curr_time = int(datetime.now().strftime('%H'))

dag = DAG(dag_id='BranchPython_example', default_args=args)

run_this_first = DummyOperator(task_id='run_this_first', dag=dag)

options = [
    'branch_a-10_15', 'branch_b-16_20', 'branch_c-21__', 'branch_d-last'
]


def func_condition():  # compare the current hour against the branch windows
    if 10 < curr_time <= 15:
        return options[0]
    elif 15 < curr_time <= 20:
        return options[1]
    elif curr_time > 20:
        return options[2]
    else:
        return options[3]
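
# The example stops before the branch operator itself is declared. A hedged sketch of
# how func_condition is typically wired up (the task layout below is an assumption,
# not the original code):
branching = BranchPythonOperator(
    task_id='branching',
    python_callable=func_condition,
    dag=dag,
)
branch_tasks = [DummyOperator(task_id=option, dag=dag) for option in options]

run_this_first >> branching >> branch_tasks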
Example #12
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.operators.dummy_operator import DummyOperator
from datetime import datetime

dag = DAG("FailingpythonTask",
          start_date=datetime(2019, 9, 12),
          schedule_interval="@daily",
          catchup=False)


def myPythonFunction():
    # fail on purpose so the "all_done" trigger rule on t2 can be observed
    print("About to raise an exception on purpose")
    raise Exception("intentional failure")


with dag:
    t1 = PythonOperator(task_id="t1",
                        dag=dag,
                        python_callable=myPythonFunction)
    t2 = DummyOperator(task_id="t2", trigger_rule="all_done")

    t1 >> t2

if __name__ == "__main__":
    dag.cli()
Example #13
    finally:
        if conn is not None:
            conn.close()
            print('Closed Database connection')


def total_run():
    for currency in currency_list:
        update_table(currency)


default_args = {
    'owner': 'yurii',
    'depends_on_past': False,
    'start_date': datetime(2018, 6, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 1,
    'backfill': False,
}

money_dag = DAG('exchange_rate',
                default_args=default_args,
                catchup=False,
                schedule_interval='@hourly')

t1 = PythonOperator(task_id='do_all', python_callable=total_run, dag=money_dag)
t0 = DummyOperator(task_id='do_nothing', dag=money_dag)
t1 >> t0
Example #14
    rm_tmp_tables_pre = rm_tmp_tables("_pre")

    sqlalchemy_create_objects_from_schema = SqlAlchemyCreateObjectOperator(
        task_id="sqlalchemy_create_objects_from_schema",
        data_schema_name=DAG_ID)

    add_cdc_ids = [
        PostgresOperator(
            task_id=f"add_cdc_id_to_{table}",
            sql=
            f"ALTER TABLE {table} ADD COLUMN IF NOT EXISTS cdc_id BIGINT UNIQUE NOT NULL ",
        ) for table in table_mappings.values()
    ]

    join_parallel_tasks = DummyOperator(task_id="join_parallel_tasks")

    postgres_create_tables_like = [
        PostgresTableCopyOperator(
            task_id=f"postgres_create_tables_like_{table}",
            source_table_name=table,
            target_table_name=f"{TMP_TABLE_PREFIX}{table}",
            # Only copy table definitions. Don't do anything else.
            truncate_target=False,
            copy_data=False,
            drop_source=False,
        ) for table in table_mappings.values()
    ]

    def _transform_csv_files(**kwargs: Any) -> None:
        """Transform CSV files to have suitable headers and columns for DB insertion."""
Example #15
        "query":
        "SELECT ListingId, __time, ListOfficeName FROM \"{datasource}\" WHERE \"__time\" BETWEEN TIMESTAMP '{yesterday}' AND TIMESTAMP '{today}'"
        .format(datasource=harPropDatasource, yesterday=yesterday, today=today)
    }
    druidQuery = json.dumps(druidJson)

    mlsUrl = harUrl + ' {d}'.format(d=yesterday)

    indexTemplate = downloadTemplate(templateUrl)

    druidIndexSpec = createIndexSpec(indexTemplate, validationDatasource,
                                     intervals, 'nvl("dummyCol1", \'Druid\')')
    mlsIndexSpec = createIndexSpec(indexTemplate, validationDatasource,
                                   intervals, 'nvl("dummyCol1", \'MLS\')')

    start = DummyOperator(task_id='start')

    wait = BashOperator(task_id='wait-for-15m', bash_command="sleep 15m")

    loadDruid = KubernetesPodOperator(
        namespace='data',
        image="truongretell/druiddataloader:latest",
        image_pull_policy='Always',
        cmds=[
            "sh", "-c",
            "dotnet DruidDataLoader.dll '{link}' '/shared-data' 'har-validation' '{yesterday}' '{query}'"
            .format(link=druidUrl, yesterday=yesterday, query=druidQuery)
        ],
        task_id="load-property-sold-validation-task-" + str(yesterday),
        name="load-property-sold-validation-task-" + str(yesterday),
        volumes=[volume],
Example #16
}

dag = DAG('manga',
          default_args=default_args,
          description='dag for ETL manga',
          schedule_interval='50 * * * *')

run_etl = PythonOperator(task_id="myextract",
                         python_callable=myextract,
                         dag=dag)

transf = BranchPythonOperator(task_id="branching",
                              python_callable=mytransforme,
                              provide_context=True,
                              dag=dag)

continue1 = DummyOperator(task_id='continue', dag=dag)
Stop = DummyOperator(task_id='stop', dag=dag)

loadData = PythonOperator(task_id="loadData", python_callable=myload, dag=dag)

alertLoic = EmailOperator(task_id='send_email',
                          to='[email protected]',
                          subject='Airflow Alert',
                          html_content=""" <h3>Email Test</h3> """,
                          dag=dag)

run_etl >> transf
transf >> [continue1, Stop]
continue1 >> loadData
loadData >> alertLoic
Example #17
from datetime import timedelta

import airflow
from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.dummy_operator import DummyOperator

args = {
    "owner": "Airflow",
    "start_date": airflow.utils.dates.days_ago(2),
}

dag = DAG(
    dag_id="my_third_dag",
    default_args=args,
    schedule_interval=timedelta(minutes=150),
    dagrun_timeout=timedelta(minutes=60),
)

run_this_last = DummyOperator(task_id="run_this_last", dag=dag)

# [START howto_operator_bash]
run_this1 = BashOperator(
    task_id="echo_1",
    bash_command="echo 1",
    dag=dag,
)

# [START howto_operator_bash]
run_this2 = BashOperator(
    task_id="echo_2",
    bash_command="echo 2",
    dag=dag,
)
Example #18
        task_id='sub_dag_task3',
        dag=sub_dag,
    )
    t4 = DummyOperator(
        task_id='sub_dag_task4',
        dag=sub_dag,
    )
    t1 >> [t2, t3] >> t4
    return sub_dag


dag = DAG(dag_id='sub_dag_example',
          schedule_interval=None,
          start_date=datetime(2020, 1, 1),
          default_args={"owner": "airflow_lesson"})
start = DummyOperator(task_id='start', dag=dag)

end = DummyOperator(task_id='end', dag=dag)

r_task = DummyOperator(task_id='some_task', dag=dag)
r_task_2 = DummyOperator(task_id='another_task', dag=dag)

for i in range(0, 15):
    sub_dags = SubDagOperator(task_id=f'do_sub_dags_{i}',
                              subdag=prepare_sub_dag(
                                  dag.dag_id, child_dag=f'do_sub_dags_{i}'),
                              dag=dag)
    r_task_2 >> sub_dags >> end

start >> r_task >> r_task_2
Example #19
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    'pool': 'kube',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG(
    'PyhtonDockerHub',
    default_args=default_args,
    schedule_interval=timedelta(days=1),
    dagrun_timeout=timedelta(minutes=5),
)

start = DummyOperator(task_id='run_this_first', dag=dag)

boom = KubernetesPodOperator(
    namespace='airflow',
    image="python:3.6-stretch",
    image_pull_policy="Always",
    cmds=["python", "-c"],
    arguments=["print('hello world')"],
    name="python",
    task_id="startPython",
    is_delete_operator_pod=True,
    hostnetwork=False,
    dag=dag,
    in_cluster=False,
)
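
# The dependency between the two tasks is not shown above; presumably the pod task
# runs after `start` (an assumption based on the task names):
start >> boom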
Example #20
from datetime import timedelta

from airflow.models import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.utils.dates import days_ago

args = {
    'owner': 'airflow',
    'start_date': days_ago(2),
}

dag = DAG(
    dag_id='asdasd',
    default_args=args,
    schedule_interval='0 0 * * *',
    dagrun_timeout=timedelta(minutes=60),
)

run_this_last = DummyOperator(
    task_id='run_this_last',
    dag=dag,
)

# [START howto_operator_bash]
run_this = BashOperator(
    task_id='run_after_loop',
    bash_command='echo 1',
    dag=dag,
)


# [END howto_operator_bash]

run_this >> run_this_last

for i in range(3):
Example #21
    'owner': 'airflow',
    'start_date': airflow.utils.dates.days_ago(2),
    'provide_context': True
}


dag = DAG(
    "Data_Platform_ETL",
    schedule_interval="@daily",
    default_args=args)




process_start = DummyOperator(
    task_id='process_start',
    dag=dag,
)


# Function to perform the ETL task for a MySQL table
def fetch_data_from_MySQL(table_name, props, database, s3_bucket, s3_prefix, **kwargs):
    try:
        table_exec_config = Variable.get("table_run_config", deserialize_json=True)

        if table_name not in table_exec_config:
            print("Table not in table_run_config")
            full_refresh_flag = True
        else:
            print("Table exists")
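
# (The function above is truncated in this listing.) "table_run_config" is read as a
# JSON Airflow Variable; its exact structure is not shown here. A hedged illustration
# of how such a Variable could be seeded from code; the keys and values are hypothetical:
from airflow.models import Variable

Variable.set(
    "table_run_config",
    {"customers": {"last_run_date": "2020-01-01"}},  # hypothetical structure
    serialize_json=True,
)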
Example #22
# AWS_KEY = os.environ.get('AWS_KEY')
# AWS_SECRET = os.environ.get('AWS_SECRET')

default_args = {
    'owner': 'udacity',
    'start_date': datetime(2019, 1, 12),
}

dag = DAG('udac_example_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='@hourly'
        )

start_operator = DummyOperator(task_id='Begin_execution',  dag=dag)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag
)

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag
)

load_songplays_table = LoadFactOperator(
    task_id='Load_songplays_fact_table',
    dag=dag
)
Example #23
        'temp_tables': 'temp_tables.json'
    },
    dag=dag)

conn_id = 'hw4_test_db'

check_db_task = BranchPythonOperator(task_id='check_db',
                                     python_callable=check_db,
                                     op_kwargs={
                                         'conn_id': conn_id,
                                         'success_task_name': 'process_orders',
                                         'failed_task_name': 'db_not_reachable'
                                     },
                                     dag=dag)

db_not_reachable_task = DummyOperator(task_id='db_not_reachable', dag=dag)

notify_error_task = PythonOperator(
    task_id='notify_error',
    python_callable=send_message_from_file,
    op_kwargs={
        'token': Variable.get('HW3_TELEGRAM_BOT_TOKEN_TEST'),
        'chat_id': Variable.get('HW3_TELEGRAM_CHAT_ID_TEST'),
        'message_file_path': error_file_path,
    },
    trigger_rule='one_failed',
    dag=dag)

all_success_task = DummyOperator(task_id='all_success', dag=dag)

check_db_task >> [process_orders_task, db_not_reachable_task]
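
# The snippet ends before notify_error_task and all_success_task are wired in. Given
# the 'one_failed' trigger rule, a plausible (assumed, not original) continuation is:
process_orders_task >> [all_success_task, notify_error_task]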
Example #24
    "depends_on_past": False,
    "start_date": datetime(2019, 1, 24),
    "email": ["*****@*****.**"],
    "email_on_failure": False,
    "email_on_retry": False,
    "on_failure_callback": slack.task_fail_slack_alert,
    "retries": 0,
}


tpu_supported_models = Variable.get("tpu_training_supported_models").split(",")
# distributed_training = Variable.get("distributed_training")

dag = DAG("6-train_model", default_args=default_args, catchup=False, schedule_interval=None)

start_task = DummyOperator(task_id="start_task", dag=dag)
end_task = DummyOperator(task_id="end_task", dag=dag)


package_tensorflow_libs_cmd = f"cd {TENSORFLOW_OBJECT_DETECTION_RESEARCH_FOLDER} && object_detection/dataset_tools/create_pycocotools_package.sh /tmp/pycocotools && python setup.py sdist && (cd slim && python setup.py sdist)"

package_tensorflow_libs_with_dependencies = BashOperator(
    task_id="package_tensorflow_libs_with_dependencies",
    bash_command=package_tensorflow_libs_cmd,
    dag=dag,
)

for json_file in glob(f"{AIRFLOW_TRAINABLE_FOLDER}/*.json"):

    training_name = file_ops.get_filename(json_file, with_extension=False)
    now = datetime.now().strftime("%Y%m%dT%H%M")
Example #25
    'catchup': False,
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('dag_that_executes_via_k8s_executor',
          default_args=default_args,
          schedule_interval=timedelta(minutes=30),
          max_active_runs=1,
          concurrency=10)

# Generate 2 tasks
tasks = ["task{}".format(i) for i in range(1, 3)]
example_dag_complete_node = DummyOperator(task_id="example_dag_complete",
                                          dag=dag)

org_dags = []
for task in tasks:

    bash_command = 'echo HELLO'

    org_node = BashOperator(task_id="{}".format(task),
                            bash_command=bash_command,
                            wait_for_downstream=False,
                            retries=5,
                            dag=dag)
    org_node.set_downstream(example_dag_complete_node)
Example #26
#     start, done = duplex_graph(duplex_method, extract_features_dag)
#
#     duplex_start >> start
#     done >> duplex_done
#

#################################################
# Feature extraction
#################################################
files_for_features = ["_".join(t) for t in product(FILE_NAMES, DUPLEX_DICT.keys())]
files_for_features.sort()

NUMBER_OF_CHUNKS = 5

split_start = DummyOperator(task_id="split_start", dag=extract_features_dag)

split_done = DummyOperator(task_id="split_done", dag=extract_features_dag)

for f in files_for_features:
    t = PythonOperator(
        task_id=f"split_{f}",
        python_callable=split_file,
        op_kwargs={'infile': f"{data_step_path}duplex/{f}.csv",
                   'dir': Path(data_step_path) / "duplex" / "split_files",
Example #27
File: dag.py Project: Jackjaps/JacobsRepo
args = {
    'owner': 'airflow',
}

dag = DAG(
    dag_id='Jonathan_bash_operator',
    default_args=args,
    schedule_interval='0 0 * * *',
    start_date=days_ago(2),
    dagrun_timeout=timedelta(minutes=60),
    tags=['example']
)

task1 = DummyOperator(
    task_id='Start',
    dag=dag
)

# [START howto_operator_bash]
task2 = BashOperator(
    task_id='bash_jacobo',
    bash_command='echo "esta es una ejecucion normal"',
    dag=dag,
)
# [END howto_operator_bash]

task1 >> task2

#if __name__ == "__main__":
#    dag.cli()
Example #28
from airflow.utils.dates import days_ago
from airflow.models import Variable

DAG_ID = "transfer_data_gcp_to_aws_dag"

# To have bucket names parametrized
AWS_BUCKET = Variable.get("AWS_BUCKET")
GCP_BUCKET = Variable.get("GCP_BUCKET")

def_args = {'start_date': days_ago(1), 'owner': 'cm0'}

transfer_dag = DAG(dag_id=DAG_ID,
                   schedule_interval=None,
                   default_args=def_args)

start = DummyOperator(task_id="start", dag=transfer_dag)

transfer_operator = GoogleCloudStorageToS3Operator(
    task_id="transfer_gcp_to_aws",
    bucket=GCP_BUCKET,
    dest_s3_key=AWS_BUCKET,
    replace=True,
    dag=transfer_dag)

end = DummyOperator(task_id="end", dag=transfer_dag)

# For troubleshooting GCP and AWS resource access, you can uncomment the operators below, which list
# both buckets. Not recommended for recursive listings or for buckets without inner folders (i.e. with
# many files directly at the bucket root). Line 41 also needs to be commented out.

# list_gcs = GoogleCloudStorageListOperator(task_id="list_gcs", bucket=GCP_BUCKET, dag=transfer_dag)
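
# A hedged sketch of the troubleshooting tasks described in the comment above. The GCS
# task mirrors the commented example; the S3 counterpart via contrib's S3ListOperator is
# an assumption (and AWS_BUCKET may need to be a bare bucket name rather than an s3:// key):
from airflow.contrib.operators.gcs_list_operator import GoogleCloudStorageListOperator
from airflow.contrib.operators.s3_list_operator import S3ListOperator

list_gcs = GoogleCloudStorageListOperator(task_id='list_gcs', bucket=GCP_BUCKET, dag=transfer_dag)
list_s3 = S3ListOperator(task_id='list_s3', bucket=AWS_BUCKET, dag=transfer_dag)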
Example #29
    pass_value=0,
    task_id='validate_no_dups_movies',
    sql=SqlQueries.validate_no_dups_movies,
    use_legacy_sql=False)

validate_no_dups_person = BigQueryValueCheckOperator(
    dag=dag,
    pass_value=0,
    task_id='validate_no_dups_person',
    sql=SqlQueries.validate_no_dups_person,
    use_legacy_sql=False)

###########################
# Key stages tasks
###########################
start_operator = DummyOperator(task_id='start_dag', dag=dag)
staging_complete = DummyOperator(task_id='staging_complete', dag=dag)
analytics_complete = DummyOperator(task_id='analytics_complete', dag=dag)
validation_complete = DummyOperator(task_id='validation_complete', dag=dag)
end_operator = DummyOperator(task_id='finished_dag', dag=dag)

###########################
# Tasks Dependencies
###########################

# Stage all IMDB data
start_operator >> stage_imdb_name_basics >> staging_complete
start_operator >> stage_imdb_title_ratings >> staging_complete
start_operator >> stage_imdb_title_principals >> staging_complete
start_operator >> stage_imdb_title_basics >> staging_complete
Example #30
from airflow.operators.python_operator import ShortCircuitOperator

args = {
    'owner': 'airflow',
    'start_date': airflow.utils.dates.days_ago(2),
}

dag = DAG(dag_id='example_short_circuit_operator', default_args=args)

cond_true = ShortCircuitOperator(
    task_id='condition_is_True',
    python_callable=lambda: True,
    dag=dag,
)

cond_false = ShortCircuitOperator(
    task_id='condition_is_False',
    python_callable=lambda: False,
    dag=dag,
)

true_1, true_2 = [
    DummyOperator(task_id='true_' + str(i), dag=dag) for i in [1, 2]
]
false_1, false_2 = [
    DummyOperator(task_id='false_' + str(i), dag=dag) for i in [1, 2]
]

cond_true >> true_1 >> true_2
cond_false >> false_1 >> false_2