def test_cross_downstream(self):
    """Test that all dependencies between tasks are set correctly."""
    dag = DAG(dag_id="test_dag", start_date=datetime.now())
    start_tasks = [DummyOperator(task_id="t{i}".format(i=i), dag=dag)
                   for i in range(1, 4)]
    end_tasks = [DummyOperator(task_id="t{i}".format(i=i), dag=dag)
                 for i in range(4, 7)]
    helpers.cross_downstream(from_tasks=start_tasks, to_tasks=end_tasks)
    for start_task in start_tasks:
        # Each start task should point directly at every end task.
        six.assertCountEqual(
            self, start_task.get_direct_relatives(upstream=False), end_tasks)
# The same test on Python 3-only Airflow, calling unittest's
# assertCountEqual directly instead of through the six shim.
def test_cross_downstream(self):
    """Test that all dependencies between tasks are set correctly."""
    dag = DAG(dag_id="test_dag", start_date=datetime.now())
    start_tasks = [DummyOperator(task_id="t{i}".format(i=i), dag=dag)
                   for i in range(1, 4)]
    end_tasks = [DummyOperator(task_id="t{i}".format(i=i), dag=dag)
                 for i in range(4, 7)]
    helpers.cross_downstream(from_tasks=start_tasks, to_tasks=end_tasks)
    for start_task in start_tasks:
        self.assertCountEqual(
            start_task.get_direct_relatives(upstream=False), end_tasks)
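Both tests assert the same behaviour: cross_downstream creates the full cross product of dependency edges between the two task lists. A minimal standalone sketch of the equivalent explicit wiring, assuming Airflow 1.x imports; the dag_id and task_ids here are illustrative:

from datetime import datetime

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator

dag = DAG(dag_id="sketch_dag", start_date=datetime(2020, 1, 1))
start_tasks = [DummyOperator(task_id="s{}".format(i), dag=dag) for i in range(3)]
end_tasks = [DummyOperator(task_id="e{}".format(i), dag=dag) for i in range(3)]

# Equivalent to helpers.cross_downstream(start_tasks, end_tasks): every
# start task becomes a direct upstream of every end task (3 x 3 = 9 edges).
for start_task in start_tasks:
    for end_task in end_tasks:
        start_task.set_downstream(end_task)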
})  # end of the preceding operator definition (truncated above)
t_combine_embed = PythonOperator(
    task_id='combine_embeddings',
    python_callable=nn.combine_embeddings,
    op_kwargs={'metadata_filename': config.metadata_filename})
embed_dir = os.path.join(os.environ['GLEAMS_HOME'], 'data', 'embed')
# Derive the embeddings filename from the metadata filename,
# e.g. 'metadata_<x>.ext' -> 'embed_<x>'.
embed_filename = os.path.join(
    embed_dir,
    os.path.splitext(os.path.basename(
        config.metadata_filename))[0].replace('metadata_', 'embed_'))
# dist_filename = (ann_filename.replace('ann_', 'dist_')
#                  .replace('.faiss', '.npz'))
# t_cluster = PythonOperator(
#     task_id='compute_pairwise_distances',
#     python_callable=cluster.cluster,
#     op_kwargs={'distances_filename': dist_filename}
# )

# Wire up the task dependencies.
t_metadata >> t_split_feat
t_download >> t_enc_feat
# Every per-split feature-combination task depends on both the feature
# splitting and the encoding tasks.
helpers.cross_downstream([t_split_feat, t_enc_feat], t_combine_feat.values())
for suffix in suffixes:
    t_combine_feat[suffix] >> [t_pairs_pos[suffix], t_pairs_neg[suffix]]
[
    t_pairs_pos['train'], t_pairs_neg['train'],
    t_pairs_pos['val'], t_pairs_neg['val']
] >> t_train
t_train >> t_embed
t_embed >> t_combine_embed
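The wiring above assumes t_combine_feat, t_pairs_pos, and t_pairs_neg were built earlier as dicts of operators keyed by dataset split, which is why .values() and [suffix] indexing work. A hedged sketch of how such a mapping might be constructed; the split names and the feature.combine_features callable are assumptions for illustration, not the pipeline's actual definitions:

# Hypothetical construction of the per-split operator dicts used above;
# the split names and python_callable are illustrative assumptions.
suffixes = ['train', 'val', 'test']
t_combine_feat = {
    suffix: PythonOperator(
        task_id='combine_features_{}'.format(suffix),
        python_callable=feature.combine_features,  # assumed callable
        op_kwargs={'suffix': suffix})
    for suffix in suffixes
}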
def create_day_partitioned_ingestion_dag(
        dag_id,
        main_function,
        reingestion_day_list_list,
        start_date=datetime(1970, 1, 1),
        concurrency=1,
        default_args=conf.DAG_DEFAULT_ARGS,
        dagrun_timeout=timedelta(hours=23),
        ingestion_task_timeout=timedelta(hours=2)
):
    """
    Given a `main_function` and `reingestion_day_list_list`, this
    factory method instantiates a DAG that will run the given
    `main_function`, parameterized by a number of dates, whose
    calculation is described below.

    Required Arguments:

    dag_id:                     string giving a unique id of the DAG
                                to be created.
    main_function:              python function to be run. The function
                                must take a single parameter (date)
                                which will be a string of the form
                                'YYYY-MM-DD'.
    reingestion_day_list_list:  list of lists of integers. It gives the
                                set of days before the current execution
                                date of the DAG for which the
                                `main_function` should be run, and
                                describes how the calls to the function
                                should be prioritized.

    Optional Arguments:

    start_date:              datetime.datetime giving the first valid
                             execution_date of the DAG.
    concurrency:             integer that sets the number of tasks which
                             can run simultaneously for this DAG. It's
                             important to keep the rate limits of the
                             Provider API in mind when setting this
                             parameter.
    default_args:            dictionary which is passed to the
                             airflow.dag.DAG __init__ method.
    dagrun_timeout:          datetime.timedelta giving the total amount
                             of time a given dagrun may take.
    ingestion_task_timeout:  datetime.timedelta giving the amount of
                             time a call to the `main_function` is
                             allowed to take.

    Calculation of ingestion dates:

    The `reingestion_day_list_list` should have the form
        [
            [int, ..., int],
            [int, ..., int],
            ...,
            [int, ..., int]
        ]
    It's not necessary for the inner lists to be the same length. The
    DAG instantiated by this factory method will first run the
    `main_function` for the current execution_date, then for the current
    date minus the number of days given by the integers in the first
    list (in an arbitrary order, and possibly in parallel if so
    configured), then for the dates calculated from the second list,
    and so on.

    For example, given the `reingestion_day_list_list`
        [
            [1, 2, 3],
            [8, 13, 18],
            [28, 38, 48]
        ],
    and assuming the current execution date is 2020-01-01, the
    instantiated DAG will run the `main_function` with the parameters
        [
            ['2020-01-01'],
            ['2019-12-31', '2019-12-30', '2019-12-29'],
            ['2019-12-24', '2019-12-19', '2019-12-14'],
            ['2019-12-04', '2019-11-24', '2019-11-14']
        ].
    The order of the inner lists gives the order in which the sets of
    dates may be run. The order within the inner lists is not relevant.

    The size of the inner lists does *not* set the number of
    simultaneous executions of the `main_function` allowed; that is set
    by the `concurrency` parameter.
    """
    args = deepcopy(default_args)
    args.update(start_date=start_date)
    dag = DAG(
        dag_id=dag_id,
        default_args=args,
        concurrency=concurrency,
        max_active_runs=concurrency,
        dagrun_timeout=dagrun_timeout,
        schedule_interval='@daily',
        start_date=start_date,
        catchup=False,
    )
    with dag:
        ingest_operator_list_list = _build_ingest_operator_list_list(
            reingestion_day_list_list,
            dag,
            main_function,
            ingestion_task_timeout
        )
        end_task = ops.get_log_operator(dag, dag.dag_id, 'Finished')
        for i in range(len(ingest_operator_list_list) - 1):
            wait_operator = ops.get_wait_till_done_operator(
                dag,
                f'wait_L{i}'
            )
            # Each priority tier fans out into a wait gate (which
            # releases the next tier) and the final logging task.
            cross_downstream(
                ingest_operator_list_list[i],
                [wait_operator, end_task]
            )
            wait_operator >> ingest_operator_list_list[i + 1]
        ingest_operator_list_list[-1] >> end_task
    return dag
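To make the docstring's date calculation concrete, here is a minimal sketch that reproduces its worked example; compute_ingestion_date_lists is a hypothetical helper written for illustration, not the source's _build_ingest_operator_list_list:

from datetime import datetime, timedelta


def compute_ingestion_date_lists(execution_date_str, reingestion_day_list_list):
    # The current execution date comes first, then one list of dates per
    # inner day list, each date offset backwards by the given number of days.
    execution_date = datetime.strptime(execution_date_str, '%Y-%m-%d')
    date_lists = [[execution_date_str]]
    for day_list in reingestion_day_list_list:
        date_lists.append([
            (execution_date - timedelta(days=days)).strftime('%Y-%m-%d')
            for days in day_list
        ])
    return date_lists


# compute_ingestion_date_lists('2020-01-01', [[1, 2, 3], [8, 13, 18], [28, 38, 48]])
# returns [['2020-01-01'],
#          ['2019-12-31', '2019-12-30', '2019-12-29'],
#          ['2019-12-24', '2019-12-19', '2019-12-14'],
#          ['2019-12-04', '2019-11-24', '2019-11-14']]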