Example 1
    def test_cross_downstream(self):
        """Test if all dependencies between tasks are all set correctly."""
        dag = DAG(dag_id="test_dag", start_date=datetime.now())
        start_tasks = [DummyOperator(task_id="t{i}".format(i=i), dag=dag) for i in range(1, 4)]
        end_tasks = [DummyOperator(task_id="t{i}".format(i=i), dag=dag) for i in range(4, 7)]
        helpers.cross_downstream(from_tasks=start_tasks, to_tasks=end_tasks)

        for start_task in start_tasks:
            six.assertCountEqual(self, start_task.get_direct_relatives(upstream=False), end_tasks)
Example 2
    def test_cross_downstream(self):
        """Test if all dependencies between tasks are all set correctly."""
        dag = DAG(dag_id="test_dag", start_date=datetime.now())
        start_tasks = [DummyOperator(task_id="t{i}".format(i=i), dag=dag) for i in range(1, 4)]
        end_tasks = [DummyOperator(task_id="t{i}".format(i=i), dag=dag) for i in range(4, 7)]
        helpers.cross_downstream(from_tasks=start_tasks, to_tasks=end_tasks)

        for start_task in start_tasks:
            self.assertCountEqual(start_task.get_direct_relatives(upstream=False), end_tasks)
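
Both tests above verify the same behaviour; the following is a minimal standalone sketch of the dependency pattern they check, assuming the Airflow 1.x-style `helpers.cross_downstream` and `DummyOperator` imports used in these examples:

from datetime import datetime

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.utils import helpers

dag = DAG(dag_id="cross_downstream_sketch", start_date=datetime(2020, 1, 1))
starts = [DummyOperator(task_id="s{i}".format(i=i), dag=dag) for i in range(3)]
ends = [DummyOperator(task_id="e{i}".format(i=i), dag=dag) for i in range(3)]

# Equivalent to nested loops: every start task becomes a direct upstream
# of every end task, i.e. a full bipartite set of dependencies.
helpers.cross_downstream(from_tasks=starts, to_tasks=ends)
# Each start task now has "e0", "e1" and "e2" as its direct downstream tasks.
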
Example 3
                             })
    t_combine_embed = PythonOperator(
        task_id='combine_embeddings',
        python_callable=nn.combine_embeddings,
        op_kwargs={'metadata_filename': config.metadata_filename})
    embed_dir = os.path.join(os.environ['GLEAMS_HOME'], 'data', 'embed')
    embed_filename = os.path.join(
        embed_dir,
        os.path.splitext(os.path.basename(
            config.metadata_filename))[0].replace('metadata_', 'embed_'))
    # dist_filename = (ann_filename.replace('ann_', 'dist_')
    #                              .replace('.faiss', '.npz'))
    # t_cluster = PythonOperator(
    #     task_id='compute_pairwise_distances',
    #     python_callable=cluster.cluster,
    #     op_kwargs={'distances_filename': dist_filename}
    # )

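    # Wire the pipeline: the metadata and download tasks feed the split/encode
    # feature tasks, which in turn fan out into every combine-feature task.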
    t_metadata >> t_split_feat
    t_download >> t_enc_feat
    helpers.cross_downstream([t_split_feat, t_enc_feat],
                             t_combine_feat.values())
    for suffix in suffixes:
        t_combine_feat[suffix] >> [t_pairs_pos[suffix], t_pairs_neg[suffix]]
    [
        t_pairs_pos['train'], t_pairs_neg['train'], t_pairs_pos['val'],
        t_pairs_neg['val']
    ] >> t_train
    t_train >> t_embed
    t_embed >> t_combine_embed
Example 4
def create_day_partitioned_ingestion_dag(
    dag_id,
    main_function,
    reingestion_day_list_list,
    start_date=datetime(1970, 1, 1),
    concurrency=1,
    default_args=conf.DAG_DEFAULT_ARGS,
    dagrun_timeout=timedelta(hours=23),
    ingestion_task_timeout=timedelta(hours=2)
):
    """
    Given a `main_function` and `reingestion_day_list_list`, this
    factory method instantiates a DAG that will run the given
    `main_function`, parameterized by a number of dates, whose
    calculation is described below.

    Required Arguments:

    dag_id:                     string giving a unique id of the DAG to
                                be created.
    main_function:              python function to be run. The
                                function must take a single parameter
                                (date) which will be a string of the
                                form 'YYYY-MM-DD'.
    reingestion_day_list_list:  list of lists of integers. It gives the
                                set of days before the current execution
                                date of the DAG for which the
                                `main_function` should be run, and
                                describes how the calls to the function
                                should be prioritized.

    Optional Arguments:

    start_date:              datetime.datetime giving the
                             first valid execution_date of the DAG.
    concurrency:             integer that sets the number of tasks which
                             can run simultaneously for this DAG. It's
                             important to keep the rate limits of the
                             Provider API in mind when setting this
                             parameter.
    default_args:            dictionary which is passed to the
                             airflow.dag.DAG __init__ method.
    dagrun_timeout:          datetime.timedelta giving the total amount
                             of time a given dagrun may take.
    ingestion_task_timeout:  datetime.timedelta giving the amount of
                             time a call to the `main_function` is
                             allowed to take.

    Calculation of ingestion dates:

    The `reingestion_day_list_list` should have the form
        [
            [int, ..., int],
            [int, ..., int],
            ...,
            [int, ..., int]
        ]
    It's not necessary for the inner lists to be the same length. The
    DAG instantiated by this factory method will first run the
    `main_function` for the current execution_date, then for the current
    date minus the number of days given by integers in the first list
    (in an arbitrary order, and possibly in parallel if so configured),
    then for the dates calculated from the second list, and so on.  For
    example, given the `reingestion_day_list_list`
        [
            [1, 2, 3],
            [8, 13, 18],
            [28, 38, 48]
        ],
    and assuming the current execution date is 2020-01-01, the
    instantiated dag will run the `main_function` with the parameters
        [
            ['2020-01-01'],
            ['2019-12-31', '2019-12-30', '2019-12-29'],
            ['2019-12-24', '2019-12-19', '2019-12-14'],
            ['2019-12-04', '2019-11-24', '2019-11-14']
        ].
    The order of the inner lists gives the order in which sets of dates
    may be run.  The order within the inner lists is not relevant.  The
    size of the inner lists does *not* set the number of simultaneous
    executions of the `main_function` allowed; that is set by the
    `concurrency` parameter.
    """
    args = deepcopy(default_args)
    args.update(start_date=start_date)
    dag = DAG(
        dag_id=dag_id,
        default_args=args,
        concurrency=concurrency,
        max_active_runs=concurrency,
        dagrun_timeout=dagrun_timeout,
        schedule_interval='@daily',
        start_date=start_date,
        catchup=False,
    )
    with dag:
        ingest_operator_list_list = _build_ingest_operator_list_list(
            reingestion_day_list_list,
            dag,
            main_function,
            ingestion_task_timeout
        )
        end_task = ops.get_log_operator(dag, dag.dag_id, 'Finished')
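        # For each group of ingest tasks except the last, make every task in
        # the group upstream of both the next group's wait gate and the final
        # logging task; the last group feeds the logging task directly.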
        for i in range(len(ingest_operator_list_list) - 1):
            wait_operator = ops.get_wait_till_done_operator(
                dag,
                f'wait_L{i}'
            )
            cross_downstream(
                ingest_operator_list_list[i],
                [
                    wait_operator,
                    end_task
                ]
            )
            wait_operator >> ingest_operator_list_list[i + 1]
        ingest_operator_list_list[-1] >> end_task

    return dag
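
The date arithmetic described in the docstring can be sketched in isolation. `compute_reingestion_dates` below is a hypothetical helper, written only to illustrate the calculation; it is not part of this module:

from datetime import datetime, timedelta


def compute_reingestion_dates(execution_date, reingestion_day_list_list):
    # First group: the current execution date itself.
    date_groups = [[execution_date.strftime('%Y-%m-%d')]]
    # Each following group: the execution date minus each offset in one inner list.
    for day_list in reingestion_day_list_list:
        date_groups.append([
            (execution_date - timedelta(days=days)).strftime('%Y-%m-%d')
            for days in day_list
        ])
    return date_groups


# Reproduces the date groups shown in the docstring above.
print(compute_reingestion_dates(
    datetime(2020, 1, 1), [[1, 2, 3], [8, 13, 18], [28, 38, 48]]))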