def setUp(self):
    self.dagbag = DagBag()
    self.dag_id = 'etl_covid_data_dag'
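The fixture above only builds the DagBag; a minimal sketch of the kind of integrity test that usually follows such a setUp (the method name and assertions are illustrative additions, not part of the original snippet):

def test_dag_loaded(self):
    # the module should import without errors and the DAG should be in the bag
    dag = self.dagbag.get_dag(self.dag_id)
    self.assertEqual(self.dagbag.import_errors, {})
    self.assertIsNotNone(dag)
    self.assertGreater(len(dag.tasks), 0)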
Example 2
def run(args):

    utils.pessimistic_connection_handling()

    # Setting up logging
    log = os.path.expanduser(configuration.get('core', 'BASE_LOG_FOLDER'))
    directory = log + "/{args.dag_id}/{args.task_id}".format(args=args)
    if not os.path.exists(directory):
        os.makedirs(directory)
    args.execution_date = dateutil.parser.parse(args.execution_date)
    iso = args.execution_date.isoformat()
    filename = "{directory}/{iso}".format(**locals())

    # store old log (to help with S3 appends)
    if os.path.exists(filename):
        with open(filename, 'r') as logfile:
            old_log = logfile.read()
    else:
        old_log = None

    subdir = process_subdir(args.subdir)
    logging.root.handlers = []
    logging.basicConfig(filename=filename,
                        level=settings.LOGGING_LEVEL,
                        format=settings.LOG_FORMAT)

    if not args.pickle:
        dagbag = DagBag(subdir)
        if args.dag_id not in dagbag.dags:
            msg = 'DAG [{0}] could not be found in {1}'.format(
                args.dag_id, subdir)
            logging.error(msg)
            raise AirflowException(msg)
        dag = dagbag.dags[args.dag_id]
        task = dag.get_task(task_id=args.task_id)
    else:
        session = settings.Session()
        logging.info('Loading pickle id {args.pickle}'.format(**locals()))
        dag_pickle = session.query(DagPickle).filter(
            DagPickle.id == args.pickle).first()
        if not dag_pickle:
            raise AirflowException("Who hid the pickle!? [missing pickle]")
        dag = dag_pickle.pickle
        task = dag.get_task(task_id=args.task_id)

    task_start_date = None
    if args.task_start_date:
        task_start_date = dateutil.parser.parse(args.task_start_date)
        task.start_date = task_start_date
    ti = TaskInstance(task, args.execution_date)

    if args.local:
        print("Logging into: " + filename)
        run_job = jobs.LocalTaskJob(
            task_instance=ti,
            mark_success=args.mark_success,
            force=args.force,
            pickle_id=args.pickle,
            task_start_date=task_start_date,
            ignore_dependencies=args.ignore_dependencies,
            pool=args.pool)
        run_job.run()
    elif args.raw:
        ti.run(
            mark_success=args.mark_success,
            force=args.force,
            ignore_dependencies=args.ignore_dependencies,
            job_id=args.job_id,
            pool=args.pool,
        )
    else:
        pickle_id = None
        if args.ship_dag:
            try:
                # Running remotely, so pickling the DAG
                session = settings.Session()
                pickle = DagPickle(dag)
                session.add(pickle)
                session.commit()
                pickle_id = pickle.id
                print(('Pickled dag {dag} '
                       'as pickle_id:{pickle_id}').format(**locals()))
            except Exception as e:
                print('Could not pickle the DAG')
                print(e)
                raise e

        executor = DEFAULT_EXECUTOR
        executor.start()
        print("Sending to executor.")
        executor.queue_task_instance(
            ti,
            mark_success=args.mark_success,
            pickle_id=pickle_id,
            ignore_dependencies=args.ignore_dependencies,
            force=args.force)
        executor.heartbeat()
        executor.end()

    if configuration.get('core', 'S3_LOG_FOLDER').startswith('s3:'):
        import boto
        s3_log = filename.replace(log,
                                  configuration.get('core', 'S3_LOG_FOLDER'))
        bucket, key = s3_log.lstrip('s3:/').split('/', 1)
        if os.path.exists(filename):

            # get logs
            with open(filename, 'r') as logfile:
                new_log = logfile.read()

            # remove old logs (since they are already in S3)
            if old_log:
                new_log = new_log.replace(old_log, '')

            try:
                s3 = boto.connect_s3()
                s3_key = boto.s3.key.Key(s3.get_bucket(bucket), key)

                # append new logs to old S3 logs, if available
                if s3_key.exists():
                    old_s3_log = s3_key.get_contents_as_string().decode()
                    new_log = old_s3_log + '\n' + new_log

                # send log to S3
                encrypt = configuration.getboolean('core', 'ENCRYPT_S3_LOGS')
                s3_key.set_contents_from_string(new_log, encrypt_key=encrypt)
            except Exception:
                print('Could not send logs to S3.')
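Everything this function reads from args can be supplied directly as attributes of an argparse.Namespace, which is what the Airflow CLI hands over after parsing; a hedged sketch of a direct call (DAG and task ids are placeholders):

from argparse import Namespace

args = Namespace(
    dag_id='example_bash_operator',      # placeholder DAG id
    task_id='runme_0',                   # placeholder task id
    execution_date='2016-01-01T00:00:00',
    subdir=None,
    pickle=None,
    task_start_date=None,
    local=True,                          # run through LocalTaskJob
    raw=False,
    ship_dag=False,
    mark_success=False,
    force=False,
    ignore_dependencies=False,
    pool=None,
    job_id=None,
)
run(args)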
Example 3
def index(self):
    dagbag = DagBag()
    return self.render(
        "rest_api_plugin/index.html",
        dags=dagbag.dags,
        airflow_webserver_base_url=airflow_webserver_base_url,
        url_dict=url_dict)
Example 4
def setUp(self):
    self.dagbag = DagBag(dag_folder=DEV_NULL, include_examples=True)
    self.args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
    self.dag = DAG(TEST_DAG_ID, default_args=self.args)
Example 5
def test_dag_loading():
    dagbag = DagBag()
    dag = dagbag.get_dag(dag_id='my_dag')
    assert dagbag.import_errors == {}
    assert dag is not None
    assert len(dag.tasks) == 1
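The test above expects a DAG with id 'my_dag' that carries exactly one task; a hypothetical minimal DAG file that would satisfy it (the operator import path varies by Airflow version):

from datetime import datetime

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator

with DAG(dag_id='my_dag',
         start_date=datetime(2021, 1, 1),
         schedule_interval=None) as dag:
    # a single task keeps the len(dag.tasks) == 1 assertion true
    only_task = DummyOperator(task_id='only_task')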
Example 6
    def test_should_response_200_serialized(self):
        # Create empty app with empty dagbag to check if DAG is read from db
        app_serialized = app.create_app(testing=True)
        dag_bag = DagBag(os.devnull,
                         include_examples=False,
                         read_dags_from_db=True)
        app_serialized.dag_bag = dag_bag
        client = app_serialized.test_client()

        SerializedDagModel.write_dag(self.dag)

        expected = {
            "catchup": True,
            "concurrency": 16,
            "dag_id": "test_dag",
            "dag_run_timeout": None,
            "default_view": "tree",
            "description": None,
            "doc_md": "details",
            "fileloc": __file__,
            "is_paused": None,
            "is_subdag": False,
            "orientation": "LR",
            "owners": [],
            "schedule_interval": {
                "__type": "TimeDelta",
                "days": 1,
                "microseconds": 0,
                "seconds": 0,
            },
            "start_date": "2020-06-15T00:00:00+00:00",
            "tags": None,
            "timezone": "Timezone('UTC')",
        }
        response = client.get(f"/api/v1/dags/{self.dag_id}/details",
                              environ_overrides={'REMOTE_USER': "******"})
        assert response.status_code == 200
        assert response.json == expected

        response = self.client.get(f"/api/v1/dags/{self.dag_id}/details",
                                   environ_overrides={'REMOTE_USER': "******"})
        assert response.status_code == 200
        expected = {
            'catchup': True,
            'concurrency': 16,
            'dag_id': 'test_dag',
            'dag_run_timeout': None,
            'default_view': 'tree',
            'description': None,
            'doc_md': 'details',
            'fileloc': __file__,
            'is_paused': None,
            'is_subdag': False,
            'orientation': 'LR',
            'owners': [],
            'schedule_interval': {
                '__type': 'TimeDelta',
                'days': 1,
                'microseconds': 0,
                'seconds': 0
            },
            'start_date': '2020-06-15T00:00:00+00:00',
            'tags': None,
            'timezone': "Timezone('UTC')",
        }
        assert response.json == expected
Example 7
def dag_report(args):
    """Displays dagbag stats at the command line"""
    dagbag = DagBag(process_subdir(args.subdir))
    print(tabulate(dagbag.dagbag_stats, headers="keys", tablefmt=args.output))
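dagbag_stats is a list of per-file parsing records, so it feeds straight into tabulate; outside the Airflow CLI the same function can be exercised with a plain namespace (values are illustrative):

from types import SimpleNamespace

# 'plain' is one of tabulate's built-in table formats
dag_report(SimpleNamespace(subdir='~/airflow/dags', output='plain'))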
Example 8
def setUp(self):
    self.dagbag = DagBag()
    session = settings.Session()
    session.query(models.ImportError).delete()
    session.commit()
Example 9
def setUp(self):
    self.parser = cli.CLIFactory.get_parser()
    self.dagbag = DagBag(include_examples=True)
Example 10
def run(args):

    utils.pessimistic_connection_handling()

    # Setting up logging
    log_base = os.path.expanduser(configuration.get('core', 'BASE_LOG_FOLDER'))
    directory = log_base + "/{args.dag_id}/{args.task_id}".format(args=args)
    if not os.path.exists(directory):
        os.makedirs(directory)
    args.execution_date = dateutil.parser.parse(args.execution_date)
    iso = args.execution_date.isoformat()
    filename = "{directory}/{iso}".format(**locals())

    subdir = process_subdir(args.subdir)
    logging.root.handlers = []
    logging.basicConfig(
        filename=filename,
        level=settings.LOGGING_LEVEL,
        format=settings.LOG_FORMAT)

    if not args.pickle:
        dagbag = DagBag(subdir)
        if args.dag_id not in dagbag.dags:
            msg = 'DAG [{0}] could not be found in {1}'.format(args.dag_id, subdir)
            logging.error(msg)
            raise AirflowException(msg)
        dag = dagbag.dags[args.dag_id]
        task = dag.get_task(task_id=args.task_id)
    else:
        session = settings.Session()
        logging.info('Loading pickle id {args.pickle}'.format(**locals()))
        dag_pickle = session.query(
            DagPickle).filter(DagPickle.id == args.pickle).first()
        if not dag_pickle:
            raise AirflowException("Who hid the pickle!? [missing pickle]")
        dag = dag_pickle.pickle
        task = dag.get_task(task_id=args.task_id)

    task_start_date = None
    if args.task_start_date:
        task_start_date = dateutil.parser.parse(args.task_start_date)
        task.start_date = task_start_date
    ti = TaskInstance(task, args.execution_date)

    if args.local:
        print("Logging into: " + filename)
        run_job = jobs.LocalTaskJob(
            task_instance=ti,
            mark_success=args.mark_success,
            force=args.force,
            pickle_id=args.pickle,
            task_start_date=task_start_date,
            ignore_dependencies=args.ignore_dependencies,
            pool=args.pool)
        run_job.run()
    elif args.raw:
        ti.run(
            mark_success=args.mark_success,
            force=args.force,
            ignore_dependencies=args.ignore_dependencies,
            job_id=args.job_id,
            pool=args.pool,
        )
    else:
        pickle_id = None
        if args.ship_dag:
            try:
                # Running remotely, so pickling the DAG
                session = settings.Session()
                pickle = DagPickle(dag)
                session.add(pickle)
                session.commit()
                pickle_id = pickle.id
                print((
                    'Pickled dag {dag} '
                    'as pickle_id:{pickle_id}').format(**locals()))
            except Exception as e:
                print('Could not pickle the DAG')
                print(e)
                raise e

        executor = DEFAULT_EXECUTOR
        executor.start()
        print("Sending to executor.")
        executor.queue_task_instance(
            ti,
            mark_success=args.mark_success,
            pickle_id=pickle_id,
            ignore_dependencies=args.ignore_dependencies,
            force=args.force,
            pool=args.pool)
        executor.heartbeat()
        executor.end()

    # store logs remotely
    remote_base = configuration.get('core', 'REMOTE_BASE_LOG_FOLDER')

    # deprecated as of March 2016
    if not remote_base and configuration.get('core', 'S3_LOG_FOLDER'):
        warnings.warn(
            'The S3_LOG_FOLDER configuration key has been replaced by '
            'REMOTE_BASE_LOG_FOLDER. Your configuration still works but please '
            'update airflow.cfg to ensure future compatibility.',
            DeprecationWarning)
        remote_base = configuration.get('core', 'S3_LOG_FOLDER')

    if os.path.exists(filename):
        # read log and remove old logs to get just the latest additions

        with open(filename, 'r') as logfile:
            log = logfile.read()

        remote_log_location = filename.replace(log_base, remote_base)
        # S3

        if remote_base.startswith('s3:/'):
            utils.S3Log().write(log, remote_log_location)
        # GCS
        elif remote_base.startswith('gs:/'):
            utils.GCSLog().write(
                log,
                remote_log_location,
                append=True)
        # Other
        elif remote_base:
            logging.error(
                'Unsupported remote log location: {}'.format(remote_base))
Example 11
def setUpClass(cls):
    cls.dagbag = DagBag(include_examples=True)
    cls.parser = cli.CLIFactory.get_parser()
Example 12
def setUpClass(cls):
    super().setUpClass()
    DagBag(example_bash_operator.__file__).get_dag(
        "example_bash_operator").sync_to_db()
Example 13
def setUp(self):
    self.dagbag = DagBag()
    # get_dag() returns the DAG object itself, despite the attribute name
    self.dag_id = self.dagbag.get_dag('sync_country_from_zendesk_pipeline')
    self.dag_login_aws = self.dag_id.tasks[0]
    self.dag_sync_country_from_zendesk_pipeline = self.dag_id.tasks[1]
Example 14
def setUp(self):
    variable = Variable()
    self.dagbag = DagBag(dag_folder=variable.get('dags_folder'))
Example 15
def setUp(self):
    self.dagbag = DagBag(include_examples=True)
    self.cluster = LocalCluster()
Example 16
    def test_retry_still_in_executor(self):
        """
        Checks that the scheduler does not put a task in limbo when the task is retried
        but is still present in the executor.
        """
        executor = TestExecutor()
        dagbag = DagBag(executor=executor)
        dagbag.dags.clear()
        dagbag.executor = executor

        dag = DAG(dag_id='test_retry_still_in_executor',
                  start_date=DEFAULT_DATE,
                  schedule_interval="@once")
        dag_task1 = BashOperator(task_id='test_retry_handling_op',
                                 bash_command='exit 1',
                                 retries=1,
                                 dag=dag,
                                 owner='airflow')

        dag.clear()
        dag.is_subdag = False

        session = settings.Session()
        orm_dag = DagModel(dag_id=dag.dag_id)
        orm_dag.is_paused = False
        session.merge(orm_dag)
        session.commit()

        dagbag.bag_dag(dag=dag, root_dag=dag, parent_dag=dag)

        @mock.patch('airflow.models.DagBag', return_value=dagbag)
        @mock.patch('airflow.models.DagBag.collect_dags')
        def do_schedule(function, function2):
            # Use an empty file since the above mock will return the
            # expected DAGs. Also specify only a single file so that it doesn't
            # try to schedule the above DAG repeatedly.
            scheduler = SchedulerJob(num_runs=1,
                                     executor=executor,
                                     subdir=os.path.join(
                                         models.DAGS_FOLDER, "no_dags.py"))
            scheduler.heartrate = 0
            scheduler.run()

        do_schedule()
        self.assertEqual(1, len(executor.queued_tasks))

        def run_with_error(task):
            try:
                task.run()
            except AirflowException:
                pass

        ti_tuple = six.next(six.itervalues(executor.queued_tasks))
        (command, priority, queue, ti) = ti_tuple
        ti.task = dag_task1

        # fail execution
        run_with_error(ti)
        self.assertEqual(ti.state, State.UP_FOR_RETRY)
        self.assertEqual(ti.try_number, 1)

        ti.refresh_from_db(lock_for_update=True, session=session)
        ti.state = State.SCHEDULED
        session.merge(ti)
        session.commit()

        # do not schedule
        do_schedule()
        self.assertTrue(executor.has_task(ti))
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.SCHEDULED)

        # now that the executor has cleared, the task should be allowed to re-queue
        executor.queued_tasks.clear()
        do_schedule()
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.QUEUED)
Example 17
def setUp(self):
    """Set up the DAG validation class instance for testing."""
    self.dagbag = DagBag()
Example 18
def setUp(self):
    DAGS_DIR = os.getenv('INPUT_DAGPATHS')
    os.environ['PYTHONPATH'] = f"{os.getenv('PYTHONPATH')}:{DAGS_DIR}"
    logging.info("DAGs dir: {}".format(DAGS_DIR))
    self.dagbag = DagBag(dag_folder=DAGS_DIR, include_examples=False)
Example 19
def dag_list_dags(args):
    """Displays dags with or without stats at the command line"""
    dagbag = DagBag(process_subdir(args.subdir))
    dags = dagbag.dags.values()
    print(_tabulate_dags(dags, tablefmt=args.output))
Example 20
def setUpClass(cls):
    cls.dagbag = DagBag(include_examples=True)
    cls.dagbag.sync_to_db()
    cls.parser = cli_parser.get_parser()
Example 21
def get_dagbag():
    return DagBag()
Example 22
def get_dag(args):
    dagbag = DagBag(process_subdir(args.subdir))
    if args.dag_id not in dagbag.dags:
        raise AirflowException('dag_id could not be found: {}'.format(
            args.dag_id))
    return dagbag.dags[args.dag_id]
Example 23
    def test_handle_failure_callback_with_zombies_are_correctly_passed_to_dag_file_processor(self):
        """
        Check that the same set of failure callbacks with zombies is passed to the dag
        file processors until the next zombie detection logic is invoked.
        """
        test_dag_path = os.path.join(TEST_DAG_FOLDER, 'test_example_bash_operator.py')
        with conf_vars({('scheduler', 'max_threads'): '1',
                        ('core', 'load_examples'): 'False'}):
            dagbag = DagBag(test_dag_path)
            with create_session() as session:
                session.query(LJ).delete()
                dag = dagbag.get_dag('test_example_bash_operator')
                dag.sync_to_db()
                task = dag.get_task(task_id='run_this_last')

                ti = TI(task, DEFAULT_DATE, State.RUNNING)
                local_job = LJ(ti)
                local_job.state = State.SHUTDOWN
                session.add(local_job)
                session.commit()

                # TODO: If there was an actual Relationship between TI and Job
                # we wouldn't need this extra commit
                session.add(ti)
                ti.job_id = local_job.id
                session.commit()

                expected_failure_callback_requests = [
                    TaskCallbackRequest(
                        full_filepath=dag.full_filepath,
                        simple_task_instance=SimpleTaskInstance(ti),
                        msg="Message"
                    )
                ]

            test_dag_path = os.path.join(TEST_DAG_FOLDER, 'test_example_bash_operator.py')

            child_pipe, parent_pipe = multiprocessing.Pipe()
            async_mode = 'sqlite' not in conf.get('core', 'sql_alchemy_conn')

            fake_processors = []

            def fake_processor_factory(*args, **kwargs):
                nonlocal fake_processors
                processor = FakeDagFileProcessorRunner._fake_dag_processor_factory(*args, **kwargs)
                fake_processors.append(processor)
                return processor

            manager = DagFileProcessorManager(
                dag_directory=test_dag_path,
                max_runs=1,
                processor_factory=fake_processor_factory,
                processor_timeout=timedelta.max,
                signal_conn=child_pipe,
                dag_ids=[],
                pickle_dags=False,
                async_mode=async_mode)

            self.run_processor_manager_one_loop(manager, parent_pipe)

            if async_mode:
                # Once for initial parse, and then again for the add_callback_to_queue
                assert len(fake_processors) == 2
                assert fake_processors[0]._file_path == test_dag_path
                assert fake_processors[0]._callback_requests == []
            else:
                assert len(fake_processors) == 1

            assert fake_processors[-1]._file_path == test_dag_path
            callback_requests = fake_processors[-1]._callback_requests
            assert (
                set(zombie.simple_task_instance.key for zombie in expected_failure_callback_requests) ==
                set(result.simple_task_instance.key for result in callback_requests)
            )

            child_pipe.close()
            parent_pipe.close()
Example 24
def test_dag_loads_with_no_errors(tmpdir):
    tmp_directory = str(tmpdir)
    dag_bag = DagBag(dag_folder=tmp_directory, include_examples=False)
    dag_bag.process_file(os.path.join(FILE_DIR, 'loader_workflow.py'))
    assert len(dag_bag.import_errors) == 0
    assert len(dag_bag.dags) == 2
Example 25
def setUpClass(cls):
    cls.dagbag = DagBag(include_examples=True)
Example 26
def dagbag():
    return DagBag(include_examples=False)
Example 27
def list_dags(args):
    dagbag = DagBag(process_subdir(args.subdir))
    print("\n".join(sorted(dagbag.dags)))
Example 28
def setUp(self):
    self.dagbag = DagBag(include_examples=True)
Example 29
def make_example_dags(module_path):
    """Loads DAGs from a module for test."""
    dagbag = DagBag(module_path)
    return dagbag.dags
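The helper simply points a DagBag at an arbitrary path; a typical use is loading the example DAGs that ship with Airflow (a sketch, assuming the bundled airflow.example_dags package is available):

import os

from airflow.example_dags import example_bash_operator

# load every DAG file sitting next to example_bash_operator.py
example_dags = make_example_dags(os.path.dirname(example_bash_operator.__file__))
assert 'example_bash_operator' in example_dags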
Example 30
def setUp(self):
    self.dagbag = DagBag()