Example #1
0
    def create_dagrun(self, *args, **kwargs):
        """Create the DAG run through Airflow, then report it to Marquez.

        All positional and keyword arguments are forwarded untouched to the
        parent ``create_dagrun()``. Only ``execution_date`` and
        ``external_trigger`` are read from ``kwargs`` for the Marquez side.

        Metadata collection is best effort: failures while loading the
        extractors, creating the namespace or recording a task are logged
        and swallowed.

        :return: the DAG run produced by the parent ``create_dagrun()``.
        """
        # Airflow goes first; an exception raised here propagates and no
        # metadata is collected.
        dagrun = super(DAG, self).create_dagrun(*args, **kwargs)

        started_ms = self._now_ms()
        execution_date = kwargs.get('execution_date')
        run_args = {'external_trigger': kwargs.get('external_trigger', False)}

        try:
            extractors = get_extractors()
        except Exception as err:
            # Carry on with no extractors at all.
            extractors = {}
            log.warn(
                f'Failed retrieve extractors: {err}',
                airflow_dag_id=self.dag_id,
                marquez_namespace=self.marquez_namespace)

        # Marquez metadata collection
        try:
            client = self.get_marquez_client()

            # TODO: the namespace owner is hard-coded to 'anonymous' for
            # now; the 'owner' attribute defined via default_args for a DAG
            # may be a better choice.
            client.create_namespace(self.marquez_namespace, "anonymous")

            # Register every task of the DAG. A task that fails is logged
            # and skipped so the remaining tasks are still reported.
            for task_id, task in self.task_dict.items():
                task_started_ms = self._now_ms()
                try:
                    self.report_task(
                        dagrun.run_id,
                        execution_date,
                        run_args,
                        task,
                        extractors.get(task.__class__.__name__),
                    )
                except Exception as err:
                    log.error(
                        f'Failed to record task: {err}',
                        airflow_dag_id=self.dag_id,
                        task_id=task_id,
                        marquez_namespace=self.marquez_namespace,
                        duration_ms=(self._now_ms() - task_started_ms))

            log.info(
                'Successfully recorded metadata',
                airflow_dag_id=self.dag_id,
                marquez_namespace=self.marquez_namespace,
                duration_ms=(self._now_ms() - started_ms))

        except Exception as err:
            log.error(
                f'Failed to record metadata: {err}',
                airflow_dag_id=self.dag_id,
                marquez_namespace=self.marquez_namespace,
                duration_ms=(self._now_ms() - started_ms))

        return dagrun
Example #2
0
    def report_jobrun(self, run_args, execution_date):
        """Record this DAG as a Marquez job and create a run for it.

        Calls ``create_job`` with the job named after ``self.dag_id``, then
        ``create_job_run`` with ``execution_date`` as the nominal start time
        and ``self.compute_endtime(execution_date)`` as the nominal end
        time. When the response carries a ``runId`` the run is marked as
        running; otherwise a warning is logged.

        Exceptions raised by the Marquez client are not caught here.

        :param run_args: passed through as ``run_args`` to
            ``create_job_run``.
        :param execution_date: date-like object supporting ``strftime``.
        :return: the Marquez run id, or ``None`` when the response did not
            contain one.
        """
        now_ms = self._now_ms()
        time_format = "%Y-%m-%dT%H:%M:%SZ"

        job_name = self.dag_id
        # strftime (rather than .format) so the start and end times are
        # rendered the same way and plain datetime objects are accepted.
        start_time = execution_date.strftime(time_format)
        end_time = self.compute_endtime(execution_date)
        if end_time:
            end_time = end_time.strftime(time_format)
        marquez_client = self.get_marquez_client()

        marquez_client.create_job(job_name,
                                  self.marquez_location,
                                  self.marquez_input_urns,
                                  self.marquez_output_urns,
                                  description=self.description)
        log.info(f'Successfully recorded job: {job_name}',
                 airflow_dag_id=self.dag_id,
                 marquez_namespace=self.marquez_namespace)

        marquez_jobrun = marquez_client.create_job_run(
            job_name,
            run_args=run_args,
            nominal_start_time=start_time,
            nominal_end_time=end_time)

        # An empty or missing response is treated like a response without
        # a run id, so it reaches the warning below instead of raising
        # AttributeError.
        marquez_jobrun_id = (
            marquez_jobrun.get('runId') if marquez_jobrun else None)
        if marquez_jobrun_id:
            marquez_client.mark_job_run_as_running(marquez_jobrun_id)
            log.info(f'Successfully recorded job run: {job_name}',
                     airflow_dag_id=self.dag_id,
                     airflow_dag_execution_time=start_time,
                     marquez_run_id=marquez_jobrun_id,
                     marquez_namespace=self.marquez_namespace,
                     duration_ms=(self._now_ms() - now_ms))
        else:
            log.warn(f'Run id not found: {job_name}',
                     airflow_dag_id=self.dag_id,
                     airflow_dag_execution_time=start_time,
                     marquez_run_id=marquez_jobrun_id,
                     marquez_namespace=self.marquez_namespace,
                     duration_ms=(self._now_ms() - now_ms))

        return marquez_jobrun_id
Example #3
0
    def report_task(self, dag_run_id, execution_date, run_args, task,
                    extractor):
        """Record one task of a DAG run as Marquez jobs and job runs.

        The task is broken into steps by ``extractor(task).extract()``.
        When no extractor is given, extraction fails or it yields nothing,
        a single ``StepMetadata`` named ``<dag_id>.<task_id>`` is used
        instead. For every step the input and output datasets are
        registered, a job is created, a job run is created and, when a run
        id comes back, the run is marked as started. Finally the collected
        run ids are stored as JSON in ``self._job_id_mapping`` under the
        key built from the task name and ``dag_run_id``.

        Failures while resolving the location, extracting metadata,
        registering datasets or storing the id mapping are logged and
        swallowed. Exceptions from ``create_job``, ``create_job_run`` and
        ``mark_job_run_as_started`` are not caught here.

        :param dag_run_id: id of the Airflow DAG run being reported.
        :param execution_date: nominal start of the run; when falsy, both
            nominal times are sent as ``None``.
        :param run_args: passed through as ``run_args`` to
            ``create_job_run``.
        :param task: the task to report.
        :param extractor: callable invoked as ``extractor(task)`` whose
            result provides ``extract()``, or a falsy value when no
            extractor is available for this task.
        """
        report_job_start_ms = self._now_ms()
        marquez_client = self.get_marquez_client()

        start_time = None
        end_time = None
        if execution_date:
            start_time = self._to_iso_8601(execution_date)
            end_time = self.compute_endtime(execution_date)
            if end_time:
                end_time = self._to_iso_8601(end_time)

        # Prefer the task's own file_path; fall back to the DAG file.
        task_location = None
        try:
            file_path = getattr(task, 'file_path', None)
            task_location = get_location(file_path or task.dag.fileloc)
        except Exception as e:
            log.warn(f'Unable to fetch the location: {e}',
                     airflow_dag_id=self.dag_id,
                     task_id=task.task_id,
                     marquez_namespace=self.marquez_namespace)

        steps_metadata = []
        if extractor:
            try:
                log.info(f'Using extractor {extractor.__name__}',
                         task_type=task.__class__.__name__,
                         airflow_dag_id=self.dag_id,
                         task_id=task.task_id,
                         airflow_run_id=dag_run_id,
                         marquez_namespace=self.marquez_namespace)
                steps_metadata = extractor(task).extract()
            except Exception as e:
                log.error(f'Failed to extract metadata {e}',
                          airflow_dag_id=self.dag_id,
                          task_id=task.task_id,
                          airflow_run_id=dag_run_id,
                          marquez_namespace=self.marquez_namespace)
        else:
            log.warn('Unable to find an extractor.',
                     task_type=task.__class__.__name__,
                     airflow_dag_id=self.dag_id,
                     task_id=task.task_id,
                     airflow_run_id=dag_run_id,
                     marquez_namespace=self.marquez_namespace)

        task_name = f'{self.dag_id}.{task.task_id}'

        # If no extractor found or failed to extract metadata,
        # report the task metadata
        if not steps_metadata:
            steps_metadata = [StepMetadata(task_name)]

        # store all the JobRuns associated with a task
        marquez_jobrun_ids = []

        for step in steps_metadata:
            # A failed registration leaves the corresponding list empty;
            # the job is still created.
            input_datasets = []
            output_datasets = []

            try:
                input_datasets = self.register_datasets(step.inputs)
            except Exception as e:
                log.error(f'Failed to register inputs: {e}',
                          inputs=str(step.inputs),
                          airflow_dag_id=self.dag_id,
                          task_id=task.task_id,
                          step=step.name,
                          airflow_run_id=dag_run_id,
                          marquez_namespace=self.marquez_namespace)
            try:
                output_datasets = self.register_datasets(step.outputs)
            except Exception as e:
                log.error(f'Failed to register outputs: {e}',
                          outputs=str(step.outputs),
                          airflow_dag_id=self.dag_id,
                          task_id=task.task_id,
                          step=step.name,
                          airflow_run_id=dag_run_id,
                          marquez_namespace=self.marquez_namespace)

            marquez_client.create_job(
                job_name=step.name,
                job_type='BATCH',  # job type
                location=(step.location or task_location),
                input_dataset=input_datasets,
                output_dataset=output_datasets,
                context=step.context,
                description=self.description,
                namespace_name=self.marquez_namespace)
            log.info(f'Successfully recorded job: {step.name}',
                     airflow_dag_id=self.dag_id,
                     marquez_namespace=self.marquez_namespace)

            job_run = marquez_client.create_job_run(
                step.name,
                run_args=run_args,
                nominal_start_time=start_time,
                nominal_end_time=end_time)
            # An empty or missing response is handled like a response
            # without a run id.
            marquez_jobrun_id = job_run.get('runId') if job_run else None

            if marquez_jobrun_id:
                marquez_jobrun_ids.append(marquez_jobrun_id)
                marquez_client.mark_job_run_as_started(marquez_jobrun_id)
                # Success is only reported once a run id was obtained.
                log.info(f'Successfully recorded job run: {step.name}',
                         airflow_dag_id=self.dag_id,
                         airflow_dag_execution_time=start_time,
                         marquez_run_id=marquez_jobrun_id,
                         marquez_namespace=self.marquez_namespace,
                         duration_ms=(self._now_ms() - report_job_start_ms))
            else:
                log.error(f'Failed to get run id: {step.name}',
                          airflow_dag_id=self.dag_id,
                          airflow_run_id=dag_run_id,
                          marquez_namespace=self.marquez_namespace)

        # Store the mapping for all the steps associated with a task
        try:
            self._job_id_mapping.set(
                JobIdMapping.make_key(task_name, dag_run_id),
                json.dumps(marquez_jobrun_ids))

        except Exception as e:
            log.error(f'Failed to set id mapping : {e}',
                      airflow_dag_id=self.dag_id,
                      task_id=task.task_id,
                      airflow_run_id=dag_run_id,
                      marquez_run_id=marquez_jobrun_ids,
                      marquez_namespace=self.marquez_namespace)