def create_dagrun(self, *args, **kwargs):
    """Create the Airflow DagRun, then report each task of the DAG to Marquez.

    The parent class's ``create_dagrun()`` is called first with the
    arguments unchanged; exceptions it raises propagate. Exceptions raised
    while loading extractors, creating the namespace, or reporting an
    individual task are caught and logged, and the DagRun is still returned.

    Args:
        *args: Positional arguments forwarded to the parent
            ``create_dagrun()``.
        **kwargs: Keyword arguments forwarded to the parent
            ``create_dagrun()``. ``execution_date`` and
            ``external_trigger`` are also read here to describe the run.

    Returns:
        Whatever the parent ``create_dagrun()`` returned.
    """
    # Run Airflow's create_dagrun() first.
    dagrun = super(DAG, self).create_dagrun(*args, **kwargs)

    create_dag_start_ms = self._now_ms()

    # A caller may pass execution_date positionally, in which case it is
    # absent from kwargs. Fall back to the value carried by the created run
    # so tasks are not reported without nominal start/end times.
    execution_date = kwargs.get('execution_date')
    if execution_date is None:
        execution_date = getattr(dagrun, 'execution_date', None)

    run_args = {
        'external_trigger': kwargs.get('external_trigger', False)
    }

    extractors = {}
    try:
        extractors = get_extractors()
    except Exception as e:
        log.warn(f'Failed retrieve extractors: {e}',
                 airflow_dag_id=self.dag_id,
                 marquez_namespace=self.marquez_namespace)

    # Marquez metadata collection
    try:
        marquez_client = self.get_marquez_client()

        # Create the Namespace
        # TODO: Use 'anonymous' owner for now, but we may want to use
        # the 'owner' attribute defined via default_args for a DAG
        marquez_client.create_namespace(self.marquez_namespace,
                                        "anonymous")

        # Register each task in the DAG. One failing task must not stop
        # the remaining tasks from being reported.
        failed_task_ids = []
        for task_id, task in self.task_dict.items():
            task_start_ms = self._now_ms()
            try:
                self.report_task(
                    dagrun.run_id,
                    execution_date,
                    run_args,
                    task,
                    extractors.get(task.__class__.__name__))
            except Exception as e:
                failed_task_ids.append(task_id)
                log.error(f'Failed to record task: {e}',
                          airflow_dag_id=self.dag_id,
                          task_id=task_id,
                          marquez_namespace=self.marquez_namespace,
                          duration_ms=(self._now_ms() - task_start_ms))

        # Only claim success when every task was actually recorded.
        if failed_task_ids:
            log.warn('Recorded metadata with failed tasks',
                     airflow_dag_id=self.dag_id,
                     failed_task_ids=failed_task_ids,
                     marquez_namespace=self.marquez_namespace,
                     duration_ms=(self._now_ms() - create_dag_start_ms))
        else:
            log.info('Successfully recorded metadata',
                     airflow_dag_id=self.dag_id,
                     marquez_namespace=self.marquez_namespace,
                     duration_ms=(self._now_ms() - create_dag_start_ms))
    except Exception as e:
        log.error(f'Failed to record metadata: {e}',
                  airflow_dag_id=self.dag_id,
                  marquez_namespace=self.marquez_namespace,
                  duration_ms=(self._now_ms() - create_dag_start_ms))

    return dagrun
def report_jobrun(self, run_args, execution_date):
    """Record the DAG as a Marquez job and open a job run for it.

    The job is created under the name ``self.dag_id``; a job run is then
    created and, when the response carries a ``runId``, marked as running.

    Args:
        run_args: Run arguments passed through to ``create_job_run``.
        execution_date: Nominal start of the run. It is formatted with
            ``strftime``; when falsy, both nominal times are sent as
            ``None``.

    Returns:
        The ``runId`` taken from the ``create_job_run`` response, or
        ``None`` when the response had none.
    """
    started_ms = self._now_ms()
    job_name = self.dag_id
    time_format = "%Y-%m-%dT%H:%M:%SZ"

    # Use strftime for both timestamps: the pattern is a strftime pattern,
    # and plain datetime objects have no .format() method.
    if execution_date:
        start_time = execution_date.strftime(time_format)
        end_time = self.compute_endtime(execution_date)
    else:
        start_time = None
        end_time = None
    if end_time:
        end_time = end_time.strftime(time_format)

    marquez_client = self.get_marquez_client()
    marquez_client.create_job(job_name,
                              self.marquez_location,
                              self.marquez_input_urns,
                              self.marquez_output_urns,
                              description=self.description)
    log.info(f'Successfully recorded job: {job_name}',
             airflow_dag_id=self.dag_id,
             marquez_namespace=self.marquez_namespace)

    marquez_jobrun = marquez_client.create_job_run(
        job_name,
        run_args=run_args,
        nominal_start_time=start_time,
        nominal_end_time=end_time)

    # Tolerate an empty response instead of failing on `.get`.
    marquez_jobrun_id = marquez_jobrun.get('runId') if marquez_jobrun else None

    if marquez_jobrun_id:
        # NOTE(review): report_task calls mark_job_run_as_started here;
        # confirm which method the Marquez client in use actually provides.
        marquez_client.mark_job_run_as_running(marquez_jobrun_id)
        log.info(f'Successfully recorded job run: {job_name}',
                 airflow_dag_id=self.dag_id,
                 airflow_dag_execution_time=start_time,
                 marquez_run_id=marquez_jobrun_id,
                 marquez_namespace=self.marquez_namespace,
                 duration_ms=(self._now_ms() - started_ms))
    else:
        log.warn(f'Run id not found: {job_name}',
                 airflow_dag_id=self.dag_id,
                 airflow_dag_execution_time=start_time,
                 marquez_run_id=marquez_jobrun_id,
                 marquez_namespace=self.marquez_namespace,
                 duration_ms=(self._now_ms() - started_ms))

    return marquez_jobrun_id
def report_task(self, dag_run_id, execution_date, run_args, task, extractor):
    """Record one Airflow task, and each of its steps, as Marquez jobs/runs.

    For every step produced by the extractor (or a single default
    ``StepMetadata`` named ``<dag_id>.<task_id>`` when there is no
    extractor or it yields nothing), this registers the step's input and
    output datasets, creates the job, creates a job run and marks it as
    started. The collected run ids are then stored as a JSON list in
    ``self._job_id_mapping`` under a key built from the task name and
    ``dag_run_id``.

    Args:
        dag_run_id: Run id of the Airflow DagRun the task belongs to.
        execution_date: Nominal start time of the run; when falsy, the
            nominal start and end times are sent as ``None``.
        run_args: Run arguments passed through to ``create_job_run``.
        task: The task to report; ``task_id`` is read, plus ``file_path``
            or ``dag.fileloc`` for its location.
        extractor: Callable invoked as ``extractor(task).extract()`` to
            produce the step metadata, or a falsy value when none exists.

    Raises:
        Exception: Errors from the Marquez client calls (``create_job``,
            ``create_job_run``, ``mark_job_run_as_started``) are not caught
            here and propagate to the caller.
    """
    report_job_start_ms = self._now_ms()
    marquez_client = self.get_marquez_client()

    if execution_date:
        start_time = self._to_iso_8601(execution_date)
        end_time = self.compute_endtime(execution_date)
    else:
        start_time = None
        end_time = None
    if end_time:
        end_time = self._to_iso_8601(end_time)

    task_location = self._resolve_task_location(task)
    steps_metadata = self._extract_steps(dag_run_id, task, extractor)

    task_name = f'{self.dag_id}.{task.task_id}'

    # If no extractor found or failed to extract metadata,
    # report the task metadata
    if not steps_metadata:
        steps_metadata = [StepMetadata(task_name)]

    # store all the JobRuns associated with a task
    marquez_jobrun_ids = []
    for step in steps_metadata:
        input_datasets = self._register_step_datasets(
            'inputs', dag_run_id, task, step)
        output_datasets = self._register_step_datasets(
            'outputs', dag_run_id, task, step)

        marquez_client.create_job(
            job_name=step.name,
            job_type='BATCH',  # job type
            location=(step.location or task_location),
            input_dataset=input_datasets,
            output_dataset=output_datasets,
            context=step.context,
            description=self.description,
            namespace_name=self.marquez_namespace)
        log.info(f'Successfully recorded job: {step.name}',
                 airflow_dag_id=self.dag_id,
                 marquez_namespace=self.marquez_namespace)

        job_run = marquez_client.create_job_run(
            step.name,
            run_args=run_args,
            nominal_start_time=start_time,
            nominal_end_time=end_time)
        # Tolerate an empty response instead of failing on `.get`.
        marquez_jobrun_id = job_run.get('runId') if job_run else None

        if marquez_jobrun_id:
            marquez_jobrun_ids.append(marquez_jobrun_id)
            marquez_client.mark_job_run_as_started(marquez_jobrun_id)
            # Success is only reported once a run id exists and the run
            # has been marked as started.
            log.info(f'Successfully recorded job run: {step.name}',
                     airflow_dag_id=self.dag_id,
                     airflow_dag_execution_time=start_time,
                     marquez_run_id=marquez_jobrun_id,
                     marquez_namespace=self.marquez_namespace,
                     duration_ms=(self._now_ms() - report_job_start_ms))
        else:
            log.error(f'Failed to get run id: {step.name}',
                      airflow_dag_id=self.dag_id,
                      airflow_run_id=dag_run_id,
                      marquez_namespace=self.marquez_namespace)

    # Store the mapping for all the steps associated with a task
    try:
        self._job_id_mapping.set(
            JobIdMapping.make_key(task_name, dag_run_id),
            json.dumps(marquez_jobrun_ids))
    except Exception as e:
        log.error(f'Failed to set id mapping : {e}',
                  airflow_dag_id=self.dag_id,
                  task_id=task.task_id,
                  airflow_run_id=dag_run_id,
                  marquez_run_id=marquez_jobrun_ids,
                  marquez_namespace=self.marquez_namespace)


def _resolve_task_location(self, task):
    """Return the location of the task's source file, or None on failure."""
    try:
        file_path = getattr(task, 'file_path', None)
        if file_path:
            return get_location(file_path)
        return get_location(task.dag.fileloc)
    except Exception as e:
        # Best effort: a missing location must not block reporting, but the
        # cause is logged so the failure can be diagnosed.
        log.warn('Unable to fetch the location',
                 error=str(e),
                 airflow_dag_id=self.dag_id,
                 task_id=task.task_id)
        return None


def _extract_steps(self, dag_run_id, task, extractor):
    """Run the extractor on the task; return its steps, or [] on failure."""
    if not extractor:
        log.warn('Unable to find an extractor.',
                 task_type=task.__class__.__name__,
                 airflow_dag_id=self.dag_id,
                 task_id=task.task_id,
                 airflow_run_id=dag_run_id,
                 marquez_namespace=self.marquez_namespace)
        return []

    try:
        log.info(f'Using extractor {extractor.__name__}',
                 task_type=task.__class__.__name__,
                 airflow_dag_id=self.dag_id,
                 task_id=task.task_id,
                 airflow_run_id=dag_run_id,
                 marquez_namespace=self.marquez_namespace)
        return extractor(task).extract()
    except Exception as e:
        log.error(f'Failed to extract metadata {e}',
                  airflow_dag_id=self.dag_id,
                  task_id=task.task_id,
                  airflow_run_id=dag_run_id,
                  marquez_namespace=self.marquez_namespace)
        return []


def _register_step_datasets(self, kind, dag_run_id, task, step):
    """Register the step's 'inputs' or 'outputs'; return [] on failure."""
    try:
        return self.register_datasets(getattr(step, kind))
    except Exception as e:
        # `kind` doubles as the log field name ('inputs' / 'outputs').
        log.error(f'Failed to register {kind}: {e}',
                  **{kind: str(getattr(step, kind))},
                  airflow_dag_id=self.dag_id,
                  task_id=task.task_id,
                  step=step.name,
                  airflow_run_id=dag_run_id,
                  marquez_namespace=self.marquez_namespace)
        return []