Code Example #1
    import os

    from airflow import models
    from airflow.configuration import conf
    from airflow.settings import STORE_SERIALIZED_DAGS
    from airflow.utils.log.file_task_handler import FileTaskHandler


    def get_task_logs(dag_id, task_id, pendulum_execution_date, session):
        if not dag_id or not task_id or not pendulum_execution_date:
            return None

        dagbag = models.DagBag(
            os.devnull,  # initialize an empty DagBag instead of parsing a DAG folder
            store_serialized_dags=STORE_SERIALIZED_DAGS,
        )
        dag = dagbag.get_dag(dag_id)
        ti = (
            session.query(models.TaskInstance)
            .filter(
                models.TaskInstance.dag_id == dag_id,
                models.TaskInstance.task_id == task_id,
                models.TaskInstance.execution_date == pendulum_execution_date,
            )
            .first()
        )
        if dag is None or ti is None:
            return None
        # Attach the task object so the handler can resolve per-task log settings.
        ti.task = dag.get_task(ti.task_id)

        file_task_handler = FileTaskHandler(
            base_log_folder=conf.get("core", "BASE_LOG_FOLDER"),
            filename_template=conf.get("core", "LOG_FILENAME_TEMPLATE"),
        )
        # read() returns a list of log strings plus a parallel list of metadata dicts.
        logs, metadatas = file_task_handler.read(ti, None, None)
        return logs, metadatas
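
A minimal usage sketch for the helper above, assuming an Airflow 1.10 deployment; provide_session supplies the SQLAlchemy session, and the DAG ID, task ID, and execution date are placeholders:

    import pendulum
    from airflow.utils.db import provide_session

    @provide_session
    def print_task_logs(session=None):
        # Placeholder identifiers; substitute a real dag_id, task_id,
        # and execution_date from your metadata database.
        execution_date = pendulum.parse("2020-01-01T00:00:00+00:00")
        result = get_task_logs("example_dag", "example_task",
                               execution_date, session)
        if result is not None:
            logs, metadatas = result
            for log in logs:
                print(log)

    print_task_logs()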
Code Example #2
    def test_python_formatting(self):
        expected_filename = \
            'dag_for_testing_filename_rendering/task_for_testing_filename_rendering/%s/42.log' \
            % DEFAULT_DATE.isoformat()

        fth = FileTaskHandler('', '{dag_id}/{task_id}/{execution_date}/{try_number}.log')
        rendered_filename = fth._render_filename(self.ti, 42)
        self.assertEqual(expected_filename, rendered_filename)
Code Example #3
    def test_jinja_rendering(self):
        expected_filename = \
            'dag_for_testing_filename_rendering/task_for_testing_filename_rendering/%s/42.log' \
            % DEFAULT_DATE.isoformat()

        fth = FileTaskHandler('', '{{ ti.dag_id }}/{{ ti.task_id }}/{{ ts }}/{{ try_number }}.log')
        rendered_filename = fth._render_filename(self.ti, 42)
        self.assertEqual(expected_filename, rendered_filename)
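
Both tests exercise FileTaskHandler._render_filename, which chooses a rendering strategy from the template string itself. A minimal sketch of that dispatch, reflecting Airflow 1.10's observable behavior rather than its literal source (a template containing '{{' is compiled as Jinja; anything else goes through str.format with dag_id, task_id, an ISO-formatted execution_date, and try_number):

    def _render_filename(self, ti, try_number):
        if self.filename_jinja_template:  # set when the template contains '{{'
            jinja_context = ti.get_template_context()
            jinja_context['try_number'] = try_number
            return self.filename_jinja_template.render(**jinja_context)
        return self.filename_template.format(
            dag_id=ti.dag_id,
            task_id=ti.task_id,
            execution_date=ti.execution_date.isoformat(),
            try_number=try_number,
        )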
Code Example #4
    # Imports used by this method (module-level in the real handler):
    import json
    import os
    import traceback
    from datetime import datetime

    from google.api_core.exceptions import BadRequest
    from google.cloud import bigquery

    from airflow import configuration
    from airflow.utils.log.file_task_handler import FileTaskHandler


    def _read(self, ti, try_number):
        """
        Read logs of given task instance and try_number from GCS.
        If failed, read the log from BigQuery.

        :param ti: task instance object
        :type ti: airflow.models.TypeInstance
        :param try_number: task instance try_number to read logs from
        :type try_number: int
        """
        # Explicitly rendering the log relative path is necessary because the
        # given task instance might be different from the one passed to
        # set_context().
        log_relative_path = self._render_filename(ti, try_number + 1)
        remote_loc = os.path.join(self.remote_base, log_relative_path)

        if self.gcs_log_exists(remote_loc):
            # If GCS remote file exists, we do not fetch logs from task instance
            # local machine even if there are errors reading remote logs, as
            # remote_log will contain error message.
            remote_log = self.gcs_read(remote_loc, return_error=True)
            log = '*** Reading remote log from {}.\n{}\n'.format(
                remote_loc, remote_log)
            return log

        # Everything above is copied from gcs_task_handler.py; the following
        # elif branch contains the changes specific to bluecore_gcs_task_handler.
        elif ti.operator == "KubernetesJobOperator":
            # If remote file is not available and the job is a Kubernetes Job,
            # use the task instance attributes to query BigQuery logs and
            # make the logs available in real time from the "Log" page of the UI.

            # TODO: PROD-19910 job_name is always None on the first try; fix this so an exception doesn't show each time
            job_name = ti.xcom_pull(task_ids=ti.task_id,
                                    key='kubernetes_job_name')
            if job_name is None:
                return "An XCOM kubernetes job_name was not found. This does not mean the job has failed. It is " \
                       "possible that a job_name has not yet been created. Try refreshing the page. "

            pod_output = BQGCSTaskHandler.get_pods(job_name)

            billing_project_id = configuration.get(
                'core', 'kubernetes_bigquery_billing_project')
            client = bigquery.Client(billing_project_id)

            # get info for the BQ table name
            data_project_name = configuration.get(
                'core', 'kubernetes_bigquery_data_project')
            start_date = ti.start_date.strftime("%Y%m%d")
            end_date = datetime.utcnow().date().strftime("%Y%m%d")

            log_lines = []
            tables = []

            for pod in pod_output['items']:
                pod_id = pod['metadata']['name']
                for container in pod['spec']['containers']:
                    container_name = container['name'].replace('-', '_')
                    bq_table_name = "{}.gke_logs.{}_*".format(
                        data_project_name, container_name)
                    tables.append((bq_table_name, pod_id))

            if not tables:
                message = "No pods were found running for this task, and the task's output has not been written to " \
                          "GCS.  If refreshing this page doesn't present a log, it's possible that this task failed " \
                          "to spawn a kubernetes job.\n\n"
                message += json.dumps(tables) + "\n\n"
                try:
                    message += FileTaskHandler._read(self, ti, try_number)
                except Exception:
                    message += "Failed to read airflow worker log!  Stack trace:\n{}".format(
                        traceback.format_exc())
                return message

            query = ("\n UNION ALL \n ".join([
                BQGCSTaskHandler.generate_query(bq_table_name=bq_table_name,
                                                pod_id=pod_id,
                                                start_date=start_date,
                                                end_date=end_date)
                for bq_table_name, pod_id in tables
            ]) + "\n ORDER BY pod_id, logName, timestamp")

            try:
                query_job = client.query(query)
                result = query_job.result()
            except BadRequest as e:
                return "BadRequest error from BigQuery. The query may be empty or the job may have " \
                       "finished. Try refreshing the page. \n {e}".format(e=e)

            log_name = ""
            for row in result:
                # Emit a header line whenever the log stream (logName) changes.
                short_log_name = row.logName.split("/")[-1]
                if short_log_name != log_name:
                    log_name = short_log_name
                    log_lines.append(
                        "LOGGING OUTPUT FROM {log_name} \n".format(
                            log_name=log_name))
                log_lines.append("{}  {}".format(
                    row.timestamp.strftime("%Y-%m-%d %H:%M:%S.%f")[:-4],
                    row.textPayload))
            return "".join(log_lines)

        # else statement taken from gcs_task_handler.py
        else:
            log = super(BQGCSTaskHandler, self)._read(ti, try_number)
            return log
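
The handler above depends on two helpers not shown here, BQGCSTaskHandler.get_pods and BQGCSTaskHandler.generate_query. A hypothetical sketch of generate_query, reconstructed only from how it is called (one SELECT per (table, pod) pair, UNION ALL'd together and ordered by pod_id, logName, timestamp); the column and label names are assumptions about the Cloud Logging BigQuery export schema:

    @staticmethod
    def generate_query(bq_table_name, pod_id, start_date, end_date):
        # Hypothetical reconstruction: pull one pod's log lines from the
        # date-sharded export tables, exposing the columns the caller reads
        # and sorts on (pod_id, logName, timestamp, textPayload).
        return (
            "SELECT timestamp, logName, textPayload, "
            "'{pod_id}' AS pod_id \n"
            "FROM `{table}` \n"
            "WHERE _TABLE_SUFFIX BETWEEN '{start}' AND '{end}' \n"
            "AND resource.labels.pod_name = '{pod_id}'"
        ).format(table=bq_table_name, start=start_date,
                 end=end_date, pod_id=pod_id)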