Example #1
    async def run(self, server):
        # Connect to dashboard.
        self._stub = await self._connect_to_dashboard()
        # Start monitor task.
        self._monitor = monitor_events(
            self._event_dir,
            lambda data: create_task(self._cached_events.put(data)),
            source_types=event_consts.EVENT_AGENT_MONITOR_SOURCE_TYPES)
        # Start reporting events.
        await self.report_events()
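
The run() method above is an excerpt from a dashboard event agent: the callback passed to monitor_events enqueues each batch of event lines onto self._cached_events for report_events to drain. Below is a minimal self-contained sketch of that handoff pattern, assuming self._cached_events is an asyncio.Queue; all names are illustrative stand-ins, not the agent's API:

import asyncio

async def main():
    # Stand-in for self._cached_events (assumed to be an asyncio.Queue
    # drained by report_events).
    cached_events = asyncio.Queue()

    # monitor_events invokes the callback on the event loop (via
    # call_soon_threadsafe), so scheduling put() as a task is safe and
    # never blocks the scanner thread.
    def on_new_events(lines):
        asyncio.create_task(cached_events.put(lines))

    # Simulate the monitor delivering a batch of event strings.
    on_new_events(["event-1", "event-2"])

    batch = await cached_events.get()
    print(batch)  # ['event-1', 'event-2']

asyncio.run(main())
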
Example #2
import asyncio
import collections
import itertools
import os
import time

# logger, event_consts, create_task, async_loop_forever, _read_file and
# _get_source_files are defined in the surrounding dashboard module.

def monitor_events(
    event_dir,
    callback,
    scan_interval_seconds=event_consts.SCAN_EVENT_DIR_INTERVAL_SECONDS,
    start_mtime=time.time() + event_consts.SCAN_EVENT_START_OFFSET_SECONDS,
    monitor_files=None,
    source_types=None,
):
    """Monitor events in directory. New events will be read and passed to the
    callback.

    Args:
        event_dir (str): The event log directory.
        callback (Callable[[List[str]], None]): A callback that accepts a
            list of event strings.
        scan_interval_seconds (float): The interval in seconds between two
            scans.
        start_mtime (float): Only the event log files whose last modification
            time is greater than start_mtime are monitored.
        monitor_files (Dict[int, MonitorFile]): The map from event log file id
            to MonitorFile object. If None, all files are monitored from the
            beginning.
        source_types (List[str]): A list of source type names from
            event_pb2.Event.SourceType.keys(). If None, all source types are
            monitored.
    """
    loop = asyncio.get_event_loop()
    if monitor_files is None:
        monitor_files = {}

    logger.info(
        "Monitor events logs modified after %s on %s, "
        "the source types are %s.",
        start_mtime,
        event_dir,
        "all" if source_types is None else source_types,
    )

    MonitorFile = collections.namedtuple("MonitorFile",
                                         ["size", "mtime", "position"])

    def _source_file_filter(source_file):
        stat = os.stat(source_file)
        return stat.st_mtime > start_mtime

    def _read_monitor_file(file, pos):
        assert isinstance(
            file,
            str), f"file must be a str, but got {type(file)}({file})"
        fd = os.open(file, os.O_RDONLY)
        try:
            stat = os.stat(fd)
            # Check the file size to avoid raising the exception
            # ValueError: cannot mmap an empty file
            if stat.st_size <= 0:
                return []
            fid = stat.st_ino or file
            monitor_file = monitor_files.get(fid)
            if monitor_file:
                if (monitor_file.position == monitor_file.size
                        and monitor_file.size == stat.st_size
                        and monitor_file.mtime == stat.st_mtime):
                    logger.debug(
                        "Skip reading the file because "
                        "there is no change: %s", file)
                    return []
                position = monitor_file.position
            else:
                logger.info("Found new event log file: %s", file)
                position = pos
            # closefd=False here; the fd is closed in the finally block below.
            r = _read_file(fd, position, closefd=False)
            # It should be fine to update the dict in the executor thread.
            monitor_files[r.fid] = MonitorFile(r.size, r.mtime, r.position)
            loop.call_soon_threadsafe(callback, r.lines)
        except Exception as e:
            raise Exception(f"Read event file failed: {file}") from e
        finally:
            os.close(fd)

    @async_loop_forever(scan_interval_seconds, cancellable=True)
    async def _scan_event_log_files():
        # Scan event files.
        source_files = await loop.run_in_executor(None, _get_source_files,
                                                  event_dir, source_types,
                                                  _source_file_filter)

        # Limit concurrent reads to avoid fd exhaustion.
        semaphore = asyncio.Semaphore(event_consts.CONCURRENT_READ_LIMIT)

        async def _concurrent_coro(filename):
            async with semaphore:
                return await loop.run_in_executor(None, _read_monitor_file,
                                                  filename, 0)

        # Read files.
        await asyncio.gather(*[
            _concurrent_coro(filename)
            for filename in list(itertools.chain(*source_files.values()))
        ])

    return create_task(_scan_event_log_files())
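
A hedged usage sketch for monitor_events: the function returns the task wrapping _scan_event_log_files, and because that loop is decorated with cancellable=True, cancelling the returned task is the intended way to stop scanning. The event directory and source type name below are hypothetical placeholders:

import asyncio

def handle_events(lines):
    # Called on the event loop with a batch of newly appended event lines.
    for line in lines:
        print("new event:", line)

async def main():
    task = monitor_events(
        "/tmp/events",         # hypothetical event_dir
        handle_events,
        source_types=["GCS"])  # hypothetical source type name
    try:
        await asyncio.sleep(30)  # let the scanner run for a while
    finally:
        task.cancel()

asyncio.run(main())
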
Example #3
    async def InitializeJobEnv(self, request, context):
        # TODO(fyrestone): Handle duplicated InitializeJobEnv requests
        # when initializing job environment.
        # TODO(fyrestone): Support reinitializing the job environment.

        # TODO(fyrestone): Use job id instead of unique id.
        unique_id = secrets.token_hex(6)

        # Parse the job description from the request.
        try:
            job_description_data = json.loads(request.job_description)
            job_info = JobInfo(unique_id=unique_id,
                               temp_dir=self._dashboard_agent.temp_dir,
                               log_dir=self._dashboard_agent.log_dir,
                               **job_description_data)
        except json.JSONDecodeError as ex:
            error_message = str(ex)
            error_message += f", job_payload:\n{request.job_description}"
            logger.error("[%s] Initialize job environment failed, %s.",
                         unique_id, error_message)
            return job_agent_pb2.InitializeJobEnvReply(
                status=agent_manager_pb2.AGENT_RPC_STATUS_FAILED,
                error_message=error_message)
        except Exception as ex:
            logger.exception(ex)
            return job_agent_pb2.InitializeJobEnvReply(
                status=agent_manager_pb2.AGENT_RPC_STATUS_FAILED,
                error_message=traceback.format_exc())

        async def _initialize_job_env():
            os.makedirs(job_consts.JOB_DIR.format(temp_dir=job_info.temp_dir,
                                                  unique_id=unique_id),
                        exist_ok=True)
            # Download the job package.
            await DownloadPackage(job_info,
                                  self._dashboard_agent.http_session).run()
            # Start the driver.
            logger.info("[%s] Starting driver.", unique_id)
            language = job_info.language
            if language == job_consts.PYTHON:
                driver = await StartPythonDriver(
                    job_info, self._dashboard_agent.redis_address,
                    self._dashboard_agent.redis_password).run()
            else:
                raise Exception(f"Unsupported language type: {language}")
            job_info.driver = driver

        initialize_task = create_task(_initialize_job_env())

        try:
            await initialize_task
        except asyncio.CancelledError:
            logger.error("[%s] Initialize job environment has been cancelled.",
                         unique_id)
            return job_agent_pb2.InitializeJobEnvReply(
                status=agent_manager_pb2.AGENT_RPC_STATUS_FAILED,
                error_message="InitializeJobEnv has been cancelled, "
                "did you call CleanJobEnv?")
        except Exception as ex:
            logger.exception(ex)
            return job_agent_pb2.InitializeJobEnvReply(
                status=agent_manager_pb2.AGENT_RPC_STATUS_FAILED,
                error_message=traceback.format_exc())

        driver_pid = 0
        if job_info.driver:
            driver_pid = job_info.driver.pid

        logger.info(
            "[%s] Job environment initialized; "
            "driver (pid=%s) started.", unique_id, driver_pid)
        return job_agent_pb2.InitializeJobEnvReply(
            status=agent_manager_pb2.AGENT_RPC_STATUS_OK,
            driver_pid=driver_pid)
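
For context, a client would invoke this handler over gRPC roughly as sketched below. The generated module path, the stub and request message names, the agent address, and the job description fields are all assumptions modeled on the handler above; the source only confirms the job_description request field and the status, error_message and driver_pid reply fields:

import asyncio
import json

import grpc

# Assumed import path and generated names.
from ray.core.generated import job_agent_pb2, job_agent_pb2_grpc

async def initialize_job():
    # Assumed agent address; the real port comes from the dashboard agent.
    async with grpc.aio.insecure_channel("127.0.0.1:52365") as channel:
        stub = job_agent_pb2_grpc.JobAgentServiceStub(channel)
        reply = await stub.InitializeJobEnv(
            job_agent_pb2.InitializeJobEnvRequest(
                job_description=json.dumps({
                    "language": "PYTHON",                 # job_consts.PYTHON
                    "url": "http://example.com/job.zip",  # hypothetical field
                    "driver_entry": "my_driver",          # hypothetical field
                })))
        print(reply.status, reply.driver_pid, reply.error_message)

asyncio.run(initialize_job())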