Esempio n. 1
0
def _init_headless(run, cloud=True):
    """Start the wandb background process and redirect this process's output.

    Creates the PTY (or, on Windows, pipe) file descriptors, spawns
    ``internal_cli.py`` as a subprocess with a JSON blob of "headless"
    arguments, waits on a local socket server for the subprocess to report
    in, and finally redirects ``sys.stdout`` / ``sys.stderr``.

    Side effects visible in this function:
      * may set ``os.environ[env.PROGRAM]``;
      * calls ``run.set_environment`` and assigns ``run.socket``;
      * registers ``_user_process_finished`` with ``atexit``;
      * rebinds the module globals ``join`` and
        ``_user_process_finished_called``.

    Args:
        run: the run object; this function uses ``set_environment``,
            ``socket``, ``dir`` and ``close_files`` on it.
        cloud: forwarded unchanged to the subprocess as the ``'cloud'``
            entry of the headless arguments.

    Raises:
        LaunchError: if ``server.listen(30)`` does not report success, or
            if it is interrupted by ``KeyboardInterrupt``.
    """
    global join
    global _user_process_finished_called

    # An already-set env.PROGRAM wins over the detected program name.
    program = util.get_program()
    if program:
        os.environ[env.PROGRAM] = os.getenv(env.PROGRAM) or program

    # Snapshot of the environment; the same dict is given to the run and to
    # the subprocess below.
    environ = dict(os.environ)
    run.set_environment(environ)

    server = wandb_socket.Server()
    run.socket = server
    # NOTE(review): presumably installs exit/exception hooks so the exit code
    # can be reported later (hooks.exit_code is written in _wandb_join) --
    # confirm against ExitHooks.
    hooks = ExitHooks()
    hooks.hook()

    if platform.system() == "Windows":
        try:
            import win32api
            # Make sure we are not ignoring CTRL_C_EVENT
            # https://docs.microsoft.com/en-us/windows/console/setconsolectrlhandler
            # https://stackoverflow.com/questions/1364173/stopping-python-using-ctrlc
            win32api.SetConsoleCtrlHandler(None, False)
        except ImportError:
            # Only reports the problem; execution continues without win32api.
            termerror(
                "Install the win32api library with `pip install pypiwin32`")

        # PTYs don't work in windows so we create these unused pipes and
        # mirror stdout to run.dir/output.log.  There should be a way to make
        # pipes work, but I haven't figured it out.  See links in compat/windows
        stdout_master_fd, stdout_slave_fd = os.pipe()
        stderr_master_fd, stderr_slave_fd = os.pipe()
    else:
        stdout_master_fd, stdout_slave_fd = io_wrap.wandb_pty(resize=False)
        stderr_master_fd, stderr_slave_fd = io_wrap.wandb_pty(resize=False)

    # Arguments for the subprocess, serialized to JSON and passed on its
    # command line.
    headless_args = {
        'command': 'headless',
        'pid': os.getpid(),
        'stdout_master_fd': stdout_master_fd,
        'stderr_master_fd': stderr_master_fd,
        'cloud': cloud,
        'port': server.port
    }
    internal_cli_path = os.path.join(os.path.dirname(__file__),
                                     'internal_cli.py')

    if six.PY2 or platform.system() == "Windows":
        # TODO(adrian): close_fds=False is bad for security. we set
        # it so we can pass the PTY FDs to the wandb process. We
        # should use subprocess32, which has pass_fds.
        popen_kwargs = {'close_fds': False}
    else:
        # Only the two master FDs are handed to the subprocess.
        popen_kwargs = {'pass_fds': [stdout_master_fd, stderr_master_fd]}

    # TODO(adrian): ensure we use *exactly* the same python interpreter
    # TODO(adrian): make wandb the foreground process so we don't give
    # up terminal control until syncing is finished.
    # https://stackoverflow.com/questions/30476971/is-the-child-process-in-foreground-or-background-on-fork-in-c
    wandb_process = subprocess.Popen(
        [sys.executable, internal_cli_path,
         json.dumps(headless_args)],
        env=environ,
        **popen_kwargs)
    termlog('Tracking run with wandb version {}'.format(__version__))
    # The subprocess has been spawned; close this process's copies of the
    # master FDs.
    os.close(stdout_master_fd)
    os.close(stderr_master_fd)
    # Listen on the socket waiting for the wandb process to be ready
    # NOTE(review): 30 is presumably a timeout in seconds -- confirm against
    # wandb_socket.Server.listen.
    try:
        success, _ = server.listen(30)
    except KeyboardInterrupt:
        # Treated as a failed launch, but without the "did not respond"
        # message below.
        success = False
    else:
        if not success:
            termerror('W&B process (PID {}) did not respond'.format(
                wandb_process.pid))
    if not success:
        wandb_process.kill()
        # Poll for up to 20 * 0.1s for the killed process to exit.
        for _ in range(20):
            time.sleep(0.1)
            if wandb_process.poll() is not None:
                break
        if wandb_process.poll() is None:
            termerror('Failed to kill wandb process, PID {}'.format(
                wandb_process.pid))
        # TODO attempt to upload a debug log
        # Show the log path relative to the current working directory when it
        # lives underneath it.
        path = GLOBAL_LOG_FNAME.replace(os.getcwd() + os.sep, "")
        # NOTE(review): stdout_slave_fd / stderr_slave_fd are still open when
        # this raises -- confirm whether they should be closed here.
        raise LaunchError("W&B process failed to launch, see: {}".format(path))

    if platform.system() == "Windows":
        # Both streams are mirrored into the same output.log file object.
        output = open(os.path.join(run.dir, "output.log"), "wb")
        stdout_redirector = io_wrap.WindowsRedirector(sys.stdout, output)
        stderr_redirector = io_wrap.WindowsRedirector(sys.stderr, output)
    else:
        stdout_slave = os.fdopen(stdout_slave_fd, 'wb')
        stderr_slave = os.fdopen(stderr_slave_fd, 'wb')
        try:
            stdout_redirector = io_wrap.FileRedirector(sys.stdout,
                                                       stdout_slave)
            stderr_redirector = io_wrap.FileRedirector(sys.stderr,
                                                       stderr_slave)
        except (ValueError, AttributeError):
            # stdout / err aren't files
            # Fall back to the same output.log mirroring used on Windows.
            output = open(os.path.join(run.dir, "output.log"), "wb")
            stdout_redirector = io_wrap.WindowsRedirector(sys.stdout, output)
            stderr_redirector = io_wrap.WindowsRedirector(sys.stderr, output)

    # TODO(adrian): we should register this right after starting the wandb process to
    # make sure we shut down the W&B process eg. if there's an exception in the code
    # above
    atexit.register(_user_process_finished, server, hooks, wandb_process,
                    stdout_redirector, stderr_redirector)

    def _wandb_join(exit_code=None):
        """Finish the run explicitly instead of waiting for interpreter exit.

        Stops the async log thread, closes the run's files, optionally
        overrides the recorded exit code, invokes ``_user_process_finished``
        with the same arguments that were registered with ``atexit``, and
        pops the top entry of ``_global_run_stack`` if there is one.
        """
        global _global_run_stack
        shutdown_async_log_thread()
        run.close_files()
        if exit_code is not None:
            hooks.exit_code = exit_code
        _user_process_finished(server, hooks, wandb_process, stdout_redirector,
                               stderr_redirector)
        if len(_global_run_stack) > 0:
            _global_run_stack.pop()

    # Publish the join function and reset the "finished" flag at module level.
    join = _wandb_join
    _user_process_finished_called = False

    # redirect output last of all so we don't miss out on error messages
    stdout_redirector.redirect()
    # stderr is left untouched when debugging is enabled.
    if not env.is_debug():
        stderr_redirector.redirect()
Esempio n. 2
0
    def setup(self):
        """Collect metadata about the program and its environment into ``self.data``.

        Records, in order:
          * ``root`` and ``program`` (with notebook / Colab handling when not
            running under plain Python);
          * code information via ``_setup_code_git`` and
            ``_setup_code_program``, unless ``env.DISABLE_CODE`` is set;
          * ``startedAt``;
          * ``host``, ``username`` and ``executable`` for non-anonymous
            users; for anonymous users ``email`` and ``root`` are removed
            instead;
          * ``os``, ``python``, and, when available, ``docker``, ``gpu``,
            ``gpu_count``, ``cpu_count`` and ``cuda``;
          * ``args`` and ``state``.

        Where ``SIGALRM`` is usable (not Windows, not Jupyter) the code probe
        is limited to 25 seconds; a timeout is logged and otherwise ignored.
        The previously installed ``SIGALRM`` handler is restored afterwards.
        """
        class TimeOutException(Exception):
            """Raised from the SIGALRM handler to abort a slow code probe."""

        def alarm_handler(signum, frame):
            raise TimeOutException()

        self.data["root"] = os.getcwd()
        # An explicitly configured program name wins over the detected one.
        program = os.getenv(env.PROGRAM) or util.get_program()
        if program:
            self.data["program"] = program
        else:
            self.data["program"] = '<python with no main file>'
            if wandb._get_python_type() != "python":
                if os.getenv(env.NOTEBOOK_NAME):
                    self.data["program"] = os.getenv(env.NOTEBOOK_NAME)
                else:
                    meta = wandb.jupyter.notebook_metadata()
                    if meta.get("path"):
                        if "fileId=" in meta["path"]:
                            # The path carries a Colab file id: record a
                            # link to the notebook and use its name.
                            self.data[
                                "colab"] = "https://colab.research.google.com/drive/" + meta[
                                    "path"].split("fileId=")[1]
                            self.data["program"] = meta["name"]
                        else:
                            self.data["program"] = meta["path"]
                            self.data["root"] = meta["root"]

        if not os.getenv(env.DISABLE_CODE):
            logger.debug("code probe starting")
            in_jupyter = wandb._get_python_type() != "python"
            # windows doesn't support alarm() and jupyter could call this in a thread context
            if platform.system() == "Windows" or not hasattr(
                    signal, 'SIGALRM') or in_jupyter:
                logger.debug("non time limited probe of code")
                self._setup_code_git()
                self._setup_code_program()
            else:
                old_alarm = None
                try:
                    try:
                        old_alarm = signal.signal(signal.SIGALRM,
                                                  alarm_handler)
                        signal.alarm(25)
                        self._setup_code_git()
                        self._setup_code_program()
                    finally:
                        # Always cancel the pending alarm, even on error.
                        signal.alarm(0)
                except TimeOutException:
                    logger.debug("timeout waiting for setup_code")
                finally:
                    # Compare against None rather than truthiness: the
                    # default disposition signal.SIG_DFL is 0 and therefore
                    # falsy, so a truthiness test would leave alarm_handler
                    # installed for the rest of the process.
                    if old_alarm is not None:
                        signal.signal(signal.SIGALRM, old_alarm)
            logger.debug("code probe done")

        self.data["startedAt"] = datetime.utcfromtimestamp(
            wandb.START_TIME).isoformat()
        try:
            username = getpass.getuser()
        except KeyError:
            # getuser() could raise KeyError in restricted environments like
            # chroot jails or docker containers.  Return user id in these cases.
            username = str(os.getuid())

        # Host names, usernames, emails, the root directory, and executable paths are sensitive for anonymous users.
        if self._api.settings().get('anonymous') != 'true':
            self.data["host"] = os.environ.get(env.HOST, socket.gethostname())
            self.data["username"] = os.getenv(env.USERNAME, username)
            self.data["executable"] = sys.executable
        else:
            self.data.pop("email", None)
            self.data.pop("root", None)

        self.data["os"] = platform.platform(aliased=True)
        self.data["python"] = platform.python_version()

        if env.get_docker():
            self.data["docker"] = env.get_docker()
        try:
            pynvml.nvmlInit()
            gpu_name = pynvml.nvmlDeviceGetName(
                pynvml.nvmlDeviceGetHandleByIndex(0))
            # Decode only when the binding returned bytes, so a binding that
            # already returns text does not fail on a missing .decode().
            if isinstance(gpu_name, bytes):
                gpu_name = gpu_name.decode("utf8")
            self.data["gpu"] = gpu_name
            self.data["gpu_count"] = pynvml.nvmlDeviceGetCount()
        except pynvml.NVMLError:
            # Best effort: GPU fields are simply omitted when NVML fails.
            pass
        try:
            self.data["cpu_count"] = multiprocessing.cpu_count()
        except NotImplementedError:
            pass
        # TODO: we should use the cuda library to collect this
        if os.path.exists("/usr/local/cuda/version.txt"):
            with open("/usr/local/cuda/version.txt") as f:
                # The version is taken as the last space-separated token.
                self.data["cuda"] = f.read().split(" ")[-1].strip()
        self.data["args"] = sys.argv[1:]
        self.data["state"] = "running"