Example #1
 def __init__(self, pid=None, api=None, process_q=None, notify_q=None):
     try:
         pynvml.nvmlInit()
         self.gpu_count = pynvml.nvmlDeviceGetCount()
     except pynvml.NVMLError:
         self.gpu_count = 0
     #self.run = run
     self._pid = pid
     self._api = api
     self._interface = interface.BackendSender(
         process_queue=process_q,
         notify_queue=notify_q,
     )
     self.sampler = {}
     self.samples = 0
     self._shutdown = False
     if psutil:
         net = psutil.net_io_counters()
         self.network_init = {
             "sent": net.bytes_sent,
             "recv": net.bytes_recv
         }
     else:
         wandb.termlog(
             "psutil not installed, only GPU stats will be reported.  Install with pip install psutil"
         )
     self._thread = threading.Thread(target=self._thread_body)
     self._thread.daemon = True
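
A minimal driver for this stats monitor, as a sketch. The class name SystemStats, the direct use of the private _thread and _shutdown fields, and the shutdown flow are assumptions inferred from the constructor above; only the queue wiring mirrors the example itself.

    import multiprocessing
    import os

    process_q = multiprocessing.Queue()
    notify_q = multiprocessing.Queue()
    stats = SystemStats(pid=os.getpid(), api=None,
                        process_q=process_q, notify_q=notify_q)
    stats._thread.start()    # daemon thread created in __init__ above
    # ... run the workload being monitored ...
    stats._shutdown = True   # assumed exit signal checked by _thread_body
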
Example #2
 def __init__(self, settings=None, process_q=None, notify_q=None):
     self._settings = settings
     self.fname = os.path.join(self._settings.files_dir, METADATA_FNAME)
     self.data = {}
     self._interface = interface.BackendSender(
         process_queue=process_q,
         notify_queue=notify_q,
     )
Example #3
 def ensure_launched(self, *args, **kwargs):
     print("Fake Backend Launched")
     wandb_process = ProcessMock()
     self.interface = interface.BackendSender(
         process=wandb_process,
         record_q=self.record_q,
         result_q=self.result_q,
     )
     self.interface._communicate = self._communicate
     self.interface._orig_publish = self.interface._publish
     self.interface._publish = self._publish
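
The three assignments above swap the sender's private hooks for test doubles while keeping the original _publish around. A sketch of what such a fake _publish might do (the signature and the published_records list are assumptions):

    def _publish(self, record, local=None):
        # Capture records in memory instead of enqueueing them, so tests
        # can assert on exactly what the client tried to send.
        self.published_records.append(record)
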
Example #4
 def __init__(self, settings=None, process_q=None, notify_q=None):
     self._settings = settings
     self.data = {}
     self.fname = os.path.join(self._settings.files_dir, METADATA_FNAME)
     self._interface = interface.BackendSender(
         process_queue=process_q,
         notify_queue=notify_q,
     )
     self._git = git_repo.GitRepo(
         remote=self._settings["git_remote"] if "git_remote" in
         self._settings.keys() else "origin")
Example #5
 def ensure_launched(self, *args, **kwargs):
     print("Fake Backend Launched")
     wandb_process = ProcessMock()
     self.interface = interface.BackendSender(
         process=wandb_process,
         notify_queue=self.notify_queue,
         process_queue=self.process_queue,
         request_queue=self.req_queue,
         response_queue=self.resp_queue,
     )
     self.interface._request_response = self._request_response
     self.interface._orig_queue_process = self.interface._queue_process
     self.interface._queue_process = self._queue_process
Example #6
 def __init__(self, settings=None, process_q=None, notify_q=None):
     self._settings = settings
     self.data = {}
     self.fname = os.path.join(self._settings.files_dir, METADATA_FNAME)
     self._interface = interface.BackendSender(
         process_queue=process_q, notify_queue=notify_q,
     )
     self._git = git_repo.GitRepo(
         remote=self._settings["git_remote"]
         if "git_remote" in self._settings.keys()
         else "origin"
     )
     # Location under "code" directory in files where program was saved.
     self._saved_program = None
     # Locations under files directory where diff patches were saved.
     self._saved_patches = []
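
This constructor sets up a run-metadata probe: fname is presumably where the collected data dict eventually lands. A sketch of that final write, assuming a JSON on-disk format (the method name is hypothetical):

    import json

    def _write_metadata(self):
        # Persist everything probed into self.data to the metadata file.
        with open(self.fname, "w") as f:
            json.dump(self.data, f, indent=4)
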
Example #7
    def __init__(self, settings, process_q, notify_q, resp_q, run_meta=None):
        self._settings = settings
        self._resp_q = resp_q
        self._run_meta = run_meta

        self._fs = None
        self._pusher = None
        self._dir_watcher = None
        self._tb_watcher = None

        # State updated by login
        self._entity = None
        self._flags = None

        # State updated by wandb.init
        self._run = None
        self._project = None

        # State updated by resuming
        self._offsets = {
            "step": 0,
            "history": 0,
            "events": 0,
            "output": 0,
            "runtime": 0,
        }

        self._api = internal_api.Api(default_settings=settings)
        self._api_settings = dict()

        # TODO(jhr): do something better, why do we need to send full lines?
        self._partial_output = dict()

        self._interface = interface.BackendSender(
            process_queue=process_q,
            notify_queue=notify_q,
        )

        self._exit_code = 0

        # keep track of config and summary from key/val updates
        # self._consolidated_config = dict()
        self._consolidated_summary = dict()
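
Per the closing comment, _consolidated_summary accumulates key/value updates across records. A sketch of that merge (the method name is an assumption):

    def _update_summary(self, updates):
        # Fold incoming summary key/values into the consolidated view,
        # so later reads see the latest value for each key.
        self._consolidated_summary.update(updates)
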
Example #8
    def run(self):
        for sync_item in self._sync_list:
            if os.path.isdir(sync_item):
                files = os.listdir(sync_item)
                filtered_files = list(
                    filter(lambda f: f.endswith(WANDB_SUFFIX), files))
                if check_and_warn_old(files) or len(filtered_files) != 1:
                    print("Skipping directory: {}".format(sync_item))
                    continue
                sync_item = os.path.join(sync_item, filtered_files[0])
            dirname = os.path.dirname(sync_item)
            files_dir = os.path.join(dirname, "files")
            sd = dict(
                files_dir=files_dir,
                _start_time=0,
                git_remote=None,
                resume=None,
                program=None,
                ignore_globs=(),
                run_id=None,
                entity=None,
                project=None,
                run_group=None,
                job_type=None,
                run_tags=None,
                run_name=None,
                run_notes=None,
                save_code=None,
            )
            settings = settings_static.SettingsStatic(sd)
            record_q = queue.Queue()
            result_q = queue.Queue()
            publish_interface = interface.BackendSender(record_q=record_q)
            sm = sender.SendManager(
                settings=settings,
                record_q=record_q,
                result_q=result_q,
                interface=publish_interface,
            )
            ds = datastore.DataStore()
            ds.open_for_scan(sync_item)

            # save exit for final send
            exit_pb = None
            shown = False

            while True:
                data = ds.scan_data()
                if data is None:
                    break
                pb = wandb_internal_pb2.Record()
                pb.ParseFromString(data)
                record_type = pb.WhichOneof("record_type")
                if self._view:
                    if self._verbose:
                        print("Record:", pb)
                    else:
                        print("Record:", record_type)
                    continue
                if record_type == "run":
                    if self._run_id:
                        pb.run.run_id = self._run_id
                    if self._project:
                        pb.run.project = self._project
                    if self._entity:
                        pb.run.entity = self._entity
                    pb.control.req_resp = True
                elif record_type == "exit":
                    exit_pb = pb
                    continue
                elif record_type == "final":
                    assert exit_pb, "final seen without exit"
                    pb = exit_pb
                    exit_pb = None
                sm.send(pb)
                # send any records that were added in previous send
                while not record_q.empty():
                    data = record_q.get(block=True)
                    sm.send(data)

                if pb.control.req_resp:
                    result = result_q.get(block=True)
                    result_type = result.WhichOneof("result_type")
                    if not shown and result_type == "run_result":
                        r = result.run_result.run
                        # TODO(jhr): hardcode until we have settings in sync
                        url = "{}/{}/{}/runs/{}".format(
                            self._app_url,
                            url_quote(r.entity),
                            url_quote(r.project),
                            url_quote(r.run_id),
                        )
                        print("Syncing: %s ..." % url, end="")
                        sys.stdout.flush()
                        shown = True
            sm.finish()
            if self._mark_synced:
                synced_file = "{}{}".format(sync_item, SYNCED_SUFFIX)
                with open(synced_file, "w"):
                    pass
            print("done.")
Example #9
    def ensure_launched(
        self,
        settings=None,
        log_level=None,
        stdout_fd=None,
        stderr_fd=None,
        use_redirect=None,
    ):
        """Launch backend worker if not running."""
        log_level = log_level or logging.DEBUG
        settings = settings or {}
        settings = dict(settings)

        # os.set_inheritable(stdout_fd, True)
        # os.set_inheritable(stderr_fd, True)
        # stdout_read_file = os.fdopen(stdout_fd, 'rb')
        # stderr_read_file = os.fdopen(stderr_fd, 'rb')

        fd_pipe_child, fd_pipe_parent = self._wl._multiprocessing.Pipe()

        process_queue = self._wl._multiprocessing.Queue()
        # async_queue = self._wl._multiprocessing.Queue()
        # fd_request_queue = self._wl._multiprocessing.Queue()
        # fd_response_queue = self._wl._multiprocessing.Queue()
        # TODO: should this be one item just to make sure it stays fully synchronous?
        req_queue = self._wl._multiprocessing.Queue()
        resp_queue = self._wl._multiprocessing.Queue()
        cancel_queue = self._wl._multiprocessing.Queue()
        notify_queue = self._wl._multiprocessing.Queue()

        wandb_process = self._wl._multiprocessing.Process(
            target=wandb_internal,
            args=(
                settings,
                notify_queue,
                process_queue,
                req_queue,
                resp_queue,
                cancel_queue,
                fd_pipe_child,
                log_level,
                use_redirect,
            ),
        )
        wandb_process.name = "wandb_internal"

        # Support running code without a: __name__ == "__main__"
        save_mod_name = None
        save_mod_path = None
        main_module = sys.modules["__main__"]
        main_mod_spec = getattr(main_module, "__spec__", None)
        main_mod_path = getattr(main_module, "__file__", None)
        main_mod_name = None
        if main_mod_spec:
            main_mod_name = getattr(main_mod_spec, "name", None)
        if main_mod_name is not None:
            save_mod_name = main_mod_name
            main_module.__spec__.name = "wandb.internal.mpmain"
        elif main_mod_path is not None:
            save_mod_path = main_module.__file__
            fname = os.path.join(os.path.dirname(wandb.__file__), "internal",
                                 "mpmain", "__main__.py")
            main_module.__file__ = fname

        # Start the process with __name__ == "__main__" workarounds
        wandb_process.start()

        if use_redirect:
            pass
        else:
            if platform.system() == "Windows":
                # https://bugs.python.org/issue38188
                # import msvcrt
                # print("DEBUG1: {}".format(stdout_fd))
                # stdout_fd = msvcrt.get_osfhandle(stdout_fd)
                # print("DEBUG2: {}".format(stdout_fd))
                # stderr_fd = msvcrt.get_osfhandle(stderr_fd)
                # multiprocessing.reduction.send_handle(fd_pipe_parent,
                #   stdout_fd,  wandb_process.pid)
                # multiprocessing.reduction.send_handle(fd_pipe_parent,
                #   stderr_fd,  wandb_process.pid)

                # should we do this?
                # os.close(stdout_fd)
                # os.close(stderr_fd)
                pass
            else:
                multiprocessing.reduction.send_handle(fd_pipe_parent,
                                                      stdout_fd,
                                                      wandb_process.pid)
                multiprocessing.reduction.send_handle(fd_pipe_parent,
                                                      stderr_fd,
                                                      wandb_process.pid)

                # should we do this?
                os.close(stdout_fd)
                os.close(stderr_fd)

        # Undo temporary changes from: __name__ == "__main__"
        if save_mod_name:
            main_module.__spec__.name = save_mod_name
        elif save_mod_path:
            main_module.__file__ = save_mod_path

        self.fd_pipe_parent = fd_pipe_parent

        self.wandb_process = wandb_process

        self.process_queue = process_queue
        # self.async_queue = async_queue
        # self.fd_request_queue = fd_request_queue
        # self.fd_response_queue = fd_response_queue
        self.req_queue = req_queue
        self.resp_queue = resp_queue
        self.cancel_queue = cancel_queue
        self.notify_queue = notify_queue

        self.interface = interface.BackendSender(
            process=wandb_process,
            notify_queue=notify_queue,
            process_queue=process_queue,
            request_queue=req_queue,
            response_queue=resp_queue,
        )
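
For context, the child-side counterpart of the send_handle calls above would look roughly like this (a sketch mirroring the commented-out fdopen lines, not the actual wandb_internal code):

    import os
    from multiprocessing import reduction

    # Receive the file descriptors the parent forwarded over the pipe.
    stdout_fd = reduction.recv_handle(fd_pipe_child)
    stderr_fd = reduction.recv_handle(fd_pipe_child)
    stdout_read_file = os.fdopen(stdout_fd, "rb")
    stderr_read_file = os.fdopen(stderr_fd, "rb")
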
Example #10
    def setup(self):
        log_level = logging.DEBUG
        start_time = time.time()
        start_datetime = datetime.datetime.now()
        timespec = datetime.datetime.strftime(start_datetime, "%Y%m%d_%H%M%S")

        wandb_dir = "wandb"
        run_path = "run-{}-server".format(timespec)
        run_dir = os.path.join(wandb_dir, run_path)
        files_dir = os.path.join(run_dir, "files")
        sync_file = os.path.join(run_dir, "run-{}.wandb".format(start_time))
        os.makedirs(files_dir)
        settings = dict(
            log_internal=os.path.join(run_dir, "internal.log"),
            files_dir=files_dir,
            _start_time=start_time,
            _start_datetime=start_datetime,
            disable_code=None,
            code_program=None,
            save_code=None,
            sync_file=sync_file,
            _internal_queue_timeout=20,
            _internal_check_process=0,
            _disable_meta=True,
            _disable_stats=False,
            git_remote=None,
            program=None,
            resume=None,
            ignore_globs=(),
            offline=None,
            _log_level=log_level,
            run_id=None,
            entity=None,
            project=None,
            run_group=None,
            run_job_type=None,
            run_tags=None,
            run_name=None,
            run_notes=None,
            _jupyter=None,
            _kaggle=None,
            _offline=None,
        )

        mp = multiprocessing
        fd_pipe_child, fd_pipe_parent = mp.Pipe()

        record_q = mp.Queue()
        # TODO: should this be one item just to make sure it stays fully synchronous?
        result_q = mp.Queue()

        wandb_process = mp.Process(
            target=wandb_internal,
            kwargs=dict(
                settings=settings,
                record_q=record_q,
                result_q=result_q,
            ),
        )
        wandb_process.name = "wandb_internal"
        wandb_process.start()

        self.record_q = record_q
        self.result_q = result_q
        self.wandb_process = wandb_process

        self._interface = interface.BackendSender(
            record_q=record_q,
            result_q=result_q,
            process=wandb_process,
        )
Example #11
    def setup(self):
        log_level = logging.DEBUG
        start_time = time.time()
        start_datetime = datetime.datetime.now()
        timespec = datetime.datetime.strftime(start_datetime, "%Y%m%d_%H%M%S")

        wandb_dir = "wandb"
        run_path = "run-{}-server".format(timespec)
        run_dir = os.path.join(wandb_dir, run_path)
        files_dir = os.path.join(run_dir, "files")
        sync_file = os.path.join(run_dir, "run-{}.wandb".format(start_time))
        os.makedirs(files_dir)
        settings = dict(
            log_internal=os.path.join(run_dir, "internal.log"),
            files_dir=files_dir,
            _start_time=start_time,
            _start_datetime=start_datetime,
            disable_code=None,
            code_program=None,
            save_code=None,
            sync_file=sync_file,
            _internal_queue_timeout=20,
            _internal_check_process=0,
            _disable_meta=True,
            _disable_stats=False,
            git_remote=None,
            program=None,
            resume=None,
            ignore_globs=(),
        )

        mp = multiprocessing
        fd_pipe_child, fd_pipe_parent = mp.Pipe()

        process_queue = mp.Queue()
        # TODO: should this be one item just to make sure it stays fully synchronous?
        req_queue = mp.Queue()
        resp_queue = mp.Queue()
        cancel_queue = mp.Queue()
        notify_queue = mp.Queue()
        use_redirect = True

        wandb_process = mp.Process(
            target=wandb_internal,
            args=(
                settings,
                notify_queue,
                process_queue,
                req_queue,
                resp_queue,
                cancel_queue,
                fd_pipe_child,
                log_level,
                use_redirect,
            ),
        )
        wandb_process.name = "wandb_internal"
        wandb_process.start()

        self.wandb_process = wandb_process
        self.notify_queue = notify_queue

        self._interface = interface.BackendSender(
            process_queue=process_queue,
            notify_queue=notify_queue,
            request_queue=req_queue,
            response_queue=resp_queue,
            process=wandb_process,
        )
Example #12
    def ensure_launched(
        self,
        settings=None,
        log_level=None,
        stdout_fd=None,
        stderr_fd=None,
        use_redirect=None,
    ):
        """Launch backend worker if not running."""
        settings = dict(settings or ())
        settings["_log_level"] = log_level or logging.DEBUG

        # TODO: this is brittle and should likely be handled directly on the
        # settings object.  Multi-processing blows up when it can't pickle
        # objects.
        if "_early_logger" in settings:
            del settings["_early_logger"]

        self.record_q = self._wl._multiprocessing.Queue()
        self.result_q = self._wl._multiprocessing.Queue()
        self.wandb_process = self._wl._multiprocessing.Process(
            target=wandb_internal,
            kwargs=dict(
                settings=settings,
                record_q=self.record_q,
                result_q=self.result_q,
            ),
        )
        self.wandb_process.name = "wandb_internal"

        # Support running code without a: __name__ == "__main__"
        save_mod_name = None
        save_mod_path = None
        main_module = sys.modules["__main__"]
        main_mod_spec = getattr(main_module, "__spec__", None)
        main_mod_path = getattr(main_module, "__file__", None)
        main_mod_name = None
        if main_mod_spec:
            main_mod_name = getattr(main_mod_spec, "name", None)
        if main_mod_name is not None:
            save_mod_name = main_mod_name
            main_module.__spec__.name = "wandb.internal.mpmain"
        elif main_mod_path is not None:
            save_mod_path = main_module.__file__
            fname = os.path.join(os.path.dirname(wandb.__file__), "internal",
                                 "mpmain", "__main__.py")
            main_module.__file__ = fname

        # Start the process with __name__ == "__main__" workarounds
        self.wandb_process.start()
        self._internal_pid = self.wandb_process.pid

        # Undo temporary changes from: __name__ == "__main__"
        if save_mod_name:
            main_module.__spec__.name = save_mod_name
        elif save_mod_path:
            main_module.__file__ = save_mod_path

        self.interface = interface.BackendSender(
            process=self.wandb_process,
            record_q=self.record_q,
            result_q=self.result_q,
        )
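
A hypothetical caller for this launcher. Everything except ensure_launched's own signature is an assumption, including the Backend class name and the shutdown sequencing:

    import logging

    backend = Backend()
    backend.ensure_launched(settings={"files_dir": "files"},
                            log_level=logging.INFO)
    # backend.interface is now a BackendSender wired to the spawned process
    backend.wandb_process.join()   # after shutdown has been requested elsewhere
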
Example #13
def wandb_internal(settings, record_q, result_q):
    """Internal process function entrypoint.

    Read from record queue and dispatch work to various threads.

    Args:
        settings: dictionary of configuration parameters.
        record_q: records to be handled
        result_q: for sending results back

    """
    # mark this process as internal
    wandb._IS_INTERNAL_PROCESS = True

    # Lets make sure we dont modify settings so use a static object
    settings = settings_static.SettingsStatic(settings)
    if settings.log_internal:
        configure_logging(settings.log_internal, settings._log_level)

    parent_pid = os.getppid()
    pid = os.getpid()

    logger.info("W&B internal server running at pid: %s", pid)

    publish_interface = interface.BackendSender(record_q=record_q)

    stopped = threading.Event()
    threads = []

    send_record_q = queue.Queue()
    record_sender_thread = SenderThread(
        settings=settings,
        record_q=send_record_q,
        result_q=result_q,
        stopped=stopped,
        interface=publish_interface,
    )
    threads.append(record_sender_thread)

    write_record_q = queue.Queue()
    record_writer_thread = WriterThread(
        settings=settings,
        record_q=write_record_q,
        result_q=result_q,
        stopped=stopped,
        writer_q=write_record_q,
    )
    threads.append(record_writer_thread)

    record_handler_thread = HandlerThread(
        settings=settings,
        record_q=record_q,
        result_q=result_q,
        stopped=stopped,
        sender_q=send_record_q,
        writer_q=write_record_q,
        interface=publish_interface,
    )
    threads.append(record_handler_thread)

    process_check = ProcessCheck(settings=settings, pid=parent_pid)

    for thread in threads:
        thread.start()

    interrupt_count = 0
    while not stopped.is_set():
        try:
            # wait for stop event
            while not stopped.is_set():
                time.sleep(1)
                if process_check.is_dead():
                    logger.error("Internal process shutdown.")
                    stopped.set()
        except KeyboardInterrupt:
            interrupt_count += 1
            logger.warning(
                "Internal process interrupt: {}".format(interrupt_count))
        finally:
            if interrupt_count >= 2:
                logger.error("Internal process interrupted.")
                stopped.set()

    for thread in threads:
        thread.join()

    for thread in threads:
        exc_info = thread.get_exception()
        if exc_info:
            logger.error("Thread {}:".format(thread.name), exc_info=exc_info)
            print("Thread {}:".format(thread.name), file=sys.stderr)
            traceback.print_exception(*exc_info)
            sentry_exc(exc_info, delay=True)
            sys.exit(-1)
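
A minimal harness for this entrypoint, mirroring the kwargs-based launch in Examples #10 and #12. The exact settings keys SettingsStatic and the worker threads require are an assumption; Example #10 shows a fuller dict:

    import logging
    import multiprocessing as mp

    record_q = mp.Queue()
    result_q = mp.Queue()
    proc = mp.Process(
        target=wandb_internal,
        kwargs=dict(
            settings=dict(files_dir="files", log_internal=None,
                          _log_level=logging.DEBUG,
                          _disable_stats=True, _disable_meta=True),
            record_q=record_q,
            result_q=result_q,
        ),
    )
    proc.name = "wandb_internal"
    proc.start()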