def __init__(self, pid=None, api=None, process_q=None, notify_q=None):
    """Prepare stat-sampling state; the sampling thread is created, not started.

    Args:
        pid: Stored as ``self._pid``.
        api: Stored as ``self._api``.
        process_q: Passed to ``interface.BackendSender`` as ``process_queue``.
        notify_q: Passed to ``interface.BackendSender`` as ``notify_queue``.
    """
    # Report zero GPUs when NVML initialisation or the device query fails.
    try:
        pynvml.nvmlInit()
        detected_gpus = pynvml.nvmlDeviceGetCount()
    except pynvml.NVMLError:
        detected_gpus = 0
    self.gpu_count = detected_gpus

    self._pid = pid
    self._api = api
    sender = interface.BackendSender(
        process_queue=process_q,
        notify_queue=notify_q,
    )
    self._interface = sender

    self.sampler = dict()
    self.samples = 0
    self._shutdown = False

    if not psutil:
        wandb.termlog(
            "psutil not installed, only GPU stats will be reported. Install with pip install psutil"
        )
    else:
        # Remember the network byte counters observed at construction time.
        counters = psutil.net_io_counters()
        self.network_init = {
            "sent": counters.bytes_sent,
            "recv": counters.bytes_recv,
        }

    worker = threading.Thread(target=self._thread_body)
    self._thread = worker
    worker.daemon = True
def __init__(self, settings=None, process_q=None, notify_q=None):
    """Record where the metadata file lives and create the backend sender.

    Args:
        settings: Object exposing ``files_dir``; kept as ``self._settings``.
        process_q: Passed to ``interface.BackendSender`` as ``process_queue``.
        notify_q: Passed to ``interface.BackendSender`` as ``notify_queue``.
    """
    self._settings = settings
    # The metadata file is placed directly inside the run's files directory.
    metadata_path = os.path.join(self._settings.files_dir, METADATA_FNAME)
    self.fname = metadata_path
    self.data = dict()
    sender = interface.BackendSender(
        process_queue=process_q,
        notify_queue=notify_q,
    )
    self._interface = sender
def ensure_launched(self, *args, **kwargs):
    """Install a fake backend: a mocked process plus patched sender hooks.

    Positional and keyword arguments are accepted and ignored.
    """
    print("Fake Backend Launched")
    fake_process = ProcessMock()
    sender = interface.BackendSender(
        process=fake_process,
        record_q=self.record_q,
        result_q=self.result_q,
    )
    self.interface = sender

    # Route communication through this object's own implementations, keeping
    # the sender's original publish method reachable as ``_orig_publish``.
    self.interface._communicate = self._communicate
    original_publish = self.interface._publish
    self.interface._orig_publish = original_publish
    self.interface._publish = self._publish
def __init__(self, settings=None, process_q=None, notify_q=None):
    """Set up metadata storage, the backend sender and the git repo wrapper.

    Args:
        settings: Object exposing ``files_dir``, ``keys()`` and item access;
            kept as ``self._settings``.
        process_q: Passed to ``interface.BackendSender`` as ``process_queue``.
        notify_q: Passed to ``interface.BackendSender`` as ``notify_queue``.
    """
    self._settings = settings
    self.data = dict()
    metadata_path = os.path.join(self._settings.files_dir, METADATA_FNAME)
    self.fname = metadata_path

    sender = interface.BackendSender(
        process_queue=process_q,
        notify_queue=notify_q,
    )
    self._interface = sender

    # Use the configured remote when the setting exists, otherwise "origin".
    if "git_remote" in self._settings.keys():
        remote_name = self._settings["git_remote"]
    else:
        remote_name = "origin"
    self._git = git_repo.GitRepo(remote=remote_name)
def ensure_launched(self, *args, **kwargs):
    """Install a fake backend: a mocked process plus patched sender hooks.

    Positional and keyword arguments are accepted and ignored.
    """
    print("Fake Backend Launched")
    fake_process = ProcessMock()
    sender = interface.BackendSender(
        process=fake_process,
        notify_queue=self.notify_queue,
        process_queue=self.process_queue,
        request_queue=self.req_queue,
        response_queue=self.resp_queue,
    )
    self.interface = sender

    # Route request/response and queueing through this object's own
    # implementations, keeping the sender's original queueing method
    # reachable as ``_orig_queue_process``.
    self.interface._request_response = self._request_response
    original_queue_process = self.interface._queue_process
    self.interface._orig_queue_process = original_queue_process
    self.interface._queue_process = self._queue_process
def __init__(self, settings=None, process_q=None, notify_q=None):
    """Set up metadata storage, the backend sender, git access and save state.

    Args:
        settings: Object exposing ``files_dir``, ``keys()`` and item access;
            kept as ``self._settings``.
        process_q: Passed to ``interface.BackendSender`` as ``process_queue``.
        notify_q: Passed to ``interface.BackendSender`` as ``notify_queue``.
    """
    self._settings = settings
    self.data = dict()
    metadata_path = os.path.join(self._settings.files_dir, METADATA_FNAME)
    self.fname = metadata_path

    sender = interface.BackendSender(
        process_queue=process_q,
        notify_queue=notify_q,
    )
    self._interface = sender

    # Use the configured remote when the setting exists, otherwise "origin".
    if "git_remote" in self._settings.keys():
        remote_name = self._settings["git_remote"]
    else:
        remote_name = "origin"
    self._git = git_repo.GitRepo(remote=remote_name)

    # Location under "code" directory in files where program was saved.
    self._saved_program = None
    # Locations under files directory where diff patches were saved.
    self._saved_patches = list()
def __init__(self, settings, process_q, notify_q, resp_q, run_meta=None):
    """Initialise sender state; every helper and identity field starts empty.

    Args:
        settings: Kept as ``self._settings`` and used as the default settings
            of the internal API client.
        process_q: Passed to ``interface.BackendSender`` as ``process_queue``.
        notify_q: Passed to ``interface.BackendSender`` as ``notify_queue``.
        resp_q: Kept as ``self._resp_q``.
        run_meta: Kept as ``self._run_meta``.
    """
    self._settings = settings
    self._resp_q = resp_q
    self._run_meta = run_meta

    # Helper objects are not created here.
    self._fs = self._pusher = self._dir_watcher = self._tb_watcher = None

    # State updated by login
    self._entity = self._flags = None

    # State updated by wandb.init
    self._run = self._project = None

    # State updated by resuming
    self._offsets = dict.fromkeys(
        ("step", "history", "events", "output", "runtime"), 0
    )

    self._api = internal_api.Api(default_settings=settings)
    self._api_settings = {}

    # TODO(jhr): do something better, why do we need to send full lines?
    self._partial_output = {}

    self._interface = interface.BackendSender(
        process_queue=process_q,
        notify_queue=notify_q,
    )

    self._exit_code = 0

    # keep track of config and summary from key/val updates
    # self._consolidated_config = dict()
    self._consolidated_summary = {}
def run(self):
    """Replay every item in ``self._sync_list`` through a ``SendManager``.

    For each item: a directory is resolved to the single file inside it whose
    name ends with ``WANDB_SUFFIX`` (the directory is skipped otherwise), the
    file is scanned record by record, and each record is either printed
    (when ``self._view`` is set) or handed to the send manager. When
    ``self._mark_synced`` is set an empty marker file named after the item
    plus ``SYNCED_SUFFIX`` is created.
    """
    for sync_item in self._sync_list:
        if os.path.isdir(sync_item):
            files = os.listdir(sync_item)
            filtered_files = list(
                filter(lambda f: f.endswith(WANDB_SUFFIX), files))
            # Skip when the old-format check fires or the directory does not
            # contain exactly one candidate file.
            if check_and_warn_old(files) or len(filtered_files) != 1:
                print("Skipping directory: {}".format(sync_item))
                continue
            sync_item = os.path.join(sync_item, filtered_files[0])
        dirname = os.path.dirname(sync_item)
        files_dir = os.path.join(dirname, "files")
        # Static settings for the send manager; only files_dir is derived
        # from the item being synced.
        sd = dict(
            files_dir=files_dir,
            _start_time=0,
            git_remote=None,
            resume=None,
            program=None,
            ignore_globs=(),
            run_id=None,
            entity=None,
            project=None,
            run_group=None,
            job_type=None,
            run_tags=None,
            run_name=None,
            run_notes=None,
            save_code=None,
        )
        settings = settings_static.SettingsStatic(sd)
        record_q = queue.Queue()
        result_q = queue.Queue()
        publish_interface = interface.BackendSender(record_q=record_q)
        sm = sender.SendManager(
            settings=settings,
            record_q=record_q,
            result_q=result_q,
            interface=publish_interface,
        )
        ds = datastore.DataStore()
        ds.open_for_scan(sync_item)

        # save exit for final send
        exit_pb = None
        # True once the run URL has been printed for this item.
        shown = False

        while True:
            data = ds.scan_data()
            if data is None:
                break
            pb = wandb_internal_pb2.Record()
            pb.ParseFromString(data)
            record_type = pb.WhichOneof("record_type")
            # View mode only prints records; nothing is sent.
            if self._view:
                if self._verbose:
                    print("Record:", pb)
                else:
                    print("Record:", record_type)
                continue
            if record_type == "run":
                # Apply overrides configured on this object, then request a
                # response for the run record.
                if self._run_id:
                    pb.run.run_id = self._run_id
                if self._project:
                    pb.run.project = self._project
                if self._entity:
                    pb.run.entity = self._entity
                pb.control.req_resp = True
            elif record_type == "exit":
                # Hold the exit record back; it is sent in place of "final".
                exit_pb = pb
                continue
            elif record_type == "final":
                assert exit_pb, "final seen without exit"
                pb = exit_pb
                exit_pb = None
            sm.send(pb)
            # send any records that were added in previous send
            while not record_q.empty():
                data = record_q.get(block=True)
                sm.send(data)

            if pb.control.req_resp:
                # Blocks until a result is available on result_q.
                result = result_q.get(block=True)
                result_type = result.WhichOneof("result_type")
                if not shown and result_type == "run_result":
                    r = result.run_result.run
                    # TODO(jhr): hardcode until we have settings in sync
                    url = "{}/{}/{}/runs/{}".format(
                        self._app_url,
                        url_quote(r.entity),
                        url_quote(r.project),
                        url_quote(r.run_id),
                    )
                    print("Syncing: %s ..." % url, end="")
                    sys.stdout.flush()
                    shown = True
        sm.finish()
        if self._mark_synced:
            synced_file = "{}{}".format(sync_item, SYNCED_SUFFIX)
            # Create (or truncate) the marker file; its content is empty.
            with open(synced_file, "w"):
                pass
        print("done.")
def ensure_launched(
    self,
    settings=None,
    log_level=None,
    stdout_fd=None,
    stderr_fd=None,
    use_redirect=None,
):
    """Launch backend worker if not running.

    Creates the pipe and queues shared with the worker, starts the
    ``wandb_internal`` process and stores the resulting handles and a
    ``BackendSender`` on this object.

    Args:
        settings: Mapping copied into a plain ``dict`` before being passed to
            the worker; ``None`` becomes an empty dict.
        log_level: Worker log level; defaults to ``logging.DEBUG``.
        stdout_fd: When ``use_redirect`` is falsy and the platform is not
            Windows, sent to the worker over the pipe and then closed here.
        stderr_fd: Handled the same way as ``stdout_fd``.
        use_redirect: Passed to the worker; a truthy value skips the
            descriptor hand-off.
    """
    # NOTE(review): no check for an already-running worker is visible in this
    # method, despite the summary line -- confirm against callers.
    log_level = log_level or logging.DEBUG
    settings = settings or {}
    settings = dict(settings)

    # os.set_inheritable(stdout_fd, True)
    # os.set_inheritable(stderr_fd, True)
    # stdout_read_file = os.fdopen(stdout_fd, 'rb')
    # stderr_read_file = os.fdopen(stderr_fd, 'rb')

    fd_pipe_child, fd_pipe_parent = self._wl._multiprocessing.Pipe()

    process_queue = self._wl._multiprocessing.Queue()
    # async_queue = self._wl._multiprocessing.Queue()
    # fd_request_queue = self._wl._multiprocessing.Queue()
    # fd_response_queue = self._wl._multiprocessing.Queue()
    # TODO: should this be one item just to make sure it stays fully synchronous?
    req_queue = self._wl._multiprocessing.Queue()
    resp_queue = self._wl._multiprocessing.Queue()
    cancel_queue = self._wl._multiprocessing.Queue()
    notify_queue = self._wl._multiprocessing.Queue()

    wandb_process = self._wl._multiprocessing.Process(
        target=wandb_internal,
        args=(
            settings,
            notify_queue,
            process_queue,
            req_queue,
            resp_queue,
            cancel_queue,
            fd_pipe_child,
            log_level,
            use_redirect,
        ),
    )
    wandb_process.name = "wandb_internal"

    # Support running code without a: __name__ == "__main__"
    # The main module's spec name (or, failing that, its __file__) is
    # temporarily pointed at wandb's mpmain module while the process starts.
    save_mod_name = None
    save_mod_path = None
    main_module = sys.modules["__main__"]
    main_mod_spec = getattr(main_module, "__spec__", None)
    main_mod_path = getattr(main_module, "__file__", None)
    main_mod_name = None
    if main_mod_spec:
        main_mod_name = getattr(main_mod_spec, "name", None)
    if main_mod_name is not None:
        save_mod_name = main_mod_name
        main_module.__spec__.name = "wandb.internal.mpmain"
    elif main_mod_path is not None:
        save_mod_path = main_module.__file__
        fname = os.path.join(os.path.dirname(wandb.__file__), "internal", "mpmain", "__main__.py")
        main_module.__file__ = fname

    # Start the process with __name__ == "__main__" workarounds
    wandb_process.start()

    if use_redirect:
        pass
    else:
        if platform.system() == "Windows":
            # https://bugs.python.org/issue38188
            # import msvcrt
            # print("DEBUG1: {}".format(stdout_fd))
            # stdout_fd = msvcrt.get_osfhandle(stdout_fd)
            # print("DEBUG2: {}".format(stdout_fd))
            # stderr_fd = msvcrt.get_osfhandle(stderr_fd)
            # multiprocessing.reduction.send_handle(fd_pipe_parent,
            #   stdout_fd, wandb_process.pid)
            # multiprocessing.reduction.send_handle(fd_pipe_parent,
            #   stderr_fd, wandb_process.pid)

            # should we do this?
            # os.close(stdout_fd)
            # os.close(stderr_fd)
            pass
        else:
            # Hand both descriptors to the worker over the parent pipe end.
            multiprocessing.reduction.send_handle(fd_pipe_parent, stdout_fd, wandb_process.pid)
            multiprocessing.reduction.send_handle(fd_pipe_parent, stderr_fd, wandb_process.pid)

            # should we do this?
            os.close(stdout_fd)
            os.close(stderr_fd)

    # Undo temporary changes from: __name__ == "__main__"
    # NOTE(review): this restore is skipped if start() or the descriptor
    # hand-off above raises, leaving the main module patched.
    if save_mod_name:
        main_module.__spec__.name = save_mod_name
    elif save_mod_path:
        main_module.__file__ = save_mod_path

    self.fd_pipe_parent = fd_pipe_parent

    self.wandb_process = wandb_process

    self.process_queue = process_queue
    # self.async_queue = async_queue
    # self.fd_request_queue = fd_request_queue
    # self.fd_response_queue = fd_response_queue
    self.req_queue = req_queue
    self.resp_queue = resp_queue
    self.cancel_queue = cancel_queue
    self.notify_queue = notify_queue

    self.interface = interface.BackendSender(
        process=wandb_process,
        notify_queue=notify_queue,
        process_queue=process_queue,
        request_queue=req_queue,
        response_queue=resp_queue,
    )
def setup(self):
    """Create a server run directory and start the ``wandb_internal`` worker.

    The run directory is ``wandb/run-<timestamp>-server`` with a ``files``
    sub-directory. The worker is started with a fixed settings dict and a
    record/result queue pair; the queues, the process and a
    ``BackendSender`` are stored on this object.

    Raises:
        OSError: If the files directory already exists (``os.makedirs`` is
            called without ``exist_ok``).
    """
    log_level = logging.DEBUG
    start_time = time.time()
    start_datetime = datetime.datetime.now()
    timespec = datetime.datetime.strftime(start_datetime, "%Y%m%d_%H%M%S")

    wandb_dir = "wandb"
    run_path = "run-{}-server".format(timespec)
    run_dir = os.path.join(wandb_dir, run_path)
    files_dir = os.path.join(run_dir, "files")
    sync_file = os.path.join(run_dir, "run-{}.wandb".format(start_time))
    os.makedirs(files_dir)

    settings = dict(
        log_internal=os.path.join(run_dir, "internal.log"),
        files_dir=files_dir,
        _start_time=start_time,
        _start_datetime=start_datetime,
        disable_code=None,
        code_program=None,
        save_code=None,
        sync_file=sync_file,
        _internal_queue_timeout=20,
        _internal_check_process=0,
        _disable_meta=True,
        _disable_stats=False,
        git_remote=None,
        program=None,
        resume=None,
        ignore_globs=(),
        offline=None,
        _log_level=log_level,
        run_id=None,
        entity=None,
        project=None,
        run_group=None,
        run_job_type=None,
        run_tags=None,
        run_name=None,
        run_notes=None,
        _jupyter=None,
        _kaggle=None,
        _offline=None,
    )

    mp = multiprocessing
    # No Pipe is created here: the worker below is driven purely through the
    # record/result queues, so a pipe would only hold descriptors open while
    # the child process is being started.
    record_q = mp.Queue()
    # TODO: should this be one item just to make sure it stays fully synchronous?
    result_q = mp.Queue()

    wandb_process = mp.Process(
        target=wandb_internal,
        kwargs=dict(
            settings=settings,
            record_q=record_q,
            result_q=result_q,
        ),
    )
    wandb_process.name = "wandb_internal"
    wandb_process.start()

    self.record_q = record_q
    self.result_q = result_q
    self.wandb_process = wandb_process
    self._interface = interface.BackendSender(
        record_q=record_q,
        result_q=result_q,
        process=wandb_process,
    )
def setup(self):
    """Create a server run directory and start the ``wandb_internal`` worker.

    The run directory is ``wandb/run-<timestamp>-server`` with a ``files``
    sub-directory. The worker receives its settings, queues, the child end
    of a pipe, the log level and the redirect flag as positional arguments.
    The process, the notify queue and a ``BackendSender`` are stored on this
    object.
    """
    log_level = logging.DEBUG
    start_time = time.time()
    start_datetime = datetime.datetime.now()

    # Directory layout: wandb/run-<YYYYmmdd_HHMMSS>-server/files
    timespec = start_datetime.strftime("%Y%m%d_%H%M%S")
    run_dir = os.path.join("wandb", "run-{}-server".format(timespec))
    files_dir = os.path.join(run_dir, "files")
    sync_file = os.path.join(run_dir, "run-{}.wandb".format(start_time))
    os.makedirs(files_dir)

    settings = {
        "log_internal": os.path.join(run_dir, "internal.log"),
        "files_dir": files_dir,
        "_start_time": start_time,
        "_start_datetime": start_datetime,
        "disable_code": None,
        "code_program": None,
        "save_code": None,
        "sync_file": sync_file,
        "_internal_queue_timeout": 20,
        "_internal_check_process": 0,
        "_disable_meta": True,
        "_disable_stats": False,
        "git_remote": None,
        "program": None,
        "resume": None,
        "ignore_globs": (),
    }

    fd_pipe_child, fd_pipe_parent = multiprocessing.Pipe()

    process_queue = multiprocessing.Queue()
    # TODO: should this be one item just to make sure it stays fully synchronous?
    req_queue = multiprocessing.Queue()
    resp_queue = multiprocessing.Queue()
    cancel_queue = multiprocessing.Queue()
    notify_queue = multiprocessing.Queue()

    use_redirect = True
    worker_args = (
        settings,
        notify_queue,
        process_queue,
        req_queue,
        resp_queue,
        cancel_queue,
        fd_pipe_child,
        log_level,
        use_redirect,
    )
    wandb_process = multiprocessing.Process(target=wandb_internal, args=worker_args)
    wandb_process.name = "wandb_internal"
    wandb_process.start()

    self.wandb_process = wandb_process
    self.notify_queue = notify_queue
    self._interface = interface.BackendSender(
        process_queue=process_queue,
        notify_queue=notify_queue,
        request_queue=req_queue,
        response_queue=resp_queue,
        process=wandb_process,
    )
def ensure_launched(
    self,
    settings=None,
    log_level=None,
    stdout_fd=None,
    stderr_fd=None,
    use_redirect=None,
):
    """Launch the backend worker process.

    Creates the record/result queues, starts ``wandb_internal`` in a new
    process and stores the queues, the process, its pid and a
    ``BackendSender`` on this object.

    Args:
        settings: Mapping copied into a plain ``dict`` for the worker;
            ``None`` becomes an empty dict. An ``_early_logger`` entry is
            dropped from the copy.
        log_level: Stored in the copy as ``_log_level``; defaults to
            ``logging.DEBUG``.
        stdout_fd: Accepted for interface compatibility; unused here.
        stderr_fd: Accepted for interface compatibility; unused here.
        use_redirect: Accepted for interface compatibility; unused here.
    """
    settings = dict(settings or ())
    settings["_log_level"] = log_level or logging.DEBUG

    # TODO: this is brittle and should likely be handled directly on the
    # settings object. Multi-processing blows up when it can't pickle
    # objects.
    settings.pop("_early_logger", None)

    mp = self._wl._multiprocessing
    self.record_q = mp.Queue()
    self.result_q = mp.Queue()
    self.wandb_process = mp.Process(
        target=wandb_internal,
        kwargs=dict(
            settings=settings,
            record_q=self.record_q,
            result_q=self.result_q,
        ),
    )
    self.wandb_process.name = "wandb_internal"

    # Support running code without a: __name__ == "__main__"
    # The main module's spec name (or, failing that, its __file__) is
    # temporarily pointed at wandb's mpmain module while the process starts.
    save_mod_name = None
    save_mod_path = None
    main_module = sys.modules["__main__"]
    main_mod_spec = getattr(main_module, "__spec__", None)
    main_mod_path = getattr(main_module, "__file__", None)
    main_mod_name = None
    if main_mod_spec:
        main_mod_name = getattr(main_mod_spec, "name", None)
    if main_mod_name is not None:
        save_mod_name = main_mod_name
        main_module.__spec__.name = "wandb.internal.mpmain"
    elif main_mod_path is not None:
        save_mod_path = main_module.__file__
        main_module.__file__ = os.path.join(
            os.path.dirname(wandb.__file__), "internal", "mpmain", "__main__.py"
        )

    try:
        # Start the process with __name__ == "__main__" workarounds
        self.wandb_process.start()
        self._internal_pid = self.wandb_process.pid
    finally:
        # Undo temporary changes from: __name__ == "__main__"
        # Runs even when start() raises, so the caller's main module is
        # never left patched. The checks mirror the conditions used when
        # the patch was applied.
        if save_mod_name is not None:
            main_module.__spec__.name = save_mod_name
        elif save_mod_path is not None:
            main_module.__file__ = save_mod_path

    self.interface = interface.BackendSender(
        process=self.wandb_process,
        record_q=self.record_q,
        result_q=self.result_q,
    )
def wandb_internal(settings, record_q, result_q):
    """Internal process function entrypoint.

    Read from record queue and dispatch work to various threads.

    Starts a sender, a writer and a handler thread, then waits until the
    shared stop event is set: by this loop when the parent process check
    reports dead, after a second keyboard interrupt, or by anything else
    holding the event. After joining the threads, the first thread that
    reports an exception is logged, printed and reported, and the process
    exits with status -1.

    Args:
        settings: dictionary of configuration parameters.
        record_q: records to be handled
        result_q: for sending results back
    """
    # mark this process as internal
    wandb._IS_INTERNAL_PROCESS = True

    # Lets make sure we dont modify settings so use a static object
    settings = settings_static.SettingsStatic(settings)
    if settings.log_internal:
        configure_logging(settings.log_internal, settings._log_level)

    parent_pid = os.getppid()
    pid = os.getpid()

    logger.info("W&B internal server running at pid: %s", pid)

    publish_interface = interface.BackendSender(record_q=record_q)

    stopped = threading.Event()
    threads = []

    send_record_q = queue.Queue()
    record_sender_thread = SenderThread(
        settings=settings,
        record_q=send_record_q,
        result_q=result_q,
        stopped=stopped,
        interface=publish_interface,
    )
    threads.append(record_sender_thread)

    write_record_q = queue.Queue()
    record_writer_thread = WriterThread(
        settings=settings,
        record_q=write_record_q,
        result_q=result_q,
        stopped=stopped,
        writer_q=write_record_q,
    )
    threads.append(record_writer_thread)

    record_handler_thread = HandlerThread(
        settings=settings,
        record_q=record_q,
        result_q=result_q,
        stopped=stopped,
        sender_q=send_record_q,
        writer_q=write_record_q,
        interface=publish_interface,
    )
    threads.append(record_handler_thread)

    process_check = ProcessCheck(settings=settings, pid=parent_pid)

    for thread in threads:
        thread.start()

    interrupt_count = 0
    # Event.is_set() is used instead of the deprecated isSet() alias.
    while not stopped.is_set():
        try:
            # wait for stop event
            while not stopped.is_set():
                time.sleep(1)
                if process_check.is_dead():
                    logger.error("Internal process shutdown.")
                    stopped.set()
        except KeyboardInterrupt:
            # The first interrupt is only counted; the outer loop resumes
            # waiting. The second one triggers shutdown below.
            interrupt_count += 1
            logger.warning("Internal process interrupt: %s", interrupt_count)
        finally:
            if interrupt_count >= 2:
                logger.error("Internal process interrupted.")
                stopped.set()

    for thread in threads:
        thread.join()

    for thread in threads:
        exc_info = thread.get_exception()
        if exc_info:
            logger.error("Thread %s:", thread.name, exc_info=exc_info)
            print("Thread {}:".format(thread.name), file=sys.stderr)
            traceback.print_exception(*exc_info)
            sentry_exc(exc_info, delay=True)
            # Exits on the first failing thread; later threads are not
            # inspected.
            sys.exit(-1)