def mp_worker(arguments):
    print('sub process', os.getpid())
    inputs, the_time = arguments
    from random import randint
    additional_parameters = {
        'stuff_' + str(randint(0, 100)): 'some stuff ' + str(randint(0, 100))
    }
    Task.current_task().connect(additional_parameters)
    print(" Process %s\tWaiting %s seconds" % (inputs, the_time))
    time.sleep(int(the_time))
    print(" Process %s\tDONE" % inputs)
def get_output(command, return_command=False):
    save_artifact = False
    if command.startswith("tlt") and (
            command.partition(" ")[0] != "tlt-train"
            and command.partition(" ")[0] != "tlt-converter"):
        command_prefix, _, command_args = command.partition(" ")
        command_prefix = shutil.which(command_prefix)
        command = "{} {} {}".format(sys.executable, command_prefix, command_args)
    elif command.startswith("ls -rlt"):
        # we will save as artifact if needed
        save_artifact = True
    print("=============== Running command: {}".format(command))
    result = run(command, stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=True)
    print(result.stdout)
    if save_artifact:
        name = result.stdout.split("\n")[-2].rpartition(" ")[2]
        if name.endswith("tlt") or name.endswith("etlt") or name.endswith("hdf5"):
            command_path = command.partition(" ")[2].rpartition(" ")[2]
            tlt_task = Task.current_task()
            tlt_task.upload_artifact(
                name=name,
                artifact_object=os.path.join(os.path.expandvars(command_path), name),
            )
    if return_command:
        return result.stdout
def _setup_check_clearml(self, logger: ClearMLLogger, output_uri: str) -> None:
    try:
        from clearml import Task
    except ImportError:
        try:
            # Backwards-compatibility for legacy Trains SDK
            from trains import Task
        except ImportError:
            raise RuntimeError(
                "This contrib module requires clearml to be installed. "
                "You may install clearml using: \n pip install clearml \n")

    if logger and not isinstance(logger, ClearMLLogger):
        raise TypeError("logger must be an instance of ClearMLLogger")

    self._task = Task.current_task()
    if not self._task:
        raise RuntimeError(
            "ClearMLSaver requires a ClearML Task to be initialized. "
            "Please use the `logger` argument or call `clearml.Task.init()`."
        )

    if output_uri:
        self._task.output_uri = output_uri
def download_pretrained_model(model_name, ngc_model, conf_file):
    model_file = (
        get_field_from_config(conf_file, "pretrained_model_file").strip().strip('"'))
    if model_file:
        model_dir = model_file.rpartition("/")[0].rpartition("/")[0]
        os.makedirs(model_dir)
    else:
        model_dir = "tmp/"
        os.makedirs(model_dir)
    # Download the pretrained model from NGC
    download_path = None
    command_output = get_output(
        "ngc registry model download-version {} --dest {}".format(ngc_model, model_dir),
        return_command=True,
    )
    for output in command_output.split("\n"):
        if output.startswith("Downloaded local path"):
            download_path = output.partition(":")[2].strip()
            break
    if download_path:
        tlt_task = Task.current_task()
        tlt_task.upload_artifact(
            name=model_name,
            artifact_object=os.path.join(
                os.path.expandvars("{}".format(download_path)),
                "{}.hdf5".format(model_name),
            ),
        )
def _clearml_log_params(params_dict):
    try:
        from clearml import Task
    except ImportError:
        # Backwards-compatibility for legacy Trains SDK
        from trains import Task
    task = Task.current_task()
    task.connect(params_dict)
def _clearml_log_artifact(fp):
    try:
        from clearml import Task
    except ImportError:
        # Backwards-compatibility for legacy Trains SDK
        from trains import Task
    task = Task.current_task()
    task.upload_artifact(Path(fp).name, fp)
def run(num_workers):
    """ Distributed Synchronous SGD Example """
    th.manual_seed(1234)
    train_set, bsz = partition_dataset(num_workers)
    model = Net()
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
    num_batches = ceil(len(train_set.dataset) / float(bsz))

    from random import randint
    param = {'worker_{}_stuff'.format(dist.get_rank()): 'some stuff ' + str(randint(0, 100))}
    Task.current_task().connect(param)
    Task.current_task().upload_artifact(
        'temp {:02d}'.format(dist.get_rank()),
        artifact_object={'worker_rank': dist.get_rank()})

    for epoch in range(2):
        epoch_loss = 0.0
        for i, (data, target) in enumerate(train_set):
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            epoch_loss += loss.item()
            loss.backward()
            average_gradients(model)
            optimizer.step()
            if i % 10 == 0:
                print('{}] Train Epoch {} - {} \tLoss {:.6f}'.format(
                    dist.get_rank(), epoch, i, loss))
                Task.current_task().get_logger().report_scalar(
                    'loss', 'worker {:02d}'.format(dist.get_rank()),
                    value=loss.item(), iteration=i)
            if i > 100:
                break
        print('Rank ', dist.get_rank(), ', epoch ', epoch, ': ', epoch_loss / num_batches)
def __init__(self, **kwargs: Any):
    try:
        from clearml import Task
        from clearml.binding.frameworks.tensorflow_bind import WeightsGradientHistHelper
    except ImportError:
        try:
            # Backwards-compatibility for legacy Trains SDK
            from trains import Task
            from trains.binding.frameworks.tensorflow_bind import WeightsGradientHistHelper
        except ImportError:
            raise RuntimeError(
                "This contrib module requires clearml to be installed. "
                "You may install clearml using: \n pip install clearml \n")

    experiment_kwargs = {
        k: v for k, v in kwargs.items()
        if k not in ("project_name", "task_name", "task_type")
    }

    if self.bypass_mode():
        warnings.warn("ClearMLSaver: running in bypass mode")

        class _Stub(object):
            def __call__(self, *_: Any, **__: Any) -> "_Stub":
                return self

            def __getattr__(self, attr: str) -> "_Stub":
                if attr in ("name", "id"):
                    return ""  # type: ignore[return-value]
                return self

            def __setattr__(self, attr: str, val: Any) -> None:
                pass

        self._task = _Stub()
    else:
        # Try to retrieve the current ClearML Task before trying to create a new one
        self._task = Task.current_task()
        if self._task is None:
            self._task = Task.init(
                project_name=kwargs.get("project_name"),
                task_name=kwargs.get("task_name"),
                task_type=kwargs.get("task_type", Task.TaskTypes.training),
                **experiment_kwargs,
            )

    self.clearml_logger = self._task.get_logger()
    self.grad_helper = WeightsGradientHistHelper(logger=self.clearml_logger)
def model_prune(task_args):
    # Create an output directory if it doesn't exist.
    get_output("mkdir -p /home/{}/experiment_dir_pruned".format(task_args.arch))
    train_task = Task.get_task(task_id=task_args.trains_model_task)
    unpruned_weights = train_task.artifacts["unpruned_weights"].get_local_copy()
    tlt_prune(task_args, unpruned_weights)
    tlt_task = Task.current_task()
    tlt_task.upload_artifact(
        name="pruned_weights",
        artifact_object=os.path.join(
            os.path.expandvars("{}".format(task_args.output_file))),
    )
def remote_run_experiment(self):
    for parameter_setup in self._parameter_setups:
        print(parameter_setup)
        task = Task.create(
            project_name=f"{self._project_name}",
            task_name=self.make_task_name(parameter_setup),
            repo=self._repo,
            branch=self._branch,
            script=self._script,
            requirements_file="../requirements.txt"
        )
        task.set_parent(Task.current_task().id)
        task.connect(parameter_setup)
        Task.enqueue(task, self._queue)
def predictions_gt_images_handler(engine, logger, *args, **kwargs):
    x, _ = engine.state.batch
    y_pred, y = engine.state.output

    num_x = num_y = 4
    le = num_x * num_y
    fig = plt.figure(figsize=(20, 20))
    trans = transforms.ToPILImage()
    classes = (
        "plane", "car", "bird", "cat", "deer",
        "dog", "frog", "horse", "ship", "truck",
    )
    enumeration = {k: v for v, k in enumerate(classes, 1)}
    Task.current_task().connect_label_enumeration(enumeration)

    for idx in range(le):
        preds = torch.argmax(F.softmax(y_pred[idx], dim=0))
        probs = torch.max(F.softmax(y_pred[idx], dim=0))
        ax = fig.add_subplot(num_x, num_y, idx + 1, xticks=[], yticks=[])
        ax.imshow(trans(x[idx]))
        ax.set_title(
            "{0} {1:.1f}% (label: {2})".format(classes[preds], probs * 100, classes[y[idx]]),
            color=("green" if preds == y[idx] else "red"),
        )
    logger.writer.add_figure("predictions vs actuals", figure=fig, global_step=engine.state.epoch)
def train_unpruned(model_name):
    train_tlt()
    tlt_task = Task.current_task()
    get_output("ls -lh {}".format(tlt_task.get_parameter("Args/results_dir")))
    tlt_task.upload_artifact(
        name="unpruned_weights",
        artifact_object=os.path.join(
            os.path.expandvars("{}/weights/{}.tlt".format(
                tlt_task.get_parameter("Args/results_dir"), model_name))),
    )
    tlt_task.upload_artifact(
        name="pbtxt model configuration file",
        artifact_object=os.path.join(
            os.path.expandvars("{}/graph.pbtxt".format(
                tlt_task.get_parameter("Args/results_dir")))),
    )
def compute_and_log_cm(cm_metric, iteration):
    cm = cm_metric.compute()
    # CM: values are normalized such that diagonal values represent class recalls
    cm = ConfusionMatrix.normalize(cm, "recall").cpu().numpy()
    if idist.get_rank() == 0:
        from clearml import Task

        clearml_logger = Task.current_task().get_logger()
        clearml_logger.report_confusion_matrix(
            title="Final Confusion Matrix",
            series="cm-preds-gt",
            matrix=cm,
            iteration=iteration,
            xlabels=data.VOCSegmentationOpencv.target_names,
            ylabels=data.VOCSegmentationOpencv.target_names,
        )
def compute_and_log_cm():
    cm = cm_metric.compute()
    # CM: values are normalized such that diagonal values represent class recalls
    cm = ConfusionMatrix.normalize(cm, "recall").cpu().numpy()
    if idist.get_rank() == 0:
        try:
            from clearml import Task
        except ImportError:
            # Backwards-compatibility for legacy Trains SDK
            from trains import Task

        clearml_logger = Task.current_task().get_logger()
        clearml_logger.report_confusion_matrix(
            title="Final Confusion Matrix",
            series="cm-preds-gt",
            matrix=cm,
            iteration=trainer.state.iteration,
            xlabels=VOCSegmentationOpencv.target_names,
            ylabels=VOCSegmentationOpencv.target_names,
        )
def _daemon(cls, jupyter_notebook_filename):
    from clearml import Task

    # load jupyter notebook package
    # noinspection PyBroadException
    try:
        # noinspection PyPackageRequirements
        from nbconvert.exporters.script import ScriptExporter
        _script_exporter = ScriptExporter()
    except Exception as ex:
        _logger.warning('Could not read Jupyter Notebook: {}'.format(ex))
        return

    # load pigar
    # noinspection PyBroadException
    try:
        from ....utilities.pigar.reqs import get_installed_pkgs_detail, file_import_modules
        from ....utilities.pigar.modules import ReqsModules
        from ....utilities.pigar.log import logger
        logger.setLevel(logging.WARNING)
    except Exception:
        file_import_modules = None

    # load IPython
    # noinspection PyBroadException
    try:
        # noinspection PyPackageRequirements
        from IPython import get_ipython
    except Exception:
        # should not happen
        get_ipython = None

    # setup local notebook files
    if jupyter_notebook_filename:
        notebook = Path(jupyter_notebook_filename)
        local_jupyter_filename = jupyter_notebook_filename
    else:
        notebook = None
        fd, local_jupyter_filename = mkstemp(suffix='.ipynb')
        os.close(fd)

    last_update_ts = None
    counter = 0
    prev_script_hash = None

    # noinspection PyBroadException
    try:
        from ....version import __version__
        our_module = cls.__module__.split('.')[0], __version__
    except Exception:
        our_module = None

    # noinspection PyBroadException
    try:
        import re
        replace_ipython_pattern = re.compile(r'\n([ \t]*)get_ipython\(\)')
    except Exception:
        replace_ipython_pattern = None

    # main observer loop, check if we need to exit
    while not cls._exit_event.wait(timeout=0.):
        # wait for timeout or sync event
        cls._sync_event.wait(cls._sample_frequency if counter else cls._first_sample_frequency)
        cls._sync_event.clear()
        counter += 1
        # noinspection PyBroadException
        try:
            # if there is no task connected, do nothing
            task = Task.current_task()
            if not task:
                continue

            script_code = None
            fmodules = None
            current_cell = None
            # if we have a local file:
            if notebook:
                if not notebook.exists():
                    continue
                # check if notebook changed
                if last_update_ts is not None and notebook.stat().st_mtime - last_update_ts <= 0:
                    continue
                last_update_ts = notebook.stat().st_mtime
            else:
                # serialize notebook to a temp file
                if cls._jupyter_history_logger:
                    script_code, current_cell = cls._jupyter_history_logger.history_to_str()
                else:
                    # noinspection PyBroadException
                    try:
                        # noinspection PyBroadException
                        try:
                            os.unlink(local_jupyter_filename)
                        except Exception:
                            pass
                        get_ipython().run_line_magic('history', '-t -f {}'.format(local_jupyter_filename))
                        with open(local_jupyter_filename, 'r') as f:
                            script_code = f.read()
                        # load the modules
                        from ....utilities.pigar.modules import ImportedModules
                        fmodules = ImportedModules()
                        for nm in set([str(m).split('.')[0] for m in sys.modules]):
                            fmodules.add(nm, 'notebook', 0)
                    except Exception:
                        continue

            # get notebook python script
            if script_code is None and local_jupyter_filename:
                script_code, _ = _script_exporter.from_filename(local_jupyter_filename)
                if cls._store_notebook_artifact:
                    # also upload the jupyter notebook as artifact
                    task.upload_artifact(
                        name='notebook',
                        artifact_object=Path(local_jupyter_filename),
                        preview='See `notebook preview` artifact',
                        metadata={'UPDATE': datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')},
                        wait_on_upload=True,
                    )
                    # noinspection PyBroadException
                    try:
                        from nbconvert.exporters import HTMLExporter  # noqa
                        html, _ = HTMLExporter().from_filename(filename=local_jupyter_filename)
                        local_html = Path(gettempdir()) / 'notebook_{}.html'.format(task.id)
                        with open(local_html.as_posix(), 'wt') as f:
                            f.write(html)
                        task.upload_artifact(
                            name='notebook preview',
                            artifact_object=local_html,
                            preview='Click `FILE PATH` link',
                            metadata={'UPDATE': datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')},
                            delete_after_upload=True,
                            wait_on_upload=True,
                        )
                    except Exception:
                        pass

            current_script_hash = hash(script_code + (current_cell or ''))
            if prev_script_hash and prev_script_hash == current_script_hash:
                continue

            # remove ipython direct access from the script code
            # we will not be able to run them anyhow
            if replace_ipython_pattern:
                script_code = replace_ipython_pattern.sub(r'\n# \g<1>get_ipython()', script_code)

            requirements_txt = ''
            conda_requirements = ''
            # parse jupyter python script and prepare pip requirements (pigar)
            # if backend supports requirements
            if file_import_modules and Session.check_min_api_version('2.2'):
                if fmodules is None:
                    fmodules, _ = file_import_modules(
                        notebook.parts[-1] if notebook else 'notebook', script_code)
                    if current_cell:
                        cell_fmodules, _ = file_import_modules(
                            notebook.parts[-1] if notebook else 'notebook', current_cell)
                        # noinspection PyBroadException
                        try:
                            fmodules |= cell_fmodules
                        except Exception:
                            pass
                # add current cell to the script
                if current_cell:
                    script_code += '\n' + current_cell
                fmodules = ScriptRequirements.add_trains_used_packages(fmodules)
                # noinspection PyUnboundLocalVariable
                installed_pkgs = get_installed_pkgs_detail()
                # make sure we are in installed packages
                if our_module and (our_module[0] not in installed_pkgs):
                    installed_pkgs[our_module[0]] = our_module
                # noinspection PyUnboundLocalVariable
                reqs = ReqsModules()
                for name in fmodules:
                    if name in installed_pkgs:
                        pkg_name, version = installed_pkgs[name]
                        reqs.add(pkg_name, version, fmodules[name])
                requirements_txt, conda_requirements = ScriptRequirements.create_requirements_txt(reqs)

            # update script
            prev_script_hash = current_script_hash
            data_script = task.data.script
            data_script.diff = script_code
            data_script.requirements = {'pip': requirements_txt, 'conda': conda_requirements}
            # noinspection PyProtectedMember
            task._update_script(script=data_script)
            # update requirements
            # noinspection PyProtectedMember
            task._update_requirements(requirements=requirements_txt)
        except Exception:
            pass
def run(epochs, lr, momentum, log_interval, params, trainloader, testloader, model):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    net = Net(params).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=lr, momentum=momentum)

    trainer = create_supervised_trainer(net, optimizer, criterion, device=device)
    trainer.logger = setup_logger("trainer")

    val_metrics = {
        "accuracy": Accuracy(),
        "loss": Loss(criterion),
        "recall": Recall(),
    }
    evaluator = create_supervised_evaluator(net, metrics=val_metrics, device=device)
    evaluator.logger = setup_logger("evaluator")

    # Attach handler to plot trainer's loss every 100 iterations
    tb_logger = TensorboardLogger(log_dir="cifar-output")
    tb_logger.attach_output_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=params.get("loss_report")),
        tag="training",
        output_transform=lambda loss: {"loss": loss},
    )

    # Attach handler to dump evaluator's metrics every epoch completed
    for tag, evaluator in [("training", trainer), ("validation", evaluator)]:
        tb_logger.attach_output_handler(
            evaluator,
            event_name=Events.EPOCH_COMPLETED,
            tag=tag,
            metric_names="all",
            global_step_transform=global_step_from_engine(trainer),
        )

    # Attach function to build debug images and report every epoch end
    tb_logger.attach(
        evaluator,
        log_handler=predictions_gt_images_handler,
        event_name=Events.EPOCH_COMPLETED(once=1),
    )

    desc = "ITERATION - loss: {:.2f}"
    pbar = tqdm(initial=0, leave=False, total=len(trainloader), desc=desc.format(0))

    @trainer.on(Events.ITERATION_COMPLETED(every=log_interval))
    def log_training_loss(engine):
        pbar.desc = desc.format(engine.state.output)
        pbar.update(log_interval)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        pbar.refresh()
        evaluator.run(trainloader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["loss"]
        tqdm.write(
            "Training Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}".format(
                engine.state.epoch, avg_accuracy, avg_nll))

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(testloader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["loss"]
        tqdm.write(
            "Validation Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}".format(
                engine.state.epoch, avg_accuracy, avg_nll))
        pbar.n = pbar.last_print_n = 0

    @trainer.on(Events.EPOCH_COMPLETED | Events.COMPLETED)
    def log_time():
        tqdm.write("{} took {} seconds".format(
            trainer.last_event_name.name,
            trainer.state.times[trainer.last_event_name.name],
        ))

    trainer.run(trainloader, max_epochs=epochs)
    pbar.close()

    PATH = "./cifar_net.pth"
    # CONDITION depicts a custom condition for when to save the model.
    # The model is saved and then updated in ClearML
    CONDITION = True
    if CONDITION:
        torch.save(net.state_dict(), PATH)
        model.update_weights(weights_filename=PATH)
    print("Finished Training")
    print("Task ID number is: {}".format(Task.current_task().id))
def _clearml_log_params(params_dict):
    from clearml import Task
    task = Task.current_task()
    task.connect(params_dict)
def _clearml_log_artifact(fp):
    from clearml import Task
    task = Task.current_task()
    task.upload_artifact(Path(fp).name, fp)
}, index=['falcon', 'dog', 'spider', 'fish'])

# Register Pandas object as artifact to watch
# (it will be monitored in the background and automatically synced and uploaded)
task.register_artifact('train', df, metadata={'counting': 'legs', 'max legs': 69})

# change the artifact object
df.sample(frac=0.5, replace=True, random_state=1)
# or access it from anywhere using the Task's get_registered_artifacts()
Task.current_task().get_registered_artifacts()['train'].sample(
    frac=0.5, replace=True, random_state=1)

# add and upload pandas.DataFrame (onetime snapshot of the object)
task.upload_artifact('Pandas', artifact_object=df)
# add and upload local file artifact
task.upload_artifact('local file', artifact_object=os.path.join('data_samples', 'dancing.jpg'))
# add and upload dictionary (stored as JSON)
task.upload_artifact('dictionary', df.to_dict())
# add and upload Numpy Object (stored as .npz file)
task.upload_artifact('Numpy Eye', np.eye(100, 100))
# add and upload Image (stored as .png file)
im = Image.open(os.path.join('data_samples', 'dancing.jpg'))
task.upload_artifact('pillow_image', im)