Esempio n. 1
0
def mp_worker(arguments):
    """Sub-process worker: connect a random parameter to the task, then sleep.

    :param arguments: two-item sequence ``(inputs, the_time)``. ``inputs`` is
        only used as a label in the printed messages; ``the_time`` is the
        sleep duration in seconds (converted with ``int``).
    """
    print('sub process', os.getpid())
    inputs, the_time = arguments

    from random import randint

    # One-entry parameter dict: random key suffix first, then random value
    # suffix, both drawn from 0..100.
    param_key = 'stuff_' + str(randint(0, 100))
    param_value = 'some stuff ' + str(randint(0, 100))
    Task.current_task().connect({param_key: param_value})

    print(" Process %s\tWaiting %s seconds" % (inputs, the_time))
    time.sleep(int(the_time))
    print(" Process %s\tDONE" % inputs)
Esempio n. 2
0
def get_output(command, return_command=False):
    """Run ``command`` in a shell, print its output and optionally return it.

    Commands starting with ``tlt`` (other than ``tlt-train`` and
    ``tlt-converter``) are rewritten so that the resolved executable path is
    run through ``sys.executable``. For commands starting with ``ls -rlt``,
    the last whitespace-separated token of the final output line is treated
    as a file name; if it ends with ``tlt``, ``etlt`` or ``hdf5`` the file is
    uploaded as an artifact of the current task.

    :param command: shell command line to execute.
    :param return_command: when true, return the captured output.
    :return: the captured stdout (stderr is merged into it) when
        ``return_command`` is true, otherwise ``None``.
    """
    save_artifact = False
    command_prefix, _, command_args = command.partition(" ")
    if command.startswith("tlt") and command_prefix not in ("tlt-train",
                                                            "tlt-converter"):
        resolved_prefix = shutil.which(command_prefix)
        # shutil.which() returns None when the executable is not on PATH;
        # leave the command untouched in that case rather than asking the
        # interpreter to run a file literally named "None".
        if resolved_prefix is not None:
            command = "{} {} {}".format(sys.executable, resolved_prefix,
                                        command_args)
    elif command.startswith("ls -rlt"):  # we will save as artifact if needed
        save_artifact = True

    print("=============== Running command: {}".format(command))
    result = run(command,
                 stdout=PIPE,
                 stderr=STDOUT,
                 universal_newlines=True,
                 shell=True)
    print(result.stdout)

    if save_artifact:
        output_lines = result.stdout.split("\n")
        # Output normally ends with a newline, so the last element is "" and
        # the final listing line sits at index -2. Empty output has no such
        # line: skip the upload instead of raising IndexError.
        if len(output_lines) >= 2:
            name = output_lines[-2].rpartition(" ")[2]
            if name.endswith(("tlt", "etlt", "hdf5")):
                # The listed directory is the last argument of the command.
                command_path = command.partition(" ")[2].rpartition(" ")[2]
                tlt_task = Task.current_task()
                tlt_task.upload_artifact(
                    name=name,
                    artifact_object=os.path.join(
                        os.path.expandvars(command_path), name),
                )

    if return_command:
        return result.stdout
    return None
Esempio n. 3
0
    def _setup_check_clearml(self, logger: ClearMLLogger,
                             output_uri: str) -> None:
        """Validate ``logger`` and bind this object to the current task.

        Args:
            logger: optional logger; when truthy it must be a
                ``ClearMLLogger`` instance.
            output_uri: when truthy, assigned to the task's ``output_uri``.

        Raises:
            RuntimeError: if neither ``clearml`` nor ``trains`` can be
                imported, or if ``Task.current_task()`` returns nothing.
            TypeError: if ``logger`` is truthy but not a ``ClearMLLogger``.
        """
        try:
            from clearml import Task
        except ImportError:
            # Backwards-compatibility for legacy Trains SDK
            try:
                from trains import Task
            except ImportError:
                raise RuntimeError(
                    "This contrib module requires clearml to be installed. "
                    "You may install clearml using: \n pip install clearml \n")

        if logger:
            if not isinstance(logger, ClearMLLogger):
                raise TypeError("logger must be an instance of ClearMLLogger")

        self._task = Task.current_task()
        if not self._task:
            raise RuntimeError(
                "ClearMLSaver requires a ClearML Task to be initialized. "
                "Please use the `logger` argument or call `clearml.Task.init()`."
            )

        # Nothing more to configure when no output location was requested.
        if not output_uri:
            return
        self._task.output_uri = output_uri
Esempio n. 4
0
def download_pretrained_model(model_name, ngc_model, conf_file):
    """Download a pretrained model from NGC and upload it as a task artifact.

    The destination directory is two path levels above the
    ``pretrained_model_file`` entry read from ``conf_file``; when that entry
    is empty, ``tmp/`` is used. If the download command reports a
    ``Downloaded local path``, ``<that path>/<model_name>.hdf5`` is uploaded
    to the current task under the name ``model_name``.

    :param model_name: artifact name and base name of the ``.hdf5`` file.
    :param ngc_model: model/version identifier passed to
        ``ngc registry model download-version``.
    :param conf_file: configuration file handed to ``get_field_from_config``.
    """
    model_file = (get_field_from_config(
        conf_file, "pretrained_model_file").strip().strip('"'))
    if model_file:
        model_dir = model_file.rpartition("/")[0].rpartition("/")[0]
    else:
        model_dir = "tmp/"
    # exist_ok: the directory usually already exists on a re-run (and "tmp/"
    # is shared), which previously raised FileExistsError.
    os.makedirs(model_dir, exist_ok=True)

    # Download the pretrained model from NGC
    command_output = get_output(
        "ngc registry model download-version {} --dest {}".format(
            ngc_model, model_dir),
        return_command=True,
    )

    # The first line announcing the local path carries it after the colon.
    download_path = None
    for output in command_output.split("\n"):
        if output.startswith("Downloaded local path"):
            download_path = output.partition(":")[2].strip()
            break

    if download_path:
        tlt_task = Task.current_task()
        tlt_task.upload_artifact(
            name=model_name,
            artifact_object=os.path.join(
                os.path.expandvars(download_path),
                "{}.hdf5".format(model_name),
            ),
        )
Esempio n. 5
0
def _clearml_log_params(params_dict):
    """Connect ``params_dict`` to the current task.

    ``Task`` is imported from ``clearml``, falling back to the legacy
    ``trains`` package when ``clearml`` is not importable.
    """
    try:
        from clearml import Task as task_cls
    except ImportError:
        # Backwards-compatibility for legacy Trains SDK
        from trains import Task as task_cls

    task_cls.current_task().connect(params_dict)
Esempio n. 6
0
def _clearml_log_artifact(fp):
    """Upload the file at ``fp`` to the current task, named after its basename.

    ``Task`` is imported from ``clearml``, falling back to the legacy
    ``trains`` package when ``clearml`` is not importable.
    """
    try:
        from clearml import Task as task_cls
    except ImportError:
        # Backwards-compatibility for legacy Trains SDK
        from trains import Task as task_cls

    current = task_cls.current_task()
    current.upload_artifact(Path(fp).name, fp)
Esempio n. 7
0
def run(num_workers):
    """Distributed synchronous SGD example.

    Trains ``Net`` for two epochs on this worker's partition, averaging
    gradients across workers after every backward pass. Along the way it
    connects a per-worker parameter to the current task, uploads a small
    per-worker artifact, and reports the loss every 10 iterations.

    :param num_workers: forwarded to ``partition_dataset``.
    """
    th.manual_seed(1234)
    train_set, bsz = partition_dataset(num_workers)
    model = Net()
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

    num_batches = ceil(len(train_set.dataset) / float(bsz))

    from random import randint

    rank = dist.get_rank()
    param = {'worker_{}_stuff'.format(rank): 'some stuff ' + str(randint(0, 100))}
    Task.current_task().connect(param)
    Task.current_task().upload_artifact(
        'temp {:02d}'.format(rank), artifact_object={'worker_rank': rank})

    for epoch in range(2):
        epoch_loss = 0.0
        for step, (data, target) in enumerate(train_set):
            optimizer.zero_grad()
            loss = F.nll_loss(model(data), target)
            epoch_loss += loss.item()
            loss.backward()
            average_gradients(model)
            optimizer.step()

            # Console + scalar report on every 10th iteration.
            if step % 10 == 0:
                print('{}] Train Epoch {} - {} \tLoss  {:.6f}'.format(rank, epoch, step, loss))
                Task.current_task().get_logger().report_scalar(
                    'loss', 'worker {:02d}'.format(rank), value=loss.item(), iteration=step)

            # Stop the epoch once the iteration index exceeds 100.
            if step > 100:
                break

        print('Rank ', rank, ', epoch ',
              epoch, ': ', epoch_loss / num_batches)
Esempio n. 8
0
    def __init__(self, **kwargs: Any):
        """Resolve (or create) the task and set up the logger helpers.

        Args:
            **kwargs: ``project_name``, ``task_name`` and ``task_type`` are
                passed to ``Task.init`` explicitly (``task_type`` defaults to
                ``Task.TaskTypes.training``); every other keyword is
                forwarded to ``Task.init`` unchanged.

        Raises:
            RuntimeError: if neither ``clearml`` nor ``trains`` provides the
                required imports.
        """
        try:
            from clearml import Task
            from clearml.binding.frameworks.tensorflow_bind import WeightsGradientHistHelper
        except ImportError:
            # Backwards-compatibility for legacy Trains SDK
            try:
                from trains import Task
                from trains.binding.frameworks.tensorflow_bind import WeightsGradientHistHelper
            except ImportError:
                raise RuntimeError(
                    "This contrib module requires clearml to be installed. "
                    "You may install clearml using: \n pip install clearml \n")

        # Everything that is not one of the explicitly handled keys is passed
        # straight through to Task.init().
        explicit_keys = ("project_name", "task_name", "task_type")
        experiment_kwargs = {}
        for key, value in kwargs.items():
            if key not in explicit_keys:
                experiment_kwargs[key] = value

        if not self.bypass_mode():
            # Try to retrieve the current ClearML Task before trying to create a new one
            self._task = Task.current_task()
            if self._task is None:
                self._task = Task.init(
                    project_name=kwargs.get("project_name"),
                    task_name=kwargs.get("task_name"),
                    task_type=kwargs.get("task_type", Task.TaskTypes.training),
                    **experiment_kwargs,
                )
        else:
            warnings.warn("ClearMLSaver: running in bypass mode")

            class _Stub(object):
                """Stand-in that absorbs calls, attribute reads and writes."""

                def __call__(self, *_: Any, **__: Any) -> "_Stub":
                    return self

                def __getattr__(self, attr: str) -> "_Stub":
                    # ``name`` and ``id`` read as empty strings; anything
                    # else resolves to the stub itself.
                    return "" if attr in ("name", "id") else self  # type: ignore[return-value]

                def __setattr__(self, attr: str, val: Any) -> None:
                    pass

            self._task = _Stub()

        self.clearml_logger = self._task.get_logger()

        self.grad_helper = WeightsGradientHistHelper(
            logger=self.clearml_logger)
def model_prune(task_args):
    """Prune the weights produced by a previous task and upload the result.

    Fetches the ``unpruned_weights`` artifact of the task identified by
    ``task_args.trains_model_task``, runs ``tlt_prune`` on it, and uploads
    ``task_args.output_file`` (environment variables expanded) to the current
    task as ``pruned_weights``.
    """
    # Create an output directory if it doesn't exist.
    mkdir_command = "mkdir -p /home/{}/experiment_dir_pruned".format(
        task_args.arch)
    get_output(mkdir_command)

    source_task = Task.get_task(task_id=task_args.trains_model_task)
    unpruned_weights = source_task.artifacts["unpruned_weights"].get_local_copy()
    tlt_prune(task_args, unpruned_weights)

    current = Task.current_task()
    pruned_path = os.path.expandvars("{}".format(task_args.output_file))
    current.upload_artifact(name="pruned_weights", artifact_object=pruned_path)
 def remote_run_experiment(self):
     """Create and enqueue one child task per parameter setup.

     Each created task gets the current task as its parent and the
     parameter setup connected to it before being enqueued on
     ``self._queue``.
     """
     for parameter_setup in self._parameter_setups:
         print(parameter_setup)
         child_task = Task.create(
             project_name=f"{self._project_name}",
             task_name=self.make_task_name(parameter_setup),
             repo=self._repo,
             branch=self._branch,
             script=self._script,
             requirements_file="../requirements.txt"
         )
         parent_id = Task.current_task().id
         child_task.set_parent(parent_id)
         child_task.connect(parameter_setup)
         Task.enqueue(child_task, self._queue)
Esempio n. 11
0
def predictions_gt_images_handler(engine, logger, *args, **kwargs):
    """Log a 4x4 grid of batch images titled with prediction vs. label.

    Reads the images from ``engine.state.batch`` and the predictions/labels
    from ``engine.state.output``, registers the class-name enumeration on the
    current task, and writes the figure through ``logger.writer`` with the
    engine's epoch as the global step. Extra positional/keyword arguments are
    accepted and ignored.
    """
    x, _ = engine.state.batch
    y_pred, y = engine.state.output

    num_x = num_y = 4
    fig = plt.figure(figsize=(20, 20))
    trans = transforms.ToPILImage()
    classes = (
        "plane",
        "car",
        "bird",
        "cat",
        "deer",
        "dog",
        "frog",
        "horse",
        "ship",
        "truck",
    )
    # Class name -> 1-based index.
    enumeration = dict(zip(classes, range(1, len(classes) + 1)))
    Task.current_task().connect_label_enumeration(enumeration)

    for idx in range(num_x * num_y):
        class_probs = F.softmax(y_pred[idx], dim=0)
        preds = torch.argmax(class_probs)
        probs = torch.max(class_probs)

        ax = fig.add_subplot(num_x, num_y, idx + 1, xticks=[], yticks=[])
        ax.imshow(trans(x[idx]))

        title = "{0} {1:.1f}% (label: {2})".format(classes[preds], probs * 100,
                                                   classes[y[idx]])
        # Green title for a correct prediction, red otherwise.
        if preds == y[idx]:
            title_color = "green"
        else:
            title_color = "red"
        ax.set_title(title, color=title_color)

    logger.writer.add_figure("predictions vs actuals",
                             figure=fig,
                             global_step=engine.state.epoch)
Esempio n. 12
0
def train_unpruned(model_name):
    """Train, list the results directory, and upload the produced files.

    After ``train_tlt()`` the ``Args/results_dir`` task parameter is used to
    locate ``weights/<model_name>.tlt`` and ``graph.pbtxt``; both are
    uploaded as artifacts of the current task.
    """
    train_tlt()
    tlt_task = Task.current_task()
    results_dir = tlt_task.get_parameter("Args/results_dir")
    get_output("ls -lh {}".format(results_dir))

    weights_path = os.path.expandvars(
        "{}/weights/{}.tlt".format(results_dir, model_name))
    tlt_task.upload_artifact(
        name="unpruned_weights",
        artifact_object=weights_path,
    )

    graph_path = os.path.expandvars("{}/graph.pbtxt".format(results_dir))
    tlt_task.upload_artifact(
        name="pbtxt model configuration file",
        artifact_object=graph_path,
    )
Esempio n. 13
0
def compute_and_log_cm(cm_metric, iteration):
    """Compute the recall-normalized confusion matrix and report it on rank 0.

    :param cm_metric: metric object whose ``compute()`` yields the matrix.
    :param iteration: iteration number attached to the report.
    """
    # CM: values are normalized such that diagonal values represent class recalls
    raw_cm = cm_metric.compute()
    cm = ConfusionMatrix.normalize(raw_cm, "recall").cpu().numpy()

    # Only rank 0 reports.
    if idist.get_rank() != 0:
        return

    from clearml import Task

    clearml_logger = Task.current_task().get_logger()
    class_names = data.VOCSegmentationOpencv.target_names
    clearml_logger.report_confusion_matrix(
        title="Final Confusion Matrix",
        series="cm-preds-gt",
        matrix=cm,
        iteration=iteration,
        xlabels=class_names,
        ylabels=class_names,
    )
Esempio n. 14
0
        def compute_and_log_cm():
            """Report the recall-normalized confusion matrix from rank 0.

            Uses ``cm_metric`` and ``trainer`` from the enclosing scope; the
            report is tagged with ``trainer.state.iteration``.
            """
            # CM: values are normalized such that diagonal values represent class recalls
            raw_cm = cm_metric.compute()
            cm = ConfusionMatrix.normalize(raw_cm, "recall").cpu().numpy()

            # Only rank 0 reports.
            if idist.get_rank() != 0:
                return

            try:
                from clearml import Task
            except ImportError:
                # Backwards-compatibility for legacy Trains SDK
                from trains import Task

            clearml_logger = Task.current_task().get_logger()
            current_iteration = trainer.state.iteration
            class_names = VOCSegmentationOpencv.target_names
            clearml_logger.report_confusion_matrix(
                title="Final Confusion Matrix",
                series="cm-preds-gt",
                matrix=cm,
                iteration=current_iteration,
                xlabels=class_names,
                ylabels=class_names,
            )
Esempio n. 15
0
    def _daemon(cls, jupyter_notebook_filename):
        """Observer loop that keeps the current task's script in sync with a notebook.

        Until ``cls._exit_event`` is set, each iteration waits on
        ``cls._sync_event`` (or its timeout), obtains the notebook code either
        from the notebook file or from the IPython history, optionally uploads
        the notebook and an HTML preview as artifacts, derives pip/conda
        requirements, and updates the task's script diff and requirements.
        Iterations are skipped when there is no current task, when the
        notebook file is missing or unmodified, or when the code hash did not
        change. Any exception raised inside an iteration is swallowed.

        :param jupyter_notebook_filename: path of the notebook file to watch.
            When falsy, a temporary ``.ipynb`` file is created and the code is
            taken from the history logger or the IPython ``history`` magic.
        :return: ``None``; returns immediately (after logging a warning) when
            the nbconvert ``ScriptExporter`` cannot be loaded.
        """
        from clearml import Task

        # load jupyter notebook package
        # noinspection PyBroadException
        try:
            # noinspection PyPackageRequirements
            from nbconvert.exporters.script import ScriptExporter
            _script_exporter = ScriptExporter()
        except Exception as ex:
            _logger.warning('Could not read Jupyter Notebook: {}'.format(ex))
            return
        # load pigar
        # noinspection PyBroadException
        try:
            from ....utilities.pigar.reqs import get_installed_pkgs_detail, file_import_modules
            from ....utilities.pigar.modules import ReqsModules
            from ....utilities.pigar.log import logger
            logger.setLevel(logging.WARNING)
        except Exception:
            # without pigar the requirements-analysis branch below is skipped
            # (it is guarded by `if file_import_modules and ...`)
            file_import_modules = None
        # load IPython
        # noinspection PyBroadException
        try:
            # noinspection PyPackageRequirements
            from IPython import get_ipython
        except Exception:
            # should not happen
            get_ipython = None

        # setup local notebook files
        if jupyter_notebook_filename:
            notebook = Path(jupyter_notebook_filename)
            local_jupyter_filename = jupyter_notebook_filename
        else:
            # no notebook file: reserve a temp .ipynb path for the history dump
            notebook = None
            fd, local_jupyter_filename = mkstemp(suffix='.ipynb')
            os.close(fd)
        last_update_ts = None
        counter = 0
        prev_script_hash = None

        # (top-level package name, version) used to make sure our own package
        # is listed among the installed packages further down
        # noinspection PyBroadException
        try:
            from ....version import __version__
            our_module = cls.__module__.split('.')[0], __version__
        except Exception:
            our_module = None

        # pattern matching `get_ipython()` at the start of a line (after
        # optional indentation); used to comment such lines out
        # noinspection PyBroadException
        try:
            import re
            replace_ipython_pattern = re.compile(r'\n([ \t]*)get_ipython\(\)')
        except Exception:
            replace_ipython_pattern = None

        # main observer loop, check if we need to exit
        # NOTE(review): presumably _exit_event is a threading.Event, in which
        # case wait(timeout=0.) is a non-blocking check - confirm
        while not cls._exit_event.wait(timeout=0.):
            # wait for timeout or sync event
            # (the first iteration, counter == 0, uses _first_sample_frequency)
            cls._sync_event.wait(cls._sample_frequency if counter else cls._first_sample_frequency)

            cls._sync_event.clear()
            counter += 1
            # noinspection PyBroadException
            try:
                # if there is no task connected, do nothing
                task = Task.current_task()
                if not task:
                    continue

                script_code = None
                fmodules = None
                current_cell = None
                # if we have a local file:
                if notebook:
                    if not notebook.exists():
                        continue
                    # check if notebook changed
                    if last_update_ts is not None and notebook.stat().st_mtime - last_update_ts <= 0:
                        continue
                    last_update_ts = notebook.stat().st_mtime
                else:
                    # serialize notebook to a temp file
                    if cls._jupyter_history_logger:
                        script_code, current_cell = cls._jupyter_history_logger.history_to_str()
                    else:
                        # noinspection PyBroadException
                        try:
                            # remove the previous dump (best effort) before
                            # writing the history to the same path again
                            # noinspection PyBroadException
                            try:
                                os.unlink(local_jupyter_filename)
                            except Exception:
                                pass
                            get_ipython().run_line_magic('history', '-t -f {}'.format(local_jupyter_filename))
                            with open(local_jupyter_filename, 'r') as f:
                                script_code = f.read()
                            # load the modules
                            # (registers the top-level name of every entry in sys.modules)
                            from ....utilities.pigar.modules import ImportedModules
                            fmodules = ImportedModules()
                            for nm in set([str(m).split('.')[0] for m in sys.modules]):
                                fmodules.add(nm, 'notebook', 0)
                        except Exception:
                            continue

                # get notebook python script
                # (only reached with script_code unset when watching a notebook file)
                if script_code is None and local_jupyter_filename:
                    script_code, _ = _script_exporter.from_filename(local_jupyter_filename)
                    if cls._store_notebook_artifact:
                        # also upload the jupyter notebook as artifact
                        # NOTE(review): datetime.utcnow() is deprecated since
                        # Python 3.12 - consider a timezone-aware timestamp
                        task.upload_artifact(
                            name='notebook',
                            artifact_object=Path(local_jupyter_filename),
                            preview='See `notebook preview` artifact',
                            metadata={'UPDATE': datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')},
                            wait_on_upload=True,
                        )
                        # render the notebook to HTML and upload it as a
                        # second artifact; failures here are ignored
                        # noinspection PyBroadException
                        try:
                            from nbconvert.exporters import HTMLExporter  # noqa
                            html, _ = HTMLExporter().from_filename(filename=local_jupyter_filename)
                            local_html = Path(gettempdir()) / 'notebook_{}.html'.format(task.id)
                            with open(local_html.as_posix(), 'wt') as f:
                                f.write(html)
                            task.upload_artifact(
                                name='notebook preview', artifact_object=local_html,
                                preview='Click `FILE PATH` link',
                                metadata={'UPDATE': datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')},
                                delete_after_upload=True,
                                wait_on_upload=True,
                            )
                        except Exception:
                            pass

                # skip the update when the code (script + current cell) is
                # unchanged since the last successful update
                current_script_hash = hash(script_code + (current_cell or ''))
                if prev_script_hash and prev_script_hash == current_script_hash:
                    continue

                # remove ipython direct access from the script code
                # we will not be able to run them anyhow
                if replace_ipython_pattern:
                    script_code = replace_ipython_pattern.sub(r'\n# \g<1>get_ipython()', script_code)

                requirements_txt = ''
                conda_requirements = ''
                # parse jupyter python script and prepare pip requirements (pigar)
                # if backend supports requirements
                if file_import_modules and Session.check_min_api_version('2.2'):
                    if fmodules is None:
                        fmodules, _ = file_import_modules(
                            notebook.parts[-1] if notebook else 'notebook', script_code)
                        if current_cell:
                            cell_fmodules, _ = file_import_modules(
                                notebook.parts[-1] if notebook else 'notebook', current_cell)
                            # merge the current cell's imports (best effort)
                            # noinspection PyBroadException
                            try:
                                fmodules |= cell_fmodules
                            except Exception:
                                pass
                    # add current cell to the script
                    if current_cell:
                        script_code += '\n' + current_cell
                    fmodules = ScriptRequirements.add_trains_used_packages(fmodules)
                    # noinspection PyUnboundLocalVariable
                    installed_pkgs = get_installed_pkgs_detail()
                    # make sure we are in installed packages
                    if our_module and (our_module[0] not in installed_pkgs):
                        installed_pkgs[our_module[0]] = our_module

                    # keep only imported modules that map to an installed package
                    # noinspection PyUnboundLocalVariable
                    reqs = ReqsModules()
                    for name in fmodules:
                        if name in installed_pkgs:
                            pkg_name, version = installed_pkgs[name]
                            reqs.add(pkg_name, version, fmodules[name])
                    requirements_txt, conda_requirements = ScriptRequirements.create_requirements_txt(reqs)

                # update script
                prev_script_hash = current_script_hash
                data_script = task.data.script
                data_script.diff = script_code
                data_script.requirements = {'pip': requirements_txt, 'conda': conda_requirements}
                # noinspection PyProtectedMember
                task._update_script(script=data_script)
                # update requirements
                # noinspection PyProtectedMember
                task._update_requirements(requirements=requirements_txt)
            except Exception:
                # deliberately ignore any failure and retry on the next cycle
                pass
Esempio n. 16
0
def run(epochs, lr, momentum, log_interval, params, trainloader, testloader,
        model):
    """Train ``Net`` with ignite, log to TensorBoard and store the weights.

    :param epochs: maximum number of epochs passed to ``trainer.run``.
    :param lr: SGD learning rate.
    :param momentum: SGD momentum.
    :param log_interval: iteration interval for progress-bar updates.
    :param params: passed to ``Net``; its ``loss_report`` entry sets the
        iteration interval of the TensorBoard loss handler.
    :param trainloader: training data; also evaluated after every epoch.
    :param testloader: validation data evaluated after every epoch.
    :param model: object whose ``update_weights(weights_filename=...)`` is
        called with the saved weights file.
    """
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    net = Net(params).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=lr, momentum=momentum)

    trainer = create_supervised_trainer(net,
                                        optimizer,
                                        criterion,
                                        device=device)
    trainer.logger = setup_logger("trainer")

    val_metrics = dict(
        accuracy=Accuracy(),
        loss=Loss(criterion),
        recall=Recall(),
    )
    evaluator = create_supervised_evaluator(net,
                                            metrics=val_metrics,
                                            device=device)
    evaluator.logger = setup_logger("evaluator")

    # Attach handler to plot trainer's loss every `loss_report` iterations
    tb_logger = TensorboardLogger(log_dir="cifar-output")
    loss_event = Events.ITERATION_COMPLETED(every=params.get("loss_report"))
    tb_logger.attach_output_handler(
        trainer,
        event_name=loss_event,
        tag="training",
        output_transform=lambda loss: {"loss": loss},
    )

    # Attach handler to dump each engine's metrics every epoch completed
    tagged_engines = (("training", trainer), ("validation", evaluator))
    for tag, source_engine in tagged_engines:
        tb_logger.attach_output_handler(
            source_engine,
            event_name=Events.EPOCH_COMPLETED,
            tag=tag,
            metric_names="all",
            global_step_transform=global_step_from_engine(trainer),
        )

    # Attach function to build debug images and report every epoch end
    tb_logger.attach(
        evaluator,
        log_handler=predictions_gt_images_handler,
        event_name=Events.EPOCH_COMPLETED(once=1),
    )

    desc = "ITERATION - loss: {:.2f}"
    pbar = tqdm(initial=0,
                leave=False,
                total=len(trainloader),
                desc=desc.format(0))

    @trainer.on(Events.ITERATION_COMPLETED(every=log_interval))
    def log_training_loss(engine):
        pbar.desc = desc.format(engine.state.output)
        pbar.update(log_interval)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        pbar.refresh()
        evaluator.run(trainloader)
        results = evaluator.state.metrics
        avg_accuracy, avg_nll = results["accuracy"], results["loss"]
        tqdm.write(
            "Training Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(testloader)
        results = evaluator.state.metrics
        avg_accuracy, avg_nll = results["accuracy"], results["loss"]
        tqdm.write(
            "Validation Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))

        # Reset the progress bar for the next epoch.
        pbar.n = pbar.last_print_n = 0

    @trainer.on(Events.EPOCH_COMPLETED | Events.COMPLETED)
    def log_time():
        event_name = trainer.last_event_name.name
        tqdm.write("{} took {} seconds".format(
            event_name,
            trainer.state.times[event_name],
        ))

    trainer.run(trainloader, max_epochs=epochs)
    pbar.close()

    weights_path = "./cifar_net.pth"

    # save_condition depicts a custom condition for when to save the model.
    # The model is saved and then updated in ClearML
    save_condition = True

    if save_condition:
        torch.save(net.state_dict(), weights_path)
        model.update_weights(weights_filename=weights_path)
    print("Finished Training")
    print("Task ID number is: {}".format(Task.current_task().id))
Esempio n. 17
0
def _clearml_log_params(params_dict):
    """Connect ``params_dict`` to the current ClearML task."""
    from clearml import Task as task_cls

    task_cls.current_task().connect(params_dict)
Esempio n. 18
0
def _clearml_log_artifact(fp):
    """Upload the file at ``fp`` to the current ClearML task.

    The artifact is named after the final component of ``fp``.
    """
    from clearml import Task as task_cls

    current = task_cls.current_task()
    current.upload_artifact(Path(fp).name, fp)
Esempio n. 19
0
    },
    index=['falcon', 'dog', 'spider', 'fish'])

# Register the Pandas object `df` as an artifact named 'train' to watch
# (it will be monitored in the background and automatically synced and uploaded)
task.register_artifact('train',
                       df,
                       metadata={
                           'counting': 'legs',
                           'max legs': 69
                       })
# change the artifact object
# NOTE(review): presumably `df` is a pandas.DataFrame, whose sample() returns
# a new object; the result is discarded here, so `df` itself looks unchanged
# - confirm the intent
df.sample(frac=0.5, replace=True, random_state=1)
# or access it from anywhere using the Task's get_registered_artifacts()
# (the sampled result is discarded here as well)
Task.current_task().get_registered_artifacts()['train'].sample(frac=0.5,
                                                               replace=True,
                                                               random_state=1)

# add and upload pandas.DataFrame (onetime snapshot of the object)
task.upload_artifact('Pandas', artifact_object=df)
# add and upload local file artifact
task.upload_artifact('local file',
                     artifact_object=os.path.join('data_samples',
                                                  'dancing.jpg'))
# add and upload dictionary (stored as JSON)
task.upload_artifact('dictionary', df.to_dict())
# add and upload Numpy Object (stored as .npz file)
task.upload_artifact('Numpy Eye', np.eye(100, 100))
# add and upload Image (stored as .png file)
im = Image.open(os.path.join('data_samples', 'dancing.jpg'))
task.upload_artifact('pillow_image', im)