def set_credentials(cls, api_host: str = None, web_host: str = None, files_host: str = None,
                    key: str = None, secret: str = None) -> None:
    """
    Set new default TRAINS-server host and credentials.

    These configurations could be overridden by either OS environment variables
    or the trains.conf configuration file.

    Note: Credentials need to be set *prior* to Logger initialization.

    Args:
        api_host: Trains API server url, example: ``host='http://localhost:8008'``
        web_host: Trains WEB server url, example: ``host='http://localhost:8080'``
        files_host: Trains Files server url, example: ``host='http://localhost:8081'``
        key: user key/secret pair, example: ``key='thisisakey123'``
        secret: user key/secret pair, example: ``secret='thisisseceret123'``
    """
    Task.set_credentials(api_host=api_host, web_host=web_host, files_host=files_host,
                         key=key, secret=secret)
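# Usage sketch: the localhost URLs and key/secret below are placeholders, not real
# credentials. Credentials must be set before the first Task.init() / Logger creation.
from trains import Task

Task.set_credentials(api_host='http://localhost:8008',
                     web_host='http://localhost:8080',
                     files_host='http://localhost:8081',
                     key='thisisakey123',
                     secret='thisisseceret123')
task = Task.init(project_name='examples', task_name='credentials demo')  # illustrative names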
def update_config(config: OmegaConf):
    """
    Serialize and sync config with trains

    :param config: the config to sync
    :return: the config merged with whatever was loaded back from the server
    """
    # expected config_global format
    schema = OmegaConf.structured(config._metadata.object_type)

    # serialize config
    # For config logging we use yaml format (Trains: Artifacts -> Model configuration)
    # save config in a temp yaml file
    config_global_file = tempfile.NamedTemporaryFile("w+t")
    config_global_file.write(OmegaConf.to_yaml(config))
    config_global_file.flush()
    config_global_file_name = config_global_file.name

    # sync with server if a task has been created
    current_task = Task.current_task()
    if current_task:
        # send yaml to trains server
        config_global_file_name = current_task.connect_configuration(config_global_file_name)
        # for visualization (Trains: Hyperparameters)
        current_task.connect(generate_trains_hyperparameter_dict(config))

    config_back_ = OmegaConf.load(config_global_file_name)
    config_back = OmegaConf.merge(schema, config_back_)
    return config_back
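# Usage sketch for update_config(), assuming an OmegaConf structured config.
# The TrainConfig dataclass and its fields are hypothetical illustrations.
from dataclasses import dataclass

from omegaconf import OmegaConf
from trains import Task


@dataclass
class TrainConfig:
    lr: float = 0.01
    batch_size: int = 64


def example_update_config():
    Task.init(project_name='examples', task_name='omegaconf sync')  # creates the current task
    config = OmegaConf.structured(TrainConfig)
    # returns the config merged with whatever was (possibly) edited in the Trains UI
    synced = update_config(config)
    print(OmegaConf.to_yaml(synced))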
def mp_worker(arguments):
    print('sub process', os.getpid())
    inputs, the_time = arguments
    from random import randint
    additional_parameters = {'stuff_' + str(randint(0, 100)): 'some stuff ' + str(randint(0, 100))}
    Task.current_task().connect(additional_parameters)
    print(" Process %s\tWaiting %s seconds" % (inputs, the_time))
    time.sleep(int(the_time))
    print(" Process %s\tDONE" % inputs)
def __init__(
        self,
        pool_frequency=0.2,  # type: float
        default_execution_queue=None,  # type: Optional[str]
        pipeline_time_limit=None,  # type: Optional[float]
        auto_connect_task=True,  # type: Union[bool, Task]
        always_create_task=False,  # type: bool
        add_pipeline_tags=False,  # type: bool
):
    # type: (...) -> ()
    """
    Create a new pipeline controller. The newly created object will launch and monitor the new experiments.

    :param float pool_frequency: The polling frequency (in minutes) for monitoring experiments / states.
    :param str default_execution_queue: The execution queue to use if no execution queue is provided.
    :param float pipeline_time_limit: The maximum time (in minutes) for the entire pipeline process. The
        default is ``None``, indicating no time limit.
    :param bool auto_connect_task: Store pipeline arguments and configuration in the Task

        - ``True`` - The pipeline argument and configuration will be stored in the current Task. All arguments
          will be under the hyper-parameter section ``Pipeline``, and the pipeline DAG will be stored as a
          Task configuration object named ``Pipeline``.
        - ``False`` - Do not store with Task.
        - ``Task`` - A specific Task object to connect the pipeline with.
    :param bool always_create_task: Always create a new Task

        - ``True`` - No current Task initialized. Create a new task named ``Pipeline`` in the
          ``base_task_id`` project.
        - ``False`` - Use the :py:meth:`task.Task.current_task` (if exists) to report statistics.
    :param bool add_pipeline_tags: (default: False) if True, add `pipe: <pipeline_task_id>` tag to all
        steps (Tasks) created by this pipeline.
    """
    self._nodes = {}
    self._running_nodes = []
    self._start_time = None
    self._pipeline_time_limit = pipeline_time_limit * 60. if pipeline_time_limit else None
    self._default_execution_queue = default_execution_queue
    self._pool_frequency = pool_frequency * 60.
    self._thread = None
    self._stop_event = None
    self._experiment_created_cb = None
    self._add_pipeline_tags = add_pipeline_tags
    self._task = auto_connect_task if isinstance(auto_connect_task, Task) else Task.current_task()
    self._step_ref_pattern = re.compile(self._step_pattern)
    if not self._task and always_create_task:
        self._task = Task.init(
            project_name='Pipelines',
            task_name='Pipeline {}'.format(datetime.now()),
            task_type=Task.TaskTypes.controller,
        )
    # make sure all the created tasks are our children, as we are creating them
    if self._task:
        self._task.add_tags([self._tag])
    self._auto_connect_task = bool(auto_connect_task)
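# Usage sketch, assuming this __init__ belongs to trains' PipelineController; the
# step names, base task IDs, and queue name below are placeholders.
from trains.automation.controller import PipelineController

pipe = PipelineController(default_execution_queue='default', add_pipeline_tags=True)
pipe.add_step(name='stage_data', base_task_id='<data-task-id>')
pipe.add_step(name='stage_train', parents=['stage_data'], base_task_id='<train-task-id>')
pipe.start()
pipe.wait()
pipe.stop()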
def clone_and_queue(template_task: str, queue: str) -> Task:
    github_payload = os.getenv('GITHUB_EVENT_PATH')
    with open(github_payload, 'r') as f:
        payload = json.load(f)

    task = Task.get_task(task_id=template_task)
    # Clone the task to pipe to. This creates a task with status Draft whose parameters can be modified.
    cloned_task = Task.clone(source_task=task, name=f"{template_task} cloned task from Github")

    script_commit = payload.get("comment", {}).get("body", "").partition(" ")[2]
    selected_type, _, selected_value = script_commit.partition(" ")
    if selected_type and selected_value:
        data_script = cloned_task.data.script
        if selected_type == "branch":
            data_script.branch = selected_value
            data_script.tag = ""
            data_script.version_num = ""
        elif selected_type == "tag":
            data_script.branch = ""
            data_script.tag = selected_value
            data_script.version_num = ""
        elif selected_type == "commit":
            data_script.branch = ""
            data_script.tag = ""
            data_script.version_num = selected_value
        else:
            raise Exception(
                f"You must supply branch, tag or commit as type, not {selected_type}"
            )
        print(f"Change train script head to {selected_value} {selected_type}")
        # noinspection PyProtectedMember
        cloned_task._update_script(script=data_script)

    Task.enqueue(cloned_task.id, queue_name=queue)

    owner, repo = payload.get("repository", {}).get("full_name", "").split("/")
    if owner and repo:
        gh = login(token=os.getenv("GITHUB_TOKEN"))
        if gh:
            issue = gh.issue(owner, repo, payload.get("issue", {}).get("number"))
            if issue:
                issue.create_comment(
                    f"New task, id:{cloned_task.id} is in queue {queue}")
            else:
                print(
                    f'can not comment issue, {payload.get("issue", {}).get("number")}'
                )
        else:
            print(f"can not log in to gh, {os.getenv('GITHUB_TOKEN')}")
    return cloned_task
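# Hedged sketch of the GitHub event payload fields that clone_and_queue() consumes;
# the comment body format implied above is "<command> <type> <value>". All values
# below are illustrative, not taken from a real event.
example_payload = {
    "comment": {"body": "/train branch my-feature-branch"},
    "repository": {"full_name": "owner/repo"},
    "issue": {"number": 42},
}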
def main():
    # Create the experiment Task
    task = Task.init(project_name="examples", task_name="text reporting")

    print('reporting text logs')
    # report regular console print
    print('This is standard output test')
    # report stderr
    print('This is standard error test', file=sys.stderr)

    # Get the task logger,
    # You can also call Task.current_task().get_logger() from anywhere in your code.
    logger = task.get_logger()

    # report text based logs
    report_logs(logger)

    # force flush reports
    # If flush is not called, reports are flushed in the background every couple of seconds,
    # and at the end of the process execution
    logger.flush()

    print('We are done reporting, have a great day :)')
def transmit_artifacts(self, id):
    artifacts = (self.info[id][self.artifacts]
                 if self.artifacts in self.info[id].keys() else {})
    task = self.call_func(
        'Task.get_task', id,
        lambda id_: Task.get_task(project_name=PROJECT_NAME, task_name=id_),
        self.get_run_name_by_id(id))
    # Every supported artifact type is uploaded the same way, so a single loop
    # over the supported types replaces the original per-type branches.
    supported_types = {"folder", "text", "dataframe", "image", "dictionary", "storage-server"}
    for artifact_type, items in artifacts.items():
        if artifact_type in supported_types:
            for name, obj in items:
                task.upload_artifact(name=name, artifact_object=obj)
def __init__(self, logger: TrainsLogger = None, output_uri: str = None, dirname: str = None, *args, **kwargs):
    try:
        from trains import Task
    except ImportError:
        raise RuntimeError(
            "This contrib module requires trains to be installed. "
            "You may install trains using: \n pip install trains \n")

    if logger and not isinstance(logger, TrainsLogger):
        raise TypeError("logger must be an instance of TrainsLogger")

    self.task = Task.current_task()
    if not self.task:
        raise RuntimeError(
            "TrainsSaver requires a Trains Task to be initialized. "
            "Please use the `logger` argument or call `trains.Task.init()`."
        )

    if not dirname:
        dirname = tempfile.mkdtemp(prefix="ignite_checkpoints_{}".format(
            datetime.now().strftime("%Y_%m_%d_%H_%M_%S_")))
        warnings.warn(
            "TrainsSaver created a temporary checkpoints directory: {}".format(dirname))

    super(TrainsSaver, self).__init__(dirname=dirname, *args, **kwargs)

    if output_uri:
        self.task.output_uri = output_uri
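# Usage sketch, assuming this constructor is ignite's contrib TrainsSaver used as
# the save handler of ignite.handlers.Checkpoint; the output_uri and the trainer
# wiring below are illustrative.
from ignite.engine import Engine, Events
from ignite.handlers import Checkpoint


def attach_checkpointing(trainer: Engine, model):
    # Task.init() must have been called beforehand (see the constructor above)
    saver = TrainsSaver(output_uri='s3://my-bucket/checkpoints')  # placeholder bucket
    handler = Checkpoint({'model': model}, saver, n_saved=2)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, handler)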
def __init__(
        self,
        project_name: Optional[str] = None,
        task_name: Optional[str] = None,
        task_type: str = 'training',
        reuse_last_task_id: bool = True,
        output_uri: Optional[str] = None,
        auto_connect_arg_parser: bool = True,
        auto_connect_frameworks: bool = True,
        auto_resource_monitoring: bool = True
) -> None:
    super().__init__()
    if self._bypass:
        self._trains = None
    else:
        self._trains = Task.init(
            project_name=project_name,
            task_name=task_name,
            task_type=task_type,
            reuse_last_task_id=reuse_last_task_id,
            output_uri=output_uri,
            auto_connect_arg_parser=auto_connect_arg_parser,
            auto_connect_frameworks=auto_connect_frameworks,
            auto_resource_monitoring=auto_resource_monitoring
        )
def seed(self):
    for id in self.get_ids():
        task = self.call_func(
            'Task.create', id,
            lambda id_: Task.create(project_name=PROJECT_NAME, task_name=id_),
            self.get_run_name_by_id(id))
        self.call_func('transmit_information', id,
                       lambda id_: self.transmit_information(id_), id)
        self.call_func('transmit_metrics', id,
                       lambda id_: self.transmit_metrics(id_), id)
        self.call_func('transmit_artifacts', id,
                       lambda id_: self.transmit_artifacts(id_), id)
        task.mark_started()
        task.completed()
        output_log_web_page = task.get_output_log_web_page()
        url_parts = output_log_web_page.split('projects')
        project_id = url_parts[1].split('/')[1]
        self.project_link = url_parts[0] + '/projects/' + project_id
        self.migration_count += 1
        self.pbar.update(1)
def _verify_node(self, node):
    # type: (Node) -> bool
    """
    Raise ValueError on verification errors

    :return: Return True iff the specific node is verified
    """
    if not node.base_task_id:
        raise ValueError("Node '{}', base_task_id is empty".format(node.name))
    if not self._default_execution_queue and not node.queue:
        raise ValueError("Node '{}' missing execution queue, "
                         "no default queue defined and no specific node queue defined".format(node.name))
    task = Task.get_task(task_id=node.base_task_id)
    if not task:
        raise ValueError("Node '{}', base_task_id={} is invalid".format(node.name, node.base_task_id))
    pattern = self._step_ref_pattern
    for v in node.parameters.values():
        if isinstance(v, str):
            for g in pattern.findall(v):
                self.__verify_step_reference(node, g)
    return True
def __init__(self, task: Task = None, projectName: str = None, taskName: str = None,
             additionalLoggingValuesDict=None):
    """
    :param task: instance of trains.Task
    :param projectName: only necessary if task is not provided
    :param taskName: only necessary if task is not provided
    :param additionalLoggingValuesDict: additional values to include with every log entry
    """
    if task is None:
        if projectName is None or taskName is None:
            raise ValueError("Either the trains task or the project name and task name have to be provided")
        self.task = Task.init(project_name=projectName, task_name=taskName, reuse_last_task_id=False)
    else:
        if projectName is not None:
            log.warning(
                f"projectName parameter with value {projectName} passed even though task has been given, "
                f"will ignore this parameter"
            )
        if taskName is not None:
            log.warning(
                f"taskName parameter with value {taskName} passed even though task has been given, "
                f"will ignore this parameter"
            )
        self.task = task
    self.logger = self.task.get_logger()
    super().__init__(additionalLoggingValuesDict=additionalLoggingValuesDict)
def trains(self,
           x: data_type,
           y: data_type = None,
           x_cv: data_type = None,
           y_cv: data_type = None,
           *,
           trains_config: Dict[str, Any] = None,
           keep_task_open: bool = False,
           queue: str = None) -> "Wrapper":
    # without a trains_config there is nothing to sync, so fall back to a plain fit
    if trains_config is None:
        return self.fit(x, y, x_cv, y_cv)
    # init trains
    project_name = trains_config.get("project_name")
    task_name = trains_config.get("task_name")
    if queue is None:
        task = Task.init(**trains_config)
        cloned_task = None
    else:
        task = Task.get_task(project_name=project_name, task_name=task_name)
        cloned_task = Task.clone(source_task=task, parent=task.id)
    # before loop
    self._verbose_level = 6
    self._data_config["verbose_level"] = 6
    self._before_loop(x, y, x_cv, y_cv)
    self.pipeline.use_tqdm = False
    copied_config = shallow_copy_dict(self.config)
    if queue is not None:
        cloned_task.set_parameters(copied_config)
        Task.enqueue(cloned_task.id, queue)
        return self
    # loop
    task.connect(copied_config)
    global trains_logger
    trains_logger = task.get_logger()
    self._loop()
    if not keep_task_open:
        task.close()
        trains_logger = None
    return self
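# Usage sketch for the trains() method above; the Wrapper construction and the
# x_train / y_train arrays are hypothetical stand-ins for this codebase's data types.
wrapper = Wrapper()  # assumed constructor, defined elsewhere in this codebase
wrapper.trains(x_train, y_train,
               trains_config={"project_name": "examples",
                              "task_name": "wrapper training"})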
def setup_trains_logging(config):
    if config["with_trains"]:
        from trains import Task

        task = Task.init("Carbon Black Semantic Segmentation Training", config["task_name"])
        task.connect_configuration(config)
        # Log hyper parameters
        hyper_parameters = list(config.keys())
        task.connect({k: config[k] for k in hyper_parameters})
def delete_all_tasks_from_project(pr_name):
    # type: (str) -> ()
    """
    Delete all the Tasks in the given project (forced delete, including non-draft Tasks).

    :param str pr_name: name of the project to clear
    """
    client = APIClient()
    tasks = Task.get_tasks(project_name=pr_name)
    for task in tasks:
        client.tasks.delete(task=task.id, force=True)
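# Usage sketch: this irreversibly force-deletes every task in the given project,
# so guard it behind an explicit confirmation. 'old-experiments' is a placeholder name.
if input("Delete ALL tasks in 'old-experiments'? [y/N] ").strip().lower() == 'y':
    delete_all_tasks_from_project('old-experiments')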
def run(num_workers):
    """ Distributed Synchronous SGD Example """
    th.manual_seed(1234)
    train_set, bsz = partition_dataset(num_workers)
    model = Net()
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
    num_batches = ceil(len(train_set.dataset) / float(bsz))

    from random import randint
    param = {'worker_{}_stuff'.format(dist.get_rank()): 'some stuff ' + str(randint(0, 100))}
    Task.current_task().connect(param)
    Task.current_task().upload_artifact(
        'temp {:02d}'.format(dist.get_rank()),
        artifact_object={'worker_rank': dist.get_rank()})

    for epoch in range(2):
        epoch_loss = 0.0
        for i, (data, target) in enumerate(train_set):
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            epoch_loss += loss.item()
            loss.backward()
            average_gradients(model)
            optimizer.step()
            if i % 10 == 0:
                print('{}] Train Epoch {} - {} \tLoss  {:.6f}'.format(dist.get_rank(), epoch, i, loss))
                Task.current_task().get_logger().report_scalar(
                    'loss', 'worker {:02d}'.format(dist.get_rank()),
                    value=loss.item(), iteration=i)
            if i > 100:
                break
        print('Rank ', dist.get_rank(), ', epoch ', epoch, ': ', epoch_loss / num_batches)
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=2, metavar='N',
                        help='number of epochs to train (default: 2)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    task = Task.init(project_name='examples', task_name='pytorch with tensorboardX')

    writer = SummaryWriter('runs')
    writer.add_text('TEXT', 'This is some text', 0)

    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    kwargs = {'num_workers': 4, 'pin_memory': True} if args.cuda else {}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))])),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))])),
        batch_size=args.batch_size, shuffle=True, **kwargs)

    model = Net()
    if args.cuda:
        model.cuda()
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    for epoch in range(1, args.epochs + 1):
        train(model, epoch, train_loader, args, optimizer, writer)
        torch.save(model, os.path.join(gettempdir(), 'model{}'.format(epoch)))
    test(model, test_loader, args, optimizer, writer)
def TrainModel(model, base_model, model_name):
    task = Task.init(project_name="Ex3ModelTrains", task_name=model_name)
    reporter = TrainsReporter()

    # Show a summary of the model. Check the number of trainable parameters
    model.summary()

    # Compile the model
    model.compile(loss=keras.losses.BinaryCrossentropy(from_logits=True),
                  optimizer=keras.optimizers.Adam(),
                  metrics=[metrics.BinaryAccuracy()])

    # Train the model
    model.fit(train_ds,
              steps_per_epoch=train_ds.samples / train_ds.batch_size,
              epochs=20,
              validation_data=valid_ds,
              validation_steps=valid_ds.samples / valid_ds.batch_size,
              callbacks=[reporter],
              verbose=1)

    # Unfreeze the base_model. Note that it keeps running in inference mode
    # since we passed `training=False` when calling it. This means that
    # the batchnorm layers will not update their batch statistics.
    # This prevents the batchnorm layers from undoing all the training
    # we've done so far.
    base_model.trainable = True
    reporter.epoch_ref = 20

    # evaluate once and reuse the score instead of re-running evaluation per print
    score = model.evaluate(test_ds)
    print('Test evaluation Score:', score)
    print('validation evaluation Score:', model.evaluate(valid_ds))

    model.compile(
        optimizer=keras.optimizers.Adam(1e-5),  # Low learning rate
        loss=keras.losses.BinaryCrossentropy(from_logits=True),
        metrics=[keras.metrics.BinaryAccuracy()],
    )

    model.fit(train_ds,
              steps_per_epoch=train_ds.samples / train_ds.batch_size,
              epochs=10,
              validation_data=valid_ds,
              validation_steps=valid_ds.samples / valid_ds.batch_size,
              callbacks=[reporter],
              verbose=1)

    score = model.evaluate(test_ds)
    print('Test evaluation Score:', score)
    print('validation evaluation Score:', model.evaluate(valid_ds))
def main():
    parser = ArgumentParser()
    parser.add_argument(
        "--run",
        help="Run the autoscaler after wizard finished",
        action="store_true",
        default=False,
    )
    args = parser.parse_args()

    if running_remotely():
        hyper_params = AwsAutoScaler.Settings().as_dict()
        configurations = AwsAutoScaler.Configuration().as_dict()
    else:
        print("AWS Autoscaler setup\n")
        config_file = Path(CONF_FILE).absolute()
        if config_file.exists() and input_bool(
                "Load configurations from config file '{}' [Y/n]? ".format(str(CONF_FILE)),
                default=True,
        ):
            with config_file.open("r") as f:
                conf = yaml.load(f, Loader=yaml.SafeLoader)
            hyper_params = conf["hyper_params"]
            configurations = conf["configurations"]
        else:
            configurations, hyper_params = run_wizard()
            try:
                with config_file.open("w+") as f:
                    conf = {
                        "hyper_params": hyper_params,
                        "configurations": configurations,
                    }
                    yaml.safe_dump(conf, f)
            except Exception:
                print("Error! Could not write configuration file at: {}".format(str(CONF_FILE)))
                return

    task = Task.init(project_name="Auto-Scaler", task_name="AWS Auto-Scaler")
    task.connect(hyper_params)
    task.connect_configuration(configurations)

    autoscaler = AwsAutoScaler(hyper_params, configurations)
    if running_remotely() or args.run:
        autoscaler.start()
def trains_log_text(text):
    logger = None
    if "TRAINS_STD_LOGGER" in globals():
        logger = TRAINS_STD_LOGGER
    else:
        try:
            from trains import Task
            logger = Task.current_task().get_logger()
        except Exception:
            pass
    if logger is None:
        return
    logger.report_text(text)
    logger.flush()
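# Usage sketch: safe to call whether or not a TRAINS task/logger exists; when no
# logger can be resolved, the call silently returns without reporting.
trains_log_text('validation finished, best score so far: 0.93')  # illustrative message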
def transmit_metrics(self, id):
    task = self.call_func(
        'Task.get_task', id,
        lambda id_: Task.get_task(project_name=PROJECT_NAME, task_name=id_),
        self.get_run_name_by_id(id))
    logger = task.get_logger()
    metrics = self.get_metrics(id)
    for graph_name, series_name, table in metrics:
        for p in table:
            logger.report_scalar(graph_name, series_name,
                                 iteration=p[0], value=float(p[1]))
    task.completed()
def transmit_information(self, id):
    parameters = self.get_params(id)
    general_information = self.get_general_information(id)
    artifact = self.get_artifact(id)
    tags = self.get_tags(id)
    task = self.call_func(
        'Task.get_task', id,
        lambda id_: Task.get_task(project_name=PROJECT_NAME, task_name=id_),
        self.get_run_name_by_id(id))
    task_values = self.call_func('task.export_task', id,
                                 lambda _: task.export_task(),
                                 self.get_run_name_by_id(id))
    # dict.get() with a default replaces the original `x if k in d.keys() else y` branches
    task_values["comment"] = tags.get("note.content", "")
    task_values["hyperparams"]["Args"] = parameters
    task_values["started"] = general_information["started"]
    task_values["completed"] = general_information["completed"]
    task_values["script"]["branch"] = tags.get("source.git.branch", self.branch)
    task_values["script"]["repository"] = tags.get("source.git.repoURL", "")
    task_values["script"]["version_num"] = tags.get("source.git.commit", "")
    task_values["script"]["entry_point"] = tags["entry_point"]
    task_values["script"]["working_dir"] = tags["working_dir"]
    if "project.env" in tags.keys():
        task_values["script"]["requirements"][tags["project.env"]] = artifact.get("requirements", "")
    task_values["user"] = tags["user"]
    self.call_func('task.update_task', id,
                   lambda _task_values: task.update_task(_task_values),
                   task_values)
    if len(tags["VALUETAG"].keys()) > 0:
        self.call_func(
            'task.connect_configuration', id,
            lambda _dict: task.connect_configuration(_dict, name="MLflow Tags"),
            tags["VALUETAG"])
def compute_and_log_cm():
    cm = cm_metric.compute()
    # CM: values are normalized such that diagonal values represent class recalls
    cm = ConfusionMatrix.normalize(cm, "recall").cpu().numpy()
    if idist.get_rank() == 0:
        from trains import Task

        trains_logger = Task.current_task().get_logger()
        trains_logger.report_confusion_matrix(
            title="Final Confusion Matrix",
            series="cm-preds-gt",
            matrix=cm,
            iteration=trainer.state.iteration,
            xlabels=VOCSegmentationOpencv.target_names,
            ylabels=VOCSegmentationOpencv.target_names,
        )
def __init__(self, *_, **kwargs):
    try:
        from trains import Task
        from trains.binding.frameworks.tensorflow_bind import WeightsGradientHistHelper
    except ImportError:
        raise RuntimeError(
            "This contrib module requires trains to be installed. "
            "You may install trains using: \n pip install trains \n")

    experiment_kwargs = {
        k: v for k, v in kwargs.items()
        if k not in ("project_name", "task_name", "task_type")
    }

    if self.bypass_mode():
        warnings.warn("TrainsSaver: running in bypass mode")

        class _Stub(object):
            def __call__(self, *_, **__):
                return self

            def __getattr__(self, attr):
                if attr in ("name", "id"):
                    return ""
                return self

            def __setattr__(self, attr, val):
                pass

        self._task = _Stub()
    else:
        self._task = Task.init(
            project_name=kwargs.get("project_name"),
            task_name=kwargs.get("task_name"),
            task_type=kwargs.get("task_type", Task.TaskTypes.training),
            **experiment_kwargs,
        )

    self.trains_logger = self._task.get_logger()
    self.grad_helper = WeightsGradientHistHelper(logger=self.trains_logger)
def read(self):
    self.thread_id = threading.current_thread().ident
    for id, path in self.paths:
        self.info[id] = {}
        self.call_func(
            'read_general_information', id,
            lambda id_, path_: self.read_general_information(id_, path_),
            id, path)
        self.call_func('read_tags', id,
                       lambda id_, path_: self.read_tags(id_, path_),
                       id, path + self.tags)
        if "runName" in self.info[id][self.tags].keys():
            self.ID_to_Name[id] = self.info[id][self.tags]["runName"]
        if self.project_exist:
            task = self.call_func(
                'Task.get_task', id,
                lambda id_: Task.get_task(project_name=PROJECT_NAME, task_name=id_),
                self.get_run_name_by_id(id))
            if task:
                task_tags = task.data.system_tags if hasattr(task.data, 'system_tags') else task.data.tags
                if ARCHIVED_TAG not in task_tags:
                    del self.info[id]
                    self.msgs['FAILED'].append(
                        'task ' + id + ' already exists; if you want to migrate it again, '
                        'you can archive it in Allegro Trains')
                    self.pbar.update(1)
                    continue
        self.call_func('read_artifacts', id,
                       lambda id_, path_: self.read_artifacts(id_, path_),
                       id, path + self.artifacts)
        self.call_func('read_metrics', id,
                       lambda id_, path_: self.read_metrics(id_, path_),
                       id, path + self.metrics)
        self.call_func('read_params', id,
                       lambda id_, path_: self.read_params(id_, path_),
                       id, path + self.params)
def run(config, logger=None, local_rank=0, **kwargs):
    assert torch.cuda.is_available()
    assert torch.backends.cudnn.enabled, "Nvidia/Amp requires cudnn backend to be enabled."

    task = Task.init("ignite", "DeeplabV3_ResNet101 pascal_voc2012 segmentation example")

    dist.init_process_group("nccl", init_method="env://")

    # As we passed config with option --manual_config_load
    assert hasattr(config, "setup"), (
        "We need to manually setup the configuration, please set --manual_config_load "
        "to py_config_runner")
    config = config.setup()

    assert_config(config, TRAINVAL_CONFIG)
    # The following attributes are automatically added by py_config_runner
    assert hasattr(config, "config_filepath") and isinstance(config.config_filepath, Path)
    assert hasattr(config, "script_filepath") and isinstance(config.script_filepath, Path)

    # dump python files to reproduce the run
    task.connect_configuration(config.config_filepath.as_posix())
    task.upload_artifact("script", config.script_filepath)

    config.output_path = Path("./artifacts")

    # log the configuration, if we are the master node
    if dist.get_rank() == 0:
        task.connect(get_params(config, TRAINVAL_CONFIG))

    try:
        training(config, local_rank=local_rank, with_trains_logging=True)
    except KeyboardInterrupt:
        logger.info("Caught KeyboardInterrupt -> exit")
    except Exception as e:  # noqa
        logger.exception("")
        dist.destroy_process_group()
        raise e

    dist.destroy_process_group()
def run(config, **kwargs):
    """This is the main method to run the training.

    As this training script is launched with `py_config_runner`, it must contain a
    `run(config, **kwargs)` method.
    """
    assert torch.cuda.is_available(), torch.cuda.is_available()
    assert torch.backends.cudnn.enabled, "Nvidia/Amp requires cudnn backend to be enabled."

    with idist.Parallel(backend="nccl") as parallel:
        logger = setup_logger(name="Pascal-VOC12 Training", distributed_rank=idist.get_rank())

        assert_config(config, TRAINVAL_CONFIG)
        # The following attributes are automatically added by py_config_runner
        assert hasattr(config, "config_filepath") and isinstance(config.config_filepath, Path)
        assert hasattr(config, "script_filepath") and isinstance(config.script_filepath, Path)

        if idist.get_rank() == 0 and exp_tracking.has_trains:
            from trains import Task

            task = Task.init("Pascal-VOC12 Training", config.config_filepath.stem)
            task.connect_configuration(config.config_filepath.as_posix())

        log_basic_info(logger, config)

        config.output_path = Path(exp_tracking.get_output_path())
        # dump python files to reproduce the run
        exp_tracking.log_artifact(config.config_filepath.as_posix())
        exp_tracking.log_artifact(config.script_filepath.as_posix())
        exp_tracking.log_params(get_params(config, TRAINVAL_CONFIG))

        try:
            parallel.run(training, config, logger=logger)
        except KeyboardInterrupt:
            logger.info("Caught KeyboardInterrupt -> exit")
        except Exception as e:  # noqa
            logger.exception("")
            raise e
def main():
    # Create the experiment Task
    task = Task.init(project_name="examples", task_name="scalar reporting")

    print('reporting scalar graphs')

    # Get the task logger,
    # You can also call Task.current_task().get_logger() from anywhere in your code.
    logger = task.get_logger()

    # report scalars
    report_scalars(logger)

    # force flush reports
    # If flush is not called, reports are flushed in the background every couple of seconds,
    # and at the end of the process execution
    logger.flush()

    print('We are done reporting, have a great day :)')
def __init__(self,
             project_name: Optional[str] = None,
             task_name: Optional[str] = None,
             task_type: str = 'training',
             reuse_last_task_id: bool = True,
             output_uri: Optional[str] = None,
             auto_connect_arg_parser: bool = True,
             auto_connect_frameworks: bool = True,
             auto_resource_monitoring: bool = True) -> None:
    if not _TRAINS_AVAILABLE:
        raise ImportError(
            'You want to use `trains` logger which is not installed yet,'
            ' install it with `pip install trains`.')
    super().__init__()

    if self.bypass_mode():
        self._trains = None
        print('TRAINS Task: running in bypass mode')
        print('TRAINS results page: disabled')

        class _TaskStub(object):
            def __call__(self, *args, **kwargs):
                return self

            def __getattr__(self, attr):
                if attr in ('name', 'id'):
                    return ''
                return self

            def __setattr__(self, attr, val):
                pass

        self._trains = _TaskStub()
    else:
        self._trains = Task.init(
            project_name=project_name,
            task_name=task_name,
            task_type=task_type,
            reuse_last_task_id=reuse_last_task_id,
            output_uri=output_uri,
            auto_connect_arg_parser=auto_connect_arg_parser,
            auto_connect_frameworks=auto_connect_frameworks,
            auto_resource_monitoring=auto_resource_monitoring)
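# Usage sketch, assuming this __init__ belongs to a PyTorch-Lightning TrainsLogger
# (older pytorch_lightning releases shipped one under pytorch_lightning.loggers);
# the project/task names below are placeholders.
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TrainsLogger

trains_logger = TrainsLogger(project_name='examples', task_name='lightning run')
trainer = Trainer(logger=trains_logger, max_epochs=3)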