Example #1
    def set_credentials(cls,
                        api_host: str = None,
                        web_host: str = None,
                        files_host: str = None,
                        key: str = None,
                        secret: str = None) -> None:
        """
        Set new default TRAINS-server host and credentials.
        These settings can be overridden by OS environment variables
        or the trains.conf configuration file.

        Note:
            Credentials need to be set *prior* to Logger initialization.

        Args:
            api_host: Trains API server url, example: ``host='http://localhost:8008'``
            web_host: Trains WEB server url, example: ``host='http://localhost:8080'``
            files_host: Trains Files server url, example: ``host='http://localhost:8081'``
            key: user key/secret pair, example: ``key='thisisakey123'``
            secret: user key/secret pair, example: ``secret='thisisseceret123'``
        """
        Task.set_credentials(api_host=api_host,
                             web_host=web_host,
                             files_host=files_host,
                             key=key,
                             secret=secret)
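A minimal usage sketch: the wrapper class exposing this classmethod simply forwards to Task.set_credentials, so calling Task.set_credentials directly behaves the same way. The hosts and key/secret below are illustrative placeholders.

from trains import Task

# credentials must be set before Task.init() for them to take effect
Task.set_credentials(
    api_host='http://localhost:8008',
    web_host='http://localhost:8080',
    files_host='http://localhost:8081',
    key='thisisakey123',
    secret='thisisasecret123',
)
task = Task.init(project_name='examples', task_name='credentials demo')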
Example #2
def update_config(config: OmegaConf):
    """
    Serialize and sync config with trains
    :param config: the config to sync
    :return:
    """
    # the expected structure (schema) comes from the structured config's backing type
    schema = OmegaConf.structured(config._metadata.object_type)

    # serialize config
    # For config logging we use yaml format (Trains: Artifacts ->  Model configuration)
    # save config in a temp yaml file
    config_global_file = tempfile.NamedTemporaryFile("w+t")
    config_global_file.write(OmegaConf.to_yaml(config))
    config_global_file.flush()
    config_global_file_name = config_global_file.name

    # sync with server if a task has been created
    current_task = Task.current_task()
    if current_task:
        # send yaml to trains server
        config_global_file_name = Task.current_task().connect_configuration(
            config_global_file_name)

        # for visualization (Trains: Hyperparameters)
        Task.current_task().connect(
            generate_trains_hyperparameter_dict(config))

    config_back_ = OmegaConf.load(config_global_file_name)
    config_back = OmegaConf.merge(schema, config_back_)

    return config_back
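A hedged usage sketch for update_config, assuming a dataclass-backed structured config (the Cfg schema below is hypothetical) and that the helpers referenced inside update_config (tempfile, Task, generate_trains_hyperparameter_dict) are importable from the original module.

from dataclasses import dataclass
from omegaconf import OmegaConf
from trains import Task

@dataclass
class Cfg:                      # hypothetical schema for illustration
    lr: float = 0.001
    epochs: int = 10

task = Task.init(project_name='examples', task_name='omegaconf sync')
cfg = OmegaConf.structured(Cfg)     # structured config carries its schema in _metadata
cfg = update_config(cfg)            # written to YAML, synced with the server, merged back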
Example #3
def mp_worker(arguments):
    print('sub process', os.getpid())
    inputs, the_time = arguments
    from random import randint
    additional_parameters = {'stuff_' + str(randint(0, 100)): 'some stuff ' + str(randint(0, 100))}
    Task.current_task().connect(additional_parameters)
    print(" Process %s\tWaiting %s seconds" % (inputs, the_time))
    time.sleep(int(the_time))
    print(" Process %s\tDONE" % inputs)
Example #4
    def __init__(
            self,
            pool_frequency=0.2,  # type: float
            default_execution_queue=None,  # type: Optional[str]
            pipeline_time_limit=None,  # type: Optional[float]
            auto_connect_task=True,  # type: Union[bool, Task]
            always_create_task=False,  # type: bool
            add_pipeline_tags=False,  # type: bool
    ):
        # type: (...) -> ()
        """
        Create a new pipeline controller. The newly created object will launch and monitor the new experiments.

        :param float pool_frequency: The polling frequency (in minutes) for monitoring experiments / states.
        :param str default_execution_queue: The execution queue to use if no execution queue is provided
        :param float pipeline_time_limit: The maximum time (minutes) for the entire pipeline process. The
            default is ``None``, indicating no time limit.
        :param bool auto_connect_task: Store pipeline arguments and configuration in the Task
            - ``True`` - The pipeline argument and configuration will be stored in the current Task. All arguments will
              be under the hyper-parameter section ``Pipeline``, and the pipeline DAG will be stored as a
              Task configuration object named ``Pipeline``.

            - ``False`` - Do not store with Task.
            - ``Task`` - A specific Task object to connect the pipeline with.
        :param bool always_create_task: Always create a new Task
            - ``True`` - If no Task is currently initialized, create a new Task named ``Pipeline`` in the
              ``base_task_id`` project.

            - ``False`` - Use the :py:meth:`task.Task.current_task` (if exists) to report statistics.
        :param bool add_pipeline_tags: (default: False) if True, add `pipe: <pipeline_task_id>` tag to all
            steps (Tasks) created by this pipeline.
        """
        self._nodes = {}
        self._running_nodes = []
        self._start_time = None
        self._pipeline_time_limit = pipeline_time_limit * 60. if pipeline_time_limit else None
        self._default_execution_queue = default_execution_queue
        self._pool_frequency = pool_frequency * 60.
        self._thread = None
        self._stop_event = None
        self._experiment_created_cb = None
        self._add_pipeline_tags = add_pipeline_tags
        self._task = auto_connect_task if isinstance(
            auto_connect_task, Task) else Task.current_task()
        self._step_ref_pattern = re.compile(self._step_pattern)
        if not self._task and always_create_task:
            self._task = Task.init(
                project_name='Pipelines',
                task_name='Pipeline {}'.format(datetime.now()),
                task_type=Task.TaskTypes.controller,
            )

        # make sure all the created tasks are our children, as we are creating them
        if self._task:
            self._task.add_tags([self._tag])
            self._auto_connect_task = bool(auto_connect_task)
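A hedged sketch of driving this controller, assuming it is trains' PipelineController with the usual add_step/start/wait/stop methods; the import path, project and task names are assumptions taken from the trains pipeline examples.

from trains.automation.controller import PipelineController

pipe = PipelineController(default_execution_queue='default', add_pipeline_tags=True)
pipe.add_step(name='stage_data',
              base_task_project='examples', base_task_name='data prep')
pipe.add_step(name='stage_train', parents=['stage_data'],
              base_task_project='examples', base_task_name='train model')
pipe.start()   # spawns the monitoring thread and starts scheduling steps
pipe.wait()    # block until the whole DAG has finished
pipe.stop()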
Example #5
def clone_and_queue(template_task: str, queue: str) -> Task:
    github_payload = os.getenv('GITHUB_EVENT_PATH')
    with open(github_payload, 'r') as f:
        payload = json.load(f)

    task = Task.get_task(task_id=template_task)
    # Clone the task to pipe to. This creates a task with status Draft whose parameters can be modified.
    cloned_task = Task.clone(source_task=task,
                             name=f"{template_task} cloned task from Github")
    script_commit = payload.get("comment", {}).get("body",
                                                   "").partition(" ")[2]
    selected_type, _, selected_value = script_commit.partition(" ")
    if selected_type and selected_value:
        data_script = cloned_task.data.script
        if selected_type == "branch":
            data_script.branch = selected_value
            data_script.tag = ""
            data_script.version_num = ""
        elif selected_type == "tag":
            data_script.branch = ""
            data_script.tag = selected_value
            data_script.version_num = ""
        elif selected_type == "commit":
            data_script.branch = ""
            data_script.tag = ""
            data_script.version_num = selected_value
        else:
            raise Exception(
                f"You must supply branch, tag or commit as type, not {selected_type}"
            )

        print(f"Change train script head to {selected_value} {selected_type}")
        # noinspection PyProtectedMember
        cloned_task._update_script(script=data_script)

    Task.enqueue(cloned_task.id, queue_name=queue)
    owner, repo = payload.get("repository", {}).get("full_name", "").split("/")
    if owner and repo:
        gh = login(token=os.getenv("GITHUB_TOKEN"))
        if gh:
            issue = gh.issue(owner, repo,
                             payload.get("issue", {}).get("number"))
            if issue:
                issue.create_comment(
                    f"New task, id:{cloned_task.id} is in queue {queue}")
            else:
                print(
                    f'cannot comment on issue {payload.get("issue", {}).get("number")}'
                )
        else:
            # avoid echoing the token value in the log
            print("cannot log in to GitHub; check the GITHUB_TOKEN environment variable")
    return cloned_task
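A hedged usage sketch: it assumes the function runs inside a GitHub Actions job where GITHUB_EVENT_PATH and GITHUB_TOKEN are already set, and the template task id is a placeholder.

cloned = clone_and_queue(template_task='<template_task_id>', queue='default')
print('enqueued cloned task', cloned.id)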
Example #6
def main():
    # Create the experiment Task
    task = Task.init(project_name="examples", task_name="text reporting")

    print('reporting text logs')

    # report regular console print
    print('This is standard output test')

    # report stderr
    print('This is standard error test', file=sys.stderr)

    # Get the task logger,
    # You can also call Task.current_task().get_logger() from anywhere in your code.
    logger = task.get_logger()

    # report text based logs
    report_logs(logger)

    # force flush reports
    # If flush is not called, reports are flushed in the background every couple of seconds,
    # and at the end of the process execution
    logger.flush()

    print('We are done reporting, have a great day :)')
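report_logs is defined elsewhere in the original example; a minimal stand-in that exercises the same Logger text API might look like this (the messages are illustrative).

import logging

def report_logs(logger):
    # free-form text lines end up in the task's console/log section
    logger.report_text('hello, this is a plain text report')
    logger.report_text('and this one is reported at DEBUG level', level=logging.DEBUG)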
Example #7
 def transmit_artifacts(self, id):
     artifacts = (self.info[id][self.artifacts]
                  if self.artifacts in self.info[id].keys() else {})
     task = self.call_func(
         'Task.get_task',
         id, lambda id_: Task.get_task(project_name=PROJECT_NAME,
                                       task_name=id_),
         self.get_run_name_by_id(id))
     # every supported artifact type is uploaded the same way, so a single
     # filtered loop replaces the identical per-type branches
     supported_types = ("folder", "text", "dataframe", "image",
                        "dictionary", "storage-server")
     for artifact_type, entries in artifacts.items():
         if artifact_type in supported_types:
             for name, obj in entries:
                 task.upload_artifact(name=name, artifact_object=obj)
Example #8
    def __init__(self,
                 logger: TrainsLogger = None,
                 output_uri: str = None,
                 dirname: str = None,
                 *args,
                 **kwargs):
        try:
            from trains import Task
        except ImportError:
            raise RuntimeError(
                "This contrib module requires trains to be installed. "
                "You may install trains using: \n pip install trains \n")

        if logger and not isinstance(logger, TrainsLogger):
            raise TypeError("logger must be an instance of TrainsLogger")

        self.task = Task.current_task()
        if not self.task:
            raise RuntimeError(
                "TrainsSaver requires a Trains Task to be initialized. "
                "Please use the `logger` argument or call `trains.Task.init()`."
            )

        if not dirname:
            dirname = tempfile.mkdtemp(prefix="ignite_checkpoints_{}".format(
                datetime.now().strftime("%Y_%m_%d_%H_%M_%S_")))
            warnings.warn(
                "TrainsSaver created a temporary checkpoints directory: {}".
                format(dirname))

        super(TrainsSaver, self).__init__(dirname=dirname, *args, **kwargs)

        if output_uri:
            self.task.output_uri = output_uri
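A hedged usage sketch, assuming this is ignite's contrib TrainsSaver plugged into ignite's Checkpoint handler; model, optimizer and trainer are assumed to exist, and Task.init must have been called beforehand.

from ignite.engine import Events
from ignite.handlers import Checkpoint

to_save = {'model': model, 'optimizer': optimizer}
handler = Checkpoint(to_save, TrainsSaver(), filename_prefix='best', n_saved=2)
trainer.add_event_handler(Events.EPOCH_COMPLETED, handler)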
Example #9
 def __init__(
         self,
         project_name: Optional[str] = None,
         task_name: Optional[str] = None,
         task_type: str = 'training',
         reuse_last_task_id: bool = True,
         output_uri: Optional[str] = None,
         auto_connect_arg_parser: bool = True,
         auto_connect_frameworks: bool = True,
         auto_resource_monitoring: bool = True
 ) -> None:
     super().__init__()
     if self._bypass:
         self._trains = None
     else:
         self._trains = Task.init(
             project_name=project_name,
             task_name=task_name,
             task_type=task_type,
             reuse_last_task_id=reuse_last_task_id,
             output_uri=output_uri,
             auto_connect_arg_parser=auto_connect_arg_parser,
             auto_connect_frameworks=auto_connect_frameworks,
             auto_resource_monitoring=auto_resource_monitoring
         )
Example #10
    def seed(self):
        for id in self.get_ids():
            task = self.call_func(
                'Task.create', id,
                lambda id_: Task.create(project_name=PROJECT_NAME,
                                        task_name=id_),
                self.get_run_name_by_id(id))

            self.call_func('transmit_information', id,
                           lambda id_: self.transmit_information(id_), id)

            self.call_func('transmit_metrics', id,
                           lambda id_: self.transmit_metrics(id_), id)

            self.call_func('transmit_artifacts', id,
                           lambda id_: self.transmit_artifacts(id_), id)

            task.mark_started()
            task.completed()
            output_log_web_page = task.get_output_log_web_page()
            url_parts = output_log_web_page.split('projects')
            project_id = url_parts[1].split('/')[1]
            self.project_link = url_parts[0] + '/projects/' + project_id
            self.migration_count += 1
            self.pbar.update(1)
Example #11
    def _verify_node(self, node):
        # type: (Node) -> bool
        """
        Raise ValueError on verification errors

        :return: Return True iff the specific node is verified
        """
        if not node.base_task_id:
            raise ValueError("Node '{}', base_task_id is empty".format(node.name))

        if not self._default_execution_queue and not node.queue:
            raise ValueError("Node '{}' missing execution queue, "
                             "no default queue defined and no specific node queue defined".format(node.name))

        task = Task.get_task(task_id=node.base_task_id)
        if not task:
            raise ValueError("Node '{}', base_task_id={} is invalid".format(node.name, node.base_task_id))

        pattern = self._step_ref_pattern

        for v in node.parameters.values():
            if isinstance(v, str):
                for g in pattern.findall(v):
                    self.__verify_step_reference(node, g)

        return True
Example #12
    def __init__(self, task: Task = None, projectName: str = None, taskName: str = None,
            additionalLoggingValuesDict=None):
        """

        :param task: instances of trains.Task
        :param projectName: only necessary if task is not provided
        :param taskName: only necessary if task is not provided
        :param additionalLoggingValuesDict:
        """
        if task is None:
            if projectName is None or taskName is None:
                raise ValueError("Either the trains task or the project name and task name have to be provided")
            self.task = Task.init(project_name=projectName, task_name=taskName, reuse_last_task_id=False)
        else:
            if projectName is not None:
                log.warning(
                    f"projectName parameter with value {projectName} passed even though task has been given, "
                    f"will ignore this parameter"
                )
            if taskName is not None:
                log.warning(
                    f"taskName parameter with value {taskName} passed even though task has been given, "
                    f"will ignore this parameter"
                )
            self.task = task
        self.logger = self.task.get_logger()
        super().__init__(additionalLoggingValuesDict=additionalLoggingValuesDict)
Example #13
    def set_credentials(cls, api_host: str = None, web_host: str = None, files_host: str = None,
                        key: str = None, secret: str = None) -> None:
        """
        Set new default TRAINS-server host and credentials.
        These settings can be overridden by OS environment variables
        or the trains.conf configuration file.

        Note: credentials need to be set *prior* to Logger initialization.

        :param api_host: Trains API server url, example: host='http://localhost:8008'
        :param web_host: Trains WEB server url, example: host='http://localhost:8080'
        :param files_host: Trains Files server url, example: host='http://localhost:8081'
        :param key: user key/secret pair, example: key='thisisakey123'
        :param secret: user key/secret pair, example: secret='thisisseceret123'
        """
        Task.set_credentials(api_host=api_host, web_host=web_host, files_host=files_host,
                             key=key, secret=secret)
Example #14
 def trains(self,
            x: data_type,
            y: data_type = None,
            x_cv: data_type = None,
            y_cv: data_type = None,
            *,
            trains_config: Dict[str, Any] = None,
            keep_task_open: bool = False,
            queue: str = None) -> "Wrapper":
     if trains_config is None:
         return self.fit(x, y, x_cv, y_cv)
      # init trains (trains_config is guaranteed to be non-None at this point)
     project_name = trains_config.get("project_name")
     task_name = trains_config.get("task_name")
     if queue is None:
         task = Task.init(**trains_config)
         cloned_task = None
     else:
         task = Task.get_task(project_name=project_name,
                              task_name=task_name)
         cloned_task = Task.clone(source_task=task, parent=task.id)
     # before loop
     self._verbose_level = 6
     self._data_config["verbose_level"] = 6
     self._before_loop(x, y, x_cv, y_cv)
     self.pipeline.use_tqdm = False
     copied_config = shallow_copy_dict(self.config)
     if queue is not None:
         cloned_task.set_parameters(copied_config)
         Task.enqueue(cloned_task.id, queue)
         return self
     # loop
     task.connect(copied_config)
     global trains_logger
     trains_logger = task.get_logger()
     self._loop()
     if not keep_task_open:
         task.close()
         trains_logger = None
     return self
Example #15
def setup_trains_logging(config):
    if config["with_trains"]:
        from trains import Task

        task = Task.init("Carbon Black Semantic Segmentation Training",
                         config["task_name"])
        task.connect_configuration(config)

        # Log hyper parameters
        hyper_parameters = list(config.keys())
        task.connect({k: config[k] for k in hyper_parameters})
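A hedged usage sketch; the config keys below are illustrative, and only with_trains and task_name are actually required by the function itself.

config = {
    "with_trains": True,
    "task_name": "unet-baseline",
    "lr": 1e-3,
    "batch_size": 8,
}
setup_trains_logging(config)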
Example #16
def delete_all_tasks_from_project(pr_name):
    # type: (str) -> ()
    """
    <Description>

    :param str pr_name:
    """
    client = APIClient()
    tasks = Task.get_tasks(project_name=pr_name)
    for task in tasks:
        client.tasks.delete(task=task.id, force=True)
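Usage is a single call; note that it force-deletes every task in the project, which cannot be undone (the project name below is a placeholder).

delete_all_tasks_from_project('old-experiments')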
Example #17
def run(num_workers):
    """ Distributed Synchronous SGD Example """
    th.manual_seed(1234)
    train_set, bsz = partition_dataset(num_workers)
    model = Net()
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

    num_batches = ceil(len(train_set.dataset) / float(bsz))

    from random import randint
    param = {'worker_{}_stuff'.format(dist.get_rank()): 'some stuff ' + str(randint(0, 100))}
    Task.current_task().connect(param)
    Task.current_task().upload_artifact(
        'temp {:02d}'.format(dist.get_rank()), artifact_object={'worker_rank': dist.get_rank()})

    for epoch in range(2):
        epoch_loss = 0.0
        for i, (data, target) in enumerate(train_set):
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            epoch_loss += loss.item()
            loss.backward()
            average_gradients(model)
            optimizer.step()
            if i % 10 == 0:
                print('{}] Train Epoch {} - {} \tLoss  {:.6f}'.format(dist.get_rank(), epoch, i, loss))
                Task.current_task().get_logger().report_scalar(
                    'loss', 'worker {:02d}'.format(dist.get_rank()), value=loss.item(), iteration=i)
            if i > 100:
                break
        print('Rank ', dist.get_rank(), ', epoch ',
              epoch, ': ', epoch_loss / num_batches)
Example #18
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=2, metavar='N',
                        help='number of epochs to train (default: 2)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    task = Task.init(project_name='examples', task_name='pytorch with tensorboardX')
    writer = SummaryWriter('runs')
    writer.add_text('TEXT', 'This is some text', 0)

    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    kwargs = {'num_workers': 4, 'pin_memory': True} if args.cuda else {}
    train_loader = torch.utils.data.DataLoader(datasets.MNIST('../data', train=True, download=True,
                                                              transform=transforms.Compose([
                                                                  transforms.ToTensor(),
                                                                  transforms.Normalize((0.1307,), (0.3081,))])),
                                               batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(datasets.MNIST('../data', train=False,
                                                             transform=transforms.Compose([
                                                                 transforms.ToTensor(),
                                                                 transforms.Normalize((0.1307,), (0.3081,))])),
                                              batch_size=args.test_batch_size, shuffle=True, **kwargs)

    model = Net()
    if args.cuda:
        model.cuda()
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    for epoch in range(1, args.epochs + 1):
        train(model, epoch, train_loader, args, optimizer, writer)
        torch.save(model, os.path.join(gettempdir(), 'model{}'.format(epoch)))
    test(model, test_loader, args, optimizer, writer)
Example #19
def TrainModel(model, base_model, model_name):

    task = Task.init(project_name="Ex3ModelTrains", task_name=model_name)
    reporter = TrainsReporter()
    # Show a summary of the model. Check the number of trainable parameters
    model.summary()

    # Compile the model
    model.compile(loss=keras.losses.BinaryCrossentropy(from_logits=True),
                  optimizer=keras.optimizers.Adam(),
                  metrics=[metrics.BinaryAccuracy()])

    # Train the model
    model.fit(train_ds,
              steps_per_epoch=train_ds.samples / train_ds.batch_size,
              epochs=20,
              validation_data=valid_ds,
              validation_steps=valid_ds.samples / valid_ds.batch_size,
              callbacks=[reporter],
              verbose=1)

    # Unfreeze the base_model. Note that it keeps running in inference mode
    # since we passed `training=False` when calling it. This means that
    # the batchnorm layers will not update their batch statistics.
    # This prevents the batchnorm layers from undoing all the training
    # we've done so far.
    base_model.trainable = True
    reporter.epoch_ref = 20

    # evaluate once per dataset and reuse the results
    test_score = model.evaluate(test_ds)
    valid_score = model.evaluate(valid_ds)
    print('Test evaluation Score:', test_score)
    print('Validation evaluation Score:', valid_score)

    model.compile(
        optimizer=keras.optimizers.Adam(1e-5),  # Low learning rate
        loss=keras.losses.BinaryCrossentropy(from_logits=True),
        metrics=[keras.metrics.BinaryAccuracy()],
    )

    model.fit(train_ds,
              steps_per_epoch=train_ds.samples / train_ds.batch_size,
              epochs=10,
              validation_data=valid_ds,
              validation_steps=valid_ds.samples / valid_ds.batch_size,
              callbacks=[reporter],
              verbose=1)

    test_score = model.evaluate(test_ds)
    valid_score = model.evaluate(valid_ds)
    print('Test evaluation Score:', test_score)
    print('Validation evaluation Score:', valid_score)
Example #20
def main():
    parser = ArgumentParser()
    parser.add_argument(
        "--run",
        help="Run the autoscaler after wizard finished",
        action="store_true",
        default=False,
    )
    args = parser.parse_args()

    if running_remotely():
        hyper_params = AwsAutoScaler.Settings().as_dict()
        configurations = AwsAutoScaler.Configuration().as_dict()
    else:
        print("AWS Autoscaler setup\n")

        config_file = Path(CONF_FILE).absolute()
        if config_file.exists() and input_bool(
                "Load configurations from config file '{}' [Y/n]? ".format(
                    str(CONF_FILE)),
                default=True,
        ):
            with config_file.open("r") as f:
                conf = yaml.load(f, Loader=yaml.SafeLoader)
            hyper_params = conf["hyper_params"]
            configurations = conf["configurations"]
        else:
            configurations, hyper_params = run_wizard()

            try:
                with config_file.open("w+") as f:
                    conf = {
                        "hyper_params": hyper_params,
                        "configurations": configurations,
                    }
                    yaml.safe_dump(conf, f)
            except Exception:
                print(
                    "Error! Could not write configuration file at: {}".format(
                        str(CONF_FILE)))
                return

    task = Task.init(project_name="Auto-Scaler", task_name="AWS Auto-Scaler")
    task.connect(hyper_params)
    task.connect_configuration(configurations)

    autoscaler = AwsAutoScaler(hyper_params, configurations)

    if running_remotely() or args.run:
        autoscaler.start()
Example #21
def trains_log_text(text):
    logger = None
    if "TRAINS_STD_LOGGER" in globals():
        logger = TRAINS_STD_LOGGER
    else:
        try:
            from trains import Task
            logger = Task.current_task().get_logger()
        except Exception:  # trains not installed or no current task
            pass
    if logger is None:
        return
    logger.report_text(text)
    logger.flush()
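A usage sketch; the helper silently becomes a no-op when neither TRAINS_STD_LOGGER nor a current Task is available.

from trains import Task

task = Task.init(project_name='examples', task_name='text helper demo')
trains_log_text('training started')   # reported through Task.current_task().get_logger()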
Example #22
 def transmit_metrics(self, id):
     task = self.call_func(
         'Task.get_task',
         id, lambda id_: Task.get_task(project_name=PROJECT_NAME,
                                       task_name=id_),
         self.get_run_name_by_id(id))
     logger = task.get_logger()
     metrics = self.get_metrics(id)
     for graph_name, series_name, table in metrics:
         for p in table:
             logger.report_scalar(graph_name,
                                  series_name,
                                  iteration=p[0],
                                  value=float(p[1]))
     task.completed()
Example #23
    def transmit_information(self, id):
        parameters = self.get_params(id)
        general_information = self.get_general_information(id)
        artifact = self.get_artifact(id)
        tags = self.get_tags(id)

        task = self.call_func(
            'Task.get_task',
            id, lambda id_: Task.get_task(project_name=PROJECT_NAME,
                                          task_name=id_),
            self.get_run_name_by_id(id))

        task_values = self.call_func('task.export_task', id,
                                     lambda _: task.export_task(),
                                     self.get_run_name_by_id(id))

        task_values["comment"] = (tags["note.content"]
                                  if "note.content" in tags.keys() else "")
        task_values["hyperparams"]["Args"] = parameters
        task_values["started"] = general_information["started"]
        task_values["completed"] = general_information["completed"]
        task_values["script"]["branch"] = (tags["source.git.branch"]
                                           if "source.git.branch"
                                           in tags.keys() else self.branch)
        task_values["script"]["repository"] = (tags["source.git.repoURL"]
                                               if "source.git.repoURL"
                                               in tags.keys() else "")
        task_values["script"]["version_num"] = (tags["source.git.commit"]
                                                if "source.git.commit"
                                                in tags.keys() else "")
        task_values["script"]["entry_point"] = tags["entry_point"]
        task_values["script"]["working_dir"] = tags["working_dir"]
        if "project.env" in tags.keys():
            task_values["script"]["requirements"][tags["project.env"]] = (
                artifact["requirements"]
                if "requirements" in artifact.keys() else "")
        task_values["user"] = tags["user"]

        self.call_func('task.update_task', id,
                       lambda _task_values: task.update_task(_task_values),
                       task_values)

        if len(tags["VALUETAG"].keys()) > 0:
            self.call_func(
                'task.connect_configuration', id,
                lambda _dict: task.connect_configuration(
                    _dict, name="MLflow Tags"), tags["VALUETAG"])
Example #24
        def compute_and_log_cm():
            cm = cm_metric.compute()
            # CM: values are normalized such that diagonal values represent class recalls
            cm = ConfusionMatrix.normalize(cm, "recall").cpu().numpy()

            if idist.get_rank() == 0:
                from trains import Task

                trains_logger = Task.current_task().get_logger()
                trains_logger.report_confusion_matrix(
                    title="Final Confusion Matrix",
                    series="cm-preds-gt",
                    matrix=cm,
                    iteration=trainer.state.iteration,
                    xlabels=VOCSegmentationOpencv.target_names,
                    ylabels=VOCSegmentationOpencv.target_names,
                )
Example #25
    def __init__(self, *_, **kwargs):
        try:
            from trains import Task
            from trains.binding.frameworks.tensorflow_bind import WeightsGradientHistHelper
        except ImportError:
            raise RuntimeError(
                "This contrib module requires trains to be installed. "
                "You may install trains using: \n pip install trains \n")

        experiment_kwargs = {
            k: v
            for k, v in kwargs.items() if k not in (
                "project_name",
                "task_name",
                "task_type",
            )
        }

        if self.bypass_mode():
            warnings.warn("TrainsSaver: running in bypass mode")

            class _Stub(object):
                def __call__(self, *_, **__):
                    return self

                def __getattr__(self, attr):
                    if attr in ("name", "id"):
                        return ""
                    return self

                def __setattr__(self, attr, val):
                    pass

            self._task = _Stub()
        else:
            self._task = Task.init(
                project_name=kwargs.get("project_name"),
                task_name=kwargs.get("task_name"),
                task_type=kwargs.get("task_type", Task.TaskTypes.training),
                **experiment_kwargs,
            )

        self.trains_logger = self._task.get_logger()

        self.grad_helper = WeightsGradientHistHelper(logger=self.trains_logger)
Example #26
    def read(self):
        self.thread_id = threading.current_thread().ident
        for id, path in self.paths:
            self.info[id] = {}

            self.call_func(
                'read_general_information', id,
                lambda id_, path_: self.read_general_information(id_, path_),
                id, path)

            self.call_func('read_tags', id,
                           lambda id_, path_: self.read_tags(id_, path_), id,
                           path + self.tags)

            if "runName" in self.info[id][self.tags].keys():
                self.ID_to_Name[id] = self.info[id][self.tags]["runName"]

            if self.project_exist:
                task = self.call_func(
                    'Task.get_task', id,
                    lambda id_: Task.get_task(project_name=PROJECT_NAME,
                                              task_name=id_),
                    self.get_run_name_by_id(id))
                if task:
                    task_tags = task.data.system_tags if hasattr(
                        task.data, 'system_tags') else task.data.tags
                    if ARCHIVED_TAG not in task_tags:
                        del self.info[id]
                        self.msgs['FAILED'].append(
                            'task ' + id +
                            ' already exists; to migrate it again, archive it in Allegro Trains first'
                        )
                        self.pbar.update(1)
                        continue

            self.call_func('read_artifacts', id,
                           lambda id_, path_: self.read_artifacts(id_, path_),
                           id, path + self.artifacts)

            self.call_func('read_metrics', id,
                           lambda id_, path_: self.read_metrics(id_, path_),
                           id, path + self.metrics)

            self.call_func('read_params', id,
                           lambda id_, path_: self.read_params(id_, path_), id,
                           path + self.params)
Example #27
def run(config, logger=None, local_rank=0, **kwargs):

    assert torch.cuda.is_available()
    assert torch.backends.cudnn.enabled, "Nvidia/Amp requires cudnn backend to be enabled."

    task = Task.init(
        "ignite", "DeeplabV3_ResNet101 pascal_voc2012 segmentation example")

    dist.init_process_group("nccl", init_method="env://")

    # As we passed config with option --manual_config_load
    assert hasattr(config, "setup"), (
        "We need to manually setup the configuration, please set --manual_config_load "
        "to py_config_runner")

    config = config.setup()

    assert_config(config, TRAINVAL_CONFIG)
    # The following attributes are automatically added by py_config_runner
    assert hasattr(config, "config_filepath") and isinstance(
        config.config_filepath, Path)
    assert hasattr(config, "script_filepath") and isinstance(
        config.script_filepath, Path)

    # dump python files to reproduce the run
    task.connect_configuration(config.config_filepath.as_posix())
    task.upload_artifact("script", config.script_filepath)

    config.output_path = Path("./artifacts")

    # log the configuration, if we are the master node
    if dist.get_rank() == 0:
        task.connect(get_params(config, TRAINVAL_CONFIG))

    try:
        training(config, local_rank=local_rank, with_trains_logging=True)
    except KeyboardInterrupt:
        logger.info("Caught KeyboardInterrupt -> exit")
    except Exception as e:  # noqa
        logger.exception("")
        dist.destroy_process_group()
        raise e

    dist.destroy_process_group()
Example #28
def run(config, **kwargs):
    """This is the main method to run the training. As this training script is launched with `py_config_runner`
    it should obligatory contain `run(config, **kwargs)` method.

    """

    assert torch.cuda.is_available(), torch.cuda.is_available()
    assert torch.backends.cudnn.enabled, "Nvidia/Amp requires cudnn backend to be enabled."

    with idist.Parallel(backend="nccl") as parallel:

        logger = setup_logger(name="Pascal-VOC12 Training",
                              distributed_rank=idist.get_rank())

        assert_config(config, TRAINVAL_CONFIG)
        # The following attributes are automatically added by py_config_runner
        assert hasattr(config, "config_filepath") and isinstance(
            config.config_filepath, Path)
        assert hasattr(config, "script_filepath") and isinstance(
            config.script_filepath, Path)

        if idist.get_rank() == 0 and exp_tracking.has_trains:
            from trains import Task

            task = Task.init("Pascal-VOC12 Training",
                             config.config_filepath.stem)
            task.connect_configuration(config.config_filepath.as_posix())

        log_basic_info(logger, config)

        config.output_path = Path(exp_tracking.get_output_path())
        # dump python files to reproduce the run
        exp_tracking.log_artifact(config.config_filepath.as_posix())
        exp_tracking.log_artifact(config.script_filepath.as_posix())
        exp_tracking.log_params(get_params(config, TRAINVAL_CONFIG))

        try:
            parallel.run(training, config, logger=logger)
        except KeyboardInterrupt:
            logger.info("Catched KeyboardInterrupt -> exit")
        except Exception as e:  # noqa
            logger.exception("")
            raise e
Example #29
def main():
    # Create the experiment Task
    task = Task.init(project_name="examples", task_name="scalar reporting")

    print('reporting scalar graphs')

    # Get the task logger,
    # You can also call Task.current_task().get_logger() from anywhere in your code.
    logger = task.get_logger()

    # report scalars
    report_scalars(logger)

    # force flush reports
    # If flush is not called, reports are flushed in the background every couple of seconds,
    # and at the end of the process execution
    logger.flush()

    print('We are done reporting, have a great day :)')
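report_scalars is defined elsewhere in the original example; a plausible stand-in using Logger.report_scalar might look like this (titles, series and values are illustrative).

def report_scalars(logger):
    # two series plotted on the same 'performance' graph across a few iterations
    for i in range(10):
        logger.report_scalar(title='performance', series='train', value=i * 0.10, iteration=i)
        logger.report_scalar(title='performance', series='validation', value=i * 0.05, iteration=i)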
Example #30
    def __init__(self,
                 project_name: Optional[str] = None,
                 task_name: Optional[str] = None,
                 task_type: str = 'training',
                 reuse_last_task_id: bool = True,
                 output_uri: Optional[str] = None,
                 auto_connect_arg_parser: bool = True,
                 auto_connect_frameworks: bool = True,
                 auto_resource_monitoring: bool = True) -> None:
        if not _TRAINS_AVAILABLE:
            raise ImportError(
                'You want to use the `trains` logger which is not installed yet,'
                ' install it with `pip install trains`.')
        super().__init__()
        if self.bypass_mode():
            self._trains = None
            print('TRAINS Task: running in bypass mode')
            print('TRAINS results page: disabled')

            class _TaskStub(object):
                def __call__(self, *args, **kwargs):
                    return self

                def __getattr__(self, attr):
                    if attr in ('name', 'id'):
                        return ''
                    return self

                def __setattr__(self, attr, val):
                    pass

            self._trains = _TaskStub()
        else:
            self._trains = Task.init(
                project_name=project_name,
                task_name=task_name,
                task_type=task_type,
                reuse_last_task_id=reuse_last_task_id,
                output_uri=output_uri,
                auto_connect_arg_parser=auto_connect_arg_parser,
                auto_connect_frameworks=auto_connect_frameworks,
                auto_resource_monitoring=auto_resource_monitoring)