コード例 #1
0
                        '--task-id',
                        type=str,
                        required=True)
    parser.add_argument('--execution-queue',
                        '-q',
                        type=str,
                        default='rtx2080ti')
    parser.add_argument('--run-as-service', '--service', action="store_true")
    parser.add_argument('--no-reuse-last-task-id',
                        dest='reuse_id',
                        action="store_false",
                        default=True)
    args = parser.parse_args()

    task = Task.init(project_name='language-model-hp',
                     task_name=f'{args.model}',
                     task_type=Task.TaskTypes.optimizer,
                     reuse_last_task_id=args.reuse_id)
    task.connect(args)

    optimizer = HyperParameterOptimizer(
        base_task_id=args.
        template_task_id,  # This is the experiment we want to optimize
        # here we define the hyper-parameters to optimize
        hyper_parameters=hyper_parameters[args.model],
        # setting the objective metric we want to maximize/minimize
        objective_metric_title='val_ppl',
        objective_metric_series='val_ppl',
        objective_metric_sign='min',  # maximize or minimize the objective metric
        # setting optimizer - clearml supports GridSearch, RandomSearch, OptimizerBOHB and OptimizerOptuna
        optimizer_class=OptimizerOptuna,
        # Configuring optimization parameters
コード例 #2
0
ファイル: A1_dataset_input.py プロジェクト: abiller/events
        max_pixel_value=255.0,
    )
    values = default_values.copy() if norm_setting is None \
        else norm_setting.copy()
    values.update({"p": 1.0})
    return albumentations.Normalize(**values)


if __name__ == "__main__":
    # force colab to get dataclasses
    Task.add_requirements('dataclasses', '0.4')
    # override numpy version for colab
    Task.add_requirements('numpy', '1.19.5')
    # Track everything on ClearML Free
    task = Task.init(project_name='R|D?R&D! Webinar 01',
                     task_name='Full integration',
                     output_uri=True,  # auto save everything to Clearml Free
                     )

    # Need to run on cpu only?
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if device == "cpu":
        warnings.warn('GPU not available!, using CPU mode')
        warnings.filterwarnings("ignore", module='torch.cuda.amp.autocast')

    # configs
    cfg = FlowerTrainingConfig()
    aug_cfg = AugConfig()
    task.connect(cfg, 'config')
    task.connect(aug_cfg, 'augmentation_config')
    # default model config
    task.set_model_config(config_dict=asdict(ModelConfig()))
コード例 #3
0
def main():
    """Train and evaluate the MNIST model inside a Pachyderm pipeline.

    The ClearML project name is read from the ``PPS_PIPELINE_NAME``
    environment variable (the literal string ``'None'`` is used when it is
    unset). Command-line options control the batch sizes, number of epochs,
    SGD learning rate / momentum, random seed, CUDA usage, the dataset
    location and where the trained model is written.

    After the last epoch the model's ``state_dict`` is saved as
    ``mnist_cnn.pt`` under ``--save-location`` unless ``--no-save-model`` is
    passed.
    """
    pipeline_name = str(os.getenv('PPS_PIPELINE_NAME', 'None'))
    print("Pachyderm pipeline")
    # Connecting ClearML with the current process,
    # from here on everything is logged automatically
    task = Task.init(project_name=pipeline_name,
                     task_name='Pachyderm PyTorch MNIST Train')

    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size',
                        type=int,
                        default=64,
                        metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size',
                        type=int,
                        default=1000,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs',
                        type=int,
                        default=10,
                        metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.01,
                        metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum',
                        type=float,
                        default=0.5,
                        metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval',
        type=int,
        default=10,
        metavar='N',
        help='how many batches to wait before logging training status')

    # Saving is on by default; `--save-model` is kept for backward
    # compatibility and `--no-save-model` is the only way to switch it off.
    parser.add_argument('--save-model',
                        action='store_true',
                        default=True,
                        help='save the trained model (enabled by default)')
    parser.add_argument('--no-save-model',
                        dest='save_model',
                        action='store_false',
                        help='do not save the trained model')
    parser.add_argument('--save-location',
                        type=str,
                        default='./',
                        help='directory in which to save the trained model')
    parser.add_argument('--data-location',
                        type=str,
                        default=os.path.join('..', 'data'),
                        help='For loading the dataset')

    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    # Extra DataLoader options are only passed when training on CUDA.
    kwargs = {"num_workers": 4, "pin_memory": True} if use_cuda else {}
    train_loader = torch.utils.data.DataLoader(datasets.MNIST(
        args.data_location,
        train=True,
        download=True,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ]),
    ),
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               **kwargs)
    test_loader = torch.utils.data.DataLoader(datasets.MNIST(
        args.data_location,
        train=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ]),
    ),
                                              batch_size=args.test_batch_size,
                                              shuffle=True,
                                              **kwargs)

    model = Net().to(device)
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr,
                          momentum=args.momentum)

    # One training pass followed by one evaluation pass per epoch.
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(args, model, device, test_loader, epoch)

    if args.save_model:
        # Make sure a custom --save-location exists before writing into it
        # (an empty string means "current directory" and needs no creation).
        if args.save_location:
            os.makedirs(args.save_location, exist_ok=True)
        torch.save(model.state_dict(),
                   os.path.join(args.save_location, "mnist_cnn.pt"))
コード例 #4
0
def main():
    """Run the ClearML experiment monitor that posts alerts to Slack.

    Parses the command line, validates that a Slack API token and a channel
    were supplied (terminating with exit status 1 otherwise), configures a
    ``SlackMonitor`` with the requested filters and creates the monitoring
    Task. Unless ``--local`` is given, the Task is handed over to the queue
    named by ``--service_queue``. Finally a start-up message is posted and the
    monitor loop is entered.

    Raises:
        SystemExit: with code 1 when the Slack API key or the channel is
            missing.
    """
    print('ClearML experiment monitor Slack service\n')

    # Slack Monitor arguments
    parser = argparse.ArgumentParser(
        description='ClearML monitor experiments and post Slack Alerts')
    parser.add_argument('--channel',
                        type=str,
                        help='Set the channel to post the Slack alerts')
    parser.add_argument('--slack_api',
                        type=str,
                        default=os.environ.get('SLACK_API_TOKEN', None),
                        help='Slack API key for sending messages')
    parser.add_argument(
        '--message_prefix',
        type=str,
        help=
        'Add message prefix (For example, to alert all channel members use: "Hey <!here>,")'
    )
    parser.add_argument(
        '--project',
        type=str,
        default='',
        help=
        'The name (or partial name) of the project to monitor, use empty for all projects'
    )
    parser.add_argument(
        '--min_num_iterations',
        type=int,
        default=0,
        help=
        'Minimum number of iterations of failed/completed experiment to alert. '
        'This will help eliminate unnecessary debug sessions that crashed right after starting '
        '(default:0 alert on all)')
    parser.add_argument(
        '--include_manual_experiments',
        action="store_true",
        default=False,
        help='Include experiments running manually (i.e. not by clearml-agent)'
    )
    parser.add_argument(
        '--include_completed_experiments',
        action="store_true",
        default=False,
        help='Include completed experiments (i.e. not just failed experiments)'
    )
    parser.add_argument(
        '--refresh_rate',
        type=float,
        default=10.,
        help=
        'Set refresh rate of the monitoring service, default every 10.0 sec')
    parser.add_argument(
        '--service_queue',
        type=str,
        default='services',
        help=
        'Queue name to use when running as a service (default: \'services\')')
    parser.add_argument(
        '--local',
        action="store_true",
        default=False,
        help='Run service locally instead of as a service '
        '(Default: Automatically launch itself on the services queue)')

    args = parser.parse_args()

    # Both the token and the channel are mandatory. SystemExit is raised
    # directly because the builtin `exit()` helper is injected by the `site`
    # module and is not guaranteed to exist in every interpreter setup.
    if not args.slack_api:
        print(
            'Slack API key was not provided, please run with --slack_api <KEY>'
        )
        raise SystemExit(1)

    if not args.channel:
        print(
            'Slack channel was not provided, please run with --channel <channel_name>'
        )
        raise SystemExit(1)

    # create the slack monitoring object
    slack_monitor = SlackMonitor(slack_api_token=args.slack_api,
                                 channel=args.channel,
                                 message_prefix=args.message_prefix)

    # configure the monitoring filters
    slack_monitor.min_num_iterations = args.min_num_iterations
    slack_monitor.include_manual_experiments = args.include_manual_experiments
    if args.project:
        slack_monitor.set_projects(project_names_re=[args.project])
    if args.include_completed_experiments:
        slack_monitor.status_alerts += ["completed"]

    # start the monitoring Task
    # Connecting ClearML with the current process,
    # from here on everything is logged automatically
    task = Task.init(project_name='Monitoring',
                     task_name='Slack Alerts',
                     task_type=Task.TaskTypes.monitor)
    if not args.local:
        task.execute_remotely(queue_name=args.service_queue)
        # we will not get here if we are running locally

    print('\nStarting monitoring service\nProject: "{}"\nRefresh rate: {}s\n'.
          format(args.project or 'all', args.refresh_rate))

    # Let everyone know we are up and running
    start_message = \
        '{}Allegro ClearML Slack monitoring service started\nMonitoring project \'{}\''.format(
            (args.message_prefix + ' ') if args.message_prefix else '',
            args.project or 'all')
    slack_monitor.post_message(start_message)

    # Start the monitor service, this function will never end
    slack_monitor.monitor(pool_period=args.refresh_rate)
コード例 #5
0
import os
from tempfile import gettempdir

import numpy as np
from PIL import Image
from torch.utils.tensorboard import SummaryWriter

from clearml import Task

# Connecting ClearML with the current process,
# from here on everything is logged automatically
task = Task.init(project_name='examples',
                 task_name='PyTorch TensorBoard toy example')

writer = SummaryWriter(log_dir=os.path.join(gettempdir(), 'tensorboard_logs'))

# Load the sample picture. The context manager releases the underlying file
# handle once the pixel data has been converted to a NumPy array.
with Image.open(
        os.path.join("..", "..", "reporting", "data_samples",
                     "picasso.jpg")) as image_open:
    image = np.asarray(image_open)

# convert to 4d [batch, height, width, channels]
# gray: first channel only, kept as a trailing single-channel axis
image_gray = image[:, :, 0][np.newaxis, :, :, np.newaxis]
# rgba: append a constant 255 plane as a fourth channel
image_rgba = np.concatenate(
    (image,
     255 * np.atleast_3d(np.ones(shape=image.shape[:2], dtype=np.uint8))),
    axis=2)
image_rgba = image_rgba[np.newaxis, :, :, :]
image = image[np.newaxis, :, :, :]

# Report the first (only) item of each batch as a height/width/channel image.
writer.add_image("test/first", image[0], dataformats='HWC')
writer.add_image("test_gray/second", image_gray[0], dataformats='HWC')
writer.add_image("test_rgba/third", image_rgba[0], dataformats='HWC')
コード例 #6
0
def main():
    """Train the MNIST model while reporting through tensorboardX and ClearML.

    Parses the training options, initialises the ClearML Task, opens a
    ``SummaryWriter`` on the ``runs`` directory, seeds the random generators,
    builds the MNIST train/test loaders and an SGD optimizer, then trains for
    ``--epochs`` epochs. The whole model object is saved to the system
    temporary directory after every epoch (``model<epoch>``), and a single
    evaluation pass is run once training has finished.
    """
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size',
                        type=int,
                        default=64,
                        metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size',
                        type=int,
                        default=1000,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')
    # The help text must report the real default (2), not the value used by
    # the example this script was derived from.
    parser.add_argument('--epochs',
                        type=int,
                        default=2,
                        metavar='N',
                        help='number of epochs to train (default: 2)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.01,
                        metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum',
                        type=float,
                        default=0.5,
                        metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval',
        type=int,
        default=10,
        metavar='N',
        help='how many batches to wait before logging training status')
    args = parser.parse_args()
    # Derived flag stored on the namespace so train/test receive it via `args`.
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    # Connecting ClearML with the current process,
    # from here on everything is logged automatically
    task = Task.init(project_name='examples',
                     task_name='PyTorch with tensorboardX')

    writer = SummaryWriter('runs')
    writer.add_text('TEXT', 'This is some text', 0)

    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    # Extra DataLoader options are only passed when training on CUDA.
    kwargs = {'num_workers': 4, 'pin_memory': True} if args.cuda else {}
    train_loader = torch.utils.data.DataLoader(datasets.MNIST(
        '../data',
        train=True,
        download=True,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])),
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               **kwargs)
    test_loader = torch.utils.data.DataLoader(datasets.MNIST(
        '../data',
        train=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])),
                                              batch_size=args.test_batch_size,
                                              shuffle=True,
                                              **kwargs)

    model = Net()
    if args.cuda:
        model.cuda()
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr,
                          momentum=args.momentum)

    # Train, saving a snapshot of the full model object after each epoch.
    for epoch in range(1, args.epochs + 1):
        train(model, epoch, train_loader, args, optimizer, writer)
        torch.save(model, os.path.join(gettempdir(), 'model{}'.format(epoch)))
    # Evaluation runs once, after the final epoch.
    test(model, test_loader, args, optimizer, writer)
コード例 #7
0
from clearml import Task
from time import sleep

# Initialize the Task Pipe's first Task used to start the Task Pipe
task = Task.init("examples",
                 "Simple Controller Task",
                 task_type=Task.TaskTypes.controller)

# Create a hyper-parameter dictionary for the task
param = dict()
# Connect the hyper-parameter dictionary to the task
param = task.connect(param)

# In this example we pass next task's name as a parameter
param["next_task_name"] = "Toy Base Task"
# This is a parameter name in the next task we want to change
param["param_name"] = "Example_Param"
# This is the parameter value in the next task we want to change
param["param_name_new_value"] = 3
# The queue where we want the template task (clone) to be sent to
param["execution_queue_name"] = "default"

# Simulate the work of a Task
print("Processing....")
sleep(2.0)
print("Done processing :)")

# Get a reference to the task to pipe to.
next_task = Task.get_task(project_name=task.get_project_name(),
                          task_name=param["next_task_name"])
コード例 #8
0
ファイル: hyper_parameters.py プロジェクト: tmankita/trains
# ClearML - example code, ArgumentParser parameter logging and dictionary parameter logging
#
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
from argparse import ArgumentParser


from clearml import Task

# Connecting ClearML with the current process,
# from here on everything is logged automatically
task = Task.init(project_name='examples', task_name='hyper-parameters example')

parameters = {
    'list': [1, 2, 3],
    'dict': {'a': 1, 'b': 2},
    'tuple': (1, 2, 3),
    'int': 3,
    'float': 2.2,
    'string': 'my string',
}
parameters = task.connect(parameters)

# adding new parameter after connect (will be logged as well)
parameters['new_param'] = 'this is new'

# changing the value of a parameter (new value will be stored instead of previous one)
parameters['float'] = '9.9'
コード例 #9
0
model.add(Dense(10))
model.add(Activation('softmax'))

model2 = Sequential()
model2.add(Dense(512, input_shape=(784, )))
model2.add(Activation('relu'))

model.summary()

model.compile(loss='categorical_crossentropy',
              optimizer=RMSprop(),
              metrics=['accuracy'])

# Connecting ClearML with the current process,
# from here on everything is logged automatically
task = Task.init(project_name='examples',
                 task_name='Keras with TensorBoard example')
task.connect_configuration({
    'test': 1337,
    'nested': {
        'key': 'value',
        'number': 1
    }
})

# Advanced: setting model class enumeration
labels = dict(('digit_%d' % i, i) for i in range(10))
task.set_model_label_enumeration(labels)

output_folder = os.path.join(tempfile.gettempdir(), 'keras_example')

board = TensorBoard(histogram_freq=1,
コード例 #10
0
ファイル: cleanup_service.py プロジェクト: tmankita/trains
import logging
import os
from datetime import datetime
from glob import glob
from shutil import rmtree
from time import sleep, time

from clearml.backend_api.session.client import APIClient

from clearml import Task

# Connecting ClearML with the current process,
# from here on everything is logged automatically
task = Task.init(
    project_name="DevOps",
    task_name="Cleanup Service",
    task_type=Task.TaskTypes.service,
    reuse_last_task_id=False,
)

# set the base docker including the mount point for the file server data
file_server_mount = "/opt/trains/data/fileserver/"
task.set_base_docker("ubuntu:18.04 -v /opt/trains/data/fileserver/:{}".format(
    file_server_mount))

# args for the running task
args = {
    "delete_threshold_days": 30.0,
    "cleanup_period_in_days": 1.0,
    "run_as_service": True,
    "force_delete": False,
}
コード例 #11
0
# ClearML - Example of manual model configuration and uploading
#
import os
from tempfile import gettempdir

from keras import Input, layers, Model

from clearml import Task

# Connecting ClearML with the current process,
# from here on everything is logged automatically
task = Task.init(project_name='examples',
                 task_name='Model configuration and upload')


def get_model():
    """Build and compile a minimal Keras model.

    The network takes a 32-feature input, passes it through a single
    one-unit ``Dense`` layer and is compiled with the ``adam`` optimizer and
    a mean-squared-error loss.

    Returns:
        The compiled Keras ``Model``.
    """
    input_layer = Input(shape=(32, ))
    prediction = layers.Dense(1)(input_layer)

    net = Model(input_layer, prediction)
    net.compile(optimizer='adam', loss='mean_squared_error')
    return net


# create a model
model = get_model()

# Connect a local configuration file
config_file = os.path.join('..', '..', 'reporting', 'data_samples',
                           'sample.json')
config_file = task.connect_configuration(config_file)
コード例 #12
0
ファイル: artifacts.py プロジェクト: groupgithub21/clearml
import os
from time import sleep

import pandas as pd
import numpy as np
from PIL import Image
from clearml import Task

# Connecting ClearML with the current process,
# from here on everything is logged automatically
task = Task.init(project_name='examples', task_name='Artifacts example')

df = pd.DataFrame(
    {
        'num_legs': [2, 4, 8, 0],
        'num_wings': [2, 0, 0, 0],
        'num_specimen_seen': [10, 2, 1, 8]
    },
    index=['falcon', 'dog', 'spider', 'fish'])

# Register Pandas object as artifact to watch
# (it will be monitored in the background and automatically synced and uploaded)
task.register_artifact('train',
                       df,
                       metadata={
                           'counting': 'legs',
                           'max legs': 69
                       })
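# change the artifact object
# NOTE(review): DataFrame.sample() returns a new DataFrame and the result is
# discarded here, so `df` itself appears to be left unchanged - confirm intent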
df.sample(frac=0.5, replace=True, random_state=1)
# or access it from anywhere using the Task's get_registered_artifacts()
コード例 #13
0
ファイル: cifar_ignite.py プロジェクト: tmankita/trains
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from ignite.contrib.handlers import TensorboardLogger
from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator
from ignite.handlers import global_step_from_engine
from ignite.metrics import Accuracy, Loss, Recall
from ignite.utils import setup_logger
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

from clearml import Task, StorageManager

# Connecting ClearML with the current process,
# from here on everything is logged automatically
task = Task.init(project_name='Image Example',
                 task_name='image classification CIFAR10')
params = {
    'number_of_epochs': 20,
    'batch_size': 64,
    'dropout': 0.25,
    'base_lr': 0.001,
    'momentum': 0.9,
    'loss_report': 100
}
params = task.connect(params)  # enabling configuration override by clearml
print(params)  # printing actual configuration (after override in remote mode)

manager = StorageManager()

dataset_path = Path(
    manager.get_local_copy(
コード例 #14
0
def main():
    """Configure and optionally launch the AWS auto-scaler service.

    When executed remotely the module-level ``default_config`` is used as-is.
    Otherwise the user is greeted by the setup wizard: an existing
    configuration file may be loaded (after confirmation), or the wizard is
    run and its answers are written to ``--config-file``; if that file cannot
    be written an error is printed and the function returns.

    The resulting configuration is then connected to a ClearML service Task.
    With ``--remote`` the Task is enqueued on the ``services`` queue; the
    autoscaler is started when running remotely or when ``--run`` is given.
    """
    arg_parser = ArgumentParser()
    arg_parser.add_argument(
        "--run",
        action="store_true",
        default=False,
        help="Run the autoscaler after wizard finished",
    )
    arg_parser.add_argument(
        "--remote",
        action="store_true",
        default=False,
        help="Run the autoscaler as a service, launch on the `services` queue",
    )
    arg_parser.add_argument(
        "--config-file",
        type=Path,
        default=Path("aws_autoscaler.yaml"),
        help="Configuration file name",
    )
    args = arg_parser.parse_args()

    if not running_remotely():
        print("AWS Autoscaler setup wizard\n"
              "---------------------------\n"
              "Follow the wizard to configure your AWS auto-scaler service.\n"
              "Once completed, you will be able to view and change the configuration in the clearml-server web UI.\n"
              "It means there is no need to worry about typos or mistakes :)\n")

        # Only ask about re-using the file when it actually exists.
        reuse_existing_file = args.config_file.exists() and input_bool(
            "Load configurations from config file '{}' [Y/n]? ".format(args.config_file),
            default=True,
        )
        if reuse_existing_file:
            with args.config_file.open("r") as stream:
                conf = yaml.load(stream, Loader=yaml.SafeLoader)
        else:
            wizard_configurations, wizard_hyper_params = run_wizard()
            conf = {
                "hyper_params": wizard_hyper_params,
                "configurations": wizard_configurations,
            }
            # Persist the wizard answers; give up if the file is not writable.
            # noinspection PyBroadException
            try:
                with args.config_file.open("w+") as stream:
                    yaml.safe_dump(conf, stream)
            except Exception:
                print(
                    "Error! Could not write configuration file at: {}".format(
                        args.config_file
                    )
                )
                return
    else:
        conf = default_config

    # Connecting ClearML with the current process,
    # from here on everything is logged automatically
    task = Task.init(project_name="DevOps", task_name="AWS Auto-Scaler", task_type=Task.TaskTypes.service)
    task.connect(conf['hyper_params'])

    # Values already stored on the Task under "General" take precedence over
    # the local ones; the merged result is written back to the Task.
    general_section = conf['configurations']
    stored_general = task.get_configuration_object(name="General") or "{}"
    general_section.update(json.loads(stored_general))
    task.set_configuration_object(name="General", config_text=json.dumps(general_section, indent=2))

    if args.remote or args.run:
        print("Running AWS auto-scaler as a service\nExecution log {}".format(task.get_output_log_web_page()))

    if args.remote:
        # if we are running remotely enqueue this run, and leave the process
        # the clearml-agent services will pick it up and execute it for us.
        task.execute_remotely(queue_name='services')

    aws_driver = AWSDriver.from_config(conf)
    scaler_config = ScalerConfig.from_config(conf)
    autoscaler = AutoScaler(scaler_config, aws_driver)
    if running_remotely() or args.run:
        autoscaler.start()
コード例 #15
0
def training(local_rank, config):
    """Run CIFAR10 training for one distributed process.

    Seeds the process, prepares the output folder and optional ClearML
    tracking on rank 0, builds the data loaders, model, optimizer, trainer and
    two evaluators, registers validation / checkpoint / early-stop handlers
    and finally runs the trainer for ``config["num_epochs"]`` epochs.

    Args:
        local_rank: Index passed to ``torch.cuda.get_device_name`` when the
            device is a CUDA device.
        config: Mapping of run settings. Keys read here include ``seed``,
            ``output_path``, ``stop_iteration``, ``model``, ``with_clearml``,
            ``validate_every``, ``num_epochs`` and the hyper-parameter names
            listed below. The mapping is mutated: ``output_path`` and
            ``cuda device name`` (rank 0 only) and ``num_iters_per_epoch``
            are written into it.

    Raises:
        Exception: any exception raised by ``trainer.run`` is logged and
            re-raised.
    """

    rank = idist.get_rank()
    # Offset the seed by the rank so each process gets a different seed.
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="CIFAR10-Training")

    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        # The run folder is suffixed with a timestamp, or with the stop
        # iteration when one is configured.
        if config["stop_iteration"] is None:
            now = datetime.now().strftime("%Y%m%d-%H%M%S")
        else:
            now = f"stop-on-{config['stop_iteration']}"

        folder_name = f"{config['model']}_backend-{idist.backend()}-{idist.get_world_size()}_{now}"
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info(f"Output path: {config['output_path']}")

        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

        if config["with_clearml"]:
            try:
                from clearml import Task
            except ImportError:
                # Backwards-compatibility for legacy Trains SDK
                from trains import Task

            task = Task.init("CIFAR10-Training", task_name=output_path.stem)
            task.connect_configuration(config)
            # Log hyper parameters
            hyper_params = [
                "model",
                "batch_size",
                "momentum",
                "weight_decay",
                "num_epochs",
                "learning_rate",
                "num_warmup_epochs",
            ]
            task.connect({k: config[k] for k in hyper_params})

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_dataflow(config)

    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler,
                             train_loader.sampler, config, logger)

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "Accuracy": Accuracy(),
        "Loss": Loss(criterion),
    }

    # We define two evaluators as they wont have exactly similar roles:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_evaluator(model, metrics=metrics, config=config)
    train_evaluator = create_evaluator(model, metrics=metrics, config=config)

    def run_validation(engine):
        """Evaluate on the train and test loaders and log both metric sets."""
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train",
                    state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test",
                    state.metrics)

    # Validate every `validate_every` epochs and once more when training completes.
    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config["validate_every"])
        | Events.COMPLETED, run_validation)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path,
                                            trainer,
                                            optimizer,
                                            evaluators=evaluators)

    # Store 2 best models by validation accuracy starting from num_epochs / 2:
    best_model_handler = Checkpoint(
        {"model": model},
        get_save_handler(config),
        filename_prefix="best",
        n_saved=2,
        global_step_transform=global_step_from_engine(trainer),
        score_name="test_accuracy",
        score_function=Checkpoint.get_default_score_fn("Accuracy"),
    )
    # The event filter only lets the checkpoint handler run once the trainer
    # is past half of the configured epochs.
    evaluator.add_event_handler(
        Events.COMPLETED(
            lambda *_: trainer.state.epoch > config["num_epochs"] // 2),
        best_model_handler)

    # In order to check training resuming we can stop training on a given iteration
    if config["stop_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"]))
        def _():
            logger.info(
                f"Stop training on {trainer.state.iteration} iteration")
            trainer.terminate()

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception as e:
        # Record the traceback in the run log before propagating the error.
        logger.exception("")
        raise e

    # `tb_logger` only exists on rank 0 (created above under the same check).
    if rank == 0:
        tb_logger.close()
コード例 #16
0
import os
from tempfile import gettempdir

import numpy as np
from PIL import Image
from torch.utils.tensorboard import SummaryWriter

from clearml import Task

# Connecting ClearML with the current process,
# from here on everything is logged automatically
task = Task.init(project_name='examples',
                 task_name='pytorch tensorboard toy example')

writer = SummaryWriter(log_dir=os.path.join(gettempdir(), 'tensorboard_logs'))

# Load the sample picture. The context manager releases the underlying file
# handle once the pixel data has been converted to a NumPy array.
with Image.open(
        os.path.join("..", "..", "reporting", "data_samples",
                     "picasso.jpg")) as image_open:
    image = np.asarray(image_open)

# convert to 4d [batch, height, width, channels]
# gray: first channel only, kept as a trailing single-channel axis
image_gray = image[:, :, 0][np.newaxis, :, :, np.newaxis]
# rgba: append a constant 255 plane as a fourth channel
image_rgba = np.concatenate(
    (image,
     255 * np.atleast_3d(np.ones(shape=image.shape[:2], dtype=np.uint8))),
    axis=2)
image_rgba = image_rgba[np.newaxis, :, :, :]
image = image[np.newaxis, :, :, :]

# Report the first (only) item of each batch as a height/width/channel image.
writer.add_image("test/first", image[0], dataformats='HWC')
writer.add_image("test_gray/second", image_gray[0], dataformats='HWC')
writer.add_image("test_rgba/third", image_rgba[0], dataformats='HWC')
コード例 #17
0
 def __init__(self):
     """Create the ClearML task and the object factory used by this instance.

     Initialises a task in the ``lie-pose-net`` project, builds a
     ``UniversalFactory`` over the pose-network and criterion classes, and
     leaves ``_scene`` unset (``None``).
     """
     self._task = Task.init(project_name="lie-pose-net",
                            task_name="LiePoseNet on local machine")
     # Classes the factory is constructed with.
     self._factory = UniversalFactory(
         [PoseNet, PoseNetCriterion, SE3Criterion, SimpleSE3Criterion])
     self._scene = None
コード例 #18
0
    "Modify config options by adding 'KEY VALUE' pairs at the end of the command. "
    "See config references at "
    "https://detectron2.readthedocs.io/modules/config.html#config-references",
    default=None,
    nargs=argparse.REMAINDER,
)
args = parser.parse_args()

print("Command Line Args:", args)
"""
Clearml
"""
if not args.noclearml:
    # task = Task.init(project_name='persdet2',task_name='Train',task_type='training', output_uri='s3://192.168.56.253:9000/models/snapshots/')
    task = Task.init(project_name=CLEARML_PROJECT_NAME,
                     task_name=args.clearml_task_name,
                     task_type=args.clearml_task_type)
    task.set_base_docker(
        "harbor.io/custom/detectron2:v3 --env GIT_SSL_NO_VERIFY=true --env TRAINS_AGENT_GIT_USER=testuser --env TRAINS_AGENT_GIT_PASS=testuser"
    )
    task.execute_remotely(queue_name="gpu", exit_process=True)
'''
S3 downloading
'''
import boto3
from botocore.client import Config
import tarfile
s3 = boto3.resource('s3',
                    endpoint_url='http://192.168.56.253:9000/',
                    aws_access_key_id='lingevan',
                    aws_secret_access_key=args.awskey,
コード例 #19
0
from clearml import Task, Logger
# Register this run with ClearML before the heavy imports below.
task = Task.init(
    project_name='DETECTRON2',
    task_name='Default Model Architecture',
    task_type='training',
    output_uri='http://jax79sg.hopto.org:9000/clearml-models/artifact',
)

# Docker image followed by the extra container arguments; the adjacent
# literals are joined into one string at compile time.
task.set_base_docker(
    "quay.io/jax79sg/detectron2:v4"
    " --env GIT_SSL_NO_VERIFY=true"
    " --env TRAINS_AGENT_GIT_USER=testuser"
    " --env TRAINS_AGENT_GIT_PASS=testuser"
)

# Hand the task to the "single_gpu" queue and leave the local process.
task.execute_remotely(exit_process=True, queue_name="single_gpu")


import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()

# import some common libraries
import numpy as np
import os, json, cv2, random
import boto3
import argparse
# import some common detectron2 utilities
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog, DatasetCatalog
from detectron2.structures import BoxMode
from detectron2.engine import DefaultTrainer
from botocore.client import Config

def download_s3_folder(bucket_name, s3_folder, local_dir=None):
    bucket = s3.Bucket(bucket_name)
    for obj in bucket.objects.filter(Prefix=s3_folder):
        target = obj.key if local_dir is None \
                else os.path.join(local_dir, os.path.relpath(obj.key, s3_folder))
        if not os.path.exists(os.path.dirname(target)):
コード例 #20
0
# ClearML - Example of manual model reporting
from clearml import Task, OutputModel

# Attach the current process to ClearML.
task = Task.init(
    task_name="Model reporting example",
    project_name="examples",
)

# Output model bound to the task created above.
output_model = OutputModel(task=task)

# Label name -> integer id mapping attached to the model.
labels = dict(background=0, cat=1, dog=2)
output_model.update_labels(labels)

# Location of the already-uploaded weights file (pieces are joined verbatim).
model_url = (
    "https://allegro-examples.s3.amazonaws.com/clearml-public-resources/v1.0/"
    "clearml-examples-open/newexamples/examples/"
    "pytorch%20lightning%20mnist%20example.fb969db720e241e5859d522aa5226b81/"
    "models/training.pt"
)

# Register the weights by URI; the labels set above stay attached to the model.
output_model.update_weights(register_uri=model_url)
コード例 #21
0
ファイル: 04_more_interfaces.py プロジェクト: abiller/events
    values = default_values.copy() if train_dataset_id is None \
        else get_normalization_info(train_dataset_id)

    values.update({"p": 1.0})
    return albumentations.Normalize(**values)


if __name__ == "__main__":
    # force colab to get dataclasses
    Task.add_requirements('dataclasses')
    # override numpy version for colab
    Task.add_requirements('numpy', '1.19.5')
    # Track everything on ClearML Free
    task = Task.init(
        project_name='R|D?R&D! Webinar 01',
        task_name='remove all hardcoded',
        output_uri=True,  # auto save everything to Clearml Free
    )

    cfg = FlowerTrainingConfig()
    aug_cfg = AugConfig()  # <---
    task.connect(cfg, 'config')
    task.connect(aug_cfg, 'augmentation_config')  # <---

    # Need to run on cpu only?
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if device == "cpu":
        warnings.warn('GPU not available!, using CPU mode')
        warnings.filterwarnings("ignore", module='torch.cuda.amp.autocast')

    # factored out augmentations # <---
コード例 #22
0
from clearml import Task, Logger
# Register this run with ClearML before the heavy imports below.
task = Task.init(
    project_name='DETECTRON2',
    task_name='Default Model Architecture',
    task_type='training',
    output_uri='http://mlops.sytes.net:9000/digitalhub/clearml-models/',
)

# Docker image followed by the extra container arguments, passed as ONE
# string. The SSL_CERT_DIR option previously sat outside the closing quote,
# which made this line a syntax error; it is now part of the string.
# NOTE(review): git credentials are embedded here in plain text.
task.set_base_docker(
    "quay.io/jax79sg/detectron2:v4"
    " --env GIT_SSL_NO_VERIFY=true"
    " --env TRAINS_AGENT_GIT_USER=testuser"
    " --env TRAINS_AGENT_GIT_PASS=testuser"
    " --env SSL_CERT_DIR=/usr/share/ca-certificates/extra/ca.dsta.ai.crt"
)

# Hand the task to the "1gpu" queue and leave the local process.
task.execute_remotely(queue_name="1gpu", exit_process=True)


import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()

# import some common libraries
import numpy as np
import os, json, cv2, random
import boto3
import argparse
# import some common detectron2 utilities
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog, DatasetCatalog
from detectron2.structures import BoxMode
from detectron2.engine import DefaultTrainer
from botocore.client import Config

def download_s3_folder(bucket_name, s3_folder, local_dir=None):
    bucket = s3.Bucket(bucket_name)
    for obj in bucket.objects.filter(Prefix=s3_folder):
        target = obj.key if local_dir is None \
                else os.path.join(local_dir, os.path.relpath(obj.key, s3_folder))
        if not os.path.exists(os.path.dirname(target)):
コード例 #23
0
import xgboost as xgb
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from clearml import Task

# Register the run with ClearML.
task = Task.init(
    task_name="XGBoost metric auto reporting",
    project_name="examples",
)

# Iris features / targets, split 80/20 with a fixed seed.
X, y = load_iris(return_X_y=True)
split = train_test_split(X, y, random_state=100, test_size=0.2)
X_train, X_test, y_train, y_test = split

# Wrap both partitions in XGBoost's matrix type.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

params = dict(objective="reg:squarederror", eval_metric="rmse")

# Datasets evaluated after every boosting round, with their display names.
watchlist = [(dtrain, "train"), (dtest, "test")]

bst = xgb.train(
    params,
    dtrain,
    evals=watchlist,
    num_boost_round=100,
    verbose_eval=0,
)

# Persist the trained booster to the working directory.
bst.save_model("best_model")
コード例 #24
0
ファイル: tensorflow_mnist.py プロジェクト: tmankita/trains
from __future__ import absolute_import, division, print_function, unicode_literals

import os
from tempfile import gettempdir

import tensorflow as tf

from tensorflow.keras.layers import Dense, Flatten, Conv2D
from tensorflow.keras import Model

from clearml import Task

# Connecting ClearML with the current process,
# from here on everything is logged automatically
task = Task.init(project_name='examples',
                 task_name='Tensorflow v2 mnist with summaries')

# Load and prepare the MNIST dataset.
mnist = tf.keras.datasets.mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()
# Rescale by 1/255 (presumably maps 8-bit pixel values into [0, 1])
x_train, x_test = x_train / 255.0, x_test / 255.0

# Add a channels dimension (new trailing axis) and cast to float32
x_train = x_train[..., tf.newaxis].astype('float32')
x_test = x_test[..., tf.newaxis].astype('float32')

# Use tf.data to batch and shuffle the dataset
# (only the training set is shuffled; both use batches of 32)
train_ds = tf.data.Dataset.from_tensor_slices(
    (x_train, y_train)).shuffle(10000).batch(32)
test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(32)
コード例 #25
0
ファイル: extract_corpora.py プロジェクト: sillsdev/silnlp
def main() -> None:
    """Command-line entry point: extract text corpora from Paratext projects.

    Parses the project names and options, optionally registers the run in
    ClearML, extracts every requested project whose directory exists under
    ``SIL_NLP_ENV.pt_projects_dir``, and logs a warning for each project that
    could not be found.
    """
    parser = argparse.ArgumentParser(
        description="Extracts text corpora from Paratext projects")
    parser.add_argument("projects",
                        nargs="+",
                        metavar="name",
                        help="Paratext project")

    # Book filters: each takes one or more book identifiers.
    book_filters = (
        ("--include", "The books to include; e.g., 'NT', 'OT', 'GEN'"),
        ("--exclude", "The books to exclude; e.g., 'NT', 'OT', 'GEN'"),
    )
    for option, description in book_filters:
        parser.add_argument(option,
                            metavar="books",
                            nargs="+",
                            default=[],
                            help=description)

    # Plain on/off switches, all disabled unless given.
    switches = (
        ("--markers", "Include USFM markers"),
        ("--lemmas", "Extract lemmas if available"),
        ("--project-vrefs", "Extract project verse refs"),
        ("--clearml", "Register Extraction in ClearML"),
    )
    for option, description in switches:
        parser.add_argument(option,
                            default=False,
                            action="store_true",
                            help=description)

    args = parser.parse_args()

    projects: Set[str] = set(args.projects)

    if args.clearml:
        # Imported lazily so ClearML is only needed when tracking is requested.
        import datetime

        from clearml import Task

        task_name = str(args.projects) + "_" + str(datetime.datetime.now())
        Task.init(project_name="LangTech_ExtractCorpora", task_name=task_name)

    # Keep only the projects that have a directory we can read from.
    projects_found: Set[str] = {
        project for project in projects
        if (SIL_NLP_ENV.pt_projects_dir / project).is_dir()
    }

    if not projects_found:
        LOGGER.warning(
            f"Couldn't find any data to process for any project in {SIL_NLP_ENV.pt_projects_dir}."
        )
    else:
        expected_verse_count = get_expected_verse_count(
            args.include, args.exclude)
        for output_dir in (SIL_NLP_ENV.mt_scripture_dir,
                           SIL_NLP_ENV.mt_terms_dir):
            output_dir.mkdir(exist_ok=True, parents=True)

        for project in projects_found:
            LOGGER.info(f"Extracting {project}...")
            project_dir = get_project_dir(project)
            corpus_filename, verse_count = extract_project(
                project_dir,
                SIL_NLP_ENV.mt_scripture_dir,
                args.include,
                args.exclude,
                args.markers,
                args.lemmas,
                args.project_vrefs,
            )
            # The extracted verse count must match the expected total.
            LOGGER.info(f"# of Verses: {verse_count}")
            if verse_count != expected_verse_count:
                LOGGER.error(
                    f"The number of verses is {verse_count}, but should be {expected_verse_count}."
                )
            terms_count = extract_term_renderings(project_dir, corpus_filename,
                                                  SIL_NLP_ENV.mt_terms_dir)
            LOGGER.info(f"# of Terms: {terms_count}")
            LOGGER.info("Done.")

    # Report every requested project that was not found.
    missing = [project for project in projects if project not in projects_found]
    for project in missing:
        LOGGER.warning(
            f"Couldn't find project {project} in {SIL_NLP_ENV.pt_projects_dir}."
        )
コード例 #26
0
ファイル: subprocess_example.py プロジェクト: tmankita/trains
    parser.set_defaults(subprocess=True)
    # this argument we will not be logging, see below Task.init
    parser.add_argument('--counter',
                        help='integer value',
                        type=int,
                        default=-1)

    args = parser.parse_args()
    print(os.getpid(), 'ARGS:', args)

    # We have to initialize the task in the master process,
    # it will make sure that any sub-process calling Task.init will get the master task object
    # notice that we exclude the `counter` argument, so we can launch multiple sub-processes with clearml-agent
    # otherwise, the `counter` will always be set to the original value.
    task = Task.init('examples',
                     'Popen example',
                     auto_connect_arg_parser={'counter': False})

    # we can connect multiple dictionaries, each from different process, as long as the keys have different names
    param = {
        'args_{}'.format(args.num_workers):
        'some value {}'.format(args.num_workers)
    }
    task.connect(param)

    # check if we need to start the process, meaning counter is negative
    counter = args.num_workers if args.counter < 0 else args.counter

    p = None
    # launch sub-process, every subprocess will launch the next in the chain, until we launch them all.
    # We could also launch all of them here, but that would have been too simple for us :)
コード例 #27
0
def main():
    """Evaluate an unpruned TLT model as a ClearML task.

    Steps:
      1. Create the ClearML task and parse the command line.
      2. If no model file is given but a training task id is, fetch that
         task's "unpruned_weights" artifact and inject it into ``sys.argv``
         as ``-m`` before the arguments are parsed.
      3. Register the configuration files with the task, convert the dataset,
         and export the tfrecords.
      4. When running remotely, copy the weights artifact to the path stored
         in the task's ``Args/model_file`` parameter.
      5. Run the evaluation.

    Raises:
        OSError: if the target directory cannot be created or the weights
            cannot be copied (previously these failures were ignored by the
            shell calls and only surfaced later, during evaluation).
    """
    # Function-scope imports keep this block self-contained.
    import shutil
    import subprocess

    task = Task.init(project_name="TLT3", task_name="TLT eval")
    parser = ArgumentParser()

    parser.add_argument(
        "-a",
        "--arch",
        help="Architecture",
        default="classification",
        choices=[
            "classification",
            "detectnet_v2",
            "ssd",
            "dssd",
            "yolo",
            "faster_rcnn",
            "retinanet",
            "mask_rcnn",
        ],
    )
    parser.add_argument(
        "-e", "--experiment_spec_file", help="Path to configuration file", required=True
    )

    parser.add_argument(
        "-t",
        "--train-task",
        help="The training task id",
        required=True,
    )

    parser.add_argument(
        "--dataset-export-spec",
        help="Path to the detection dataset spec containing the config for exporting .tfrecord files",
        required=True,
    )

    parser.add_argument(
        "-d",
        "--dataset-task",
        help="The task id with dataset as artifact. Artifact name should be 'dataset'",
    )

    parser.add_argument(
        "-k",
        "--key",
        default=None,
        type=str,
        help="The key to load pretrained weights and save intermediate snapshopts and final model. "
             "If not provided, an OS environment named 'KEY' must be set.",
    )

    # Peek at the raw argv: when no model file was passed, take the value that
    # follows the first "-t"/"--train-task" option as the training task id.
    cmd_train_task = None
    if "-m" not in sys.argv and "--model_file" not in sys.argv:
        for option, value in zip(sys.argv, sys.argv[1:]):
            if option in ("-t", "--train-task"):
                cmd_train_task = value
                break

    if cmd_train_task:
        weights_task = Task.get_task(task_id=cmd_train_task)
        unpruned_weights = weights_task.artifacts["unpruned_weights"].get()
        # Inject the resolved path so it is parsed like a user-supplied "-m".
        sys.argv.extend(["-m", str(unpruned_weights)])
    parser.add_argument(
        "-m", "--model_file",
        default=str(unpruned_weights) if cmd_train_task else None,
        type=str,
    )
    args = parser.parse_args()
    config_file = args.experiment_spec_file
    train_task = args.train_task
    dataset_export_spec = args.dataset_export_spec

    task.set_base_docker("nvcr.io/nvidia/tlt-streamanalytics:v3.0-dp-py3")
    config_file = task.connect_configuration(config_file, name="config file")
    get_converted_data(args.dataset_task, config_file)
    dataset_export_spec = task.connect_configuration(
        dataset_export_spec, name="dataset export spec"
    )
    kitti_to_tfrecord(dataset_export_spec, config_file)

    if train_task and running_remotely():
        unpruned_weights = Task.get_task(task_id=train_task).artifacts["unpruned_weights"].get()
        weights_path = str(unpruned_weights)

        # Diagnostic listing of the artifact directory. Best effort: an
        # argument list (no shell) is used and a non-zero exit is ignored.
        subprocess.run(["ls", os.path.dirname(weights_path) or "."], check=False)

        # Place the weights where the task's recorded "model_file" points.
        model_file = task.get_parameters_as_dict()["Args"]["model_file"]
        model_dir = os.path.dirname(model_file)
        if model_dir:
            os.makedirs(model_dir, exist_ok=True)
        shutil.copy(weights_path, model_file)
    eval_unpruned()
コード例 #28
0
from clearml import Task
from clearml.automation.controller import PipelineController

# Controller task that owns the pipeline; a fresh task is created on every
# run because reuse_last_task_id is False.
task = Task.init(project_name='mushrooms',
                 task_name='Model creation mushrooms',
                 task_type=Task.TaskTypes.controller,
                 reuse_last_task_id=False)
# Parameters registered with the task; the step queue is read from here below.
args = {
    'worker_queue': 'default',
}
task.connect(args)
# Switch to remote execution. No queue_name is passed here -- see the ClearML
# documentation for what execute_remotely() does in that case.
task.execute_remotely()

pipe = PipelineController(default_execution_queue='default',
                          add_pipeline_tags=False)

# Step 1: dataset stage, based on an existing task looked up by project/name.
pipe.add_step(name='stage_data',
              base_task_project='mushrooms',
              base_task_name='mushrooms step 1 dataset artifact',
              execution_queue=args["worker_queue"])
# Step 2: training stage; declared as a child of 'stage_data' and given that
# step's task id through the 'General/stage_data_task_id' parameter.
pipe.add_step(
    name='stage_train',
    parents=[
        'stage_data',
    ],
    base_task_project='mushrooms',
    base_task_name='mushrooms step 2 train model',
    parameter_override={'General/stage_data_task_id': '${stage_data.id}'},
    execution_queue=args["worker_queue"])

# Launch the pipeline.
pipe.start()
コード例 #29
0
# ClearML - Example of Matplotlib and Seaborn integration and reporting
#
import matplotlib

matplotlib.use('agg')  # use agg instead of tkinter
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from clearml import Task

task = Task.init(
    task_name='Matplotlib example by Harry',
    project_name='examples',
)

# Random scatter data: positions, colours, then marker sizes.
N = 50
x = np.random.rand(N)
y = np.random.rand(N)
colors = np.random.rand(N)
diameters = 30 * np.random.rand(N)
area = diameters**2  # 0 to 15 point radii

# Keyword arguments shared by both scatter calls.
scatter_style = dict(c=colors, alpha=0.5)

plt.scatter(x, y, s=area, **scatter_style)
# Shown without an explicit report call.
plt.show()

# Second figure, reported explicitly with a title, series and iteration number.
diameters = 40 * np.random.rand(N)
area = diameters**2
plt.scatter(x, y, s=area, **scatter_style)
task.logger.report_matplotlib_figure(
    figure=plt,
    title="My Plot Title",
    series="My Plot Series",
    iteration=10,
)
plt.show()
コード例 #30
0
        use_dropout=True,
        dropout_rate=0.1,
        use_image_features=False,
        use_likes=False,
    )


@dataclass
class MyFeatureConfig:
    """Configuration values for the new feature."""

    # the word size
    word_size: int = 128


# Build the parser and parse the command line at import time (this runs
# before the __main__ guard below). The HyperParameters fields are exposed
# under the "hparams" destination.
parser = ArgumentParser()
parser.add_arguments(HyperParameters, dest="hparams")
args = parser.parse_args()

if __name__ == '__main__':
    # Automatic argparse hookup is switched off; the parser is connected
    # explicitly below. A new task is created on every run.
    task = Task.init(project_name='simple_parse',
                     task_name='nested using simple-parsing',
                     auto_connect_arg_parser=False,
                     reuse_last_task_id=False)

    # Attach the parser to the task under the name 'command line'.
    task.connect(parser,name='command line')
    # NOTE(review): the class itself (not an instance) is connected here and
    # the returned value is never used afterwards -- confirm this is intended.
    extra_args = task.connect(MyFeatureConfig, name='my_feature1')

    hparams: HyperParameters = args.hparams
    # NOTE(review): bare annotation with no assignment, so no variable is
    # created; possibly meant to receive `extra_args` -- confirm.
    my_feature_conf : MyFeatureConfig
    print(hparams)
    task.close()