'--task-id', type=str, required=True) parser.add_argument('--execution-queue', '-q', type=str, default='rtx2080ti') parser.add_argument('--run-as-service', '--service', action="store_true") parser.add_argument('--no-reuse-last-task-id', dest='reuse_id', action="store_false", default=True) args = parser.parse_args() task = Task.init(project_name='language-model-hp', task_name=f'{args.model}', task_type=Task.TaskTypes.optimizer, reuse_last_task_id=args.reuse_id) task.connect(args) optimizer = HyperParameterOptimizer( base_task_id=args. template_task_id, # This is the experiment we want to optimize # here we define the hyper-parameters to optimize hyper_parameters=hyper_parameters[args.model], # setting the objective metric we want to maximize/minimize objective_metric_title='val_ppl', objective_metric_series='val_ppl', objective_metric_sign='min', # maximize or minimize the objective metric # setting optimizer - clearml supports GridSearch, RandomSearch, OptimizerBOHB and OptimizerOptuna optimizer_class=OptimizerOptuna, # Configuring optimization parameters
max_pixel_value=255.0, ) values = default_values.copy() if norm_setting is None \ else norm_setting.copy() values.update({"p": 1.0}) return albumentations.Normalize(**values) if __name__ == "__main__": # force colab to get dataclasses Task.add_requirements('dataclasses', '0.4') # override numpy version for colab Task.add_requirements('numpy', '1.19.5') # Track everything on ClearML Free task = Task.init(project_name='R|D?R&D! Webinar 01', task_name='Full integration', output_uri=True, # auto save everything to Clearml Free ) # Need to run on cpu only? device = "cuda" if torch.cuda.is_available() else "cpu" if device == "cpu": warnings.warn('GPU not available!, using CPU mode') warnings.filterwarnings("ignore", module='torch.cuda.amp.autocast') # configs cfg = FlowerTrainingConfig() aug_cfg = AugConfig() task.connect(cfg, 'config') task.connect(aug_cfg, 'augmentation_config') # default model config task.set_model_config(config_dict=asdict(ModelConfig()))
def main():
    """Train and evaluate the MNIST CNN from a Pachyderm pipeline, tracked by ClearML.

    The ClearML project name is read from the ``PPS_PIPELINE_NAME``
    environment variable (the literal string ``'None'`` when it is unset).
    Hyper-parameters, the dataset location and the output location are taken
    from the command line. After training, the model ``state_dict`` is written
    to ``<save-location>/mnist_cnn.pt`` unless ``--no-save-model`` is given.
    """
    pipeline_name = str(os.getenv('PPS_PIPELINE_NAME', 'None'))
    print("Pachyderm pipeline")

    # Connecting ClearML with the current process,
    # from here on everything is logged automatically
    task = Task.init(project_name=pipeline_name,
                     task_name='Pachyderm PyTorch MNIST Train')

    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=True,
                        help='For Saving the current Model')
    # '--save-model' already defaults to True, so by itself it can never be
    # turned off; this companion flag makes the option actually controllable.
    parser.add_argument('--no-save-model', dest='save_model', action='store_false',
                        help='Do not save the current Model')
    parser.add_argument('--save-location', type=str, default='./',
                        help='For Saving the current Model')
    parser.add_argument('--data-location', type=str,
                        default=os.path.join('..', 'data'),
                        help='For loading the dataset')
    args = parser.parse_args()

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")

    # Extra DataLoader options are only passed when running on the GPU
    kwargs = {"num_workers": 4, "pin_memory": True} if use_cuda else {}

    # Both splits use the same tensor conversion + normalization
    mnist_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307, ), (0.3081, )),
    ])
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(args.data_location, train=True, download=True,
                       transform=mnist_transform),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(args.data_location, train=False,
                       transform=mnist_transform),
        batch_size=args.test_batch_size, shuffle=True, **kwargs)

    model = Net().to(device)
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(args, model, device, test_loader, epoch)

    if args.save_model:
        # torch.save does not create missing directories
        if args.save_location:
            os.makedirs(args.save_location, exist_ok=True)
        torch.save(model.state_dict(),
                   os.path.join(args.save_location, "mnist_cnn.pt"))
def main():
    """Parse the command line and run the ClearML -> Slack alerting service.

    Requires a Slack API token (``--slack_api`` or the ``SLACK_API_TOKEN``
    environment variable) and a target channel (``--channel``); the process
    exits with status 1 when either is missing. Unless ``--local`` is given,
    the task is enqueued on ``--service_queue`` for remote execution.
    """
    print('ClearML experiment monitor Slack service\n')

    # Slack Monitor arguments
    parser = argparse.ArgumentParser(
        description='ClearML monitor experiments and post Slack Alerts')
    parser.add_argument('--channel', type=str,
                        help='Set the channel to post the Slack alerts')
    parser.add_argument('--slack_api', type=str,
                        default=os.environ.get('SLACK_API_TOKEN', None),
                        help='Slack API key for sending messages')
    parser.add_argument(
        '--message_prefix', type=str,
        help='Add message prefix (For example, to alert all channel members use: "Hey <!here>,")')
    parser.add_argument(
        '--project', type=str, default='',
        help='The name (or partial name) of the project to monitor, use empty for all projects')
    parser.add_argument(
        '--min_num_iterations', type=int, default=0,
        help='Minimum number of iterations of failed/completed experiment to alert. '
             'This will help eliminate unnecessary debug sessions that crashed right after starting '
             '(default:0 alert on all)')
    parser.add_argument(
        '--include_manual_experiments', action="store_true", default=False,
        help='Include experiments running manually (i.e. not by clearml-agent)')
    parser.add_argument(
        '--include_completed_experiments', action="store_true", default=False,
        help='Include completed experiments (i.e. not just failed experiments)')
    parser.add_argument(
        '--refresh_rate', type=float, default=10.,
        help='Set refresh rate of the monitoring service, default every 10.0 sec')
    # The original help text was missing its closing parenthesis
    parser.add_argument(
        '--service_queue', type=str, default='services',
        help='Queue name to use when running as a service (default: \'services\')')
    parser.add_argument(
        '--local', action="store_true", default=False,
        help='Run service locally instead of as a service '
             '(Default: Automatically launch itself on the services queue)')
    args = parser.parse_args()

    # Both the token and the channel are mandatory; bail out early otherwise.
    # SystemExit is raised directly because the `exit` helper is injected by
    # the `site` module and is not guaranteed to exist.
    if not args.slack_api:
        print('Slack API key was not provided, please run with --slack_api <KEY>')
        raise SystemExit(1)

    if not args.channel:
        print('Slack channel was not provided, please run with --channel <channel_name>')
        raise SystemExit(1)

    # create the slack monitoring object
    slack_monitor = SlackMonitor(slack_api_token=args.slack_api,
                                 channel=args.channel,
                                 message_prefix=args.message_prefix)

    # configure the monitoring filters
    slack_monitor.min_num_iterations = args.min_num_iterations
    slack_monitor.include_manual_experiments = args.include_manual_experiments
    if args.project:
        slack_monitor.set_projects(project_names_re=[args.project])
    if args.include_completed_experiments:
        slack_monitor.status_alerts += ["completed"]

    # start the monitoring Task
    # Connecting ClearML with the current process,
    # from here on everything is logged automatically
    task = Task.init(project_name='Monitoring', task_name='Slack Alerts',
                     task_type=Task.TaskTypes.monitor)
    if not args.local:
        # Hand the task over to the service queue.
        # NOTE(review): per the ClearML docs the local process is expected to
        # terminate inside this call, so the code below would then only run
        # on the agent (or locally with --local) - confirm against the
        # installed clearml version.
        task.execute_remotely(queue_name=args.service_queue)

    print('\nStarting monitoring service\nProject: "{}"\nRefresh rate: {}s\n'.
          format(args.project or 'all', args.refresh_rate))

    # Let everyone know we are up and running
    start_message = \
        '{}Allegro ClearML Slack monitoring service started\nMonitoring project \'{}\''.format(
            (args.message_prefix + ' ') if args.message_prefix else '',
            args.project or 'all')
    slack_monitor.post_message(start_message)

    # Start the monitor service, this function will never end
    slack_monitor.monitor(pool_period=args.refresh_rate)
import os
from tempfile import gettempdir

import numpy as np
from PIL import Image
from torch.utils.tensorboard import SummaryWriter

from clearml import Task

# Connecting ClearML with the current process,
# from here on everything is logged automatically
task = Task.init(project_name='examples',
                 task_name='PyTorch TensorBoard toy example')

# TensorBoard event files go under the system temp directory
writer = SummaryWriter(log_dir=os.path.join(gettempdir(), 'tensorboard_logs'))

# Load the sample picture as a numpy array
image_open = Image.open(
    os.path.join("..", "..", "reporting", "data_samples", "picasso.jpg"))
image = np.asarray(image_open)

# First channel only, with a leading batch axis and a trailing channel axis
image_gray = image[:, :, 0][np.newaxis, :, :, np.newaxis]

# Append a constant 255 plane as an extra channel, then add the batch axis
image_rgba = np.concatenate(
    (image,
     255 * np.atleast_3d(np.ones(shape=image.shape[:2], dtype=np.uint8))),
    axis=2)[np.newaxis, :, :, :]

# Finally give the original picture its own leading batch axis
image = image[np.newaxis, :, :, :]

# Report the first (only) item of every batch as a height/width/channel image
writer.add_image("test/first", image[0], dataformats='HWC')
writer.add_image("test_gray/second", image_gray[0], dataformats='HWC')
writer.add_image("test_rgba/third", image_rgba[0], dataformats='HWC')
def main():
    """Train the MNIST network and report to tensorboardX, tracked by ClearML.

    Parses the training hyper-parameters from the command line, trains for
    ``--epochs`` epochs (saving the full model to the system temp directory
    after every epoch) and finally evaluates on the MNIST test split.
    """
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    # The help text previously advertised a default of 10 although the
    # actual default is 2
    parser.add_argument('--epochs', type=int, default=2, metavar='N',
                        help='number of epochs to train (default: 2)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    # Connecting ClearML with the current process,
    # from here on everything is logged automatically
    task = Task.init(project_name='examples',
                     task_name='PyTorch with tensorboardX')

    writer = SummaryWriter('runs')
    writer.add_text('TEXT', 'This is some text', 0)

    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    # Extra DataLoader options are only passed when running on the GPU
    kwargs = {'num_workers': 4, 'pin_memory': True} if args.cuda else {}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307, ), (0.3081, ))
                       ])),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307, ), (0.3081, ))
                       ])),
        batch_size=args.test_batch_size, shuffle=True, **kwargs)

    model = Net()
    if args.cuda:
        model.cuda()
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    for epoch in range(1, args.epochs + 1):
        train(model, epoch, train_loader, args, optimizer, writer)
        # keep a snapshot of the whole model after every epoch
        torch.save(model, os.path.join(gettempdir(), 'model{}'.format(epoch)))
    test(model, test_loader, args, optimizer, writer)
from clearml import Task
from time import sleep

# Initialize the Task Pipe's first Task used to start the Task Pipe
task = Task.init(
    "examples",
    "Simple Controller Task",
    task_type=Task.TaskTypes.controller,
)

# Connect an (initially empty) hyper-parameter dictionary to the task;
# the entries are filled in on the object returned by connect()
param = task.connect(dict())

# In this example we pass next task's name as a parameter
param["next_task_name"] = "Toy Base Task"
# This is a parameter name in the next task we want to change
param["param_name"] = "Example_Param"
# This is the parameter value in the next task we want to change
param["param_name_new_value"] = 3
# The queue where we want the template task (clone) to be sent to
param["execution_queue_name"] = "default"

# Simulate the work of a Task
print("Processing....")
sleep(2.0)
print("Done processing :)")

# Get a reference to the task to pipe to.
next_task = Task.get_task(
    project_name=task.get_project_name(),
    task_name=param["next_task_name"],
)
# ClearML - example code, ArgumentParser parameter logging and dictionary parameter logging # from __future__ import absolute_import from __future__ import division from __future__ import print_function import sys from argparse import ArgumentParser from clearml import Task # Connecting ClearML with the current process, # from here on everything is logged automatically task = Task.init(project_name='examples', task_name='hyper-parameters example') parameters = { 'list': [1, 2, 3], 'dict': {'a': 1, 'b': 2}, 'tuple': (1, 2, 3), 'int': 3, 'float': 2.2, 'string': 'my string', } parameters = task.connect(parameters) # adding new parameter after connect (will be logged as well) parameters['new_param'] = 'this is new' # changing the value of a parameter (new value will be stored instead of previous one) parameters['float'] = '9.9'
model.add(Dense(10)) model.add(Activation('softmax')) model2 = Sequential() model2.add(Dense(512, input_shape=(784, ))) model2.add(Activation('relu')) model.summary() model.compile(loss='categorical_crossentropy', optimizer=RMSprop(), metrics=['accuracy']) # Connecting ClearML with the current process, # from here on everything is logged automatically task = Task.init(project_name='examples', task_name='Keras with TensorBoard example') task.connect_configuration({ 'test': 1337, 'nested': { 'key': 'value', 'number': 1 } }) # Advanced: setting model class enumeration labels = dict(('digit_%d' % i, i) for i in range(10)) task.set_model_label_enumeration(labels) output_folder = os.path.join(tempfile.gettempdir(), 'keras_example') board = TensorBoard(histogram_freq=1,
import logging
import os
from datetime import datetime
from glob import glob
from shutil import rmtree
from time import sleep, time

from clearml.backend_api.session.client import APIClient
from clearml import Task

# Connecting ClearML with the current process,
# from here on everything is logged automatically
task = Task.init(project_name="DevOps",
                 task_name="Cleanup Service",
                 task_type=Task.TaskTypes.service,
                 reuse_last_task_id=False)

# set the base docker including the mount point for the file server data
file_server_mount = "/opt/trains/data/fileserver/"
task.set_base_docker(
    "ubuntu:18.04 -v /opt/trains/data/fileserver/:{}".format(file_server_mount)
)

# args for the running task
args = dict(
    delete_threshold_days=30.0,
    cleanup_period_in_days=1.0,
    run_as_service=True,
    force_delete=False,
)
# ClearML - Example of manual model configuration and uploading
#
import os
from tempfile import gettempdir

from keras import Input, layers, Model

from clearml import Task

# Connecting ClearML with the current process,
# from here on everything is logged automatically
task = Task.init(project_name='examples',
                 task_name='Model configuration and upload')


def get_model():
    """Build and compile a single Dense(1) layer model over a 32-wide input."""
    net_input = Input(shape=(32, ))
    net = Model(net_input, layers.Dense(1)(net_input))
    net.compile(optimizer='adam', loss='mean_squared_error')
    return net


# create a model
model = get_model()

# Connect a local configuration file; the name is rebound to whatever
# connect_configuration() hands back
config_file = task.connect_configuration(
    os.path.join('..', '..', 'reporting', 'data_samples', 'sample.json'))
import os
from time import sleep

import pandas as pd
import numpy as np
from PIL import Image

from clearml import Task

# Connecting ClearML with the current process,
# from here on everything is logged automatically
task = Task.init(project_name='examples', task_name='Artifacts example')

# A small table of animals, indexed by animal name
df = pd.DataFrame(
    data={
        'num_legs': [2, 4, 8, 0],
        'num_wings': [2, 0, 0, 0],
        'num_specimen_seen': [10, 2, 1, 8],
    },
    index=['falcon', 'dog', 'spider', 'fish'],
)

# Register Pandas object as artifact to watch
# (it will be monitored in the background and automatically synced and uploaded)
task.register_artifact(
    'train',
    df,
    metadata={'counting': 'legs', 'max legs': 69},
)

# change the artifact object
# NOTE(review): DataFrame.sample returns a new frame and the result is
# discarded here, so `df` itself is left untouched by this call.
df.sample(frac=0.5, replace=True, random_state=1)
# or access it from anywhere using the Task's get_registered_artifacts()
import torch.optim as optim import torchvision.datasets as datasets import torchvision.transforms as transforms from ignite.contrib.handlers import TensorboardLogger from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator from ignite.handlers import global_step_from_engine from ignite.metrics import Accuracy, Loss, Recall from ignite.utils import setup_logger from torch.utils.tensorboard import SummaryWriter from tqdm import tqdm from clearml import Task, StorageManager # Connecting ClearML with the current process, # from here on everything is logged automatically task = Task.init(project_name='Image Example', task_name='image classification CIFAR10') params = { 'number_of_epochs': 20, 'batch_size': 64, 'dropout': 0.25, 'base_lr': 0.001, 'momentum': 0.9, 'loss_report': 100 } params = task.connect(params) # enabling configuration override by clearml print(params) # printing actual configuration (after override in remote mode) manager = StorageManager() dataset_path = Path( manager.get_local_copy(
def main():
    """Configure (wizard or YAML file) and optionally launch the AWS auto-scaler.

    When running remotely the module-level ``default_config`` is used.
    Otherwise the configuration is loaded from ``--config-file`` (after
    confirmation) or gathered through the interactive wizard and written back
    to that file. The configuration is then registered on a ClearML service
    task; the scaler is started with ``--run`` or when running remotely, and
    ``--remote`` enqueues the task on the ``services`` queue.
    """
    parser = ArgumentParser()
    parser.add_argument("--run", action="store_true", default=False,
                        help="Run the autoscaler after wizard finished")
    parser.add_argument("--remote", action="store_true", default=False,
                        help="Run the autoscaler as a service, launch on the `services` queue")
    parser.add_argument("--config-file", type=Path,
                        default=Path("aws_autoscaler.yaml"),
                        help="Configuration file name")
    args = parser.parse_args()

    if running_remotely():
        conf = default_config
    else:
        print("AWS Autoscaler setup wizard\n"
              "---------------------------\n"
              "Follow the wizard to configure your AWS auto-scaler service.\n"
              "Once completed, you will be able to view and change the configuration in the clearml-server web UI.\n"
              "It means there is no need to worry about typos or mistakes :)\n")

        # Only ask about re-using the file when it actually exists
        load_from_file = args.config_file.exists() and input_bool(
            "Load configurations from config file '{}' [Y/n]? ".format(args.config_file),
            default=True,
        )
        if load_from_file:
            with args.config_file.open("r") as f:
                conf = yaml.load(f, Loader=yaml.SafeLoader)
        else:
            configurations, hyper_params = run_wizard()
            conf = {
                "hyper_params": hyper_params,
                "configurations": configurations,
            }
            # Persist the wizard answers; give up if the file cannot be written
            # noinspection PyBroadException
            try:
                with args.config_file.open("w+") as f:
                    yaml.safe_dump(conf, f)
            except Exception:
                print("Error! Could not write configuration file at: {}".format(args.config_file))
                return

    # Connecting ClearML with the current process,
    # from here on everything is logged automatically
    task = Task.init(project_name="DevOps", task_name="AWS Auto-Scaler",
                     task_type=Task.TaskTypes.service)
    task.connect(conf['hyper_params'])

    # Merge whatever is already stored under "General" on the task into the
    # configuration, then store the merged result back
    configurations = conf['configurations']
    stored_general = task.get_configuration_object(name="General") or "{}"
    configurations.update(json.loads(stored_general))
    task.set_configuration_object(name="General",
                                  config_text=json.dumps(configurations, indent=2))

    if args.remote or args.run:
        print("Running AWS auto-scaler as a service\nExecution log {}".format(task.get_output_log_web_page()))

    if args.remote:
        # if we are running remotely enqueue this run, and leave the process
        # the clearml-agent services will pick it up and execute it for us.
        task.execute_remotely(queue_name='services')

    driver = AWSDriver.from_config(conf)
    conf = ScalerConfig.from_config(conf)
    autoscaler = AutoScaler(conf, driver)
    if running_remotely() or args.run:
        autoscaler.start()
def training(local_rank, config):
    """Run CIFAR10 training with periodic validation on the current process.

    Args:
        local_rank: Index of this process on the local node; used to look up
            the CUDA device name.
        config: Mutable configuration mapping. This function reads the
            training options from it and also writes back ``output_path``,
            ``cuda device name`` (rank 0 on CUDA only) and
            ``num_iters_per_epoch``.

    Raises:
        Exception: Any error raised by ``trainer.run`` is logged and re-raised.
    """
    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="CIFAR10-Training")
    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        # Only the main process creates the run folder and talks to ClearML
        if config["stop_iteration"] is None:
            now = datetime.now().strftime("%Y%m%d-%H%M%S")
        else:
            now = f"stop-on-{config['stop_iteration']}"

        folder_name = f"{config['model']}_backend-{idist.backend()}-{idist.get_world_size()}_{now}"
        output_path = Path(output_path) / folder_name
        # exist_ok avoids the check-then-create race of a separate exists() test
        output_path.mkdir(parents=True, exist_ok=True)
        config["output_path"] = output_path.as_posix()
        logger.info(f"Output path: {config['output_path']}")

        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

        if config["with_clearml"]:
            try:
                from clearml import Task
            except ImportError:
                # Backwards-compatibility for legacy Trains SDK
                from trains import Task

            task = Task.init("CIFAR10-Training", task_name=output_path.stem)
            task.connect_configuration(config)
            # Log hyper parameters
            hyper_params = [
                "model",
                "batch_size",
                "momentum",
                "weight_decay",
                "num_epochs",
                "learning_rate",
                "num_warmup_epochs",
            ]
            task.connect({k: config[k] for k in hyper_params})

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_dataflow(config)

    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler,
                             train_loader.sampler, config, logger)

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "Accuracy": Accuracy(),
        "Loss": Loss(criterion),
    }

    # We define two evaluators as they wont have exactly similar roles:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_evaluator(model, metrics=metrics, config=config)
    train_evaluator = create_evaluator(model, metrics=metrics, config=config)

    def run_validation(engine):
        # Evaluate on both splits and log the metrics for the current epoch
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics)

    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config["validate_every"]) | Events.COMPLETED,
        run_validation)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path, trainer, optimizer,
                                            evaluators=evaluators)

    # Store 2 best models by validation accuracy starting from num_epochs / 2:
    best_model_handler = Checkpoint(
        {"model": model},
        get_save_handler(config),
        filename_prefix="best",
        n_saved=2,
        global_step_transform=global_step_from_engine(trainer),
        score_name="test_accuracy",
        score_function=Checkpoint.get_default_score_fn("Accuracy"),
    )
    evaluator.add_event_handler(
        Events.COMPLETED(
            lambda *_: trainer.state.epoch > config["num_epochs"] // 2),
        best_model_handler)

    # In order to check training resuming we can stop training on a given iteration
    if config["stop_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"]))
        def _():
            logger.info(
                f"Stop training on {trainer.state.iteration} iteration")
            trainer.terminate()

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception:
        logger.exception("")
        # bare raise re-raises the active exception unchanged
        raise
    finally:
        # Close the TensorBoard logger even when training fails, so that
        # buffered events are not lost and the writer is released
        if rank == 0:
            tb_logger.close()
import os
from tempfile import gettempdir

import numpy as np
from PIL import Image
from torch.utils.tensorboard import SummaryWriter

from clearml import Task

# Connecting ClearML with the current process,
# from here on everything is logged automatically
task = Task.init(project_name='examples',
                 task_name='pytorch tensorboard toy example')

# TensorBoard event files go under the system temp directory
writer = SummaryWriter(log_dir=os.path.join(gettempdir(), 'tensorboard_logs'))

# Load the sample picture as a numpy array
image_open = Image.open(
    os.path.join("..", "..", "reporting", "data_samples", "picasso.jpg"))
image = np.asarray(image_open)

# First channel only, with a leading batch axis and a trailing channel axis
image_gray = image[:, :, 0][np.newaxis, :, :, np.newaxis]

# Append a constant 255 plane as an extra channel, then add the batch axis
image_rgba = np.concatenate(
    (image,
     255 * np.atleast_3d(np.ones(shape=image.shape[:2], dtype=np.uint8))),
    axis=2)[np.newaxis, :, :, :]

# Finally give the original picture its own leading batch axis
image = image[np.newaxis, :, :, :]

# Report the first (only) item of every batch as a height/width/channel image
writer.add_image("test/first", image[0], dataformats='HWC')
writer.add_image("test_gray/second", image_gray[0], dataformats='HWC')
writer.add_image("test_rgba/third", image_rgba[0], dataformats='HWC')
def __init__(self):
    """Start the ClearML task and prepare the factory; no scene is set yet."""
    self._task = Task.init(
        project_name="lie-pose-net",
        task_name="LiePoseNet on local machine",
    )
    # Classes handed to the factory at construction time
    registered_classes = [
        PoseNet,
        PoseNetCriterion,
        SE3Criterion,
        SimpleSE3Criterion,
    ]
    self._factory = UniversalFactory(registered_classes)
    self._scene = None
"Modify config options by adding 'KEY VALUE' pairs at the end of the command. " "See config references at " "https://detectron2.readthedocs.io/modules/config.html#config-references", default=None, nargs=argparse.REMAINDER, ) args = parser.parse_args() print("Command Line Args:", args) """ Clearml """ if not args.noclearml: # task = Task.init(project_name='persdet2',task_name='Train',task_type='training', output_uri='s3://192.168.56.253:9000/models/snapshots/') task = Task.init(project_name=CLEARML_PROJECT_NAME, task_name=args.clearml_task_name, task_type=args.clearml_task_type) task.set_base_docker( "harbor.io/custom/detectron2:v3 --env GIT_SSL_NO_VERIFY=true --env TRAINS_AGENT_GIT_USER=testuser --env TRAINS_AGENT_GIT_PASS=testuser" ) task.execute_remotely(queue_name="gpu", exit_process=True) ''' S3 downloading ''' import boto3 from botocore.client import Config import tarfile s3 = boto3.resource('s3', endpoint_url='http://192.168.56.253:9000/', aws_access_key_id='lingevan', aws_secret_access_key=args.awskey,
from clearml import Task, Logger task = Task.init(project_name='DETECTRON2',task_name='Default Model Architecture',task_type='training', output_uri='http://jax79sg.hopto.org:9000/clearml-models/artifact') task.set_base_docker("quay.io/jax79sg/detectron2:v4 --env GIT_SSL_NO_VERIFY=true --env TRAINS_AGENT_GIT_USER=testuser --env TRAINS_AGENT_GIT_PASS=testuser" ) task.execute_remotely(queue_name="single_gpu", exit_process=True) import detectron2 from detectron2.utils.logger import setup_logger setup_logger() # import some common libraries import numpy as np import os, json, cv2, random import boto3 import argparse # import some common detectron2 utilities from detectron2 import model_zoo from detectron2.engine import DefaultPredictor from detectron2.config import get_cfg from detectron2.utils.visualizer import Visualizer from detectron2.data import MetadataCatalog, DatasetCatalog from detectron2.structures import BoxMode from detectron2.engine import DefaultTrainer from botocore.client import Config def download_s3_folder(bucket_name, s3_folder, local_dir=None): bucket = s3.Bucket(bucket_name) for obj in bucket.objects.filter(Prefix=s3_folder): target = obj.key if local_dir is None \ else os.path.join(local_dir, os.path.relpath(obj.key, s3_folder)) if not os.path.exists(os.path.dirname(target)):
# ClearML - Example of manual model reporting
from clearml import Task, OutputModel

# Connecting ClearML with the current process,
task = Task.init(
    project_name="examples",
    task_name="Model reporting example",
)

# Create output model and connect it to the task
output_model = OutputModel(task=task)

# Attach the label enumeration to the output model
labels = dict(background=0, cat=1, dog=2)
output_model.update_labels(labels)

model_url = "https://allegro-examples.s3.amazonaws.com/clearml-public-resources/v1.0/clearml-examples-open/newexamples/examples/pytorch%20lightning%20mnist%20example.fb969db720e241e5859d522aa5226b81/models/training.pt"

# Manually log a model file, which will have the labels connected above
output_model.update_weights(register_uri=model_url)
values = default_values.copy() if train_dataset_id is None \ else get_normalization_info(train_dataset_id) values.update({"p": 1.0}) return albumentations.Normalize(**values) if __name__ == "__main__": # force colab to get dataclasses Task.add_requirements('dataclasses') # override numpy version for colab Task.add_requirements('numpy', '1.19.5') # Track everything on ClearML Free task = Task.init( project_name='R|D?R&D! Webinar 01', task_name='remove all hardcoded', output_uri=True, # auto save everything to Clearml Free ) cfg = FlowerTrainingConfig() aug_cfg = AugConfig() # <--- task.connect(cfg, 'config') task.connect(aug_cfg, 'augmentation_config') # <--- # Need to run on cpu only? device = "cuda" if torch.cuda.is_available() else "cpu" if device == "cpu": warnings.warn('GPU not available!, using CPU mode') warnings.filterwarnings("ignore", module='torch.cuda.amp.autocast') # factored out augmentations # <---
from clearml import Task, Logger task = Task.init(project_name='DETECTRON2',task_name='Default Model Architecture',task_type='training', output_uri='http://mlops.sytes.net:9000/digitalhub/clearml-models/') task.set_base_docker("quay.io/jax79sg/detectron2:v4 --env GIT_SSL_NO_VERIFY=true --env TRAINS_AGENT_GIT_USER=testuser --env TRAINS_AGENT_GIT_PASS=testuser" --env SSL_CERT_DIR="/usr/share/ca-certificates/extra/ca.dsta.ai.crt" ) task.execute_remotely(queue_name="1gpu", exit_process=True) import detectron2 from detectron2.utils.logger import setup_logger setup_logger() # import some common libraries import numpy as np import os, json, cv2, random import boto3 import argparse # import some common detectron2 utilities from detectron2 import model_zoo from detectron2.engine import DefaultPredictor from detectron2.config import get_cfg from detectron2.utils.visualizer import Visualizer from detectron2.data import MetadataCatalog, DatasetCatalog from detectron2.structures import BoxMode from detectron2.engine import DefaultTrainer from botocore.client import Config def download_s3_folder(bucket_name, s3_folder, local_dir=None): bucket = s3.Bucket(bucket_name) for obj in bucket.objects.filter(Prefix=s3_folder): target = obj.key if local_dir is None \ else os.path.join(local_dir, os.path.relpath(obj.key, s3_folder)) if not os.path.exists(os.path.dirname(target)):
import xgboost as xgb
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from clearml import Task

task = Task.init(project_name="examples",
                 task_name="XGBoost metric auto reporting")

# Iris data, split 80/20 with a fixed seed
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=100
)

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

params = dict([("objective", "reg:squarederror"), ("eval_metric", "rmse")])

# Both matrices are passed as evaluation sets, named "train" and "test"
bst = xgb.train(params,
                dtrain,
                num_boost_round=100,
                evals=[(dtrain, "train"), (dtest, "test")],
                verbose_eval=0)

bst.save_model("best_model")
from __future__ import absolute_import, division, print_function, unicode_literals import os from tempfile import gettempdir import tensorflow as tf from tensorflow.keras.layers import Dense, Flatten, Conv2D from tensorflow.keras import Model from clearml import Task # Connecting ClearML with the current process, # from here on everything is logged automatically task = Task.init(project_name='examples', task_name='Tensorflow v2 mnist with summaries') # Load and prepare the MNIST dataset. mnist = tf.keras.datasets.mnist (x_train, y_train), (x_test, y_test) = mnist.load_data() x_train, x_test = x_train / 255.0, x_test / 255.0 # Add a channels dimension x_train = x_train[..., tf.newaxis].astype('float32') x_test = x_test[..., tf.newaxis].astype('float32') # Use tf.data to batch and shuffle the dataset train_ds = tf.data.Dataset.from_tensor_slices( (x_train, y_train)).shuffle(10000).batch(32) test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(32)
def main() -> None:
    """Command-line entry point: extract corpora and term renderings per project.

    Each project name given on the command line is looked up as a
    sub-directory of ``SIL_NLP_ENV.pt_projects_dir``. Projects that exist are
    handed to ``extract_project`` and ``extract_term_renderings``; projects
    that do not exist are reported with a warning. When ``--clearml`` is
    given, a ClearML task is initialised before the extraction starts.
    """
    parser = argparse.ArgumentParser(description="Extracts text corpora from Paratext projects")
    parser.add_argument("projects", nargs="+", metavar="name", help="Paratext project")

    # Book filters: both options take one or more book identifiers.
    book_filters = (
        ("--include", "The books to include; e.g., 'NT', 'OT', 'GEN'"),
        ("--exclude", "The books to exclude; e.g., 'NT', 'OT', 'GEN'"),
    )
    for option, help_text in book_filters:
        parser.add_argument(option, metavar="books", nargs="+", default=[], help=help_text)

    # Plain on/off switches, all off by default.
    switches = (
        ("--markers", "Include USFM markers"),
        ("--lemmas", "Extract lemmas if available"),
        ("--project-vrefs", "Extract project verse refs"),
        ("--clearml", "Register Extraction in ClearML"),
    )
    for option, help_text in switches:
        parser.add_argument(option, default=False, action="store_true", help=help_text)

    args = parser.parse_args()

    # De-duplicate the requested project names.
    projects: Set[str] = set(args.projects)

    if args.clearml:
        import datetime

        from clearml import Task

        # The task name combines the raw project list with the current timestamp.
        Task.init(
            project_name="LangTech_ExtractCorpora",
            task_name=f"{args.projects}_{datetime.datetime.now()}",
        )

    # Which projects have data we can find?
    projects_found: Set[str] = {
        name for name in projects if (SIL_NLP_ENV.pt_projects_dir / name).is_dir()
    }

    # Process the projects that have data and tell the user.
    if not projects_found:
        LOGGER.warning(f"Couldn't find any data to process for any project in {SIL_NLP_ENV.pt_projects_dir}.")
    else:
        expected_verse_count = get_expected_verse_count(args.include, args.exclude)
        SIL_NLP_ENV.mt_scripture_dir.mkdir(exist_ok=True, parents=True)
        SIL_NLP_ENV.mt_terms_dir.mkdir(exist_ok=True, parents=True)

        for name in projects_found:
            LOGGER.info(f"Extracting {name}...")
            project_dir = get_project_dir(name)
            corpus_filename, verse_count = extract_project(
                project_dir,
                SIL_NLP_ENV.mt_scripture_dir,
                args.include,
                args.exclude,
                args.markers,
                args.lemmas,
                args.project_vrefs,
            )

            # The extracted verse count must match the expected one (the same as vref.txt).
            LOGGER.info(f"# of Verses: {verse_count}")
            if verse_count != expected_verse_count:
                LOGGER.error(f"The number of verses is {verse_count}, but should be {expected_verse_count}.")

            terms_count = extract_term_renderings(project_dir, corpus_filename, SIL_NLP_ENV.mt_terms_dir)
            LOGGER.info(f"# of Terms: {terms_count}")
            LOGGER.info("Done.")

    # Tell the user which projects couldn't be found.
    for name in projects:
        if name in projects_found:
            continue
        LOGGER.warning(f"Couldn't find project {name} in {SIL_NLP_ENV.pt_projects_dir}.")
parser.set_defaults(subprocess=True)
# '--counter' is deliberately kept out of the logged arguments, see Task.init below
parser.add_argument('--counter', help='integer value', type=int, default=-1)
args = parser.parse_args()
print(os.getpid(), 'ARGS:', args)

# The task has to be initialized in the master process:
# that guarantees every sub-process calling Task.init receives the master task object.
# Note that the `counter` argument is excluded, so multiple sub-processes can be
# launched with clearml-agent; otherwise `counter` would always be set to the original value.
task = Task.init(
    'examples',
    'Popen example',
    auto_connect_arg_parser={'counter': False},
)

# Several dictionaries can be connected, each from a different process,
# as long as their keys have different names
worker_tag = args.num_workers
param = {'args_{}'.format(worker_tag): 'some value {}'.format(worker_tag)}
task.connect(param)

# A negative --counter (its default) means no counter was supplied,
# so fall back to the number of workers
counter = args.counter if args.counter >= 0 else args.num_workers

p = None
# launch sub-process: every sub-process launches the next one in the chain, until all are launched.
# We could also launch all of them here, but that would have been too simple for us :)
def main():
    """Run a TLT evaluation as a ClearML task.

    Steps, in order:

    1. Initialise the ClearML task and declare the command-line options.
    2. If no model file was given on the command line but a training task id
       was (``-t`` / ``--train-task``), fetch that task's
       ``unpruned_weights`` artifact and use it as the model file.
    3. Register the spec files with the task, convert the dataset and export
       it through ``kitti_to_tfrecord``.
    4. When running remotely with a training task, copy the weights artifact
       to the path stored in the task's ``Args/model_file`` parameter.
    5. Call ``eval_unpruned``.

    The file-system steps in (4) stay best-effort, as before: a failing
    ``ls``/``mkdir``/``cp`` does not abort the run.
    """
    # Local import so this function does not depend on a file-level import.
    import subprocess

    task = Task.init(project_name="TLT3", task_name="TLT eval")

    parser = ArgumentParser()
    parser.add_argument(
        "-a",
        "--arch",
        help="Architecture",
        default="classification",
        choices=[
            "classification",
            "detectnet_v2",
            "ssd",
            "dssd",
            "yolo",
            "faster_rcnn",
            "retinanet",
            "mask_rcnn",
        ],
    )
    parser.add_argument(
        "-e", "--experiment_spec_file", help="Path to configuration file", required=True
    )
    parser.add_argument(
        "-t",
        "--train-task",
        help="The training task id",
        required=True,
    )
    parser.add_argument(
        "--dataset-export-spec",
        help="Path to the detection dataset spec containing the config for exporting .tfrecord files",
        required=True,
    )
    parser.add_argument(
        "-d",
        "--dataset-task",
        help="The task id with dataset as artifact. Artifact name should be 'dataset'",
    )
    parser.add_argument(
        "-k",
        "--key",
        default=None,
        type=str,
        help="The key to load pretrained weights and save intermediate snapshopts and final model. "
        "If not provided, an OS environment named 'KEY' must be set.",
    )

    # Peek at the raw argv before parsing: when no model file was passed,
    # take the value that follows -t/--train-task as the training task id.
    cmd_train_task = None
    flag = False
    if "-m" not in sys.argv and "--model_file" not in sys.argv:
        for ar in sys.argv:
            if flag:
                cmd_train_task = ar
                break
            if ar == "-t" or ar == "--train-task":
                flag = True

    if cmd_train_task:
        # Resolve the weights from the training task and inject them into argv
        # so the parser below sees them as an explicit -m value.
        weights_task = Task.get_task(task_id=cmd_train_task)
        unpruned_weights = weights_task.artifacts["unpruned_weights"].get()
        sys.argv.extend(["-m", str(unpruned_weights)])

    parser.add_argument(
        "-m",
        "--model_file",
        default=str(unpruned_weights) if cmd_train_task else None,
        type=str,
    )
    args = parser.parse_args()

    arch = args.arch
    config_file = args.experiment_spec_file
    train_task = args.train_task
    dataset_export_spec = args.dataset_export_spec
    key = args.key

    task.set_base_docker("nvcr.io/nvidia/tlt-streamanalytics:v3.0-dp-py3")

    # connect_configuration returns the value to use from here on, so the
    # local names are rebound to it.
    config_file = task.connect_configuration(config_file, name="config file")
    get_converted_data(args.dataset_task, config_file)
    dataset_export_spec = task.connect_configuration(
        dataset_export_spec, name="dataset export spec"
    )
    kitti_to_tfrecord(dataset_export_spec, config_file)

    if train_task and running_remotely():
        unpruned_weights = Task.get_task(task_id=train_task).artifacts["unpruned_weights"].get()
        weights_path = str(unpruned_weights)

        # Commands are passed as argument lists (no shell), so paths with
        # spaces or shell metacharacters are handled literally.
        # check=False keeps the previous behaviour of ignoring exit codes.
        subprocess.run(["ls", weights_path.rpartition("/")[0]], check=False)

        params = task.get_parameters_as_dict()
        model_file = params["Args"]["model_file"]
        model_dir = model_file.rpartition("/")[0]
        if model_dir:
            # Nothing to create when model_file has no directory component.
            subprocess.run(["mkdir", "-p", model_dir], check=False)
        subprocess.run(["cp", weights_path, model_file], check=False)

    eval_unpruned()
from clearml import Task
from clearml.automation.controller import PipelineController

# The controller itself is a ClearML task; a fresh task is created on every run.
task = Task.init(
    project_name='mushrooms',
    task_name='Model creation mushrooms',
    task_type=Task.TaskTypes.controller,
    reuse_last_task_id=False,
)

# Connected parameters: the queue on which the pipeline steps are executed.
args = {'worker_queue': 'default'}
task.connect(args)

task.execute_remotely()

pipe = PipelineController(
    default_execution_queue='default',
    add_pipeline_tags=False,
)

# Step 1: build the dataset artifact.
pipe.add_step(
    name='stage_data',
    base_task_project='mushrooms',
    base_task_name='mushrooms step 1 dataset artifact',
    execution_queue=args["worker_queue"],
)

# Step 2: train the model; it runs after 'stage_data' and receives that
# step's task id through a parameter override.
train_overrides = {'General/stage_data_task_id': '${stage_data.id}'}
pipe.add_step(
    name='stage_train',
    parents=['stage_data'],
    base_task_project='mushrooms',
    base_task_name='mushrooms step 2 train model',
    parameter_override=train_overrides,
    execution_queue=args["worker_queue"],
)

pipe.start()
# ClearML - Example of Matplotlib and Seaborn integration and reporting
#
import matplotlib

# Select the 'agg' backend (instead of tkinter) before pyplot is imported.
matplotlib.use('agg')

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from clearml import Task

task = Task.init(
    project_name='examples',
    task_name='Matplotlib example by Harry',
)

# Create a plot: N random points with random colours and marker areas.
N = 50
x, y, colors = (np.random.rand(N) for _ in range(3))
radii = 30 * np.random.rand(N)
area = radii ** 2  # 0 to 15 point radii

plt.scatter(x, y, s=area, c=colors, alpha=0.5)
# Plot will be reported automatically
plt.show()

# Alternatively, report the plot explicitly to give it a more meaningful
# title/series and an iteration number.
radii = 40 * np.random.rand(N)
area = radii ** 2
plt.scatter(x, y, s=area, c=colors, alpha=0.5)

logger = task.logger
logger.report_matplotlib_figure(
    title="My Plot Title",
    series="My Plot Series",
    iteration=10,
    figure=plt,
)
plt.show()
use_dropout=True, dropout_rate=0.1, use_image_features=False, use_likes=False, ) @dataclass class MyFeatureConfig(): """Config for my new feature""" # the word size word_size: int = 128 parser = ArgumentParser() parser.add_arguments(HyperParameters, dest="hparams") args = parser.parse_args() if __name__ == '__main__': task = Task.init(project_name='simple_parse', task_name='nested using simple-parsing', auto_connect_arg_parser=False, reuse_last_task_id=False) task.connect(parser,name='command line') extra_args = task.connect(MyFeatureConfig, name='my_feature1') hparams: HyperParameters = args.hparams my_feature_conf : MyFeatureConfig print(hparams) task.close()