def __init__(self, task: Task = None, projectName: str = None, taskName: str = None,
             additionalLoggingValuesDict=None):
    """
    :param task: instance of trains.Task
    :param projectName: only necessary if task is not provided
    :param taskName: only necessary if task is not provided
    :param additionalLoggingValuesDict:
    """
    if task is None:
        if projectName is None or taskName is None:
            raise ValueError("Either the trains task or the project name and task name have to be provided")
        self.task = Task.init(project_name=projectName, task_name=taskName, reuse_last_task_id=False)
    else:
        if projectName is not None:
            log.warning(
                f"projectName parameter with value {projectName} passed even though task has been given, "
                f"will ignore this parameter"
            )
        if taskName is not None:
            log.warning(
                f"taskName parameter with value {taskName} passed even though task has been given, "
                f"will ignore this parameter"
            )
        self.task = task

    self.logger = self.task.get_logger()
    super().__init__(additionalLoggingValuesDict=additionalLoggingValuesDict)
def main():
    # Create the experiment Task
    task = Task.init(project_name="examples", task_name="text reporting")

    print('reporting text logs')

    # report regular console print
    print('This is standard output test')

    # report stderr
    print('This is standard error test', file=sys.stderr)

    # Get the task logger,
    # You can also call Task.current_task().get_logger() from anywhere in your code.
    logger = task.get_logger()

    # report text based logs
    report_logs(logger)

    # force flush reports
    # If flush is not called, reports are flushed in the background every couple of seconds,
    # and at the end of the process execution
    logger.flush()

    print('We are done reporting, have a great day :)')
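# A minimal sketch of what the `report_logs` helper called above might look like.
# The real helper is defined elsewhere in the example; this version only assumes
# the trains Logger.report_text API and is illustrative, not the original code.
def report_logs(logger):
    # report free-form text entries to the experiment's log section
    logger.report_text("hello, this is a plain text line")
    logger.report_text("reporting another text line through the trains Logger")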
def __init__(
        self,
        project_name: Optional[str] = None,
        task_name: Optional[str] = None,
        task_type: str = 'training',
        reuse_last_task_id: bool = True,
        output_uri: Optional[str] = None,
        auto_connect_arg_parser: bool = True,
        auto_connect_frameworks: bool = True,
        auto_resource_monitoring: bool = True
) -> None:
    super().__init__()
    if self._bypass:
        self._trains = None
    else:
        self._trains = Task.init(
            project_name=project_name,
            task_name=task_name,
            task_type=task_type,
            reuse_last_task_id=reuse_last_task_id,
            output_uri=output_uri,
            auto_connect_arg_parser=auto_connect_arg_parser,
            auto_connect_frameworks=auto_connect_frameworks,
            auto_resource_monitoring=auto_resource_monitoring
        )
def __init__(
        self,
        pool_frequency=0.2,  # type: float
        default_execution_queue=None,  # type: Optional[str]
        pipeline_time_limit=None,  # type: Optional[float]
        auto_connect_task=True,  # type: Union[bool, Task]
        always_create_task=False,  # type: bool
        add_pipeline_tags=False,  # type: bool
):
    # type: (...) -> ()
    """
    Create a new pipeline controller. The newly created object will launch and monitor the new experiments.

    :param float pool_frequency: The polling frequency (in minutes) for monitoring experiments / states.
    :param str default_execution_queue: The execution queue to use if no execution queue is provided
    :param float pipeline_time_limit: The maximum time (minutes) for the entire pipeline process. The
        default is ``None``, indicating no time limit.
    :param bool auto_connect_task: Store pipeline arguments and configuration in the Task
        - ``True`` - The pipeline arguments and configuration will be stored in the current Task. All arguments
          will be under the hyper-parameter section ``Pipeline``, and the pipeline DAG will be stored as a
          Task configuration object named ``Pipeline``.
        - ``False`` - Do not store with Task.
        - ``Task`` - A specific Task object to connect the pipeline with.
    :param bool always_create_task: Always create a new Task
        - ``True`` - No current Task initialized. Create a new task named ``Pipeline`` in the
          ``base_task_id`` project.
        - ``False`` - Use the :py:meth:`task.Task.current_task` (if exists) to report statistics.
    :param bool add_pipeline_tags: (default: False) if True, add `pipe: <pipeline_task_id>` tag to all
        steps (Tasks) created by this pipeline.
    """
    self._nodes = {}
    self._running_nodes = []
    self._start_time = None
    self._pipeline_time_limit = pipeline_time_limit * 60. if pipeline_time_limit else None
    self._default_execution_queue = default_execution_queue
    self._pool_frequency = pool_frequency * 60.
    self._thread = None
    self._stop_event = None
    self._experiment_created_cb = None
    self._add_pipeline_tags = add_pipeline_tags
    self._task = auto_connect_task if isinstance(auto_connect_task, Task) else Task.current_task()
    self._step_ref_pattern = re.compile(self._step_pattern)
    if not self._task and always_create_task:
        self._task = Task.init(
            project_name='Pipelines',
            task_name='Pipeline {}'.format(datetime.now()),
            task_type=Task.TaskTypes.controller,
        )

    # make sure all the created tasks are our children, as we are creating them
    if self._task:
        self._task.add_tags([self._tag])

    self._auto_connect_task = bool(auto_connect_task)
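# A minimal usage sketch for the controller above, assuming it is exposed as
# PipelineController with add_step/start/wait/stop as in the trains SDK.
# The import path, project and task names below are illustrative and may differ by version.
from trains.automation.controller import PipelineController

pipe = PipelineController(default_execution_queue='default', add_pipeline_tags=True)
# each step clones an existing template Task and enqueues the copy
pipe.add_step(name='stage_data', base_task_project='examples',
              base_task_name='pipeline step 1 dataset artifact')
pipe.add_step(name='stage_train', parents=['stage_data'],
              base_task_project='examples', base_task_name='pipeline step 2 train model')

pipe.start()  # launch and monitor the steps
pipe.wait()   # block until the whole DAG finishes
pipe.stop()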
def setup_trains_logging(config):
    if config["with_trains"]:
        from trains import Task

        task = Task.init("Carbon Black Semantic Segmentation Training", config["task_name"])
        task.connect_configuration(config)
        # Log hyper parameters
        hyper_parameters = list(config.keys())
        task.connect({k: config[k] for k in hyper_parameters})
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=2, metavar='N',
                        help='number of epochs to train (default: 2)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    task = Task.init(project_name='examples', task_name='pytorch with tensorboardX')

    writer = SummaryWriter('runs')
    writer.add_text('TEXT', 'This is some text', 0)

    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    kwargs = {'num_workers': 4, 'pin_memory': True} if args.cuda else {}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))])),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))])),
        batch_size=args.batch_size, shuffle=True, **kwargs)

    model = Net()
    if args.cuda:
        model.cuda()
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    for epoch in range(1, args.epochs + 1):
        train(model, epoch, train_loader, args, optimizer, writer)
        torch.save(model, os.path.join(gettempdir(), 'model{}'.format(epoch)))
    test(model, test_loader, args, optimizer, writer)
def TrainModel(model, base_model, model_name):
    task = Task.init(project_name="Ex3ModelTrains", task_name=model_name)
    reporter = TrainsReporter()

    # Show a summary of the model. Check the number of trainable parameters
    model.summary()

    # Compile the model
    model.compile(loss=keras.losses.BinaryCrossentropy(from_logits=True),
                  optimizer=keras.optimizers.Adam(),
                  metrics=[metrics.BinaryAccuracy()])

    # Train the model
    model.fit(train_ds,
              steps_per_epoch=train_ds.samples / train_ds.batch_size,
              epochs=20,
              validation_data=valid_ds,
              validation_steps=valid_ds.samples / valid_ds.batch_size,
              callbacks=[reporter],
              verbose=1)

    # Unfreeze the base_model. Note that it keeps running in inference mode
    # since we passed `training=False` when calling it. This means that
    # the batchnorm layers will not update their batch statistics.
    # This prevents the batchnorm layers from undoing all the training
    # we've done so far.
    base_model.trainable = True
    reporter.epoch_ref = 20

    score = model.evaluate(test_ds)
    print('Test evaluation Score:', model.evaluate(test_ds))
    print('validation evaluation Score:', model.evaluate(valid_ds))

    model.compile(
        optimizer=keras.optimizers.Adam(1e-5),  # Low learning rate
        loss=keras.losses.BinaryCrossentropy(from_logits=True),
        metrics=[keras.metrics.BinaryAccuracy()],
    )

    model.fit(train_ds,
              steps_per_epoch=train_ds.samples / train_ds.batch_size,
              epochs=10,
              validation_data=valid_ds,
              validation_steps=valid_ds.samples / valid_ds.batch_size,
              callbacks=[reporter],
              verbose=1)

    score = model.evaluate(test_ds)
    print('Test evaluation Score:', model.evaluate(test_ds))
    print('validation evaluation Score:', model.evaluate(valid_ds))
def main():
    parser = ArgumentParser()
    parser.add_argument(
        "--run",
        help="Run the autoscaler after wizard finished",
        action="store_true",
        default=False,
    )
    args = parser.parse_args()

    if running_remotely():
        hyper_params = AwsAutoScaler.Settings().as_dict()
        configurations = AwsAutoScaler.Configuration().as_dict()
    else:
        print("AWS Autoscaler setup\n")

        config_file = Path(CONF_FILE).absolute()
        if config_file.exists() and input_bool(
                "Load configurations from config file '{}' [Y/n]? ".format(str(CONF_FILE)),
                default=True,
        ):
            with config_file.open("r") as f:
                conf = yaml.load(f, Loader=yaml.SafeLoader)
            hyper_params = conf["hyper_params"]
            configurations = conf["configurations"]
        else:
            configurations, hyper_params = run_wizard()

            try:
                with config_file.open("w+") as f:
                    conf = {
                        "hyper_params": hyper_params,
                        "configurations": configurations,
                    }
                    yaml.safe_dump(conf, f)
            except Exception:
                print("Error! Could not write configuration file at: {}".format(str(CONF_FILE)))
                return

    task = Task.init(project_name="Auto-Scaler", task_name="AWS Auto-Scaler")
    task.connect(hyper_params)
    task.connect_configuration(configurations)

    autoscaler = AwsAutoScaler(hyper_params, configurations)
    if running_remotely() or args.run:
        autoscaler.start()
def __init__(self, *_, **kwargs):
    try:
        from trains import Task
        from trains.binding.frameworks.tensorflow_bind import WeightsGradientHistHelper
    except ImportError:
        raise RuntimeError(
            "This contrib module requires trains to be installed. "
            "You may install trains using: \n pip install trains \n")

    experiment_kwargs = {
        k: v for k, v in kwargs.items()
        if k not in ("project_name", "task_name", "task_type")
    }

    if self.bypass_mode():
        warnings.warn("TrainsSaver: running in bypass mode")

        class _Stub(object):
            def __call__(self, *_, **__):
                return self

            def __getattr__(self, attr):
                if attr in ("name", "id"):
                    return ""
                return self

            def __setattr__(self, attr, val):
                pass

        self._task = _Stub()
    else:
        self._task = Task.init(
            project_name=kwargs.get("project_name"),
            task_name=kwargs.get("task_name"),
            task_type=kwargs.get("task_type", Task.TaskTypes.training),
            **experiment_kwargs,
        )

    self.trains_logger = self._task.get_logger()
    self.grad_helper = WeightsGradientHistHelper(logger=self.trains_logger)
def run(config, logger=None, local_rank=0, **kwargs):
    assert torch.cuda.is_available()
    assert torch.backends.cudnn.enabled, "Nvidia/Amp requires cudnn backend to be enabled."

    task = Task.init("ignite", "DeeplabV3_ResNet101 pascal_voc2012 segmentation example")

    dist.init_process_group("nccl", init_method="env://")

    # As we passed config with option --manual_config_load
    assert hasattr(config, "setup"), (
        "We need to manually setup the configuration, please set --manual_config_load "
        "to py_config_runner")

    config = config.setup()

    assert_config(config, TRAINVAL_CONFIG)
    # The following attributes are automatically added by py_config_runner
    assert hasattr(config, "config_filepath") and isinstance(config.config_filepath, Path)
    assert hasattr(config, "script_filepath") and isinstance(config.script_filepath, Path)

    # dump python files to reproduce the run
    task.connect_configuration(config.config_filepath.as_posix())
    task.upload_artifact("script", config.script_filepath)

    config.output_path = Path("./artifacts")

    # log the configuration, if we are the master node
    if dist.get_rank() == 0:
        task.connect(get_params(config, TRAINVAL_CONFIG))

    try:
        training(config, local_rank=local_rank, with_trains_logging=True)
    except KeyboardInterrupt:
        logger.info("Caught KeyboardInterrupt -> exit")
    except Exception as e:  # noqa
        logger.exception("")
        dist.destroy_process_group()
        raise e

    dist.destroy_process_group()
def main():
    # Create the experiment Task
    task = Task.init(project_name="examples", task_name="scalar reporting")

    print('reporting scalar graphs')

    # Get the task logger,
    # You can also call Task.current_task().get_logger() from anywhere in your code.
    logger = task.get_logger()

    # report scalars
    report_scalars(logger)

    # force flush reports
    # If flush is not called, reports are flushed in the background every couple of seconds,
    # and at the end of the process execution
    logger.flush()

    print('We are done reporting, have a great day :)')
def run(config, **kwargs):
    """This is the main method to run the training. As this training script is launched with
    `py_config_runner`, it must contain a `run(config, **kwargs)` method.
    """
    assert torch.cuda.is_available(), torch.cuda.is_available()
    assert torch.backends.cudnn.enabled, "Nvidia/Amp requires cudnn backend to be enabled."

    with idist.Parallel(backend="nccl") as parallel:

        logger = setup_logger(name="Pascal-VOC12 Training", distributed_rank=idist.get_rank())

        assert_config(config, TRAINVAL_CONFIG)
        # The following attributes are automatically added by py_config_runner
        assert hasattr(config, "config_filepath") and isinstance(config.config_filepath, Path)
        assert hasattr(config, "script_filepath") and isinstance(config.script_filepath, Path)

        if idist.get_rank() == 0 and exp_tracking.has_trains:
            from trains import Task

            task = Task.init("Pascal-VOC12 Training", config.config_filepath.stem)
            task.connect_configuration(config.config_filepath.as_posix())

        log_basic_info(logger, config)

        config.output_path = Path(exp_tracking.get_output_path())
        # dump python files to reproduce the run
        exp_tracking.log_artifact(config.config_filepath.as_posix())
        exp_tracking.log_artifact(config.script_filepath.as_posix())
        exp_tracking.log_params(get_params(config, TRAINVAL_CONFIG))

        try:
            parallel.run(training, config, logger=logger)
        except KeyboardInterrupt:
            logger.info("Caught KeyboardInterrupt -> exit")
        except Exception as e:  # noqa
            logger.exception("")
            raise e
def trains(self,
           x: data_type,
           y: data_type = None,
           x_cv: data_type = None,
           y_cv: data_type = None,
           *,
           trains_config: Dict[str, Any] = None,
           keep_task_open: bool = False,
           queue: str = None) -> "Wrapper":
    if trains_config is None:
        return self.fit(x, y, x_cv, y_cv)
    # init trains
    if trains_config is None:
        trains_config = {}
    project_name = trains_config.get("project_name")
    task_name = trains_config.get("task_name")
    if queue is None:
        task = Task.init(**trains_config)
        cloned_task = None
    else:
        task = Task.get_task(project_name=project_name, task_name=task_name)
        cloned_task = Task.clone(source_task=task, parent=task.id)
    # before loop
    self._verbose_level = 6
    self._data_config["verbose_level"] = 6
    self._before_loop(x, y, x_cv, y_cv)
    self.pipeline.use_tqdm = False
    copied_config = shallow_copy_dict(self.config)
    if queue is not None:
        cloned_task.set_parameters(copied_config)
        Task.enqueue(cloned_task.id, queue)
        return self
    # loop
    task.connect(copied_config)
    global trains_logger
    trains_logger = task.get_logger()
    self._loop()
    if not keep_task_open:
        task.close()
        trains_logger = None
    return self
def __init__(self,
             project_name: Optional[str] = None,
             task_name: Optional[str] = None,
             task_type: str = 'training',
             reuse_last_task_id: bool = True,
             output_uri: Optional[str] = None,
             auto_connect_arg_parser: bool = True,
             auto_connect_frameworks: bool = True,
             auto_resource_monitoring: bool = True) -> None:
    if not _TRAINS_AVAILABLE:
        raise ImportError('You want to use the `trains` logger which is not installed yet,'
                          ' install it with `pip install trains`.')
    super().__init__()

    if self.bypass_mode():
        self._trains = None
        print('TRAINS Task: running in bypass mode')
        print('TRAINS results page: disabled')

        class _TaskStub(object):
            def __call__(self, *args, **kwargs):
                return self

            def __getattr__(self, attr):
                if attr in ('name', 'id'):
                    return ''
                return self

            def __setattr__(self, attr, val):
                pass

        self._trains = _TaskStub()
    else:
        self._trains = Task.init(
            project_name=project_name,
            task_name=task_name,
            task_type=task_type,
            reuse_last_task_id=reuse_last_task_id,
            output_uri=output_uri,
            auto_connect_arg_parser=auto_connect_arg_parser,
            auto_connect_frameworks=auto_connect_frameworks,
            auto_resource_monitoring=auto_resource_monitoring)
def __init__(
        self,
        project_name: Optional[str] = None,
        task_name: Optional[str] = None,
        task_type: str = 'training',
        reuse_last_task_id: bool = True,
        output_uri: Optional[str] = None,
        auto_connect_arg_parser: bool = True,
        auto_connect_frameworks: bool = True,
        auto_resource_monitoring: bool = True
) -> None:
    super().__init__()
    if self.bypass_mode():
        self._trains = None
        print('TRAINS Task: running in bypass mode')
        print('TRAINS results page: disabled')

        class _TaskStub(object):
            def __call__(self, *args, **kwargs):
                return self

            def __getattr__(self, attr):
                if attr in ('name', 'id'):
                    return ''
                return self

            def __setattr__(self, attr, val):
                pass

        self._trains = _TaskStub()
    else:
        self._trains = Task.init(
            project_name=project_name,
            task_name=task_name,
            task_type=task_type,
            reuse_last_task_id=reuse_last_task_id,
            output_uri=output_uri,
            auto_connect_arg_parser=auto_connect_arg_parser,
            auto_connect_frameworks=auto_connect_frameworks,
            auto_resource_monitoring=auto_resource_monitoring
        )
def initialize_trains(arg_parser, project_name, tag):
    tb_logdir = None
    OPTS.trains_task = None
    if is_root_node():
        if OPTS.tensorboard:
            try:
                from trains import Task
                task = Task.init(project_name=project_name,
                                 task_name=tag,
                                 auto_connect_arg_parser=False,
                                 output_uri="{}/data/model_backups".format(os.getenv("HOME")))
                task.connect(arg_parser)
                task.set_random_seed(OPTS.seed)
                OPTS.trains_task = task
            except SystemError as e:
                print(e)
                pass
            tb_logdir = os.path.join(OPTS.root, "tensorboard")
            if not os.path.exists(tb_logdir):
                os.mkdir(tb_logdir)
    return tb_logdir
def main():
    # Create the experiment Task
    task = Task.init(project_name="examples", task_name="html samples reporting")

    print('reporting html files into debug samples section')

    # Get the task logger,
    # You can also call Task.current_task().get_logger() from anywhere in your code.
    logger = task.get_logger()

    # report html as debug samples
    report_html_image(logger)
    report_html_graph(logger)
    report_html_groupby(logger)
    report_html_periodic_table(logger)
    report_html_url(logger)

    # force flush reports
    # If flush is not called, reports are flushed in the background every couple of seconds,
    # and at the end of the process execution
    logger.flush()

    print('We are done reporting, have a great day :)')
def __init__(self,
             project_name: Optional[str] = None,
             task_name: Optional[str] = None,
             task_type: str = 'training',
             reuse_last_task_id: bool = True,
             output_uri: Optional[str] = None,
             auto_connect_arg_parser: bool = True,
             auto_connect_frameworks: bool = True,
             auto_resource_monitoring: bool = True) -> None:
    super().__init__()
    if self.bypass_mode():  # pragma: no-cover
        self._trains = None
        print('TRAINS Task: running in bypass mode')
        print('TRAINS results page: disabled')
    else:
        self._trains = Task.init(
            project_name=project_name,
            task_name=task_name,
            task_type=task_type,
            reuse_last_task_id=reuse_last_task_id,
            output_uri=output_uri,
            auto_connect_arg_parser=auto_connect_arg_parser,
            auto_connect_frameworks=auto_connect_frameworks,
            auto_resource_monitoring=auto_resource_monitoring)
from time import sleep

import pandas as pd
import numpy as np
from PIL import Image

from trains import Task

task = Task.init('examples', 'artifacts toy')

df = pd.DataFrame(
    {
        'num_legs': [2, 4, 8, 0],
        'num_wings': [2, 0, 0, 0],
        'num_specimen_seen': [10, 2, 1, 8]
    },
    index=['falcon', 'dog', 'spider', 'fish'])

# Register Pandas object as artifact to watch
# (it will be monitored in the background and automatically synced and uploaded)
task.register_artifact('train', df, metadata={'counting': 'legs', 'max legs': 69})

# change the artifact object
df.sample(frac=0.5, replace=True, random_state=1)
# or access it from anywhere using the Task's get_registered_artifacts()
Task.current_task().get_registered_artifacts()['train'].sample(frac=0.5, replace=True, random_state=1)
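# Beyond registered (continuously synced) artifacts, the same Task can also upload
# one-off artifacts. A minimal sketch using Task.upload_artifact; the artifact
# names below are illustrative, not part of the original example.
task.upload_artifact('train_snapshot', artifact_object=df)          # static copy of the DataFrame
task.upload_artifact('run_settings', artifact_object={'seed': 1})   # dictionaries work as well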
import os
import socket
import subprocess
import sys
from copy import deepcopy
from tempfile import mkstemp

import psutil
# make sure we have jupyter in the auto requirements
import jupyter  # noqa
from trains import Task

# initialize TRAINS
task = Task.init(
    project_name="DevOps",
    task_name="Allocate Jupyter Notebook Instance",
    task_type=Task.TaskTypes.service)

# get rid of all the runtime TRAINS
preserve = (
    "TRAINS_API_HOST",
    "TRAINS_WEB_HOST",
    "TRAINS_FILES_HOST",
    "TRAINS_CONFIG_FILE",
    "TRAINS_API_ACCESS_KEY",
    "TRAINS_API_SECRET_KEY",
    "TRAINS_API_HOST_VERIFY_CERT",
    "TRAINS_DOCKER_IMAGE",
)

# setup os environment
env = deepcopy(os.environ)
    gpu_num = hvd.size()
else:
    part_index = 0
    part_num = 1
    gpu_num = 1

# Tensorboard Logging
tb_logdir = None
OPTS.trains_task = None
if is_root_node():
    print("Running on {} GPUs".format(gpu_num))
    if OPTS.tensorboard:
        try:
            from trains import Task
            task = Task.init(project_name="lanmt2",
                             task_name=OPTS.result_tag,
                             auto_connect_arg_parser=False,
                             output_uri=OPTS.root)
            task.connect(ap)
            task.set_random_seed(OPTS.seed)
            task.set_output_model_id(OPTS.model_tag)
            OPTS.trains_task = task
        except:
            pass
    if envswitch.who() != "shu":
        tb_str = "{}_lat{}_noise{}_lr{}".format(OPTS.modeltype, OPTS.latentdim, OPTS.noise, OPTS.ebm_lr)
        if OPTS.train_sgd_steps > 0:
            tb_str += "_imit{}".format(OPTS.train_sgd_steps)
        tb_logdir = os.path.join(HOME_DIR, "tensorboard", "ebm", "{}_cassio".format(OPTS.dtok), tb_str)
def main():
    # Init environment
    use_trains = False
    problem_name = 'cvrp'
    problem_type = 'uniform_offline'
    max_customer_times = 0
    size = 20
    vehicle_velocity = 1
    vehicle_capacity = 30
    random_seed = 0
    max_demand = 10
    start_at_depot = True
    EVAL_BASELINES_RESULTS_FILENAME = (
        f"experiments/{problem_name}/{size}s_{vehicle_capacity}c_{max_customer_times}t/"
        f"baseline_values.json")
    env_config = {
        'problem_type': problem_type,
        'max_customer_times': max_customer_times,
        'size': size,
        'max_demand': max_demand,
        'vehicle_velocity': vehicle_velocity,
        'vehicle_capacity': vehicle_capacity,
        'start_at_depot': start_at_depot,
        'random_seed': random_seed,
        'eval_baseline_results_filename': EVAL_BASELINES_RESULTS_FILENAME
    }

    if use_trains:
        task = Task.init(
            project_name="train_cvrp_pytorch",
            task_name=f'train_ppo_agent_{size}s_{vehicle_capacity}c_{max_customer_times}t'
        )
        logger = Task.current_task().get_logger()
        logger.tensorboard_single_series_per_graph(single_series=True)
    else:
        logger = None

    env = create_uniform_dynamic_problem(max_customer_times=max_customer_times,
                                         size=size,
                                         max_demand=max_demand,
                                         vehicle_velocity=vehicle_velocity,
                                         vehicle_capacity=vehicle_capacity,
                                         random_seed=random_seed,
                                         start_at_depot=start_at_depot)

    # customer_positions = [[0.25, 0.25], [0.5, 0.5], [1, 1]]
    # env = create_fixed_static_problem(customer_positions=customer_positions,
    #                                   depot_position=[0, 0],
    #                                   initial_vehicle_capacity=10,
    #                                   initial_vehicle_position=[0, 0],
    #                                   customer_demands=[1]*len(customer_positions),
    #                                   customer_times=[0]*len(customer_positions),
    #                                   vehicle_velocity=1)
    #
    # env_config = {'problem_type': 'fixed_problem',
    #               'size': 3,
    #               'vehicle_capacity': 10,
    #               'vehicle_position': [0, 0],
    #               'customer_positions': customer_positions,
    #               'start_at_depot': True
    #               }
    # EVAL_BASELINES_RESULTS_FILENAME = (f'experiments/{3}s_{10}c_{0}t/'
    #                                    f'baseline_values.json')

    tg_env = GeometricAttentionWrapper(env)
    tg_env.reset()

    # model_config = {
    #     'use_value_critic': True,
    #     'num_features': 4,
    #     'embedding_dim': 128,
    #     'value_embedding_dim': 128,
    #     'use_batch_norm': False
    # }
    model_config = {
        'n_passes': 4,
        'edge_embedding_dim': 64,
        'node_embedding_dim': 64,
        'global_embedding_dim': 64,
        'edge_hidden_dim': 64,
        'edge_target_dim': 64,
        'node_target_dim': 64,
        'node_dim_out': 1,
        'edge_dim_out': 1,
        'node_hidden_dim': 64,
        'global_hidden_dim': 64,
        'global_target_dim': 64,
        'global_dim_out': 64,
        'edge_feature_dim': 1,
        'node_feature_dim': 5,  # indicator, x, y, demand/capacity, is_visited
        'global_feature_dim': 1,
        'value_embedding_dim': 64,
        'use_value_critic': True,
        'use_batch_norm': False
    }

    agent_config = {
        'lr': 0.0003,
        'discount': 0.99,
        # number of episodes to do altogether
        'number_of_episodes': 50000000,
        # a batch is N episodes where N is number_of_episodes_in_batch
        'number_of_episodes_in_batch': 20,  # this must be a division of number of episodes
        'total_num_eval_seeds': 100,
        'num_eval_seeds': 10,
        'evaluate_every': 50,
        'num_train_seeds': 1000,
        'reward_average_window_size': 10,
        'entropy_coeff': 0.001,  # consider decreasing this back
        'value_coeff': 0.1,
        'model_config': model_config,
        'save_checkpoint_every': 1000,
        'eps_clip': 0.5,
        'n_ppo_updates': 80,
        'target_kl': 0.0001,
        'logit_normalizer': 5,
        'problem_name': problem_name  # used for saving results
    }
    model_config['logit_normalizer'] = agent_config['logit_normalizer']
    agent_config['run_name'] = f"ep_in_batch_{agent_config['number_of_episodes_in_batch']}_" \
                               f"n_eval_{agent_config['num_eval_seeds']}_lr_{agent_config['lr']}"

    eval_seeds = list(range(agent_config['total_num_eval_seeds']))

    baseline_results_path = Path(EVAL_BASELINES_RESULTS_FILENAME)
    or_tools_policy = ORToolsPolicy(timeout=10)
    if not baseline_results_path.exists():
        baseline_values = {
            'distance': evaluate_policy_simple(env, eval_seeds, distance_proportional_policy, samples_per_seed=5),
            'ORTools': evaluate_policy_simple(env, eval_seeds, or_tools_policy, samples_per_seed=5)
        }
        baseline_results_path.parent.mkdir(parents=True, exist_ok=True)
        with open(baseline_results_path, 'w') as f:
            json.dump(baseline_values, f, indent=2)
    else:
        print(f"loading: {EVAL_BASELINES_RESULTS_FILENAME}")
        with open(baseline_results_path, 'r') as f:
            baseline_values = json.load(f)
        # JSON saves dictionary keys as strings, so we have to convert them back to ints
        baseline_values = {
            baseline: {int(seed): val for seed, val in baseline_dict.items()}
            for baseline, baseline_dict in baseline_values.items()
        }

    # model = PolicyFullyConnectedGAT(cfg=model_config, model_name='ppo_policy_model')
    model = PolicyFullyConnectedMessagePassing(cfg=model_config, model_name='ppo_message_passing_model')
    set_seeds()

    if use_trains:
        parameters_agent = task.connect(agent_config, name='agent_config')
        parameters_env = task.connect(env_config, name='env_config')

    agent_config['env_config'] = env_config
    ppo_agent = PPOAgent(tg_env, config=agent_config, model=model, eval_seeds=eval_seeds,
                         baseline_eval_values=baseline_values)
    ppo_agent.train()
# TRAINS - Example of manual graphs and statistics reporting
#
import numpy as np
import logging

from trains import Task

task = Task.init(project_name='examples', task_name='Manual reporting')

# example python logger
logging.getLogger().setLevel('DEBUG')
logging.debug('This is a debug message')
logging.info('This is an info message')
logging.warning('This is a warning message')
logging.error('This is an error message')
logging.critical('This is a critical message')

# get TRAINS logger object for any metrics / reports
logger = task.get_logger()

# log text
logger.console("hello")

# report scalar values
logger.report_scalar("example_scalar", "series A", iteration=0, value=100)
logger.report_scalar("example_scalar", "series A", iteration=1, value=200)

# report histogram
histogram = np.random.randint(10, size=10)
logger.report_vector("example_histogram", "random histogram", iteration=1,
else:
    part_index = 0
    part_num = 1
    gpu_num = 1

# Tensorboard Logging
tb_logdir = None
OPTS.trains_task = None
if is_root_node():
    print("Running on {} GPUs".format(gpu_num))
    if OPTS.tensorboard:
        try:
            from trains import Task
            task = Task.init(project_name="EBM_LM",
                             task_name=OPTS.result_tag,
                             auto_connect_arg_parser=False,
                             output_uri="{}/data/model_backups".format(os.getenv("HOME")))
            task.connect(ap)
            task.set_random_seed(OPTS.seed)
            OPTS.trains_task = task
        except SystemError as e:
            print(e)
            pass
        tb_logdir = os.path.join(OPTS.root, "tensorboard")
        if not os.path.exists(tb_logdir):
            os.mkdir(tb_logdir)

# Get the path variables
(train_src_corpus, train_tgt_corpus, distilled_tgt_corpus, truncate_datapoints,
 test_src_corpus, test_tgt_corpus, ref_path, src_vocab_path, tgt_vocab_path,
import sys
from argparse import ArgumentParser

from absl import app
from absl import flags
from absl import logging

from trains import Task

FLAGS = flags.FLAGS

flags.DEFINE_string('echo', None, 'Text to echo.')
flags.DEFINE_string('another_str', 'My string', 'A string', module_name='test')

task = Task.init(project_name='examples', task_name='hyper-parameters example')

flags.DEFINE_integer('echo3', 3, 'Text to echo.')
flags.DEFINE_string('echo5', '5', 'Text to echo.', module_name='test')

parameters = {
    'list': [1, 2, 3],
    'dict': {'a': 1, 'b': 2},
    'tuple': (1, 2, 3),
    'int': 3,
    'float': 2.2,
    'string': 'my string',
}

parameters = task.connect(parameters)
from argparse import ArgumentParser
from pathlib2 import Path
from utilities import get_iou_types, draw_boxes, get_model_instance_segmentation, CocoLikeAnnotations, get_backbone
from torchvision_references import utils
from torchvision.transforms import functional as F
from PIL import Image
from transforms import get_transform
from SSD.ssd_model import SSD
from SSD.multibox_loss import SSDLoss
from trains import Task

task = Task.init(project_name='Object Detection with TRAINS, Ignite and TensorBoard',
                 task_name='Inference with trained SSD model')


def rescale_box(box, image_size, orig_height, orig_width):
    rescale_height = float(orig_height) / image_size
    rescale_width = float(orig_width) / image_size
    box[:2] *= rescale_width
    box[2:] *= rescale_height
    return box


def run(task_args):
    writer = SummaryWriter(log_dir=task_args.log_dir)
    input_checkpoint = torch.load(task_args.input_checkpoint)
    labels_enum = input_checkpoint.get('labels_enumeration')
from __future__ import absolute_import, division, print_function

import argparse
import os
import sys
import time
from tempfile import gettempdir

import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

from trains import Task

tf.compat.v1.enable_eager_execution()

task = Task.init(project_name='examples', task_name='Tensorflow eager mode')

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_integer('data_num', 100, """Flag of type integer""")
tf.app.flags.DEFINE_string('img_path', './img', """Flag of type string""")

layers = tf.keras.layers
FLAGS = None


class Discriminator(tf.keras.Model):
    """GAN Discriminator.

    A network to differentiate between generated and real handwritten digits.
    """

    def __init__(self, data_format):
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from PIL import Image
import matplotlib.pyplot as plt

import torchvision.transforms as transforms
import torchvision.models as models

import copy

from trains import Task

task = Task.init(project_name='examples',
                 task_name='pytorch with matplotlib example',
                 task_type=Task.TaskTypes.testing)

######################################################################
# Next, we need to choose which device to run the network on and import the
# content and style images. Running the neural transfer algorithm on large
# images takes longer and will go much faster when running on a GPU. We can
# use ``torch.cuda.is_available()`` to detect if there is a GPU available.
# Next, we set the ``torch.device`` for use throughout the tutorial. Also the ``.to(device)``
# method is used to move tensors or modules to a desired device.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

######################################################################
# Loading the Images
# ------------------
def _patched_parse_args(original_parse_fn, self, args=None, namespace=None):
    # if we are running remotely, we always have a task id, so we better patch the argparser as soon as possible.
    if not PatchArgumentParser._current_task:
        from ..config import running_remotely
        if running_remotely():
            # this will cause the current_task() to set PatchArgumentParser._current_task
            from trains import Task
            # noinspection PyBroadException
            try:
                Task.init()
            except Exception:
                pass

    # automatically connect to current task:
    if PatchArgumentParser._current_task:
        from ..config import running_remotely

        if PatchArgumentParser._calling_current_task:
            # if we are here and running remotely by now we should try to parse the arguments
            if original_parse_fn:
                PatchArgumentParser._add_last_parsed_args(
                    original_parse_fn(self, args=args, namespace=namespace))
            return PatchArgumentParser._last_parsed_args[-1]

        PatchArgumentParser._calling_current_task = True
        # Store last instance and result
        PatchArgumentParser._add_last_arg_parser(self)
        parsed_args = None
        # parse if we are running in dev mode
        if not running_remotely() and original_parse_fn:
            parsed_args = original_parse_fn(self, args=args, namespace=namespace)
            PatchArgumentParser._add_last_parsed_args(parsed_args)

        # noinspection PyBroadException
        try:
            # sync to/from task
            # noinspection PyProtectedMember
            PatchArgumentParser._current_task._connect_argparse(
                self, args=args, namespace=namespace,
                parsed_args=parsed_args[0] if isinstance(parsed_args, tuple) else parsed_args)
        except Exception:
            pass

        # sync back and parse
        if running_remotely() and original_parse_fn:
            # if we are running python2 check if we have subparsers,
            # if we do we need to patch the args, because there is no default subparser
            if PY2:
                import itertools

                def _get_sub_parsers_defaults(subparser, prev=[]):
                    actions_grp = [a._actions for a in subparser.choices.values()] if isinstance(
                        subparser, _SubParsersAction) else [subparser._actions]
                    sub_parsers_defaults = [[subparser]] if hasattr(
                        subparser, 'default') and subparser.default else []
                    for actions in actions_grp:
                        sub_parsers_defaults += [
                            _get_sub_parsers_defaults(a, prev) for a in actions
                            if isinstance(a, _SubParsersAction) and hasattr(a, 'default') and a.default
                        ]
                    return list(itertools.chain.from_iterable(sub_parsers_defaults))

                sub_parsers_defaults = _get_sub_parsers_defaults(self)
                if sub_parsers_defaults:
                    if args is None:
                        # args default to the system args
                        import sys as _sys
                        args = _sys.argv[1:]
                    else:
                        args = list(args)
                    # make sure we append the subparsers
                    for a in sub_parsers_defaults:
                        if a.default not in args:
                            args.append(a.default)

            PatchArgumentParser._add_last_parsed_args(
                original_parse_fn(self, args=args, namespace=namespace))
        else:
            PatchArgumentParser._add_last_parsed_args(parsed_args or {})

        PatchArgumentParser._calling_current_task = False
        return PatchArgumentParser._last_parsed_args[-1]

    # Store last instance and result
    PatchArgumentParser._add_last_arg_parser(self)
    PatchArgumentParser._add_last_parsed_args(
        {} if not original_parse_fn else original_parse_fn(self, args=args, namespace=namespace))
    return PatchArgumentParser._last_parsed_args[-1]
# TRAINS - Example of manual model configuration and uploading
#
import os
from tempfile import gettempdir

import torch
from trains import Task

task = Task.init(project_name='examples', task_name='Model configuration and upload')

# create a model
model = torch.nn.Module()

# Connect a local configuration file
config_file = os.path.join('..', '..', 'reporting', 'data_samples', 'sample.json')
config_file = task.connect_configuration(config_file)
# then read configuration as usual, the backend will contain a copy of it.
# later when executing remotely, the returned `config_file` will be a temporary file
# containing a new copy of the configuration retrieved from the backend
#
# model_config_dict = json.load(open(config_file, 'rt'))

# Or store a dictionary of definitions for a specific network design
model_config_dict = {
    'value': 13.37,
    'dict': {'sub_value': 'string', 'sub_integer': 11},
    'list_of_ints': [1, 2, 3, 4],