def upload_files(share_name, share_path, local_path, username=None, overwrite=True, show_progress=True):
    '''
    Uploads files specified by local_path to the datastore path formed by:
    share_name/username/share_path.

    :param share_name: the XT share name where files will be stored (usually one of: data, models, trajectories)
    :param share_path: the path where file(s) will be stored on the share (e.g., "maze" or "procgen")
    :param local_path: the path to the local files to be uploaded
    :param username: the username associated with the data on the share (if None, the OS username is used)
    :param overwrite: if False, existing files will not be overwritten (not yet supported)
    :param show_progress: if True, progress messages will be printed
    '''
    if username is None:
        username = utils.get_username()

    share_path = os.path.join(username, share_path)
    share_path = share_path.replace("\\", "/")

    # use XT to prevent interactive authentication (which would fail for remote runs)
    xt_run = Run()
    results = xt_run.upload_files_to_share(share_name, share_path, local_path, show_feedback=show_progress)
    return results
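# Usage sketch (illustrative, not part of the original source): how upload_files might
# be called. The share name "models", sub-path "maze", and local directory are placeholders
# taken from the docstring's examples.
def example_upload():
    results = upload_files(share_name="models", share_path="maze",
                           local_path="./output/maze", show_progress=True)
    print("uploaded:", results)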
def __init__(self): """ Preprocesses the runspec before the call to yaml.load(). Manages communication with XT regarding hyperparameters. """ self.uploaded_hp_config_filename = 'uploaded_hp_config.yaml' self.downloaded_hp_config_filename = 'downloaded_hp_config.yaml' self.xt_run_name = os.getenv("XT_RUN_NAME") self.xt_run = None self.in_hp_search = False self.randint_in_spec = False if self.xt_run_name: from xtlib.run import Run as XTRun self.xt_run = XTRun() if os.path.isfile(self.downloaded_hp_config_filename): self.in_hp_search = True self.hparams = []
class NexusLogger():
    '''
    Centralizes various forms of low-frequency output, such as occasional metric reports.
    Not intended for high-frequency logging (multiple calls per second throughout a run).
    '''
    def __init__(self):
        self.file_paths = []
        self.stage_num = 0
        self.last_metric = None
        self.last_y_axis_name = None
        self.last_x_axis_name = None
        self.last_x_axis_value = None
        self.last_stage_num = None

        # XT related
        self.xt_run_name = os.getenv("XT_RUN_NAME", None)
        self.xt_run = None
        if self.xt_run_name:
            from xtlib.run import Run as XTRun
            self.xt_run = XTRun()

    def add_output_file(self, file_path):
        # Add one console output file.
        file_path = os.path.abspath(file_path)
        self.file_paths.append(file_path)
        utils.ensure_dir_exists(file=file_path)
        output_file = open(file_path, 'w')
        output_file.close()

    def write_line(self, line):
        # Write one line to stdout and all console files.
        print(line)
        for path in self.file_paths:
            output_file = open(path, 'a')
            output_file.write(line + '\n')
            output_file.close()

    def write_and_condense_metrics(self, total_seconds, x_axis_name, x_axis_value, saved, metrics, tf_writer):
        ''' Outputs the given metric values for the last reporting period and condenses the metrics. '''
        hours = total_seconds / 3600
        self.last_x_axis_name = x_axis_name
        self.last_x_axis_value = x_axis_value
        self.last_stage_num = self.stage_num

        # Report one line.
        sz = "{:7.3f} hrs {:12,d} {}".format(hours, x_axis_value, x_axis_name)

        # Write one line of formatted metrics.
        for metric in metrics:
            sz_format = '  {} {{}}'.format(metric.formatting_string)
            sz += sz_format.format(metric.aggregate_value, metric.short_name)
        if saved:
            sz += "  SAVED"
        self.write_line(sz)

        if self.xt_run:
            # Log metrics to XT.
            xt_metrics = {}
            xt_metrics["hrs"] = hours
            xt_metrics[x_axis_name] = x_axis_value
            for metric in metrics:
                xt_metrics[metric.short_name] = metric.aggregate_value
            self.xt_run.log_metrics(data_dict=xt_metrics, step_name=x_axis_name,
                                    stage='s{}'.format(self.stage_num))

        if tf_writer:
            # Log metrics to tensorboard.
            for metric in metrics:
                tf_writer.add_scalar(metric.long_name, metric.aggregate_value, x_axis_value)
            tf_writer.flush()

        # Condense the metrics.
        for metric in metrics:
            metric.condense_values()

    def summarize_stage(self, metric):
        ''' Outputs the metric value for the entire processing stage. '''
        metric.condense_values()  # Condense any values accumulated since the last report.
        sz_format = 'Stage summary (mean {{}}):  {}'.format(metric.formatting_string)
        self.write_line(sz_format.format(metric.long_name, metric.lifetime_value))
        self.last_metric = metric
        self.last_y_axis_name = metric.short_name
        return metric.lifetime_value

    def finish_run(self, in_hp_search):
        ''' Outputs the final stage's summary metric as hpmax (used for hyperparameter tuning). '''
        if self.last_metric:
            # Log hpmax.
            explanation = 'Objective that would be maximized by hyperparameter tuning (hpmax):'
            hpmax = self.last_metric.lifetime_value
            if not self.last_metric.higher_is_better:
                hpmax = -hpmax

            if self.xt_run:
                # Log hpmax to XT.
                xt_metrics = {}
                xt_metrics[self.last_x_axis_name] = self.last_x_axis_value
                xt_metrics['hpmax'] = hpmax
                self.xt_run.log_metrics(data_dict=xt_metrics, step_name=self.last_x_axis_name)
                self.xt_run.tag_job({'plotted_metric': 's{}-{}'.format(self.last_stage_num, self.last_y_axis_name)})
                # self.xt_run.tag_job({'primary_metric': 'hpmax'})  # To override xt_config.yaml's default of 'hpmax'.
                # self.xt_run.tag_job({'step_name': 'iters'})  # To override xt_config.yaml's default of 'iters'.

            if in_hp_search:
                explanation = 'Objective being maximized by hyperparameter tuning (hpmax):'
            sz_format = '{} {}\n'.format(explanation, self.last_metric.formatting_string)
            self.write_line(sz_format.format(hpmax))
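# Usage sketch (illustrative, not part of the original source): route console output
# to a file and write a line. The file path is a placeholder.
def example_nexus_logging():
    logger = NexusLogger()
    logger.add_output_file("output/console.txt")
    logger.write_line("stage {} started".format(logger.stage_num))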
class OPELogger():
    '''
    Centralizes various forms of low-frequency output, such as occasional metric reports.
    Not intended for high-frequency logging (multiple calls per second throughout a run).
    '''
    def __init__(self):
        self.file_paths = []
        self.stage_num = 0
        self.last_metric = None
        self.last_y_axis_name = None
        self.last_x_axis_name = None
        self.last_x_axis_value = None
        self.last_stage_num = None

        # XT related
        self.xt_run_name = os.getenv("XT_RUN_NAME", None)
        self.xt_run = None
        if self.xt_run_name:
            from xtlib.run import Run as XTRun
            self.xt_run = XTRun()

        self.estimate_record = {}
        self.mse_record = {}
        self.all_estimators = ['PDIS', 'WPDIS', 'MB-K', 'LSTD', 'LSTDQ', 'TDREG-K', 'MWL', 'MSWL',
                               'MQL', 'DualDICE', 'TDREG-N', 'FQE', 'MB-N', 'W-Regression', 'FL',
                               'On_Policy', 'Behavior']
        for estimator in self.all_estimators:
            self.estimate_record[estimator] = []
            self.mse_record[estimator] = []
        # self.estimate_record['On_Policy'] = []
        # self.estimate_record['LSTDQ'] = []
        # self.mse_record['LSTDQ'] = []
        # self.estimate_record['MB'] = []
        # self.mse_record['MB'] = []
        # self.estimate_record['TDREG_Neural'] = []
        # self.mse_record['TDREG_Neural'] = []
        # self.estimate_record['FQE'] = []
        # self.mse_record['FQE'] = []

    def add_output_file(self, file_path):
        # Add one console output file.
        file_path = os.path.abspath(file_path)
        self.file_paths.append(file_path)
        utils.ensure_dir_exists(file=file_path)
        output_file = open(file_path, 'w')
        output_file.close()

    def write_line(self, line):
        # Write one line to stdout and all console files.
        print(line)
        for path in self.file_paths:
            output_file = open(path, 'a')
            output_file.write(line + '\n')
            output_file.close()

    def write_ope_metrics(self, dataset_seed, metrics, result):
        # Report one line.
        formatting_string = '{:6.4f}'
        sz = "Dataset {} - Relative Error:".format(dataset_seed)
        for estimator, error in metrics.items():
            sz_format = '  {{}}: {}'.format(formatting_string)
            sz += sz_format.format(estimator, error)
        self.write_line(sz)

        self.estimate_record['On_Policy'].append(result['On_Policy'])
        self.estimate_record['Behavior'].append(result['Behavior'])
        for estimator, error in metrics.items():
            self.estimate_record[estimator].append(result[estimator])
            self.mse_record[estimator].append(error)

        # summary_metrics = {}
        # summary_metrics['On_Policy'] = sum(self.estimate_record['On_Policy']) / len(self.estimate_record['On_Policy'])
        # summary_metrics['LSTDQ'] = sum(self.estimate_record['LSTDQ']) / len(self.estimate_record['LSTDQ'])
        # summary_metrics['squared_error'] = sum(self.mse_record['LSTDQ']) / len(self.mse_record['LSTDQ'])
        # print(summary_metrics)

        # if self.xt_run:
        #     xt_metrics = {}
        #     xt_metrics["True Val"] = result['On_Policy']
        #     # xt_metrics[x_axis_name] = x_axis_value
        #     for estimator, error in metrics.items():
        #         xt_metrics[estimator] = result[estimator]
        #         xt_metrics['squared_error'] = error
        #         # xt_metrics[estimator] = error
        #     # self.xt_run.log_metrics(data_dict=xt_metrics, step_name="Dataset")
        #     self.xt_run.log_metrics(data_dict=xt_metrics)

    def write_and_condense_metrics(self, total_seconds, x_axis_name, x_axis_value, saved, metrics, tf_writer):
        ''' Outputs the given metric values for the last reporting period and condenses the metrics. '''
        hours = total_seconds / 3600
        self.last_x_axis_name = x_axis_name
        self.last_x_axis_value = x_axis_value
        self.last_stage_num = self.stage_num

        # Report one line.
        sz = "{:7.3f} hrs {:12,d} {}".format(hours, x_axis_value, x_axis_name)

        # Write one line of formatted metrics.
        for metric in metrics:
            sz_format = '  {} {{}}'.format(metric.formatting_string)
            sz += sz_format.format(metric.aggregate_value, metric.short_name)
        if saved:
            sz += "  SAVED"
        self.write_line(sz)

        if self.xt_run:
            # Log metrics to XT.
            xt_metrics = {}
            xt_metrics["hrs"] = hours
            xt_metrics[x_axis_name] = x_axis_value
            for metric in metrics:
                xt_metrics[metric.short_name] = metric.aggregate_value
            self.xt_run.log_metrics(data_dict=xt_metrics, step_name=x_axis_name,
                                    stage='s{}'.format(self.stage_num))

        if tf_writer:
            # Log metrics to tensorboard.
            for metric in metrics:
                tf_writer.add_scalar(metric.long_name, metric.aggregate_value, x_axis_value)
            tf_writer.flush()

        # Condense the metrics.
        for metric in metrics:
            metric.condense_values()

    def summarize_stage(self, metric):
        ''' Outputs the metric value for the entire processing stage. '''
        metric.condense_values()  # Condense any values accumulated since the last report.
        sz_format = 'Stage summary (mean {{}}):  {}'.format(metric.formatting_string)
        self.write_line(sz_format.format(metric.long_name, metric.lifetime_value))
        self.last_metric = metric
        self.last_y_axis_name = metric.short_name
        return metric.lifetime_value

    def finish_run(self, in_hp_search):
        ''' Outputs the final stage's summary metric as hpmax (used for hyperparameter tuning). '''
        summary_metrics = {}
        summary_metrics['On_Policy'] = sum(self.estimate_record['On_Policy']) / len(self.estimate_record['On_Policy'])
        summary_metrics['Behavior'] = sum(self.estimate_record['Behavior']) / len(self.estimate_record['Behavior'])
        # summary_metrics['MB'] = sum(self.estimate_record['MB']) / len(self.estimate_record['MB'])
        # summary_metrics['squared_error'] = sum(self.mse_record['MB']) / len(self.mse_record['MB'])
        # summary_metrics['TDREG_Neural'] = sum(self.estimate_record['TDREG_Neural']) / len(self.estimate_record['TDREG_Neural'])
        # summary_metrics['squared_error'] = sum(self.mse_record['TDREG_Neural']) / len(self.mse_record['TDREG_Neural'])

        for estimator in self.all_estimators:
            if estimator != 'On_Policy' and estimator != 'Behavior' and len(self.estimate_record[estimator]) > 0:
                summary_metrics[estimator] = sum(self.estimate_record[estimator]) / len(self.estimate_record[estimator])
                summary_metrics[estimator + '_se'] = sum(self.mse_record[estimator]) / len(self.mse_record[estimator])

        # summary_metrics['FQE'] = sum(self.estimate_record['FQE']) / len(self.estimate_record['FQE'])
        # summary_metrics['squared_error'] = sum(self.mse_record['FQE']) / len(self.mse_record['FQE'])
        # print(summary_metrics)

        if self.xt_run:
            self.xt_run.log_metrics(data_dict=summary_metrics)
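# Usage sketch (illustrative, not part of the original source): record one dataset's
# estimates and errors, then log the summary. All numeric values below are made up.
def example_ope_logging():
    ope_logger = OPELogger()
    result = {'On_Policy': 0.95, 'Behavior': 0.80, 'FQE': 0.91}
    errors = {'FQE': 0.0016}   # per-estimator error, keyed like all_estimators
    ope_logger.write_ope_metrics(dataset_seed=0, metrics=errors, result=result)
    ope_logger.finish_run(in_hp_search=False)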
def runner(self, concurrent_index, job_id, delay, duration, child_count, reports, search_type):
    ws_name = "quick-test"
    exper_name = "qtexper"

    fn = "code/miniSweeps.yaml"
    yd = file_utils.load_yaml(fn)
    hd = yd[constants.HPARAM_DIST]

    # simulate a controller for each concurrent runner
    hparam_search = HParamSearch()

    for index in range(child_count):
        # create a new RUN record
        run_name = self.store.start_run(ws_name, exper_name=exper_name, is_parent=False,
                                        job_id=job_id, node_index=0, search_type=search_type,
                                        search_style="dynamic")
        os.environ["XT_RUN_NAME"] = run_name
        os.environ["XT_WORKSPACE_NAME"] = ws_name
        os.environ["XT_EXPERIMENT_NAME"] = exper_name

        fake_context = cmd_core.build_mock_context(self.config, job_id, ws_name, exper_name, run_name)
        metric_name = fake_context.primary_metric

        xt_run = Run(self.config, self.store, supress_normal_output=True)
        xt_run.direct_run = True
        xt_run.context = fake_context

        #print("  starting: concurrent_index={}, child_index={}".format(concurrent_index, index))

        # delay start
        sleep_time = delay * random.random()
        time.sleep(sleep_time)

        hp_set = xt_run.get_next_hp_set_in_search(hd, search_type, hparam_search=hparam_search)
        self._assert("channels1" in hp_set)

        # log HPARAMS
        xt_run.log_hparams(hp_set)

        for i in range(reports):
            run_time = (duration / reports) * random.random()
            time.sleep(run_time)

            # log METRICS
            fake_metric = random.random()
            md = {"epoch": 1 + i, "acc": fake_metric}
            xt_run.log_metrics(md, step_name="epoch", stage="test")

        # mark the run as completed
        xt_run.close()
class Trainer():
    def __init__(self):
        pass

    def train(self, args, model, device, optimizer, epoch):
        model.train()
        total_correct = 0
        total = 0
        steps = 0

        for batch_idx, (data, target) in enumerate(self.train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()

            # compute train-acc
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct = pred.eq(target.view_as(pred)).sum().item()
            total_correct += correct
            total += len(data)
            steps += 1

        return loss.item(), total_correct / total, steps, len(data), loss, total_correct, total

    def test(self, args, model, device):
        test_loader = self.test_loader
        model.eval()
        test_loss = 0
        correct = 0

        with torch.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
                pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
                correct += pred.eq(target.view_as(pred)).sum().item()

        test_loss /= len(test_loader.dataset)
        test_acc = correct / len(test_loader.dataset)

        print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
            test_loss, correct, len(test_loader.dataset), 100. * test_acc))

        return test_loss, test_acc

    def get_dataset(self, data_dir, train, auto_download):
        ds = datasets.MNIST(data_dir, train=train, download=auto_download,
            transform=transforms.Compose([
                # PIL transforms
                #transforms.Resize(22),
                #transforms.Resize(28),
                #transforms.RandomCrop(28),
                #transforms.RandomHorizontalFlip(),
                #transforms.RandomRotation(3, resample=PIL.Image.BILINEAR),

                # TENSOR transforms
                transforms.ToTensor(),
                transforms.Normalize((0.1307,), (0.3081,)),

                # requires pytorch 1.2
                #transforms.RandomErasing(p=.25, value="random"),
            ]))
        return ds

    def sample_mnist(self, data_dir, train, rand, percent, auto_download):
        # get MNIST data
        ds = self.get_dataset(data_dir, train, auto_download)

        # support previous torchvision version as well as current (AML workaround)
        if hasattr(ds, "data"):
            data_attr = "data"
            target_attr = "targets"
        elif train:
            data_attr = "train_data"
            target_attr = "train_labels"
        else:
            data_attr = "test_data"
            target_attr = "test_labels"

        # extract data and targets
        data = getattr(ds, data_attr)
        targets = getattr(ds, target_attr)

        count = len(data)
        indexes = list(range(count))
        rand.shuffle(indexes)

        samples = int(count * percent)
        indexes = indexes[0:samples]

        # update data
        setattr(ds, data_attr, data[indexes])

        # update targets
        setattr(ds, target_attr, targets[indexes])

        which = "TRAIN" if train else "TEST"
        print("Sampled " + which + " data: ", len(data), ", targets=", len(targets))
        return ds

    def save_model(self, model, fn):
        # ensure output dir exists
        dir = os.path.dirname(fn)
        if not os.path.exists(dir):
            os.makedirs(dir)

        torch.save(model.state_dict(), fn)

    def text_log(self, msg):
        with open(self.fn_text_log, "a") as outfile:
            outfile.write(msg + "\n")

    def log_stats_and_test(self, epoch, steps, data_len, loss, total_correct, total, model,
                           device, checkpoint_freq, run, train_loss, train_acc, args):
        msg = 'Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tAcc: {:.6f}'.format(
            epoch, steps * data_len, len(self.train_loader.dataset),
            100. * steps / len(self.train_loader), loss.item(), total_correct / total)

        # print to console
        print(msg)

        # log to simple text logger
        self.text_log(msg)

        if checkpoint_freq and run and run.store:
            if checkpoint_units == "epochs" and epoch % checkpoint_freq == 0:
                cp_now = True
            elif checkpoint_units == "mins" and time.time() - last_checkpoint > checkpoint_freq * 60:
                cp_now = True
            else:
                cp_now = False

            if cp_now:
                checkpoint_count += 1
                print("checkpointing model (#{})\n".format(checkpoint_count))
                save_model(model, fn_checkpoint)
                run.set_checkpoint({"epoch": epoch}, fn_checkpoint)
                last_checkpoint = time.time()

        if run:
            # log TRAINING stats
            run.log_metrics({"epoch": epoch, "loss": train_loss, "acc": train_acc},
                            step_name="epoch", stage="train")

        # log EVAL/TEST stats half as often
        if (epoch / args.log_interval) % 2 == 0:
            test_loss, test_acc = self.test_model_and_log_metrics(run, model, device, epoch, args)

            # early stopping
            if math.isnan(test_loss):
                run.log_event("early_stopping", {"reason": "loss_is_nan"})
                # exit without error
                sys.exit(0)

    def test_model_and_log_metrics(self, run, model, device, epoch, args):
        # TEST the model
        test_loss, test_acc = self.test(args, model, device)

        # log TEST METRICS
        #print("test_loss=", test_loss, ", test_acc=", test_acc)
        run.log_metrics({"epoch": epoch, "loss": test_loss, "acc": test_acc},
                        step_name="epoch", stage="test")
        return test_loss, test_acc

    def train_test_loop(self, run, model, device, optimizer, start_epoch, checkpoint_freq, args):
        total_steps = 0
        start = time.time()
        print("train_test_loop: start_epoch={} end_epoch={}\n".format(start_epoch, args.epochs + 1))

        for epoch in range(start_epoch, args.epochs + 1):
            # train an epoch
            train_loss, train_acc, steps, data_len, loss, total_correct, total = \
                self.train(args, model, device, optimizer, epoch)
            total_steps += steps

            if epoch % args.log_interval == 0:
                elapsed = time.time() - start
                #print("{} epoch(s) training took: {:.2f} secs".format(args.log_interval, elapsed))
                self.log_stats_and_test(epoch, steps, data_len, loss, total_correct, total, model,
                                        device, checkpoint_freq, run, train_loss, train_acc, args)
                start = time.time()

    def init_xt_run(self, logging, tb_path, args):
        # init xtlib
        self.run = None

        if args.xtlib and (os.getenv("XT_RUN_NAME") or tb_path):
            # access to the XTLib API
            from xtlib.run import Run as XTRun

            # create an instance of XTRunLog to log info for current run
            print("---> tb_path=", tb_path)
            self.run = XTRun(xt_logging=logging, aml_logging=logging,
                             checkpoints_enabled=logging, tensorboard_path=tb_path)
            #utils.debug_break()

            if args.tag_job:
                self.run.tag_job({"plotted_metric": "test_acc"})

            # if "call search API" test was specified and if we are running under XT
            if args.search_api and self.run.run_name:
                fn_sweeps = os.path.join(file_utils.get_my_file_dir(__file__), "miniSweeps.yaml")
                sweeps = file_utils.load_yaml(fn_sweeps)
                hp_space_dict = sweeps[constants.HPARAMS_DIR]
                print("hp_space_dict=", hp_space_dict)
                search_type = "random"

                hp_set = self.run.get_next_hp_set_in_search(hp_space_dict, search_type=search_type)
                print("hp_set=", hp_set)

                # apply to args
                for name, value in hp_set.items():
                    setattr(args, name, value)

    def init_datasets(self, data_dir, use_cuda, args):
        kwargs = {'num_workers': 0, 'pin_memory': True} if use_cuda else {}

        # load subset of training and test data
        ds_train = self.sample_mnist(data_dir, True, self.rand, args.train_percent, args.auto_download)
        ds_test = self.sample_mnist(data_dir, False, self.rand, args.test_percent, args.auto_download)

        if args.distributed:
            # Partition dataset among workers using DistributedSampler
            train_sampler = torch.utils.data.distributed.DistributedSampler(
                ds_train, num_replicas=hvd.size(), rank=hvd.rank())
            shuffle = False
        else:
            train_sampler = None
            shuffle = True

        print("loading TRAIN data...")
        self.train_loader = torch.utils.data.DataLoader(
            ds_train, batch_size=args.batch_size, shuffle=shuffle, sampler=train_sampler, **kwargs)

        print("loading TEST data...")
        self.test_loader = torch.utils.data.DataLoader(
            ds_test, batch_size=args.test_batch_size, shuffle=True, **kwargs)

    def init_cuda(self, args):
        #---- CUDA init ----
        cuda_avail = torch.cuda.is_available()
        use_cuda = cuda_avail and args.cuda
        gpu_count = torch.cuda.device_count()

        if use_cuda and not args.parallel:
            torch.cuda.set_device(args.gpu)

        print("  cuda_avail={}, GPU count={}, use_cuda={}, gpu={} ---".format(
            cuda_avail, gpu_count, use_cuda, args.gpu))

        if use_cuda and not cuda_avail:
            # if we cannot find a GPU, consider that a hard error (used to detect problems with seeing Philly GPUs)
            errors.env_error("CUDA not available on this platform")

        if args.distributed:
            # Initialize Horovod
            global hvd
            import horovod.torch as hvd

            hvd.init()

            # Pin GPU to be used to process local rank (one GPU per process)
            print("  distributed: rank={}, size={}".format(hvd.rank(), hvd.size()))
            device = torch.device("cuda:" + str(hvd.local_rank()))

            # only log HPARAMS and METRICS for job if running as rank 0
            logging = (hvd.rank() == 0)
        else:
            device = torch.device("cuda" if use_cuda else "cpu")
            logging = True

        return use_cuda, device, logging

    def init_dirs(self, args):
        # set mnt_output_dir (using environment variable setting from xt)
        mnt_output_dir = os.getenv("XT_OUTPUT_MNT", "output")
        mnt_output_dir = os.path.expanduser(mnt_output_dir)
        file_utils.ensure_dir_exists(mnt_output_dir)
        print("writing mnt_output to: " + mnt_output_dir)

        # set local_output_dir (using environment variable setting from xt)
        local_output_dir = "output"
        file_utils.ensure_dir_exists(local_output_dir)
        print("writing local_output to: " + local_output_dir)

        # set data_dir (allowing override by environment variable)
        data_dir = os.getenv("XT_DATA_DIR", args.data)
        data_dir = os.path.expanduser(data_dir)
        file_utils.ensure_dir_exists(data_dir)
        print("getting data from: " + data_dir)

        fn_test = data_dir + "/MNIST/processed/test.pt"
        exists = os.path.exists(fn_test)
        print("fn_test={}, exists={}".format(fn_test, exists))

        fn_train = data_dir + "/MNIST/processed/training.pt"
        exists = os.path.exists(fn_train)
        print("fn_train={}, exists={}".format(fn_train, exists))

        if args.download_only:
            print("miniMnist (ensuring data is downloaded)")
            self.get_dataset(data_dir, True, True)
            self.get_dataset(data_dir, False, True)

        return mnt_output_dir, local_output_dir, data_dir

    def print_settings(self, args):
        print("--- miniMnist settings ---")
        print("  command-line args:", sys.argv)

        if args.env_vars:
            print("  env vars:")
            keys = list(os.environ.keys())
            keys.sort()

            for key in keys:
                value = os.environ[key]
                if len(value) > 100:
                    value = value[0:100] + "..."
                print("    {}: {}".format(key, value))

        print("  cwd: " + os.getcwd())
        print("  python: " + sys.version.replace("\n", " "))
        print("  torch.__version__=", torch.__version__)

        # bug workaround: torchvision version 0.4.2 is missing the "__version__" attribute
        if hasattr(torchvision, "__version__"):
            print("  torchvision: " + str(torchvision.__version__))
        else:
            print("  dir(torchvision)=", dir(torchvision))

        in_docker = os.path.exists(".dockerenv") or os.getenv("XT_IN_DOCKER")
        print("  in_docker: " + str(in_docker))

        if args.xtlib:
            import xtlib
            print("  xtlib: " + str(xtlib.__version__))

    def init_model(self, device, args):
        use_cnn = True
        if use_cnn:
            print("created CNN model...")
            model = SimpleCNN(num_mid_conv=args.mid_conv, channels1=args.channels1,
                              channels2=args.channels2, kernel_size=args.kernel_size,
                              mlp_units=args.mlp_units)
        else:
            print("created MLP model...")
            model = MLP()

        gpu_count = torch.cuda.device_count()

        if args.parallel and gpu_count > 1:
            model = nn.DataParallel(model)
            print("using PARALLEL training with {} GPUs".format(gpu_count))
        elif args.parallel:
            print("PARALLEL requested but only found {} GPUs".format(gpu_count))
        else:
            print("using single GPU; gpu_count=", gpu_count)

        model.to(device)
        return model

    def init_random_seeds(self, args):
        #---- random seeds ----
        if args.seed == 0:
            args.seed = int(time.time())

        self.rand = random.Random(args.seed)
        fn_checkpoint = "checkpoints/mnist_cnn.pt"
        torch.manual_seed(args.seed)

    def init_stuff(self):
        args = self.args
        mnt_output_dir, local_output_dir, data_dir = self.init_dirs(args)
        self.print_settings(args)
        self.init_random_seeds(args)
        use_cuda, device, logging = self.init_cuda(args)
        print("-------------")

        tb_path = mnt_output_dir if args.tensorboard else None
        self.init_xt_run(logging, tb_path, args)
        self.init_datasets(data_dir, use_cuda, args)
        model = self.init_model(device, args)

        return model, device, mnt_output_dir, local_output_dir

    def apply_runset_file(self, args, fn):
        #utils.debug_break()
        fn = os.path.abspath(fn)
        with open(fn, "rt") as infile:
            yd = yaml.safe_load(infile)

        if not constants.HPARAM_RUNSET in yd:
            errors.internal_error("found runset file without {} property: {}".format(
                constants.HPARAM_RUNSET, fn))

        print("applying runset file to args: {}".format(fn))
        hd = yd[constants.HPARAM_RUNSET]

        for prop, val in hd.items():
            prop = prop.replace("-", "_")
            setattr(args, prop, val)

    def run(self):
        print("args=", sys.argv)
        self.args = parse_cmdline_args()
        args = self.args

        fn_runset = "runset.yaml"
        if os.path.exists(fn_runset):
            self.apply_runset_file(args, fn_runset)

        model, device, mnt_output_dir, local_output_dir = self.init_stuff()
        start_epoch = 1
        run = self.run

        if args.raise_error:
            #errors.internal_error("Raising an intentional error")
            # try a different type of error
            abc.foo = 1

        # log hyperparameters to xt
        if run:
            hp_dict = {"seed": args.seed, "batch-size": args.batch_size, "epochs": args.epochs,
                       "lr": args.lr, "momentum": args.momentum, "channels1": args.channels1,
                       "channels2": args.channels2, "kernel_size": args.kernel_size,
                       "mlp-units": args.mlp_units, "weight-decay": args.weight_decay,
                       "optimizer": args.optimizer, "mid-conv": args.mid_conv,
                       "gpu": args.gpu, "log-interval": args.log_interval}
            run.log_hparams(hp_dict)

            if args.cuda:
                # if on linux, show GPU info
                if os.name != "nt":
                    os.system("nvidia-smi")

            # print hyperparameters
            print("hyperparameters:", hp_dict)
            print()

        # see if we are resuming a preempted run
        if run and run.resume_name:
            print("resuming from run=", run.resume_name)
            dd = run.get_checkpoint(fn_checkpoint)
            if dd and dd["epoch"]:
                model.load_state_dict(torch.load(fn_checkpoint))
                start_epoch = 1 + dd["epoch"]

        if args.optimizer == "sgd":
            #print("using SGD optimizer")
            optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum,
                                  weight_decay=args.weight_decay)
        else:
            #print("using Adam optimizer")
            optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)

        if args.distributed:
            optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())

            # Broadcast parameters from rank 0 to all other processes.
            hvd.broadcast_parameters(model.state_dict(), root_rank=0)

        checkpoint_freq = 0
        checkpoint_units = ""
        last_checkpoint = time.time()
        checkpoint_count = 0

        # force a ML app error to kill the app
        #x = foo/bar

        # parse checkpoint arg
        #print("args.checkpoint=", args.checkpoint, ", type(args.checkpoint)", type(args.checkpoint))

        if False:   # args.checkpoint:
            if type(args.checkpoint) in ["int", "float"]:
                checkpoint_freq = int(args.checkpoint)
                checkpoint_units = "epochs"
            elif isinstance(args.checkpoint, str):
                parts = args.checkpoint.split(' ')
                if len(parts) == 2:
                    checkpoint_freq, checkpoint_units = parts
                    checkpoint_freq = float(checkpoint_freq)
                    checkpoint_units = checkpoint_units.strip().lower()
                else:
                    checkpoint_freq = float(args.checkpoint)
                    checkpoint_units = "epochs"

        model_dir = os.getenv("XT_MODEL_DIR", "models/miniMnist")
        fn_model = model_dir + "/mnist_cnn.pt"

        self.fn_text_log = mnt_output_dir + "/text_log.txt"

        if args.eval_model:
            # load model and evaluate it
            print("loading existing MODEL and evaluating it, fn=", fn_model)
            exists = os.path.exists(fn_model)
            print("model exists=", exists)

            model.load_state_dict(torch.load(fn_model))
            print("model loaded!")

            # just test model
            self.test_model_and_log_metrics(run, model, device, epoch=1, args=args)
        else:
            self.train_test_loop(run, model, device, optimizer, 1, checkpoint_freq, args=args)

            if args.save_model:
                file_utils.ensure_dir_exists(model_dir)
                self.save_model(model, fn_model)

            # always save a copy of model in the AFTER FILES
            self.save_model(model, "output/mnist_cnn.pt")

            if args.clear_checkpoint_at_end:
                if checkpoint_freq and run and run.store:
                    run.clear_checkpoint()

        # create a file to be captured in OUTPUT FILES
        fn_app_log = os.path.join(local_output_dir, "miniMnist_log.txt")
        with open(fn_app_log, "wt") as outfile:
            outfile.write("This is a log for miniMnist app\n")
            outfile.write("miniMnist app completed\n")

        # create a file to be ignored in OUTPUT FILES
        fn_app_log = os.path.join(local_output_dir, "test.junk")
        with open(fn_app_log, "wt") as outfile:
            outfile.write("This is a file that should be omitted from AFTER upload\n")
            outfile.write("end of junk file\n")

        if run:
            # ensure we close all logging
            run.close()
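# Entry-point sketch (illustrative, not part of the original source): run the miniMnist
# Trainer as a script. parse_cmdline_args() is assumed to be defined elsewhere in the
# same module, as Trainer.run() implies.
if __name__ == "__main__":
    trainer = Trainer()
    trainer.run()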
class HyperparameterHandler():
    def __init__(self):
        """
        Preprocesses the runspec before the call to yaml.load().
        Manages communication with XT regarding hyperparameters.
        """
        self.uploaded_hp_config_filename = 'uploaded_hp_config.yaml'
        self.downloaded_hp_config_filename = 'downloaded_hp_config.yaml'
        self.xt_run_name = os.getenv("XT_RUN_NAME")
        self.xt_run = None
        self.in_hp_search = False
        self.randint_in_spec = False

        if self.xt_run_name:
            from xtlib.run import Run as XTRun
            self.xt_run = XTRun()
            if os.path.isfile(self.downloaded_hp_config_filename):
                self.in_hp_search = True

        self.hparams = []

    def split_spec(self, run_spec_file):
        # Read the spec into 3 sections.
        pre_hp_section = []
        hp_section = []
        post_hp_section = []
        current_section = pre_hp_section

        for line in run_spec_file:
            if current_section == pre_hp_section:
                # Look for the start of the hp section.
                if line.startswith('hyperparameters:'):
                    current_section = hp_section
            elif current_section == hp_section:
                # Look for the end of the hp section.
                if line[0] not in ' -#\n\r':
                    current_section = post_hp_section
            else:
                assert current_section == post_hp_section

            # Append this line to the current section.
            current_section.append(line)

        return pre_hp_section, hp_section, post_hp_section

    def preprocess(self, run_spec_file):
        """ Modifies the hyperparameter section of a runspec before yaml.load() is called on it. """
        # Read the spec into 3 sections.
        pre_hp_section, hp_section, post_hp_section = self.split_spec(run_spec_file)

        # Modify the HP section, if present.
        if len(hp_section) > 0:
            self.hparams = self.parse_hp_section(hp_section)
            if self.in_hp_search:
                self.read_hp_config_file()
            else:
                for hp in self.hparams:
                    hp.choose_value(self.in_hp_search)
            parsed_hp_section = ['hyperparameters:\n']
            for hp in self.hparams:
                parsed_hp_section += hp.format_chosen_value()
            parsed_hp_section.append('\n')
        else:
            parsed_hp_section = []

        # Reassemble the modified runspec.
        spec_str = ''.join(pre_hp_section + parsed_hp_section + post_hp_section)

        # Check for randint.
        self.randint_in_spec = 'randint' in spec_str

        # Return the modified runspec.
        return spec_str

    def parse_hp_section(self, hp_section_in):
        """
        Parses the hyperparameters section of a runspec.
        Returns a list of Hparam objects.
        For example...

            Input string hp_section_in:
                hyperparameters:
                  - name: &rscale
                    ordered_tuning_values: [2, 4, 8, 16, 32]
                    tuned_value: 32
                  - name: &units
                    ordered_tuning_values: [128, 192, 256, 384, 512]
                    tuned_value: 384

            Output returned:
                List of Hparam objects:
                    hp[0].name = 'rscale'
                         .values = [2, 4, 8, 16, 32]
                         .tuned_value = 32
                    hp[1].name = 'units'
                         .values = [128, 192, 256, 384, 512]
                         .tuned_value = 384
        """
        hparams = []
        name_line = ''
        values_line = ''
        i = 0

        for full_line in hp_section_in:
            line = full_line.strip().rstrip()
            if line.startswith('hyperparameters:') or (len(line) == 0) or (line[0] == '#'):
                continue
            if i == 0:
                if line.startswith('- name:'):
                    name_line = line
                    i = 1
                else:
                    raise SyntaxError('First line of a hyperparameter definition must start with "- name:"\n=====> {}'.format(line))
            elif i == 1:
                if (line.startswith('ordered_tuning_values:')) or (line.startswith('unordered_tuning_values:')):
                    values_line = line
                    i = 2
                else:
                    raise SyntaxError('Second line of a hyperparameter definition must start with "ordered_tuning_values:" or "unordered_tuning_values:"\n=====> {}'.format(line))
            elif i == 2:
                if line.startswith('tuned_value:'):
                    hp = Hparam(name_line, values_line, line)
                    hparams.append(hp)
                    i = 0
                else:
                    raise SyntaxError('Third line of a hyperparameter definition must start with "tuned_value:"\n=====> {}'.format(line))
            else:
                raise SyntaxError('Unexpected line in the hyperparameters section of the runspec: {}'.format(line))

        return hparams

    def log_chosen_values(self, logger):
        """ Logs the chosen HP values to the console for reference, and (optionally) to XT. """
        if len(self.hparams) > 0:
            hparam_dict = {}
            logger.write_line("Chosen hyperparameter values:")
            for hp in self.hparams:
                hp.log_chosen_value(logger)
                hparam_dict[hp.name] = hp.chosen_value
            logger.write_line('')
            if self.xt_run:
                self.xt_run.log_hparams(hparam_dict)

    def write_hp_config_file(self):
        """ Generates the file that XT needs to support HP tuning. """
        assert len(self.hparams) > 0, 'Hyperparameters must be specified.'

        # Warn the user if randint is missing from a hyperparameter search.
        if not self.randint_in_spec:
            response = None
            while (response != 'y') and (response != 'n'):
                print("WARNING: Hyperparameter tuning typically requires randomization,")
                print("which is usually achieved by setting the environment or agent seed to randint,")
                print("but randint is missing from this runspec. Are you sure you want to proceed? [y/n]")
                response = input()
            if response == 'n':
                exit(0)

        # Write the hp config file for the job launcher.
        hp_config_file = open(self.uploaded_hp_config_filename, 'w')
        hp_config_file.write('hyperparameter-distributions:\n')
        for hp in self.hparams:
            value_list = []
            for value in hp.values:
                value_list.append(hp.yaml_value_from_python(value))
            values_str = ', '.join(value_list)
            hp_config_file.write('  {}: [{}]\n'.format(hp.name, values_str))
        hp_config_file.close()

    def read_hp_config_file(self):
        """ Reads the file containing the HP values chosen by XT. """
        assert len(self.hparams) > 0, 'Hyperparameters must be specified.'
        print('Reading chosen hp values from downloaded_hp_config.yaml')
        chosen_hp_value_dict = yaml.load(open(self.downloaded_hp_config_filename, 'r'), Loader=yaml.Loader)
        hp_runset = chosen_hp_value_dict['hyperparameter-runset']
        # for hp_name in hp_runset:
        #     print('{}  {}'.format(hp_name, hp_runset[hp_name]))
        assert len(hp_runset) == len(self.hparams)
        for hp in self.hparams:
            hp.chosen_value = hp_runset[hp.name]
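# Usage sketch (illustrative, not part of the original source): preprocess a runspec
# file and load the result. The file name "spec.yaml" is a placeholder; preprocess()
# accepts any iterable of lines, such as an open file.
def example_preprocess_runspec():
    handler = HyperparameterHandler()
    with open("spec.yaml", "r") as run_spec_file:
        spec_str = handler.preprocess(run_spec_file)
    return yaml.load(spec_str, Loader=yaml.Loader)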
                    help='number of units in the MLP layer of the model')

# OPTIMIZER
parser.add_argument('--optimizer', type=str, default="sgd",
                    help='sets the optimizer for the model')
parser.add_argument('--weight-decay', type=float, default=0,
                    help='sets rate of weight decay for weights')

args = parser.parse_args()

# create an instance of XTRunLog to log info for current run
run = Run()

# log hyperparameters to xt
hp_dict = {"seed": args.seed, "batch-size": args.batch_size, "epochs": args.epochs,
           "lr": args.lr, "momentum": args.momentum, "channels1": args.channels1,
           "channels2": args.channels2, "kernel_size": args.kernel_size,
           "mlp-units": args.mlp_units, "weight-decay": args.weight_decay,
           "optimizer": args.optimizer, "mid-conv": args.mid_conv