def run_a_trial(args):
    """Run one TPE meta-optimisation step and save its results."""
    results_pickle_file = os.path.join(RESULTS_DIR, "{}.pkl".format(args.dataset))
    max_evals = nb_evals = 1

    print("Attempt to resume a past training if it exists:")
    try:
        # https://github.com/hyperopt/hyperopt/issues/267
        trials = pickle.load(open(results_pickle_file, "rb"))
        print("Found saved Trials! Loading...")
        max_evals = len(trials.trials) + nb_evals
        print("Rerunning from {} trials to add another one.".format(len(trials.trials)))
    except:
        trials = Trials()
        print("Starting from scratch: new trials.")

    best = fmin(get_optimizer(args.dataset, args.device),
                Model.HYPERPARAMETERS_SPACE,
                algo=tpe.suggest,
                trials=trials,
                max_evals=max_evals)

    pickle.dump(trials, open(results_pickle_file, "wb"))
    print("\nOPTIMIZATION STEP COMPLETE.\n")
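# Usage sketch (not part of the original source): because `run_a_trial` reloads the
# pickled Trials on every call, it can simply be invoked in a loop to keep adding one
# TPE evaluation at a time. The argparse flags and defaults below are hypothetical;
# the function above only relies on `args.dataset` and `args.device`.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", default="mnist")        # hypothetical default
    parser.add_argument("--device", default="cuda")          # hypothetical default
    parser.add_argument("--n-trials", type=int, default=20)  # hypothetical flag
    args = parser.parse_args()

    for _ in range(args.n_trials):
        run_a_trial(args)  # each call resumes, runs one more evaluation, re-saves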
def get_best_threshold(ensemble_model):
    """Run through a validation dataset, get the best IoU threshold."""
    train_gen, val_gen = get_generators()

    # Add metrics
    ensemble_model.compile(loss='binary_crossentropy',
                           optimizer=get_optimizer(),
                           metrics=get_metric(scan=True))
    preds_test = ensemble_model.evaluate_generator(generator=val_gen, verbose=True)

    # Now, simply fetch the right threshold
    ts = np.linspace(0, 1, len(preds_test))
    best = ts[np.argmax(preds_test)]
    print("Found {} to be the best threshold!".format(best))
    return best
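# Follow-up sketch (not in the original): applying the selected threshold to new
# predictions is a single comparison. `ensemble_model` and `x_test` are assumed to
# exist; the uint8 mask dtype is an arbitrary choice for illustration.
threshold = get_best_threshold(ensemble_model)
binary_masks = (ensemble_model.predict(x_test) > threshold).astype(np.uint8)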
def run_possibilities(dataset_path, logs_path, possibilities):
    x_train_labeled, x_train_unlabeled, y_train_labeled, x_val, y_val = get_data(
        dataset_path=dataset_path,
        normalization=NORMALIZATION,
        unlabeled_percentage=UNLABELED_PERCENTAGE,
        seed=SEED)
    _, evaluation_mapping, _ = timit.get_phone_mapping()
    n_classes = get_number_of_classes()

    for consistency_loss, schedule, sigma, consistency_scale, stabilization_scale, xi in possibilities:
        hparams = {
            'consistency_loss': consistency_loss,
            'schedule': schedule,
            'sigma': sigma,
            'consistency_scale': consistency_scale,
            'stabilization_scale': stabilization_scale,
            'xi': xi
        }
        for k, v in hparams.items():
            print(f'{k}={v}, ', end='')
        print()

        config = Config(version='mono_directional',
                        n_hidden_layers=N_HIDDEN_LAYERS,
                        n_units=N_UNITS,
                        n_epochs=N_EPOCHS,
                        batch_size=BATCH_SIZE,
                        unlabeled_percentage=UNLABELED_PERCENTAGE,
                        optimizer=OPTIMIZER,
                        consistency_loss=consistency_loss,
                        consistency_scale=consistency_scale,
                        stabilization_scale=stabilization_scale,
                        xi=xi,
                        sigma=sigma,
                        schedule=schedule,
                        schedule_length=SCHEDULE_LENGTH,
                        normalization=NORMALIZATION,
                        seed=SEED)

        logs_path_ = logs_path / str(config)
        if logs_path_.is_dir():  # skip what is already done (e.g. in case of crashes)
            print('already done, skipping...')
            continue
        logs_path_.mkdir(parents=True)
        logs_path_ = str(logs_path_)

        model = DualStudent(n_classes=n_classes,
                            n_hidden_layers=config.n_hidden_layers,
                            n_units=config.n_units,
                            consistency_loss=config.consistency_loss,
                            consistency_scale=config.consistency_scale,
                            stabilization_scale=config.stabilization_scale,
                            xi=config.xi,
                            padding_value=PADDING_VALUE,
                            sigma=config.sigma,
                            schedule=config.schedule,
                            schedule_length=config.schedule_length,
                            version=config.version)
        model.compile(optimizer=get_optimizer(config.optimizer))
        model.train(x_labeled=x_train_labeled,
                    x_unlabeled=x_train_unlabeled,
                    y_labeled=y_train_labeled,
                    n_epochs=config.n_epochs,
                    batch_size=config.batch_size,
                    seed=config.seed)
        results = model.test(x=x_val,
                             y=y_val,
                             batch_size=config.batch_size,
                             evaluation_mapping=evaluation_mapping)

        with tf.summary.create_file_writer(logs_path_).as_default():
            hp.hparams(hparams)
            for k, v in results.items():
                tf.summary.scalar(k, v, step=N_EPOCHS)
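# Illustrative driver (an assumption, not part of the original file): `possibilities`
# is an iterable of (consistency_loss, schedule, sigma, consistency_scale,
# stabilization_scale, xi) tuples, so a grid can be built with itertools.product.
# Every concrete value, the dataset path, and the logs directory name below are
# hypothetical placeholders.
from itertools import product
from pathlib import Path

possibilities = product(
    ['mse', 'kl'],           # consistency_loss (hypothetical choices)
    ['rampup', 'sigmoid'],   # schedule (hypothetical choices)
    [0.1, 0.3],              # sigma
    [1.0, 10.0],             # consistency_scale
    [1.0],                   # stabilization_scale
    [1e-6],                  # xi
)
run_possibilities(dataset_path=Path('data/timit'),  # hypothetical path
                  logs_path=Path('logs'),
                  possibilities=possibilities)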
def train_ray(opt, checkpoint_dir=None, data_dir="../data"):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data = get_dataset(opt)
    g = data[0]
    if opt['gpu'] < 0:
        cuda = False
    else:
        cuda = True
        g = g.int().to(opt['gpu'])

    features = g.ndata['feat']
    labels = g.ndata['label']
    train_mask = g.ndata['train_mask']
    val_mask = g.ndata['val_mask']
    test_mask = g.ndata['test_mask']
    num_feats = features.shape[1]
    n_classes = data.num_classes
    n_edges = data.graph.number_of_edges()
    print("""----Data statistics------'
      #Edges %d
      #Classes %d
      #Train samples %d
      #Val samples %d
      #Test samples %d""" %
          (n_edges, n_classes,
           train_mask.int().sum().item(),
           val_mask.int().sum().item(),
           test_mask.int().sum().item()))

    # add self loop
    g = dgl.remove_self_loop(g)
    g = dgl.add_self_loop(g)
    n_edges = g.number_of_edges()

    # create model
    heads = ([opt['num_heads']] * opt['num_layers']) + [opt['num_out_heads']]
    models = []
    optimizers = []
    datas = [g for i in range(opt['num_init'])]

    for split in range(opt['num_init']):
        if opt['model'] == 'GAT':
            model = GAT(g, opt['num_layers'], num_feats, opt['num_hidden'], n_classes, heads,
                        F.elu, opt['in_drop'], opt['attn_drop'], opt['negative_slope'],
                        opt['residual'], opt)
        elif opt['model'] == 'AGNN':
            model = AGNN(g, opt['num_layers'], num_feats, opt['num_hidden'], n_classes,
                         opt['in_drop'], opt)
        train_this = train
        model = model.to(device)
        models.append(model)

        if torch.cuda.device_count() > 1:
            model = nn.DataParallel(model)
            # model = model.to(device)

        parameters = [p for p in model.parameters() if p.requires_grad]
        optimizer = get_optimizer(opt['optimizer'], parameters, lr=opt['lr'],
                                  weight_decay=opt['weight_decay'])
        optimizers.append(optimizer)

        # The `checkpoint_dir` parameter gets passed by Ray Tune when a checkpoint
        # should be restored.
        if checkpoint_dir:
            checkpoint = os.path.join(checkpoint_dir, "checkpoint")
            model_state, optimizer_state = torch.load(checkpoint)
            model.load_state_dict(model_state)
            optimizer.load_state_dict(optimizer_state)

    for epoch in range(1, opt['epochs']):
        loss = np.mean([
            train_this(model, optimizer, features, train_mask, labels)[0].item()
            for model, optimizer in zip(models, optimizers)
        ])
        train_accs, val_accs, tmp_test_accs = average_test(models, datas)
        with tune.checkpoint_dir(step=epoch) as checkpoint_dir:
            best = np.argmax(val_accs)
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save((models[best].state_dict(), optimizers[best].state_dict()), path)
        tune.report(loss=loss,
                    accuracy=np.mean(val_accs),
                    test_acc=np.mean(tmp_test_accs),
                    train_acc=np.mean(train_accs))
def train_ray_int(opt, checkpoint_dir=None, data_dir="../data"):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data = get_dataset(opt)
    g = data[0]
    if opt['gpu'] < 0:
        cuda = False
    else:
        cuda = True
        g = g.int().to(opt['gpu'])

    # if opt["num_splits"] > 0:
    #     dataset.data = set_train_val_test_split(
    #         23 * np.random.randint(0, opt["num_splits"]),
    #         # random prime 23 to make the splits 'more' random. Could remove
    #         dataset.data,
    #         num_development=5000 if opt["dataset"] == "CoauthorCS" else 1500)

    features = g.ndata['feat']
    labels = g.ndata['label']
    train_mask = g.ndata['train_mask']
    val_mask = g.ndata['val_mask']
    test_mask = g.ndata['test_mask']
    num_feats = features.shape[1]
    n_classes = data.num_classes
    n_edges = data.graph.number_of_edges()
    print("""----Data statistics------'
      #Edges %d
      #Classes %d
      #Train samples %d
      #Val samples %d
      #Test samples %d""" %
          (n_edges, n_classes,
           train_mask.int().sum().item(),
           val_mask.int().sum().item(),
           test_mask.int().sum().item()))

    # add self loop
    g = dgl.remove_self_loop(g)
    g = dgl.add_self_loop(g)
    n_edges = g.number_of_edges()

    # create model
    heads = ([opt['num_heads']] * opt['num_layers']) + [opt['num_out_heads']]
    if opt['model'] == 'GAT':
        model = GAT(g, opt['num_layers'], num_feats, opt['num_hidden'], n_classes, heads,
                    F.elu, opt['in_drop'], opt['attn_drop'], opt['negative_slope'],
                    opt['residual'], opt)
    elif opt['model'] == 'AGNN':
        model = AGNN(g, opt['num_layers'], num_feats, opt['num_hidden'], n_classes,
                     opt['in_drop'], opt)

    model = model.to(device)
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    parameters = [p for p in model.parameters() if p.requires_grad]
    optimizer = get_optimizer(opt["optimizer"], parameters, lr=opt["lr"],
                              weight_decay=opt["weight_decay"])

    if checkpoint_dir:
        checkpoint = os.path.join(checkpoint_dir, "checkpoint")
        model_state, optimizer_state = torch.load(checkpoint)
        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    train_this = train
    this_test = test_OGB if opt['dataset'] == 'ogbn-arxiv' else test
    best_time = best_epoch = train_acc = val_acc = test_acc = 0

    for epoch in range(1, opt["epoch"]):
        # loss = train(model, optimizer, data)
        loss = train_this(model, optimizer, features, train_mask, labels)[0].item()
        if opt["no_early"]:
            tmp_train_acc, tmp_val_acc, tmp_test_acc = this_test(model, g)
            best_time = opt['time']
        else:
            tmp_train_acc, tmp_val_acc, tmp_test_acc = this_test(model, g)
        if tmp_val_acc > val_acc:
            best_epoch = epoch
            train_acc = tmp_train_acc
            val_acc = tmp_val_acc
            test_acc = tmp_test_acc
        with tune.checkpoint_dir(step=epoch) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save((model.state_dict(), optimizer.state_dict()), path)
        tune.report(loss=loss,
                    accuracy=val_acc,
                    test_acc=test_acc,
                    train_acc=train_acc,
                    best_time=best_time,
                    best_epoch=best_epoch)
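# Launch sketch (an assumption, not taken from the original repo): both trainables
# above use the Ray Tune function API (`tune.report` / `tune.checkpoint_dir`), so they
# can be handed to `tune.run` roughly as below. The search-space keys mirror the
# `opt[...]` lookups above, but every concrete value here is a hypothetical placeholder.
from functools import partial
from ray import tune

search_space = {
    'model': 'GAT',
    'dataset': 'cora',                        # hypothetical dataset name
    'gpu': 0,
    'num_layers': tune.choice([1, 2]),
    'num_hidden': tune.choice([8, 16, 32]),
    'num_heads': 8,
    'num_out_heads': 1,
    'in_drop': tune.uniform(0.0, 0.6),
    'attn_drop': tune.uniform(0.0, 0.6),
    'negative_slope': 0.2,
    'residual': False,
    'optimizer': 'adam',
    'lr': tune.loguniform(1e-4, 1e-1),
    'weight_decay': tune.loguniform(1e-6, 1e-3),
    'epoch': 200,
    'no_early': True,
    'time': 0,
}

analysis = tune.run(partial(train_ray_int, data_dir='../data'),
                    config=search_space,
                    num_samples=16,
                    resources_per_trial={'cpu': 4, 'gpu': 1},
                    metric='accuracy',
                    mode='max')
print(analysis.best_config)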
def predict_dl_model(data_type, variation, input_level, word_embed_type, word_embed_trainable,
                     batch_size, learning_rate, optimizer_type, model_name, checkpoint_dir=None,
                     return_proba=True, **kwargs):
    config = ModelConfig()
    config.variation = variation
    config.input_level = input_level
    if '_aug' in variation:
        config.max_len = {
            'word': config.aug_word_max_len,
            'char': config.aug_char_max_len
        }
    config.word_embed_type = word_embed_type
    config.word_embed_trainable = word_embed_trainable
    config.word_embeddings = np.load(
        format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE,
                        variation=variation, type=word_embed_type))
    config.batch_size = batch_size
    config.learning_rate = learning_rate
    config.optimizer = get_optimizer(optimizer_type, learning_rate)
    if checkpoint_dir is not None:
        config.checkpoint_dir = checkpoint_dir
    config.exp_name = '{}_{}_{}_{}_{}'.format(
        variation, model_name, input_level, word_embed_type,
        'tune' if word_embed_trainable else 'fix')

    print('Logging Info - Experiment: ', config.exp_name)
    if model_name == 'bilstm':
        model = BiLSTM(config, **kwargs)
    elif model_name == 'cnnrnn':
        model = CNNRNN(config, **kwargs)
    elif model_name == 'dcnn':
        model = DCNN(config, **kwargs)
    elif model_name == 'dpcnn':
        model = DPCNN(config, **kwargs)
    elif model_name == 'han':
        model = HAN(config, **kwargs)
    elif model_name == 'multicnn':
        model = MultiTextCNN(config, **kwargs)
    elif model_name == 'rcnn':
        model = RCNN(config, **kwargs)
    elif model_name == 'rnncnn':
        model = RNNCNN(config, **kwargs)
    elif model_name == 'cnn':
        model = TextCNN(config, **kwargs)
    elif model_name == 'vdcnn':
        model = VDCNN(config, **kwargs)
    else:
        raise ValueError('Model Name Not Understood : {}'.format(model_name))

    model_save_path = path.join(config.checkpoint_dir, '{}.hdf5'.format(config.exp_name))
    if not path.exists(model_save_path):
        raise FileNotFoundError('Model Not Found: {}'.format(model_save_path))
    # load the best model
    model.load_best_model()

    data = load_processed_data(variation, input_level, data_type)
    if data is None:
        return None, config.exp_name

    if return_proba:
        return model.predict_proba(data), config.exp_name
    else:
        return model.predict(data), config.exp_name
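# Call sketch (an assumption): `predict_dl_model` loads the best saved checkpoint for
# the named experiment and returns (predictions, exp_name). Every argument value below
# is a hypothetical placeholder for this repo's naming scheme.
probas, exp_name = predict_dl_model(data_type='test',
                                    variation='simplified',      # hypothetical
                                    input_level='word',
                                    word_embed_type='w2v_data',  # hypothetical
                                    word_embed_trainable=False,
                                    batch_size=64,
                                    learning_rate=0.001,
                                    optimizer_type='adam',
                                    model_name='bilstm',
                                    return_proba=True)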
logging.basicConfig(format='%(asctime)s %(module)s - %(funcName)s: %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S')
logging.getLogger().setLevel(logging.INFO)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Restoring training run from a given checkpoint')
    parser.add_argument('--checkpoint-path', help="The path of the checkpoint file", required=True)
    args = parser.parse_args()
    checkpoint = torch.load(args.checkpoint_path)
    opts = checkpoint['opts']

    logging.info("Loading the data")
    model_opts = create_model_opts(opts)
    train_data, test_data = get_data(opts, model_opts)

    logging.info("Restore the {0} model to epoch {1} on {2} dataset(Loss:{3}, train accuracy:{4})".format(
        opts.model, checkpoint["epoch"], opts.data, checkpoint["loss"], checkpoint["train_accuracy"]))
    model = models.get_model(opts, datasets_info[opts.data], pretrained=False)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.train()

    optimizer, lr_scheduler = get_optimizer(opts, model)
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    training_model = convert_to_ipu_model(model, opts, optimizer)

    train(training_model, train_data, opts, lr_scheduler,
          range(checkpoint["epoch"] + 1, opts.epoch + 1), optimizer)

    checkpoint_folder = os.path.dirname(os.path.realpath(args.checkpoint_path))
    checkpoint_files = [os.path.join(checkpoint_folder, file_name)
                        for file_name in os.listdir(checkpoint_folder)]
    validate_checkpoints(checkpoint_files, test_data=test_data)
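# Invocation sketch (the script file name and checkpoint path are hypothetical;
# --checkpoint-path is the only flag defined above):
#   python restore.py --checkpoint-path checkpoints/epoch_10.pt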
opts = checkpoint['opts']

utils.Logger.setup_logging_folder(opts)

logging.info("Loading the data")
model_opts = create_model_opts(opts)
train_data = data.get_data(opts, model_opts, train=True, async_dataloader=True)
if not opts.no_validation:
    inference_model_opts = poptorch.Options().deviceIterations(
        max(opts.device_iterations, 1 + len(opts.pipeline_splits)))
    inference_model_opts.replicationFactor(opts.replicas)
    test_data = data.get_data(opts, inference_model_opts, train=False, async_dataloader=True)

logging.info(f"Restore the {opts.model} model to epoch {checkpoint['epoch']} on {opts.data} dataset(Loss:{checkpoint['loss']}, train accuracy:{checkpoint['train_accuracy']})")
model = models.get_model(opts, data.datasets_info[opts.data], pretrained=False)
model.load_state_dict(checkpoint['model_state_dict'])
model.train()

optimizer = get_optimizer(opts, model)
lr_scheduler = get_lr_scheduler(opts, optimizer)
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
# set the LR scheduler to the correct position
lr_scheduler.last_epoch += checkpoint["epoch"]
training_model = convert_to_ipu_model(model, opts, optimizer)

train(training_model, train_data, opts, lr_scheduler,
      range(checkpoint["epoch"] + 1, opts.epoch + 1), optimizer)

if not opts.no_validation:
    checkpoint_folder = os.path.dirname(os.path.realpath(args.checkpoint_path))
    checkpoint_files = [os.path.join(checkpoint_folder, file_name)
                        for file_name in os.listdir(checkpoint_folder)
                        if file_name.endswith(".pt")]
    validate_checkpoints(checkpoint_files, test_data)
    opts, train=True, async_dataloader=True)

logging.info(
    f"Restore the {args.model} model to epoch {checkpoint['epoch']} on {args.data} dataset(Loss:{checkpoint['loss']}, train accuracy:{checkpoint['train_accuracy']})"
)
model = models.get_model(args,
                         datasets.datasets_info[args.data],
                         pretrained=False,
                         use_mixup=args.mixup_enabled,
                         use_cutmix=args.cutmix_enabled)
models.load_model_state_dict(model, checkpoint['model_state_dict'])
model.train()

optimizer = get_optimizer(args, model)
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
lr_scheduler = get_lr_scheduler(args, optimizer, len(train_data),
                                start_epoch=checkpoint["epoch"])
training_model = convert_to_ipu_model(model, args, optimizer)

if args.validation_mode == "during":
    training_validation_func = get_validation_function(args, model).func
else:
    training_validation_func = None

train(training_model, train_data, args, lr_scheduler,
      range(checkpoint["epoch"] + 1, args.epoch + 1), optimizer,
      training_validation_func)