def train(train_env, tok, n_iters, log_every=100, val_envs=None, aug_env=None):
    """Train the listener agent, logging, validating and checkpointing per interval.

    Relies on module-level names defined elsewhere in the file (``args``,
    ``log_dir``, ``output_dir``, ``feedback_method``).

    Args:
        train_env: Environment assigned to ``listner.env`` for ground-truth
            training; also passed to ``Seq2SeqAgent`` and ``Speaker``.
        tok: Tokenizer passed to ``Seq2SeqAgent`` and ``Speaker``.
        n_iters: Number of iterations to run on top of the start iteration
            returned by ``listner.load`` (0 when nothing is loaded).
        log_every: Iterations per train/log/validate interval. Overridden to
            40 when ``args.fast_train`` is set.
        val_envs: Mapping ``env_name -> (env, evaluator)`` used for
            validation. ``None`` (the default) means no validation envs.
        aug_env: Optional augmented environment. When given, each step
            alternates between ``train_env`` and ``aug_env``.
    """
    # A fresh dict per call: a ``{}`` default would be shared between calls.
    if val_envs is None:
        val_envs = {}

    writer = SummaryWriter(logdir=log_dir)
    listner = Seq2SeqAgent(train_env, "", tok, args.maxAction)

    speaker = None
    if args.self_train:
        speaker = Speaker(train_env, listner, tok)
        if args.speaker is not None:
            print("Load the speaker from %s." % args.speaker)
            if args.upload:
                speaker_path = get_sync_dir(
                    os.path.join(args.upload_path, args.speaker))
            else:
                speaker_path = os.path.join(args.R2R_Aux_path, args.speaker)
            speaker.load(speaker_path)

    start_iter = 0
    if args.load is not None:
        if args.upload:
            refs_paths = get_outputs_refs_paths()['experiments'][0]
            print(refs_paths)
            load_model = os.path.join(refs_paths, args.load)
            print(load_model)
            print("LOAD THE listener from %s" % load_model)
            start_iter = listner.load(load_model)
        else:
            print("LOAD THE listener from %s" % args.load)
            start_iter = listner.load(
                os.path.join(args.R2R_Aux_path, args.load))

    start = time.time()

    # Best success rate seen so far per tracked validation split.
    best_val = {
        'val_seen': {"accu": 0., "state": "", 'update': False},
        'val_unseen': {"accu": 0., "state": "", 'update': False},
    }
    if args.fast_train:
        log_every = 40

    for idx in range(start_iter, start_iter + n_iters, log_every):
        listner.logs = defaultdict(list)
        # The last interval may be shorter than log_every.
        interval = min(log_every, start_iter + n_iters - idx)
        cur_iter = idx + interval

        # Train for log_every interval
        if aug_env is None:  # The default training process
            listner.env = train_env
            listner.train(interval, feedback=feedback_method)
        elif args.accumulate_grad:
            for _ in range(interval // 2):
                listner.zero_grad()

                # Train with GT data
                listner.env = train_env
                args.ml_weight = 0.2
                listner.accumulate_gradient(feedback_method)

                # Train with Back Translation
                listner.env = aug_env
                args.ml_weight = 0.6  # Sem-Configuration
                listner.accumulate_gradient(feedback_method, speaker=speaker)
                listner.optim_step()
        else:
            for _ in range(interval // 2):
                # Train with GT data
                listner.env = train_env
                args.ml_weight = 0.2
                listner.train(1, feedback=feedback_method)

                # Train with Back Translation
                listner.env = aug_env
                args.ml_weight = 0.6
                listner.train(1, feedback=feedback_method, speaker=speaker)

        # Log the training stats to tensorboard
        total = max(sum(listner.logs['total']), 1)
        # Sanity check: rl and ml losses are expected to be logged in lockstep.
        assert (max(len(listner.logs['rl_loss']), 1) ==
                max(len(listner.logs['ml_loss']), 1))
        max_rl_length = max(len(listner.logs['critic_loss']), 1)
        log_length = max(len(listner.logs['rl_loss']), 1)

        # Every logged series is averaged over the number of rl_loss entries.
        averaged = {
            key: sum(listner.logs[key]) / log_length
            for key in ('rl_loss', 'ml_loss', 'critic_loss', 'spe_loss',
                        'pro_loss', 'mat_loss', 'fea_loss', 'ang_loss',
                        'entropy', 'us_loss')
        }

        writer.add_scalar("loss/rl_loss", averaged['rl_loss'], idx)
        writer.add_scalar("loss/ml_loss", averaged['ml_loss'], idx)
        writer.add_scalar("policy_entropy", averaged['entropy'], idx)
        writer.add_scalar("loss/spe_loss", averaged['spe_loss'], idx)
        writer.add_scalar("loss/pro_loss", averaged['pro_loss'], idx)
        writer.add_scalar("loss/mat_loss", averaged['mat_loss'], idx)
        writer.add_scalar("loss/fea_loss", averaged['fea_loss'], idx)
        writer.add_scalar("loss/ang_loss", averaged['ang_loss'], idx)
        writer.add_scalar("total_actions", total, idx)
        writer.add_scalar("max_rl_length", max_rl_length, idx)
        writer.add_scalar("loss/critic", averaged['critic_loss'], idx)
        writer.add_scalar("loss/unsupervised", averaged['us_loss'], idx)
        print("total_actions", total)
        print("max_rl_length", max_rl_length)

        # Run validation
        # NOTE(review): the nesting of the two newline appends below was
        # reconstructed (one per env, one after all envs) -- confirm.
        loss_str = ""
        for env_name, (env, evaluator) in val_envs.items():
            listner.env = env

            # Get validation loss under the same conditions as training
            iters = None if args.fast_train or env_name != 'train' else 20  # 20 * 64 = 1280

            # Get validation distance from goal under test evaluation conditions
            listner.test(use_dropout=False, feedback='argmax', iters=iters)
            result = listner.get_results()
            score_summary, _ = evaluator.score(result)
            loss_str += "%s " % env_name
            for metric, val in score_summary.items():
                if metric == 'success_rate':
                    loss_str += ', %s: %.4f' % (metric, val)
                    writer.add_scalar("%s/accuracy" % env_name, val, idx)
                    if env_name in best_val:
                        if val > best_val[env_name]['accu']:
                            best_val[env_name]['accu'] = val
                            best_val[env_name]['update'] = True
                if metric == 'spl':
                    writer.add_scalar("%s/spl" % env_name, val, idx)
                    loss_str += ', %s: %.4f' % (metric, val)
            loss_str += '\n'
        loss_str += '\n'

        # Snapshot the model for every split whose best score just improved.
        for env_name in best_val:
            if best_val[env_name]['update']:
                best_val[env_name]['state'] = 'Iter %d \n%s' % (cur_iter,
                                                               loss_str)
                best_val[env_name]['update'] = False
                file_dir = os.path.join(output_dir, "snap", args.name,
                                        "state_dict", "best_%s" % (env_name))
                listner.save(idx, file_dir)

        # Progress is measured relative to this run: n_iters counts the
        # iterations added on top of start_iter, so using the absolute
        # iteration would exceed 100% (and give a negative ETA) when resuming.
        progress = float(cur_iter - start_iter) / n_iters
        print(('%s (%d %d%%) \n%s' % (timeSince(start, progress), cur_iter,
                                      progress * 100, loss_str)))

        if cur_iter % 1000 == 0:
            print("BEST RESULT TILL NOW")
            for env_name in best_val:
                print(env_name, best_val[env_name]['state'])

        if cur_iter % args.save_iter == 0:
            file_dir = os.path.join(output_dir, "snap", args.name,
                                    "state_dict", "Iter_%06d" % (cur_iter))
            listner.save(idx, file_dir)
def run(train_config, logger, **kwargs):
    """Build trainer and evaluator engines from ``train_config`` and run training.

    Args:
        train_config: Configuration module; every setting is read from it
            with ``getattr`` and its ``__file__`` is copied to the config
            save directory.
        logger: Currently ignored -- it is replaced by the ``'UDA'`` logger
            on the first line. Kept so existing callers keep working.
        **kwargs: Accepted and unused.

    Raises:
        ImportError: If ``use_fp_16`` is set and ``apex`` cannot be imported.
    """
    # NOTE(review): this discards the caller-supplied logger -- confirm intended.
    logger = logging.getLogger('UDA')

    if getattr(train_config, 'debug', False):
        setup_logger(logger, logging.DEBUG)

    # Set Polyaxon environment if needed
    plx_logger = None
    save_dir = None
    output_experiment_path = None
    try:
        plx_logger = PolyaxonLogger()
        experiment = plx_logger.experiment
        save_dir = get_outputs_path()
        output_experiment_path = get_outputs_refs_paths()
        output_experiment_path = output_experiment_path['experiments'][
            0] if output_experiment_path else None
        logger.debug("Experiment info: {}".format(
            experiment.get_experiment_info()))
    except PolyaxonClientException as e:
        logger.warning('Logger Polyaxon : ' + str(e))

    # Path configuration
    saves_dict = getattr(train_config, 'saves', {})

    # A save_dir obtained from Polyaxon takes precedence over the config.
    save_dir = saves_dict.get('save_dir', '') if save_dir is None else save_dir
    log_dir = os.path.join(save_dir, saves_dict.get('log_dir', ''))
    save_model_dir = os.path.join(save_dir, saves_dict.get('model_dir', ''))
    save_prediction_dir = os.path.join(save_dir,
                                       saves_dict.get('prediction_dir', ''))
    save_config_dir = os.path.join(save_dir,
                                   saves_dict.get('config_dir', ''))
    load_model_file = saves_dict.get('load_model_file', '')
    load_optimizer_file = saves_dict.get('load_optimizer_file', '')

    # Create folders
    create_save_folders(save_dir, saves_dict)

    # Checkpoint paths are resolved against the referenced experiment outputs.
    if output_experiment_path is not None:
        model_dir = saves_dict.get('model_dir', '')
        load_model_file = os.path.join(
            output_experiment_path, model_dir,
            load_model_file) if load_model_file else None
        load_optimizer_file = os.path.join(
            output_experiment_path, model_dir,
            load_optimizer_file) if load_optimizer_file else None

    num_epochs = getattr(train_config, 'num_epochs')
    num_classes = getattr(train_config, 'num_classes')
    device = getattr(train_config, 'device', 'cpu')

    # Set magical acceleration
    if torch.cuda.is_available():
        torch.backends.cudnn.benchmark = True
    else:
        assert device == 'cpu', 'CUDA device selected but none is available'

    # Set half precision if required
    use_fp_16 = getattr(train_config, 'use_fp_16', False)

    train1_sup_loader = getattr(train_config, 'train1_sup_loader')
    train1_unsup_loader = getattr(train_config, 'train1_unsup_loader')
    train2_unsup_loader = getattr(train_config, 'train2_unsup_loader')
    test_loader = getattr(train_config, 'test_loader')

    save_interval = saves_dict.get('save_interval', 0)
    n_saved = saves_dict.get('n_saved', 0)

    val_interval = getattr(train_config, 'val_interval', 1)
    pred_interval = getattr(train_config, 'pred_interval', 0)

    model = getattr(train_config, 'model').to(device)
    optimizer = getattr(train_config, 'optimizer')
    criterion = getattr(train_config, 'criterion').to(device)
    consistency_criterion = getattr(train_config,
                                    'consistency_criterion').to(device)

    cm_metric = getattr(
        train_config, 'cm_metric',
        ConfusionMatrix(num_classes=num_classes,
                        output_transform=lambda x: (x['y_pred'], x['y'])))

    # AMP initialization for half precision
    if use_fp_16:
        assert 'cuda' in device
        assert torch.backends.cudnn.enabled, "NVIDIA/Apex:Amp requires cudnn backend to be enabled."
        try:
            from apex import amp
        except ImportError as err:
            # Only a failed import is translated; other errors propagate.
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to run this example."
            ) from err
        # Initialize amp
        model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

    # Load checkpoint
    load_params(model,
                optimizer=optimizer,
                model_file=load_model_file,
                optimizer_file=load_optimizer_file,
                device_name=device)

    # Add batch norm
    is_bn = getattr(train_config, 'is_bn', False)
    if is_bn:
        batch_norm = nn.BatchNorm2d(3).to(device)
        if use_fp_16:
            batch_norm = amp.initialize(batch_norm)
        batch_norm.reset_parameters()
        model = nn.Sequential(batch_norm, model)

    # Copy the config file
    shutil.copy2(os.path.abspath(train_config.__file__),
                 os.path.join(save_config_dir, 'checkpoint_module.py'))

    le = len(train1_sup_loader)
    num_train_steps = le * num_epochs
    mlflow.log_param("num train steps", num_train_steps)

    lr = getattr(train_config, 'learning_rate')
    num_warmup_steps = getattr(train_config, 'num_warmup_steps', 0)

    lr_scheduler = getattr(train_config, 'lr_scheduler', None)
    if lr_scheduler is not None:
        # The config supplies a factory that takes the optimizer.
        lr_scheduler = lr_scheduler(optimizer)

        if num_warmup_steps > 0:
            lr_scheduler = create_lr_scheduler_with_warmup(
                lr_scheduler,
                warmup_start_value=0.0,
                warmup_end_value=lr * (1.0 + 1.0 / num_warmup_steps),
                warmup_duration=num_warmup_steps)

    train1_sup_loader_iter = cycle(train1_sup_loader)
    train1_unsup_loader_iter = cycle(train1_unsup_loader)
    train2_unsup_loader_iter = cycle(train2_unsup_loader)

    # Reduce on plateau
    reduce_on_plateau = getattr(train_config, 'reduce_on_plateau', None)

    # Output transform model
    output_transform_model = getattr(train_config, 'output_transform_model',
                                     lambda x: x)

    inference_fn = getattr(train_config, 'inference_fn', inference_standard)

    lam = getattr(train_config, 'consistency_lambda')
    beta = getattr(train_config, 'consistency_beta', lam)

    tsa = TrainingSignalAnnealing(
        num_steps=num_train_steps,
        min_threshold=getattr(train_config, 'TSA_proba_min'),
        max_threshold=getattr(train_config, 'TSA_proba_max'))

    with_tsa = getattr(train_config, 'with_TSA', False)

    cfg = {
        'tsa': tsa,
        'lambda': lam,
        'beta': beta,
        'with_tsa': with_tsa,
        'device': device,
        'consistency_criterion': consistency_criterion,
        'criterion': criterion
    }

    trainer = Engine(
        partial(train_update_function,
                model=model,
                optimizer=optimizer,
                cfg=cfg,
                train1_sup_loader_iter=train1_sup_loader_iter,
                train1_unsup_loader_iter=train1_unsup_loader_iter,
                train2_unsup_loader_iter=train2_unsup_loader_iter,
                output_transform_model=output_transform_model,
                use_fp_16=use_fp_16))

    # Register events
    for e in CustomEvents:
        State.event_to_attr[e] = 'iteration'

    trainer.register_events(*CustomEvents)

    if with_tsa:
        trainer.add_event_handler(Events.ITERATION_COMPLETED, log_tsa, tsa)

    if lr_scheduler is not None:
        # Objects without ``step`` are attached directly as handlers;
        # otherwise ``step()`` is called at the start of each iteration.
        if not hasattr(lr_scheduler, "step"):
            trainer.add_event_handler(Events.ITERATION_STARTED, lr_scheduler)
        else:
            trainer.add_event_handler(Events.ITERATION_STARTED,
                                      lambda engine: lr_scheduler.step())

    trainer.add_event_handler(Events.ITERATION_COMPLETED, log_learning_rate,
                              optimizer)

    metric_names = [
        'supervised batch loss', 'consistency batch loss', 'final batch loss'
    ]

    def output_transform(x, name):
        return x[name]

    for n in metric_names:
        RunningAverage(
            output_transform=partial(output_transform, name=n)).attach(
                trainer, n)

    ProgressBar(persist=True,
                bar_format="").attach(trainer,
                                      event_name=Events.EPOCH_STARTED,
                                      closing_event_name=Events.COMPLETED)

    # Handlers for Tensorboard logging
    tb_logger = TensorboardLogger(log_dir=log_dir)
    tb_logger.attach(trainer,
                     log_handler=tbOutputHandler(tag="train",
                                                 metric_names=metric_names),
                     event_name=CustomEvents.ITERATION_K_COMPLETED)
    tb_logger.attach(trainer,
                     log_handler=tbOptimizerParamsHandler(optimizer,
                                                          param_name="lr"),
                     event_name=CustomEvents.ITERATION_K_STARTED)

    # Handlers for Polyaxon logging
    if plx_logger is not None:
        plx_logger.attach(trainer,
                          log_handler=plxOutputHandler(
                              tag="train", metric_names=metric_names),
                          event_name=CustomEvents.ITERATION_K_COMPLETED)

    metrics = {
        'loss': Loss(criterion,
                     output_transform=lambda x: (x['y_pred'], x['y'])),
        'mAcc': cmAccuracy(cm_metric).mean(),
        'mPr': cmPrecision(cm_metric).mean(),
        'mRe': cmRecall(cm_metric).mean(),
        'mIoU': mIoU(cm_metric),
        'mF1': cmFbeta(cm_metric, 1).mean()
    }
    # One extra metric per class index, taken from the per-class IoU.
    iou = IoU(cm_metric)
    for i in range(num_classes):
        key_name = 'IoU_{}'.format(str(i))
        metrics[key_name] = iou[i]

    inference_update_fn = partial(
        inference_update_function,
        model=model,
        cfg=cfg,
        output_transform_model=output_transform_model,
        inference_fn=inference_fn)

    evaluator = Engine(inference_update_fn)
    train_evaluator = Engine(inference_update_fn)

    for name, metric in metrics.items():
        metric.attach(train_evaluator, name)
        metric.attach(evaluator, name)

    # Add checkpoint
    if save_model_dir:
        checkpoint = ModelCheckpoint(dirname=save_model_dir,
                                     filename_prefix='checkpoint',
                                     save_interval=save_interval,
                                     n_saved=n_saved,
                                     create_dir=True)
        trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint, {
            'mymodel': model,
            'optimizer': optimizer
        })

    def trigger_k_iteration_started(engine, k):
        if engine.state.iteration % k == 0:
            engine.fire_event(CustomEvents.ITERATION_K_STARTED)

    def trigger_k_iteration_completed(engine, k):
        if engine.state.iteration % k == 0:
            engine.fire_event(CustomEvents.ITERATION_K_COMPLETED)

    def run_validation(engine, validation_interval):
        if (trainer.state.epoch - 1) % validation_interval == 0:
            train_evaluator.run(train1_sup_loader)
            evaluator.run(test_loader)

            if save_prediction_dir:
                train_output = train_evaluator.state.output
                test_output = evaluator.state.output

                iteration = str(trainer.state.iteration)
                epoch = str(trainer.state.epoch)

                # Only index 0 along the first axis is saved.
                save_prediction('train_{}_{}'.format(iteration, epoch),
                                save_prediction_dir,
                                train_output['x'],
                                torch.argmax(
                                    train_output['y_pred'][0, :, :, :],
                                    dim=0),
                                y=train_output['y'][0, :, :])

                save_prediction('test_{}_{}'.format(iteration, epoch),
                                save_prediction_dir,
                                test_output['x'],
                                torch.argmax(test_output['y_pred'][0, :, :, :],
                                             dim=0),
                                y=test_output['y'][0, :, :])

                # NOTE(review): nesting of these resets was reconstructed
                # (inside the save branch) -- confirm.
                train_evaluator.state.output = None
                evaluator.state.output = None

            if reduce_on_plateau is not None:
                reduce_on_plateau.step(evaluator.state.metrics['mIoU'])

    trainer.add_event_handler(Events.ITERATION_STARTED,
                              trigger_k_iteration_started,
                              k=10)
    trainer.add_event_handler(Events.ITERATION_COMPLETED,
                              trigger_k_iteration_completed,
                              k=10)

    trainer.add_event_handler(Events.EPOCH_STARTED,
                              run_validation,
                              validation_interval=val_interval)
    trainer.add_event_handler(Events.COMPLETED,
                              run_validation,
                              validation_interval=1)

    def trainer_prediction_save(engine, prediction_interval):
        # pred_interval defaults to 0; without this guard the modulo below
        # raises ZeroDivisionError on the first completed iteration.
        if not prediction_interval:
            return

        if (engine.state.iteration - 1) % prediction_interval == 0:
            if save_prediction_dir:
                trainer_output = trainer.state.output['unsup pred']

                iteration = str(trainer.state.iteration)
                epoch = str(trainer.state.epoch)

                save_prediction('trainer_{}_{}'.format(iteration, epoch),
                                save_prediction_dir, trainer_output['x'],
                                trainer_output['y_pred'])

                logger.debug(
                    'Saved trainer prediction for iteration {}'.format(
                        str(engine.state.iteration)))

                # NOTE(review): nesting of this reset was reconstructed
                # (inside the save branch) -- confirm.
                trainer.state.output = None

    trainer.add_event_handler(Events.ITERATION_COMPLETED,
                              trainer_prediction_save,
                              prediction_interval=pred_interval)

    tb_logger.attach(train_evaluator,
                     log_handler=tbOutputHandler(tag="train",
                                                 metric_names=list(
                                                     metrics.keys())),
                     event_name=Events.EPOCH_COMPLETED)

    tb_logger.attach(evaluator,
                     log_handler=tbOutputHandler(tag="test",
                                                 metric_names=list(
                                                     metrics.keys())),
                     event_name=Events.EPOCH_COMPLETED)

    # Handlers for Polyaxon logging
    if plx_logger is not None:
        plx_logger.attach(train_evaluator,
                          log_handler=plxOutputHandler(tag="train",
                                                       metric_names=list(
                                                           metrics.keys())),
                          event_name=Events.EPOCH_COMPLETED)

        plx_logger.attach(evaluator,
                          log_handler=plxOutputHandler(tag="test",
                                                       metric_names=list(
                                                           metrics.keys())),
                          event_name=Events.EPOCH_COMPLETED)

    trainer.add_event_handler(Events.ITERATION_COMPLETED,
                              mlflow_batch_metrics_logging, "train", trainer)
    train_evaluator.add_event_handler(Events.COMPLETED,
                                      mlflow_val_metrics_logging, "train",
                                      trainer)
    evaluator.add_event_handler(Events.COMPLETED, mlflow_val_metrics_logging,
                                "test", trainer)

    # The trainer iterates over step indices only; batches are drawn from
    # the cycled loader iterators handed to train_update_function.
    data_steps = list(range(len(train1_sup_loader)))

    logger.debug('Start training')
    trainer.run(data_steps, max_epochs=num_epochs)
    logger.debug('Finished training')
import time
import json
import numpy as np
from collections import defaultdict
from speaker import Speaker
from utils import read_vocab, write_vocab, build_vocab, Tokenizer, padding_idx, timeSince, read_img_features, get_sync_dir
import utils
from env import R2RBatch
from agent import Seq2SeqAgent
from eval import Evaluation
from collections import OrderedDict
from polyaxon_client.tracking import get_outputs_refs_paths

# When validating a listener in upload mode, resolve the checkpoint path
# against the first referenced experiment's outputs and print it.
# NOTE(review): `args` and `os` are not defined in the visible chunk --
# presumably provided earlier in the file; confirm.
if args.train == 'validlistener' and args.upload:
    refs_paths = get_outputs_refs_paths()['experiments'][0]
    print(refs_paths)
    load_model = os.path.join(refs_paths, args.load)
    print(load_model)

import warnings
# Suppress all warnings for everything that runs after this point.
warnings.filterwarnings("ignore")
from tensorboardX import SummaryWriter
from polyaxon_client.tracking import get_outputs_path

# In upload mode the vocabulary files are resolved through get_sync_dir
# relative to args.upload_path.
if args.upload:
    train_vocab = get_sync_dir(os.path.join(args.upload_path, args.TRAIN_VOCAB))
    trainval_vocab = get_sync_dir(
        os.path.join(args.upload_path, args.TRAINVAL_VOCAB))