Example #1
def train(train_env, tok, n_iters, log_every=100, val_envs={}, aug_env=None):
    writer = SummaryWriter(logdir=log_dir)
    listner = Seq2SeqAgent(train_env, "", tok, args.maxAction)

    speaker = None
    if args.self_train:
        speaker = Speaker(train_env, listner, tok)
        if args.speaker is not None:
            if args.upload:
                print("Load the speaker from %s." % args.speaker)
                speaker.load(
                    get_sync_dir(os.path.join(args.upload_path, args.speaker)))
            else:
                print("Load the speaker from %s." % args.speaker)
                speaker.load(os.path.join(args.R2R_Aux_path, args.speaker))

    start_iter = 0
    if args.load is not None:
        if args.upload:
            refs_paths = get_outputs_refs_paths()['experiments'][0]
            print(refs_paths)
            load_model = os.path.join(refs_paths, args.load)
            print(load_model)
            print("LOAD THE listener from %s" % load_model)
            start_iter = listner.load(load_model)
        else:
            print("LOAD THE listener from %s" % args.load)
            start_iter = listner.load(
                os.path.join(args.R2R_Aux_path, args.load))

    start = time.time()

    best_val = {
        'val_seen': {
            "accu": 0.,
            "state": "",
            'update': False
        },
        'val_unseen': {
            "accu": 0.,
            "state": "",
            'update': False
        }
    }
    if args.fast_train:
        log_every = 40
    for idx in range(start_iter, start_iter + n_iters, log_every):
        listner.logs = defaultdict(list)
        interval = min(log_every, start_iter + n_iters - idx)
        iter = idx + interval

        # Train for log_every interval
        if aug_env is None:  # The default training process
            listner.env = train_env
            listner.train(interval,
                          feedback=feedback_method)  # Train interval iters
        else:
            if args.accumulate_grad:
                for _ in range(interval // 2):
                    listner.zero_grad()
                    listner.env = train_env

                    # Train with GT data
                    args.ml_weight = 0.2
                    listner.accumulate_gradient(feedback_method)
                    listner.env = aug_env

                    # Train with Back Translation
                    args.ml_weight = 0.6  # Sem-Configuration
                    listner.accumulate_gradient(feedback_method,
                                                speaker=speaker)
                    listner.optim_step()
            else:
                for _ in range(interval // 2):
                    # Train with GT data
                    listner.env = train_env
                    args.ml_weight = 0.2
                    listner.train(1, feedback=feedback_method)

                    # Train with Back Translation
                    listner.env = aug_env
                    args.ml_weight = 0.6
                    listner.train(1, feedback=feedback_method, speaker=speaker)

        # Log the training stats to tensorboard
        total = max(sum(listner.logs['total']), 1)
        # RL and ML losses are logged once per step, so their counts should match.
        assert max(len(listner.logs['rl_loss']), 1) == max(len(listner.logs['ml_loss']), 1)
        max_rl_length = max(len(listner.logs['critic_loss']), 1)
        log_length = max(len(listner.logs['rl_loss']), 1)
        rl_loss = sum(listner.logs['rl_loss']) / log_length
        ml_loss = sum(listner.logs['ml_loss']) / log_length
        critic_loss = sum(listner.logs['critic_loss']) / log_length
        spe_loss = sum(listner.logs['spe_loss']) / log_length
        pro_loss = sum(listner.logs['pro_loss']) / log_length
        mat_loss = sum(listner.logs['mat_loss']) / log_length
        fea_loss = sum(listner.logs['fea_loss']) / log_length
        ang_loss = sum(listner.logs['ang_loss']) / log_length
        entropy = sum(listner.logs['entropy']) / log_length
        predict_loss = sum(listner.logs['us_loss']) / log_length
        writer.add_scalar("loss/rl_loss", rl_loss, idx)
        writer.add_scalar("loss/ml_loss", ml_loss, idx)
        writer.add_scalar("policy_entropy", entropy, idx)
        writer.add_scalar("loss/spe_loss", spe_loss, idx)
        writer.add_scalar("loss/pro_loss", pro_loss, idx)
        writer.add_scalar("loss/mat_loss", mat_loss, idx)
        writer.add_scalar("loss/fea_loss", fea_loss, idx)
        writer.add_scalar("loss/ang_loss", ang_loss, idx)
        writer.add_scalar("total_actions", total, idx)
        writer.add_scalar("max_rl_length", max_rl_length, idx)
        writer.add_scalar("loss/critic", critic_loss, idx)
        writer.add_scalar("loss/unsupervised", predict_loss, idx)
        print("total_actions", total)
        print("max_rl_length", max_rl_length)

        # Run validation
        loss_str = ""
        for env_name, (env, evaluator) in val_envs.items():
            listner.env = env

            # Get validation loss under the same conditions as training
            iters = None if args.fast_train or env_name != 'train' else 20  # 20 * 64 = 1280

            # Get validation distance from goal under test evaluation conditions
            listner.test(use_dropout=False, feedback='argmax', iters=iters)
            result = listner.get_results()
            score_summary, _ = evaluator.score(result)
            loss_str += "%s " % env_name
            for metric, val in score_summary.items():
                if metric in ['success_rate']:
                    loss_str += ', %s: %.4f' % (metric, val)
                    writer.add_scalar("%s/accuracy" % env_name, val, idx)
                    if env_name in best_val:
                        if val > best_val[env_name]['accu']:
                            best_val[env_name]['accu'] = val
                            best_val[env_name]['update'] = True
                if metric in ['spl']:
                    writer.add_scalar("%s/spl" % env_name, val, idx)
                    loss_str += ', %s: %.4f' % (metric, val)
            loss_str += '\n'
        loss_str += '\n'

        for env_name in best_val:
            if best_val[env_name]['update']:
                best_val[env_name]['state'] = 'Iter %d \n%s' % (iter, loss_str)
                best_val[env_name]['update'] = False
                file_dir = os.path.join(output_dir, "snap", args.name,
                                        "state_dict", "best_%s" % (env_name))
                listner.save(idx, file_dir)
        print('%s (%d %d%%) \n%s' % (timeSince(start, float(iter) / n_iters),
                                     iter, float(iter) / n_iters * 100, loss_str))

        if iter % 1000 == 0:
            print("BEST RESULT TILL NOW")
            for env_name in best_val:
                print(env_name, best_val[env_name]['state'])

        if iter % args.save_iter == 0:
            file_dir = os.path.join(output_dir, "snap", args.name,
                                    "state_dict", "Iter_%06d" % (iter))
            listner.save(idx, file_dir)
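
A minimal driver sketch for train(), assuming the project helpers imported in Example #3 below (read_vocab, Tokenizer, read_img_features, R2RBatch, Evaluation). The constructor arguments, the encoding length, the feature-path field, and the iteration count are illustrative assumptions, not the project's verified signatures.

# Hypothetical setup; adapt paths and constructor arguments to the actual project.
from utils import read_vocab, Tokenizer, read_img_features
from env import R2RBatch
from eval import Evaluation

vocab = read_vocab(args.TRAIN_VOCAB)                  # training vocabulary path
tok = Tokenizer(vocab=vocab, encoding_length=80)      # 80: placeholder max instruction length
feat_dict = read_img_features(args.features)          # args.features: assumed field for image features

train_env = R2RBatch(feat_dict, batch_size=args.batchSize,
                     splits=['train'], tokenizer=tok)
val_envs = {
    split: (R2RBatch(feat_dict, batch_size=args.batchSize,
                     splits=[split], tokenizer=tok),
            Evaluation([split]))                      # Evaluation signature assumed
    for split in ['val_seen', 'val_unseen']
}

train(train_env, tok, n_iters=80000, val_envs=val_envs)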
Example #2
def run(train_config, logger, **kwargs):

    logger = logging.getLogger('UDA')
    if getattr(train_config, 'debug', False):
        setup_logger(logger, logging.DEBUG)

    # Set Polyaxon environment if needed
    plx_logger = None
    save_dir = None
    output_experiment_path = None
    try:
        plx_logger = PolyaxonLogger()
        experiment = plx_logger.experiment
        save_dir = get_outputs_path()
        output_experiment_path = get_outputs_refs_paths()
        output_experiment_path = (output_experiment_path['experiments'][0]
                                  if output_experiment_path else None)
        logger.debug("Experiment info: {}".format(
            experiment.get_experiment_info()))
    except PolyaxonClientException as e:
        logger.warning('Logger Polyaxon : ' + str(e))

    # Path configuration
    saves_dict = getattr(train_config, 'saves', {})

    save_dir = saves_dict.get('save_dir', '') if save_dir is None else save_dir
    log_dir = os.path.join(save_dir, saves_dict.get('log_dir', ''))
    save_model_dir = os.path.join(save_dir, saves_dict.get('model_dir', ''))
    save_prediction_dir = os.path.join(save_dir,
                                       saves_dict.get('prediction_dir', ''))
    save_config_dir = os.path.join(save_dir, saves_dict.get('config_dir', ''))
    load_model_file = saves_dict.get('load_model_file', '')
    load_optimizer_file = saves_dict.get('load_optimizer_file', '')

    # Create folders
    create_save_folders(save_dir, saves_dict)

    if output_experiment_path is not None:
        model_dir = saves_dict.get('model_dir', '')
        load_model_file = os.path.join(
            output_experiment_path, model_dir,
            load_model_file) if load_model_file else None
        load_optimizer_file = os.path.join(
            output_experiment_path, model_dir,
            load_optimizer_file) if load_optimizer_file else None

    num_epochs = getattr(train_config, 'num_epochs')
    num_classes = getattr(train_config, 'num_classes')
    device = getattr(train_config, 'device', 'cpu')

    # Enable cuDNN benchmark auto-tuning when CUDA is available
    if torch.cuda.is_available():
        torch.backends.cudnn.benchmark = True
    else:
        assert device == 'cpu', 'CUDA device selected but none is available'

    # Set half precision if required
    use_fp_16 = getattr(train_config, 'use_fp_16', False)

    train1_sup_loader = getattr(train_config, 'train1_sup_loader')
    train1_unsup_loader = getattr(train_config, 'train1_unsup_loader')
    train2_unsup_loader = getattr(train_config, 'train2_unsup_loader')
    test_loader = getattr(train_config, 'test_loader')

    save_interval = saves_dict.get('save_interval', 0)
    n_saved = saves_dict.get('n_saved', 0)

    val_interval = getattr(train_config, 'val_interval', 1)
    pred_interval = getattr(train_config, 'pred_interval', 0)

    model = getattr(train_config, 'model').to(device)

    optimizer = getattr(train_config, 'optimizer')

    criterion = getattr(train_config, 'criterion').to(device)
    consistency_criterion = getattr(train_config,
                                    'consistency_criterion').to(device)

    cm_metric = getattr(
        train_config, 'cm_metric',
        ConfusionMatrix(num_classes=num_classes,
                        output_transform=lambda x: (x['y_pred'], x['y'])))

    # AMP initialization for half precision
    if use_fp_16:
        assert 'cuda' in device
        assert torch.backends.cudnn.enabled, "NVIDIA/Apex:Amp requires cudnn backend to be enabled."
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to run this example."
            )
        # Initialize amp
        model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

    # Load checkpoint
    load_params(model,
                optimizer=optimizer,
                model_file=load_model_file,
                optimizer_file=load_optimizer_file,
                device_name=device)

    # Add batch norm
    is_bn = getattr(train_config, 'is_bn', False)
    if is_bn:
        batch_norm = nn.BatchNorm2d(3).to(device)
        if use_fp_16:
            batch_norm = amp.initialize(batch_norm)
        batch_norm.reset_parameters()
        model = nn.Sequential(batch_norm, model)

    # Copy the config file
    shutil.copy2(os.path.abspath(train_config.__file__),
                 os.path.join(save_config_dir, 'checkpoint_module.py'))

    le = len(train1_sup_loader)
    num_train_steps = le * num_epochs
    mlflow.log_param("num train steps", num_train_steps)

    lr = getattr(train_config, 'learning_rate')
    num_warmup_steps = getattr(train_config, 'num_warmup_steps', 0)

    lr_scheduler = getattr(train_config, 'lr_scheduler', None)
    if lr_scheduler is not None:
        lr_scheduler = lr_scheduler(optimizer)

    if num_warmup_steps > 0:
        lr_scheduler = create_lr_scheduler_with_warmup(
            lr_scheduler,
            warmup_start_value=0.0,
            warmup_end_value=lr * (1.0 + 1.0 / num_warmup_steps),
            warmup_duration=num_warmup_steps)

    train1_sup_loader_iter = cycle(train1_sup_loader)
    train1_unsup_loader_iter = cycle(train1_unsup_loader)
    train2_unsup_loader_iter = cycle(train2_unsup_loader)

    # Reduce on plateau
    reduce_on_plateau = getattr(train_config, 'reduce_on_plateau', None)

    # Output transform model
    output_transform_model = getattr(train_config, 'output_transform_model',
                                     lambda x: x)

    inference_fn = getattr(train_config, 'inference_fn', inference_standard)

    lam = getattr(train_config, 'consistency_lambda')
    beta = getattr(train_config, 'consistency_beta', lam)

    tsa = TrainingSignalAnnealing(
        num_steps=num_train_steps,
        min_threshold=getattr(train_config, 'TSA_proba_min'),
        max_threshold=getattr(train_config, 'TSA_proba_max'))

    with_tsa = getattr(train_config, 'with_TSA', False)

    cfg = {
        'tsa': tsa,
        'lambda': lam,
        'beta': beta,
        'with_tsa': with_tsa,
        'device': device,
        'consistency_criterion': consistency_criterion,
        'criterion': criterion
    }

    trainer = Engine(
        partial(train_update_function,
                model=model,
                optimizer=optimizer,
                cfg=cfg,
                train1_sup_loader_iter=train1_sup_loader_iter,
                train1_unsup_loader_iter=train1_unsup_loader_iter,
                train2_unsup_loader_iter=train2_unsup_loader_iter,
                output_transform_model=output_transform_model,
                use_fp_16=use_fp_16))

    # Register custom events and tie them to the engine's iteration counter
    for e in CustomEvents:
        State.event_to_attr[e] = 'iteration'

    trainer.register_events(*CustomEvents)

    if with_tsa:
        trainer.add_event_handler(Events.ITERATION_COMPLETED, log_tsa, tsa)

    if lr_scheduler is not None:
        if not hasattr(lr_scheduler, "step"):
            trainer.add_event_handler(Events.ITERATION_STARTED, lr_scheduler)
        else:
            trainer.add_event_handler(Events.ITERATION_STARTED,
                                      lambda engine: lr_scheduler.step())

    trainer.add_event_handler(Events.ITERATION_COMPLETED, log_learning_rate,
                              optimizer)

    metric_names = [
        'supervised batch loss', 'consistency batch loss', 'final batch loss'
    ]

    def output_transform(x, name):
        return x[name]

    for n in metric_names:
        RunningAverage(
            output_transform=partial(output_transform, name=n)).attach(
                trainer, n)

    ProgressBar(persist=True,
                bar_format="").attach(trainer,
                                      event_name=Events.EPOCH_STARTED,
                                      closing_event_name=Events.COMPLETED)

    # Handlers for Tensorboard logging
    tb_logger = TensorboardLogger(log_dir=log_dir)
    tb_logger.attach(trainer,
                     log_handler=tbOutputHandler(tag="train",
                                                 metric_names=metric_names),
                     event_name=CustomEvents.ITERATION_K_COMPLETED)
    tb_logger.attach(trainer,
                     log_handler=tbOptimizerParamsHandler(optimizer,
                                                          param_name="lr"),
                     event_name=CustomEvents.ITERATION_K_STARTED)

    # Handlers for Polyaxon logging
    if plx_logger is not None:
        plx_logger.attach(trainer,
                          log_handler=plxOutputHandler(
                              tag="train", metric_names=metric_names),
                          event_name=CustomEvents.ITERATION_K_COMPLETED)

    metrics = {
        'loss': Loss(criterion,
                     output_transform=lambda x: (x['y_pred'], x['y'])),
        'mAcc': cmAccuracy(cm_metric).mean(),
        'mPr': cmPrecision(cm_metric).mean(),
        'mRe': cmRecall(cm_metric).mean(),
        'mIoU': mIoU(cm_metric),
        'mF1': cmFbeta(cm_metric, 1).mean()
    }
    iou = IoU(cm_metric)
    for i in range(num_classes):
        key_name = 'IoU_{}'.format(str(i))
        metrics[key_name] = iou[i]

    inference_update_fn = partial(
        inference_update_function,
        model=model,
        cfg=cfg,
        output_transform_model=output_transform_model,
        inference_fn=inference_fn)

    evaluator = Engine(inference_update_fn)
    train_evaluator = Engine(inference_update_fn)

    for name, metric in metrics.items():
        metric.attach(train_evaluator, name)
        metric.attach(evaluator, name)

    # Add checkpoint
    if save_model_dir:
        checkpoint = ModelCheckpoint(dirname=save_model_dir,
                                     filename_prefix='checkpoint',
                                     save_interval=save_interval,
                                     n_saved=n_saved,
                                     create_dir=True)
        trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint, {
            'mymodel': model,
            'optimizer': optimizer
        })

    def trigger_k_iteration_started(engine, k):
        if engine.state.iteration % k == 0:
            engine.fire_event(CustomEvents.ITERATION_K_STARTED)

    def trigger_k_iteration_completed(engine, k):
        if engine.state.iteration % k == 0:
            engine.fire_event(CustomEvents.ITERATION_K_COMPLETED)

    def run_validation(engine, validation_interval):
        if (trainer.state.epoch - 1) % validation_interval == 0:
            train_evaluator.run(train1_sup_loader)
            evaluator.run(test_loader)

            if save_prediction_dir:
                train_output = train_evaluator.state.output
                test_output = evaluator.state.output

                iteration = str(trainer.state.iteration)
                epoch = str(trainer.state.epoch)

                save_prediction('train_{}_{}'.format(iteration, epoch),
                                save_prediction_dir,
                                train_output['x'],
                                torch.argmax(
                                    train_output['y_pred'][0, :, :, :], dim=0),
                                y=train_output['y'][0, :, :])

                save_prediction('test_{}_{}'.format(iteration, epoch),
                                save_prediction_dir,
                                test_output['x'],
                                torch.argmax(test_output['y_pred'][0, :, :, :],
                                             dim=0),
                                y=test_output['y'][0, :, :])

            train_evaluator.state.output = None
            evaluator.state.output = None

            if reduce_on_plateau is not None:
                reduce_on_plateau.step(evaluator.state.metrics['mIoU'])

    trainer.add_event_handler(Events.ITERATION_STARTED,
                              trigger_k_iteration_started,
                              k=10)
    trainer.add_event_handler(Events.ITERATION_COMPLETED,
                              trigger_k_iteration_completed,
                              k=10)

    trainer.add_event_handler(Events.EPOCH_STARTED,
                              run_validation,
                              validation_interval=val_interval)
    trainer.add_event_handler(Events.COMPLETED,
                              run_validation,
                              validation_interval=1)

    def trainer_prediction_save(engine, prediction_interval):
        # pred_interval defaults to 0, which disables trainer prediction saving
        if prediction_interval > 0 and (engine.state.iteration - 1) % prediction_interval == 0:

            if save_prediction_dir:
                trainer_output = trainer.state.output['unsup pred']

                iteration = str(trainer.state.iteration)
                epoch = str(trainer.state.epoch)

                save_prediction('trainer_{}_{}'.format(iteration, epoch),
                                save_prediction_dir, trainer_output['x'],
                                trainer_output['y_pred'])

                logger.debug(
                    'Saved trainer prediction for iteration {}'.format(
                        str(engine.state.iteration)))

            trainer.state.output = None

    trainer.add_event_handler(Events.ITERATION_COMPLETED,
                              trainer_prediction_save,
                              prediction_interval=pred_interval)

    tb_logger.attach(train_evaluator,
                     log_handler=tbOutputHandler(tag="train",
                                                 metric_names=list(
                                                     metrics.keys())),
                     event_name=Events.EPOCH_COMPLETED)

    tb_logger.attach(evaluator,
                     log_handler=tbOutputHandler(tag="test",
                                                 metric_names=list(
                                                     metrics.keys())),
                     event_name=Events.EPOCH_COMPLETED)

    # Handlers for Polyaxon logging
    if plx_logger is not None:
        plx_logger.attach(train_evaluator,
                          log_handler=plxOutputHandler(tag="train",
                                                       metric_names=list(
                                                           metrics.keys())),
                          event_name=Events.EPOCH_COMPLETED)

        plx_logger.attach(evaluator,
                          log_handler=plxOutputHandler(tag="test",
                                                       metric_names=list(
                                                           metrics.keys())),
                          event_name=Events.EPOCH_COMPLETED)

    trainer.add_event_handler(Events.ITERATION_COMPLETED,
                              mlflow_batch_metrics_logging, "train", trainer)
    train_evaluator.add_event_handler(Events.COMPLETED,
                                      mlflow_val_metrics_logging, "train",
                                      trainer)
    evaluator.add_event_handler(Events.COMPLETED, mlflow_val_metrics_logging,
                                "test", trainer)

    # Dummy epoch indices: the update function pulls real batches from the cycled
    # loader iterators, so the trainer only needs the right number of steps per epoch.
    data_steps = list(range(len(train1_sup_loader)))

    logger.debug('Start training')
    trainer.run(data_steps, max_epochs=num_epochs)
    logger.debug('Finished training')
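
run() pulls its entire configuration off train_config with getattr, so the attribute names below form its implicit contract. The placeholder values, the stand-in model and loaders, and the SimpleNamespace container are assumptions for illustration; note that run() also copies train_config.__file__ into the config directory, so a real train_config is a Python module rather than a namespace object.

# Sketch of the train_config attributes read by run() (placeholder values).
from types import SimpleNamespace
import torch
import torch.nn as nn
import torch.optim as optim

# Tiny stand-in data and model so the example is self-contained.
dataset = torch.utils.data.TensorDataset(torch.randn(8, 3, 32, 32),
                                         torch.randint(0, 21, (8, 32, 32)))
loader = torch.utils.data.DataLoader(dataset, batch_size=4)
model = nn.Conv2d(3, 21, kernel_size=1)

train_config = SimpleNamespace(
    # Required (no default in run())
    num_epochs=10,
    num_classes=21,
    train1_sup_loader=loader,
    train1_unsup_loader=loader,
    train2_unsup_loader=loader,
    test_loader=loader,
    model=model,
    optimizer=optim.SGD(model.parameters(), lr=0.01),
    criterion=nn.CrossEntropyLoss(),
    consistency_criterion=nn.MSELoss(),
    consistency_lambda=1.0,
    learning_rate=0.01,
    TSA_proba_min=0.1,
    TSA_proba_max=1.0,
    # Optional (run() provides defaults)
    device='cpu',
    use_fp_16=False,
    val_interval=1,
    pred_interval=0,
    with_TSA=False,
    saves={'save_dir': '/tmp/uda', 'log_dir': 'logs', 'model_dir': 'models',
           'prediction_dir': 'predictions', 'config_dir': 'configs',
           'save_interval': 1, 'n_saved': 3},
)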
Example #3
import os
import time
import json
import numpy as np
from collections import defaultdict
from speaker import Speaker

from utils import (read_vocab, write_vocab, build_vocab, Tokenizer, padding_idx,
                   timeSince, read_img_features, get_sync_dir)
import utils
from env import R2RBatch
from agent import Seq2SeqAgent
from eval import Evaluation
from collections import OrderedDict

from polyaxon_client.tracking import get_outputs_refs_paths
# NOTE: `args` is assumed to come from the project's shared argument/config module,
# imported elsewhere in this file.
if args.train == 'validlistener' and args.upload:
    refs_paths = get_outputs_refs_paths()['experiments'][0]
    print(refs_paths)
    load_model = os.path.join(refs_paths, args.load)
    print(load_model)

import warnings
warnings.filterwarnings("ignore")

from tensorboardX import SummaryWriter
from polyaxon_client.tracking import get_outputs_path

if args.upload:
    train_vocab = get_sync_dir(os.path.join(args.upload_path,
                                            args.TRAIN_VOCAB))
    trainval_vocab = get_sync_dir(
        os.path.join(args.upload_path, args.TRAINVAL_VOCAB))