def run_experiment(**config):
    set_seed(config['seed'])
    original_saved_path = config['saved_path']
    if original_saved_path is not None:
        saved_model = joblib.load(config['saved_path'])
        if 'config' in saved_model and not config['override_old_config']:
            config = saved_model['config']
    arguments = {
        "start_loc": 'all',
        "include_holdout_obj": False,
        "persist_goal": config['persist_goal'],
        "persist_objs": config['persist_objs'],
        "persist_agent": config['persist_agent'],
        "feedback_type": config["feedback_type"],
        "feedback_always": config["feedback_always"],
        "feedback_freq": config["feedback_freq"],
        "cartesian_steps": config["cartesian_steps"],
        "num_meta_tasks": config["rollouts_per_meta_task"],
        "intermediate_reward": config["intermediate_reward"],
    }
    advice_start_index = 160
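    # Resume all training state from the checkpoint if one was provided; otherwise build everything from scratch.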
    if original_saved_path is not None:
        set_seed(config['seed'])
        policy = saved_model['policy']
        optimizer = saved_model['optimizer']
        policy.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")  # TODO: is this necessary?
        policy.hidden_state = None
        baseline = saved_model['baseline']
        curriculum_step = saved_model['curriculum_step']
        env = rl2env(normalize(
            Curriculum(config['advance_curriculum_func'],
                       start_index=curriculum_step,
                       **arguments)),
                     ceil_reward=config['ceil_reward'])
        start_itr = saved_model['itr']
        reward_predictor = saved_model['reward_predictor']
        reward_predictor.hidden_state = None
        if 'supervised_model' in saved_model:
            supervised_model = saved_model['supervised_model']
        else:
            supervised_model = None

        # Train with every configured feedback (teacher) type.
        teacher_train_dict = {teacher_name: True
                              for teacher_name in config['feedback_type']}

    else:

        teacher_train_dict = {teacher_name: True
                              for teacher_name in config['feedback_type']}

        optimizer = None
        baseline = None
        env = rl2env(normalize(
            Curriculum(config['advance_curriculum_func'],
                       start_index=config['level'],
                       **arguments)),
                     ceil_reward=config['ceil_reward'])
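        # Probe one reset observation to size the advice (teacher feedback) input.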
        obs = env.reset()
        obs_dim = 100  # TODO: consider changing this with 'additional' and adding it!
        advice_size = sum(
            [np.prod(obs[adv_k].shape) for adv_k in teacher_train_dict.keys()])

        image_dim = 128
        memory_dim = config['memory_dim']
        instr_dim = config['instr_dim']
        use_instr = True
        instr_arch = 'bigru'
        use_mem = True
        arch = 'bow_endpool_res'
        advice_dim = 128  # TODO: move this to the config
        policy = ACModel(obs_space=obs_dim,
                         action_space=env.action_space,
                         env=env,
                         image_dim=image_dim,
                         memory_dim=memory_dim,
                         instr_dim=instr_dim,
                         lang_model=instr_arch,
                         use_instr=use_instr,
                         use_memory=use_mem,
                         arch=arch,
                         advice_dim=advice_dim,
                         advice_size=advice_size,
                         num_modules=config['num_modules'])

        # The reward predictor reuses the ACModel architecture with a binary (Discrete(2)) action space.
        reward_predictor = ACModel(
            obs_space=obs_dim - 1,  # TODO: change into Discrete(3) and do 3-way classification
            action_space=spaces.Discrete(2),
            env=env,
            image_dim=image_dim,
            memory_dim=memory_dim,
            instr_dim=instr_dim,
            lang_model=instr_arch,
            use_instr=use_instr,
            use_memory=use_mem,
            arch=arch,
            advice_dim=advice_dim,
            advice_size=advice_size,
            num_modules=config['num_modules'])
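        # Optionally build a separate student model to distill into, or reuse the policy itself.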
        if config['self_distill'] and not config['distill_same_model']:
            obs_dim = env.reset()['obs'].shape[0]
            image_dim = 128
            memory_dim = config['memory_dim']
            instr_dim = config['instr_dim']
            use_instr = True
            instr_arch = 'bigru'
            use_mem = True
            arch = 'bow_endpool_res'
            supervised_model = ACModel(obs_space=obs_dim - 1,
                                       action_space=env.action_space,
                                       env=env,
                                       image_dim=image_dim,
                                       memory_dim=memory_dim,
                                       instr_dim=instr_dim,
                                       lang_model=instr_arch,
                                       use_instr=use_instr,
                                       use_memory=use_mem,
                                       arch=arch,
                                       advice_dim=advice_dim,
                                       advice_size=advice_size,
                                       num_modules=config['num_modules'])
        elif config['self_distill']:
            supervised_model = policy
        else:
            supervised_model = None
        start_itr = 0
        curriculum_step = env.index
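    # Pack trainer hyperparameters from the config into an argparse Namespace.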
    parser = ArgumentParser()
    args = parser.parse_args([])
    args.entropy_coef = config['entropy_bonus']
    args.model = 'default_il'
    args.lr = config['learning_rate']
    args.recurrence = config['backprop_steps']
    args.clip_eps = config['clip_eps']
    if supervised_model is not None:
        il_trainer = ImitationLearning(
            supervised_model,
            env,
            args,
            distill_with_teacher=config['distill_with_teacher'])
    else:
        il_trainer = None
    rp_trainer = ImitationLearning(reward_predictor,
                                   env,
                                   args,
                                   distill_with_teacher=True,
                                   reward_predictor=True)

    # The teacher's null-feedback template defines the advice fields the observation preprocessor expects.
    teacher_null_dict = env.teacher.null_feedback()
    obs_preprocessor = make_obs_preprocessor(teacher_null_dict)

    # Meta-RL sampler: collects rollouts using the policy, reward predictor, and optional distillation model.
    sampler = MetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
        envs_per_task=1,
        reward_predictor=reward_predictor,
        supervised_model=supervised_model,
        obs_preprocessor=obs_preprocessor,
    )

    # Turn sampled rollouts into advantage estimates (GAE) for the policy update.
    sample_processor = RL2SampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
        positive_adv=config['positive_adv'],
    )

    # 20 independent copies of the environment for PPO.
    envs = [copy.deepcopy(env) for _ in range(20)]
    algo = PPOAlgo(policy,
                   envs,
                   config['frames_per_proc'],
                   config['discount'],
                   args.lr,
                   args.beta1,
                   args.beta2,
                   config['gae_lambda'],
                   args.entropy_coef,
                   config['value_loss_coef'],
                   config['max_grad_norm'],
                   args.recurrence,
                   args.optim_eps,
                   config['clip_eps'],
                   config['epochs'],
                   config['meta_batch_size'],
                   parallel=config['parallel'],
                   rollouts_per_meta_task=config['rollouts_per_meta_task'],
                   obs_preprocessor=obs_preprocessor)

    if optimizer is not None:
        algo.optimizer.load_state_dict(optimizer)

    EXP_NAME = get_exp_name(config)
    exp_dir = os.getcwd() + '/data/' + EXP_NAME + "_" + str(config['seed'])
    if original_saved_path is None:
        if os.path.isdir(exp_dir):
            shutil.rmtree(exp_dir)
    log_formats = ['stdout', 'log', 'csv']
    is_debug = config['prefix'] == 'DEBUG'

    if not is_debug:
        log_formats.append('tensorboard')
        log_formats.append('wandb')
    logger.configure(dir=exp_dir,
                     format_strs=log_formats,
                     snapshot_mode=config['save_option'],
                     snapshot_gap=50,
                     step=start_itr,
                     name=config['prefix'] + str(config['seed']),
                     config=config)
    with open(exp_dir + '/params.json', 'w') as f:
        json.dump(config, f, indent=2, sort_keys=True, cls=ClassEncoder)

    advice_end_index, advice_dim = 161, 1
    # teacher_info describes the advice slice [advice_start_index, advice_end_index) and its null (no-advice) value.
    # TODO: generalize this for multiple feedback types at once!
    if config['distill_with_teacher']:
        teacher_info = []
    else:
        null_val = np.zeros(advice_end_index - advice_start_index)
        if len(null_val) > 0:
            null_val[-1] = 1
        teacher_info = [{
            "indices": np.arange(advice_start_index, advice_end_index),
            "null": null_val,
        }]

    # Wire everything into the meta-RL trainer and run the training loop.
    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=deepcopy(env),
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
        start_itr=start_itr,
        success_threshold=config['success_threshold'],
        accuracy_threshold=config['accuracy_threshold'],
        exp_name=exp_dir,
        curriculum_step=curriculum_step,
        config=config,
        advance_without_teacher=True,
        teacher_info=teacher_info,
        sparse_rewards=not config['intermediate_reward'],
        distill_only=config['distill_only'],
        il_trainer=il_trainer,
        source=config['source'],
        batch_size=config['meta_batch_size'],
        train_with_teacher=config['feedback_type'] is not None,
        distill_with_teacher=config['distill_with_teacher'],
        supervised_model=supervised_model,
        reward_predictor=reward_predictor,
        rp_trainer=rp_trainer,
        advance_levels=config['advance_levels'],
        is_debug=is_debug,
        teacher_train_dict=teacher_train_dict,
        obs_preprocessor=obs_preprocessor,
    )
    trainer.train()
Example #2
import time
import datetime
import numpy as np
import sys
import logging
import babyai.utils as utils
from babyai.arguments import ArgumentParser
from babyai.imitation import ImitationLearning
from babyai.evaluate import batch_evaluate, evaluate
from babyai.utils.agent import BotAgent
import torch
import blosc
from babyai.utils.agent import DemoAgent

# Parse arguments
parser = ArgumentParser()
parser.add_argument("--demos", default=None,
                    help="demos filename (REQUIRED or demos-origin required)")
parser.add_argument("--demos-origin", required=False,
                    help="origin of the demonstrations: human | agent (REQUIRED or demos required)")
parser.add_argument("--episodes", type=int, default=0,
                    help="number of episodes of demonstrations to use "
                         "(default: 0, meaning all demos)")
parser.add_argument("--start-demos", type=int, default=5000,
                    help="the starting number of demonstrations")
parser.add_argument("--demo-grow-factor", type=float, default=1.2,
                    help="multiplicative factor by which to grow the training demo set")
parser.add_argument("--num-eval-demos", type=int, default=1000,
                    help="number of demos used for evaluation while growing the training set")
parser.add_argument("--finetune", action="store_true", default=False,
                    help="fine-tune the model at every phase instead of retraining")
Example #3
def main(exp, argv):
    os.environ["BABYAI_STORAGE"] = exp.results_directory()

    # Parse arguments
    parser = ArgumentParser()
    parser.add_argument("--algo",
                        default='ppo',
                        help="algorithm to use (default: ppo)")
    parser.add_argument("--discount",
                        type=float,
                        default=0.99,
                        help="discount factor (default: 0.99)")
    parser.add_argument("--reward-scale",
                        type=float,
                        default=20.,
                        help="Reward scale multiplier")
    parser.add_argument(
        "--gae-lambda",
        type=float,
        default=0.99,
        help="lambda coefficient in GAE formula (default: 0.99, 1 means no gae)"
    )
    parser.add_argument("--value-loss-coef",
                        type=float,
                        default=0.5,
                        help="value loss term coefficient (default: 0.5)")
    parser.add_argument("--max-grad-norm",
                        type=float,
                        default=0.5,
                        help="maximum norm of gradient (default: 0.5)")
    parser.add_argument("--clip-eps",
                        type=float,
                        default=0.2,
                        help="clipping epsilon for PPO (default: 0.2)")
    parser.add_argument("--ppo-epochs",
                        type=int,
                        default=4,
                        help="number of epochs for PPO (default: 4)")
    parser.add_argument(
        "--save-interval",
        type=int,
        default=50,
        help="number of updates between two saves (default: 50, 0 means no saving)")
    parser.add_argument("--workers",
                        type=int,
                        default=8,
                        help="number of workers for PyTorch (default: 8)")
    parser.add_argument("--max-count",
                        type=int,
                        default=1000,
                        help="maximum number of frames to run for")
    parser.add_argument("--sample_duration",
                        type=float,
                        default=0.5,
                        help="sampling duration")
    parser.add_argument("--cuda",
                        action="store_true",
                        default=False,
                        help="whether to use cuda")
    args = parser.parse_args(argv)

    utils.seed(args.seed)

    torch_settings = init_torch(
        seed=args.seed,
        cuda=args.cuda,
        workers=args.workers,
    )

    # Generate environments
    envs = []
    for i in range(args.procs):
        env = gym.make(args.env)
        env.seed(100 * args.seed + i)
        envs.append(env)

    # Define model name
    suffix = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
    instr = args.instr_arch if args.instr_arch else "noinstr"
    mem = "mem" if not args.no_mem else "nomem"
    model_name_parts = {
        'env': args.env,
        'algo': args.algo,
        'arch': args.arch,
        'instr': instr,
        'mem': mem,
        'seed': args.seed,
        'info': '',
        'coef': '',
        'suffix': suffix
    }
    default_model_name = "{env}_{algo}_{arch}_{instr}_{mem}_seed{seed}{info}{coef}_{suffix}".format(
        **model_name_parts)
    if args.pretrained_model:
        default_model_name = args.pretrained_model + '_pretrained_' + default_model_name
    args.model = args.model.format(
        **model_name_parts) if args.model else default_model_name

    utils.configure_logging(args.model)
    logger = logging.getLogger(__name__)

    # Define obss preprocessor
    if 'emb' in args.arch:
        obss_preprocessor = utils.IntObssPreprocessor(
            args.model, envs[0].observation_space, args.pretrained_model)
    else:
        obss_preprocessor = utils.ObssPreprocessor(args.model,
                                                   envs[0].observation_space,
                                                   args.pretrained_model)

    # Define actor-critic model
    # acmodel = utils.load_model(args.model, raise_not_found=False)
    acmodel = None
    if acmodel is None:
        if args.pretrained_model:
            acmodel = utils.load_model(args.pretrained_model,
                                       raise_not_found=True)
        else:
            acmodel = ACModel(obss_preprocessor.obs_space,
                              envs[0].action_space, args.image_dim,
                              args.memory_dim, args.instr_dim,
                              not args.no_instr, args.instr_arch,
                              not args.no_mem, args.arch)

    obss_preprocessor.vocab.save()
    # utils.save_model(acmodel, args.model)

    if torch_settings.cuda:
        acmodel.cuda()

    # Define actor-critic algo

    # Scale raw environment rewards by args.reward_scale before the PPO update.
    reshape_reward = lambda _0, _1, reward, _2: args.reward_scale * reward
    if args.algo == "ppo":
        algo = babyai.rl.PPOAlgo(
            envs, acmodel, args.frames_per_proc, args.discount, args.lr,
            args.beta1, args.beta2, args.gae_lambda, args.entropy_coef,
            args.value_loss_coef, args.max_grad_norm, args.recurrence,
            args.optim_eps, args.clip_eps, args.ppo_epochs, args.batch_size,
            obss_preprocessor, reshape_reward)
    else:
        raise ValueError("Incorrect algorithm name: {}".format(args.algo))

    # When using extra binary information, more tensors (model params) are initialized than when we don't,
    # so the random state starts to diverge. To make sure the results of supervised-loss-coef=0.
    # and extra-binary-info=0 match, we need to reseed here.

    utils.seed(args.seed)

    # Restore training status

    status_path = os.path.join(utils.get_log_dir(args.model), 'status.json')
    if os.path.exists(status_path):
        with open(status_path, 'r') as src:
            status = json.load(src)
    else:
        status = {'i': 0, 'num_episodes': 0, 'num_frames': 0}

    # # Define logger and Tensorboard writer and CSV writer

    # header = (["update", "episodes", "frames", "FPS", "duration"]
    #         + ["return_" + stat for stat in ['mean', 'std', 'min', 'max']]
    #         + ["success_rate"]
    #         + ["num_frames_" + stat for stat in ['mean', 'std', 'min', 'max']]
    #         + ["entropy", "value", "policy_loss", "value_loss", "loss", "grad_norm"])
    # if args.tb:
    #     from tensorboardX import SummaryWriter

    #     writer = SummaryWriter(utils.get_log_dir(args.model))
    # csv_path = os.path.join(utils.get_log_dir(args.model), 'log.csv')
    # first_created = not os.path.exists(csv_path)
    # # we don't buffer data going into the csv log, because we assume
    # # that one update will take much longer than one write to the log
    # csv_writer = csv.writer(open(csv_path, 'a', 1))
    # if first_created:
    #     csv_writer.writerow(header)

    # Log code state, command, availability of CUDA and model

    babyai_code = list(babyai.__path__)[0]
    try:
        last_commit = subprocess.check_output(
            'cd {}; git log -n1'.format(babyai_code),
            shell=True).decode('utf-8')
        logger.info('LAST COMMIT INFO:')
        logger.info(last_commit)
    except subprocess.CalledProcessError:
        logger.info('Could not figure out the last commit')
    try:
        diff = subprocess.check_output('cd {}; git diff'.format(babyai_code),
                                       shell=True).decode('utf-8')
        if diff:
            logger.info('GIT DIFF:')
            logger.info(diff)
    except subprocess.CalledProcessError:
        logger.info('Could not figure out the git diff')
    logger.info('COMMAND LINE ARGS:')
    logger.info(args)
    logger.info("CUDA available: {}".format(torch.cuda.is_available()))
    logger.info(acmodel)

    # Train model

    total_start_time = time.time()
    best_success_rate = 0
    best_mean_return = 0
    test_env_name = args.env

    wrapper = iteration_wrapper(
        exp,
        sync=torch_settings.sync,
        max_count=args.max_count,
        sample_duration=args.sample_duration,
    )

    # while status['num_frames'] < args.frames:
    while True:
        with wrapper() as it:
            # Update parameters
            if wrapper.done():
                break

            update_start_time = time.time()
            logs = algo.update_parameters()
            update_end_time = time.time()

            it.set_count(logs["num_frames"])
            it.log(loss=logs["loss"], )