import time
import datetime
import numpy as np
import sys
import logging
import babyai.utils as utils
from babyai.arguments import ArgumentParser
from babyai.imitation import ImitationLearning
from babyai.evaluate import batch_evaluate, evaluate
from babyai.utils.agent import BotAgent
import torch
import blosc
from babyai.utils.agent import DemoAgent

# Parse arguments
parser = ArgumentParser()
parser.add_argument("--demos", default=None,
                    help="demos filename (REQUIRED or demos-origin required)")
parser.add_argument("--demos-origin", required=False,
                    help="origin of the demonstrations: human | agent (REQUIRED or demos required)")
parser.add_argument("--episodes", type=int, default=0,
                    help="number of episodes of demonstrations to use "
                         "(default: 0, meaning all demos)")
parser.add_argument("--start-demos", type=int, default=5000,
                    help="the starting number of demonstrations")
parser.add_argument("--demo-grow-factor", type=float, default=1.2,
                    help="number of demos to add to the training set")
parser.add_argument("--num-eval-demos", type=int, default=1000,
                    help="number of demos used for evaluation while growing the training set")
parser.add_argument("--finetune", action="store_true", default=False,
                    help="fine-tune the model at every phase instead of retraining")
import time
import datetime
import torch
import numpy as np
import subprocess
import babyai
import babyai.utils as utils
import babyai.rl
from babyai.arguments import ArgumentParser
from babyai.model import ACModel, ACModelImgInstr
from babyai.evaluate import batch_evaluate
from babyai.utils.agent import ModelAgent

# Parse arguments
parser = ArgumentParser()
parser.add_argument("--algo", default='ppo',
                    help="algorithm to use (default: ppo)")
parser.add_argument("--discount", type=float, default=0.99,
                    help="discount factor (default: 0.99)")
parser.add_argument("--reward-scale", type=float, default=20.,
                    help="Reward scale multiplier")
parser.add_argument("--gae-lambda", type=float, default=0.99,
                    help="lambda coefficient in GAE formula (default: 0.99, 1 means no gae)")
parser.add_argument("--value-loss-coef", type=float, default=0.5,
                    help="value loss term coefficient (default: 0.5)")
parser.add_argument("--max-grad-norm", type=float, default=0.5,
                    help="maximum norm of gradient (default: 0.5)")
parser.add_argument("--clip-eps", type=float, default=0.2,
                    help="clipping epsilon for PPO (default: 0.2)")
parser.add_argument("--ppo-epochs", type=int, default=4,
import os
import sys
import gym
import time
import datetime
import numpy as np
import logging
import torch

sys.path.insert(0, os.environ['BABYAI_ROOT'])
sys.path.insert(0, os.path.join(os.environ['BABYAI_ROOT'], 'babyai'))

from babyai.arguments import ArgumentParser
import babyai.utils as utils
from babyai.imitation import ImitationLearning
import gridworld.envs

# Parse arguments
parser = ArgumentParser()
parser.add_argument("--demos", default=None,
                    help="demos filename (REQUIRED or demos-origin or multi-demos required)")
parser.add_argument("--demos-size", default='10k',
                    help="size of demos")
parser.add_argument("--demos-origin", required=False,
                    help="origin of the demonstrations: human | agent (REQUIRED or demos or multi-demos required)")
parser.add_argument("--episodes", type=int, default=0,
                    help="number of episodes of demonstrations to use"
import datetime
import numpy as np
import sys
import logging
import babyai.utils as utils
from babyai.arguments import ArgumentParser
from babyai.imitation import ImitationLearning
from babyai.evaluate import batch_evaluate, evaluate
from babyai.utils.agent import BotAgent
import torch
import blosc
from babyai.utils.agent import DemoAgent

# Parse arguments
parser = ArgumentParser()
parser.add_argument("--demos", default=None,
                    help="demos filename (REQUIRED or demos-origin required)")
parser.add_argument("--demos-origin", required=False,
                    help="origin of the demonstrations: human | agent (REQUIRED or demos required)")
parser.add_argument("--episodes", type=int, default=0,
                    help="number of episodes of demonstrations to use "
                         "(default: 0, meaning all demos)")
parser.add_argument("--start-demos", type=int,
import os
import csv
import copy
import gym
import time
import datetime
import numpy as np
import sys
import logging
import torch
from babyai.arguments import ArgumentParser
import babyai.utils as utils
from babyai.imitation import ImitationLearning

# Parse arguments
parser = ArgumentParser()
parser.add_argument("--demos", default=None,
                    help="demos filename (REQUIRED or demos-origin or multi-demos required)")
parser.add_argument("--demos-origin", required=False,
                    help="origin of the demonstrations: human | agent (REQUIRED or demos or multi-demos required)")
parser.add_argument("--episodes", type=int, default=0,
                    help="number of episodes of demonstrations to use "
                         "(default: 0, meaning all demos)")
def run_experiment(**config):
    set_seed(config['seed'])
    original_saved_path = config['saved_path']
    if original_saved_path is not None:
        # Resuming: reuse the saved run's config unless explicitly overridden.
        saved_model = joblib.load(config['saved_path'])
        if 'config' in saved_model:
            if not config['override_old_config']:
                config = saved_model['config']
    arguments = {
        "start_loc": 'all',
        "include_holdout_obj": False,
        "persist_goal": config['persist_goal'],
        "persist_objs": config['persist_objs'],
        "persist_agent": config['persist_agent'],
        "feedback_type": config["feedback_type"],
        "feedback_always": config["feedback_always"],
        "feedback_freq": config["feedback_freq"],
        "cartesian_steps": config["cartesian_steps"],
        "num_meta_tasks": config["rollouts_per_meta_task"],
        "intermediate_reward": config["intermediate_reward"],
    }
    advice_start_index = 160
    if original_saved_path is not None:
        # Restore policy, optimizer, and curriculum state from the checkpoint.
        set_seed(config['seed'])
        policy = saved_model['policy']
        optimizer = saved_model['optimizer']
        policy.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # TODO: is this necessary?
        policy.hidden_state = None
        baseline = saved_model['baseline']
        curriculum_step = saved_model['curriculum_step']
        env = rl2env(normalize(Curriculum(config['advance_curriculum_func'],
                                          start_index=curriculum_step,
                                          **arguments)),
                     ceil_reward=config['ceil_reward'])
        start_itr = saved_model['itr']
        reward_predictor = saved_model['reward_predictor']
        reward_predictor.hidden_state = None
        if 'supervised_model' in saved_model:
            supervised_model = saved_model['supervised_model']
        else:
            supervised_model = None
        teacher_train_dict = {}
        for teacher_name in config['feedback_type']:
            teacher_train_dict[teacher_name] = True
    else:
        # Fresh run: build the environment at the configured starting level.
        teacher_train_dict = {}
        for teacher_name in config['feedback_type']:
            teacher_train_dict[teacher_name] = True
        optimizer = None
        baseline = None
        env = rl2env(normalize(Curriculum(config['advance_curriculum_func'],
                                          start_index=config['level'],
                                          **arguments)),
                     ceil_reward=config['ceil_reward'])
        obs = env.reset()
        obs_dim = 100  # TODO: consider changing this with 'additional' and adding it!
        advice_size = sum(
            [np.prod(obs[adv_k].shape) for adv_k in teacher_train_dict.keys()])
        image_dim = 128
        memory_dim = config['memory_dim']
        instr_dim = config['instr_dim']
        use_instr = True
        instr_arch = 'bigru'
        use_mem = True
        arch = 'bow_endpool_res'
        advice_dim = 128  # TODO: move this to the config
        policy = ACModel(obs_space=obs_dim,
                         action_space=env.action_space,
                         env=env,
                         image_dim=image_dim,
                         memory_dim=memory_dim,
                         instr_dim=instr_dim,
                         lang_model=instr_arch,
                         use_instr=use_instr,
                         use_memory=use_mem,
                         arch=arch,
                         advice_dim=advice_dim,
                         advice_size=advice_size,
                         num_modules=config['num_modules'])
        reward_predictor = ACModel(
            obs_space=obs_dim - 1,  # TODO: change into Discrete(3) and do 3-way classification
            action_space=spaces.Discrete(2),
            env=env,
            image_dim=image_dim,
            memory_dim=memory_dim,
            instr_dim=instr_dim,
            lang_model=instr_arch,
            use_instr=use_instr,
            use_memory=use_mem,
            arch=arch,
            advice_dim=advice_dim,
            advice_size=advice_size,
            num_modules=config['num_modules'])
        if config['self_distill'] and not config['distill_same_model']:
            # Distill into a separate model rather than into the policy itself.
            obs_dim = env.reset()['obs'].shape[0]
            image_dim = 128
            memory_dim = config['memory_dim']
            instr_dim = config['instr_dim']
            use_instr = True
            instr_arch = 'bigru'
            use_mem = True
            arch = 'bow_endpool_res'
            supervised_model = ACModel(obs_space=obs_dim - 1,
                                       action_space=env.action_space,
                                       env=env,
                                       image_dim=image_dim,
                                       memory_dim=memory_dim,
                                       instr_dim=instr_dim,
                                       lang_model=instr_arch,
                                       use_instr=use_instr,
                                       use_memory=use_mem,
                                       arch=arch,
                                       advice_dim=advice_dim,
                                       advice_size=advice_size,
                                       num_modules=config['num_modules'])
        elif config['self_distill']:
            supervised_model = policy
        else:
            supervised_model = None
        start_itr = 0
        curriculum_step = env.index

    # Build an args namespace for the imitation-learning trainers
    # (policy distillation and reward predictor).
    parser = ArgumentParser()
    args = parser.parse_args([])
    args.entropy_coef = config['entropy_bonus']
    args.model = 'default_il'
    args.lr = config['learning_rate']
    args.recurrence = config['backprop_steps']
    args.clip_eps = config['clip_eps']
    if supervised_model is not None:
        il_trainer = ImitationLearning(
            supervised_model, env, args,
            distill_with_teacher=config['distill_with_teacher'])
    else:
        il_trainer = None
    rp_trainer = ImitationLearning(reward_predictor, env, args,
                                   distill_with_teacher=True,
                                   reward_predictor=True)

    teacher_null_dict = env.teacher.null_feedback()
    obs_preprocessor = make_obs_preprocessor(teacher_null_dict)
    sampler = MetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
        envs_per_task=1,
        reward_predictor=reward_predictor,
        supervised_model=supervised_model,
        obs_preprocessor=obs_preprocessor,
    )
    sample_processor = RL2SampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
        positive_adv=config['positive_adv'],
    )
    envs = [copy.deepcopy(env) for _ in range(20)]
    algo = PPOAlgo(policy, envs, config['frames_per_proc'], config['discount'],
                   args.lr, args.beta1, args.beta2, config['gae_lambda'],
                   args.entropy_coef, config['value_loss_coef'],
                   config['max_grad_norm'], args.recurrence, args.optim_eps,
                   config['clip_eps'], config['epochs'], config['meta_batch_size'],
                   parallel=config['parallel'],
                   rollouts_per_meta_task=config['rollouts_per_meta_task'],
                   obs_preprocessor=obs_preprocessor)
    if optimizer is not None:
        algo.optimizer.load_state_dict(optimizer)

    # Set up the experiment directory and logging.
    EXP_NAME = get_exp_name(config)
    exp_dir = os.getcwd() + '/data/' + EXP_NAME + "_" + str(config['seed'])
    if original_saved_path is None:
        if os.path.isdir(exp_dir):
            shutil.rmtree(exp_dir)
    log_formats = ['stdout', 'log', 'csv']
    is_debug = config['prefix'] == 'DEBUG'
    if not is_debug:
        log_formats.append('tensorboard')
        log_formats.append('wandb')
    logger.configure(dir=exp_dir, format_strs=log_formats,
                     snapshot_mode=config['save_option'],
                     snapshot_gap=50, step=start_itr,
                     name=config['prefix'] + str(config['seed']),
                     config=config)
    json.dump(config, open(exp_dir + '/params.json', 'w'), indent=2,
              sort_keys=True, cls=ClassEncoder)
    advice_end_index, advice_dim = 161, 1
    if config['distill_with_teacher']:  # TODO: generalize this for multiple feedback types at once!
        teacher_info = []
    else:
        null_val = np.zeros(advice_end_index - advice_start_index)
        if len(null_val) > 0:
            null_val[-1] = 1
        teacher_info = [{
            "indices": np.arange(advice_start_index, advice_end_index),
            "null": null_val
        }]

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=deepcopy(env),
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
        start_itr=start_itr,
        success_threshold=config['success_threshold'],
        accuracy_threshold=config['accuracy_threshold'],
        exp_name=exp_dir,
        curriculum_step=curriculum_step,
        config=config,
        advance_without_teacher=True,
        teacher_info=teacher_info,
        sparse_rewards=not config['intermediate_reward'],
        distill_only=config['distill_only'],
        il_trainer=il_trainer,
        source=config['source'],
        batch_size=config['meta_batch_size'],
        train_with_teacher=config['feedback_type'] is not None,
        distill_with_teacher=config['distill_with_teacher'],
        supervised_model=supervised_model,
        reward_predictor=reward_predictor,
        rp_trainer=rp_trainer,
        advance_levels=config['advance_levels'],
        is_debug=is_debug,
        teacher_train_dict=teacher_train_dict,
        obs_preprocessor=obs_preprocessor,
    )
    trainer.train()
def main(exp, argv):
    os.environ["BABYAI_STORAGE"] = exp.results_directory()

    # Parse arguments
    parser = ArgumentParser()
    parser.add_argument("--algo", default='ppo',
                        help="algorithm to use (default: ppo)")
    parser.add_argument("--discount", type=float, default=0.99,
                        help="discount factor (default: 0.99)")
    parser.add_argument("--reward-scale", type=float, default=20.,
                        help="Reward scale multiplier")
    parser.add_argument("--gae-lambda", type=float, default=0.99,
                        help="lambda coefficient in GAE formula (default: 0.99, 1 means no gae)")
    parser.add_argument("--value-loss-coef", type=float, default=0.5,
                        help="value loss term coefficient (default: 0.5)")
    parser.add_argument("--max-grad-norm", type=float, default=0.5,
                        help="maximum norm of gradient (default: 0.5)")
    parser.add_argument("--clip-eps", type=float, default=0.2,
                        help="clipping epsilon for PPO (default: 0.2)")
    parser.add_argument("--ppo-epochs", type=int, default=4,
                        help="number of epochs for PPO (default: 4)")
    parser.add_argument("--save-interval", type=int, default=50,
                        help="number of updates between two saves (default: 50, 0 means no saving)")
    parser.add_argument("--workers", type=int, default=8,
                        help="number of workers for PyTorch (default: 8)")
    parser.add_argument("--max-count", type=int, default=1000,
                        help="maximum number of frames to run for")
    parser.add_argument("--sample_duration", type=float, default=0.5,
                        help="sampling duration")
    parser.add_argument("--cuda", action="store_true", default=False,
                        help="whether to use cuda")
    args = parser.parse_args(argv)

    utils.seed(args.seed)
    torch_settings = init_torch(
        seed=args.seed,
        cuda=args.cuda,
        workers=args.workers,
    )

    # Generate environments
    envs = []
    for i in range(args.procs):
        env = gym.make(args.env)
        env.seed(100 * args.seed + i)
        envs.append(env)

    # Define model name
    suffix = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
    instr = args.instr_arch if args.instr_arch else "noinstr"
    mem = "mem" if not args.no_mem else "nomem"
    model_name_parts = {
        'env': args.env,
        'algo': args.algo,
        'arch': args.arch,
        'instr': instr,
        'mem': mem,
        'seed': args.seed,
        'info': '',
        'coef': '',
        'suffix': suffix
    }
    default_model_name = "{env}_{algo}_{arch}_{instr}_{mem}_seed{seed}{info}{coef}_{suffix}".format(
        **model_name_parts)
    if args.pretrained_model:
        default_model_name = args.pretrained_model + '_pretrained_' + default_model_name
    args.model = args.model.format(**model_name_parts) if args.model else default_model_name

    utils.configure_logging(args.model)
    logger = logging.getLogger(__name__)

    # Define obss preprocessor
    if 'emb' in args.arch:
        obss_preprocessor = utils.IntObssPreprocessor(
            args.model, envs[0].observation_space, args.pretrained_model)
    else:
        obss_preprocessor = utils.ObssPreprocessor(
            args.model, envs[0].observation_space, args.pretrained_model)

    # Define actor-critic model
    # acmodel = utils.load_model(args.model, raise_not_found=False)
    acmodel = None
    if acmodel is None:
        if args.pretrained_model:
            acmodel = utils.load_model(args.pretrained_model, raise_not_found=True)
        else:
            acmodel = ACModel(obss_preprocessor.obs_space, envs[0].action_space,
                              args.image_dim, args.memory_dim, args.instr_dim,
                              not args.no_instr, args.instr_arch, not args.no_mem,
                              args.arch)
    obss_preprocessor.vocab.save()
    # utils.save_model(acmodel, args.model)

    if torch_settings.cuda:
        acmodel.cuda()

    # Define actor-critic algo
    reshape_reward = lambda _0, _1, reward, _2: args.reward_scale * reward
    if args.algo == "ppo":
        algo = babyai.rl.PPOAlgo(
            envs, acmodel, args.frames_per_proc, args.discount, args.lr,
            args.beta1, args.beta2,
            args.gae_lambda, args.entropy_coef, args.value_loss_coef,
            args.max_grad_norm, args.recurrence, args.optim_eps, args.clip_eps,
            args.ppo_epochs, args.batch_size, obss_preprocessor, reshape_reward)
    else:
        raise ValueError("Incorrect algorithm name: {}".format(args.algo))

    # When using extra binary information, more tensors (model params) are initialized
    # compared to when we don't use that. Thus, there starts to be a difference in the
    # random state. If we want to avoid it, in order to make sure that the results of
    # supervised-loss-coef=0. and extra-binary-info=0 match, we need to reseed here.
    utils.seed(args.seed)

    # Restore training status
    status_path = os.path.join(utils.get_log_dir(args.model), 'status.json')
    if os.path.exists(status_path):
        with open(status_path, 'r') as src:
            status = json.load(src)
    else:
        status = {'i': 0, 'num_episodes': 0, 'num_frames': 0}

    # # Define logger and Tensorboard writer and CSV writer
    # header = (["update", "episodes", "frames", "FPS", "duration"]
    #           + ["return_" + stat for stat in ['mean', 'std', 'min', 'max']]
    #           + ["success_rate"]
    #           + ["num_frames_" + stat for stat in ['mean', 'std', 'min', 'max']]
    #           + ["entropy", "value", "policy_loss", "value_loss", "loss", "grad_norm"])
    # if args.tb:
    #     from tensorboardX import SummaryWriter
    #     writer = SummaryWriter(utils.get_log_dir(args.model))
    # csv_path = os.path.join(utils.get_log_dir(args.model), 'log.csv')
    # first_created = not os.path.exists(csv_path)
    # # we don't buffer data going in the csv log, cause we assume
    # # that one update will take much longer that one write to the log
    # csv_writer = csv.writer(open(csv_path, 'a', 1))
    # if first_created:
    #     csv_writer.writerow(header)

    # Log code state, command, availability of CUDA and model
    babyai_code = list(babyai.__path__)[0]
    try:
        last_commit = subprocess.check_output(
            'cd {}; git log -n1'.format(babyai_code), shell=True).decode('utf-8')
        logger.info('LAST COMMIT INFO:')
        logger.info(last_commit)
    except subprocess.CalledProcessError:
        logger.info('Could not figure out the last commit')
    try:
        diff = subprocess.check_output(
            'cd {}; git diff'.format(babyai_code), shell=True).decode('utf-8')
        if diff:
            logger.info('GIT DIFF:')
            logger.info(diff)
    except subprocess.CalledProcessError:
        logger.info('Could not figure out the git diff')
    logger.info('COMMAND LINE ARGS:')
    logger.info(args)
    logger.info("CUDA available: {}".format(torch.cuda.is_available()))
    logger.info(acmodel)

    # Train model
    total_start_time = time.time()
    best_success_rate = 0
    best_mean_return = 0
    test_env_name = args.env

    wrapper = iteration_wrapper(
        exp,
        sync=torch_settings.sync,
        max_count=args.max_count,
        sample_duration=args.sample_duration,
    )

    # while status['num_frames'] < args.frames:
    while True:
        with wrapper() as it:
            # Update parameters
            if wrapper.done():
                break
            update_start_time = time.time()
            logs = algo.update_parameters()
            update_end_time = time.time()

            it.set_count(logs["num_frames"])
            it.log(loss=logs["loss"], )
import time
import datetime
import torch
import numpy as np
import subprocess
import babyai
import babyai.utils as utils
import babyai.rl
from babyai.arguments import ArgumentParser
from babyai.model import ACModel
from babyai.evaluate import batch_evaluate
from babyai.utils.agent import ModelAgent

# Parse arguments
parser = ArgumentParser()
parser.add_argument("--algo", default='rcppo',
                    help="algorithm to use (default: rcppo)")
parser.add_argument("--discount", type=float, default=0.99,
                    help="discount factor (default: 0.99)")
parser.add_argument("--reward-scale", type=float, default=20.,
                    help="Reward scale multiplier")
parser.add_argument("--gae-lambda", type=float, default=0.99,
import gym
import time
import datetime
import numpy as np
import sys
import logging
import babyai.utils as utils
from babyai.arguments import ArgumentParser
from babyai.imitation import ImitationLearning
from babyai.evaluate import batch_evaluate
from babyai.utils.agent import BotAgent
import torch
import blosc

# Parse arguments
parser = ArgumentParser()
parser.add_argument("--demos", default=None,
                    help="demos filename (REQUIRED or demos-origin required)")
parser.add_argument("--demos-origin", required=False,
                    help="origin of the demonstrations: human | agent (REQUIRED or demos required)")
parser.add_argument("--episodes", type=int, default=0,
                    help="number of episodes of demonstrations to use "
                         "(default: 0, meaning all demos)")
parser.add_argument("--start-demos", type=int,
import datetime
import torch
import numpy as np
import subprocess
import babyai
import babyai.utils as utils
import babyai.rl
from babyai.arguments import ArgumentParser
from babyai.model import ACModel
from babyai.evaluate import batch_evaluate
from babyai.utils.agent import ModelAgent

# Parse arguments
parser = ArgumentParser()
parser.add_argument("--algo", default='ppo',
                    help="algorithm to use (default: ppo)")
parser.add_argument("--discount", type=float, default=0.99,
                    help="discount factor (default: 0.99)")
parser.add_argument("--reward-scale", type=float, default=20.,
                    help="Reward scale multiplier")
parser.add_argument("--gae-lambda", type=float, default=0.99,
                    help="lambda coefficient in GAE formula (default: 0.99, 1 means no gae)")
parser.add_argument("--value-loss-coef", type=float, default=0.5,
                    help="value loss term coefficient (default: 0.5)")
parser.add_argument("--max-grad-norm", type=float, default=0.5,
                    help="maximum norm of gradient (default: 0.5)")
parser.add_argument("--clip-eps", type=float, default=0.2,
                    help="clipping epsilon for PPO (default: 0.2)")
parser.add_argument("--ppo-epochs", type=int, default=4,