def build_and_train(game="pong", run_ID=0, cuda_idx=None, mid_batch_reset=False, n_parallel=2): affinity = dict(cuda_idx=cuda_idx, workers_cpus=list(range(n_parallel))) Collector = GpuResetCollector if mid_batch_reset else GpuWaitResetCollector print(f"To satisfy mid_batch_reset=={mid_batch_reset}, using {Collector}.") sampler = GpuSampler( EnvCls=AtariEnv, env_kwargs=dict(game=game, num_img_obs=1), # Learn on individual frames. CollectorCls=Collector, batch_T=20, # Longer sampling/optimization horizon for recurrence. batch_B=16, # 16 parallel environments. max_decorrelation_steps=400, ) algo = A2C() # Run with defaults. agent = AtariLstmAgent() runner = MinibatchRl( algo=algo, agent=agent, sampler=sampler, n_steps=50e6, log_interval_steps=1e5, affinity=affinity, ) config = dict(game=game) name = "a2c_" + game log_dir = "example_4" with logger_context(log_dir, run_ID, name, config): runner.train()
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)

    eval_env_config = config["env"].copy()
    eval_env_config["start_level"] = config["env"]["num_levels"] + 100
    eval_env_config["num_levels"] = 100

    sampler = GpuSampler(
        EnvCls=make,
        env_kwargs=config["env"],
        CollectorCls=GpuResetCollector,
        eval_env_kwargs=eval_env_config,
        **config["sampler"]
    )
    if config["checkpoint"]:
        model_state_dict = torch.load(config["checkpoint"])
        print("Loaded.")
    else:
        model_state_dict = None
    algo = PPO_AUG_VAE(optim_kwargs=config["optim"], **config["algo"])
    agent = RADPgVaeAgent(
        ModelCls=RadVaePolicy,
        model_kwargs=config["model"],
        initial_model_state_dict=model_state_dict,
        **config["agent"]
    )
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        **config["runner"]
    )
    name = config["env"]["id"]
    with logger_context(log_dir, run_ID, name, config, snapshot_mode='last'):
        runner.train()
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)

    eval_env_config = config["env"].copy()
    eval_env_config["start_level"] = config["env"]["num_levels"] + 100
    eval_env_config["num_levels"] = 100

    sampler = GpuSampler(
        EnvCls=make,
        env_kwargs=config["env"],
        CollectorCls=GpuResetCollector,
        eval_env_kwargs=eval_env_config,
        **config["sampler"]
    )
    algo = PPO(optim_kwargs=config["optim"], **config["algo"])
    agent = RADPgAgent(ModelCls=RADModel, model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        **config["runner"]
    )
    name = config["env"]["id"]
    with logger_context(log_dir, run_ID, name, config, snapshot_mode='last'):
        runner.train()
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    # slot_affinity_code is accepted for launcher compatibility; a fresh affinity is built here.
    affinity = make_affinity(
        run_slot=0,
        n_cpu_core=os.cpu_count(),  # Use all available cores for this run.
        n_gpu=1,  # Single GPU, shared between sampling and optimization.
        gpu_per_run=1,
        sample_gpu_per_run=1,
        async_sample=True,
        optim_sample_share_gpu=True,
    )
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)
    config["eval_env"]["game"] = config["env"]["game"]
    sampler = GpuSampler(
        EnvCls=AtariEnv,
        env_kwargs=config["env"],
        CollectorCls=GpuWaitResetCollector,
        TrajInfoCls=AtariTrajInfo,
        eval_env_kwargs=config["eval_env"],
        **config["sampler"]
    )
    algo = CategoricalDQN(optim_kwargs=config["optim"], **config["algo"])
    agent = AtariCatDqnAgent(model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        **config["runner"]
    )
    name = config["env"]["game"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key): affinity = affinity_from_code(slot_affinity_code) config = configs[config_key] variant = load_variant(log_dir) config = update_config(config, variant) config["eval_env"]["game"] = config["env"]["game"] sampler = GpuSampler( EnvCls=AtariEnv, env_kwargs=config["env"], CollectorCls=WaitResetCollector, TrajInfoCls=AtariTrajInfo, eval_env_kwargs=config["eval_env"], **config["sampler"] ) algo = DQN(optim_kwargs=config["optim"], **config["algo"]) agent = AtariDqnAgent(model_kwargs=config["model"], **config["agent"]) runner = MinibatchRlEval( algo=algo, agent=agent, sampler=sampler, affinity=affinity, **config["runner"] ) name = config["env"]["game"] with logger_context(log_dir, run_ID, name, config): runner.train()
# (Or load from a central store of configs.)
config = dict(
    env=dict(game="pong"),
    algo=dict(learning_rate=7e-4),
    sampler=dict(batch_B=16),
)


def build_and_train(slot_affinity_code, log_dir, run_ID):
    affinity = affinity_from_code(slot_affinity_code)
    variant = load_variant(log_dir)
    global config
    config = update_config(config, variant)
    sampler = GpuSampler(
        EnvCls=AtariEnv,
        env_kwargs=config["env"],
        CollectorCls=GpuWaitResetCollector,
        batch_T=5,
        # batch_B=16,  # Get from config.
        max_decorrelation_steps=400,
        **config["sampler"]
    )
    algo = A2C(**config["algo"])  # Run with defaults.
    agent = AtariFfAgent()
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=50e6,
        log_interval_steps=1e5,
        affinity=affinity,
    )
    name = "a2c_" + config["env"]["game"]
    log_dir = "example_6"
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
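# Rough sketch (not part of the original listing) of the companion launch script that would
# generate the slot_affinity_code / log_dir / run_ID arguments consumed by the script above,
# using rlpyt's launching utilities. The script path, experiment title, and variant values
# are placeholders, not taken from the original project.
from rlpyt.utils.launching.affinity import encode_affinity
from rlpyt.utils.launching.variant import make_variants, VariantLevel
from rlpyt.utils.launching.exp_launcher import run_experiments

affinity_code = encode_affinity(
    n_cpu_core=2,  # Total cores to split across simultaneous runs.
    n_gpu=1,
)
variant_levels = [VariantLevel(
    keys=[("env", "game")],            # Nested config key overridden per variant.
    values=[("pong",), ("breakout",)],
    dir_names=["pong", "breakout"],
)]
variants, log_dirs = make_variants(*variant_levels)
run_experiments(
    script="example_6a.py",            # Path to the training script above (placeholder).
    affinity_code=affinity_code,
    experiment_title="example_6",
    runs_per_setting=1,
    variants=variants,
    log_dirs=log_dirs,
)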
def build_and_train(game="pong", run_ID=0, cuda_idx=None, n_parallel=2): config = dict( env=dict(game=game), algo=dict(batch_size=128), sampler=dict(batch_T=2, batch_B=32), ) sampler = GpuSampler( EnvCls=AtariEnv, env_kwargs=dict(game=game), CollectorCls=GpuWaitResetCollector, eval_env_kwargs=dict(game=game), max_decorrelation_steps=0, eval_n_envs=10, eval_max_steps=int(10e3), eval_max_trajectories=5, # batch_T=4, # Get from config. # batch_B=1, **config[ "sampler"] # More parallel environments for batched forward-pass. ) algo = DQN(**config["algo"]) # Run with defaults. agent = AtariDqnAgent() runner = MinibatchRlEval( algo=algo, agent=agent, sampler=sampler, n_steps=50e6, log_interval_steps=1e3, affinity=dict(cuda_idx=cuda_idx, workers_cpus=list(range(n_parallel))), ) name = "dqn_" + game log_dir = "example_5" with logger_context(log_dir, run_ID, name, config): runner.train()
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    assert isinstance(affinity, list)  # One for each GPU.
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)
    sampler = GpuSampler(
        EnvCls=AtariEnv,
        env_kwargs=config["env"],
        CollectorCls=GpuWaitResetCollector,
        TrajInfoCls=AtariTrajInfo,
        **config["sampler"]
    )
    algo = A2C(optim_kwargs=config["optim"], **config["algo"])
    agent = AtariFfAgent(model_kwargs=config["model"], **config["agent"])
    runner = SyncRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        **config["runner"]
    )
    name = config["env"]["game"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)
    sampler = GpuSampler(
        EnvCls=gym.make,
        env_kwargs=config["env"],
        eval_env_kwargs=config["eval_env"],
        **config["sampler"]
    )
    algo = DiscreteSACAE(optim_kwargs=config["optim"],
                         ae_optim_kwargs=config["ae_optim"],
                         **config["algo"])
    agent = DiscreteSacAEAgent(**config["agent"],
                               encoder_kwargs=config["encoder"],
                               model_kwargs=config["actor"],
                               critic_kwargs=config["critic"])
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        **config["runner"]
    )
    name = config["env"]["id"]
    with logger_context(log_dir, run_ID, name, config, snapshot_mode='last'):
        runner.train()
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key): affinity = affinity_from_code(slot_affinity_code) config = configs[config_key] variant = load_variant(log_dir) config = update_config(config, variant) config["algo_name"] = 'A2OC' t_env = pomdp_interface(**config["env"]) config["algo"]["discount"] = t_env.discount sampler = GpuSampler( EnvCls=pomdp_interface, env_kwargs=config["env"], **config["sampler"] ) algo = A2OC(optim_kwargs=config["optim"], **config["algo"]) agent = PomdpOcFfAgent(model_kwargs=config["model"], **config["agent"]) runner = MinibatchRl( algo=algo, agent=agent, sampler=sampler, affinity=affinity, **config["runner"] ) name = config["env"]["id"] with logger_context(log_dir, run_ID, name, config): runner.train()
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)
    sampler = GpuSampler(
        EnvCls=gym_make,
        env_kwargs=config["env"],
        CollectorCls=GpuResetCollector,
        **config["sampler"]
    )
    algo = PPO(optim_kwargs=config["optim"], **config["algo"])
    agent = MujocoFfAgent(model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        **config["runner"]
    )
    name = config["env"]["id"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
def build_and_train(game="pong", run_ID=0): # Seems like we should be able to skip the intermediate step of the code, # but so far have just always run that way. # Change these inputs to match local machine and desired parallelism. affinity = make_affinity( run_slot=0, n_cpu_core=16, # Use 16 cores across all experiments. n_gpu=8, # Use 8 gpus across all experiments. hyperthread_offset=24, # If machine has 24 cores. n_socket=2, # Presume CPU socket affinity to lower/upper half GPUs. gpu_per_run=2, # How many GPUs to parallelize one run across. # cpu_per_run=1, ) sampler = GpuSampler( EnvCls=AtariEnv, TrajInfoCls=AtariTrajInfo, env_kwargs=dict(game=game), CollectorCls=GpuWaitResetCollector, batch_T=5, batch_B=16, max_decorrelation_steps=400, ) algo = A2C() # Run with defaults. agent = AtariFfAgent() runner = SyncRl( algo=algo, agent=agent, sampler=sampler, n_steps=50e6, log_interval_steps=1e5, affinity=affinity, ) config = dict(game=game) name = "a2c_" + game log_dir = "example_7" with logger_context(log_dir, run_ID, name, config): runner.train()
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key): affinity = affinity_from_code(slot_affinity_code) config = configs[config_key] variant = load_variant(log_dir) config = update_config(config, variant) sampler = GpuSampler( EnvCls=gym.make, env_kwargs=config["env"], CollectorCls=GpuResetCollector, eval_env_kwargs=config["eval_env"], **config["sampler"] ) if config["checkpoint"]: model_state_dict = torch.load(config["checkpoint"]) else: model_state_dict = None algo = PPO(optim_kwargs=config["optim"], **config["algo"]) agent = CategoricalPgAgent( ModelCls=BaselinePolicy, model_kwargs=config["model"], initial_model_state_dict=model_state_dict, **config["agent"] ) runner = MinibatchRlEval( algo=algo, agent=agent, sampler=sampler, affinity=affinity, **config["runner"] ) name = config["env"]["id"] with logger_context(log_dir, run_ID, name, config, snapshot_mode='last'): runner.train()
def start_experiment(args):
    args_json = json.dumps(vars(args), indent=4)
    if not os.path.isdir(args.log_dir):
        os.makedirs(args.log_dir)
    with open(args.log_dir + '/arguments.json', 'w') as jsonfile:
        jsonfile.write(args_json)
    with open(args.log_dir + '/git.txt', 'w') as git_file:
        branch = subprocess.check_output(['git', 'rev-parse', '--abbrev-ref', 'HEAD']).strip().decode('utf-8')
        commit = subprocess.check_output(['git', 'rev-parse', 'HEAD']).strip().decode('utf-8')
        git_file.write('{}/{}'.format(branch, commit))
    config = dict(env_id=args.env)

    if args.sample_mode == 'gpu':
        # affinity = dict(num_gpus=args.num_gpus, workers_cpus=list(range(args.num_cpus)))
        if args.num_gpus > 0:
            # import ipdb; ipdb.set_trace()
            affinity = make_affinity(
                run_slot=0,
                n_cpu_core=args.num_cpus,  # Cores to use across all experiments.
                n_gpu=args.num_gpus,  # GPUs to use across all experiments.
                # contexts_per_gpu=2,
                # hyperthread_offset=72,
                # n_socket=2,  # Presume CPU socket affinity to lower/upper half GPUs.
                gpu_per_run=args.gpu_per_run,  # How many GPUs to parallelize one run across.
                # cpu_per_run=1,
            )
            print('Make multi-gpu affinity')
        else:
            affinity = dict(cuda_idx=0, workers_cpus=list(range(args.num_cpus)))
            os.environ['CUDA_VISIBLE_DEVICES'] = str(0)
    else:
        affinity = dict(workers_cpus=list(range(args.num_cpus)))

    # potentially reload models
    initial_optim_state_dict = None
    initial_model_state_dict = None
    if args.pretrain != 'None':
        os.system(f"find {args.log_dir} -name '*.json' -delete")  # clean up json files for video recorder
        checkpoint = torch.load(os.path.join(_RESULTS_DIR, args.pretrain, 'params.pkl'))
        initial_optim_state_dict = checkpoint['optimizer_state_dict']
        initial_model_state_dict = checkpoint['agent_state_dict']

    # ----------------------------------------------------- POLICY ----------------------------------------------------- #
    model_args = dict(curiosity_kwargs=dict(curiosity_alg=args.curiosity_alg),
                      curiosity_step_kwargs=dict())
    if args.curiosity_alg == 'icm':
        model_args['curiosity_kwargs']['feature_encoding'] = args.feature_encoding
        model_args['curiosity_kwargs']['batch_norm'] = args.batch_norm
        model_args['curiosity_kwargs']['prediction_beta'] = args.prediction_beta
        model_args['curiosity_kwargs']['forward_loss_wt'] = args.forward_loss_wt
        model_args['curiosity_kwargs']['forward_model'] = args.forward_model
        model_args['curiosity_kwargs']['feature_space'] = args.feature_space
    elif args.curiosity_alg == 'micm':
        model_args['curiosity_kwargs']['feature_encoding'] = args.feature_encoding
        model_args['curiosity_kwargs']['batch_norm'] = args.batch_norm
        model_args['curiosity_kwargs']['prediction_beta'] = args.prediction_beta
        model_args['curiosity_kwargs']['forward_loss_wt'] = args.forward_loss_wt
        model_args['curiosity_kwargs']['forward_model'] = args.forward_model
        model_args['curiosity_kwargs']['ensemble_mode'] = args.ensemble_mode
        model_args['curiosity_kwargs']['device'] = args.sample_mode
    elif args.curiosity_alg == 'disagreement':
        model_args['curiosity_kwargs']['feature_encoding'] = args.feature_encoding
        model_args['curiosity_kwargs']['ensemble_size'] = args.ensemble_size
        model_args['curiosity_kwargs']['batch_norm'] = args.batch_norm
        model_args['curiosity_kwargs']['prediction_beta'] = args.prediction_beta
        model_args['curiosity_kwargs']['forward_loss_wt'] = args.forward_loss_wt
        model_args['curiosity_kwargs']['device'] = args.sample_mode
        model_args['curiosity_kwargs']['forward_model'] = args.forward_model
    elif args.curiosity_alg == 'ndigo':
        model_args['curiosity_kwargs']['feature_encoding'] = args.feature_encoding
        model_args['curiosity_kwargs']['pred_horizon'] = args.pred_horizon
        model_args['curiosity_kwargs']['prediction_beta'] = args.prediction_beta
        model_args['curiosity_kwargs']['batch_norm'] = args.batch_norm
        model_args['curiosity_kwargs']['device'] = args.sample_mode
    elif args.curiosity_alg == 'rnd':
        model_args['curiosity_kwargs']['feature_encoding'] = args.feature_encoding
        model_args['curiosity_kwargs']['prediction_beta'] = args.prediction_beta
        model_args['curiosity_kwargs']['drop_probability'] = args.drop_probability
        model_args['curiosity_kwargs']['gamma'] = args.discount
        model_args['curiosity_kwargs']['device'] = args.sample_mode

    if args.curiosity_alg != 'none':
        model_args['curiosity_step_kwargs']['curiosity_step_minibatches'] = args.curiosity_step_minibatches

    if args.env in _MUJOCO_ENVS:
        if args.lstm:
            agent = MujocoLstmAgent(initial_model_state_dict=initial_model_state_dict)
        else:
            agent = MujocoFfAgent(initial_model_state_dict=initial_model_state_dict)
    else:
        if args.lstm:
            agent = AtariLstmAgent(
                initial_model_state_dict=initial_model_state_dict,
                model_kwargs=model_args,
                no_extrinsic=args.no_extrinsic,
                dual_model=args.dual_model,
            )
        else:
            agent = AtariFfAgent(
                initial_model_state_dict=initial_model_state_dict,
                model_kwargs=model_args,
                no_extrinsic=args.no_extrinsic,
                dual_model=args.dual_model,
            )

    # ----------------------------------------------------- LEARNING ALG ----------------------------------------------------- #
    if args.alg == 'ppo':
        algo = PPO(
            discount=args.discount,
            learning_rate=args.lr,
            value_loss_coeff=args.v_loss_coeff,
            entropy_loss_coeff=args.entropy_loss_coeff,
            OptimCls=torch.optim.Adam,
            optim_kwargs=None,
            clip_grad_norm=args.grad_norm_bound,
            initial_optim_state_dict=initial_optim_state_dict,  # is None if not reloading a checkpoint
            gae_lambda=args.gae_lambda,
            minibatches=args.minibatches,  # if recurrent: batch_B must be at least this; if not recurrent: batch_B*batch_T must be at least this
            epochs=args.epochs,
            ratio_clip=args.ratio_clip,
            linear_lr_schedule=args.linear_lr,
            normalize_advantage=args.normalize_advantage,
            normalize_reward=args.normalize_reward,
            curiosity_type=args.curiosity_alg,
            policy_loss_type=args.policy_loss_type,
        )
    elif args.alg == 'a2c':
        algo = A2C(
            discount=args.discount,
            learning_rate=args.lr,
            value_loss_coeff=args.v_loss_coeff,
            entropy_loss_coeff=args.entropy_loss_coeff,
            OptimCls=torch.optim.Adam,
            optim_kwargs=None,
            clip_grad_norm=args.grad_norm_bound,
            initial_optim_state_dict=initial_optim_state_dict,
            gae_lambda=args.gae_lambda,
            normalize_advantage=args.normalize_advantage,
        )

    # ----------------------------------------------------- SAMPLER ----------------------------------------------------- #
    # environment setup
    traj_info_cl = TrajInfo  # environment specific - potentially overridden below
    if 'mario' in args.env.lower():
        env_cl = mario_make
        env_args = dict(game=args.env, no_extrinsic=args.no_extrinsic,
                        no_negative_reward=args.no_negative_reward,
                        normalize_obs=args.normalize_obs, normalize_obs_steps=10000)
    elif args.env in _PYCOLAB_ENVS:
        env_cl = deepmind_make
        traj_info_cl = PycolabTrajInfo
        env_args = dict(game=args.env, no_extrinsic=args.no_extrinsic,
                        no_negative_reward=args.no_negative_reward,
                        normalize_obs=args.normalize_obs, normalize_obs_steps=10000,
                        log_heatmaps=args.log_heatmaps, logdir=args.log_dir,
                        obs_type=args.obs_type, grayscale=args.grayscale,
                        max_steps_per_episode=args.max_episode_steps)
    elif args.env in _MUJOCO_ENVS:
        env_cl = gym_make
        env_args = dict(id=args.env, no_extrinsic=args.no_extrinsic,
                        no_negative_reward=args.no_negative_reward,
                        normalize_obs=False, normalize_obs_steps=10000)
    elif args.env in _ATARI_ENVS:
        env_cl = AtariEnv
        traj_info_cl = AtariTrajInfo
        env_args = dict(
            game=args.env,
            no_extrinsic=args.no_extrinsic,
            no_negative_reward=args.no_negative_reward,
            normalize_obs=args.normalize_obs,
            normalize_obs_steps=10000,
            downsampling_scheme='classical',
            record_freq=args.record_freq,
            record_dir=args.log_dir,
            horizon=args.max_episode_steps,
            score_multiplier=args.score_multiplier,
            repeat_action_probability=args.repeat_action_probability,
            fire_on_reset=args.fire_on_reset,
        )

    if args.sample_mode == 'gpu':
        collector_class = GpuWaitResetCollector if args.lstm else GpuResetCollector
        sampler = GpuSampler(
            EnvCls=env_cl,
            env_kwargs=env_args,
            eval_env_kwargs=env_args,
            batch_T=args.timestep_limit,
            batch_B=args.num_envs,
            max_decorrelation_steps=0,
            TrajInfoCls=traj_info_cl,
            eval_n_envs=args.eval_envs,
            eval_max_steps=args.eval_max_steps,
            eval_max_trajectories=args.eval_max_traj,
            record_freq=args.record_freq,
            log_dir=args.log_dir,
            CollectorCls=collector_class,
        )
    else:
        collector_class = CpuWaitResetCollector if args.lstm else CpuResetCollector
        sampler = CpuSampler(
            EnvCls=env_cl,
            env_kwargs=env_args,
            eval_env_kwargs=env_args,
            batch_T=args.timestep_limit,  # timesteps in a trajectory episode
            batch_B=args.num_envs,  # environments distributed across workers
            max_decorrelation_steps=0,
            TrajInfoCls=traj_info_cl,
            eval_n_envs=args.eval_envs,
            eval_max_steps=args.eval_max_steps,
            eval_max_trajectories=args.eval_max_traj,
            record_freq=args.record_freq,
            log_dir=args.log_dir,
            CollectorCls=collector_class,
        )

    # ----------------------------------------------------- RUNNER ----------------------------------------------------- #
    if args.eval_envs > 0:
        runner = (MinibatchRlEval if args.num_gpus <= 1 else SyncRlEval)(
            algo=algo,
            agent=agent,
            sampler=sampler,
            n_steps=args.iterations,
            affinity=affinity,
            log_interval_steps=args.log_interval,
            log_dir=args.log_dir,
            pretrain=args.pretrain,
        )
    else:
        runner = (MinibatchRl if args.num_gpus <= 1 else SyncRl)(
            algo=algo,
            agent=agent,
            sampler=sampler,
            n_steps=args.iterations,
            affinity=affinity,
            log_interval_steps=args.log_interval,
            log_dir=args.log_dir,
            pretrain=args.pretrain,
        )

    with logger_context(args.log_dir, config, snapshot_mode="last", use_summary_writer=True):
        runner.train()
def start_experiment(args):
    args_json = json.dumps(vars(args), indent=4)
    if not os.path.isdir(args.log_dir):
        os.makedirs(args.log_dir)
    with open(args.log_dir + '/arguments.json', 'w') as jsonfile:
        jsonfile.write(args_json)
    config = dict(env_id=args.env)

    if args.sample_mode == 'gpu':
        assert args.num_gpus > 0
        affinity = dict(cuda_idx=0, workers_cpus=list(range(args.num_cpus)))
        os.environ['CUDA_VISIBLE_DEVICES'] = str(0)
    else:
        affinity = dict(workers_cpus=list(range(args.num_cpus)))

    # potentially reload models
    initial_optim_state_dict = None
    initial_model_state_dict = None
    if args.pretrain != 'None':
        os.system(f"find {args.log_dir} -name '*.json' -delete")  # clean up json files for video recorder
        checkpoint = torch.load(os.path.join(_RESULTS_DIR, args.pretrain, 'params.pkl'))
        initial_optim_state_dict = checkpoint['optimizer_state_dict']
        initial_model_state_dict = checkpoint['agent_state_dict']

    # ----------------------------------------------------- POLICY ----------------------------------------------------- #
    model_args = dict(curiosity_kwargs=dict(curiosity_alg=args.curiosity_alg))
    if args.curiosity_alg == 'icm':
        model_args['curiosity_kwargs']['feature_encoding'] = args.feature_encoding
        model_args['curiosity_kwargs']['batch_norm'] = args.batch_norm
        model_args['curiosity_kwargs']['prediction_beta'] = args.prediction_beta
        model_args['curiosity_kwargs']['forward_loss_wt'] = args.forward_loss_wt
    elif args.curiosity_alg == 'disagreement':
        model_args['curiosity_kwargs']['feature_encoding'] = args.feature_encoding
        model_args['curiosity_kwargs']['ensemble_size'] = args.ensemble_size
        model_args['curiosity_kwargs']['batch_norm'] = args.batch_norm
        model_args['curiosity_kwargs']['prediction_beta'] = args.prediction_beta
        model_args['curiosity_kwargs']['forward_loss_wt'] = args.forward_loss_wt
        model_args['curiosity_kwargs']['device'] = args.sample_mode
    elif args.curiosity_alg == 'ndigo':
        model_args['curiosity_kwargs']['feature_encoding'] = args.feature_encoding
        model_args['curiosity_kwargs']['pred_horizon'] = args.pred_horizon
        model_args['curiosity_kwargs']['batch_norm'] = args.batch_norm
        model_args['curiosity_kwargs']['num_predictors'] = args.num_predictors
        model_args['curiosity_kwargs']['device'] = args.sample_mode
    elif args.curiosity_alg == 'rnd':
        model_args['curiosity_kwargs']['feature_encoding'] = args.feature_encoding
        model_args['curiosity_kwargs']['prediction_beta'] = args.prediction_beta
        model_args['curiosity_kwargs']['drop_probability'] = args.drop_probability
        model_args['curiosity_kwargs']['gamma'] = args.discount
        model_args['curiosity_kwargs']['device'] = args.sample_mode

    if args.env in _MUJOCO_ENVS:
        if args.lstm:
            agent = MujocoLstmAgent(initial_model_state_dict=initial_model_state_dict)
        else:
            agent = MujocoFfAgent(initial_model_state_dict=initial_model_state_dict)
    else:
        if args.lstm:
            agent = AtariLstmAgent(initial_model_state_dict=initial_model_state_dict,
                                   model_kwargs=model_args,
                                   no_extrinsic=args.no_extrinsic)
        else:
            agent = AtariFfAgent(initial_model_state_dict=initial_model_state_dict)

    # ----------------------------------------------------- LEARNING ALG ----------------------------------------------------- #
    if args.alg == 'ppo':
        if args.kernel_mu == 0.:
            kernel_params = None
        else:
            kernel_params = (args.kernel_mu, args.kernel_sigma)
        algo = PPO(
            discount=args.discount,
            learning_rate=args.lr,
            value_loss_coeff=args.v_loss_coeff,
            entropy_loss_coeff=args.entropy_loss_coeff,
            OptimCls=torch.optim.Adam,
            optim_kwargs=None,
            clip_grad_norm=args.grad_norm_bound,
            initial_optim_state_dict=initial_optim_state_dict,  # is None if not reloading a checkpoint
            gae_lambda=args.gae_lambda,
            minibatches=args.minibatches,  # if recurrent: batch_B must be at least this; if not recurrent: batch_B*batch_T must be at least this
            epochs=args.epochs,
            ratio_clip=args.ratio_clip,
            linear_lr_schedule=args.linear_lr,
            normalize_advantage=args.normalize_advantage,
            normalize_reward=args.normalize_reward,
            kernel_params=kernel_params,
            curiosity_type=args.curiosity_alg,
        )
    elif args.alg == 'a2c':
        algo = A2C(
            discount=args.discount,
            learning_rate=args.lr,
            value_loss_coeff=args.v_loss_coeff,
            entropy_loss_coeff=args.entropy_loss_coeff,
            OptimCls=torch.optim.Adam,
            optim_kwargs=None,
            clip_grad_norm=args.grad_norm_bound,
            initial_optim_state_dict=initial_optim_state_dict,
            gae_lambda=args.gae_lambda,
            normalize_advantage=args.normalize_advantage,
        )

    # ----------------------------------------------------- SAMPLER ----------------------------------------------------- #
    # environment setup
    traj_info_cl = TrajInfo  # environment specific - potentially overridden below
    if 'mario' in args.env.lower():
        env_cl = mario_make
        env_args = dict(game=args.env, no_extrinsic=args.no_extrinsic,
                        no_negative_reward=args.no_negative_reward,
                        normalize_obs=args.normalize_obs, normalize_obs_steps=10000)
    elif 'deepmind' in args.env.lower():  # pycolab deepmind environments
        env_cl = deepmind_make
        traj_info_cl = PycolabTrajInfo
        env_args = dict(game=args.env, no_extrinsic=args.no_extrinsic,
                        no_negative_reward=args.no_negative_reward,
                        normalize_obs=args.normalize_obs, normalize_obs_steps=10000,
                        log_heatmaps=args.log_heatmaps, logdir=args.log_dir,
                        obs_type=args.obs_type,
                        max_steps_per_episode=args.max_episode_steps)
    elif args.env in _MUJOCO_ENVS:
        env_cl = gym_make
        env_args = dict(id=args.env, no_extrinsic=args.no_extrinsic,
                        no_negative_reward=args.no_negative_reward,
                        normalize_obs=False, normalize_obs_steps=10000)
    elif args.env in _ATARI_ENVS:
        env_cl = AtariEnv
        traj_info_cl = AtariTrajInfo
        env_args = dict(
            game=args.env,
            no_extrinsic=args.no_extrinsic,
            no_negative_reward=args.no_negative_reward,
            normalize_obs=args.normalize_obs,
            normalize_obs_steps=10000,
            downsampling_scheme='classical',
            record_freq=args.record_freq,
            record_dir=args.log_dir,
            horizon=args.max_episode_steps,
        )

    if args.sample_mode == 'gpu':
        collector_class = GpuWaitResetCollector if args.lstm else GpuResetCollector
        sampler = GpuSampler(
            EnvCls=env_cl,
            env_kwargs=env_args,
            eval_env_kwargs=env_args,
            batch_T=args.timestep_limit,
            batch_B=args.num_envs,
            max_decorrelation_steps=0,
            TrajInfoCls=traj_info_cl,
            eval_n_envs=args.eval_envs,
            eval_max_steps=args.eval_max_steps,
            eval_max_trajectories=args.eval_max_traj,
            record_freq=args.record_freq,
            log_dir=args.log_dir,
            CollectorCls=collector_class,
        )
    else:
        collector_class = CpuWaitResetCollector if args.lstm else CpuResetCollector
        sampler = CpuSampler(
            EnvCls=env_cl,
            env_kwargs=env_args,
            eval_env_kwargs=env_args,
            batch_T=args.timestep_limit,  # timesteps in a trajectory episode
            batch_B=args.num_envs,  # environments distributed across workers
            max_decorrelation_steps=0,
            TrajInfoCls=traj_info_cl,
            eval_n_envs=args.eval_envs,
            eval_max_steps=args.eval_max_steps,
            eval_max_trajectories=args.eval_max_traj,
            record_freq=args.record_freq,
            log_dir=args.log_dir,
            CollectorCls=collector_class,
        )

    # ----------------------------------------------------- RUNNER ----------------------------------------------------- #
    if args.eval_envs > 0:
        runner = MinibatchRlEval(algo=algo,
                                 agent=agent,
                                 sampler=sampler,
                                 n_steps=args.iterations,
                                 affinity=affinity,
                                 log_interval_steps=args.log_interval,
                                 log_dir=args.log_dir,
                                 pretrain=args.pretrain)
    else:
        runner = MinibatchRl(algo=algo,
                             agent=agent,
                             sampler=sampler,
                             n_steps=args.iterations,
                             affinity=affinity,
                             log_interval_steps=args.log_interval,
                             log_dir=args.log_dir,
                             pretrain=args.pretrain)

    with logger_context(args.log_dir, config, snapshot_mode="last", use_summary_writer=True):
        runner.train()
def build_and_train(log_dir, game="pong", run_ID=0, cuda_idx=None, eval=False, save_model='last',
                    load_model_path=None, n_parallel=2, CumSteps=0):
    device = 'cpu' if cuda_idx is None else 'cuda'
    params = torch.load(load_model_path, map_location=torch.device(device)) if load_model_path else {}
    agent_state_dict = params.get('agent_state_dict')
    optimizer_state_dict = params.get('optimizer_state_dict')

    ##--- wu ---##
    log_interval_steps = 5e4
    prefill = 5e4
    train_every = 16
    batch_B = 16
    n_steps = 1e4 if eval else 5e6
    itr_start = max(0, CumSteps - prefill) // train_every
    ##--- wu ---##

    action_repeat = 4  # 2
    env_kwargs = dict(
        name=game,
        action_repeat=action_repeat,
        size=(64, 64),
        grayscale=True,  # False
        life_done=True,
        sticky_actions=True,
    )
    factory_method = make_wapper(
        AtariEnv,
        [OneHotAction, TimeLimit],
        [dict(), dict(duration=1000000 / action_repeat)])  # 1000

    sampler = GpuSampler(
        EnvCls=factory_method,
        TrajInfoCls=AtariTrajInfo,
        env_kwargs=env_kwargs,
        eval_env_kwargs=env_kwargs,
        batch_T=1,
        batch_B=batch_B,
        max_decorrelation_steps=0,
        eval_n_envs=10,
        eval_max_steps=int(10e5),
        eval_max_trajectories=5,
    )
    algo = Dreamer(
        initial_optim_state_dict=optimizer_state_dict,
        horizon=10,
        use_pcont=True,
        replay_size=int(2e6),  # int(5e6)
        kl_scale=0.1,
        batch_size=50,
        batch_length=50,
        C=1,  # 100,
        train_every=train_every // batch_B,  # 1000
        pretrain=100,
        world_lr=2e-4,  # 6e-4,
        value_lr=1e-4,  # 8e-5,
        actor_lr=4e-5,  # 8e-5,
        discount=0.999,  # 0.99,
        expl_amount=0.0,  # 0.3,
        prefill=prefill // batch_B,  # 5000
        discount_scale=5.,  # 10.
        video_every=int(2e4 // 16 * 16 // batch_B),  # int(10)
    )
    if eval:
        # for eval - all versions
        agent = AtariDreamerAgent(train_noise=0.0, eval_noise=0, expl_type="epsilon_greedy",
                                  itr_start=itr_start, the_expl_mode='eval', expl_min=0.0,
                                  expl_decay=11000, initial_model_state_dict=agent_state_dict,
                                  model_kwargs=dict(use_pcont=True))
    else:
        # for train - all versions
        # agent = AtariDreamerAgent(train_noise=0.4, eval_noise=0, expl_type="epsilon_greedy",
        #                           itr_start=itr_start, the_expl_mode='train', expl_min=0.1,
        #                           expl_decay=11000, initial_model_state_dict=agent_state_dict,
        #                           model_kwargs=dict(use_pcont=True))
        # for train - dreamer_V2
        agent = AtariDreamerAgent(train_noise=0.0, eval_noise=0, expl_type="epsilon_greedy",
                                  itr_start=itr_start, the_expl_mode='train', expl_min=0.0,
                                  expl_decay=11000, initial_model_state_dict=agent_state_dict,
                                  model_kwargs=dict(use_pcont=True))

    my_seed = 0  # reproducibility
    set_seed(my_seed)
    runner_cls = MinibatchRlEval if eval else MinibatchRl
    runner = runner_cls(
        algo=algo,  # Uses gathered samples to train the agent (e.g. defines a loss function and performs gradient descent).
        agent=agent,  # Chooses control action to the environment in sampler; trained by the algorithm. Interface to model.
        sampler=sampler,
        n_steps=n_steps,
        log_interval_steps=log_interval_steps,
        affinity=dict(cuda_idx=cuda_idx, workers_cpus=list(range(n_parallel))),
        seed=my_seed,
    )
    config = dict(game=game)
    name = "dreamer_" + game
    with logger_context(log_dir, run_ID, name, config, snapshot_mode=save_model,
                        override_prefix=True, use_summary_writer=True):
        runner.train()
class RandomDiscreteModel(torch.nn.Module):

    def __init__(self, num_actions):
        super().__init__()
        self.num_actions = num_actions

    def forward(self, observation, prev_action, prev_reward):
        lead_dim, T, B, img_shape = infer_leading_dims(observation, 3)
        action = torch.randint(low=0, high=self.num_actions, size=(T * B,))
        action = restore_leading_dims(action, lead_dim, T, B)
        return action


# Setup the data collection pipeline
sampler = GpuSampler(EnvCls=gym.make,
                     env_kwargs=config["env"],
                     CollectorCls=GpuResetCollector,
                     eval_env_kwargs=config["env"],
                     **config["sampler"])
agent = RandomAgent(ModelCls=RandomDiscreteModel, model_kwargs={"num_actions": 15})
seed = make_seed()
set_seed(seed)
sampler.initialize(agent=agent, affinity=affinity, seed=seed + 1, rank=0)
steps = config["train_steps"]

# Create the model
model = BiGAN(**config["model"])
if config["load_path"]:
    model.load_state_dict(torch.load(config["load_path"]))

# Setup the optimizers
lr = config["optim"]["lr"]
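# Rough sketch (not part of the original listing) of how the initialized sampler could drive a
# training loop: sampler.obtain_samples(itr) is the rlpyt call that steps the random agent in
# the parallel environments; `bigan_update` is a placeholder for the project's own update code.
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
for itr in range(steps):
    samples, traj_infos = sampler.obtain_samples(itr)  # namedarraytuple with leading [T, B] dims
    obs = samples.env.observation                      # frames gathered by RandomDiscreteModel
    obs = obs.reshape(-1, *obs.shape[2:]).float()      # merge time and batch dims into one batch
    # bigan_update(model, optimizer, obs)              # hypothetical: one BiGAN step on this batch
sampler.shutdown()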
def build_and_train(args, game="", run_ID=0, config=None):
    """
    1. Parse the args object into dictionaries understood by rlpyt
    """
    config['env']['id'] = args.env_name
    config["eval_env"]["id"] = args.env_name
    config["eval_env"]["horizon"] = args.horizon
    config["env"]["horizon"] = args.horizon

    if 'procgen' in args.env_name:
        for k, v in vars(args).items():
            if args.env_name.split('-')[1] in k:
                config['env'][k] = v

    config['model']['frame_stack'] = args.frame_stack
    config['model']['nce_loss'] = args.nce_loss
    config['model']['algo'] = args.algo
    config['model']['env_name'] = args.env_name
    config['model']['dueling'] = args.dueling == 1
    config['algo']['double_dqn'] = args.double_dqn == 1
    config['algo']['prioritized_replay'] = args.prioritized_replay == 1
    config['algo']['n_step_return'] = args.n_step_return
    config['algo']['learning_rate'] = args.learning_rate
    config['runner']['log_interval_steps'] = args.log_interval_steps
    config['cmd_args'] = vars(args)

    """
    2. Create the CatDQN (C51) agent from custom implementation
    """
    agent = AtariCatDqnAgent(ModelCls=AtariCatDqnModel_nce,
                             model_kwargs=config["model"],
                             **config["agent"])
    algo = CategoricalDQN_nce(args=config['cmd_args'],
                              ReplayBufferCls=None,
                              optim_kwargs=config["optim"],
                              **config["algo"])

    if args.mode == 'parallel':
        affinity = make_affinity(n_cpu_core=args.n_cpus,
                                 n_gpu=args.n_gpus,
                                 n_socket=1,
                                 # hyperthread_offset=0
                                 )
        """
        Some architectures require the following block to be uncommented.
        Try with and without. This is here to allow scheduling of
        non-sequential CPU IDs.
        """
        # import psutil
        # psutil.Process().cpu_affinity([])
        # cpus = tuple(psutil.Process().cpu_affinity())
        # affinity['all_cpus'] = affinity['master_cpus'] = cpus
        # affinity['workers_cpus'] = tuple([tuple([x]) for x in cpus + cpus])
        # env_kwargs = config['env']

        sampler = GpuSampler(EnvCls=make_env,
                             env_kwargs=config["env"],
                             CollectorCls=GpuWaitResetCollector,
                             TrajInfoCls=AtariTrajInfo,
                             eval_env_kwargs=config["eval_env"],
                             **config["sampler"])
        """
        If you don't have a GPU, use the CpuSampler
        """
        # sampler = CpuSampler(
        #     EnvCls=AtariEnv if args.game is not None else make_env,
        #     env_kwargs=config["env"],
        #     CollectorCls=CpuWaitResetCollector,
        #     TrajInfoCls=AtariTrajInfo,
        #     eval_env_kwargs=config["eval_env"],
        #     **config["sampler"]
        # )
    elif args.mode == 'serial':
        affinity = make_affinity(
            n_cpu_core=1,  # Single core for serial mode.
            n_gpu=args.n_gpus,
            n_socket=1,
        )
        """
        Some architectures require the following block to be uncommented.
        Try with and without.
        """
        # import psutil
        # psutil.Process().cpu_affinity([])
        # cpus = tuple(psutil.Process().cpu_affinity())
        # affinity['all_cpus'] = affinity['master_cpus'] = cpus
        # affinity['workers_cpus'] = tuple([tuple([x]) for x in cpus + cpus])
        # env_kwargs = config['env']

        sampler = SerialSampler(
            EnvCls=make_env,
            env_kwargs=config["env"],
            # CollectorCls=SerialEvalCollector,
            TrajInfoCls=AtariTrajInfo,
            eval_env_kwargs=config["eval_env"],
            **config["sampler"])

    """
    3. Bookkeeping, setting up Comet.ml experiments, etc.
    """
    folders_name = [args.output_dir, args.env_name, 'run_' + args.run_ID]
    path = os.path.join(*folders_name)
    os.makedirs(path, exist_ok=True)
    experiment = Experiment(api_key='your_key',
                            auto_output_logging=False,
                            project_name='driml',
                            workspace="your_workspace",
                            disabled=True)
    experiment.add_tag('C51+DIM' if (args.lambda_LL > 0 or args.lambda_LG > 0
                                     or args.lambda_GL > 0 or args.lambda_GG > 0) else 'C51')
    experiment.set_name(args.experiment_name)
    experiment.log_parameters(config)
    MinibatchRlEval.TF_logger = Logger(path, use_TFX=True, params=config,
                                       comet_experiment=experiment, disable_local=True)
    MinibatchRlEval.log_diagnostics = log_diagnostics_custom
    MinibatchRlEval._log_infos = _log_infos
    MinibatchRlEval.evaluate_agent = evaluate_agent

    """
    4. Define the runner as minibatch
    """
    runner = MinibatchRlEval(algo=algo,
                             agent=agent,
                             sampler=sampler,
                             affinity=affinity,
                             **config["runner"])
    runner.algo.opt_info_fields = tuple(list(runner.algo.opt_info_fields) + ['lossNCE'] +
                                        ['action%d' % i for i in range(15)])
    name = args.mode + "_value_based_nce_" + args.env_name
    log_dir = os.path.join(args.output_dir, args.env_name)
    logger.set_snapshot_gap(args.weight_save_interval // config['runner']['log_interval_steps'])

    """
    6. Run the experiment and optionally save network weights
    """
    with experiment.train():
        # snapshot_mode: set 'all' to save every iteration, 'gap' for every X iterations.
        with logger_context(log_dir, run_ID, name, config,
                            snapshot_mode=('last' if args.weight_save_interval == -1 else 'gap')):
            runner.train()
def build_and_train(env_id="POMDP-hallway-episodic-v0", run_ID=0, cuda_idx=None, n_parallel=1, fomdp=False):
    EnvCls = pomdp_interface
    env_args = dict(fomdp=fomdp, id=env_id, time_limit=100)
    test_instance = EnvCls(**env_args)
    gamma = test_instance.discount
    affinity = dict(cuda_idx=cuda_idx, workers_cpus=list(range(n_parallel)), alternating=True)
    lr = 1e-3
    po = np.array([1, 0, 0, 1, 0], dtype=bool)

    # Model kwargs
    # model_kwargs = dict()
    # model_kwargs = dict(hidden_sizes=[64, 64], shared_processor=False)
    model_kwargs = dict(hidden_sizes=[64, 64], rnn_type='gru', rnn_size=256, rnn_placement=1,
                        shared_processor=False, layer_norm=True, prev_action=3, prev_reward=3)
    # model_kwargs = dict(hidden_sizes=[64, 64], option_size=4, shared_processor=False,
    #                     use_interest=False, use_diversity=False, use_attention=False)
    # model_kwargs = dict(hidden_sizes=[64, 64], option_size=4, use_interest=True, use_diversity=False,
    #                     use_attention=False, rnn_type='gru', rnn_size=256, rnn_placement=1,
    #                     shared_processor=False, layer_norm=True, prev_option=po)

    # Samplers
    sampler = GpuSampler(
        EnvCls=EnvCls,
        env_kwargs=env_args,
        eval_env_kwargs=env_args,
        batch_T=20,  # Time-steps per sampler iteration.
        batch_B=30,  # Parallel environments (sampler batch dimension).
        max_decorrelation_steps=0,
        eval_n_envs=5,
        eval_max_steps=int(25e3),
        eval_max_trajectories=30,
    )
    # sampler = AlternatingSampler(
    #     EnvCls=EnvCls,
    #     env_kwargs=env_args,
    #     eval_env_kwargs=env_args,
    #     batch_T=20,
    #     batch_B=30,
    #     max_decorrelation_steps=0,
    #     eval_n_envs=5,
    #     eval_max_steps=int(25e3),
    #     eval_max_trajectories=30,
    # )
    # sampler = SerialSampler(
    #     EnvCls=EnvCls,
    #     env_kwargs=env_args,
    #     eval_env_kwargs=env_args,
    #     batch_T=20,
    #     batch_B=30,
    #     max_decorrelation_steps=0,
    #     # eval_n_envs=2,
    #     # eval_max_steps=int(51e2),
    #     # eval_max_trajectories=5,
    # )

    # Algos (swapping out discount)
    algo = A2C(discount=gamma, learning_rate=lr, clip_grad_norm=2.)
    # algo = A2OC(discount=gamma, learning_rate=lr, clip_grad_norm=2.)
    # algo = PPO(discount=gamma, learning_rate=lr, clip_grad_norm=2.)
    # algo = PPOC(discount=gamma, learning_rate=lr, clip_grad_norm=2.)

    # Agents
    # agent = PomdpFfAgent(model_kwargs=model_kwargs)
    agent = PomdpRnnAgent(model_kwargs=model_kwargs)
    # agent = PomdpOcFfAgent(model_kwargs=model_kwargs)
    # agent = PomdpOcRnnAgent(model_kwargs=model_kwargs)
    # agent = AlternatingPomdpRnnAgent(model_kwargs=model_kwargs)
    # agent = AlternatingPomdpOcRnnAgent(model_kwargs=model_kwargs)

    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=1e6,
        log_interval_steps=1e3,
        affinity=affinity,
    )
    config = dict(env_id=env_id, fomdp=fomdp, algo_name=algo.__class__.__name__,
                  learning_rate=lr, sampler=sampler.__class__.__name__, model=model_kwargs)
    name = algo.NAME + '_' + env_id
    log_dir = "pomdps"
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
def run(self, run_ID=0):
    config = self.getConfig()
    sampler = GpuSampler(EnvCls=make_env,
                         env_kwargs={
                             "num_levels": config["num_levels"],
                             "env": config['env'],
                         },
                         CollectorCls=GpuResetCollector,
                         batch_T=256,
                         batch_B=config["envs_per_worker"],
                         max_decorrelation_steps=1000)
    optim_args = dict(weight_decay=config["l2_penalty"]) if "l2_penalty" in config else None
    algo = PPO(value_loss_coeff=0.5,
               clip_grad_norm=0.5,
               discount=config["discount"],
               entropy_loss_coeff=config["entropy_bonus"],
               gae_lambda=config["lambda"],
               minibatches=config["minibatches_per_epoch"],
               epochs=config["epochs_per_rollout"],
               ratio_clip=config["ppo_clip"],
               learning_rate=config["learning_rate"],
               normalize_advantage=True,
               optim_kwargs=optim_args)
    if config["arch"] == 'impala':
        agent = ImpalaAgent(model_kwargs={
            "in_channels": [3, 16, 32],
            "out_channels": [16, 32, 32],
            "hidden_size": 256,
        })
    elif config["arch"] == 'lstm':
        agent = NatureRecurrentAgent(model_kwargs={
            "hidden_sizes": [512],
            "lstm_size": 256,
        })
    else:
        agent = OriginalNatureAgent(model_kwargs={
            "batchNorm": config["batchNorm"],
            "dropout": config["dropout"],
            "augment_obs": config["augment_obs"],
            "use_maxpool": config["maxpool"],
            "hidden_sizes": config["hidden_sizes"],
            "arch": config["arch"],
        })
    affinity = dict(cuda_idx=0, workers_cpus=list(range(8)))
    runner = MinibatchRl(algo=algo,
                         agent=agent,
                         sampler=sampler,
                         n_steps=config["total_timesteps"],
                         log_interval_steps=500,
                         affinity=affinity)
    log_dir = "./logs"
    name = config["name"]
    with logger_context(log_dir, run_ID, name, config,
                        use_summary_writer=True, override_prefix=False):
        runner.train()
    torch.save(agent.state_dict(), "./" + name + ".pt")
    wandb.save("./" + name + ".pt")