def learn(env_path, seed, max_steps, reward_range, base_port, unity_arguments, summary_writer):
    env = VecFrameStack(
        _make_a2c(env_path, num_env=8, seed=seed, reward_range=reward_range,
                  base_port=base_port, unity_arguments=unity_arguments),
        nstack=4)
    model = learn_a2c(
        policy=CnnPolicy,
        env=env,
        seed=seed,
        ent_coef=0.01,
        nsteps=5,
        total_timesteps=max_steps,
        callback=_create_summary_callback(summary_writer=summary_writer))
    try:
        env.close()
    except Exception as e:
        print("Failed to close environment: " + str(e))
    return model
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env, ckpt_path, hparams):
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    elif policy == 'cnn_attention':
        policy_fn = CnnAttentionPolicy
    video_log_dir = os.path.join(hparams['base_dir'], 'videos', hparams['experiment_name'])
    env = VecFrameStack(
        make_atari_env(env_id, num_env, seed, video_log_dir=video_log_dir,
                       write_attention_video='attention' in policy,
                       hparams=hparams), 4)
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1),
          lrschedule=lrschedule, ckpt_path=ckpt_path, hparams=hparams)
    env.close()
def train(num_timesteps, env_name, seed, policy, lrschedule, num_env, entropy, lr, save_name=None):
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    elif policy == 'i2a':
        policy_fn = I2ANetwork
    env = VecFrameStack(make_doom_env(num_env, 0, env_name), 4)
    if save_name is None:
        save_name = env_name
    learn(policy_fn, env, seed, save_name=save_name,
          total_timesteps=int(num_timesteps * 1.1),
          lrschedule=lrschedule, log_interval=500, save_interval=1000,
          cont=True, ent_coef=entropy, lr=lr)
    env.close()
def train(env_id, num_timesteps, seed, policy, lr_schedule, num_env):
    """
    Train A2C model for atari environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...)
    :param lr_schedule: (str) The type of scheduler for the learning rate update
        ('linear', 'constant', 'double_linear_con', 'middle_drop' or 'double_middle_drop')
    :param num_env: (int) The number of environments
    """
    policy_fn = None
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    if policy_fn is None:
        raise ValueError("Error: policy {} not implemented".format(policy))

    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)

    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1),
          lr_schedule=lr_schedule)
    env.close()
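# A minimal invocation sketch for the trainer above, assuming the same
# baselines-style imports (CnnPolicy, make_atari_env, learn) are in scope;
# the environment ID and hyperparameter values are illustrative only.
if __name__ == '__main__':
    train(env_id='BreakoutNoFrameskip-v4', num_timesteps=int(1e6),
          seed=0, policy='cnn', lr_schedule='constant', num_env=16)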
def train(env_id, num_timesteps, seed, num_cpu):
    env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
    policy_fn = CnnPolicy
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu)
    env.close()
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    # env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.CNNPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=args.policy_hidden_size,
                                    num_hid_layers=2)

    # env = bench.Monitor(env, logger.get_dir() and
    #                     osp.join(logger.get_dir(), "monitor.json"))
    env = make_vec_env(args.env_id, 'atari', 1, args.seed,
                       wrapper_kwargs={
                           'clip_rewards': False,
                           'episode_life': False,
                       })
    env = VecFrameStack(env, 4)
    # env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    if args.task == 'train':
        dataset = LMDB_Dset(expert_path=args.expert_path,
                            traj_limitation=args.traj_limitation)
        reward_giver = TransitionClassifier(env, args.adversary_hidden_size,
                                            entcoeff=args.adversary_entcoeff)
        train(env, args.seed, policy_fn, reward_giver, dataset, args.algo,
              args.g_step, args.d_step, args.policy_entcoeff,
              args.num_timesteps, args.save_per_iter, args.checkpoint_dir,
              args.log_dir, args.pretrained, args.BC_max_iter, task_name)
    elif args.task == 'evaluate':
        runner(env, policy_fn, args.load_model_path,
               timesteps_per_batch=1024, number_trajs=10,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample)
    else:
        raise NotImplementedError
    env.close()
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env):
    if policy == u'cnn':
        policy_fn = CnnPolicy
    elif policy == u'lstm':
        policy_fn = LstmPolicy
    elif policy == u'lnlstm':
        policy_fn = LnLstmPolicy
    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule)
    env.close()
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env):
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule)
    env.close()
def main():
    numOfTests = 40
    env_args = {
        'episode_life': False,
        'clip_rewards': False,
        'crop': True,
        'rotate': True
    }
    env = VecFrameStack(
        make_vec_env("gvgai-zelda-lvl0-v0", numOfTests, 43, wrapper_kwargs=env_args), 4)
    policy = build_policy(env, "cnn")
    model = Model(policy=policy, env=env, nsteps=5)
    model.load('logs/test_4*5_r1_right/checkpoints/260000')
    nh, nw, nc = env.observation_space.shape
    result = dict()
    for j in range(201, 601):
        # obs = np.zeros((numOfTests, nh, nw, nc), dtype=np.uint8)
        done = np.array([False] * numOfTests)
        env.venv.set_level(
            "GVGAI_GYM/gym_gvgai/envs/games/zelda_v0/zelda_lvl{}.txt".format(j))
        obs = env.reset()
        infos = [False] * numOfTests
        # dones = [False] * numOfTests
        while not all(done):
            actions, values, state, _ = model.step(obs)
            obs, rewards, dones, info = env.step(actions)
            done[np.where(dones)[0]] = True
            for i in np.where(dones)[0].tolist():
                if not infos[i]:
                    # print(info)
                    del info[i]["grid"]
                    del info[i]["ascii"]
                    infos[i] = info[i]
            # print(np.where(dones)[0])
            # print(done)
        # print(infos)
        # print(dones)
        win = [1 if (i['winner'] == 'PLAYER_WINS') else 0 for i in infos]
        # score = [i['episode']['r'] for i in infos]
        # steps = [i['episode']['l'] for i in infos]
        # time = [i['episode']['t'] for i in infos]
        print("level {}".format(j), win)
        result[j] = infos
    env.close()
    with open("result_4*5_r1_right_200~600", "wb") as f:
        pickle.dump(result, f)
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env):
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    # renamed from `dict`, which shadowed the built-in
    wrapper_kwargs = {'clip_rewards': False}
    env = VecFrameStack(make_atari_env(env_id, num_env, seed,
                                       wrapper_kwargs=wrapper_kwargs), 4)
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule)
    env.close()
def train(env_id, num_timesteps, seed, num_cpu, num_env):
    env = VecFrameStack(
        # make_atari_env(env_id, num_cpu, seed),
        make_distributed_env(env_id, num_env, seed),
        # make_old_dist_env(env_id, num_env, seed),
        4)
    policy_fn = partial(CnnPolicy, one_dim_bias=True)
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu)
    env.close()
def train(env_id, num_timesteps, seed, num_cpu):
    """
    train an ACKTR model on atari

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param num_cpu: (int) The number of cpu to train on
    """
    env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
    policy_fn = partial(CnnPolicy, one_dim_bias=True)
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu)
    env.close()
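# Illustrative call for the ACKTR trainer above, assuming make_atari_env,
# CnnPolicy, partial and learn are imported as the function expects; the
# environment ID and values are examples only.
if __name__ == '__main__':
    train('PongNoFrameskip-v4', num_timesteps=int(5e6), seed=0, num_cpu=8)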
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env, save_path, load_path, wrapper_kwargs):
    env_args = {'episode_life': False, 'clip_rewards': False}
    env_args.update(wrapper_kwargs)
    env = VecFrameStack(
        make_vec_env(env_id, num_env, seed, wrapper_kwargs=env_args), 4)
    # env = make_vec_env(env_id, num_env, seed, wrapper_kwargs=env_args)
    model = learn(policy, env, seed, total_timesteps=int(num_timesteps * 1.1),
                  lrschedule=lrschedule, load_path=load_path)
    model.save(save_path)
    env.close()
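# Usage sketch: wrapper_kwargs lets a caller override the defaults above,
# e.g. re-enabling reward clipping; keys follow baselines' wrap_deepmind
# arguments. Values here are illustrative only.
# train('BreakoutNoFrameskip-v4', int(1e6), 0, 'cnn', 'constant', 8,
#       save_path='a2c_model.pkl', load_path=None,
#       wrapper_kwargs={'clip_rewards': True})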
def main():
    env = VecFrameStack(make_sf2_env(), 1)
    obs = env.reset()
    n_steps = 128  # 5 * FPS; fixed: a trailing comma made this a 1-tuple
    options = {
        'network': 'mlp',  # 'impala_cnn'
        'env': env,  # fixed: was the undefined name `venv`
        'total_timesteps': 40000000,
        'nsteps': n_steps,  # 5 * FPS, # TODO: Do we still need to pass nsteps here?
        'q_coef': 1.0,
        'ent_coef': 0.001,
        'max_grad_norm': 10,
        'lr': 7e-4,
        'lrschedule': 'linear',
        'rprop_epsilon': 1e-5,
        'rprop_alpha': 0.99,
        'gamma': 0.99,
        'log_interval': 1000,
        'buffer_size': 50000,
        'replay_ratio': 4,
        'replay_start': 10000,
        'c': 10.0,
        'trust_region': True,
        'delta': 1,
        'alpha': 0.99,
        # 'load_path': MODEL_PATH,
        'save_interval': 1000,
        # neural network parameters
        'activation': tf.nn.relu,
        'num_layers': 2,  # 4, 2
        'num_hidden': 48,  # 64, 64
        'layer_norm': False,
    }
    models = (
        Acer(**options),
        Acer(**options)
    )
    runner = Runner(env, models, n_steps)
    while True:
        runner.run()
        # obs, rew, done, info = env.step((
        #     env.action_space.sample(),
        #     env.action_space.sample()
        # ))
        # env.render()
        # if done:
        #     obs = env.reset()
    env.close()  # unreachable: the loop above never exits
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env):
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    # env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
    env = VecFrameStack(make_custom_env('gridworld-v0', num_env, seed), 1)
    act = learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1),
                lrschedule=lrschedule)
    act.save('a2c_bopen.pkl')
    env.close()
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env,
          replay_lambda=1, replay_loss=None, ss_rate=1, thetas=None):
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
    if replay_loss is not None:
        learn_staged(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1),
                     lrschedule=lrschedule, replay_lambda=replay_lambda,
                     ss_rate=ss_rate, replay_loss=replay_loss, thetas=thetas)
    else:
        learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1),
              lrschedule=lrschedule)
    env.close()
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env,
          v_ex_coef, r_ex_coef, r_in_coef, lr_alpha, lr_beta, no_ex, no_in):
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    elif policy == 'cnn_int':
        policy_fn = CnnPolicyIntrinsicReward
    else:
        raise NotImplementedError
    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.01),
          lrschedule=lrschedule, v_ex_coef=v_ex_coef, r_ex_coef=r_ex_coef,
          r_in_coef=r_in_coef, lr_alpha=lr_alpha, lr_beta=lr_beta,
          no_ex=no_ex, no_in=no_in)
    env.close()
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env, load_path,
          algo='use_svib_uniform', ib_alpha=1e-3):
    # Both branches of the original if/else assigned the same class;
    # CnnPolicySVIB is the only policy implemented here.
    policy_fn = CnnPolicySVIB
    if 'NoFrameskip' in env_id:
        env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
        test_env = VecFrameStack(make_atari_env(env_id, num_env, seed + 1), 4)
    else:
        env = VecFrameStack(make_atari_env_low_dim(env_id, num_env, seed), 4)
        test_env = VecFrameStack(make_atari_env_low_dim(env_id, num_env, seed + 1), 4)
    # train_mine_env = VecFrameStack(make_atari_env_low_dim(env_id, num_env, seed), 4)
    reward_list, value_list = learn(policy_fn, env, test_env, seed,
                                    total_timesteps=int(num_timesteps),
                                    lrschedule=lrschedule, load_path=load_path,
                                    algo=algo, ib_alpha=ib_alpha)
    env.close()
    return reward_list, value_list
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env):
    print('train() called')
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    # Make "num_env" environments
    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
    # Learn
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule)
    env.close()
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env, args):
    if policy == 'i2a':
        policy_fn = I2A
    elif policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    # NOTE: the env_id argument is ignored; MsPacman is hard-coded here.
    env = VecFrameStack(
        make_atari_env('MsPacmanNoFrameskip-v0', num_env, seed), 4)
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1),
          lrschedule=lrschedule, args=args)
    env.close()
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env, sil_update, sil_beta):
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    env_args = {'episode_life': False, 'clip_rewards': False}
    env = VecFrameStack(
        make_atari_env(env_id, num_env, seed, wrapper_kwargs=env_args), 4)
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1),
          lrschedule=lrschedule, sil_update=sil_update, sil_beta=sil_beta)
    env.close()
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env):
    nstates = None
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
        nstates = 512
    elif policy == 'caps':
        policy_fn = CapsulePolicy  # TODO
    # DEBUG: Changed ent_coef to zero
    # To undo, simply omit ent_coef from arguments (use default)
    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
    learn(policy_fn, env, seed, nsteps=5, nstates=nstates,
          total_timesteps=int(num_timesteps * 1.1), sc_coef=None,
          lrschedule=lrschedule)
    env.close()
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env):
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    # make_atari_env() launches 'num_env' subprocesses, each running 'env_id'
    # with a seed offset by its worker index; VecFrameStack stacks 4 frames.
    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
    print("~~~~~~~~~~~~~ run_atari: env.nstack: " + str(env.nstack))
    print("~~~~~~~~~~~~~ run_atari: str(env): " + str(env))
    # above prints: run_atari: str(env): <baselines.common.vec_env.vec_frame_stack.VecFrameStack object at 0x1c22ee06d8>
    print("_____________________________________________ policy: " + str(policy))
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule)
    env.close()
def train(env_id, N_itr, seed, policy, lr, lrschedule, num_env, log_path, save_interval, alg):
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
    if alg == 'a2c':
        learn_a2c(policy=policy_fn,
                  env=env,
                  seed=seed,
                  N_itr=int(N_itr),
                  lr=lr,
                  lrschedule=lrschedule,
                  nsteps=128,
                  save_interval=save_interval,
                  save_path=log_path)
        # load_path="./Data/a2cTest/a2c_1.pkl"
    elif alg == 'ppo2':
        # these are the default values from the original openai/baselines repo
        learn_ppo2(policy=policy_fn,
                   env=env,
                   seed=seed,
                   nsteps=128,
                   nminibatches=4,
                   lam=0.95,
                   gamma=0.99,
                   noptepochs=4,
                   log_interval=1,
                   ent_coef=.01,
                   lr=lambda f: f * lr,
                   cliprange=lambda f: f * 0.1,
                   N_itr=int(N_itr),
                   save_interval=save_interval,
                   save_path=log_path)
    env.close()
def train(env_id, num_timesteps, seed, policy, lrschedule, num_env, param):
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    tf.Session(config=config).__enter__()
    # change the num_env parameter to start multiple envs
    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1),
          lrschedule=lrschedule, param=param, nsteps=16)
    env.close()
def train(params):
    policy_fn = CnnPolicy
    dataflow_config = {
        'future_rewards': True,           # return future discounted rewards?
        'exclude_zero_actions': False,    # exclude zero actions?
        'remap_actions': False,           # remap to a smaller action set?
        'clip_rewards': True,             # clip rewards to [-1, 1]
        'monte-specific-blackout': True,  # cover up score and lives indicators
        'pong-specific-blackout': False,  # cover up scores in pong
        'gamma': params.gamma,            # reward discount factor
        'frame_history': 4,               # minimum number of expert frames since beginning of episode
        'frameskip': 4,                   # frameskip
        'preload_images': True,           # preload images from disk, or keep reloading?
        'gdrive_data_id': cnst.MONTE_DATA_GDRIVE_ID,
        'data_dir': cnst.DATA_DIR,
        'img_dir': cnst.MIKE_IMG_DIR,
        'traj_dir': cnst.MIKE_TRAJECTORIES_DIR,
        'stat_dir': cnst.MIKE_STATES_DIR,
        'batch_size': params.expert_nbatch,
        'max_score_cutoff': params.exp_max_score,  # maximum expert score to show; used to cut expert data
        'min_score_cutoff': 20000,        # minimum score for a trajectory to count as expert
        'process_lost_lifes': True,       # should loss of life zero the future discounted reward?
        'use_n_trajectories': params.use_n_trajectories if 'use_n_trajectories' in params else None
    }

    the_seed = np.random.randint(10000)
    print(80 * "SEED")
    print("Today's lucky seed is {}".format(the_seed))
    print(80 * "SEED")

    env = VecFrameStack(
        make_atari_env(
            env_id=params.env,
            num_env=params.num_env,
            seed=the_seed,
            limit_len=params.limit_len,
            limit_penalty=params.limit_penalty,
            death_penalty=params.death_penalty,
            step_penalty=params.step_penalty,
            random_state_reset=params.random_state_reset,
            dataflow_config=dataflow_config
        ),
        params.frame_stack
    )

    learn(
        policy=policy_fn,
        env=env,
        seed=the_seed,
        params=params,
        dataflow_config=dataflow_config,
        expert_nbatch=params.expert_nbatch,
        exp_adv_est=params.exp_adv_est,
        load_model=params.load_model,
        gamma=params.gamma,
        nprocs=params.num_env,
        nsteps=params.nsteps,
        ent_coef=params.ent_coef,
        expert_coeff=params.exp_coeff,
        lr=params.lr,
        lrschedule=params.lrschedule,
    )
    env.close()
from baselines.common.cmd_util import make_atari_env
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
from arguments import achieve_arguments
from a2c_agent import a2c_agent
from baselines import logger

if __name__ == '__main__':
    args = achieve_arguments()
    logger.configure(dir=args.log_dir)
    # create environments
    env_args = {'episode_life': False, 'clip_rewards': False}
    envs = VecFrameStack(
        make_atari_env(args.env_name, args.num_processes, args.seed,
                       wrapper_kwargs=env_args), 4)
    trainer = a2c_agent(envs, args)
    trainer.learn()
    envs.close()
def main(env_name, mode, episodes, random_sample, save_path, concrete=False,
         expert_first=False, save_model=True, dropout=0.05, lr=0.001, ls=5e-7,
         train_epochs=10, density=0.0, hetero_loss=False, budget=1):
    """
    env_name - gym environment [LunarLander-v2, CartPole-v1]
    mode - learning type [pool, stream, classic]
    save_path - where the model and tf logging data should be saved to
    """
    seed = random.randint(0, int(1e6))  # randint requires ints; 1e6 is a float

    isSpace = env_name[:5] == 'Space'
    if isSpace:
        wrapper_kwargs = {'episode_life': False}
        env = VecFrameStack(
            make_atari_env(env_name, 1, 0, wrapper_kwargs=wrapper_kwargs), 4)
    else:
        env = gym.make(env_name)
        env.seed(seed)

    isFetch = env_name[:5] == 'Fetch'
    if isFetch:  # That's so fetch
        from active_imitation.agents.mujoco_robot import DEFAULT_PARAMS
        action_size = env.action_space.shape[0]
        observation_size = env.observation_space.spaces['observation'].shape
        goal_size = env.observation_space.spaces['desired_goal'].shape[0]
        env_dims = {'observation': observation_size, 'goal': goal_size,
                    'action': action_size}
    elif isSpace:
        from active_imitation.agents.classic_gym import DEFAULT_PARAMS
        action_size = 1
        action_space = env.action_space.n
        observation_size = env.observation_space.shape
        env_dims = {'observation': observation_size, 'action': action_size,
                    'action_space': action_space}
    else:
        from active_imitation.agents.classic_gym import DEFAULT_PARAMS
        # Need the spaces' dimensions to initialize the NN agent
        action_size = 1  # single, discrete actions
        action_space = env.action_space.n
        observation_size = env.observation_space.shape
        env_dims = {'observation': observation_size, 'action': action_size,
                    'action_space': action_space}

    # Change the dimensions of the nn layers
    params = DEFAULT_PARAMS
    # params['layers'] = [64, 64, 64]
    params['dropout_rate'] = dropout  # [0.05, 0.1, 0.15, 0.2]
    params['filepath'] = save_path
    params['lr'] = lr
    params['hetero_loss'] = hetero_loss
    if isFetch or isSpace:
        params['layers'] = [256, 256, 256]  # [512, 512, 512]
        # params['concrete'] = concrete
        params['ls'] = ls
    else:
        params['layers'] = [16, 16, 16]
        params['concrete'] = concrete

    if expert_first:
        mixing = 0.0
        mixing_decay = 1.0
    else:
        mixing = 1.0
        mixing_decay = 1.0

    param_mods = {'random_sample': random_sample, 'mixing': mixing,
                  'density_weight': density, 'budget': budget}

    if isFetch:
        agent = GymRobotAgent(env_dims, **params)
        expert = RoboticEnv_Expert(policy_files[env_name])
        continuous = True
    elif isSpace:
        expert = SpaceInvadersExpert({'observation': env.observation_space,
                                      'action': env.action_space})
        agent = AtariGymAgent(env_dims, **params)
        continuous = False
        param_mods['isSpace'] = True
    else:
        agent = GymAgent(env_dims, **params)
        expert = experts[env_name](env.unwrapped)
        continuous = False

    learning_mode = configure.configure_robot(env, env_dims, agent, expert, mode,
                                              continuous=continuous,
                                              concrete=concrete,
                                              param_mods=param_mods)

    # Save the training parameters:
    # learning rate, dropout, concrete?, continuous?, env_name, mode, ...
    parameter_savefile = os.path.join(save_path, 'parameters.txt')
    with open(parameter_savefile, 'w') as f:
        f.write('Environment Name: {} \n'.format(env_name))
        f.write('Learning Mode: {} \n'.format(mode))
        f.write('# of Episodes: {} \n'.format(episodes))
        f.write('Learning Rate:{} \n'.format(lr))
        f.write('Concrete Length Scale: {} \n'.format(ls))
        f.write('Training Epochs: {}\n'.format(train_epochs))
        f.write('Continuous: {}\n'.format(continuous))
        f.write('Concrete: {}\n'.format(concrete))
        f.write('Random Sample: {}\n'.format(random_sample))
        f.write('Mixing: {}\n'.format(mixing))
        f.write('Mixing Decay: {}\n'.format(mixing_decay))
        f.write('Density Weighting: {}\n'.format(density))
        f.write('Budget: {}\n'.format(budget))
        for label, value in params.items():
            f.write('{}: {}\n'.format(label, value))
        f.write('Random Seed: {}\n'.format(seed))

    if isSpace:
        save_rate = 5000
        valid_runs = 1
    elif isFetch:
        save_rate = 100
        valid_runs = 5
    else:
        save_rate = 100
        valid_runs = 5

    rewards, stats = learning_mode.train(episodes=episodes,
                                         mixing_decay=mixing_decay,
                                         train_epochs=train_epochs,
                                         save_images=False,
                                         image_filepath=save_path + 'images/',
                                         save_rate=save_rate,
                                         valid_runs=valid_runs)
    if save_model:
        agent.save_model()
    if isSpace:
        expert.close()
        agent.sess.close()
    env.close()
    tf.reset_default_graph()
    return rewards, stats
def train(env_id, model, num_envs, num_timesteps, lrschedule, save_interval, seed):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config).__enter__()
    print("Starting experiment")

    # Level selector
    # level_path = './results/' + experiment_name + '/levels/' + experiment_id + '/'
    # level_selector = LevelSelector.get_selector(args.selector, args.game, level_path)

    # Make gym environment
    # env = make_gvgai_env(env_id=env_id,
    #                      num_env=args.num_envs,
    #                      seed=args.seed,
    #                      level_selector=level_selector)
    env = VecFrameStack(make_gvgai_env(env_id, num_envs, seed), 4)

    # Select model
    policy = {'cnn': CnnPolicy,
              'lstm': LstmPolicy,
              'lnlstm': LnLstmPolicy,
              'mlp': MlpPolicy}[model]

    # Philip: how to resume? lrschedule is not used yet
    ppo2.learn(policy=policy,
               env=env,
               nsteps=128,
               nminibatches=4,
               lam=0.95,
               gamma=0.99,
               noptepochs=4,
               log_interval=1,
               save_interval=save_interval,
               ent_coef=.01,
               lr=lambda f: f * 2.5e-4,
               cliprange=lambda f: f * 0.1,
               total_timesteps=int(num_timesteps * 1.1))

    # Verify there are no features here that I still want
    # learn(policy=policy,
    #       env=env,
    #       experiment_name=experiment_name,
    #       experiment_id=experiment_id,
    #       seed=args.seed,
    #       total_timesteps=args.num_timesteps,
    #       lrschedule=args.lrschedule,
    #       frame_skip=False,
    #       save_interval=args.save_interval,
    #       level_selector=level_selector,
    #       render=args.render)

    env.close()
    print("Experiment DONE")
from baselines.common.cmd_util import make_atari_env  # added: used below but not imported
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
from arguments import get_args  # added: project-local argument parser (assumed location)
from models import CNN_Net
import numpy as np  # added: used by get_tensors but not imported
import torch
import os


# get the tensors: HWC uint8 observations -> NCHW float tensors
def get_tensors(obs):
    return torch.tensor(np.transpose(obs, (0, 3, 1, 2)), dtype=torch.float32)


if __name__ == '__main__':
    args = get_args()
    # create the environment
    env = VecFrameStack(make_atari_env(args.env_name, 1, args.seed), 4)
    # start to create the model
    model_path = args.save_dir + args.env_name + '/model.pt'
    network = CNN_Net(env.action_space.n)
    network.load_state_dict(
        torch.load(model_path, map_location=lambda storage, loc: storage))
    # start to do the test
    obs = env.reset()
    for _ in range(10000):
        env.render()
        obs_tensor = get_tensors(obs)
        with torch.no_grad():
            _, pi = network(obs_tensor)
        actions = torch.argmax(pi, dim=1).item()
        obs, reward, done, _ = env.step([actions])
    env.close()