import numpy as np

# Repo-local dependencies assumed by this script: parse_args, make_saving_dir,
# disc_return, ENV, and the model modules mbie, hindsight, outcome.


def main():
    args = parse_args()
    for key in vars(args).keys():
        print('[*] {} = {}'.format(key, vars(args)[key]))
    save_dir = make_saving_dir(args)
    print(save_dir)
    # result[sample, trial, episode] stores (steps taken, discounted return).
    result = np.zeros((args.n_sample, args.n_trial, args.max_ep, 2))
    for sample in range(args.n_sample):
        env = ENV(mapFile=args.map_name, random=args.random)
        model = {'MBIE': mbie.MBIE(env, args.beta),
                 'MBIE_NS': mbie.MBIE_NS(env, args.beta),
                 'DH': hindsight.DH(env, bool(args.ent_known), args.beta, args.lambd),
                 'DO': outcome.DO(env, bool(args.ent_known), args.beta, args.lambd)}
        print('sample {} out of {}'.format(sample, args.n_sample))
        env._render()
        np.save(save_dir + "map_sample_{}.npy".format(sample), env.map)
        for trial in range(args.n_trial):
            print('trial = {}'.format(trial))
            mrl = model[args.method]
            mrl.reset()
            for episode in range(args.max_ep):
                terminal = False
                step = 0
                R = []
                s = env.reset()
                while not terminal and step < args.max_step:
                    # Greedy action with random tie-breaking over equal Q-values.
                    action = np.random.choice(np.flatnonzero(mrl.Q[s, :] == mrl.Q[s, :].max()))
                    ns, r, terminal = env.step(action)
                    R.append(r)
                    mrl.observe(s, action, ns, r, terminal)
                    step += 1
                    s = ns
                result[sample, trial, episode, 0] = step
                result[sample, trial, episode, 1] = disc_return(R, mrl.gamma)
                mrl.Qupdate()
                print(episode, step, disc_return(R, mrl.gamma), np.max(mrl.Q))
                # print(np.max(mrl.Q, axis=1).reshape(13, 13))
            try:
                np.save(save_dir + "entropy_trial_{}_sample_{}.npy".format(trial, sample), mrl.entropy)
            except AttributeError:
                print("No entropy to save")
            np.save(save_dir + "count_trial_{}_sample_{}.npy".format(trial, sample), mrl.count)
    np.save(save_dir + 'results.npy', result)
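# The loop above logs the discounted return of each episode via disc_return,
# which is not defined in this excerpt. A minimal sketch of such a helper,
# assuming it simply accumulates gamma**t * r_t over the recorded rewards
# (hypothetical stand-in; the repo's actual implementation may differ):
def disc_return(rewards, gamma):
    """Discounted return of a reward sequence: sum_t gamma**t * r_t."""
    total = 0.0
    for t, r in enumerate(rewards):
        total += (gamma ** t) * r
    return total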
import configparser

import numpy as np

# Repo-local dependencies assumed by this test: ENV, Action, JointState.


def test_step_with_kinematic():
    env_config = configparser.RawConfigParser()
    env_config.read('configs/test_env.config')
    env_config.set('agent', 'kinematic', 'true')
    test_env = ENV(env_config, phase='test')
    test_env.reset()

    # test state computation
    states, rewards, done_signals = test_env.step((Action(1, 0), Action(1, 0)))
    assert np.allclose(states[0], JointState(-1, 0, 1, 0, 0.3, 2, 0, 1.0, 0, 1, 0, -1, 0, 0.3))
    assert np.allclose(states[1], JointState(1, 0, -1, 0, 0.3, -2, 0, 1.0, np.pi, -1, 0, 1, 0, 0.3))
    assert rewards == [0, 0]
    assert done_signals == [False, False]

    # test one-step lookahead
    reward, end_time = test_env.compute_reward(0, [Action(1.5, 0), None])
    assert reward == -0.25
    assert end_time == 1
    reward, end_time = test_env.compute_reward(0, [Action(1.5, 0), Action(1.5, 0)])
    assert reward == -0.25
    assert end_time == 0.5

    # test collision detection
    states, rewards, done_signals = test_env.step((Action(1, 0), Action(1, 0)))
    assert np.allclose(states[0], JointState(0, 0, 1, 0, 0.3, 2, 0, 1.0, 0, 0, 0, -1, 0, 0.3))
    assert np.allclose(states[1], JointState(0, 0, -1, 0, 0.3, -2, 0, 1.0, np.pi, 0, 0, 1, 0, 0.3))
    assert rewards == [-0.25, -0.25]
    assert done_signals == [2, 2]

    # test reaching goal
    test_env = ENV(env_config, phase='test')
    test_env.reset()
    test_env.step((Action(1, np.pi / 2), Action(2, np.pi / 2)))
    test_env.step((Action(4, -np.pi / 2), Action(4, -np.pi / 2)))
    states, rewards, done_signals = test_env.step((Action(1, -np.pi / 2), Action(2, -np.pi / 2)))
    assert rewards == [1, 1]
    assert done_signals == [1, 1]
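# The test above reads configs/test_env.config and exercises the repo's ENV,
# Action, and JointState classes. It is normally collected by pytest; assuming
# it lives in a file such as test_env.py (the file name is an assumption), it
# can also be invoked programmatically:
import pytest

if __name__ == '__main__':
    pytest.main(['-q', 'test_env.py'])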
import os
import pickle

import numpy as np

# Repo-local dependencies assumed by this script: ENV, DQN, actions, save_q.


def main():
    original_size = (782, 600)
    env = ENV(actions, (original_size[0] / 6, original_size[1] / 6))
    gamma = 0.9
    epsilon = 0.95
    model_ph = 'models'
    if not os.path.exists(model_ph):
        os.mkdir(model_ph)
    trials = 500
    trial_len = 1000
    dqn_agent = DQN(env=env)
    success_num = 0
    rewards = []   # total reward per trial
    q_values = []  # mean Q-value per trial
    Q = []         # Q-value from every replay step
    for trial in range(1, trials):
        t_reward = []
        t_qvalue = []
        cur_state = env.reset()
        for step in range(trial_len):
            action = dqn_agent.act(cur_state)
            new_state, reward, done, success = env.step(action)
            t_reward.append(reward)
            # reward = reward if not done else -20
            dqn_agent.remember(cur_state, action, reward, new_state, done)
            q_value = dqn_agent.replay()  # internally iterates default (prediction) model
            if q_value:
                t_qvalue.append(q_value)
                Q.append(q_value)
            else:
                t_qvalue.append(0.0)
                Q.append(0.0)
            dqn_agent.target_train()  # iterates target model
            cur_state = new_state
            dqn_agent.log_result()
            save_q(Q)
            if success:
                success_num += 1
                dqn_agent.step = 100
                print("Completed in {} trials".format(trial))
                dqn_agent.save_model(os.path.join(model_ph, "success-model.h5"))
                break
            if done:
                print("Failed to complete in trial {}, step {}".format(trial, step))
                dqn_agent.save_model(os.path.join(model_ph, "trial-{}-model.h5".format(trial)))
                break
        rewards.append(np.sum(t_reward) if t_reward else 0.0)
        q_values.append(np.mean(t_qvalue) if t_qvalue else 0.0)
        with open('reward_and_Q/reward.txt', 'wb') as f:
            pickle.dump(rewards, f)
        with open('reward_and_Q/qvalue.txt', 'wb') as f:
            pickle.dump(q_values, f)
        print('trial: {}, success acc: {}'.format(trial, success_num / float(trial)))
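# The training loop above pickles its logs into a reward_and_Q/ directory and is
# presumably run as a script. A minimal entry point, assuming that directory is
# not created elsewhere in the repo:
if __name__ == '__main__':
    # Ensure the log directory exists before main() tries to write into it.
    os.makedirs('reward_and_Q', exist_ok=True)
    main()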