class ModelInterface():
    """Thin wrapper around a pre-trained DDPG agent.

    Loads actor/critic weights from fixed checkpoint files on disk and
    exposes helpers to query the critic's Q-value and the actor's action.
    """

    def __init__(self):
        # NOTE(review): STATE_SIZE / ACTION_SIZE are module-level constants
        # defined elsewhere in the project.
        self.agent = Agent(state_size=STATE_SIZE, action_size=ACTION_SIZE, random_seed=10)
        # Checkpoints are always mapped to CPU so no GPU is required at inference.
        self.agent.actor_local.load_state_dict(
            torch.load('model/checkpoint_actor.pth', map_location='cpu'))
        self.agent.critic_local.load_state_dict(
            torch.load('model/checkpoint_critic.pth', map_location='cpu'))
        # Inference only: disable dropout/batch-norm training behaviour.
        self.agent.actor_local.eval()
        self.agent.critic_local.eval()

    def get_action_q(self, state, action):
        """Return the critic's Q-value for a single (state, action) pair.

        The critic expects a batch, so the pair is placed in row 0 of a
        zero-padded 128-row batch and only the [0, 0] entry is returned.
        Assumes state has 6 features and action has 2 — TODO confirm
        against STATE_SIZE/ACTION_SIZE.
        """
        batch_states = np.zeros((128, 6))
        batch_states[0, :] = state
        batch_actions = np.zeros((128, 2))
        batch_actions[0, :] = action
        q_values = self.agent.critic_local(torch.Tensor(batch_states),
                                           torch.Tensor(batch_actions))
        return q_values.detach().numpy()[0, 0]

    def get_action(self, state):
        """Return the actor's action for *state* (delegates to Agent.act)."""
        return self.agent.act(state)
def submit_agent(args, model_params):
    """Run the trained actor against the remote crowdAI grader and submit.

    Builds a test-mode actor, optionally loads weights from ``args.weights``,
    then steps the remote grader environment until it signals completion and
    calls ``client.submit()``. Per-step progress and a per-episode summary
    are printed to stdout.
    """
    ##########################################################
    actor_fn, params_actor, params_crit = build_model_test(**model_params)
    weights = [p.get_value() for p in params_actor]
    actor = Agent(actor_fn, params_actor, params_crit)
    actor.set_actor_weights(weights)
    if args.weights is not None:
        actor.load(args.weights)

    # Local env is only used for its observation-dict -> vector conversion below.
    env = RunEnv2(model=args.modeldim, prosthetic=args.prosthetic, difficulty=args.difficulty, skip_frame=3)

    # Settings
    remote_base = "http://grader.crowdai.org:1729"
    token = args.token
    client = Client(remote_base)

    # Create environment
    di = client.env_create(token, env_id="ProstheticsEnv")
    stat = []          # per-episode records: [episode, steps, reward_sum]
    ep = 1
    ii = 0             # step counter within the current episode
    reward_sum = 0
    print('\n\n#################################################\n\n')
    while True:
        ii += 1
        # Grader returns a raw observation dict; flatten it for the actor.
        proj = env.dict_to_vec(di)
        action = actor.act(proj)
        # Small uniform exploration noise added even at submission time
        # — presumably intentional; verify before changing.
        action += np.random.rand(len(action)) / 10.
        [di, reward, done, info] = client.env_step(action.tolist(), True)
        reward_sum += reward
        print('ep: ' + str(ep) + ' >> step: ' + str(int(ii)) + ' >> reward: ' + format(reward, '.2f') + ' \t' + str(int(reward_sum)) + '\t >> pelvis X Y Z: \t' + format(di['body_pos']['pelvis'][0], '.2f') + '\t' + format(di['body_pos']['pelvis'][1], '.2f') + '\t' + format(di['body_pos']['pelvis'][2], '.2f'))
        if done:
            print('\n\n#################################################\n\n')
            stat.append([ep, ii, reward_sum])
            # env_reset returns a falsy value when the grader has no more
            # episodes to run; that terminates the loop.
            di = client.env_reset()
            ep += 1
            ii = 0
            reward_sum = 0
            if not di:
                break
    for e in stat:
        print(e)
    print('\n\nclient.submit()\n\n')
    client.submit()
    ##########################################################
    print('\n\n#################################################\n\n')
    print('DONE\n\n')
def test_agent(args, testing, num_test_episodes, model_params, weights, best_reward, updates, global_step, save_dir):
    """Evaluate the shared actor weights for ``num_test_episodes`` episodes.

    Runs greedy (noise-free) rollouts at difficulty 0, logs mean reward and
    mean pelvis-X to ``save_dir/test_report.log``, saves a weights snapshot
    when a new best mean reward (or > 30 * reward_mult) is reached, and
    finally clears the shared ``testing`` flag.

    ``best_reward``, ``updates``, ``global_step`` and ``testing`` are shared
    multiprocessing values (accessed via ``.value``).
    """
    env = RunEnv2(model=args.modeldim, prosthetic=args.prosthetic, difficulty=args.difficulty, skip_frame=3)
    test_rewards_all = []
    test_pelvis_X_all = []
    train_fn, actor_fn, target_update_fn, params_actor, params_crit, actor_lr, critic_lr = build_model(
        **model_params)
    actor = Agent(actor_fn, params_actor, params_crit)
    actor.set_actor_weights(weights)
    # if args.weights is not None:
    #     actor.load(args.weights)
    for ep in range(num_test_episodes):
        seed = random.randrange(2**32 - 2)
        # Evaluation is always at difficulty 0 regardless of args.difficulty.
        state = env.reset(seed=seed, difficulty=0)
        test_reward = 0
        while True:
            state = np.asarray(state, dtype='float32')
            action = actor.act(state)
            state, reward, terminal, info = env._step(action)
            test_reward += reward
            if terminal:
                break
        test_rewards_all.append(test_reward)
        test_pelvis_X_all.append(info['pelvis_X'])
    test_reward_mean = np.mean(test_rewards_all)
    mean_pelvis_X = np.mean(test_pelvis_X_all)
    test_str = 'global step {}; test_reward_mean: {:.2f}, test_rewards_all: {}; mean_pelvis_Xmean: {:.2f}, test_pelvis_X_all: {} '.\
        format(global_step.value, float(test_reward_mean), test_rewards_all, float(mean_pelvis_X), test_pelvis_X_all)
    print(test_str)
    try:
        with open(os.path.join(save_dir, 'test_report.log'), 'a') as f:
            f.write(test_str + '\n')
    # FIX: was a bare `except:` which also swallowed KeyboardInterrupt and
    # SystemExit; only file-system errors are expected here.
    except OSError:
        print('#############################################')
        print('except » f.write(test_str )')
        print('#############################################')
    if test_reward_mean > best_reward.value or test_reward_mean > 30 * env.reward_mult:
        if test_reward_mean > best_reward.value:
            best_reward.value = test_reward_mean
        fname = os.path.join(
            save_dir, 'weights_updates_{}_reward_{:.1f}_pelvis_X_{:.1f}.pkl'.format(
                updates.value, test_reward_mean, mean_pelvis_X))
        actor.save(fname)
    testing.value = 0
def test_agent(args, testing, state_transform, num_test_episodes, model_params, weights, best_reward, updates, global_step, save_dir):
    """Evaluate shared actor weights at difficulty 2, log, and snapshot bests.

    Runs ``num_test_episodes`` noise-free rollouts, appends a summary line to
    ``save_dir/test_report.log``, saves the actor weights when the mean reward
    beats ``best_reward`` (or exceeds 30 * env.reward_mult), then clears the
    shared ``testing`` flag so training can resume.
    """
    env = RunEnv2(state_transform, visualize=args.test, integrator_accuracy=args.accuracy,
                  model=args.modeldim, prosthetic=args.prosthetic,
                  difficulty=args.difficulty, skip_frame=1)
    (train_fn, actor_fn, target_update_fn, params_actor, params_crit,
     actor_lr, critic_lr) = build_model(**model_params)
    actor = Agent(actor_fn, params_actor, params_crit)
    actor.set_actor_weights(weights)
    if args.weights is not None:
        actor.load(args.weights)

    test_rewards = []
    for _ in range(num_test_episodes):
        episode_seed = random.randrange(2**32 - 2)
        obs = env.reset(seed=episode_seed, difficulty=2)
        episode_reward = 0
        terminal = False
        while not terminal:
            obs = np.asarray(obs, dtype='float32')
            obs, reward, terminal, _ = env._step(actor.act(obs))
            episode_reward += reward
        test_rewards.append(episode_reward)

    mean_reward = np.mean(test_rewards)
    std_reward = np.std(test_rewards)
    test_str = 'global step {}; test reward mean: {:.2f}, std: {:.2f}, all: {} '.format(
        global_step.value, float(mean_reward), float(std_reward), test_rewards)
    print(test_str)
    with open(os.path.join(save_dir, 'test_report.log'), 'a') as f:
        f.write(test_str + '\n')

    # Snapshot when we beat the best so far, or cross the absolute threshold.
    if mean_reward > best_reward.value or mean_reward > 30 * env.reward_mult:
        if mean_reward > best_reward.value:
            best_reward.value = mean_reward
        fname = os.path.join(
            save_dir,
            'weights_updates_{}_reward_{:.2f}.pkl'.format(updates.value, mean_reward))
        actor.save(fname)
    testing.value = 0
def test_agent(args, num_test_episodes, model_params):
    """Visually evaluate a (test-mode) actor for ``num_test_episodes`` episodes.

    Builds the actor with ``build_model_test``, optionally loads weights from
    ``args.weights``, runs noise-free rollouts at difficulty 0 with rendering
    enabled, and appends a summary line to ``test_report.log``.
    """
    env = RunEnv2(visualize=True, model=args.modeldim, prosthetic=args.prosthetic,
                  difficulty=args.difficulty, skip_frame=3)
    test_rewards = []
    actor_fn, params_actor, params_crit = build_model_test(**model_params)
    weights = [p.get_value() for p in params_actor]
    actor = Agent(actor_fn, params_actor, params_crit)
    actor.set_actor_weights(weights)
    if args.weights is not None:
        actor.load(args.weights)
    for ep in range(num_test_episodes):
        seed = random.randrange(2**32 - 2)
        # Always evaluated at difficulty 0, regardless of args.difficulty.
        state = env.reset(seed=seed, difficulty=0)
        test_reward = 0
        while True:
            state = np.asarray(state, dtype='float32')
            action = actor.act(state)
            state, reward, terminal, _ = env._step(action)
            test_reward += reward
            if terminal:
                break
        test_rewards.append(test_reward)
    mean_reward = np.mean(test_rewards)
    std_reward = np.std(test_rewards)
    # FIX: this standalone variant has no shared counter, so global_step is a
    # plain int; the original formatted `global_step.value`, which raised
    # AttributeError on every run.
    global_step = 0
    test_str = 'global step {}; test reward mean: {:.2f}, std: {:.2f}, all: {} '.format(
        global_step, float(mean_reward), float(std_reward), test_rewards)
    print(test_str)
    # FIX: os.path.join with a single argument was a no-op.
    with open('test_report.log', 'a') as f:
        f.write(test_str + '\n')
def run_agent(model_params, weights, state_transform, data_queue, weights_queue,
              process, global_step, updates, best_reward, param_noise_prob, save_dir,
              max_steps=10000000):
    """Rollout worker for asynchronous DDPG-style training.

    Repeatedly plays episodes with exploration noise (OU action noise or
    parameter-space noise), pushes transition buffers to ``data_queue`` and
    blocks on ``weights_queue`` for refreshed actor weights after every
    episode.  ``global_step``, ``updates`` and ``best_reward`` are shared
    multiprocessing values.  ``process`` identifies this worker in the queue
    messages.  Runs until ``global_step`` reaches ``max_steps``.
    """
    train_fn, actor_fn, target_update_fn, params_actor, params_crit, actor_lr, critic_lr = \
        build_model(**model_params)
    actor = Agent(actor_fn, params_actor, params_crit)
    actor.set_actor_weights(weights)
    env = RunEnv2(state_transform, max_obstacles=config.num_obstacles,
                  skip_frame=config.skip_frames)
    # Ornstein-Uhlenbeck noise for temporally-correlated action exploration,
    # annealed from sigma=0.2 down to 0.05 over 1e6 steps.
    random_process = OrnsteinUhlenbeckProcess(theta=.1, mu=0., sigma=.2, size=env.noutput,
                                              sigma_min=0.05, n_steps_annealing=1e6)
    # prepare buffers for data
    states = []
    actions = []
    rewards = []
    terminals = []
    total_episodes = 0
    start = time()
    action_noise = True
    while global_step.value < max_steps:
        seed = random.randrange(2**32-2)
        state = env.reset(seed=seed, difficulty=2)
        random_process.reset_states()
        total_reward = 0.
        total_reward_original = 0.
        terminal = False
        steps = 0
        while not terminal:
            state = np.asarray(state, dtype='float32')
            action = actor.act(state)
            if action_noise:
                action += random_process.sample()
            next_state, reward, next_terminal, info = env.step(action)
            total_reward += reward
            total_reward_original += info['original_reward']
            steps += 1
            global_step.value += 1
            # add data to buffers
            # NOTE: `terminal` stored here is the flag for the *current* state
            # (pre-transition), not `next_terminal`.
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            terminals.append(terminal)
            state = next_state
            terminal = next_terminal
            if terminal:
                break
        total_episodes += 1
        # add data to buffers after episode end
        # (final state paired with a zero action/reward so the trainer sees
        # the terminal flag)
        states.append(state)
        actions.append(np.zeros(env.noutput))
        rewards.append(0)
        terminals.append(terminal)
        states_np = np.asarray(states).astype(np.float32)
        data = (states_np,
                np.asarray(actions).astype(np.float32),
                np.asarray(rewards).astype(np.float32),
                np.asarray(terminals),
                )
        # Only ship actor weights upstream when this episode beat the best
        # reward seen so far.
        weight_send = None
        if total_reward > best_reward.value:
            weight_send = actor.get_actor_weights()
        # send data for training
        data_queue.put((process, data, weight_send, total_reward))
        # receive weights and set params to weights
        weights = weights_queue.get()
        report_str = 'Global step: {}, steps/sec: {:.2f}, updates: {}, episode len {}, ' \
                     'reward: {:.2f}, original_reward {:.4f}; best reward: {:.2f} noise {}'. \
            format(global_step.value, 1. * global_step.value / (time() - start), updates.value,
                   steps, total_reward, total_reward_original, best_reward.value,
                   'actions' if action_noise else 'params')
        print(report_str)
        with open(os.path.join(save_dir, 'train_report.log'), 'a') as f:
            f.write(report_str + '\n')
        actor.set_actor_weights(weights)
        # Choose the noise mode for the next episode: action noise with
        # probability (1 - param_noise_prob), otherwise parameter noise.
        action_noise = np.random.rand() < 1 - param_noise_prob
        if not action_noise:
            set_params_noise(actor, states_np, random_process.current_sigma)
        # clear buffers
        del states[:]
        del actions[:]
        del rewards[:]
        del terminals[:]
        # Periodically rebuild the env — presumably to work around simulator
        # memory leaks; confirm before removing.
        if total_episodes % 100 == 0:
            env = RunEnv2(state_transform, max_obstacles=config.num_obstacles,
                          skip_frame=config.skip_frames)
n_episode += 1 # observation: [row, column, RGB] observation = env.reset() agent.reset(observation) done = False timestep = 0 print('Episode start: %s' % (episode)) # Play game while (done is False): env.render() action = agent.act() observation, reward, done, info = env.step(action) agent.train(observation, reward, done) timestep += 1 print('Episode finished after timestep: %s' % (timestep)) env.close() print('Training complete after episode: %s' % (n_episode)) # plt.matshow(observation[:100, 20:80 ,0]) # plt.matshow(observation[:,:,2]) # plt.matshow(observation[:,:,1]) ob = env.reset() observation.shape
def run_agent(args, model_params, weights, data_queue, weights_queue, process,
              global_step, updates, best_reward, param_noise_prob, save_dir,
              max_steps=10000000):
    """Rollout worker for the prosthetics variant of asynchronous training.

    Plays episodes with randomized Ornstein-Uhlenbeck exploration noise (or
    parameter-space noise), ships per-episode transition buffers through
    ``data_queue`` and blocks on ``weights_queue`` for refreshed actor
    weights.  ``global_step``, ``updates`` and ``best_reward`` are shared
    multiprocessing values; ``process`` identifies this worker.  Runs until
    ``global_step`` reaches ``max_steps``.
    """
    train_fn, actor_fn, target_update_fn, params_actor, params_crit, actor_lr, critic_lr = build_model(
        **model_params)
    actor = Agent(actor_fn, params_actor, params_crit)
    actor.set_actor_weights(weights)
    env = RunEnv2(model=args.modeldim, prosthetic=args.prosthetic,
                  difficulty=args.difficulty, skip_frame=config.skip_frames)
    env.spec.timestep_limit = 3000  # ndrw
    # Randomize exploration hyper-parameters per worker so parallel workers
    # explore differently.
    sigma_rand = random.uniform(0.05, 0.5)
    dt_rand = random.uniform(0.002, 0.02)
    param_noise_prob = random.uniform(param_noise_prob * 0.25,
                                      min(param_noise_prob * 1.5, 1.))
    random_process = OrnsteinUhlenbeckProcess(theta=.1, mu=0., sigma=sigma_rand, dt=dt_rand,
                                              size=env.noutput, sigma_min=0.05,
                                              n_steps_annealing=1e6)
    print('OUProcess_sigma = ' + str(sigma_rand) + ' OUProcess_dt = ' + str(dt_rand) +
          ' param_noise_prob = ' + str(param_noise_prob))

    # prepare buffers for data
    states = []
    actions = []
    rewards = []
    terminals = []
    total_episodes = 0
    start = time()
    action_noise = True
    while global_step.value < max_steps:
        seed = random.randrange(2**32 - 2)
        state = env.reset(seed=seed, difficulty=args.difficulty)
        random_process.reset_states()
        total_reward = 0.
        total_reward_original = 0.
        terminal = False
        steps = 0
        while not terminal:
            state = np.asarray(state, dtype='float32')
            action = actor.act(state)
            if action_noise:
                action += random_process.sample()
            next_state, reward, next_terminal, info = env._step(action)
            total_reward += reward
            total_reward_original += info['original_reward']
            steps += 1
            global_step.value += 1
            # add data to buffers (terminal flag is for the *current* state)
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            terminals.append(terminal)
            state = next_state
            terminal = next_terminal
            if terminal:
                break
        total_episodes += 1
        # add data to buffers after episode end: final state paired with a
        # zero action/reward so the trainer sees the terminal flag
        states.append(state)
        actions.append(np.zeros(env.noutput))
        rewards.append(0)
        terminals.append(terminal)
        states_np = np.asarray(states).astype(np.float32)
        data = (
            states_np,
            np.asarray(actions).astype(np.float32),
            np.asarray(rewards).astype(np.float32),
            np.asarray(terminals),
        )
        # Only ship actor weights upstream when this episode beat the best.
        weight_send = None
        if total_reward > best_reward.value:
            weight_send = actor.get_actor_weights()
        # send data for training
        data_queue.put((process, data, weight_send, total_reward))
        # receive weights and set params to weights
        weights = weights_queue.get()
        report_str = ('Global step: {}, steps/sec: {:.2f}, updates: {}, episode len: {}, '
                      'pelvis_X: {:.2f}, pelvis_Z: {:.2f}, reward: {:.2f}, '
                      'best reward: {:.2f}, noise: {}'.format(
                          global_step.value, 1. * global_step.value / (time() - start),
                          updates.value, steps, info['pelvis'][0], info['pelvis'][2],
                          total_reward, best_reward.value,
                          'actions' if action_noise else 'params'))
        print(report_str)
        try:
            with open(os.path.join(save_dir, 'train_report.log'), 'a') as f:
                f.write(report_str + '\n')
        # FIX: was a bare `except:` which also swallowed KeyboardInterrupt /
        # SystemExit; only file-system errors are expected here.
        except OSError:
            print('#############################################')
            print(
                'except » with open(os.path.join(save_dir, train_report.log), a) as f:'
            )
            print('#############################################')
        actor.set_actor_weights(weights)
        # Pick the noise mode for the next episode: action noise with
        # probability (1 - param_noise_prob), otherwise parameter noise.
        action_noise = np.random.rand() < 1 - param_noise_prob
        if not action_noise:
            set_params_noise(actor, states_np, random_process.current_sigma)
        # clear buffers
        del states[:]
        del actions[:]
        del rewards[:]
        del terminals[:]
        # Periodically rebuild the env — presumably to work around simulator
        # memory leaks; confirm before removing.
        if total_episodes % 100 == 0:
            env = RunEnv2(model=args.modeldim, prosthetic=args.prosthetic,
                          difficulty=args.difficulty, skip_frame=config.skip_frames)