def main():
    args = get_args()
    args.critic_layers = literal_eval(args.critic_layers)
    args.actor_layers = literal_eval(args.actor_layers)

    num_actions = 19 if args.prosthetic else 22

    # create a throwaway environment only to measure the flattened state size
    env = RunEnv2(model=args.modeldim, prosthetic=args.prosthetic,
                  difficulty=args.difficulty, skip_frame=3)
    env.change_model(args.modeldim, args.prosthetic, args.difficulty)
    state = env.reset(seed=42, difficulty=0)
    # obs = env.get_observation()
    d = env.get_state_desc()
    state_size = len(env.dict_to_vec(d))
    del env

    model_params = {
        'state_size': state_size,
        'num_act': num_actions,
        'gamma': 0,
        'actor_layers': args.actor_layers,
        'critic_layers': args.critic_layers,
        'actor_lr': 0,
        'critic_lr': 0,
        'layer_norm': args.layer_norm,
    }

    test_agent(args, args.episodes, model_params)
def submit_agent(args, model_params):
    ##########################################################
    actor_fn, params_actor, params_crit = build_model_test(**model_params)
    weights = [p.get_value() for p in params_actor]
    actor = Agent(actor_fn, params_actor, params_crit)
    actor.set_actor_weights(weights)
    if args.weights is not None:
        actor.load(args.weights)

    env = RunEnv2(model=args.modeldim, prosthetic=args.prosthetic,
                  difficulty=args.difficulty, skip_frame=3)

    # Settings
    remote_base = "http://grader.crowdai.org:1729"
    token = args.token
    client = Client(remote_base)

    # Create environment
    di = client.env_create(token, env_id="ProstheticsEnv")

    stat = []
    ep = 1
    ii = 0
    reward_sum = 0
    print('\n\n#################################################\n\n')
    while True:
        ii += 1
        proj = env.dict_to_vec(di)
        action = actor.act(proj)
        action += np.random.rand(len(action)) / 10.
        [di, reward, done, info] = client.env_step(action.tolist(), True)
        reward_sum += reward
        print('ep: ' + str(ep) + ' >> step: ' + str(int(ii)) +
              ' >> reward: ' + format(reward, '.2f') + ' \t' + str(int(reward_sum)) +
              '\t >> pelvis X Y Z: \t' + format(di['body_pos']['pelvis'][0], '.2f') +
              '\t' + format(di['body_pos']['pelvis'][1], '.2f') +
              '\t' + format(di['body_pos']['pelvis'][2], '.2f'))
        if done:
            print('\n\n#################################################\n\n')
            stat.append([ep, ii, reward_sum])
            di = client.env_reset()
            ep += 1
            ii = 0
            reward_sum = 0
            if not di:
                break

    for e in stat:
        print(e)

    print('\n\nclient.submit()\n\n')
    client.submit()
    ##########################################################
    print('\n\n#################################################\n\n')
    print('DONE\n\n')
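# --- Hypothetical driver (not part of the original sources): a minimal sketch of
# how submit_agent could be wired to the same argument parsing used by main()
# above. get_args, literal_eval and the model_params keys are taken from the
# surrounding code; the state-size probing simply mirrors main() and the real
# submission script may differ.
def submit_main():
    args = get_args()
    args.critic_layers = literal_eval(args.critic_layers)
    args.actor_layers = literal_eval(args.actor_layers)
    num_actions = 19 if args.prosthetic else 22

    # probe the environment once to get the flattened state size
    env = RunEnv2(model=args.modeldim, prosthetic=args.prosthetic,
                  difficulty=args.difficulty, skip_frame=3)
    env.change_model(args.modeldim, args.prosthetic, args.difficulty)
    env.reset(seed=42, difficulty=0)
    state_size = len(env.dict_to_vec(env.get_state_desc()))
    del env

    model_params = {
        'state_size': state_size, 'num_act': num_actions, 'gamma': 0,
        'actor_layers': args.actor_layers, 'critic_layers': args.critic_layers,
        'actor_lr': 0, 'critic_lr': 0, 'layer_norm': args.layer_norm,
    }
    submit_agent(args, model_params)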
def test_agent(args, testing, num_test_episodes, model_params, weights,
               best_reward, updates, global_step, save_dir):
    env = RunEnv2(model=args.modeldim, prosthetic=args.prosthetic,
                  difficulty=args.difficulty, skip_frame=3)
    test_rewards_all = []
    test_pelvis_X_all = []

    train_fn, actor_fn, target_update_fn, params_actor, params_crit, actor_lr, critic_lr = \
        build_model(**model_params)
    actor = Agent(actor_fn, params_actor, params_crit)
    actor.set_actor_weights(weights)
    # if args.weights is not None:
    #     actor.load(args.weights)

    for ep in range(num_test_episodes):
        seed = random.randrange(2**32 - 2)
        state = env.reset(seed=seed, difficulty=0)
        test_reward = 0
        while True:
            state = np.asarray(state, dtype='float32')
            action = actor.act(state)
            state, reward, terminal, info = env._step(action)
            test_reward += reward
            if terminal:
                break
        test_rewards_all.append(test_reward)
        test_pelvis_X_all.append(info['pelvis_X'])

    test_reward_mean = np.mean(test_rewards_all)
    mean_pelvis_X = np.mean(test_pelvis_X_all)
    std_reward = np.std(test_rewards_all)

    test_str = 'global step {}; test_reward_mean: {:.2f}, test_rewards_all: {}; ' \
               'mean_pelvis_X: {:.2f}, test_pelvis_X_all: {}'.format(
                   global_step.value, float(test_reward_mean), test_rewards_all,
                   float(mean_pelvis_X), test_pelvis_X_all)
    print(test_str)
    try:
        with open(os.path.join(save_dir, 'test_report.log'), 'a') as f:
            f.write(test_str + '\n')
    except IOError:
        print('#############################################')
        print('could not write test_report.log')
        print('#############################################')

    if test_reward_mean > best_reward.value or test_reward_mean > 30 * env.reward_mult:
        if test_reward_mean > best_reward.value:
            best_reward.value = test_reward_mean
        fname = os.path.join(
            save_dir,
            'weights_updates_{}_reward_{:.1f}_pelvis_X_{:.1f}.pkl'.format(
                updates.value, test_reward_mean, mean_pelvis_X))
        actor.save(fname)
    testing.value = 0
def test_agent(args, testing, state_transform, num_test_episodes, model_params,
               weights, best_reward, updates, global_step, save_dir):
    env = RunEnv2(state_transform, visualize=args.test,
                  integrator_accuracy=args.accuracy, model=args.modeldim,
                  prosthetic=args.prosthetic, difficulty=args.difficulty,
                  skip_frame=1)
    test_rewards = []

    train_fn, actor_fn, target_update_fn, params_actor, params_crit, actor_lr, critic_lr = \
        build_model(**model_params)
    actor = Agent(actor_fn, params_actor, params_crit)
    actor.set_actor_weights(weights)
    if args.weights is not None:
        actor.load(args.weights)

    for ep in range(num_test_episodes):
        seed = random.randrange(2**32 - 2)
        state = env.reset(seed=seed, difficulty=2)
        test_reward = 0
        while True:
            state = np.asarray(state, dtype='float32')
            action = actor.act(state)
            state, reward, terminal, _ = env._step(action)
            test_reward += reward
            if terminal:
                break
        test_rewards.append(test_reward)

    mean_reward = np.mean(test_rewards)
    std_reward = np.std(test_rewards)

    test_str = 'global step {}; test reward mean: {:.2f}, std: {:.2f}, all: {}'.format(
        global_step.value, float(mean_reward), float(std_reward), test_rewards)
    print(test_str)
    with open(os.path.join(save_dir, 'test_report.log'), 'a') as f:
        f.write(test_str + '\n')

    if mean_reward > best_reward.value or mean_reward > 30 * env.reward_mult:
        if mean_reward > best_reward.value:
            best_reward.value = mean_reward
        fname = os.path.join(
            save_dir,
            'weights_updates_{}_reward_{:.2f}.pkl'.format(updates.value, mean_reward))
        actor.save(fname)
    testing.value = 0
def main():
    args = get_args()
    args.critic_layers = literal_eval(args.critic_layers)
    args.actor_layers = literal_eval(args.actor_layers)

    save_dir = 'tests'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    state_transform = NormState(args.prosthetic)
    # state_transform = StateVelCentr(obstacles_mode='standard',
    #                                 exclude_centr=True,
    #                                 vel_states=[])

    env = RunEnv2(state_transform, integrator_accuracy=args.accuracy,
                  model=args.modeldim, prosthetic=args.prosthetic,
                  difficulty=args.difficulty, skip_frame=1)
    env.change_model(args.modeldim, args.prosthetic, args.difficulty)
    num_actions = env.get_action_space_size()
    del env

    model_params = {
        'state_size': state_transform.state_size,
        'num_act': num_actions,
        'gamma': 0,
        'actor_layers': args.actor_layers,
        'critic_layers': args.critic_layers,
        'actor_lr': 0,
        'critic_lr': 0,
        'layer_norm': args.layer_norm,
    }

    actor_fn, params_actor, params_crit, actor_lr, critic_lr = \
        build_model_test(**model_params)
    actor = Agent(actor_fn, params_actor, params_crit)
    actor.load(args.weights)
    weights = [p.get_value() for p in params_actor]

    global_step = 0
    test_agent(args, state_transform, args.episodes, actor, weights,
               global_step, save_dir)
def test_agent(args, num_test_episodes, model_params):
    env = RunEnv2(visualize=True, model=args.modeldim, prosthetic=args.prosthetic,
                  difficulty=args.difficulty, skip_frame=3)
    test_rewards = []

    # train_fn, actor_fn, target_update_fn, params_actor, params_crit, actor_lr, critic_lr = build_model(**model_params)
    # actor_fn, params_actor, params_crit, actor_lr, critic_lr = build_model(**model_params)
    actor_fn, params_actor, params_crit = build_model_test(**model_params)
    weights = [p.get_value() for p in params_actor]
    actor = Agent(actor_fn, params_actor, params_crit)
    actor.set_actor_weights(weights)
    if args.weights is not None:
        actor.load(args.weights)

    for ep in range(num_test_episodes):
        seed = random.randrange(2**32 - 2)
        state = env.reset(seed=seed, difficulty=0)
        test_reward = 0
        while True:
            state = np.asarray(state, dtype='float32')
            # state = np.concatenate((state, state, state))[:390]  # ndrw tmp
            action = actor.act(state)
            # ndrw tmp: zero actions for debugging
            # if args.prosthetic:
            #     action = np.zeros(19)
            # else:
            #     action = np.zeros(22)
            state, reward, terminal, _ = env._step(action)
            test_reward += reward
            if terminal:
                break
        test_rewards.append(test_reward)

    mean_reward = np.mean(test_rewards)
    std_reward = np.std(test_rewards)

    global_step = 0  # no shared step counter in this standalone test script
    test_str = 'global step {}; test reward mean: {:.2f}, std: {:.2f}, all: {}'.format(
        global_step, float(mean_reward), float(std_reward), test_rewards)
    print(test_str)
    with open('test_report.log', 'a') as f:
        f.write(test_str + '\n')
def test_agent(args, state_transform, num_test_episodes, actor, weights,
               global_step, save_dir):
    env = RunEnv2(state_transform, visualize=True,
                  integrator_accuracy=args.accuracy, model=args.modeldim,
                  prosthetic=args.prosthetic, difficulty=args.difficulty,
                  skip_frame=config.skip_frames)
    test_rewards = []

    actor.set_actor_weights(weights)
    if args.weights is not None:
        actor.load(args.weights)

    for ep in range(num_test_episodes):
        seed = random.randrange(2**32 - 2)
        state = env.reset(seed=seed, difficulty=2)
        test_reward = 0
        while True:
            state = np.asarray(state, dtype='float32')
            action = actor.act(state)
            state, reward, terminal, _ = env._step(action)
            test_reward += reward
            if terminal:
                break
        test_rewards.append(test_reward)

    mean_reward = np.mean(test_rewards)
    std_reward = np.std(test_rewards)

    # global_step is a plain int here (see the test main above), not a multiprocessing Value
    test_str = 'global step {}; test reward mean: {:.2f}, std: {:.2f}, all: {}'.format(
        global_step, float(mean_reward), float(std_reward), test_rewards)
    print(test_str)
    with open(os.path.join(save_dir, 'test_report.log'), 'a') as f:
        f.write(test_str + '\n')
def run_agent(model_params, weights, state_transform, data_queue, weights_queue,
              process, global_step, updates, best_reward, param_noise_prob,
              save_dir, max_steps=10000000):
    train_fn, actor_fn, target_update_fn, params_actor, params_crit, actor_lr, critic_lr = \
        build_model(**model_params)
    actor = Agent(actor_fn, params_actor, params_crit)
    actor.set_actor_weights(weights)

    env = RunEnv2(state_transform, max_obstacles=config.num_obstacles,
                  skip_frame=config.skip_frames)
    random_process = OrnsteinUhlenbeckProcess(theta=.1, mu=0., sigma=.2,
                                              size=env.noutput, sigma_min=0.05,
                                              n_steps_annealing=1e6)

    # prepare buffers for data
    states = []
    actions = []
    rewards = []
    terminals = []

    total_episodes = 0
    start = time()
    action_noise = True
    while global_step.value < max_steps:
        seed = random.randrange(2**32 - 2)
        state = env.reset(seed=seed, difficulty=2)
        random_process.reset_states()

        total_reward = 0.
        total_reward_original = 0.
        terminal = False
        steps = 0

        while not terminal:
            state = np.asarray(state, dtype='float32')
            action = actor.act(state)
            if action_noise:
                action += random_process.sample()

            next_state, reward, next_terminal, info = env.step(action)
            total_reward += reward
            total_reward_original += info['original_reward']
            steps += 1
            global_step.value += 1

            # add data to buffers
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            terminals.append(terminal)

            state = next_state
            terminal = next_terminal

            if terminal:
                break

        total_episodes += 1

        # add data to buffers after episode end
        states.append(state)
        actions.append(np.zeros(env.noutput))
        rewards.append(0)
        terminals.append(terminal)

        states_np = np.asarray(states).astype(np.float32)
        data = (states_np,
                np.asarray(actions).astype(np.float32),
                np.asarray(rewards).astype(np.float32),
                np.asarray(terminals),
                )

        weight_send = None
        if total_reward > best_reward.value:
            weight_send = actor.get_actor_weights()

        # send data for training
        data_queue.put((process, data, weight_send, total_reward))

        # receive weights and set params to weights
        weights = weights_queue.get()

        report_str = 'Global step: {}, steps/sec: {:.2f}, updates: {}, episode len {}, ' \
                     'reward: {:.2f}, original_reward {:.4f}; best reward: {:.2f} noise {}'. \
            format(global_step.value, 1. * global_step.value / (time() - start),
                   updates.value, steps, total_reward, total_reward_original,
                   best_reward.value, 'actions' if action_noise else 'params')
        print(report_str)
        with open(os.path.join(save_dir, 'train_report.log'), 'a') as f:
            f.write(report_str + '\n')

        actor.set_actor_weights(weights)
        action_noise = np.random.rand() < 1 - param_noise_prob
        if not action_noise:
            set_params_noise(actor, states_np, random_process.current_sigma)

        # clear buffers
        del states[:]
        del actions[:]
        del rewards[:]
        del terminals[:]

        # periodically recreate the environment to avoid simulator slowdown/leaks
        if total_episodes % 100 == 0:
            env = RunEnv2(state_transform, max_obstacles=config.num_obstacles,
                          skip_frame=config.skip_frames)
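# --- Illustrative sketch (an assumption, not the repo's implementation): the
# OrnsteinUhlenbeckProcess used above typically produces temporally correlated
# exploration noise via x_{t+1} = x_t + theta*(mu - x_t)*dt + sigma*sqrt(dt)*N(0, 1),
# with sigma annealed from its initial value towards sigma_min over
# n_steps_annealing samples. A self-contained version could look like this:
class SimpleOUNoise(object):
    def __init__(self, size, theta=0.1, mu=0.0, sigma=0.2, dt=0.01,
                 sigma_min=0.05, n_steps_annealing=1e6):
        self.size, self.theta, self.mu, self.dt = size, theta, mu, dt
        self.sigma0, self.sigma_min = sigma, sigma_min
        self.decay = (sigma - sigma_min) / float(n_steps_annealing)
        self.n_steps = 0
        self.reset_states()

    @property
    def current_sigma(self):
        # linearly annealed noise scale, floored at sigma_min
        return max(self.sigma_min, self.sigma0 - self.decay * self.n_steps)

    def reset_states(self):
        self.x = np.ones(self.size) * self.mu

    def sample(self):
        self.n_steps += 1
        dx = self.theta * (self.mu - self.x) * self.dt + \
             self.current_sigma * np.sqrt(self.dt) * np.random.randn(self.size)
        self.x = self.x + dx
        return self.x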
def main():
    args = get_args()
    args.critic_layers = literal_eval(args.critic_layers)
    args.actor_layers = literal_eval(args.actor_layers)

    # create save directory
    save_dir = os.path.join('weights', args.exp_name)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    else:
        shutil.move(save_dir, save_dir + '.backup')
        os.makedirs(save_dir)

    # state_transform = StateVelCentr(obstacles_mode='standard', exclude_centr=True, vel_states=[])
    # num_actions = 18
    # state_transform = NormState(args.prosthetic)
    num_actions = 19 if args.prosthetic else 22

    # probe the environment once to get the flattened state size
    env = RunEnv2(model=args.modeldim, prosthetic=args.prosthetic,
                  difficulty=args.difficulty, skip_frame=3)
    env.change_model(args.modeldim, args.prosthetic, args.difficulty)
    state = env.reset(seed=42, difficulty=0)
    # obs = env.get_observation()
    d = env.get_state_desc()
    state_size = len(env.dict_to_vec(d))
    del env

    # build model
    model_params = {
        'state_size': state_size,
        'num_act': num_actions,
        'gamma': args.gamma,
        'actor_layers': args.actor_layers,
        'critic_layers': args.critic_layers,
        'actor_lr': args.actor_lr,
        'critic_lr': args.critic_lr,
        'layer_norm': args.layer_norm,
    }
    print('building model')
    train_fn, actor_fn, target_update_fn, params_actor, params_crit, actor_lr, critic_lr = \
        build_model(**model_params)
    actor = Agent(actor_fn, params_actor, params_crit)
    if args.weights is not None:
        actor.load(args.weights)  # set_actor_weights & set_crit_weights

    actor_lr_step = (args.actor_lr - args.actor_lr_end) / args.max_steps
    critic_lr_step = (args.critic_lr - args.critic_lr_end) / args.max_steps

    # build actor
    weights = [p.get_value() for p in params_actor]

    # build replay memory
    memory = ReplayMemory(state_size, num_actions, 5000000)

    # init shared variables
    global_step = Value('i', 0)
    updates = Value('i', 0)
    best_reward = Value('f', -1e8)
    testing = Value('i', 0)

    # init agents
    data_queue = Queue()
    workers = []
    weights_queues = []
    num_agents = args.n_threads - 2
    print('starting {} agents'.format(num_agents))
    for i in range(num_agents):
        w_queue = Queue()
        worker = Process(target=run_agent,
                         args=(args, model_params, weights, data_queue, w_queue,
                               i, global_step, updates, best_reward,
                               args.param_noise_prob, save_dir, args.max_steps))
        worker.daemon = True
        worker.start()
        sleep(args.sleep)
        workers.append(worker)
        weights_queues.append(w_queue)

    prev_steps = 0
    start_save = time()
    start_test = time()
    weights_rew_to_check = []

    while global_step.value < args.max_steps:
        # get all data
        try:
            i, batch, weights_check, reward = data_queue.get_nowait()
            if weights_check is not None:
                weights_rew_to_check.append((weights_check, reward))
            weights_queues[i].put(weights)
            # add data to memory
            memory.add_samples(*batch)
        except queue.Empty:
            pass

        # training step
        # TODO: consider not training while a test process is running
        if len(memory) > args.start_train_steps:
            batch = memory.random_batch(args.batch_size)

            # if np.random.rand() < args.flip_prob:
            #     states, actions, rewards, terminals, next_states = batch
            #
            #     states_flip = state_transform.flip_states(states)
            #     next_states_flip = state_transform.flip_states(next_states)
            #     actions_flip = np.zeros_like(actions)
            #     actions_flip[:, :num_actions//2] = actions[:, num_actions//2:]
            #     actions_flip[:, num_actions//2:] = actions[:, :num_actions//2]
            #
            #     states_all = np.concatenate((states, states_flip))
            #     actions_all = np.concatenate((actions, actions_flip))
            #     rewards_all = np.tile(rewards.ravel(), 2).reshape(-1, 1)
            #     terminals_all = np.tile(terminals.ravel(), 2).reshape(-1, 1)
            #     next_states_all = np.concatenate((next_states, next_states_flip))
            #     batch = (states_all, actions_all, rewards_all, terminals_all, next_states_all)

            actor_loss, critic_loss = train_fn(*batch)
            updates.value += 1
            if np.isnan(actor_loss):
                raise ValueError('actor loss is nan')
            if np.isnan(critic_loss):
                raise ValueError('critic loss is nan')
            target_update_fn()
            weights = actor.get_actor_weights()

        # linearly decay learning rates with the number of environment steps
        delta_steps = global_step.value - prev_steps
        prev_steps += delta_steps

        actor_lr.set_value(lasagne.utils.floatX(
            max(actor_lr.get_value() - delta_steps * actor_lr_step, args.actor_lr_end)))
        critic_lr.set_value(lasagne.utils.floatX(
            max(critic_lr.get_value() - delta_steps * critic_lr_step, args.critic_lr_end)))

        # check if need to save and test
        if (time() - start_save) / 60. > args.save_period_min:
            fname = os.path.join(save_dir, 'weights_updates_{}.pkl'.format(updates.value))
            actor.save(fname)
            start_save = time()

        # start new test process
        weights_rew_to_check = [(w, r) for w, r in weights_rew_to_check
                                if r > best_reward.value and r > 0]
        weights_rew_to_check = sorted(weights_rew_to_check, key=lambda x: x[1])
        if ((time() - start_test) / 60. > args.test_period_min
                or len(weights_rew_to_check) > 0) and testing.value == 0:
            testing.value = 1
            print('start test')
            if len(weights_rew_to_check) > 0:
                _weights, _ = weights_rew_to_check.pop()
            else:
                _weights = weights
            worker = Process(target=test_agent,
                             args=(args, testing, args.num_test_episodes,
                                   model_params, _weights, best_reward, updates,
                                   global_step, save_dir))
            worker.daemon = True
            worker.start()
            start_test = time()

    # end all processes
    for w in workers:
        w.join()
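# --- Assumed entry point for the training script; the actual guard may live
# elsewhere in the repository.
if __name__ == '__main__':
    main()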
def run_agent(args, model_params, weights, data_queue, weights_queue, process,
              global_step, updates, best_reward, param_noise_prob, save_dir,
              max_steps=10000000):
    train_fn, actor_fn, target_update_fn, params_actor, params_crit, actor_lr, critic_lr = \
        build_model(**model_params)
    actor = Agent(actor_fn, params_actor, params_crit)
    actor.set_actor_weights(weights)

    env = RunEnv2(model=args.modeldim, prosthetic=args.prosthetic,
                  difficulty=args.difficulty, skip_frame=config.skip_frames)
    env.spec.timestep_limit = 3000  # ndrw

    # random_process = OrnsteinUhlenbeckProcess(theta=.1, mu=0., sigma=.3, size=env.noutput,
    #                                           sigma_min=0.05, n_steps_annealing=1e6)
    # randomize exploration settings per worker
    sigma_rand = random.uniform(0.05, 0.5)
    dt_rand = random.uniform(0.002, 0.02)
    param_noise_prob = random.uniform(param_noise_prob * 0.25,
                                      min(param_noise_prob * 1.5, 1.))
    random_process = OrnsteinUhlenbeckProcess(theta=.1, mu=0., sigma=sigma_rand,
                                              dt=dt_rand, size=env.noutput,
                                              sigma_min=0.05, n_steps_annealing=1e6)
    print('OUProcess_sigma = ' + str(sigma_rand) +
          ' OUProcess_dt = ' + str(dt_rand) +
          ' param_noise_prob = ' + str(param_noise_prob))

    # prepare buffers for data
    states = []
    actions = []
    rewards = []
    terminals = []

    total_episodes = 0
    start = time()
    action_noise = True
    while global_step.value < max_steps:
        seed = random.randrange(2**32 - 2)
        state = env.reset(seed=seed, difficulty=args.difficulty)
        random_process.reset_states()

        total_reward = 0.
        total_reward_original = 0.
        terminal = False
        steps = 0

        while not terminal:
            state = np.asarray(state, dtype='float32')
            action = actor.act(state)
            if action_noise:
                action += random_process.sample()

            next_state, reward, next_terminal, info = env._step(action)
            total_reward += reward
            total_reward_original += info['original_reward']
            steps += 1
            global_step.value += 1

            # add data to buffers
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            terminals.append(terminal)

            state = next_state
            terminal = next_terminal

            if terminal:
                break

        total_episodes += 1

        # add data to buffers after episode end
        states.append(state)
        actions.append(np.zeros(env.noutput))
        rewards.append(0)
        terminals.append(terminal)

        states_np = np.asarray(states).astype(np.float32)
        data = (states_np,
                np.asarray(actions).astype(np.float32),
                np.asarray(rewards).astype(np.float32),
                np.asarray(terminals),
                )

        weight_send = None
        if total_reward > best_reward.value:
            weight_send = actor.get_actor_weights()

        # send data for training
        data_queue.put((process, data, weight_send, total_reward))

        # receive weights and set params to weights
        weights = weights_queue.get()

        # earlier report formats:
        # report_str = 'Global step: {}, steps/sec: {:.2f}, updates: {}, episode len: {}, ' \
        #              'pelvis_X: {:.2f}, reward: {:.2f}, original_reward {:.4f}, ' \
        #              'best reward: {:.2f}, noise: {}'.format(
        #                  global_step.value, 1. * global_step.value / (time() - start),
        #                  updates.value, steps, info['pelvis_X'], total_reward,
        #                  total_reward_original, best_reward.value,
        #                  'actions' if action_noise else 'params')
        # report_str = 'Global step: {}, steps/sec: {:.2f}, updates: {}, episode len: {}, ' \
        #              'pelvis_X: {:.2f}, reward: {:.2f}, best reward: {:.2f}, noise: {}'.format(
        #                  global_step.value, 1. * global_step.value / (time() - start),
        #                  updates.value, steps, info['pelvis_X'], total_reward,
        #                  best_reward.value, 'actions' if action_noise else 'params')
        report_str = 'Global step: {}, steps/sec: {:.2f}, updates: {}, episode len: {}, ' \
                     'pelvis_X: {:.2f}, pelvis_Z: {:.2f}, reward: {:.2f}, ' \
                     'best reward: {:.2f}, noise: {}'.format(
                         global_step.value, 1. * global_step.value / (time() - start),
                         updates.value, steps, info['pelvis'][0], info['pelvis'][2],
                         total_reward, best_reward.value,
                         'actions' if action_noise else 'params')
        print(report_str)
        try:
            with open(os.path.join(save_dir, 'train_report.log'), 'a') as f:
                f.write(report_str + '\n')
        except IOError:
            print('#############################################')
            print('could not write train_report.log')
            print('#############################################')

        actor.set_actor_weights(weights)
        action_noise = np.random.rand() < 1 - param_noise_prob
        if not action_noise:
            set_params_noise(actor, states_np, random_process.current_sigma)

        # clear buffers
        del states[:]
        del actions[:]
        del rewards[:]
        del terminals[:]

        # periodically recreate the environment to avoid simulator slowdown/leaks
        if total_episodes % 100 == 0:
            env = RunEnv2(model=args.modeldim, prosthetic=args.prosthetic,
                          difficulty=args.difficulty, skip_frame=config.skip_frames)
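# --- Illustrative sketch (assumption): set_params_noise(actor, states, sigma) is
# expected to implement adaptive parameter-space noise in the spirit of
# Plappert et al. (2017): perturb the actor weights with Gaussian noise and adapt
# the perturbation scale so that the induced change in actions on recent states
# roughly matches the action-noise sigma. The helper below is hypothetical and
# only shows the idea, not the repo's implementation.
def set_params_noise_sketch(actor, states, target_sigma, tol=1.05, n_iter=10):
    clean_weights = actor.get_actor_weights()
    clean_actions = np.array([actor.act(s) for s in states])
    param_sigma = target_sigma
    for _ in range(n_iter):
        noisy = [w + np.random.normal(0., param_sigma, size=w.shape)
                 for w in clean_weights]
        actor.set_actor_weights(noisy)
        noisy_actions = np.array([actor.act(s) for s in states])
        # distance between perturbed and unperturbed policies on the same states
        dist = np.sqrt(np.mean(np.square(noisy_actions - clean_actions)))
        if dist > target_sigma * tol:
            param_sigma /= tol
        elif dist < target_sigma / tol:
            param_sigma *= tol
        else:
            break
    # the actor is left with the last perturbation applied, so subsequent
    # actor.act() calls explore in parameter space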