import gym
import numpy as np
import tensorflow as tf

# DDPG is the project's agent class, assumed to be importable from the
# local agent module alongside this script.


def main(args):
    env = gym.make('Walker2d-v1')
    reward_history = []
    agent = DDPG(env)
    agent.construct_model(args.gpu)

    saver = tf.train.Saver()
    if args.model_path is not None:
        # reuse a saved model
        saver.restore(agent.sess, args.model_path)
    else:
        # build a new model
        agent.sess.run(tf.global_variables_initializer())

    for episode in range(args.ep):
        # env init
        state = env.reset()
        total_rewards = 0
        for step in range(env.spec.timestep_limit):
            env.render()
            action = agent.sample_action(state[np.newaxis, :], explore=False)
            # act
            next_state, reward, done, _ = env.step(action[0])
            total_rewards += reward
            agent.store_experience(state, action, reward, next_state, done)
            agent.update_model()
            # shift
            state = next_state
            if done:
                break
        reward_history.append(total_rewards)
        print('Ep%d reward: %d' % (episode + 1, total_rewards))
    print('Average rewards: ', np.mean(reward_history))
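The script above reads `args.model_path`, `args.ep`, and `args.gpu` but does not show how they are parsed. A minimal command-line entry point could look like the following sketch; the flag names, types, and defaults are illustrative assumptions, not taken from the original repository.

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # flag names and defaults below are assumptions for illustration
    parser.add_argument('--model_path', default=None, type=str,
                        help='checkpoint to restore; a new model is built if omitted')
    parser.add_argument('--ep', default=10, type=int,
                        help='number of episodes to run')
    parser.add_argument('--gpu', default=-1, type=int,
                        help='GPU index passed to construct_model (-1 for CPU)')
    main(parser.parse_args())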
import gym
from gym import wrappers
import numpy as np
import tensorflow as tf


def main(args):
    env = gym.make('Walker2d-v1')
    # record evaluation videos with the Monitor wrapper
    env = wrappers.Monitor(env, './videos/', force=True)
    reward_history = []
    agent = DDPG(env, args)
    agent.construct_model(args.gpu)

    saver = tf.train.Saver()
    if args.model_path is not None:
        # reuse a saved model
        saver.restore(agent.sess, args.model_path)
        ep_base = int(args.model_path.split('_')[-1])
        best_avg_rewards = float(args.model_path.split('/')[-1].split('_')[0])
    else:
        raise ValueError('model_path required!')

    for ep in range(args.ep):
        # env init
        state = env.reset()
        ep_rewards = 0
        for step in range(env.spec.timestep_limit):
            env.render()
            action = agent.sample_action(state[np.newaxis, :], noise=False)
            # act
            next_state, reward, done, _ = env.step(action[0])
            ep_rewards += reward
            agent.store_experience(state, action, reward, next_state, done)
            # shift
            state = next_state
            if done:
                break
        reward_history.append(ep_rewards)
        print('Ep%d reward: %d' % (ep + 1, ep_rewards))
    print('Average rewards: ', np.mean(reward_history))
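The two `split` calls above assume checkpoints are named `<best_avg_reward>_<episode>`, the convention used by the training script further below. A quick illustration of what they recover, using a hypothetical checkpoint path:

# hypothetical checkpoint path following the <avg_reward>_<episode> convention
model_path = './models/1032.57_400'

ep_base = int(model_path.split('_')[-1])                            # 400
best_avg_rewards = float(model_path.split('/')[-1].split('_')[0])   # 1032.57
print(ep_base, best_avg_rewards)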
import os

import gym
import numpy as np
import tensorflow as tf


def main(args):
    env = gym.make('Walker2d-v1')
    agent = DDPG(env)
    agent.construct_model(args.gpu)

    saver = tf.train.Saver(max_to_keep=1)
    if args.model_path is not None:
        # reuse a saved model
        saver.restore(agent.sess, args.model_path)
        ep_base = int(args.model_path.split('_')[-1])
    else:
        # build a new model
        agent.sess.run(tf.global_variables_initializer())
        ep_base = 0

    MAX_EPISODES = 100000
    TEST = 10
    for episode in range(MAX_EPISODES):
        # env init
        state = env.reset()
        total_rewards = 0
        for step in range(env.spec.timestep_limit):
            action = agent.sample_action(state[np.newaxis, :], explore=True)
            # act
            next_state, reward, done, _ = env.step(action[0])
            total_rewards += reward
            agent.store_experience(state, action, reward, next_state, done)
            agent.update_model()
            # shift
            state = next_state
            if done:
                print('Ep %d global_steps: %d Reward: %.2f' % (
                    episode + 1, agent.global_steps, total_rewards))
                # reset OU noise
                agent.ou.reset()
                break

        # Evaluation every 100 episodes
        if episode % 100 == 0 and episode > 100:
            total_rewards = 0
            for ep_eval in range(TEST):
                state = env.reset()
                for step_eval in range(env.spec.timestep_limit):
                    action = agent.sample_action(state[np.newaxis, :], explore=False)
                    next_state, reward, done, _ = env.step(action[0])
                    total_rewards += reward
                    state = next_state
                    if done:
                        break
            mean_rewards = total_rewards / TEST

            # logging
            print('\n')
            print('Episode: %d' % (episode + 1))
            print('Global steps: %d' % agent.global_steps)
            print('Mean reward: %.2f' % mean_rewards)
            print('\n')
            if not os.path.isdir(args.save_path):
                os.makedirs(args.save_path)
            save_name = args.save_path \
                + str(episode) + '_' + str(round(mean_rewards, 2))
            saver.save(agent.sess, save_name)
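The call to `agent.ou.reset()` implies the agent carries an Ornstein-Uhlenbeck exploration-noise process, the usual choice for DDPG action noise. The agent's actual class is not shown here; a minimal sketch of such a process, with assumed parameter values for theta, sigma, and dt, might look like this:

import numpy as np


class OUNoise(object):
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2, dt=1e-2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.dt = dt
        self.reset()

    def reset(self):
        # start each episode from the long-run mean
        self.state = np.ones(self.action_dim) * self.mu

    def sample(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
        x = self.state
        dx = self.theta * (self.mu - x) * self.dt \
            + self.sigma * np.sqrt(self.dt) * np.random.randn(self.action_dim)
        self.state = x + dx
        return self.state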
import os

import gym
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf


def main(args):
    set_random_seed(args.seed)
    env = gym.make('Walker2d-v1')
    agent = DDPG(env, args)
    agent.construct_model(args.gpu)

    saver = tf.train.Saver(max_to_keep=1)
    if args.model_path is not None:
        # reuse a saved model
        saver.restore(agent.sess, args.model_path)
        ep_base = int(args.model_path.split('_')[-1])
        best_avg_rewards = float(args.model_path.split('/')[-1].split('_')[0])
    else:
        # build a new model
        agent.sess.run(tf.global_variables_initializer())
        ep_base = 0
        best_avg_rewards = None

    reward_history, step_history = [], []
    train_steps = 0
    for ep in range(args.max_ep):
        # env init
        state = env.reset()
        ep_rewards = 0
        for step in range(env.spec.timestep_limit):
            action = agent.sample_action(state[np.newaxis, :], noise=True)
            # act
            next_state, reward, done, _ = env.step(action[0])
            train_steps += 1
            ep_rewards += reward
            agent.store_experience(state, action, reward, next_state, done)
            agent.update_model()
            # shift
            state = next_state
            if done:
                print('Ep %d global_steps: %d Reward: %.2f' % (
                    ep + 1, agent.global_steps, ep_rewards))
                # reset OU noise
                agent.ou.reset()
                break

        step_history.append(train_steps)
        if not reward_history:
            reward_history.append(ep_rewards)
        else:
            # exponential moving average of episode rewards (running reward)
            reward_history.append(reward_history[-1] * 0.99 + ep_rewards * 0.01)

        # Evaluate during training
        if ep % args.log_every == 0 and ep > 0:
            ep_rewards = 0
            for ep_eval in range(args.test_ep):
                state = env.reset()
                for step_eval in range(env.spec.timestep_limit):
                    action = agent.sample_action(state[np.newaxis, :], noise=False)
                    next_state, reward, done, _ = env.step(action[0])
                    ep_rewards += reward
                    state = next_state
                    if done:
                        break
            curr_avg_rewards = ep_rewards / args.test_ep

            # logging
            print('\n')
            print('Episode: %d' % (ep + 1))
            print('Global steps: %d' % agent.global_steps)
            print('Mean reward: %.2f' % curr_avg_rewards)
            print('\n')
            if not best_avg_rewards or (curr_avg_rewards >= best_avg_rewards):
                best_avg_rewards = curr_avg_rewards
                if not os.path.isdir(args.save_path):
                    os.makedirs(args.save_path)
                save_name = args.save_path + str(round(best_avg_rewards, 2)) \
                    + '_' + str(ep_base + ep + 1)
                saver.save(agent.sess, save_name)
                print('Model saved: %s' % save_name)

    plt.plot(step_history, reward_history)
    plt.xlabel('steps')
    plt.ylabel('running reward')
    plt.show()
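`set_random_seed` is a project helper that is not shown in this listing. A plausible implementation, assuming it simply seeds Python, NumPy, and TensorFlow (TF 1.x, matching the `tf.train.Saver` usage above), is:

import random

import numpy as np
import tensorflow as tf


def set_random_seed(seed):
    # seed every source of randomness the training script relies on
    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)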