p.add_argument('--do_not_save', action='store_true')
p.add_argument('--learning_freq', type=int, default=50)
p.add_argument('--log_every_t_iter', type=int, default=50)
p.add_argument('--max_gradient', type=float, default=10.0)
p.add_argument('--n_iter', type=int, default=10000)
p.add_argument('--seed', type=int, default=0)
p.add_argument('--wait_until_rbuffer', type=int, default=1000)
args = p.parse_args()

# Handle the log directory and save the arguments.
logdir = 'out/' + args.envname + '/seed' + str(args.seed).zfill(2)
if args.do_not_save:
    logdir = None
logz.configure_output_dir(logdir)
if logdir is not None:
    with open(logdir + '/args.pkl', 'wb') as f:
        pickle.dump(args, f)
print("Saving in logdir: {}".format(logdir))

# Other stuff for seeding and getting things set up.
tf.set_random_seed(args.seed)
np.random.seed(args.seed)
env = gym.make(args.envname)
test_env = gym.make(args.envname)
tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                           intra_op_parallelism_threads=1)
sess = tf.Session(config=tf_config)

ddpg = DDPGAgent(sess, env, test_env, args)
ddpg.train()
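Because the full argument namespace is pickled to args.pkl, a finished run can later be inspected or reproduced exactly. Below is a minimal sketch of reloading it from a later analysis script; the run directory here is a hypothetical example following the out/&lt;envname&gt;/seed&lt;NN&gt; pattern built above:

import pickle

# Hypothetical run directory; substitute the actual envname and seed.
with open('out/Hopper-v1/seed00/args.pkl', 'rb') as f:
    saved_args = pickle.load(f)

# The restored argparse.Namespace carries every hyperparameter of the run.
print(saved_args.n_iter, saved_args.learning_freq, saved_args.seed)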
episode_reward = 0
for step in range(500):
    if episode >= 45:
        env.render()
    action = agent.get_action(state, ou_noise)
    new_state, reward, done, _ = env.step(action)
    agent.memory.push(state, action, reward, new_state, done)

    # Start updating the networks once the replay buffer holds a full batch.
    if len(agent.memory) > batch_size:
        agent.train(batch_size)

    state = new_state
    episode_reward += reward

    if done:
        # No trailing average exists yet on the very first episode.
        avg_reward = "nan" if episode == 0 else np.mean(rewards[-10:])
        sys.stdout.write(
            "episode: {}, reward: {}, average reward: {} \n".format(
                episode, np.round(episode_reward, decimals=2), avg_reward))
        break
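The ou_noise object passed into get_action is constructed elsewhere; in DDPG, exploration noise is conventionally an Ornstein-Uhlenbeck process (Lillicrap et al., 2016), whose temporally correlated samples suit physical control tasks. Below is a minimal sketch of such a process; the class name, default coefficients, and sample/reset interface are assumptions for illustration, not the implementation used above:

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: dx = theta * (mu - x) + sigma * N(0, 1)."""

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(action_dim)
        self.theta = theta
        self.sigma = sigma
        self.state = self.mu.copy()

    def reset(self):
        # Call at episode boundaries so noise does not carry across episodes.
        self.state = self.mu.copy()

    def sample(self):
        # Mean-reverting drift plus Gaussian diffusion; successive samples
        # are correlated, unlike independent Gaussian noise.
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state

Inside get_action, the actor's deterministic output would then typically be perturbed as action = actor(state) + noise.sample() and clipped to the environment's action bounds.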