Example #1
import argparse
import pickle

import gym
import numpy as np
import tensorflow as tf

import logz                    # project-local logging helper used below
from ddpg import DDPGAgent     # import path assumed; not shown in the original

if __name__ == "__main__":
    p = argparse.ArgumentParser()
    # --envname is used below for the log directory and gym.make; its exact
    # flags are not shown in the original, so this line is a reconstruction.
    p.add_argument('--envname', type=str, required=True)
    p.add_argument('--do_not_save', action='store_true')
    p.add_argument('--learning_freq', type=int, default=50)
    p.add_argument('--log_every_t_iter', type=int, default=50)
    p.add_argument('--max_gradient', type=float, default=10.0)
    p.add_argument('--n_iter', type=int, default=10000)
    p.add_argument('--seed', type=int, default=0)
    p.add_argument('--wait_until_rbuffer', type=int, default=1000)
    args = p.parse_args()

    # Handle the log directory and save the arguments.
    logdir = 'out/' + args.envname + '/seed' + str(args.seed).zfill(2)
    if args.do_not_save:
        logdir = None
    logz.configure_output_dir(logdir)
    if logdir is not None:
        with open(logdir + '/args.pkl', 'wb') as f:
            pickle.dump(args, f)
    print("Saving in logdir: {}".format(logdir))

    # Other stuff for seeding and getting things set up.
    tf.set_random_seed(args.seed)
    np.random.seed(args.seed)
    env = gym.make(args.envname)
    test_env = gym.make(args.envname)
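    # Single-threaded TF ops make runs more reproducible under a fixed seed.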
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)
    sess = tf.Session(config=tf_config)

    ddpg = DDPGAgent(sess, env, test_env, args)
    ddpg.train()
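
For reference, a typical invocation of the script above might look like this
(the script filename is an assumption; the flags come from the parser itself):

    python train_ddpg.py --envname Pendulum-v0 --seed 0 --n_iter 10000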

Example #2

# Reconstructed context: this fragment clearly sits inside a per-episode
# training loop, but the loop header and the resets below are assumptions
# rather than the original code. `env`, `agent`, `ou_noise`, `batch_size`,
# `sys`, and `np` must be defined or imported elsewhere.
rewards = []
for episode in range(50):  # episode count is illustrative
    state = env.reset()
    ou_noise.reset()
    episode_reward = 0

    for step in range(500):
        if episode >= 45:
            # Render only the final few episodes to keep early training fast.
            env.render()
        # Query the policy for an action, with exploration noise applied.
        action = agent.get_action(state, ou_noise)
        new_state, reward, done, _ = env.step(action)
        agent.memory.push(state, action, reward, new_state, done)

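        # Train once the replay memory can supply a full minibatch.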
        if len(agent.memory) > batch_size:
            agent.train(batch_size)

        state = new_state
        episode_reward += reward

        if done:
            # Report the episode return and the average return over the last
            # ten episodes (undefined on the very first episode).
            if episode == 0:
                avg_reward = "nan"
            else:
                avg_reward = np.mean(rewards[-10:])
            sys.stdout.write(
                "episode: {}, reward: {}, average_reward: {}\n".format(
                    episode, np.round(episode_reward, decimals=2), avg_reward))
            break

    # Track per-episode returns for the moving average (reconstructed line;
    # the original snippet ends at the break above).
    rewards.append(episode_reward)
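
The loop above assumes an `ou_noise` object with a `reset()` method and some
way of sampling temporally correlated exploration noise. A minimal sketch of
the Ornstein-Uhlenbeck process commonly paired with DDPG follows; the class
name, hyperparameters, and `sample()` interface are assumptions, not the
original implementation:

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(action_dim)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Restart the process at its mean at the start of each episode.
        self.state = self.mu.copy()

    def sample(self):
        # x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, I)
        dx = self.theta * (self.mu - self.state)
        dx += self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state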