Example #1
import argparse
import os
import random

import gym
import numpy as np
import tensorflow as tf

# make_network, Agent, and initialize are helpers defined elsewhere in this project.


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='Pendulum-v0')
    parser.add_argument('--outdir', type=str, default=None)
    parser.add_argument('--logdir', type=str, default=None)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--final-steps', type=int, default=10**7)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--batch', type=int, default=64)
    parser.add_argument('--epoch', type=int, default=10)
    args = parser.parse_args()

    if args.outdir is None:
        args.outdir = os.path.join(os.path.dirname(__file__), 'results')
        if not os.path.exists(args.outdir):
            os.makedirs(args.outdir)
    if args.logdir is None:
        args.logdir = os.path.join(os.path.dirname(__file__), 'logs')

    env = gym.make(args.env)
    dam = gym.make("MyAntdam-v1")
    heal = env
    obs_dim = env.observation_space.shape[0]
    n_actions = env.action_space.shape[0]

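    # make_network is a project helper; [64, 64] are the hidden-layer sizes of the network handed to the Agent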
    network = make_network([64, 64])

    sess = tf.Session()
    sess.__enter__()

    agent = Agent(network, obs_dim, n_actions)

    initialize()
    agent.sync_old()

    saver = tf.train.Saver(max_to_keep=50)
    if args.load is not None:
        saver.restore(sess, args.load)

    reward_summary = tf.placeholder(tf.float32, (), name='reward_summary')  # rewards are floats; int32 would truncate the logged value
    tf.summary.scalar('reward_summary', reward_summary)
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(args.logdir, sess.graph)

    global_step = 0
    episode = 0
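    # prob is the chance of sampling the healthy env for an episode; it is annealed as global_step grows
    # so the damaged env is used more often later in training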
    prob = 0.9
    while True:
        local_step = 0
        if global_step > 100000:
            prob = 0.5
        elif global_step > 50000:
            prob = 0.7
        elif global_step > 30000:
            prob = 0.8
        while True:
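            # with probability (1 - prob), run this episode in the damaged-Ant env ("MyAntdam-v1")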
            if np.random.rand() > prob:
                env = dam
            else:
                env = heal

            training_data = []
            sum_of_reward = 0
            reward = 0
            obs = env.reset()
            last_obs = None
            last_action = None
            last_value = None
            done = False

            while not done:
                if args.render:
                    env.render()

                action, value = agent.act_and_train(last_obs, last_action,
                                                    last_value, reward, obs)
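                # divergence guard: if the policy produces NaN actions, checkpoint the model and force both loops to exit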
                if np.isnan(action).any():
                    print('NaN found')
                    path = os.path.join(args.outdir,
                                        '{}/model.ckpt'.format(global_step))
                    saver.save(sess, path)
                    local_step = 3000
                    global_step = args.final_steps
                    break
                last_obs = obs
                last_action = action
                last_value = value
                obs, reward, done, info = env.step(action)

                sum_of_reward += reward
                global_step += 1
                local_step += 1

                # save model
                if global_step % (5000) == 0:
                    path = os.path.join(args.outdir,
                                        '{}/model.ckpt'.format(global_step))
                    saver.save(sess, path)

                # the end of episode
                if done:
                    summary, _ = sess.run(
                        [merged, reward_summary],
                        feed_dict={reward_summary: sum_of_reward})
                    train_writer.add_summary(summary, global_step)
                    agent.stop_episode(last_obs, last_action, last_value,
                                       reward)
                    print('Episode: {}, Step: {}: Reward: {} Dam: {}'.format(
                        episode, global_step, sum_of_reward, last_obs[-1]))
                    episode += 1
                    break

            # append data for training
            training_data.append(agent.get_training_data())

            if local_step > 2048:
                break

        # train network
        obs = []
        actions = []
        returns = []
        deltas = []
        for o, a, r, d in training_data:
            obs.extend(o)
            actions.extend(a)
            returns.extend(r)
            deltas.extend(d)
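        # update the network for args.epoch iterations on random minibatches drawn from the collected rollout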
        print "Now Training"
        for epoch in range(args.epoch):
            indices = random.sample(range(len(obs)), min(len(obs), args.batch))
            sampled_obs = np.array(obs)[indices]
            sampled_actions = np.array(actions)[indices]
            sampled_returns = np.array(returns)[indices]
            sampled_deltas = np.array(deltas)[indices]

            ratio = agent.train(sampled_obs, sampled_actions, sampled_returns,
                                sampled_deltas)

        if args.final_steps < global_step:
            break
Example #2
File: exp3.py  Project: arowdy98/NIPS18
import argparse
import os
import random

import numpy as np
import tensorflow as tf
# ProstheticsEnv is provided by the osim-rl package (NIPS 2018 "AI for Prosthetics" challenge).
from osim.env import ProstheticsEnv

# make_network, Agent, and initialize are helpers defined elsewhere in this project.


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='Pendulum-v0')
    parser.add_argument('--outdir',
                        type=str,
                        default="/home/aditya/NIPS18/output")
    parser.add_argument('--logdir',
                        type=str,
                        default="/home/aditya/NIPS18/output")
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--final-steps', type=int, default=10**7)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--batch', type=int, default=64)
    parser.add_argument('--epoch', type=int, default=10)
    args = parser.parse_args()

    if args.outdir is None:
        args.outdir = os.path.join(os.path.dirname(__file__), 'results')
        if not os.path.exists(args.outdir):
            os.makedirs(args.outdir)
    if args.logdir is None:
        args.logdir = os.path.join(os.path.dirname(__file__), 'logs')

    #env = gym.make(args.env)
    env = ProstheticsEnv(visualize=False)
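    # the observation dimension is hard-coded for the ProstheticsEnv observation vector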
    obs_dim = 160
    n_actions = env.action_space.shape[0]

    network = make_network([128, 128, 128])

    sess = tf.Session()
    sess.__enter__()

    agent = Agent(network, obs_dim, n_actions)

    initialize()
    agent.sync_old()

    saver = tf.train.Saver()
    if args.load is not None:
        saver.restore(sess, args.load)

    reward_summary = tf.placeholder(tf.float32, (), name='reward_summary')  # rewards are floats; int32 would truncate the logged value
    tf.summary.scalar('reward_summary', reward_summary)
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(args.logdir, sess.graph)

    global_step = 0
    episode = 0
    while True:
        local_step = 0

        while True:
            training_data = []
            sum_of_reward = 0
            reward = 0
            obs = env.reset()
            last_obs = None
            last_action = None
            last_value = None
            done = False

            while not done:
                if args.render:
                    env.render()

                action, value = agent.act_and_train(last_obs, last_action,
                                                    last_value, reward, obs)

                last_obs = obs
                last_action = action
                last_value = value
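                # clip the action into (0, 1] with NumPy so the env receives an array, not a TF tensor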
                action = np.clip(action, 1e-10, 1.0)
                print(action)
                obs, reward, done, info = env.step(action)

                sum_of_reward += reward
                global_step += 1
                local_step += 1

                # save model
                if global_step % 10**6 == 0:
                    path = os.path.join(args.outdir,
                                        '{}/model.ckpt'.format(global_step))
                    saver.save(sess, path)

                # the end of episode
                if done:
                    summary, _ = sess.run(
                        [merged, reward_summary],
                        feed_dict={reward_summary: sum_of_reward})
                    train_writer.add_summary(summary, global_step)
                    agent.stop_episode(last_obs, last_action, last_value,
                                       reward)
                    print('Episode: {}, Step: {}: Reward: {}'.format(
                        episode, global_step, sum_of_reward))
                    episode += 1
                    break

            # append data for training
            training_data.append(agent.get_training_data())

            if local_step > 2048:
                break

        # train network
        obs = []
        actions = []
        returns = []
        deltas = []
        for o, a, r, d in training_data:
            obs.extend(o)
            actions.extend(a)
            returns.extend(r)
            deltas.extend(d)
        for epoch in range(args.epoch):
            indices = random.sample(range(len(obs)), min(len(obs), args.batch))
            sampled_obs = np.array(obs)[indices]
            sampled_actions = np.array(actions)[indices]
            sampled_returns = np.array(returns)[indices]
            sampled_deltas = np.array(deltas)[indices]
            ratio = agent.train(sampled_obs, sampled_actions, sampled_returns,
                                sampled_deltas)

        if args.final_steps < global_step:
            break
Example #3
        while True:
            training_data = []
            sum_of_reward = 0
            reward = 0
            obs = env.reset()
            last_obs = None
            last_action = None
            last_value = None
            done = False

            while not done:
                if args.render:
                    env.render()

                action, value = agent.act_and_train(
                        last_obs, last_action, last_value, reward,  obs)
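                # divergence guard, as in Example #1: checkpoint and stop if the policy emits NaN actions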
                if numpy.isnan(action).any():
                    print('NaN found')
                    path = os.path.join(args.outdir,
                            '{}/model.ckpt'.format(global_step))
                    # max_to_keep is a tf.train.Saver() constructor argument, not a save() argument
                    saver.save(sess, path)
                    local_step = 3000
                    global_step = args.final_steps
                    break
                last_obs = obs
                last_action = action
                last_value = value
                obs, reward, done, info = env.step(action)

                sum_of_reward += reward
                global_step += 1
Example #4
import cv2
import numpy as np

# mario, Agent, and get_session are helpers defined elsewhere in this project.


class Worker:
    def __init__(self, name, model, icm_model, global_step, env_name, render=False, training=True):
        self.training = training
        self.actions = np.arange(14).tolist()
        self.env = mario.make(env_name)
        self.name = name
        self.render = render
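        # build this worker's Agent from the given policy model and ICM (curiosity) model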
        self.agent = Agent(model, icm_model, len(self.actions), name=name)
        self.global_step = global_step
        self.inc_global_step = global_step.assign_add(1)

    def run(self, sess, summary_writer, saver):
        with sess.as_default():
            local_step = 0

            while True:
                states = np.zeros((4, 42, 42), dtype=np.float32)
                reward = 0
                done = False
                clipped_reward = 0
                sum_of_rewards = 0
                step = 0
                state = self.env.reset()

                while True:
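                    # preprocess: convert the frame to grayscale, resize to 42x42, and keep a rolling stack of the last 4 frames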
                    state = cv2.cvtColor(state, cv2.COLOR_RGB2GRAY)
                    state = cv2.resize(state, (42, 42))
                    states = np.roll(states, 1, axis=0)
                    states[0] = state

                    if done:
                        if self.training:
                            self.agent.stop_episode_and_train(
                                    np.transpose(states, [1, 2, 0]), clipped_reward, summary_writer, done=done)
                        else:
                            self.agent.stop_episode()
                        break

                    if self.training:
                        action = self.agent.act_and_train(np.transpose(states, [1, 2, 0]), clipped_reward, summary_writer)
                    else:
                        action = self.agent.act(np.transpose(states, [1, 2, 0]))
                    action = self.actions[action]

                    state, reward, done, info = self.env.step(action)
                    if self.render:
                        self.env.render()

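                    # clip the raw environment reward to {-1.0, 0.0, +1.0}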
                    if reward > 0:
                        clipped_reward = 1.0
                    elif reward < 0:
                        clipped_reward = -1.0
                    else:
                        clipped_reward = 0.0
                    sum_of_rewards += reward
                    step += 1
                    local_step += 1 
                    global_step = get_session().run(self.inc_global_step)
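                    # checkpoint the model every 10^6 global steps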
                    if self.training and global_step % 1000000 == 0:
                        saver.save(sess, 'models/model', global_step=global_step)

                print('worker: {}, global: {}, local: {}, reward: {} distance: {}'.format(
                        self.name, self.global_step.value().eval(), local_step, sum_of_rewards, info['distance']))