Example no. 1
def main():
    env = filter_env.makeFilteredEnv(gym.make(ENV_NAME))
    agent = DDPG(env)
    env = gym.wrappers.Monitor(env, 'experiments/' + ENV_NAME, force=True)

    for episode in xrange(EPISODES):
        state = env.reset()
        #print "episode:",episode
        # Train
        for step in xrange(env.spec.timestep_limit):
            action = agent.noise_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.perceive(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        # Testing:
        if episode % 100 == 0 and episode > 100:
            total_reward = 0
            for i in xrange(TEST):
                state = env.reset()
                for j in xrange(env.spec.timestep_limit):
                    #env.render()
                    action = agent.action(state)  # direct action for test
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print 'episode: ', episode, 'Evaluation Average Reward:', ave_reward
    env.close()
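All of these examples call filter_env.makeFilteredEnv before handing the environment to DDPG, but the helper itself is not shown in any of the snippets. A minimal sketch of what such a filter typically does (an assumption about its behavior, not the actual implementation) is to let the agent act in [-1, 1] and rescale actions back to the environment's native Box range:

import gym
import numpy as np


class NormalizedActionEnv(object):
    """Sketch only: the agent acts in [-1, 1]; step() rescales back to the
    wrapped environment's Box range. The real filter_env may also normalize
    observations, which is omitted here."""

    def __init__(self, env):
        self.env = env
        self.spec = env.spec
        self.observation_space = env.observation_space
        self._low = env.action_space.low
        self._high = env.action_space.high
        # advertise a [-1, 1] action space to the agent
        self.action_space = gym.spaces.Box(-np.ones_like(self._low),
                                           np.ones_like(self._high))

    def reset(self):
        return self.env.reset()

    def step(self, action):
        action = np.clip(action, -1.0, 1.0)
        scaled = self._low + (action + 1.0) * 0.5 * (self._high - self._low)
        return self.env.step(scaled)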
Example no. 2
def main():
    env = filter_env.makeFilteredEnv(gym.make(ENV_NAME))
    agent = DDPG(env)
    env.monitor.start('experiments/' + ENV_NAME,force=True)

    for episode in xrange(EPISODES):
        state = env.reset()
        #print "episode:",episode
        # Train
        for step in xrange(env.spec.timestep_limit):
            action = agent.noise_action(state)
            next_state,reward,done,_ = env.step(action)
            agent.perceive(state,action,reward,next_state,done)
            state = next_state
            if done:
                break
        # Testing:
        if episode % 100 == 0 and episode > 100:
            total_reward = 0
            for i in xrange(TEST):
                state = env.reset()
                for j in xrange(env.spec.timestep_limit):
                    #env.render()
                    action = agent.action(state) # direct action for test
                    state,reward,done,_ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward/TEST
            print 'episode: ',episode,'Evaluation Average Reward:',ave_reward
    env.monitor.close()
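Note that the scripts target two different monitoring APIs: older gym releases expose env.monitor.start(...) / env.monitor.close() as used in this example, while later releases moved recording into a wrapper. The rough equivalent with the wrapper API (as Example no. 1 already does) is:

import gym
from gym import wrappers

env = gym.make(ENV_NAME)  # ENV_NAME as defined at module level in these scripts
env = wrappers.Monitor(env, 'experiments/' + ENV_NAME, force=True)
# ... run training / evaluation ...
env.close()  # closing the wrapped env also finalizes the monitor files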
Example no. 3
def main():
    finishedTraining = EPISODES
    startTime = time.time()
    env = filter_env.makeFilteredEnv(gym.make(ENV_NAME))
    results_file = open("ResultsNew.csv", 'a')
    agent = DDPG(env, results_file)
    env = Monitor(env, directory='experiments/' + ENV_NAME, force=True)
    results_file.write("Episodes Spent Training; " + str(TEST) +
                       " Episode Eval Avg \n")
    for episode in range(EPISODES):
        state = env.reset()
        if (episode % 20 == 0):
            print("episode:", episode)
        # Train
        for step in range(env.spec.timestep_limit):
            action = agent.noise_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.perceive(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        # Testing:
        if (episode + 1) % 100 == 0 and episode > 100:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()
                for j in range(env.spec.timestep_limit):
                    env.render()
                    action = agent.action(state)  # direct action for test
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print('episode: ', episode, 'Evaluation Average Reward:',
                  ave_reward)
            results_file.write(str(episode) + "; " + str(ave_reward) + "\n")
            if ave_reward > 800 and finishedTraining > episode + 300:
                finishedTraining = episode + 300
            elif (episode >= finishedTraining):
                break

    results_file.write("Time Training (" + str(EPISODES) + "episodes);" +
                       str(time.time() - startTime) + "\n")
    results_file.write("Evaluation Episode; Reward \n")
    for episode in range(100):
        total_reward = 0
        env.reset()
        state = env.env.env.set_test(episode)
        for j in range(env.spec.timestep_limit):
            action = agent.action(state)  # direct action for test
            state, reward, done, _ = env.step(action)
            total_reward += reward
            if done:
                break
        results_file.write(str(episode) + "; " + str(total_reward) + "\n")
    results_file.write("endExperiment\n\n")
    results_file.close()
Example no. 4
def main():
    startTime = time.time()
    env = filter_env.makeFilteredEnv(gym.make(ENV_NAME))
    results_file = open("MoreExactReward12.csv", 'a')
    agent = DDPG(env, results_file)
    env = Monitor(env, directory='experiments/' + ENV_NAME, force=True)
    results_file.write("Episodes Spent Training; " + str(TEST) +
                       " Episode Eval Avg; Learned Reward Map \n")
    for episode in range(EPISODES):
        state = env.reset()
        if (episode % 20 == 0):
            print("episode:", episode)
        # Train
        for step in range(env.spec.timestep_limit):
            action = agent.noise_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.perceive(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        # Testing:
        if episode % 100 == 0 and episode > 100:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()
                for j in range(env.spec.timestep_limit):
                    #env.render()
                    action = agent.action(state)  # direct action for test
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print('episode: ', episode, 'Evaluation Average Reward:',
                  ave_reward)
            results_file.write(str(episode) + "; " + str(ave_reward) + ";")
            results_file.write("%s \n" % (np.array_str(
                agent.actor_network.net[-1].eval())).replace("\n", " "))

    results_file.write("Time Training (" + str(EPISODES) + "episodes);" +
                       str(time.time() - startTime) + "\n")
    results_file.write(
        "Final Learned Reward Map; %s \n" %
        (np.array_str(agent.actor_network.net[-1].eval())).replace("\n", " "))

    results_file.write("Evaluation Episode; Reward \n")
    for episode in range(100):
        total_reward = 0
        state = env.reset()
        for j in range(env.spec.timestep_limit):
            action = agent.action(state)  # direct action for test
            state, reward, done, _ = env.step(action)
            total_reward += reward
            if done:
                break
        results_file.write(str(episode) + "; " + str(total_reward) + "\n")
    results_file.write("endExperiment\n\n")
    results_file.close()
Example no. 5
def main():
    env = filter_env.makeFilteredEnv(gym.make(ENV_NAME))
    #env = gym.wrappers.Monitor(env, 'experiments/' + ENV_NAME,force=True)
    state = env.reset()

    ddpg_server = GymDDPGServer(env, state)
    ddpg_server.Open()

    rospy.spin()
Example no. 6
  def run(self):
    self.t_train = 0
    self.t_test = 0

    # create filtered environment
    self.env = filter_env.makeFilteredEnv(gym.make(FLAGS.env))
    # self.env = gym.make(FLAGS.env)
    
    self.env.monitor.start(FLAGS.outdir+'/monitor/',video_callable=lambda _: False)
    gym.logger.setLevel(gym.logging.WARNING)

    dimO = self.env.observation_space.shape
    dimA = self.env.action_space.shape
    print(dimO,dimA)

    import pprint
    pprint.pprint(self.env.spec.__dict__,width=1)

    self.agent = ddpg.Agent(dimO=dimO,dimA=dimA)

    returns = []

    # main loop
    while self.t_train < FLAGS.total:

      # test
      T = self.t_test
      R = []
      self.env.monitor.start(FLAGS.outdir+'/monitor/',video_callable=lambda _: False,resume=True)
      while self.t_test - T < FLAGS.test:
        R.append(self.run_episode(test=True,monitor=(len(R)==0)))
      avr = np.mean(R)
      print('Average test return\t{} after {} timesteps of training'.format(avr,self.t_train))
      # save return
      returns.append((self.t_train, avr))
      np.save(FLAGS.outdir+"/returns.npy",returns)

      # evaluate required number of episodes for gym
      if self.env.spec.reward_threshold is not None and avr > self.env.spec.reward_threshold:
        for i in range(self.env.spec.trials):
          self.run_episode(test=True)

      self.env.monitor.close()


      # train
      T = self.t_train
      R = []
      while self.t_train - T < FLAGS.train:
        R.append(self.run_episode(test=False))
      avr = np.mean(R)
      print('Average training return\t{} after {} timesteps of training'.format(avr,self.t_train))

    self.env.monitor.close()
    # upload results
    if FLAGS.upload:
      gym.upload(FLAGS.outdir+"/monitor",algorithm_id = GYM_ALGO_ID)
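The run() loop above, and the similar loops in Examples no. 8, 13 and 25, delegate the actual rollout to a run_episode helper that none of these snippets include (Example no. 6 counts timesteps inside it, Example no. 8 counts episodes outside it). A minimal sketch of such a helper, with illustrative agent method names rather than the real ddpg.Agent API, could be:

  def run_episode(self, test=True, monitor=False):
    # Illustrative only: roll out one episode and grow the counters that
    # run() compares against FLAGS.test / FLAGS.train. The real helper
    # presumably also uses `monitor` to toggle video recording.
    obs = self.env.reset()
    total_reward, done = 0.0, False
    while not done:
      action = self.agent.act(obs, test=test)      # hypothetical agent call
      obs, reward, done, _ = self.env.step(action)
      if test:
        self.t_test += 1
      else:
        self.agent.observe(obs, reward, done)      # hypothetical: store + learn
        self.t_train += 1
      total_reward += reward
    return total_reward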
Example no. 7
def main():
    env = filter_env.makeFilteredEnv(gym.make(ENV_NAME))
    agent = DDPG(env)
    #monitor=wrappers.Monitor(env,'experiments/'+ENV_NAME,force=True);
    for episode in xrange(EPISODES):
        program_order_idx = np.random.randint(1, 4)
        env.set_order(program_order_idx, order_list[program_order_idx])
        state = env.reset()
        # Train
        for step in xrange(env.spec.timestep_limit):
            action = agent.noise_action(state, order_list[program_order_idx])
            next_state, reward, done, _ = env.step(action)
            agent.perceive(state, order_list[program_order_idx], action,
                           reward, next_state, done)
            state = next_state
            if done:
                break
        # Testing:
        if episode % 100 == 0 and episode > 100:
            ts_reward = 0
            for i in xrange(TEST):
                program_order_idx = 0
                env.set_order(program_order_idx, order_list[program_order_idx])
                state = env.reset()
                for j in xrange(env.spec.timestep_limit):
                    #monitor.render()
                    action = agent.action(state, order_list[program_order_idx]
                                          )  # direct action for test
                    state, reward, done, _ = env.step(action)
                    ts_reward += reward
                    if done:
                        break
            ave_ts_reward = ts_reward / TEST / 200
            tr_reward = 0
            for i in xrange(TEST):
                program_order_idx = np.random.randint(1, 4)
                env.set_order(program_order_idx, order_list[program_order_idx])
                state = env.reset()
                for j in xrange(env.spec.timestep_limit):
                    #monitor.render()
                    action = agent.action(state, order_list[program_order_idx]
                                          )  # direct action for test
                    state, reward, done, _ = env.step(action)
                    tr_reward += reward
                    if done:
                        break
            ave_tr_reward = tr_reward / TEST / 200
            print 'episode: ', episode, 'Unseen Case Average Reward:', ave_ts_reward, 'Training Case Average Reward:', ave_tr_reward
            f = open("pa_logs", "a")
            f.writelines('episode: ' + str(episode) +
                         'Unseen Case Average Reward:' + str(ave_ts_reward) +
                         'Training Case Average Reward:' + str(ave_tr_reward) +
                         "\n")
            f.close()
Example no. 8
File: run.py Project: leopard1/ddpg
    def run(self):
        self.t_train = 0
        self.t_test = 0

        # create filtered environment
        self.env = filter_env.makeFilteredEnv(gym.make(FLAGS.env))
        # self.env = gym.make(FLAGS.env)

        self.env.monitor.start(FLAGS.outdir + '/monitor/', video_callable=lambda _: False)
        gym.logger.setLevel(gym.logging.WARNING)

        dimO = self.env.observation_space.shape
        dimA = self.env.action_space.shape
        print(dimO, dimA)

        import pprint
        pprint.pprint(self.env.spec.__dict__, width=1)

        self.agent = ddpg.Agent(dimO=dimO, dimA=dimA)

        simplelog = open(FLAGS.outdir + '/log.txt', 'w')

        # main loop
        while self.t_train < FLAGS.total:

            # test
            T = self.t_test
            R = []
            while self.t_test - T < FLAGS.test:
                R.append(self.run_episode(test=True, monitor=np.random.binomial(1, FLAGS.monitor)))
                self.t_test += 1
            avr = np.mean(R)
            print('Average test return\t{} after {} episodes of training'.format(avr, self.t_train))
            print >> simplelog, "%d\t%d" % (self.t_train, avr)

            # evaluate required number of episodes for gym and end training when above threshold
            if self.env.spec.reward_threshold is not None and avr > self.env.spec.reward_threshold:
                avr = np.mean([self.run_episode(test=True) for _ in range(self.env.spec.trials)])
                if avr > self.env.spec.reward_threshold:
                    break

            # train
            T = self.t_train
            R = []
            while self.t_train - T < FLAGS.train:
                R.append(self.run_episode(test=False))
                self.t_train += 1
            avr = np.mean(R)
            print('Average training return\t{} after {} episodes of training'.format(avr, self.t_train))

        self.env.monitor.close()
        # upload results
        if FLAGS.upload:
            gym.upload(FLAGS.outdir + "/monitor", algorithm_id=GYM_ALGO_ID)
Example no. 9
def run(args):
    # experiment = "InvertedPendulum-v1"
    env = filter_env.makeFilteredEnv(gym.make(args.game))
    print "reward_threshold:", env.spec.reward_threshold, ", timestep_limit:", env.spec.timestep_limit

    save_dir = './result/%s/monitor/' % args.game
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    # env.monitor.start(save_dir, video_callable=lambda _: False, force=True)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    action_range = (env.action_space.low, env.action_space.high)
    print "action space range:", action_range
    train_dir = "./result/%s/tf/" % args.game
    agent = DDPG(state_dim,
                 action_dim,
                 train_dir=train_dir,
                 gpu_id=args.gpu,
                 dim=args.dim)
    t_train, t_test = 0, 0
    experiment = Experiment(env, agent, args.tmax)
    while True:
        # test
        T = t_test
        R = []
        # env.monitor.start(save_dir, video_callable=lambda _: False, resume=True)
        while t_test - T < args.test:
            r, t = experiment.run_episode(test=True, monitor=(len(R) == 0))
            R.append(r)
            t_test += t
        if len(R) > 0:
            avr = sum(R) / len(R)
            logger.info(
                'Average test return\t{} after {} timesteps of training target: ({})'
                .format(avr, t_train, env.spec.reward_threshold))
        # env.monitor.close()
        # train
        T = t_train
        R = []
        while t_train - T < args.train:
            r, t = experiment.run_episode(test=False)
            R.append(r)
            t_train += t
        if len(R) > 0:
            avr = sum(R) / len(R)
            logger.info(
                'Average train return\t{} after {} timesteps of training'.
                format(avr, t_train))
Example no. 10
def main(args):
    env = filter_env.makeFilteredEnv(gym.make(ENV_NAME))
    agent = DDPG(env)
    # env.monitor.start('experiments/' + ENV_NAME,force=True)
    saver = tf.train.Saver()
    if args.checkpoint:
        saver.restore(agent.sess, args.checkpoint)
        print("Resuming checkpoint from" + args.checkpoint)
    max_reward = -100000
    for episode in range(EPISODES):
        state = env.reset()
        print("episode:", episode)
        # Train
        for step in range(MAX_STEPS):
            action = agent.noise_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.perceive(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        # Testing:
        if episode % 100 == 0 and episode > 100:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()
                for j in range(MAX_STEPS):
                    #env.render()
                    action = agent.action(state)  # direct action for test
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            if ave_reward > max_reward:
                max_reward = ave_reward
                saver.save(
                    agent.sess,
                    "models/ddpg_ep" + str(episode) + "-" + str(ave_reward))
            print('episode: ', episode, 'Evaluation Average Reward:',
                  ave_reward)
            with open("models/ddpg_2.csv", "a") as savefile:
                wr = csv.writer(savefile, dialect="excel")
                wr.writerow([episode, ave_reward])

    env.monitor.close()
Example no. 11
def main(args):
    env = filter_env.makeFilteredEnv(gym.make(ENV_NAME))
    agent = DDPG(env)
    # env.monitor.start('experiments/' + ENV_NAME,force=True)
    saver = tf.train.Saver()
    saver.restore(agent.sess, args.checkpoint)
    print("Resuming checkpoint from" + args.checkpoint)
    max_reward = -100000
    rewards = []
    for i in range(TEST):
        state = env.reset()
        for j in range(MAX_STEPS):
            action = agent.action(state)  # direct action for test
            state, reward, done, _ = env.step(action)
            rewards.append(reward)
            if done:
                break
        print("Average Reward: {}".format(np.mean(rewards)))
Example no. 12
def run(args):
    # experiment = "InvertedPendulum-v1"
    env = filter_env.makeFilteredEnv(gym.make(args.game))
    print "reward_threshold:", env.spec.reward_threshold, ", timestep_limit:", env.spec.timestep_limit

    save_dir = './result/%s/monitor/' % args.game
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    # env.monitor.start(save_dir, video_callable=lambda _: False, force=True)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    action_range = (env.action_space.low, env.action_space.high)
    print "action space range:", action_range
    train_dir = "./result/%s/tf/" % args.game
    agent = DDPG(state_dim, action_dim, train_dir=train_dir,
                 gpu_id=args.gpu, dim=args.dim)
    t_train, t_test = 0, 0
    experiment = Experiment(env, agent, args.tmax)
    while True:
        # test
        T = t_test
        R = []
        # env.monitor.start(save_dir, video_callable=lambda _: False, resume=True)
        while t_test - T < args.test:
            r, t = experiment.run_episode(test=True, monitor=(len(R) == 0))
            R.append(r)
            t_test += t
        if len(R) > 0:
            avr = sum(R) / len(R)
            logger.info(
                'Average test return\t{} after {} timesteps of training target: ({})'
                .format(avr, t_train, env.spec.reward_threshold))
        # env.monitor.close()
        # train
        T = t_train
        R = []
        while t_train - T < args.train:
            r, t = experiment.run_episode(test=False)
            R.append(r)
            t_train += t
        if len(R) > 0:
            avr = sum(R) / len(R)
            logger.info('Average train return\t{} after {} timesteps of training'.format(avr, t_train))
Example no. 13
File: run.py Project: amoliu/ddpg
  def run(self):
    self.t_log = 103
    self.t_global = 0

    # create filtered environment
    self.env = filter_env.makeFilteredEnv(gym.make(FLAGS.env))

    self.env.monitor.start(FLAGS.outdir+'/monitor/',video_callable=lambda _: False)
    gym.logger.setLevel(gym.logging.WARNING)

    dimO = self.env.observation_space.shape
    dimA = self.env.action_space.shape

    print('dimO: '+str(dimO) +'  dimA: '+str(dimA))

    self.agent = ddpg.Agent(dimO=dimO,dimA=dimA)

    returns = []
    t_last_test = 0

    # main loop
    while self.t_global < t_train:

      # test
      t_last_test = self.t_global
      R = np.mean([self.run_episode(test=True,render=render) for _ in range(n_test)])
      returns.append((self.t_global, R))
      np.save(FLAGS.outdir+"/returns.npy",returns)
      print('Average return '+str(R)+ ' after '+str(self.t_global)+' timesteps of training')

      # train
      while self.t_global-t_last_test <  FLAGS.test:
        self.run_episode(test=False)

    self.env.monitor.close()
    # upload results
    if FLAGS.gymkey != '':
      gym.upload(FLAGS.outdir+"/monitor",FLAGS.gymkey)
Example no. 14
def main():
    env = filter_env.makeFilteredEnv(gym.make(ENV_NAME))
    agent = DDPG(env)
    env.monitor.start('experiments/' + ENV_NAME, force=True)

    for episode in xrange(EPISODES):
        state = env.reset()
        # Train
        for step in xrange(env.spec.timestep_limit):
            action = agent.noise_action(state)
            next_state, reward, done, _ = env.step(action)

            agent.perceive(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        # Testing:
        if episode % 100 == 0 and episode > 100:
            total_reward = 0
            min_reward = 1000
            max_reward = 0
            for i in xrange(TEST):
                reward_one = 0
                state = env.reset()
                for j in xrange(env.spec.timestep_limit):
                    #env.render()
                    action = agent.action(state)  # direct action for test
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    reward_one += reward
                    if done:
                        break
                min_reward = min(min_reward, reward_one)
                max_reward = max(max_reward, reward_one)
            ave_reward = total_reward / TEST
            print 'episode: ', episode, 'Evaluation Reward: Average-', ave_reward, "  Min-", min_reward, "  Max-", max_reward
    env.monitor.close()
Example no. 15
def main():
    env = filter_env.makeFilteredEnv(gym.make(ENV_NAME))
    agent = DDPG(env)
    #env.monitor.start('experiments/' + ENV_NAME,force=True)
    env = gym.wrappers.Monitor(env, PATH, force=True)

    returns = []
    rewards = []

    for episode in xrange(EPISODES):
        state = env.reset()
        reward_episode = []
        print "episode:",episode
        # Train
        for step in xrange(env.spec.timestep_limit):
            env.render()
            action = agent.noise_action(state)
            next_state,reward,done,_ = env.step(action)
            #print('state={}, action={}, reward={}, next_state={}, done={}'.format(state, action, reward, next_state, done))
            agent.perceive(state,action,reward,next_state,done)
            state = next_state
            reward_episode.append(reward)
            if done:
                break
Example no. 16
def main():
    env = filter_env.makeFilteredEnv(gym.make(ENV_NAME))
    agent = DDPG(env)
    #env.monitor.start('experiments/' + ENV_NAME,force=True)
    env = gym.wrappers.Monitor(env, PATH, force=True)

    returns = []
    rewards = []

    for episode in xrange(EPISODES):
        state = env.reset()
        reward_episode = []
        print "episode:", episode
        # Train
        for step in xrange(env.spec.timestep_limit):
            env.render()
            action = agent.noise_action(state)
            next_state, reward, done, _ = env.step(action)
            #print('state={}, action={}, reward={}, next_state={}, done={}'.format(state, action, reward, next_state, done))
            agent.perceive(state, action, reward, next_state, done)
            state = next_state
            reward_episode.append(reward)
            if done:
                break

        plt.figure(3)
        plt.plot(reward_episode)
        plt.show()

        # Testing:
        #if episode % 1 == 0:
        if episode % 10 == 0 and episode > 50:
            agent.save_model(PATH, episode)

            total_return = 0
            ave_reward = 0
            for i in xrange(TEST):
                state = env.reset()
                reward_per_step = 0
                for j in xrange(env.spec.timestep_limit):
                    #env.render()
                    action = agent.action(state)  # direct action for test
                    state, reward, done, _ = env.step(action)
                    total_return += reward
                    if done:
                        break
                    reward_per_step += (reward - reward_per_step) / (j + 1)
                ave_reward += reward_per_step

            ave_return = total_return / TEST
            ave_reward = ave_reward / TEST
            returns.append(ave_return)
            rewards.append(ave_reward)

            plt.figure(1)
            plt.plot(returns)
            plt.figure(2)
            plt.plot(rewards)

            plt.show()

            print 'episode: ', episode, 'Evaluation Average Return:', ave_return, '  Evaluation Average Reward: ', ave_reward
    env.close()
Example no. 17
import filter_env
import rospy
from rl_agent_environment_communication.srv import *
import cv2
from cv_bridge import CvBridge
import gym
import numpy as np

ENV_NAME = 'LunarLanderContinuous-v2'
#ENV_NAME = 'Pendulum-v0'

DEBUG_SERVICES_MODE = False

env = filter_env.makeFilteredEnv(gym.make(ENV_NAME))
#env = gym.wrappers.Monitor(env, 'experiments/' + ENV_NAME,force=True)
state = env.reset()


def handle_environment_reset(req):
    print 'Resetting Env...'
    state = env.reset()

    resp = ResetEnvSrvResponse()
    resp.state = state
    return resp


def handle_environment_render(req):
    print 'Rendering Env...'
    image_state = env.render(mode='rgb_array')
    print 'ENV RENDERED!'
Example no. 18
def main():
    # tensorflow session
    sess = tf.InteractiveSession()

    # set agents per each particle
    envs = np.zeros(n_particle, dtype=object)
    agents = np.zeros(n_particle, dtype=object)
    states = np.zeros(n_particle, dtype=object)
    dones = np.zeros(n_particle, dtype=bool)
    actor_nets = np.zeros(n_particle, dtype=object)
    actor_pg_list = np.zeros(n_particle, dtype=object)
    ave_reward = np.zeros(n_particle, dtype=float)
    for i in range(n_particle):
        envs[i] = filter_env.makeFilteredEnv(gym.make(ENV_NAME))
        agents[i] = DDPG(sess, envs[i], i)
        dones[i] = False
        actor_nets[i] = agents[i].actor_network.net
        actor_pg_list[i] = agents[i].actor_network.pg_list
    actor_nets = np.array(list(actor_nets))
    actor_pg_list = np.array(list(actor_pg_list))
    svpg = SVPG(sess, actor_nets, actor_pg_list,
                envs[0].observation_space.shape[0],
                envs[0].action_space.shape[0], independent_flag)

    # session initialization and target NN update
    sess.run(tf.global_variables_initializer())
    for par in range(n_particle):
        agents[par].update_target()

    for episode in xrange(EPISODES):
        for par in range(n_particle):
            states[par] = envs[par].reset()
        # Train
        for par in range(n_particle):
            dones[par] = False
        for step in xrange(envs[0].spec.timestep_limit):
            flag = 0
            for par in range(n_particle):
                if not dones[par]:
                    action = agents[par].noise_action(states[par])
                    next_state, reward, done, _ = envs[par].step(action)
                    agents[par].save_to_buffer(states[par], action, reward,
                                               next_state, done)
                    states[par] = next_state
                    if done:
                        dones[par] = True
                    if agents[par].can_train():
                        flag += 1
            if (flag == n_particle):
                for par in range(n_particle):
                    # train critic NN and get policy gradient
                    agents[par].train()
                # svpg
                svpg.run()
                for par in range(n_particle):
                    agents[par].update_target()

        # Testing:
        if episode % 100 == 0 and episode > 100:
            for par in range(n_particle):
                total_reward = 0
                for i in xrange(TEST):
                    state = envs[par].reset()
                    for j in xrange(envs[0].spec.timestep_limit):
                        action = agents[par].action(
                            state)  # direct action for test
                        state, reward, done, _ = envs[par].step(action)
                        total_reward += reward
                        if done:
                            break
                ave_reward[par] = total_reward / TEST
            print 'episode: ', episode, 'Evaluation Average Reward:', np.max(
                ave_reward)
            if np.max(ave_reward) > 950.0:
                print 'solved'
                exit(1)
Example no. 19
    def __init__(self,
                 environment = 'MountainCarContinuous-v0',
                 # environment = 'InvertedPendulum-v1',
                 ):

        self.gamma = 0.99
        lr = 1e-3 #learning rate

        self.sess = tf.InteractiveSession()
        self.l1 = 100  #neurons layer 1
        self.l2 = 100  #neurons layer 2

        self.step = 0  # number of SGD-steps already taken

        self.summaries_dir = './logging/ddpg'

        replay_memory_size = int(5e5)  # number of transitions to be stored in replay buffer
        self.warmup = 0  # 5e4

        self.train_lengths = []
        self.test_lengths = []

        self.replay_memory = deque(maxlen=replay_memory_size)

        # environment specific:
        self.env_f = filter_env.makeFilteredEnv(gym.make(environment))
        self.select_env = environment

        self.num_outputs = 1
        self.action_dim = self.env_f.action_space.shape[0]
        self.state_dim = self.env_f.observation_space.shape[0]

        print('state dim', self.state_dim)
        print('action dim', self.action_dim)

        self.batch_size = 32
        self.samples_count = 0
        self.ou_process = ornstein_uhlenbeck(ndim= 1, theta= 0.15, sigma= .2, delta_t= 1)


        ####### Initialize the Networks: ######

        self.state = tf.placeholder(tf.float32, [None, self.state_dim], name='x-states')
        self.action = tf.placeholder(tf.float32, [None, self.action_dim], name='x-action')

        neurons_layer1 = 200
        neurons_layer2 = 200
        theta_hidden = naf_net.theta_hidden(self.state_dim, neurons_layer1, neurons_layer2)
        self.hidden_out, _ = naf_net.hidden_layers(self.state, theta_hidden, name= 'hidden_net')

        theta_v = naf_net.theta_fc(neurons_layer2, 1)
        self.V, _ = naf_net.fc_layer(self.hidden_out, theta_v, tf.identity, 'v_layer')

        theta_l = naf_net.theta_fc(neurons_layer2, int(self.action_dim*(self.action_dim+1)/2))
        l, _ = naf_net.fc_layer(self.hidden_out, theta_l, tf.identity,'l_layer')

        theta_mu = naf_net.theta_fc(neurons_layer2, self.action_dim)
        # theta_mu = [tf.Variable(tf.random_uniform((neurons_layer2,self.action_dim),-1e-3, 1e-3), name='1w'),
        #             tf.Variable(tf.random_uniform([self.action_dim],-1e-3, 1e-3), name='1b')]
        self.mu, _ = naf_net.fc_layer(self.hidden_out, theta_mu, tf.tanh, 'mu_layer')

        #prime net:

        self.state_prime = tf.placeholder(tf.float32, [None, self.state_dim], name='x-states_prime')

        theta_hidden_prime, update_hidden = naf_net.exponential_moving_averages(theta_hidden, 0.001)

        self.hidden_out_prime, _ = naf_net.hidden_layers(self.state_prime, theta_hidden_prime, name= 'hidden_net_prime')

        theta_v_prime, update_theta_v = naf_net.exponential_moving_averages(theta_v, 0.001)
        V_prime, _ = naf_net.fc_layer(self.hidden_out_prime, theta_v_prime, tf.identity, 'v_prime_layer')


        #creating the P matrix:
        pivot = 0
        rows = []
        for idx in xrange(self.action_dim):
            count = self.action_dim - idx

            diag_elem = tf.exp(tf.slice(l, (0, pivot), (-1, 1)))
            non_diag_elems = tf.slice(l, (0, pivot + 1), (-1, count - 1))
            row = tf.pad(tf.concat(1, (diag_elem, non_diag_elems)), ((0, 0), (idx, 0)))
            rows.append(row)

            pivot += count

        L = tf.transpose(tf.pack(rows), (1, 2, 0))
        P = tf.batch_matmul(L, tf.transpose(L, (0, 2, 1)))

        tmp = tf.expand_dims(self.action - self.mu, -1)
        A = -tf.batch_matmul(tf.transpose(tmp, (0, 2, 1)), tf.batch_matmul(P, tmp)) / 2
        A = tf.reshape(A, [-1, 1])

        with tf.name_scope('Q'):
            self.Q = A + self.V

        with tf.name_scope('optimization'):
            self.rew = tf.placeholder(tf.float32, [None, self.action_dim], name='reward')

            V_prime_stopped = tf.stop_gradient(V_prime)
            q_target = self.rew + self.gamma*V_prime_stopped
            self.loss = tf.reduce_mean(tf.squared_difference(tf.squeeze(q_target), tf.squeeze(self.Q)), name='td_error_loss')

            optimizer = tf.train.AdamOptimizer(learning_rate=lr)
            grads_and_vars = optimizer.compute_gradients(self.loss, var_list=theta_hidden + theta_v + theta_mu + theta_l)

            with tf.control_dependencies([update_hidden, update_theta_v]):
                self.train_step = optimizer.apply_gradients(grads_and_vars)

        # logging
        log_obs = [] if self.state_dim > 20 else [tf.histogram_summary("obs/" + str(i), self.state[:, i]) for i in
                                                  range(self.state_dim)]
        log_act = [] if self.action_dim > 20 else [tf.histogram_summary("act/inf" + str(i), self.mu[:, i]) for i in
                                                   range(self.action_dim)]
        log_act2 = [] if self.action_dim > 20 else [tf.histogram_summary("act/train" + str(i), self.action[:, i]) for
                                                    i in range(self.action_dim)]

        log_grad = [plotting.grad_histograms(grads_and_vars)]

        # self.log_all = tf.merge_summary(log_obs + log_act + log_act2)
        self.log_all =  tf.scalar_summary('mean squared tderror', self.loss)

        plotting.hist_summaries(*list(theta_v_prime + theta_hidden_prime))



        # Merge all the summaries and write them out to /tmp/mnist_logs (by default)
        self.merged = tf.merge_all_summaries()
        self.train_writer = tf.train.SummaryWriter(self.summaries_dir, self.sess.graph)
        tf.initialize_all_variables().run()
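The loop that builds `rows` above constructs, per batch element, a lower-triangular matrix L with an exponentiated diagonal; the NAF advantage is A(x, u) = -1/2 (u - mu(x))^T P(x) (u - mu(x)) with P = L L^T, so that Q = A + V stays quadratic in the action. A small NumPy restatement of that construction for a single sample (illustrative only, same column-by-column ordering of the flat l vector as the TF loop):

import numpy as np


def naf_advantage(l_flat, mu, u):
    # l_flat: the dim*(dim+1)/2 raw outputs of the 'l_layer' for one sample,
    # mu: predicted action, u: action actually taken.
    dim = mu.shape[0]
    L = np.zeros((dim, dim))
    pivot = 0
    for col in range(dim):
        count = dim - col
        L[col, col] = np.exp(l_flat[pivot])                 # positive diagonal
        L[col + 1:, col] = l_flat[pivot + 1:pivot + count]  # below the diagonal
        pivot += count
    P = L.dot(L.T)                                          # positive semi-definite
    d = (u - mu).reshape(-1, 1)
    return float(-0.5 * d.T.dot(P).dot(d))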
Example no. 20
def train():
    # parameter server and worker information
    ps_hosts = np.zeros(FLAGS.ps_hosts_num, dtype=object)
    worker_hosts = np.zeros(FLAGS.worker_hosts_num, dtype=object)
    port_num = FLAGS.st_port_num
    for i in range(FLAGS.ps_hosts_num):
        ps_hosts[i] = str(FLAGS.hostname) + ":" + str(port_num)
        port_num += 1
    for i in range(FLAGS.worker_hosts_num):
        worker_hosts[i] = str(FLAGS.hostname) + ":" + str(port_num)
        port_num += 1
    ps_hosts = list(ps_hosts)
    worker_hosts = list(worker_hosts)
    # Create a cluster from the parameter server and worker hosts.
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
    # Create and start a server for the local task.
    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_index)
    if FLAGS.job_name == "ps":
        server.join()
    elif FLAGS.job_name == "worker":
        device = tf.train.replica_device_setter(
            worker_device="/job:worker/task:%d" % FLAGS.task_index,
            cluster=cluster)

        #tf.set_random_seed(1);
        # env and model call
        env = filter_env.makeFilteredEnv(gym.make(ENV_NAME))
        agent = DDPG(env, device)

        # prepare session
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % FLAGS.task_index,
                    cluster=cluster)):
            global_step = tf.get_variable(
                'global_step', [],
                initializer=tf.constant_initializer(0),
                trainable=False)
            global_step_ph = tf.placeholder(global_step.dtype,
                                            shape=global_step.get_shape())
            global_step_ops = global_step.assign(global_step_ph)
            score = tf.get_variable('score', [],
                                    initializer=tf.constant_initializer(-21),
                                    trainable=False)
            score_ph = tf.placeholder(score.dtype, shape=score.get_shape())
            score_ops = score.assign(score_ph)
            init_op = tf.global_variables_initializer()
            # summary for tensorboard
            tf.summary.scalar("score", score)
            summary_op = tf.summary.merge_all()
            saver = tf.train.Saver()

        sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
                                 global_step=global_step,
                                 logdir=FLAGS.log_dir,
                                 summary_op=summary_op,
                                 saver=saver,
                                 init_op=init_op)

        with sv.managed_session(server.target) as sess:
            agent.set_sess(sess)
            while True:
                if sess.run([global_step])[0] > EPISODES:
                    break
                score = 0
                for ls in range(local_step):
                    state = env.reset()
                    for step in xrange(env.spec.timestep_limit):
                        action = agent.noise_action(state)
                        next_state, reward, done, _ = env.step(action)
                        agent.perceive(state, action, reward, next_state, done)
                        state = next_state
                        if done:
                            break
                for i in xrange(TEST):
                    state = env.reset()
                    for j in xrange(env.spec.timestep_limit):
                        #env.render()
                        action = agent.action(state)  # direct action for test
                        state, reward, done, _ = env.step(action)
                        score += reward
                        if done:
                            break
                sess.run(
                    global_step_ops,
                    {global_step_ph: sess.run([global_step])[0] + local_step})
                sess.run(score_ops, {score_ph: score / TEST / 200})
                print(
                    str(FLAGS.task_index) + "," +
                    str(sess.run([global_step])[0]) + "," +
                    str(score / TEST / 200))
        sv.stop()
        print("Done")
Example no. 21
    def run(self, env):
        self.t_train = 0
        self.t_test = 0

        # create filtered environment
        # self.env = filter_env.makeFilteredEnv(gym.make(FLAGS.env))
        self.env = filter_env.makeFilteredEnv(gym.make(env))

        self.t_elapsed = []

        # self.env = gym.make(FLAGS.env)

        if tf.gfile.Exists(FLAGS.outdir):
            tf.gfile.DeleteRecursively(FLAGS.outdir)
        # self.env.monitor.start(FLAGS.outdir + '/monitor/', video_callable=lambda _: False)
        # gym.logger.setLevel(gym.logging.WARNING)

        dimO = self.env.observation_space.shape
        dimA = self.env.action_space.shape
        print 'observationspace action space',
        print(dimO, dimA)

        import pprint
        pprint.pprint(self.env.spec.__dict__, width=1)

        self.agent = ddpg.Agent(dimO=dimO, dimA=dimA)

        returns = []

        it = 0
        episodelengths = []
        testlengths = []

        if env == 'Reacher-v1':
            self.train_frequency = 1
            test_frequency = 3
            plot_frequency = 1

        if env == 'MountainCarContinuous-v0':
            test_frequency = 10
            plot_frequency = 1
            self.train_frequency = 16

        if env == 'InvertedPendulum-v1':
            test_frequency = 100
            plot_frequency = 300
            self.train_frequency = 1

        print 'using train frequency', self.train_frequency

        # main loop
        while self.t_train < FLAGS.total:

            it += 1

            episodelengths.append(self.run_episode(test=False))

            if it % test_frequency == 0:
                testlengths.append(self.run_episode(test=True))

            if it % plot_frequency == 0:
                print 'avg time for sim step:', np.mean(
                    np.array(self.t_elapsed))
                plotting.plot_episode_lengths(episodelengths)
                plotting.plot_episode_lengths(testlengths)
                # plotting.plot_replay_memory_2d_state_histogramm(self.agent.rm.observations)
                # plotting.plot_learned_mu(self.agent.act_test, self.env)

            # else:
            #     # test
            #     T = self.t_test
            #     R = []
            #
            #     while self.t_test - T < FLAGS.test:
            #         # print 'running test episode'
            #         R.append(self.run_episode(test=True, monitor=(self.t_test - T < FLAGS.monitor * FLAGS.test)))
            #     avr = np.mean(R)
            #     print('Average test return\t{} after {} timesteps of training'.format(avr, self.t_train))
            #     # save return
            #     returns.append((self.t_train, avr))
            #     np.save(FLAGS.outdir + "/returns.npy", returns)
            #
            #     # evaluate required number of episodes for gym and end training when above threshold
            #     if self.env.spec.reward_threshold is not None and avr > self.env.spec.reward_threshold:
            #         avr = np.mean([self.run_episode(test=True) for _ in range(self.env.spec.trials)])
            #         if avr > self.env.spec.reward_threshold:
            #             break
            #
            #     # train
            #     T = self.t_train
            #     R = []
            #     while self.t_train - T < FLAGS.train:
            #         # print 'running train episode'
            #         R.append(self.run_episode(test=False))
            #     avr = np.mean(R)
            #     print('Average training return\t{} after {} timesteps of training'.format(avr, self.t_train))

        # self.env.monitor.close()
        # upload results
        if FLAGS.upload:
            gym.upload(FLAGS.outdir + "/monitor", algorithm_id=GYM_ALGO_ID)
Example no. 22
def main(args):
    if VERBOSE:
        print '***The Replay Buffer currently always returns the most recent experiences (instead of random), so the batches are constant between the tf and torch nets.'

    state_dim = 3
    action_dim = 1

    net = ActorCriticNet(state_dim, action_dim)

    target_net = copy.deepcopy(net)
    memory = ReplayBuffer(REPLAY_BUFFER_SIZE)
    noise = OUNoise(action_dim)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE, weight_decay=L2)
    target_optim = optim.Optimizer(target_net.parameters(),
                                   {})  # to iterate over target params

    if VERBOSE: print '***Making gym env (only used to setup TF net).'

    # load tf net (restoring saved parameters)
    dtf = ddpg_tf.DDPG_TF(filter_env.makeFilteredEnv(gym.make('Pendulum-v0')),
                          loadfilename='tf_params-0',
                          printVars=False)

    if VERBOSE: print '***TF net restore complete.'

    # load control data (only using a every fourth data), and tf net results
    control_states = np.load('control_states.npy')[::4]
    control_rewards = np.load('control_rewards.npy')[::4]
    tf_record = np.load('tf_control_record.npy')

    # replace torch params with tf params, and run control data, collecting torch net results
    # first optimization step will occur at i == 50, upon which extra data is recorded to compare tf and torch
    # using: no bn, REPLAY_BUFFER_SIZE=200, REPLAY_START_SIZE=50, BATCH_SIZE=50, constant replay_buffer_batches (always the most recent experiences)
    replaceNetParams(dtf, net, target_net)

    if VERBOSE: print '***Torch net params initialized to TF net params.'

    original_net = copy.deepcopy(net)  # save original net
    original_target_net = copy.deepcopy(target_net)

    torch_record = []

    loss = -1
    first_step = True

    for i in xrange(len(control_rewards) - 1):
        state = torch.from_numpy(control_states[i].reshape(1,
                                                           state_dim)).float()
        action = net.getAction(Variable(state)).data
        target_action = target_net.getAction(Variable(state)).data

        reward = torch.FloatTensor([[control_rewards[i]]]).float()

        new_state = torch.from_numpy(control_states[i + 1].reshape(
            1, state_dim)).float()

        memory.add(state, action, reward, new_state, True)
        if memory.count() > REPLAY_START_SIZE:
            minibatch = memory.get_batch(BATCH_SIZE)
            state_batch = torch.cat([data[0] for data in minibatch], dim=0)
            action_batch = torch.cat([data[1] for data in minibatch], dim=0)
            reward_batch = torch.cat([data[2] for data in minibatch])
            next_state_batch = torch.cat([data[3] for data in minibatch],
                                         dim=0)
            done_batch = Tensor([data[4] for data in minibatch])

            # calculate y_batch from targets
            #next_action_batch = target_net.getAction(Variable(next_state_batch))
            value_batch = target_net.getValue(Variable(next_state_batch)).data
            y_batch = reward_batch + GAMMA * value_batch * done_batch

            if first_step:
                if VERBOSE: print '***First Optimization Step complete.'
                torch_ys = y_batch
                torch_batch = minibatch
                torch_outs = net.getValue(Variable(state_batch)).data

            # optimize net 1 step
            loss = criterion(net.getValue(Variable(state_batch)),
                             Variable(y_batch))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss = loss.data[0]

            # update targets - using exponential moving averages
            for group, target_group in zip(optimizer.param_groups,
                                           target_optim.param_groups):
                for param, target_param in zip(group['params'],
                                               target_group['params']):
                    target_param.data.mul_(1 - TAU)
                    target_param.data.add_(TAU, param.data)

            if first_step:
                first_step_net = copy.deepcopy(net)
                first_step_target_net = copy.deepcopy(target_net)
                first_step = False

        torch_record.append(
            [action.numpy()[0][0],
             target_action.numpy()[0][0], loss])
        loss = -1

    torch_record = np.array(torch_record)
    torch_outs = torch_outs.numpy().T[0]
    torch_ys = torch_ys.numpy().T[0]

    if VERBOSE: print '***Control Data run complete.'

    # compare torch and tf results
    # results for each net have 3 columns: [net action prediction, target net action prediction, loss (-1 if there was no training)]
    sel = np.arange(45, 55)
    #print calc_error(tf_record[sel,:], torch_record[sel,:])
    print 'Result comparison:'
    print 'control_data_index | tf_net_action | tf_target_net_action | tf_loss | torch_net_action | torch_target_net_action | torch_loss'
    print np.hstack(
        [sel[:, np.newaxis], tf_record[sel, :], torch_record[sel, :]])
    print '\t(a loss of -1 means no training occured in that step)'

    # load all tf results from before taking first optimization step
    tf_ys = np.load('tf_first_step_y_batch.npy')
    tf_rs = np.load('tf_first_step_reward_batch.npy')
    tf_ds = np.load('tf_first_step_done_batch.npy')
    tf_vs = np.load('tf_first_step_value_batch.npy')
    tf_outs = np.load('tf_first_step_output_values.npy')
    torch_wd = 1.36607  # weight decay loss of tf net at first optimization step - recorded directly from terminal output of tf net

    if VERBOSE:
        print '***Comparing first step stats'

        # compare tf and torch data from before taking first optimization step
        # including calculation of manual loss
        print '\terror in ys (between tf and torch)', calc_error(
            torch_ys, tf_ys)
        print '\terror in predictions (between tf and torch)', calc_error(
            torch_outs, tf_outs)
        print '\ttorch loss (manually calculated)', np.mean(
            (torch_ys - torch_outs)**2)
        print '\ttf loss (manually calculated)', np.mean((tf_ys - tf_outs)**2)
        print '\ttorch loss', torch_record[50,
                                           2], '(not including weight decay)'
        print '\ttf loss', tf_record[
            50, 2] - torch_wd, '(not including weight decay)'

    return 0
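The nested loop over optimizer.param_groups in the script above is a soft ("Polyak") target update, theta_target <- (1 - TAU) * theta_target + TAU * theta. Factored out as a standalone helper (a sketch using the same in-place add_ signature as the script, not part of the original code):

def soft_update(net, target_net, tau):
    # Polyak-average online parameters into the target network; relies on
    # both nets (a deepcopy pair here) iterating parameters in the same order.
    for param, target_param in zip(net.parameters(), target_net.parameters()):
        target_param.data.mul_(1.0 - tau)
        target_param.data.add_(tau, param.data)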
Example no. 23
File: run.py Project: febert/DeepRL
    def run(self,env):
        self.t_train = 0
        self.t_test = 0

        # create filtered environment
        # self.env = filter_env.makeFilteredEnv(gym.make(FLAGS.env))
        self.env = filter_env.makeFilteredEnv(gym.make(env))

        self.t_elapsed = []

        # self.env = gym.make(FLAGS.env)

        if tf.gfile.Exists(FLAGS.outdir):
            tf.gfile.DeleteRecursively(FLAGS.outdir)
        # self.env.monitor.start(FLAGS.outdir + '/monitor/', video_callable=lambda _: False)
        # gym.logger.setLevel(gym.logging.WARNING)

        dimO = self.env.observation_space.shape
        dimA = self.env.action_space.shape
        print 'observationspace action space',
        print(dimO, dimA)

        import pprint
        pprint.pprint(self.env.spec.__dict__, width=1)

        self.agent = ddpg.Agent(dimO=dimO, dimA=dimA)

        returns = []

        it = 0
        episodelengths = []
        testlengths = []

        if env == 'Reacher-v1':
            self.train_frequency = 1
            test_frequency = 3
            plot_frequency = 1

        if env == 'MountainCarContinuous-v0':
            test_frequency = 10
            plot_frequency = 1
            self.train_frequency = 16

        if env == 'InvertedPendulum-v1':
            test_frequency = 100
            plot_frequency = 300
            self.train_frequency = 1

        print 'using train frequency', self.train_frequency

        # main loop
        while self.t_train < FLAGS.total:

            it += 1

            episodelengths.append(self.run_episode(test=False))

            if it % test_frequency == 0:
                testlengths.append(self.run_episode(test=True))

            if it % plot_frequency == 0:
                print 'avg time for sim step:', np.mean(np.array(self.t_elapsed))
                plotting.plot_episode_lengths(episodelengths)
                plotting.plot_episode_lengths(testlengths)
                # plotting.plot_replay_memory_2d_state_histogramm(self.agent.rm.observations)
                # plotting.plot_learned_mu(self.agent.act_test, self.env)

            # else:
            #     # test
            #     T = self.t_test
            #     R = []
            #
            #     while self.t_test - T < FLAGS.test:
            #         # print 'running test episode'
            #         R.append(self.run_episode(test=True, monitor=(self.t_test - T < FLAGS.monitor * FLAGS.test)))
            #     avr = np.mean(R)
            #     print('Average test return\t{} after {} timesteps of training'.format(avr, self.t_train))
            #     # save return
            #     returns.append((self.t_train, avr))
            #     np.save(FLAGS.outdir + "/returns.npy", returns)
            #
            #     # evaluate required number of episodes for gym and end training when above threshold
            #     if self.env.spec.reward_threshold is not None and avr > self.env.spec.reward_threshold:
            #         avr = np.mean([self.run_episode(test=True) for _ in range(self.env.spec.trials)])
            #         if avr > self.env.spec.reward_threshold:
            #             break
            #
            #     # train
            #     T = self.t_train
            #     R = []
            #     while self.t_train - T < FLAGS.train:
            #         # print 'running train episode'
            #         R.append(self.run_episode(test=False))
            #     avr = np.mean(R)
            #     print('Average training return\t{} after {} timesteps of training'.format(avr, self.t_train))

        # self.env.monitor.close()
        # upload results
        if FLAGS.upload:
            gym.upload(FLAGS.outdir + "/monitor", algorithm_id=GYM_ALGO_ID)
Example no. 24
        action = np.clip(np.round(action), self._env.action_space.low, self._env.action_space.high)
        # action as integer id
        action = int("".join([str(int(a)) for a in action]), 2)
        vector_length = len(self._env.action_space.high)
        if action > 0:
            timestamp = long(time.time() * 1000) - 874724710
            observed_items = gl.SFrame({'item_id': [action], 'timestamp': [timestamp], 'prev_item': [state]})

            nearest_neighbors = k if k is not None else FLAGS.knn
            k_interactions = self._model.recommend_from_interactions(observed_items, k=nearest_neighbors, diversity=1)

            items = ["{0:0{1}b}".format(item[0], vector_length) for item in k_interactions[["item_id"]].to_numpy()]
        else:
            items = ["00000000000"]

        item_vectors = []
        for item in items:
            item_vectors.append(np.array([float(item[i]) for i in range(vector_length)]))

        return item_vectors

if __name__ == '__main__':
    # env = filter_env.makeFilteredEnv(gym.make("InvertedDoublePendulum-v1"))
    env = filter_env.makeFilteredEnv(gym.make("CollaborativeFiltering-v3"))
    x = FMPolicy(env)

    obs = env.reset()
    cont_action = env.action_space.sample()
    print('==Action in continuous space: {}'.format(cont_action))
    result = x.g(cont_action)
    print(result)
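Example no. 24 turns a continuous DDPG action into a discrete item id by clipping and rounding it to a bit vector and reading that as a binary number, then decodes recommended item ids back into bit vectors with "{0:0{1}b}".format(...). The round trip in isolation (a standalone sketch of just the encoding shown above, assuming each action dimension clips to {0, 1}):

import numpy as np


def encode_action(action, low, high):
    # continuous action -> integer item id via its rounded bit vector
    bits = np.clip(np.round(action), low, high)
    return int("".join(str(int(b)) for b in bits), 2)


def decode_item(item_id, vector_length):
    # integer item id -> bit vector of the given length
    return np.array([float(c) for c in "{0:0{1}b}".format(item_id, vector_length)])

# e.g. with an 11-dimensional {0, 1} action space:
# item = encode_action(np.random.uniform(0, 1, 11), np.zeros(11), np.ones(11))
# bits = decode_item(item, 11)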
Example no. 25
  def run(self):
    self.t_train = 0
    self.t_test = 0

    # create filtered environment
    self.env = filter_env.makeFilteredEnv(gym.make(FLAGS.env), skip_space_norm=FLAGS.skip_space_norm,
                                          wolpertinger=FLAGS.wolpertinger)
    # self.env = gym.make(FLAGS.env)
    
    self.env.monitor.start(FLAGS.outdir+'/monitor/',video_callable=lambda _: False)
    # self.env.monitor.start(FLAGS.outdir+'/monitor/',video_callable=lambda _: True)
    gym.logger.setLevel(gym.logging.WARNING)

    dimO = self.env.observation_space.shape
    dimA = self.env.action_space.shape
    print(dimO,dimA)

    import pprint
    pprint.pprint(self.env.spec.__dict__,width=1)

    wolp = None
    if FLAGS.wolpertinger:
        wolp = wp.Wolpertinger(self.env, i=FLAGS.wp_total_actions,
                               action_set=wp.load_action_set(FLAGS.wp_action_set_file,
                                                             i=FLAGS.wp_total_actions, action_shape=dimA[0])
                               ).g
    elif FLAGS.fmpolicy:
        wolp = fmp.FMPolicy(self.env).g

    self.agent = ddpg.Agent(dimO=dimO, dimA=dimA, custom_policy=FLAGS.wolpertinger or FLAGS.fmpolicy,
                            env_dtype=str(self.env.action_space.high.dtype))

    returns = []

    # main loop
    while self.t_train < FLAGS.total:

      # test
      T = self.t_test
      R = []
      if self.t_train - T > 0 or FLAGS.train == 0:
        while self.t_test - T < FLAGS.test:
          R.append(self.run_episode(test=True, monitor=(self.t_test - T < FLAGS.monitor * FLAGS.test), custom_policy=wolp))
          self.t_test += 1
        avr = np.mean(R)
        # print('Average test return\t{} after {} timesteps of training'.format(avr,self.t_train))
        with open(os.path.join(FLAGS.outdir, "output.log"), mode='a') as f:
          # f.write('Average test return\t{} after {} timesteps of training\n'.format(avr, self.t_train))
          f.write('Average test return\t{} after {} timesteps\n'.format(avr, self.t_train + FLAGS.test))
        # save return
        returns.append((self.t_train, avr))
        np.save(FLAGS.outdir+"/returns.npy",returns)

        s = self.agent.checkpoint_session()
        with open(os.path.join(FLAGS.outdir, "output.log"), mode='a') as f:
            f.write('Checkpoint saved at {} \n'.format(s))

        # evaluate required number of episodes for gym and end training when above threshold
        if self.env.spec.reward_threshold is not None and avr > self.env.spec.reward_threshold:
          # TODO: it is supposed that when testing the model does not have to use the full wolpertinger policy?
          # TODO: to avoid the item not found exception in environment, custom policy is being sent to the run_episode
          avr = np.mean([self.run_episode(test=True, custom_policy=wolp) for _ in range(self.env.spec.trials)]) # trials???
          # print('TRIALS => Average return{}\t Reward Threshold {}'.format(avr, self.env.spec.reward_threshold))
          with open(os.path.join(FLAGS.outdir, "output.log"), mode='a') as f:
            f.write('TRIALS => Average return{}\t Reward Threshold {}\n'.format(avr, self.env.spec.reward_threshold))
          if avr > self.env.spec.reward_threshold:
            s = self.agent.checkpoint_session()
            with open(os.path.join(FLAGS.outdir, "output.log"), mode='a') as f:
                f.write('Final Checkpoint saved at {} \n'.format(s))
            break

      # train
      T = self.t_train
      R = []
      start_time = time.time()
      while self.t_train - T < FLAGS.train:
        R.append(self.run_episode(test=False, custom_policy=wolp))
        self.t_train += 1
      end_time = time.time()
      avr = np.mean(R)
      # print('Average training return\t{} after {} timesteps of training'.format(avr,self.t_train))
      with open(os.path.join(FLAGS.outdir, "output.log"), mode='a') as f:
        f.write('Average training return\t{} after {} timesteps of training. Batch time: {} sec.\n'
                .format(avr, self.t_train, end_time - start_time))

    self.env.monitor.close()
    f.close()
    # upload results
    if FLAGS.upload:
      gym.upload(FLAGS.outdir+"/monitor",algorithm_id = GYM_ALGO_ID)