Example 1
    def run(self):
        self.train_timestep = 0
        self.test_timestep = 0

        # create the normalized environment
        self.env = normalized_env.make_normalized_env(gym.make(FLAGS.env))
        tf.set_random_seed(FLAGS.tfseed)
        np.random.seed(FLAGS.npseed)
        self.env.monitor.start(os.path.join(FLAGS.outdir, 'monitor'),
                               force=FLAGS.force)
        self.env.seed(FLAGS.gymseed)
        gym.logger.setLevel(gym.logging.WARNING)

        dimO = self.env.observation_space.shape
        dimA = self.env.action_space.shape
        pprint.pprint(self.env.spec.__dict__)

        self.agent = Agent(dimO, dimA=dimA)
        test_log = open(os.path.join(FLAGS.outdir, 'test.log'), 'w')
        train_log = open(os.path.join(FLAGS.outdir, 'train.log'), 'w')

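        # Alternate between evaluation rollouts and training episodes until the
        # training-timestep budget FLAGS.total is exhausted.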
        while self.train_timestep < FLAGS.total:
            # test
            reward_list = []
            for _ in range(FLAGS.test):
                reward, timestep = self.run_episode(
                    test=True, monitor=np.random.rand() < FLAGS.monitor)
                reward_list.append(reward)
                self.test_timestep += timestep
            avg_reward = np.mean(reward_list)
            print(
                'Average test return {} after {} timestep of training.'.format(
                    avg_reward, self.train_timestep))
            test_log.write("{}\t{}\n".format(self.train_timestep, avg_reward))
            test_log.flush()

            # train
            reward_list = []
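            # Run training episodes until train_timestep crosses the next multiple
            # of FLAGS.train, i.e. one training window between evaluations.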
            last_checkpoint = np.floor(self.train_timestep / FLAGS.train)
            while np.floor(self.train_timestep /
                           FLAGS.train) == last_checkpoint:
                print('=== Running episode')
                reward, timestep = self.run_episode(test=False, monitor=False)
                reward_list.append(reward)
                self.train_timestep += timestep
                train_log.write("{}\t{}\n".format(self.train_timestep, reward))
                train_log.flush()
            avg_reward = np.mean(reward_list)
            print('Average train return {} after {} timestep of training.'.
                  format(avg_reward, self.train_timestep))

            os.system('{} {}'.format(plotScr, FLAGS.outdir))

        self.env.monitor.close()
        os.makedirs(os.path.join(FLAGS.outdir, "tf"))
        ckpt = os.path.join(FLAGS.outdir, "tf/model.ckpt")
        self.agent.saver.save(self.agent.sess, ckpt)
Example 2
    def run(self):
        self.train_timestep = 0
        self.test_timestep = 0

        # create the normalized environment
        self.env = normalized_env.make_normalized_env(gym.make(FLAGS.env))
        tf.set_random_seed(FLAGS.tfseed)
        np.random.seed(FLAGS.npseed)
        self.env.monitor.start(os.path.join(FLAGS.outdir, 'monitor'), force=FLAGS.force)
        self.env.seed(FLAGS.gymseed)
        gym.logger.setLevel(gym.logging.WARNING)

        dimO = self.env.observation_space.shape
        dimA = self.env.action_space.shape
        print(dimO, dimA)
        pprint.pprint(self.env.spec.__dict__)

        self.agent = Agent(dimO, dimA=dimA)
        simple_log_file = open(os.path.join(FLAGS.outdir, 'log.txt'), 'w')

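        # Alternate between evaluation and training episodes until FLAGS.total
        # training timesteps have been collected.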
        while self.train_timestep < FLAGS.total:

            # test
            reward_list = []
            for _ in range(FLAGS.test):
                reward, timestep = self.run_episode(test=True, monitor=np.random.rand() < FLAGS.monitor)
                reward_list.append(reward)
                self.test_timestep += timestep
            avg_reward = np.mean(reward_list)
            print('Average test return {} after {} timestep of training.'.format(avg_reward, self.train_timestep))
            simple_log_file.write("{}\t{}\n".format(self.train_timestep, avg_reward))

            # train
            reward_list = []
            last_checkpoint = self.train_timestep // FLAGS.train
            while self.train_timestep // FLAGS.train == last_checkpoint:
                reward, timestep = self.run_episode(test=False, monitor=False)
                reward_list.append(reward)
                self.train_timestep += timestep
            avg_reward = np.mean(reward_list)
            print('Average train return {} after {} timestep of training.'.format(avg_reward, self.train_timestep))

        self.env.monitor.close()
Example 3
    def run(self):
        self.train_timestep = 0
        self.test_timestep = 0

        # create the normalized environment
        maze_def = {'type': FLAGS.maze}
        self.env = normalized_env.make_normalized_env(
            Minecraft(maze_def,
                      reset=FLAGS.reset,
                      grayscale=False,
                      vision_observation=FLAGS.vision,
                      video_dim=(FLAGS.height, FLAGS.width),
                      num_parallel=FLAGS.num_parallel)
        )  # normalized_env.make_normalized_env(gym.make(FLAGS.env))
        tf.set_random_seed(FLAGS.tfseed)
        np.random.seed(FLAGS.npseed)
        #self.env.monitor.start(os.path.join(FLAGS.outdir, 'monitor'), force=FLAGS.force)
        #self.env.seed(FLAGS.gymseed)
        gym.logger.setLevel(gym.logging.WARNING)

        dimO = self.env.observation_space.shape
        dimA = self.env.action_space.shape
        print(dimO, dimA)
        #pprint.pprint(self.env.spec.__dict__)

        self.agent = Agent(dimO, dimA=dimA)
        simple_log_file = open(os.path.join(FLAGS.outdir, 'log.txt'), 'a')
        # Save the command line arguments and the current git revision for reproducibility
        git_hash = subprocess.check_output(
            ['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
        simple_log_file.write(" ".join(sys.argv + [git_hash]) + "\n")

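        # avg_rewards accumulates the average test return of each evaluation round
        # and feeds the early-stopping check inside the loop below.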
        avg_rewards = []
        while self.train_timestep < FLAGS.total:

            # test
            reward_list = []
            for _ in range(FLAGS.test):
                reward, timestep = self.run_episode(
                    test=True, monitor=np.random.rand() < FLAGS.monitor)
                reward_list.append(reward)
                self.test_timestep += timestep
            avg_reward = np.mean(reward_list)
            avg_rewards.append(avg_reward)
            print(
                'Average test return {} after {} timestep of training.'.format(
                    avg_reward, self.train_timestep))
            #print >> simple_log_file, "{}\t{}\t{}\t{}\t{}".format(self.train_timestep, avg_reward, np.std(reward_list), np.min(reward_list), np.max(reward_list))
            simple_log_file.write("{}\t{}\t{}\t{}\t{}\n".format(
                self.train_timestep, avg_reward, np.std(reward_list),
                np.min(reward_list), np.max(reward_list)))
            simple_log_file.flush()
            # Stopping criterion: stop early once training exceeds 5e5 timesteps,
            # more than 10 evaluation rounds have run, and the variance of the
            # average test returns has dropped below 1.
            if self.train_timestep > 5e5 and len(avg_rewards) > 10 and np.var(
                    avg_rewards) < 1:
                break

            # train
            reward_list = []
            last_checkpoint = self.train_timestep // FLAGS.train
            while self.train_timestep // FLAGS.train == last_checkpoint:
                reward, timestep = self.run_episode(test=False, monitor=False)
                reward_list.append(reward)
                self.train_timestep += timestep
            avg_reward = np.mean(reward_list)
            print('Average train return {} after {} timestep of training.'.
                  format(avg_reward, self.train_timestep))
Example 4
    def run(self):
        Agents = [Agent1, Agent2, Agent3, Agent4, Agent5]
        rd_seeds = [8, 15, 20, 35]
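        # Train up to four agents with different random seeds; if FLAGS.i is nonzero,
        # that agent/seed index is used on every iteration instead.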
        for i in range(4):
            if FLAGS.i != 0:
                i = FLAGS.i
            self.train_timestep = 0
            self.test_timestep = 0

            # create the normalized environment
            self.env = normalized_env.make_normalized_env(gym.make(FLAGS.env))
            tf.set_random_seed(rd_seeds[i])
            np.random.seed(rd_seeds[i])
            #self.env.monitor.start(os.path.join(FLAGS.outdir, 'monitor'), force=FLAGS.force)
            # self.env = gym.wrappers.Monitor(self.env,os.path.join(FLAGS.outdir, 'monitor'),force=True)
            self.env.seed(rd_seeds[i])
            gym.logger.set_level(logging.WARNING)

            dimO = self.env.observation_space.shape
            dimA = self.env.action_space.shape
            pprint.pprint(self.env.spec.__dict__)
            if FLAGS.model == "ICNN" or FLAGS.model == "ICNN_ARCH":
                print("Yes,ICNN!")
                self.agent = Agents[i](dimO=dimO, dimA=dimA)
            else:
                self.agent = Agents[i](dimO=dimO,
                                       dimA=dimA,
                                       num_layer=FLAGS.num_layer,
                                       num_nodes=FLAGS.num_nodes)
            test_log = open(os.path.join(FLAGS.outdir, 'test.log'), 'w')
            train_log = open(os.path.join(FLAGS.outdir, 'train.log'), 'w')
            x_offline = []
            y_offline = []
            x_online = []
            y_online = []
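            # (timestep, return) curves: *_offline from test episodes, *_online from
            # training episodes; both are saved with np.save at the end of each run.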
            reward_best = -2000
            while self.train_timestep <= FLAGS.total:
                # test
                reward_list = []
                for _ in range(FLAGS.test):
                    reward, timestep = self.run_episode(
                        test=True, monitor=np.random.rand() < FLAGS.monitor)
                    reward_list.append(reward)
                    self.test_timestep += timestep
                    if reward > reward_best:
                        reward_best = reward
                avg_reward = np.mean(reward_list)
                print('Average test return {} after {} timestep of training.'.
                      format(avg_reward, self.train_timestep))
                x_offline.append(self.train_timestep)
                y_offline.append(avg_reward)
                #test_log.write("{}\t{}\n".format(self.train_timestep, avg_reward))
                #test_log.flush()
                # train
                reward_list = []
                last_checkpoint = np.floor(self.train_timestep / FLAGS.train)
                while np.floor(self.train_timestep /
                               FLAGS.train) == last_checkpoint:
                    #print('=== Running episode')
                    reward, timestep = self.run_episode(test=False,
                                                        monitor=False)
                    reward_list.append(reward)
                    self.train_timestep += timestep
                #train_log.write("{}\t{}\n".format(self.train_timestep, reward))
                #train_log.flush()
                avg_reward = np.mean(reward_list)
                print('Average train return {} after {} timestep of training.'.
                      format(avg_reward, self.train_timestep))
                x_online.append(self.train_timestep)
                y_online.append(avg_reward)
            #os.system('{} {}'.format(plotScr, FLAGS.outdir))

            self.env.close()
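            # Choose the output sub-directory and hyper-parameter tag (parval)
            # used in the file names of the saved learning curves.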
            if FLAGS.parseq == 0:
                subadr = "pl2norm/"
                parval = FLAGS.pl2norm
            elif FLAGS.parseq == 1:
                subadr = "rate/"
                parval = FLAGS.rate
            elif FLAGS.parseq == 2:
                subadr = "prate/"
                parval = FLAGS.prate
            elif FLAGS.parseq == 4:
                subadr = "mix/"
                if FLAGS.rate == 0.0005 and FLAGS.prate == 0.00005:
                    parval = 1
                elif FLAGS.rate == 0.0001 and FLAGS.prate == 0.00001:
                    parval = 2
                elif FLAGS.rate == 0.001 and FLAGS.prate == 0.0001:
                    parval = 3
                elif FLAGS.l2norm == 0.0001 and FLAGS.pl2norm == 0:
                    parval = 4
                elif FLAGS.l2norm == 0.00001 and FLAGS.pl2norm == 0.0005:
                    parval = 5
                elif FLAGS.l2norm == 0.00005 and FLAGS.pl2norm == 0.001:
                    parval = 6
            if FLAGS.env == "HalfCheetah-v2":
                env_addr = "HalfCheetah/" + subadr
            elif FLAGS.env == "Pendulum-v0":
                env_addr = "Pendulum/" + subadr
            elif FLAGS.env == "Reacher-v2":
                env_addr = "Reacher/"
            elif FLAGS.env == "MountainCarContinuous-v0":
                env_addr = "MCContinuous/"
            #ckpt=FLAGS.outdir+"/"+env_addr+"tf/"+FLAGS.model+"_"+FLAGS.arch+str(parval)+"_"+str(i)+".ckpt"
            # os.makedirs(os.path.join(ckpt_addr, "tf"))
            # self.agent.saver.save(self.agent.sess, ckpt)
            x_offline.append(reward_best)
            print('Saving ckpt at {} timesteps.'.format(self.train_timestep))
            print("Best Reward:{}".format(reward_best))
            np.save(
                FLAGS.outdir + '/plots/' + env_addr + FLAGS.model + '_xon' +
                FLAGS.arch + str(parval) + "_" + str(i), x_online)
            np.save(
                FLAGS.outdir + '/plots/' + env_addr + FLAGS.model + '_xoff' +
                FLAGS.arch + str(parval) + "_" + str(i), x_offline)
            np.save(
                FLAGS.outdir + '/plots/' + env_addr + FLAGS.model + '_yon' +
                FLAGS.arch + str(parval) + "_" + str(i), y_online)
            np.save(
                FLAGS.outdir + '/plots/' + env_addr + FLAGS.model + '_yoff' +
                FLAGS.arch + str(parval) + "_" + str(i), y_offline)
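            # Reset the TensorFlow default graph before building the next agent.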
            tf.reset_default_graph()
            tf.Graph().as_default()