Example #1
import gym
import numpy as np
import tensorflow as tf  # TF 1.x API (tf.Session)

# PolicyGradient and set_config are defined elsewhere in this project.

def run_PG():
    env = gym.make('CartPole-v0')
    env = env.unwrapped
    env.seed(1229)
    sess = tf.Session(config=set_config())
    n_features = env.observation_space.shape[0]
    n_actions = env.action_space.n

    PG = PolicyGradient(sess, n_features, n_actions, lr=0.02, gamma=0.99, num_units=10)

    result = []  # total reward of each episode
    episodes = 1000
    for e in range(episodes):
        print("%d/%d" %(e+1, episodes))
        obs = env.reset()
        episode_reward = 0
        ss, aa, rr = [], [], []  # per-episode observations, actions, and shaped rewards
        t = 0
        while True:
            #if e % 100 == 0:
            #    env.render()
            
            a = PG.choose_action(obs)
            obs_, true_r, done, info = env.step(a)
            episode_reward += true_r

            # Reward shaping: favor keeping the cart near the center and the pole upright.
            x, x_dot, theta, theta_dot = obs_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            r = r1 + r2
            t += 1

            ss.append(obs)
            aa.append(a)
            rr.append(r)

            if done or t >= 40000:  # cap episode length, since the unwrapped env has no time limit
                vt = PG.learn(ss, aa, rr)
                result.append(episode_reward)
                print("Reward:", episode_reward)
                break

            obs = obs_

        avg_reward = np.mean(result[-30:])
        print("Mean reward over last 30 episodes: %f" % avg_reward)

    env.close()
    np.save('../result/Cartpole_PG_result.npy', result)
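
Example #1 assumes a REINFORCE-style PolicyGradient class that exposes choose_action(obs) and an episode-level learn(ss, aa, rr). That class is not shown; below is a minimal sketch of such an interface under the TF 1.x API, with an illustrative two-layer network. Only the constructor signature mirrors the call above; the rest is an assumption, not the original implementation.

import numpy as np
import tensorflow as tf  # TF 1.x API


class PolicyGradient:
    def __init__(self, sess, n_features, n_actions, lr=0.02, gamma=0.99, num_units=10):
        self.sess, self.gamma = sess, gamma
        self.obs_ph = tf.placeholder(tf.float32, [None, n_features])
        self.act_ph = tf.placeholder(tf.int32, [None])
        self.vt_ph = tf.placeholder(tf.float32, [None])
        hidden = tf.layers.dense(self.obs_ph, num_units, tf.nn.tanh)
        logits = tf.layers.dense(hidden, n_actions)
        self.probs = tf.nn.softmax(logits)
        # REINFORCE loss: negative log-prob of the taken action, weighted by the return.
        neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=self.act_ph)
        self.train_op = tf.train.AdamOptimizer(lr).minimize(
            tf.reduce_mean(neg_log_prob * self.vt_ph))
        self.sess.run(tf.global_variables_initializer())

    def choose_action(self, obs):
        # Sample an action from the current policy distribution.
        p = self.sess.run(self.probs, {self.obs_ph: obs[np.newaxis, :]})[0]
        p = np.asarray(p, dtype=np.float64)
        return np.random.choice(len(p), p=p / p.sum())

    def learn(self, ss, aa, rr):
        # Discount and normalize the episode rewards, then take one gradient step.
        vt = np.zeros(len(rr), dtype=np.float32)
        running = 0.0
        for t in reversed(range(len(rr))):
            running = running * self.gamma + rr[t]
            vt[t] = running
        vt = (vt - vt.mean()) / (vt.std() + 1e-8)
        self.sess.run(self.train_op, {self.obs_ph: np.vstack(ss),
                                      self.act_ph: np.array(aa),
                                      self.vt_ph: vt})
        return vt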
Example #2
    def __init__(self, log_dir, log_rate, kid_dir, draw_graph, command,
                 sync_period, learning_rate, rd, selection, pop_size, bfactor,
                 mutate_decay, mut_frac):
        self.log_dir = log_dir
        self.log_rate = log_rate
        self.kid_dir = kid_dir
        self.draw_graph = draw_graph
        self.command = command
        self.sync_period = sync_period
        self.RL_agent = PolicyGradient(log_dir=self.log_dir,
                                       kid_dir=self.kid_dir,
                                       learning_rate=learning_rate,
                                       rd=rd)
        self.EA_agent = EA(log_dir=self.log_dir,
                           kid_dir=self.kid_dir,
                           selection=selection,
                           pop_size=pop_size,
                           bfactor=bfactor,
                           mutate_decay=mutate_decay,
                           mut_frac=mut_frac)
Example #3
import gym
import matplotlib.pyplot as plt

# PolicyGradient is imported from the accompanying module (not shown here).

DISPLAY_REWARD_THRESHOLD = 400
RENDER = False
env = gym.make("CartPole-v0")
env.seed(1)
env = env.unwrapped

# Inspect the environment's spaces before building the agent.
print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

RL = PolicyGradient(n_actions=env.action_space.n,
                    n_features=env.observation_space.shape[0],
                    learning_rate=.02,
                    reward_decay=.99,
                    output_graph=True)

for episode in range(3000):
    observation = env.reset()

    while True:
        if RENDER:
            env.render()

        action = RL.choose_action(observation)

        observation_, reward, done, info = env.step(action)

        RL.store_transition(observation, action, reward)

        # The original example ends here; a typical way to finish the episode loop,
        # assuming RL.learn() trains on the transitions stored above:
        if done:
            RL.learn()
            break

        observation = observation_
Example #4
import tensorflow as tf  # TF 1.x summary API

# PolicyGradient, EA, and Sample are classes defined elsewhere in this project.

class ERL:
    def __init__(self, log_dir, log_rate, kid_dir, draw_graph, command,
                 sync_period, learning_rate, rd, selection, pop_size, bfactor,
                 mutate_decay, mut_frac):
        self.log_dir = log_dir
        self.log_rate = log_rate
        self.kid_dir = kid_dir
        self.draw_graph = draw_graph
        self.command = command
        self.sync_period = sync_period
        self.RL_agent = PolicyGradient(log_dir=self.log_dir,
                                       kid_dir=self.kid_dir,
                                       learning_rate=learning_rate,
                                       rd=rd)
        self.EA_agent = EA(log_dir=self.log_dir,
                           kid_dir=self.kid_dir,
                           selection=selection,
                           pop_size=pop_size,
                           bfactor=bfactor,
                           mutate_decay=mutate_decay,
                           mut_frac=mut_frac)

    def training(self, generations, learning_rate, samples_per_distribution,
                 initial_policy, load_per_sample):
        if self.log_dir and self.draw_graph:
            merged = tf.summary.merge_all()
            writer = tf.summary.FileWriter(self.log_dir,
                                           self.RL_agent.sess.graph)

        print('Experiment logs will be saved at ' + self.log_dir)

        # EA initial sample
        if initial_policy is None:
            self.EA_agent.initialize_pops(
                self.RL_agent.policy.samples(self.EA_agent.pop_size))
        else:
            for one_pol in initial_policy.split(' '):
                sample = Sample()
                sample.pick_up(one_pol)
                self.EA_agent.initialize_pops(sample)
            # `sample` only exists when an initial policy was supplied, so the
            # RL distribution is seeded from the last loaded policy here.
            self.RL_agent.get_ic3_distribution(sample.access, sample.wait, sample.piece,
                                               sample.wait_info1, sample.wait_info2,
                                               sample.wait_info3)
        # actual learning
        for i in range(generations):
            # EA evaluation
            # if i == 0:
            #     print('Iter {} - EA'.format(i))
            #     EA_access, EA_wait, EA_piece, \
            #     EA_wait_info1, EA_wait_info2, EA_wait_info3, \
            #     EA_reward_buffer, weakest_reward, best_seen_sample \
            #     = self.EA_agent.Evaluate(i, generations, self.command, load_per_sample)
            # self.RL_agent.get_ic3_distribution()
            # RL evaluation part
            print('Iter {} - RL'.format(i))
            self.RL_agent.Evaluate(self.command, i, samples_per_distribution,
                                   load_per_sample)

            self.RL_agent.learn(i, learning_rate, generations)

            # logging
            if self.log_dir and self.draw_graph and i % self.log_rate == 0:
                sc = tf.Summary()
                # RL part logging
                sc.value.add(tag='RL-best-seen',
                             simple_value=self.RL_agent.best_seen)
                sc.value.add(tag='RL-round-best',
                             simple_value=self.RL_agent.round_best)
                sc.value.add(tag='RL-round-mean',
                             simple_value=self.RL_agent.round_mean)
                sc.value.add(tag='RL-round-worst',
                             simple_value=self.RL_agent.round_worst)
                sc.value.add(tag='RL-round-std',
                             simple_value=self.RL_agent.round_std)
                # # EA part logging
                # sc.value.add(tag='EA-best-seen',
                #             simple_value=self.EA_agent.best_seen)
                # sc.value.add(tag='EA-round-best',
                #             simple_value=self.EA_agent.round_best)
                # sc.value.add(tag='EA-round-mean',
                #             simple_value=self.EA_agent.round_mean)
                # sc.value.add(tag='EA-round-worst',
                #             simple_value=self.EA_agent.round_worst)
                writer.add_summary(sc, i)
                writer.flush()

            self.RL_agent.clear_round_info()
            self.EA_agent.clear_round_info()
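
For context, a hypothetical driver for the ERL class above, sketched only to show how the constructor and training() fit together. The parameter names come from the signatures in Example #4; every value below is a placeholder chosen for illustration, not a configuration from the source.

# Hypothetical wiring of ERL; all paths, commands, and hyperparameters are placeholders.
if __name__ == '__main__':
    agent = ERL(log_dir='./logs', log_rate=10, kid_dir='./kids',
                draw_graph=True, command='run_benchmark.sh',
                sync_period=5, learning_rate=1e-3, rd=0,
                selection='tournament', pop_size=20, bfactor=2,
                mutate_decay=0.99, mut_frac=0.1)
    agent.training(generations=100, learning_rate=1e-3,
                   samples_per_distribution=8,
                   initial_policy=None, load_per_sample=1)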