Example #1
def main():
    args = parser.parse_args()
    use_cuda = args.use_cuda
    use_cuda = True  # NOTE: unconditionally overrides the --use_cuda flag parsed above

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    trainset = torchvision.datasets.CIFAR10(root='./data',
                                            train=True,
                                            download=True,
                                            transform=transform)
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=4,
                                              shuffle=True,
                                              num_workers=2)

    testset = torchvision.datasets.CIFAR10(root='./data',
                                           train=False,
                                           download=True,
                                           transform=transform)
    testloader = torch.utils.data.DataLoader(testset,
                                             batch_size=4,
                                             shuffle=False,
                                             num_workers=2)

    policy_gradient = PolicyGradient(config=Params,
                                     train_set=trainloader,
                                     test_set=testloader,
                                     use_cuda=use_cuda)
    policy_gradient.solve_environment()
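main() in Example #1 references a module-level parser and a Params config object that are not part of the excerpt. A minimal sketch of what they could look like, purely as an assumption (everything except the --use_cuda flag is hypothetical):

import argparse

parser = argparse.ArgumentParser(description='Policy-gradient training example')
parser.add_argument('--use_cuda', action='store_true',
                    help='run on the GPU when available')


class Params:
    # hypothetical hyperparameters read through PolicyGradient(config=Params, ...)
    NUM_EPOCHS = 50
    GAMMA = 0.99
    LEARNING_RATE = 1e-3
    BATCH_SIZE = 4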
Example #2
 def __init__(self,
              n_x,
              n_y,
              learning_rate = 0.02,
              reward_decay=0.99,
              load_path=None, 
              save_path=None):
     self.PG = PolicyGradient(n_x, n_y,
             learning_rate=learning_rate,
             reward_decay=reward_decay,
             load_path=load_path,
             save_path=save_path
             )
Example #3
class AgentPolicyGradient:
    def __init__(self, 
                 n_x,
                 n_y,
                 learning_rate = 0.02,
                 reward_decay=0.99,
                 load_path=None, 
                 save_path=None):
        self.PG = PolicyGradient(n_x, n_y,
                learning_rate=learning_rate,
                reward_decay=reward_decay,
                load_path=load_path,
                save_path=save_path
                )
        
        
    def choose_action(self, observation):
        return self.PG.choose_action(observation)
    
    def store_transition(self, s, a, r):
        return self.PG.store_transition(s,a,r)
    
    
    def learn(self):
        return self.PG.learn()
    
    
    def plot_cost(self):
        import matplotlib
        matplotlib.use('TkAgg')
        import matplotlib.pyplot as plt
        plt.plot(np.arange(len(self.PG.cost_history)), self.PG.cost_history)
        plt.ylabel('Cost Ex')
        plt.xlabel('Training Steps Ex')
        plt.show()    
        
    def crashed(self):
        episode_rewards_sum = sum(self.PG.episode_rewards)
        return episode_rewards_sum < -250
    
    def episode_reward(self):
        episode_rewards_sum = sum(self.PG.episode_rewards)
        return episode_rewards_sum
    

    def costs(self):
        return self.PG.costs()
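AgentPolicyGradient above is a thin wrapper that only forwards to the project's PolicyGradient class. A minimal usage sketch under the usual REINFORCE loop; the CartPole environment and the episode count are illustrative assumptions:

import gym

env = gym.make('CartPole-v0')  # assumed environment with a discrete action space
agent = AgentPolicyGradient(n_x=env.observation_space.shape[0],
                            n_y=env.action_space.n)

for episode in range(200):
    observation = env.reset()
    while True:
        action = agent.choose_action(observation)
        observation_, reward, done, _ = env.step(action)
        agent.store_transition(observation, action, reward)
        observation = observation_
        if done:
            agent.learn()  # one policy update per finished episode
            break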
Example #4
def winRate(load_path, episodes, player_num):
    tf.reset_default_graph()
    number_of_players = 2
    number_of_pieces = 4
    reward = -1000
    EPISODES = episodes
    ghost_players = list(reversed(range(0, 4)))[:-number_of_players]
    players = list(reversed(range(0, 4)))[-number_of_players:]
    winner = None
    act = util.Action(number_of_players, number_of_pieces, reward)
    winnerCount = defaultdict(int)
    print(load_path, "---")
    PG = PolicyGradient(
        n_x=(number_of_players * number_of_pieces) + 5,  #input layer size
        n_y=5,  #output layer size
        learning_rate=0.02,
        reward_decay=0.99,
        load_path=load_path,
        save_path=None,
        player_num=player_num)
    preds = list()
    for episode in range(EPISODES):
        g = ludopy.Game(ghost_players=ghost_players,\
             number_of_pieces=number_of_pieces)

        there_is_a_winner = False
        winner = None
        totalMoves, wrongPred = 0, 0
        while True:
            for i in range(number_of_players):
                (dice, move_pieces, player_pieces, enemy_pieces, \
                         player_is_a_winner,there_is_a_winner),\
                                 player_i = g.get_observation()

                if player_i == 1:
                    action, random = act.getAction(PG, enemy_pieces,
                                                   player_pieces, move_pieces,
                                                   dice)
                    totalMoves += 1
                    if random:
                        wrongPred += 1
                else:
                    action = act.getAction(move_pieces=move_pieces)

                _, _, _, _, _, there_is_a_winner = g.answer_observation(action)

                if there_is_a_winner:
                    if episode % 1000 == 0 and 0:  # the trailing "and 0" keeps this save branch disabled
                        print("saving the game--", episode)
                    winner = player_i
                    winnerCount[player_i] += 1
                    break
            if there_is_a_winner:
                preds.append([wrongPred, totalMoves])
                break
    return winnerCount, preds
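winRate() returns the per-player win counts plus, for the policy player, [wrong prediction, total moves] pairs. A short sketch of how those return values could be aggregated; the checkpoint path and episode count are placeholder assumptions:

winnerCount, preds = winRate("output/weights/ludo/monte/ludo-v2.ckpt",
                             episodes=1000, player_num=0)

total_games = sum(winnerCount.values())
for player, wins in sorted(winnerCount.items()):
    print("player {}: win rate {:.2%}".format(player, wins / float(total_games)))

wrong_moves = sum(w for w, _ in preds)
total_moves = sum(m for _, m in preds)
print("random-fallback rate: {:.2%}".format(wrong_moves / float(total_moves)))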
Example #5
def main():
    exp_dir = 'search_{}_{}'.format(args.algorithm,
                                    time.strftime("%Y%m%d-%H%M%S"))
    if not os.path.exists(exp_dir):
        os.mkdir(exp_dir)
    log_format = '%(asctime)s %(message)s'
    logging.basicConfig(stream=sys.stdout,
                        level=logging.INFO,
                        format=log_format,
                        datefmt='%m/%d %I:%M:%S %p')
    fh = logging.FileHandler(os.path.join(exp_dir, 'log.txt'))
    fh.setFormatter(logging.Formatter(log_format))
    logging.getLogger().addHandler(fh)

    logging.info('args = %s', args)

    if args.algorithm == 'PPO' or args.algorithm == 'PG':
        torch.manual_seed(args.seed)
        np.random.seed(args.seed)
        if torch.cuda.is_available():
            device = torch.device('cuda:{}'.format(str(args.gpu)))
            cudnn.benchmark = True
            cudnn.enable = True
            logging.info('using gpu : {}'.format(args.gpu))
            torch.cuda.manual_seed(args.seed)
        else:
            device = torch.device('cpu')
            logging.info('using cpu')

        if args.algorithm == 'PPO':
            ppo = PPO(args, device)
            ppo.multi_solve_environment()
        elif args.algorithm == 'PG':
            pg = PolicyGradient(args, device)
            pg.multi_solve_environment()

    else:
        rs = RandomSearch(args)
        rs.multi_solve_environment()
Example #6
def main():
    env = gym.make('CartPole-v0')
    model = CartpoleModel(name_scope='noIdeaWhyNeedThis', act_dim=ACT_DIM)
    alg = PolicyGradient(model, LEARNING_RATE)
    agent = CartpoleAgent(alg, OBS_DIM, ACT_DIM)

    with fluid.dygraph.guard():
        for i in range(1000):  # 1000 episodes
            obs_list, action_list, reward_list = run_episode(env, agent)
            if i % 10 == 0:
                logger.info("Episode {}, Reward Sum {}.".format(
                    i, sum(reward_list)))

            batch_obs = np.array(obs_list)
            batch_action = np.array(action_list)
            batch_reward = calc_reward_to_go(reward_list)

            agent.learn(batch_obs, batch_action, batch_reward)
            if (i + 1) % 100 == 0:
                _, _, reward_list = run_episode(env,
                                                agent,
                                                train_or_test='test')
                total_reward = np.sum(reward_list)
                logger.info('Test reward: {}'.format(total_reward))
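Example #6 relies on run_episode and calc_reward_to_go, neither of which is shown. A minimal sketch of the reward-to-go helper, assuming the standard discounted-return computation used with REINFORCE (the discount factor and the normalization step are assumptions):

import numpy as np

def calc_reward_to_go(reward_list, gamma=0.99):
    # G_t = sum_{k >= t} gamma^(k - t) * r_k for every step t of the episode
    returns = np.zeros(len(reward_list), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(reward_list))):
        running = reward_list[t] + gamma * running
        returns[t] = running
    # normalization is a common (assumed) variance-reduction step
    return (returns - returns.mean()) / (returns.std() + 1e-8)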
Example #7
# env1 = gym.make('sateDCA_ENV-v0')
# env1 = env1.unwrapped
nOfenb = 2
nOfchannel = 12
nOfue = 2

if __name__ == "__main__":

    # Load checkpoint
    load_path = None
    save_path = None

    PG = PolicyGradient(n_x=4 + nOfenb * nOfchannel,
                        n_y=nOfenb * nOfchannel,
                        learning_rate=0.005,
                        reward_decay=1,
                        load_path=load_path,
                        save_path=save_path,
                        ep=0.99)

env = ns3env.Ns3Env(port=port,
                    startSim=startSim,
                    simSeed=seed,
                    simArgs=simArgs,
                    debug=debug)

env.reset()

ob_space = env.observation_space
ac_space = env.action_space
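Example #7 hands port, startSim, seed, simArgs and debug to ns3env.Ns3Env without defining them in the excerpt. Typical ns3-gym boilerplate looks roughly like the following; every value here is an assumption:

port = 5555                  # TCP port the ns-3 script listens on
simTime = 10                 # simulated seconds per run
stepTime = 0.5               # agent step interval in simulated seconds
seed = 0
simArgs = {"--simTime": simTime, "--stepTime": stepTime}
startSim = True              # let the gym wrapper launch the ns-3 process
debug = False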
Example #8
tf.flags.DEFINE_float('display_threshold', 10,
                      'the reward threshold to display render')
tf.flags.DEFINE_boolean('render', False, 'render waste time')
tf.flags.DEFINE_boolean('output_graph', False, 'whether to save graph')
tf.flags.DEFINE_string('env_name', 'CartPole-v0', 'env name')
tf.flags.DEFINE_integer('episode', 1000, 'train episode')

FLAGS = tf.flags.FLAGS  # gather the flags defined above

RENDER = FLAGS.render

env = gym.make(FLAGS.env_name)
env.seed(1)
env = env.unwrapped

PG = PolicyGradient(n_actions=env.action_space.n,
                    n_features=env.observation_space.shape[0],
                    lr=0.02,
                    gamma=0.99,
                    output_graph=FLAGS.output_graph)

for i in range(FLAGS.episode):
    s = env.reset()

    while True:
        if RENDER:
            env.render()
        action = PG.choose_action(s)
        s_, r, done, info = env.step(action)
        PG.store_transition(s_, action, r)
        if done:
            episode_rs_sum = sum(PG.ep_rs)
            if 'running_reward' not in globals():
Example #9
def test_cartpole():
    env = gym.make('CartPole-v0')
    agent_pg = PolicyGradient(env)
    agent_pg.learning(episodes=5000)
Example #10
def simulation():
    users_num = 1
    action_rewards = [10, 9, 1, 1, 1, 1, 1, 1, 1, 1]
    actions = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    observations = [[random.randint(0, i * 10) for i in range(1, 4)]
                    for j in range(1, 101)]
    # nums of items to recommend
    K = 2
    load_version = 1
    save_version = load_version + 1

    load_path = "output/weights/topk{}.ckpt".format(load_version)
    save_path = "output/weights/topk{}.ckpt".format(save_version)

    EPISODES = 5000
    RENDER_ENV = True
    rewards = []

    PG = PolicyGradient(n_x=len(observations[0]),
                        n_y=len(actions),
                        s0=observations[random.randint(0,
                                                       len(observations) - 1)],
                        learning_rate=0.005,
                        reward_decay=1,
                        load_path=None,
                        save_path=save_path,
                        weight_capping_c=2**3,
                        k=K,
                        b_distribution='uniform')

    for episode in range(EPISODES):

        episode_reward = 0

        tic = time.clock()
        done = False

        while True:
            '''
			TODO:initialize the env
			'''
            if RENDER_ENV:
                observation = observations[random.randint(
                    0,
                    len(observations) - 1)]

            # 1. Choose an action based on observation
            # action = PG.uniform_choose_action(observation)
            action = PG.choose_action(observation)

            # 2. Take action in the environment
            observation_, reward = observations[random.randint(
                0,
                len(observations) - 1)], action_rewards[action]

            # 4. Store transition for training
            PG.store_transition(observation, action, reward)

            toc = time.clock()
            elapsed_sec = toc - tic
            if elapsed_sec > 120:
                done = True
            if len(PG.episode_observations) > 100:
                done = True

            if done:
                episode_rewards_sum = sum(PG.episode_rewards)
                rewards.append(episode_rewards_sum)
                max_reward_so_far = np.amax(rewards)
                PG.cost_history.append(episode_rewards_sum)
                print("==========================================")
                print("Episode: ", episode)
                print("Seconds: ", elapsed_sec)
                print("Reward: ", episode_rewards_sum)
                print("Max reward so far: ", max_reward_so_far)
                #print(PG.outputs_softmax)
                print("distribution at {} is :{}".format(
                    PG.s0, PG.get_distribution(PG.s0)))
                # 5. Train neural network
                discounted_episode_rewards_norm = PG.learn()
                break

            # Save new observation
            observation = observation_

    PG.plot_cost()
    plt.bar(actions, PG.get_distribution(PG.s0))
    plt.xlabel("action")
    # y-axis label
    plt.ylabel("probability")
    # chart title
    plt.title("top-k correction policy")
    plt.show()
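The weight_capping_c and k arguments above point at the top-K off-policy correction used in REINFORCE-style recommenders. What PG.learn() actually does internally is not shown; as an assumption, the two per-sample factors are usually defined like this:

import numpy as np

def capped_importance_weight(pi, beta, c=2 ** 3):
    # min(pi / beta, c): importance weight of the trained policy pi over the
    # behaviour policy beta, capped at c to keep the gradient variance bounded
    return np.minimum(pi / beta, c)

def top_k_correction(pi, k=2):
    # lambda_K(s, a) = K * (1 - pi)^(K - 1), the top-K correction multiplier
    return k * (1.0 - pi) ** (k - 1)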
Example #11
# Load checkpoint
load_path = "outputs/weights/2048-v0.ckpt"
save_path = "outputs/weights/2048-v0.ckpt"


# In[ ]:


if __name__ == "__main__":

    PG = PolicyGradient(
        n_x = env.observation_space.shape[0],
        n_y = env.action_space.n,
        learning_rate=0.025,
        reward_decay=0.5,
        epochs=2,
        load_path=load_path,
        save_path=save_path
    )
    
    PG.quiet = QUIET

    for episode in range(EPISODES):

        observation = env.reset()
        episode_reward = 0
        max_tile_value_so_far = 0

        while True:
            if RENDER_ENV: env.render()
Example #12
import argparse
import random

import gym
import numpy as np
import torch

from policy_gradient import PolicyGradient
from config import get_config

import pdb

parser = argparse.ArgumentParser()
parser.add_argument('--env-name',
                    required=True,
                    type=str,
                    choices=['cartpole', 'pendulum', 'cheetah'])
parser.add_argument('--baseline', dest='use_baseline', action='store_true')
parser.add_argument('--no-baseline', dest='use_baseline', action='store_false')
parser.add_argument('--seed', type=int, default=1)

parser.set_defaults(use_baseline=True)

if __name__ == '__main__':
    args = parser.parse_args()

    torch.random.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    config = get_config(args.env_name, args.use_baseline, args.seed)
    env = gym.make(config.env_name)
    # train model
    model = PolicyGradient(env, config, args.seed)
    model.run()
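Example #12 imports get_config from a local config module that is not included. A hypothetical sketch of such a helper; the gym ids and hyperparameter fields are assumptions, not the original module:

from types import SimpleNamespace

def get_config(env_name, use_baseline, seed):
    # hypothetical mapping from the CLI choice to a gym environment id
    env_ids = {'cartpole': 'CartPole-v0',
               'pendulum': 'InvertedPendulum-v2',
               'cheetah': 'HalfCheetah-v2'}
    return SimpleNamespace(env_name=env_ids[env_name],
                           use_baseline=use_baseline,
                           seed=seed,
                           gamma=1.0,
                           learning_rate=3e-2,
                           num_batches=100,
                           batch_size=1000)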
Example #13
		state_dict['min_each_ingredient_per_slice'],
		state_dict['max_ingredients_per_slice']],
	))
	return state.astype(np.float).ravel()

if __name__ == "__main__":


    # Load checkpoint
    load_path = "./output/weights/pizza-temp.ckpt"
    save_path = "output/weights/pizza-temp.ckpt"

    PG = PolicyGradient(
            n_x = X_DIM,
            n_y = 5,
            learning_rate=0.01,
            reward_decay=0.95,
            load_path=load_path,
            save_path=save_path
            )

    for batch in range(BATCHES):
        for p_game in range(P_GAMES):
            env = game.Game({'max_steps': 100})
            episode_reward = 0
            h = 5
            l = 1
            pizza_lines = ["TMMMTTT","MMMMTMM", "TTMTTMT", "TMMTMMM", "TTTTTTM", "TTTTTTM"]
            pizza_config = { 'pizza_lines': pizza_lines, 'r': R, 'c': C, 'l': l, 'h': h }
            state = env.init(pizza_config)[0]
            print("\nPIZZA CONFIG: ", pizza_config)
            print("\nSTATE: ", state)
Example #14
    def __init__(self):
        rospy.init_node('runPG', anonymous=True)

        if self.mode == 5:
            self.n_inputs = 4
        if self.mode == 8:
            self.n_inputs = 8

        self.RL = PolicyGradient(
            n_actions=self.n_outputs,
            n_features=self.n_inputs,
            learning_rate=0.02,
            reward_decay=0.99,
            load_saved_net=False,
            # output_graph=True,
        )

        rospy.Subscriber('/RL/gripper_status', String,
                         self.callbackGripperStatus)
        rospy.Service('/RL/net', net_eval, self.EvalNet)
        rospy.Service('/RL/start_learning', Empty, self.start_learning)
        obs_srv = rospy.ServiceProxy('/RL/observation', observation)
        drop_srv = rospy.ServiceProxy('/RL/IsObjDropped', IsDropped)
        move_srv = rospy.ServiceProxy('/RL/MoveGripper', TargetAngles)
        open_srv = rospy.ServiceProxy('/RL/OpenGripper', Empty)
        close_srv = rospy.ServiceProxy('/RL/CloseGripper', Empty)

        rospy.sleep(3)
        o = open_srv()

        episode_count = 0
        rate = rospy.Rate(15)  # 15hz
        while not rospy.is_shutdown():

            if self.stLearning:
                ## Start episode ##
                episode_count += 1

                # Close gripper
                raw_input(
                    "Place object between fingers and press Enter to close gripper..."
                )
                close_srv()
                while not self.gripper_closed:
                    rate.sleep()

                raw_input("Remove table and press Enter to start episode...")

                # Get observation
                obs = np.array(obs_srv().state)
                self.VT = []
                while True:
                    # Choose action
                    action = self.RL.choose_action(obs)

                    # Act
                    suc = move_srv(self.A[action]).success
                    rospy.sleep(0.05)
                    rate.sleep()

                    if suc:
                        # Get observation
                        obs_ = np.array(obs_srv().state)
                        fail = drop_srv(
                        ).dropped  # Check if dropped - end of episode
                    else:
                        # End episode if overload or angle limits reached
                        rospy.logerr(
                            '[RL] Failed to move gripper. Episode declared failed.'
                        )
                        fail = True

                    reward, done = self.transition_reward(obs_, fail)

                    self.RL.store_transition(obs, action, reward)

                    obs = obs_

                    if done:
                        ep_rs_sum = sum(self.RL.ep_rs)

                        if 'running_reward' not in globals():
                            running_reward = ep_rs_sum
                        else:
                            running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
                        print("*** episode: " + str(episode_count) +
                              ", episode reward: " + str(ep_rs_sum) +
                              ", running reward: " + str(int(running_reward)) +
                              " ***")

                        vt = self.RL.learn()
                        self.R.append(running_reward)
                        self.possible_plot = True

                        break

                    rate.sleep()
            elif self.possible_plot:
                self.plot_sav()
                self.possible_plot = False

            # Open gripper
            if self.gripper_closed:
                o = open_srv()
                rospy.sleep(0.2)
            # self.stLearning = False

            # print(obs_srv().state)

            # rospy.spin()
            rate.sleep()
Example #15
DISPLAY_REWARD_THRESHOLD = 400  # renders environment if total episode reward is greater than this threshold
RENDER = True  # rendering wastes time

env = gym.make('CartPole-v0')
env = env.unwrapped  # remove the TimeLimit wrapper
env.seed(1)

print(env.action_space)
print(env.observation_space)
print(env.observation_space.low)
print(env.observation_space.high)

RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    learning_rate=0.02,
    reward_decay=0.99,
    # output_graph=True,
)

for i_episode in range(1500):

    observation = env.reset()

    while True:
        if RENDER and i_episode>1000: env.render()

        action = RL.choose_action(observation)

        observation_, reward, done, info = env.step(action)
Example #16
class runPG():
    n_inputs = 4
    n_outputs = 4  # right and left for each finger
    # n_outputs = 8 # right, left and stop for each finger

    net = 0
    X = 0
    A = np.array([[-1, -1], [-1, 1], [1, -1], [1, 1], [0, -1], [0, 1], [-1, 0],
                  [1, 0]])

    mode = 5
    reward_mode = 2

    R = []

    gripper_closed = False
    stLearning = True
    possible_plot = False

    def __init__(self):
        rospy.init_node('runPG', anonymous=True)

        if self.mode == 5:
            self.n_inputs = 4
        if self.mode == 8:
            self.n_inputs = 8

        self.RL = PolicyGradient(
            n_actions=self.n_outputs,
            n_features=self.n_inputs,
            learning_rate=0.02,
            reward_decay=0.99,
            load_saved_net=False,
            # output_graph=True,
        )

        rospy.Subscriber('/RL/gripper_status', String,
                         self.callbackGripperStatus)
        rospy.Service('/RL/net', net_eval, self.EvalNet)
        rospy.Service('/RL/start_learning', Empty, self.start_learning)
        obs_srv = rospy.ServiceProxy('/RL/observation', observation)
        drop_srv = rospy.ServiceProxy('/RL/IsObjDropped', IsDropped)
        move_srv = rospy.ServiceProxy('/RL/MoveGripper', TargetAngles)
        open_srv = rospy.ServiceProxy('/RL/OpenGripper', Empty)
        close_srv = rospy.ServiceProxy('/RL/CloseGripper', Empty)

        rospy.sleep(3)
        o = open_srv()

        episode_count = 0
        rate = rospy.Rate(15)  # 15hz
        while not rospy.is_shutdown():

            if self.stLearning:
                ## Start episode ##
                episode_count += 1

                # Close gripper
                raw_input(
                    "Place object between fingers and press Enter to close gripper..."
                )
                close_srv()
                while not self.gripper_closed:
                    rate.sleep()

                raw_input("Remove table and press Enter to start episode...")

                # Get observation
                obs = np.array(obs_srv().state)
                self.VT = []
                while True:
                    # Choose action
                    action = self.RL.choose_action(obs)

                    # Act
                    suc = move_srv(self.A[action]).success
                    rospy.sleep(0.05)
                    rate.sleep()

                    if suc:
                        # Get observation
                        obs_ = np.array(obs_srv().state)
                        fail = drop_srv(
                        ).dropped  # Check if dropped - end of episode
                    else:
                        # End episode if overload or angle limits reached
                        rospy.logerr(
                            '[RL] Failed to move gripper. Episode declared failed.'
                        )
                        fail = True

                    reward, done = self.transition_reward(obs_, fail)

                    self.RL.store_transition(obs, action, reward)

                    obs = obs_

                    if done:
                        ep_rs_sum = sum(self.RL.ep_rs)

                        if 'running_reward' not in globals():
                            running_reward = ep_rs_sum
                        else:
                            running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
                        print("*** episode: " + str(episode_count) +
                              ", episode reward: " + str(ep_rs_sum) +
                              ", running reward: " + str(int(running_reward)) +
                              " ***")

                        vt = self.RL.learn()
                        self.R.append(running_reward)
                        self.possible_plot = True

                        break

                    rate.sleep()
            elif self.possible_plot:
                self.plot_sav()
                self.possible_plot = False

            # Open gripper
            if self.gripper_closed:
                o = open_srv()
                rospy.sleep(0.2)
            # self.stLearning = False

            # print(obs_srv().state)

            # rospy.spin()
            rate.sleep()

    def plot_sav(self):
        plt.plot(range(len(self.R)), self.R)  # plot the episode vt
        plt.xlabel('episode steps')
        plt.ylabel('normalized state-action value')
        plt.show()

    def EvalNet(self, msg):
        a = 0
        return {'action': a}

    def callbackGripperStatus(self, msg):
        self.gripper_closed = msg.data == "closed"

    def start_learning(self, msg):
        self.stLearning = not self.stLearning

        return EmptyResponse()

    def transition_reward(self, obs, fail):

        # Keep moving as much as possible
        if self.reward_mode == 1:
            if fail:
                reward = 0.
            else:
                reward = 1.
            done = fail

        # Get to a certain coordinate
        if self.reward_mode == 2:
            if fail:
                reward = -3.
            else:
                reward = -1.
            done = fail

            if obs[0] > 135.:
                raw_input('Reached goal, x = %f.' % obs[0])
                reward = 5.
                done = True

        return reward, done
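A typical entry point for a ROS node built around the runPG class above would be the following sketch (it assumes the module-level rospy import already present in the original file):

if __name__ == '__main__':
    try:
        runPG()
    except rospy.ROSInterruptException:
        pass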
Example #17
from policy_gradient import PolicyGradient
import matplotlib.pyplot as plt
import time

DISPLAY_REWARD_THRESHOLD = 100
RENDER = False

env = gym.make('CartPole-v0')
env.seed(1)
env = env.unwrapped

n_actions = env.action_space.n
n_features = env.observation_space.shape[0]

RL = PolicyGradient(n_actions=n_actions,
                    n_features=n_features,
                    learning_rate=0.02,
                    reward_decay=0.99)

for i_episode in range(3000):
    observation = env.reset()  # cart position, pole angle, cart velocity, pole angular velocity

    while True:
        if RENDER:
            env.render()
        action = RL.choose_action(observation)

        observation_, reward, done, info = env.step(action)

        RL.store_transition(observation, action, reward)

        if done:
Example #18
def train(episode, rewardType=None):
    tf.reset_default_graph()
    number_of_players = 2
    number_of_pieces = 4
    # Load checkpoint
    load_version = 11
    save_version = load_version + 1
    #load_path = "output/weights/ludo/{}/ludo-v2.ckpt".format(load_version)
    load_path = None
    save_path = "/content/drive/My Drive/cse8673_project/output/weights/ludo/{}/ludo-v2.ckpt".format(
        rewardType)
    PG_dict = {}
    reward = -1000
    act = util.Action(number_of_players, number_of_pieces, reward)
    PG = PolicyGradient(
        n_x=(number_of_players * number_of_pieces) + 5,  #input layer size
        n_y=5,  #output layer size
        learning_rate=0.02,
        reward_decay=0.99,
        load_path=load_path,
        save_path=save_path,
        player_num=0,
        rewardType=rewardType)
    EPISODES = episode
    ghost_players = list(reversed(range(0, 4)))[:-number_of_players]
    players = list(reversed(range(0, 4)))[-number_of_players:]
    winner = None
    winnerCount = defaultdict(int)
    for episode in range(EPISODES):
        if episode % 500 == 0:
            print("episode : ", episode)
        g = ludopy.Game(ghost_players=ghost_players,\
             number_of_pieces=number_of_pieces)

        episode_reward = 0

        there_is_a_winner = False
        winner = None
        count = 0
        while True:
            count += 1
            for i in range(number_of_players):
                if i == 0:
                    (dice, move_pieces, player_pieces, enemy_pieces,
                     player_is_a_winner,
                     there_is_a_winner), player_i = g.get_observation()

                    action, random = act.getAction(PG, enemy_pieces,
                                                   player_pieces, move_pieces,
                                                   dice)

                    _, _, _, _, _, there_is_a_winner = g.answer_observation(
                        action)
                else:
                    action = act.getAction(move_pieces=move_pieces)

                if there_is_a_winner:
                    winner = player_i
                    winnerCount[player_i] += 1
                    break

            # this is where the agent learns
            if there_is_a_winner:
                if winner == 0:
                    PG.episode_rewards = [
                        i + 2000 if i == -1000 else i
                        for i in PG.episode_rewards
                    ]

                discounted_episode_rewards_norm = PG.learn(episode, 0, winner)
    return winnerCount, save_path
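A sketch of how train() above might be invoked; the episode count and the "monte" reward type are assumptions (player 0 is the policy player in this example):

if __name__ == "__main__":
    winnerCount, ckpt = train(episode=5000, rewardType="monte")
    games = sum(winnerCount.values())
    print("policy player win rate: {:.2%}".format(winnerCount[0] / float(games)))
    print("checkpoint written to:", ckpt)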
Example #19
RENDER_ENV = False
EPISODES = 50000
rewards = []
RENDER_REWARD_MIN = 50
MAX_FRAMES = 1800
N_avg = 100

if __name__ == "__main__":

    # Load checkpoint
    load_path = None  #"output/weights/CartPole-v0.ckpt"
    save_path = None  #"output/weights/CartPole-v0-temp.ckpt"

    PG = PolicyGradient(n_x=env.observation_space.shape[0],
                        n_y=9,
                        learning_rate=0.01,
                        reward_decay=0.995,
                        load_path=load_path,
                        save_path=save_path)

    past_n_rews = []

    for episode in range(EPISODES + 1):

        observation = env.reset()
        episode_reward = 0
        frame_counter = 0
        while True:
            if RENDER_ENV:
                print("rendering while training")
                PG.run_simulation(MAX_FRAMES, env, True)
            # 1. Choose an action based on observation
Example #20
env = gym.make('CartPole-v0')
env = env.unwrapped  # remove the TimeLimit wrapper
env.seed(1)  # vanilla policy gradient has high episode variance, so fix a decent random seed

print(env.action_space)  # how many actions are available in this environment
print(env.observation_space)  # how many features the state/observation has
print(env.observation_space.high)  # upper bounds of the observation
print(env.observation_space.low)  # lower bounds of the observation

update_frequency = 5  # update the policy once every this many episodes
total_episodes = 3000  # total number of episodes

# create the PolicyGradient agent
agent = PolicyGradient(
    lr=0.01,
    a_size=env.action_space.n,  # 2 for CartPole-v0: push left / push right
    s_size=env.observation_space.shape[0],  # 4 for CartPole-v0
    h_size=8)

with tf.Session() as sess:
    # initialize all global variables
    sess.run(tf.global_variables_initializer())

    # total rewards
    total_reward = []

    gradient_buffer = sess.run(tf.trainable_variables())
    for index, grad in enumerate(gradient_buffer):
        gradient_buffer[index] = grad * 0

    i = 0  # episode counter
Example #21
env = env.unwrapped

print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)
				 # s_dim,
				 # a_dim,
				 # learning_rate = 0.01,
				 # reward_decay = 0.95,
				 # output_graph = False

RL = PolicyGradient(
		s_dim = env.observation_space.shape[0],
		a_dim = env.action_space.n,
		learning_rate = 0.02,
		reward_decay = 0.99,
		#output_graph = True
	)

for i_epsiode in range(3000):

	s = env.reset()
	while True:
		if RENDER: env.render()

		a = RL.choose_action(s)
		s_,r,done,info = env.step(a)

		RL.store_transition(s,a,r)
Example #22
def simulation():
    users_num = 1
    '''
	action_rewards = {'11':4,'12':1,'13':1,'14':1,'21':1,'22':2,'23':3,'24':16,'31':1,'32':2,'33':3,'34':4}
	observation_action_transfer = {'11':[2],'12':[2],'13':[2],'14':[2],'21':[3],'22':[3],'23':[3],'24':[3],\
			'31':[1],'32':[1],'33':[3],'34':[3]}
	actions = [1,2,3,4]
	observations = [[1],[2],[3]]
	'''

    action_rewards = {'11': 5,'12': 0,'13': 0,'14':0,'15':0,'16':13, \
          '21': 10,'22': 0, '23': 0,'24':0,'25':0,'26':8}
    observation_action_transfer = {'11': [1,1], '12': [1,1], '13': [1,1],'14':[1,1],'15':[1,1],'16':[1,1], \
              '21': [1,1], '22': [1,1], '23': [1,1],'24':[1,1],'25':[1,1],'26':[0,1]}

    actions = [1, 2, 3, 4, 5, 6]
    observations = [[0, 1], [1, 1]]

    # nums of items to recommend
    K = 2
    load_version = 4
    save_version = load_version + 1

    load_path = "output/weights/topk{}.ckpt".format(load_version)
    save_path = "output/weights/topk{}.ckpt".format(save_version)

    EPISODES = 3000
    RENDER_ENV = True
    rewards = []

    PG = PolicyGradient(n_x=len(observations[0]),
                        n_y=len(actions),
                        s0=observations[-1],
                        learning_rate=0.001,
                        reward_decay=1,
                        load_path=None,
                        save_path=save_path,
                        weight_capping_c=2**3,
                        k=K,
                        b_distribution='uniform')

    for episode in range(EPISODES):

        episode_reward = 0

        tic = time.clock()
        done = False

        while True:
            '''
			TODO:initialize the env
			'''
            if RENDER_ENV:
                observation = PG.episode_observations[-1]
                #print(observation)

            # 1. Choose an action based on observation
            #action = PG.uniform_choose_action(observation)
            action = PG.choose_action(observation)

            # 2. Take action in the environment
            observation_, reward = observation_action_transfer[str(sum(observation))+str(actions[action])], \
                    action_rewards[str(sum(observation))+str(actions[action])]

            # 4. Store transition for training
            PG.store_transition(observation_, action, reward)
            #print(PG.episode_observations)
            #print(PG.episode_actions)
            #print(PG.episode_rewards)
            toc = time.clock()
            elapsed_sec = toc - tic
            if elapsed_sec > 120:
                done = True
            if len(PG.episode_observations) > 100:
                done = True

            if done:
                episode_rewards_sum = sum(PG.episode_rewards)
                rewards.append(episode_rewards_sum)
                max_reward_so_far = np.amax(rewards)
                PG.cost_history.append(episode_rewards_sum)
                print("==========================================")
                print("Episode: ", episode)
                print("Seconds: ", elapsed_sec)
                print("Reward: ", episode_rewards_sum)
                print("Max reward so far: ", max_reward_so_far)

                #print(PG.outputs_softmax)
                #print(PG.episode_rewards)
                # 5. Train neural network
                print("distribution at {} is :{}".format(
                    observations[0], PG.get_distribution(observations[0])))
                print("distribution at {} is :{}".format(
                    observations[1], PG.get_distribution(observations[1])))
                discounted_episode_rewards_norm = PG.learn()

                break

            # Save new observation
            observation = observation_
    PG.plot_cost()
    plt.bar(actions, PG.get_distribution(observations[0]))
    plt.xlabel("action at state[0,1]")
    # y-axis label
    plt.ylabel("probability")
    # chart title
    plt.title("policy distribution at state[0,1]")
    plt.show()
    plt.bar(actions, PG.get_distribution(observations[1]))
    plt.xlabel("action at state[1,1]")
    # y-axis label
    plt.ylabel("probability")
    # chart title
    plt.title("policy distribution at state[1,1]")
    plt.show()
Example #23
 def play(self,
          policyPlayers,
          randomPlayers,
          load_path,
          save_path,
          episodes,
          episodeStart,
          training,
          ghost_players,
          model2keep,
          n_x=125,
          n_y=5,
          learning_rate=0.02,
          reward_decay=0.99,
          player_num=0,
          number_of_players=2,
          number_of_pieces=4,
          reward=-1000,
          rewardType="monte",
          inputBoardType="fullBoard"):
     totalPlayers = len(policyPlayers) + len(randomPlayers)
     playerPool = policyPlayers + randomPlayers
     data = dict()
     for i in policyPlayers:
         data[i] = StoreTrainingData(n_y)
     act = Action(reward)
     PG = PolicyGradient(
         n_x=n_x,  #input layer size
         n_y=n_y,  #output layer size
         learning_rate=learning_rate,
         reward_decay=reward_decay,
         load_path=load_path,
         save_path=save_path,
         player_num=player_num,
         rewardType=rewardType,
         toKeep=model2keep)
     timeInterval = 50
     winCount = defaultdict(int)
     preds = list()
     startTime = time.time()
     for episode in range(episodeStart + 1, episodeStart + episodes):
         g = ludopy.Game(ghost_players=ghost_players,\
          number_of_pieces=number_of_pieces)
         while True:
             obs, currPlayer = g.get_observation()
             state = State(obs, currPlayer)
             action = None
             if currPlayer in policyPlayers and len(state.actions()) > 0:
                 action = act.action(self, state, n_y, playerPool,
                                     currPlayer, data[currPlayer], PG,
                                     training)
             elif currPlayer in randomPlayers:
                 action = act.action(self, state, n_y)
             _, _, _, _, _, there_is_a_winner = g.answer_observation(action)
             if int(time.time() - startTime) > timeInterval:
                 print("episode: {} running for {}".format(
                     episode,
                     time.time() - startTime))
                 timeInterval += 50
             if there_is_a_winner:
                 winCount[currPlayer] += 1
                 if episode % 1000 == 0:
                     print("wincount: {}".format(winCount))
                     print("time take for this epoch is {}".format(
                         time.time() - startTime))
                     startTime = time.time()
                     timeInterval = 50
                     winCount = defaultdict(int)
                     g.save_hist_video(
                         "videos/gameabc{}.avi".format(episode))
                 if training:
                     try:
                         self.__train(PG, data, episode, currPlayer)
                     except:
                         g.save_hist_video("error.avi".format(episode))
                         print(
                             "-----------------error------------------------"
                         )
                         pass
                 break
     return winCount
Example #24
    def __init__(self):
        rospy.init_node('runPG', anonymous=True)

        if self.mode == 5:
            self.n_inputs = 4
        if self.mode == 8:
            self.n_inputs = 8
        
        self.RL = PolicyGradient(
            n_actions = self.n_outputs,
            n_features = self.n_inputs,
            learning_rate=0.001,
            reward_decay=0.98,
            load_saved_net=True,
            # output_graph=True,
        )

        rospy.Subscriber('/RL/gripper_status', String, self.callbackGripperStatus)
        rospy.Service('/RL/net', net_eval, self.EvalNet)
        rospy.Service('/RL/start_learning', Empty, self.start_learning)
        obs_srv = rospy.ServiceProxy('/RL/observation', observation)
        drop_srv = rospy.ServiceProxy('/RL/IsObjDropped', IsDropped)
        move_srv = rospy.ServiceProxy('/RL/MoveGripper', TargetAngles)
        reset_srv = rospy.ServiceProxy('/RL/ResetGripper', Empty)
        pub_goal = rospy.Publisher('/RL/Goal', Float32MultiArray, queue_size=10)

        gg = Float32MultiArray()
        gg.data = self.g

        episode_count = 0
        rate = rospy.Rate(100) # 100hz
        while not rospy.is_shutdown():
            
            if self.stLearning:
                ## Start episode ##
                episode_count += 1

                self.prev_dis2goal = 1e9

                # Set gripper
                reset_srv()
                while not self.gripper_closed:
                    rate.sleep()

                # Get observation
                obs = np.array(obs_srv().state)
                self.VT = []
                step = 0
                while True:
                    step += 1
                    print('[RL] Step %d in episode %d, distance to goal: %f.' % (step, episode_count, self.prev_dis2goal))
                    pub_goal.publish(gg)

                    # Choose action
                    action = self.RL.choose_action(obs)

                    # Act
                    suc = move_srv(self.A[action]).success
                    rospy.sleep(0.05)
                    rate.sleep()

                    if suc:
                        # Get observation
                        obs_ = np.array(obs_srv().state)
                        fail = drop_srv().dropped # Check if dropped - end of episode
                    else:
                        # End episode if overload or angle limits reached
                        rospy.logerr('[RL] Failed to move gripper. Episode declared failed.')
                        fail = True 

                    reward, done = self.transition_reward(obs_, fail)

                    self.RL.store_transition(obs, action, reward)

                    obs = obs_

                    if step > self.max_steps:
                        done = True

                    if done:
                        ep_rs_sum = sum(self.RL.ep_rs)

                        if 'running_reward' not in globals():
                            running_reward = ep_rs_sum
                        else:
                            running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
                        print("*** episode: " + str(episode_count) + ", episode reward: " + str(ep_rs_sum) + ", running reward: " + str(int(running_reward)) + " ***")

                        vt = self.RL.learn()
                        self.R.append(running_reward)
                        self.possible_plot = True

                        break

                    rate.sleep()
            elif self.possible_plot:
                self.plot_sav()
                self.possible_plot = False

            if self.max_episodes < episode_count:
                self.plot_sav()
                break

            rate.sleep()
Example #25
                                                   train_start, train_end)
    print("history file name : %s" % history_filename)

    codeMap = {}
    f = codecs.open(codeListFilename, "r", "utf-8")

    for line in f:
        if line.strip() != "":
            tokens = line.strip().split(
                ",") if not "\t" in line else line.strip().split("\t")
            codeMap[tokens[0]] = tokens[1]

    f.close()

    env = MarketEnv(dir_path="./data/",
                    codes=list(codeMap.keys()),
                    target_date_start=train_start,
                    target_date_end=train_end,
                    sudden_death=-1.0)
    env_test = MarketEnv(dir_path="./data/",
                         codes=list(codeMap.keys()),
                         target_date_start=test_start,
                         target_date_end=test_end,
                         sudden_death=-1.0)
    pg = PolicyGradient(env,
                        env_test,
                        discount=0.9,
                        model_filename=model_filename,
                        history_filename=history_filename)
    pg.train(verbose=0, max_episode=max_episode)
Example #26
    def train(self, max_episode=10, max_path_length=200, verbose=0):
        env = self.env
        avg_reward_sum = 0.

        #f_eps = open("episode.csv","w")
        #write_eps = csv.write(f_eps)

        for e in range(max_episode):
            env._reset()
            observation = env._reset()
            game_over = False
            reward_sum = 0

            inputs = []
            outputs = []
            predicteds = []
            rewards = []

            #f_iter = open("episode_{0}.csv".format(e),"w")
            #write_iter = csv.writer(f_iter)
            f_episode = "episode_{0}.csv".format(e)
            os.system("rm -rf {0}".format(f_episode))

            print(observation[0].shape, observation[1].shape)

            RL = PolicyGradient(
                n_actions=self.env.action_space.n,
                #				n_features=observation.shape[0],
                learning_rate=0.02,
                reward_decay=0.995,
                # output_graph=True,
            )

            while not game_over:

                action, aprob = RL.choose_action(observation)

                inputs.append(observation)
                predicteds.append(aprob)

                y = np.zeros([self.env.action_space.n])
                y[action] = 1.
                outputs.append(y)

                observation, reward, actual_reward, game_over, info = self.env._step(
                    action)
                reward_sum += float(actual_reward)

                #rewards.append(float(reward))
                rewards.append(float(reward_sum))

                RL.store_transition(observation, action, rewards)

                # check memory for RNN model
                if len(inputs) > self.max_memory:
                    del inputs[0]
                    del outputs[0]
                    del predicteds[0]
                    del rewards[0]

                if verbose > 0:
                    if env.actions[action] == "LONG" or env.actions[
                            action] == "SHORT":
                        #if env.actions[action] == "LONG" or env.actions[action] == "SHORT" or env.actions[action] == "HOLD":
                        color = bcolors.FAIL if env.actions[
                            action] == "LONG" else bcolors.OKBLUE
                        print("%s:\t%s\t%.2f\t%.2f\t" %
                              (info["dt"], color + env.actions[action] +
                               bcolors.ENDC, reward_sum, info["cum"]) +
                              ("\t".join([
                                  "%s:%.2f" % (l, i)
                                  for l, i in zip(env.actions, aprob.tolist())
                              ])))
                    #write_iter.writerow("%s:\t%s\t%.2f\t%.2f\t" % (info["dt"], env.actions[action], reward_sum, info["cum"]) + ("\t".join(["%s:%.2f" % (l, i) for l, i in zip(env.actions, aprob.tolist())])))
                    os.system("echo %s >> %s" %
                              ("%s:\t%s\t%.2f\t%.2f\t" %
                               (info["dt"], env.actions[action], reward_sum,
                                info["cum"]) +
                               ("\t".join([
                                   "%s:%.2f" % (l, i)
                                   for l, i in zip(env.actions, aprob.tolist())
                               ])), f_episode))

                avg_reward_sum = avg_reward_sum * 0.99 + reward_sum * 0.01
                toPrint = "%d\t%s\t%s\t%.2f\t%.2f" % (
                    e, info["code"],
                    (bcolors.FAIL if reward_sum >= 0 else bcolors.OKBLUE) +
                    ("%.2f" % reward_sum) + bcolors.ENDC, info["cum"],
                    avg_reward_sum)
                print(toPrint)
                if self.history_filename != None:
                    os.system("echo %s >> %s" %
                              (toPrint, self.history_filename))

                discounted_rewards_ = RL.learn()  # train

                dim = len(inputs[0])
                inputs_ = [[] for i in range(dim)]
                for obs in inputs:
                    for i, block in enumerate(obs):
                        inputs_[i].append(block[0])
                inputs_ = [np.array(inputs_[i]) for i in range(dim)]

                outputs_ = np.vstack(outputs)
                predicteds_ = np.vstack(predicteds)
                rewards_ = np.vstack(rewards)

                print("shape: ", np.shape(rewards),
                      np.shape(discounted_rewards_))
                #outputs_ *= discounted_rewards_
                for i, r in enumerate(zip(rewards, discounted_rewards_)):
                    reward, discounted_reward = r

                    if verbose > 1:
                        #						print (outputs_[i],)
                        print(outputs_[i], )

                    if verbose > 0:
                        print(predicteds_[i], outputs_[i], reward,
                              discounted_reward)

                print("fit model input.shape %s, output.shape %s" %
                      ([inputs_[i].shape
                        for i in range(len(inputs_))], outputs_.shape))

                np.set_printoptions(linewidth=200, suppress=True)
                print("currentTargetIndex:", env.currentTargetIndex)
Example #27
print("env.observation_space.high", env.observation_space.high)
print("env.observation_space.low", env.observation_space.low)

RENDER_ENV = False  # show the game window
EPISODES = 500  # maximum number of episodes
rewards = []  # rewards
RENDER_REWARD_MIN = 50  # minimum reward before the game is rendered

if __name__ == "__main__":

    load_path = None  #"output/weights/CartPole-v0.ckpt"
    save_path = None  #"output/weights/CartPole-v0-temp.ckpt"

    PG = PolicyGradient(n_x=env.observation_space.shape[0],
                        n_y=env.action_space.n,
                        learning_rate=0.01,
                        reward_decay=0.95,
                        load_path=load_path,
                        save_path=save_path)

    for episode in range(EPISODES):  # start of training

        observation = env.reset()
        episode_reward = 0

        while True:
            if RENDER_ENV: env.render()

            # 1. Choose an action based on observation
            action = PG.choose_action(observation)

            # 2. Take action in the environment
Example #28
with open('loss.txt', "r+") as f:  # clear the data saved by the previous run
    f.seek(0)
    f.truncate()  # empty the file

if __name__ == "__main__":

    # Load checkpoint
    load_path = None
    save_path = None

    PG = PolicyGradient(n_x=sizeperq * nOfenb * nOfchannel +
                        nOfenb * nOfchannel,
                        n_y=nOfchannel * nOfenb,
                        learning_rate=0.005,
                        reward_decay=0.9,
                        load_path=load_path,
                        save_path=save_path,
                        ep=0.99,
                        nOfChannel=nOfchannel)

env = ns3env.Ns3Env(port=port,
                    startSim=startSim,
                    simSeed=seed,
                    simArgs=simArgs,
                    debug=debug)

env.reset()

ob_space = env.observation_space
ac_space = env.action_space
Example #29
class runPG():
    n_inputs = 4
    # n_outputs = 4 # right and left for each finger
    n_outputs = 8 # right, left and stop for each finger
    max_episodes = 1200
    max_steps = 2500

    net = 0
    X = 0
    A = np.array([[-1, -1], [1, -1], [-1, 1], [1, 1], [0, -1], [0, 1], [-1, 0], [1, 0]])

    mode = 5
    reward_mode = 3

    R = []
    g = np.array([-35.0, 104.0], dtype='f') # Goal

    gripper_closed = False
    stLearning = True # Enable learning
    possible_plot = False

    # For reward mode 3
    prev_dis2goal = 1e9


    def __init__(self):
        rospy.init_node('runPG', anonymous=True)

        if self.mode == 5:
            self.n_inputs = 4
        if self.mode == 8:
            self.n_inputs = 8
        
        self.RL = PolicyGradient(
            n_actions = self.n_outputs,
            n_features = self.n_inputs,
            learning_rate=0.001,
            reward_decay=0.98,
            load_saved_net=True,
            # output_graph=True,
        )

        rospy.Subscriber('/RL/gripper_status', String, self.callbackGripperStatus)
        rospy.Service('/RL/net', net_eval, self.EvalNet)
        rospy.Service('/RL/start_learning', Empty, self.start_learning)
        obs_srv = rospy.ServiceProxy('/RL/observation', observation)
        drop_srv = rospy.ServiceProxy('/RL/IsObjDropped', IsDropped)
        move_srv = rospy.ServiceProxy('/RL/MoveGripper', TargetAngles)
        reset_srv = rospy.ServiceProxy('/RL/ResetGripper', Empty)
        pub_goal = rospy.Publisher('/RL/Goal', Float32MultiArray, queue_size=10)

        gg = Float32MultiArray()
        gg.data = self.g

        episode_count = 0
        rate = rospy.Rate(100) # 100hz
        while not rospy.is_shutdown():
            
            if self.stLearning:
                ## Start episode ##
                episode_count += 1

                self.prev_dis2goal = 1e9

                # Set gripper
                reset_srv()
                while not self.gripper_closed:
                    rate.sleep()

                # Get observation
                obs = np.array(obs_srv().state)
                self.VT = []
                step = 0
                while True:
                    step += 1
                    print('[RL] Step %d in episode %d, distance to goal: %f.' % (step, episode_count, self.prev_dis2goal))
                    pub_goal.publish(gg)

                    # Choose action
                    action = self.RL.choose_action(obs)

                    # Act
                    suc = move_srv(self.A[action]).success
                    rospy.sleep(0.05)
                    rate.sleep()

                    if suc:
                        # Get observation
                        obs_ = np.array(obs_srv().state)
                        fail = drop_srv().dropped # Check if dropped - end of episode
                    else:
                        # End episode if overload or angle limits reached
                        rospy.logerr('[RL] Failed to move gripper. Episode declared failed.')
                        fail = True 

                    reward, done = self.transition_reward(obs_, fail)

                    self.RL.store_transition(obs, action, reward)

                    obs = obs_

                    if step > self.max_steps:
                        done = True

                    if done:
                        ep_rs_sum = sum(self.RL.ep_rs)

                        if 'running_reward' not in globals():
                            running_reward = ep_rs_sum
                        else:
                            running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
                        print("*** episode: " + str(episode_count) + ", episode reward: " + str(ep_rs_sum) + ", running reward: " + str(int(running_reward)) + " ***")

                        vt = self.RL.learn()
                        self.R.append(running_reward)
                        self.possible_plot = True

                        break

                    rate.sleep()
            elif self.possible_plot:
                self.plot_sav()
                self.possible_plot = False

            if self.max_episodes < episode_count:
                self.plot_sav()
                break

            rate.sleep()

    def plot_sav(self):
        plt.plot(range(len(self.R)),self.R)    # plot the episode vt
        plt.xlabel('episode steps')
        plt.ylabel('normalized state-action value')
        plt.show()

    def EvalNet(self, msg):
        a = 0
        return {'action': a}

    def callbackGripperStatus(self, msg):
        self.gripper_closed = msg.data == "closed"

    def start_learning(self, msg):
        self.stLearning = not self.stLearning

        return EmptyResponse()

    def transition_reward(self, obs, fail):

        # Keep moving as much as possible
        if self.reward_mode == 1:
            if fail:
                reward = 0.
            else:
                reward = 1.
            done = fail

        # Cross a line
        if self.reward_mode == 2:
            if fail:
                reward = -3.
            else:
                reward = -1.
            done = fail
            
            if obs[0] > 40.:
                print('Reached goal, x = %f.' % obs[0])
                reward = 5.
                done = True

        # Get to a certain coordinate
        if self.reward_mode == 3:
            
            d = np.linalg.norm(self.g-obs[:2])

            if fail or d > self.prev_dis2goal:
                reward = 0.
            else:
                reward = 1.
            done = fail
            
            if d < 5:
                print('Reached goal, (x,y) = (%f,%f).' % (obs[0],obs[1]))
                reward = 50.
                done = True
            
            self.prev_dis2goal = d

        return reward, done
Example #30
env.reset()
env = env.unwrapped
# Policy gradient has high variance, seed for reproducibility
env.seed(1)
print("env.action_space", env.action_space.n)
print("env.observation_space", env.observation_space.shape[0])
print("env.observation_space.high", env.observation_space.high)
print("env.observation_space.low", env.observation_space.low)

RENDER_FLAG = False
EPISODES = 500  # collect 500 trajectories
MAX_STEP = 1500  # at most 1500 steps per trajectory
rewards = []  # list recording each trajectory's return

if __name__ == "__main__":
    PG = PolicyGradient(n_input=env.observation_space.shape[0],
                        n_output=env.action_space.n)
    for episode in range(EPISODES):
        s = env.reset()
        for i in range(MAX_STEP):
            if RENDER_FLAG:
                env.render()
            # interact with the environment
            action = PG.choose_action(s)
            s_, reward, done, _ = env.step(action)
            PG.store_transition(s, action, reward)
            # episode ends if the pole falls or the cart leaves the screen
            if done:
                ep_rewards_sum = np.sum(PG.ep_rewards)
                if ep_rewards_sum > 1000:
                    RENDER_FLAG = True
                else: