class AgentPolicyGradient:
    def __init__(self, 
                 n_x,
                 n_y,
                 learning_rate=0.02,
                 reward_decay=0.99,
                 load_path=None, 
                 save_path=None):
        self.PG = PolicyGradient(n_x, n_y,
                learning_rate=learning_rate,
                reward_decay=reward_decay,
                load_path=load_path,
                save_path=save_path
                )
        
        
    def choose_action(self, observation):
        return self.PG.choose_action(observation)
    
    def store_transition(self, s, a, r):
        return self.PG.store_transition(s, a, r)
    
    
    def learn(self):
        return self.PG.learn()
    
    
    def plot_cost(self):
        import numpy as np
        import matplotlib
        matplotlib.use('TkAgg')
        import matplotlib.pyplot as plt
        plt.plot(np.arange(len(self.PG.cost_history)), self.PG.cost_history)
        plt.ylabel('Cost Ex')
        plt.xlabel('Training Steps Ex')
        plt.show()    
        
    def crashed(self):
        episode_rewards_sum = sum(self.PG.episode_rewards)
        return episode_rewards_sum < -250
    
    def episode_reward(self):
        episode_rewards_sum = sum(self.PG.episode_rewards)
        return episode_rewards_sum
    

    def costs(self):
        return self.PG.costs()
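
For context, here is a minimal, hedged sketch of how this wrapper could be driven from a Gym-style training loop; the environment id, episode count, and hyperparameters below are illustrative assumptions, not part of the original snippet.

# Hypothetical driver loop for AgentPolicyGradient (environment and constants assumed)
import gym

env = gym.make("LunarLander-v2")
agent = AgentPolicyGradient(n_x=env.observation_space.shape[0],
                            n_y=env.action_space.n,
                            learning_rate=0.02,
                            reward_decay=0.99)

for episode in range(500):
    observation = env.reset()
    while True:
        action = agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        agent.store_transition(observation, action, reward)
        observation = observation_

        if agent.crashed():   # episode return has dropped below -250
            done = True
        if done:
            print("episode:", episode, "reward:", agent.episode_reward())
            agent.learn()     # train on the stored episode
            break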
Example #2
            # 1. Choose an action based on observation
            action = PG.choose_action(observation)

            # 2. Take action in the environment
            observation_, reward, done, info = env.step(action)

            # 3. Store transition for training
            PG.store_transition(observation, action, reward)

            if done:
                episode_rewards_sum = sum(PG.episode_rewards)
                rewards.append(episode_rewards_sum)
                max_reward_so_far = np.amax(rewards)

                print("==========================================")
                print("Episode: ", episode)
                print("Reward: ", episode_rewards_sum)
                print("Max reward so far: ", max_reward_so_far)

                # 4. Train neural network
                discounted_episode_rewards_norm = PG.learn()

                # Render the game only once the program reaches the minimum score RENDER_REWARD_MIN
                if max_reward_so_far > RENDER_REWARD_MIN: RENDER_ENV = True

                break

            # Save new observation
            observation = observation_
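
The value PG.learn() returns above, discounted_episode_rewards_norm, is conventionally the episode's rewards after discounting and mean/std normalization. A hedged sketch of that standard preprocessing step (gamma and the epsilon guard are assumptions, not this repository's exact code):

# Typical "discount and normalize" computation behind discounted_episode_rewards_norm
import numpy as np

def discount_and_norm_rewards(episode_rewards, gamma=0.99):
    discounted = np.zeros(len(episode_rewards), dtype=np.float64)
    cumulative = 0.0
    # Walk backwards so each step accumulates its gamma-discounted future reward
    for t in reversed(range(len(episode_rewards))):
        cumulative = cumulative * gamma + episode_rewards[t]
        discounted[t] = cumulative
    # Normalize to zero mean and unit variance to stabilize the gradient step
    discounted -= np.mean(discounted)
    discounted /= (np.std(discounted) + 1e-8)
    return discounted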
def simulation():
    users_num = 1
    action_rewards = [10, 9, 1, 1, 1, 1, 1, 1, 1, 1]
    actions = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    observations = [[random.randint(0, i * 10) for i in range(1, 4)]
                    for j in range(1, 101)]
    # nums of items to recommend
    K = 2
    load_version = 1
    save_version = load_version + 1

    load_path = "output/weights/topk{}.ckpt".format(load_version)
    save_path = "output/weights/topk{}.ckpt".format(save_version)

    EPISODES = 5000
    RENDER_ENV = True
    rewards = []

    PG = PolicyGradient(n_x=len(observations[0]),
                        n_y=len(actions),
                        s0=observations[random.randint(0,
                                                       len(observations) - 1)],
                        learning_rate=0.005,
                        reward_decay=1,
                        load_path=None,
                        save_path=save_path,
                        weight_capping_c=2**3,
                        k=K,
                        b_distribution='uniform')

    for episode in range(EPISODES):

        episode_reward = 0

        tic = time.perf_counter()  # time.clock() was removed in Python 3.8
        done = False

        while True:
            # TODO: initialize the environment
            if RENDER_ENV:
                observation = observations[random.randint(
                    0,
                    len(observations) - 1)]

            # 1. Choose an action based on observation
            # action = PG.uniform_choose_action(observation)
            action = PG.choose_action(observation)

            # 2. Take action in the environment
            observation_, reward = observations[random.randint(
                0,
                len(observations) - 1)], action_rewards[action]

            # 3. Store transition for training
            PG.store_transition(observation, action, reward)

            toc = time.perf_counter()
            elapsed_sec = toc - tic
            if elapsed_sec > 120:
                done = True
            if len(PG.episode_observations) > 100:
                done = True

            if done:
                episode_rewards_sum = sum(PG.episode_rewards)
                rewards.append(episode_rewards_sum)
                max_reward_so_far = np.amax(rewards)
                PG.cost_history.append(episode_rewards_sum)
                print("==========================================")
                print("Episode: ", episode)
                print("Seconds: ", elapsed_sec)
                print("Reward: ", episode_rewards_sum)
                print("Max reward so far: ", max_reward_so_far)
                #print(PG.outputs_softmax)
                print("distribution at {} is :{}".format(
                    PG.s0, PG.get_distribution(PG.s0)))
                # 5. Train neural network
                discounted_episode_rewards_norm = PG.learn()
                break

            # Save new observation
            observation = observation_

    PG.plot_cost()
    plt.bar(actions, PG.get_distribution(PG.s0))
    plt.xlabel("action")
    # show the y-axis label
    plt.ylabel("probability")
    # show the figure title
    plt.title("top-k correction policy")
    plt.show()
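
The weight_capping_c, k, and b_distribution arguments above suggest a policy-gradient variant with capped importance weights for off-policy top-K correction. As a rough illustration of what weight capping usually computes (a generic formulation, not code taken from this repository):

# Generic capped importance weight: ratio of target policy pi to behaviour policy
# beta, clipped at c (the snippet above passes weight_capping_c=2**3). Illustrative only.
import numpy as np

def capped_importance_weight(pi_prob, beta_prob, c=2**3):
    return np.minimum(pi_prob / (beta_prob + 1e-8), c)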
Example #4
for i in range(FLAGS.episode):
    s = env.reset()

    while True:
        if RENDER:
            env.render()
        action = PG.choose_action(s)
        s_, r, done, info = env.step(action)
        PG.store_transition(s_, action, r)
        if done:
            episode_rs_sum = sum(PG.ep_rs)
            if 'running_reward' not in globals():
                running_reward = episode_rs_sum
            else:
                running_reward = running_reward * 0.99 + episode_rs_sum * 0.01
            if running_reward > FLAGS.display_threshold:
                RENDER = True
            print('episode:', i, ' reward:', running_reward)

            norm_reward = PG.learn()

            if i == 30:
                plt.plot(norm_reward)
                plt.xlabel('episode steps')
                plt.ylabel('normalized reward')
                plt.show()
            break

        s = s_
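
The running_reward update in this and several later examples is an exponential moving average of episode returns with decay 0.99. A small standalone equivalent (the helper name and None-initialization are our own, for illustration):

# Equivalent to: running_reward = 0.99 * running_reward + 0.01 * episode_return
def update_running_reward(running_reward, episode_return, decay=0.99):
    if running_reward is None:  # first episode: start from the raw return
        return episode_return
    return decay * running_reward + (1.0 - decay) * episode_return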
Example #5
     action_ = spaces.Tuple(d)
     obs, reward_step, done, info = env.step(
         action_)  # get the reward for this step
     ay.append(reward_step + 0.5)
     plt.clf()  # clear the previous plot
     plt.plot(ax, ay)  # plot the current values in the ax and ay lists
     plt.xlabel('step')
     plt.ylabel('throughput')
     plt.pause(0.1)  # pause for 0.1 s
     plt.ioff()  # turn off interactive plotting mode
     reward = reward_step
     if stepIdx > 100:
         s, a, r = PG.store_transition(observation_step, action,
                                       reward)
     if stepIdx % 6 == 0 and stepIdx > 100:
         PG.learn()
 for k in range(len(observation)):
     ss = observation[k].copy()
     ss.extend(matrixOfChanAlloc.copy().reshape(
         1, nOfenb * nOfchannel).tolist()[0])
     # print(ss)
     observation_step = np.array(ss).reshape(
         nOfenb * nOfchannel + 4, 1).ravel()
     print("observation_step: ", observation_step)
     if observation_step[1] > 0:
         action = PG.choose_action1(observation_step,
                                    matrixOfChanAlloc, stepIdx)
         if action < 12:
             action_list.append(observation_step[0])
             action_list.append(observation_step[1])
             action_list.append(action)
def simulation():
    users_num = 1
    '''
	action_rewards = {'11':4,'12':1,'13':1,'14':1,'21':1,'22':2,'23':3,'24':16,'31':1,'32':2,'33':3,'34':4}
	observation_action_transfer = {'11':[2],'12':[2],'13':[2],'14':[2],'21':[3],'22':[3],'23':[3],'24':[3],\
			'31':[1],'32':[1],'33':[3],'34':[3]}
	actions = [1,2,3,4]
	observations = [[1],[2],[3]]
	'''

    action_rewards = {'11': 5,'12': 0,'13': 0,'14':0,'15':0,'16':13, \
          '21': 10,'22': 0, '23': 0,'24':0,'25':0,'26':8}
    observation_action_transfer = {'11': [1,1], '12': [1,1], '13': [1,1],'14':[1,1],'15':[1,1],'16':[1,1], \
              '21': [1,1], '22': [1,1], '23': [1,1],'24':[1,1],'25':[1,1],'26':[0,1]}

    actions = [1, 2, 3, 4, 5, 6]
    observations = [[0, 1], [1, 1]]

    # nums of items to recommend
    K = 2
    load_version = 4
    save_version = load_version + 1

    load_path = "output/weights/topk{}.ckpt".format(load_version)
    save_path = "output/weights/topk{}.ckpt".format(save_version)

    EPISODES = 3000
    RENDER_ENV = True
    rewards = []

    PG = PolicyGradient(n_x=len(observations[0]),
                        n_y=len(actions),
                        s0=observations[-1],
                        learning_rate=0.001,
                        reward_decay=1,
                        load_path=None,
                        save_path=save_path,
                        weight_capping_c=2**3,
                        k=K,
                        b_distribution='uniform')

    for episode in range(EPISODES):

        episode_reward = 0

        tic = time.perf_counter()  # time.clock() was removed in Python 3.8
        done = False

        while True:
            # TODO: initialize the environment
            if RENDER_ENV:
                observation = PG.episode_observations[-1]
                #print(observation)

            # 1. Choose an action based on observation
            #action = PG.uniform_choose_action(observation)
            action = PG.choose_action(observation)

            # 2. Take action in the environment
            observation_, reward = observation_action_transfer[str(sum(observation))+str(actions[action])], \
                    action_rewards[str(sum(observation))+str(actions[action])]

            # 3. Store transition for training
            PG.store_transition(observation_, action, reward)
            #print(PG.episode_observations)
            #print(PG.episode_actions)
            #print(PG.episode_rewards)
            toc = time.perf_counter()
            elapsed_sec = toc - tic
            if elapsed_sec > 120:
                done = True
            if len(PG.episode_observations) > 100:
                done = True

            if done:
                episode_rewards_sum = sum(PG.episode_rewards)
                rewards.append(episode_rewards_sum)
                max_reward_so_far = np.amax(rewards)
                PG.cost_history.append(episode_rewards_sum)
                print("==========================================")
                print("Episode: ", episode)
                print("Seconds: ", elapsed_sec)
                print("Reward: ", episode_rewards_sum)
                print("Max reward so far: ", max_reward_so_far)

                #print(PG.outputs_softmax)
                #print(PG.episode_rewards)
                print("distribution at {} is: {}".format(
                    observations[0], PG.get_distribution(observations[0])))
                print("distribution at {} is: {}".format(
                    observations[1], PG.get_distribution(observations[1])))
                # 4. Train neural network
                discounted_episode_rewards_norm = PG.learn()

                break

            # Save new observation
            observation = observation_
    PG.plot_cost()
    plt.bar(actions, PG.get_distribution(observations[0]))
    plt.xlabel("action at state[0,1]")
    # show the y-axis label
    plt.ylabel("probability")
    # show the figure title
    plt.title("policy distribution at state[0,1]")
    plt.show()
    plt.bar(actions, PG.get_distribution(observations[1]))
    plt.xlabel("action at state[1,1]")
    # show the y-axis label
    plt.ylabel("probability")
    # show the figure title
    plt.title("policy distribution at state[1,1]")
    plt.show()
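
In the simulation above, the environment is a small lookup-table MDP: the key str(sum(observation)) + str(actions[action]) indexes both the next observation and the reward. A hedged helper that makes that transition explicit (the function name and signature are ours, for illustration):

# Illustrative wrapper around the dict-based transition used in simulation()
def toy_step(observation, action, actions, observation_action_transfer, action_rewards):
    key = str(sum(observation)) + str(actions[action])
    return observation_action_transfer[key], action_rewards[key]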
Example #7
    def train(self, max_episode=10, max_path_length=200, verbose=0):
        env = self.env
        avg_reward_sum = 0.

        #f_eps = open("episode.csv","w")
        #write_eps = csv.write(f_eps)

        for e in range(max_episode):
            env._reset()
            observation = env._reset()
            game_over = False
            reward_sum = 0

            inputs = []
            outputs = []
            predicteds = []
            rewards = []

            #f_iter = open("episode_{0}.csv".format(e),"w")
            #write_iter = csv.writer(f_iter)
            f_episode = "episode_{0}.csv".format(e)
            os.system("rm -rf {0}".format(f_episode))

            print(observation[0].shape, observation[1].shape)

            RL = PolicyGradient(
                n_actions=self.env.action_space.n,
                #				n_features=observation.shape[0],
                learning_rate=0.02,
                reward_decay=0.995,
                # output_graph=True,
            )

            while not game_over:

                action, aprob = RL.choose_action(observation)

                inputs.append(observation)
                predicteds.append(aprob)

                y = np.zeros([self.env.action_space.n])
                y[action] = 1.
                outputs.append(y)

                observation, reward, actual_reward, game_over, info = self.env._step(
                    action)
                reward_sum += float(actual_reward)

                #rewards.append(float(reward))
                rewards.append(float(reward_sum))

                RL.store_transition(observation, action, rewards)

                # check memory for RNN model
                if len(inputs) > self.max_memory:
                    del inputs[0]
                    del outputs[0]
                    del predicteds[0]
                    del rewards[0]

                if verbose > 0:
                    if env.actions[action] == "LONG" or env.actions[
                            action] == "SHORT":
                        #if env.actions[action] == "LONG" or env.actions[action] == "SHORT" or env.actions[action] == "HOLD":
                        color = bcolors.FAIL if env.actions[
                            action] == "LONG" else bcolors.OKBLUE
                        print("%s:\t%s\t%.2f\t%.2f\t" %
                              (info["dt"], color + env.actions[action] +
                               bcolors.ENDC, reward_sum, info["cum"]) +
                              ("\t".join([
                                  "%s:%.2f" % (l, i)
                                  for l, i in zip(env.actions, aprob.tolist())
                              ])))
                    #write_iter.writerow("%s:\t%s\t%.2f\t%.2f\t" % (info["dt"], env.actions[action], reward_sum, info["cum"]) + ("\t".join(["%s:%.2f" % (l, i) for l, i in zip(env.actions, aprob.tolist())])))
                    os.system("echo %s >> %s" %
                              ("%s:\t%s\t%.2f\t%.2f\t" %
                               (info["dt"], env.actions[action], reward_sum,
                                info["cum"]) +
                               ("\t".join([
                                   "%s:%.2f" % (l, i)
                                   for l, i in zip(env.actions, aprob.tolist())
                               ])), f_episode))

                avg_reward_sum = avg_reward_sum * 0.99 + reward_sum * 0.01
                toPrint = "%d\t%s\t%s\t%.2f\t%.2f" % (
                    e, info["code"],
                    (bcolors.FAIL if reward_sum >= 0 else bcolors.OKBLUE) +
                    ("%.2f" % reward_sum) + bcolors.ENDC, info["cum"],
                    avg_reward_sum)
                print(toPrint)
                if self.history_filename is not None:
                    os.system("echo %s >> %s" %
                              (toPrint, self.history_filename))

                discounted_rewards_ = RL.learn()  # train

                dim = len(inputs[0])
                inputs_ = [[] for i in range(dim)]
                for obs in inputs:
                    for i, block in enumerate(obs):
                        inputs_[i].append(block[0])
                inputs_ = [np.array(inputs_[i]) for i in range(dim)]

                outputs_ = np.vstack(outputs)
                predicteds_ = np.vstack(predicteds)
                rewards_ = np.vstack(rewards)

                print("shape: ", np.shape(rewards),
                      np.shape(discounted_rewards_))
                #outputs_ *= discounted_rewards_
                for i, r in enumerate(zip(rewards, discounted_rewards_)):
                    reward, discounted_reward = r

                    if verbose > 1:
                        #						print (outputs_[i],)
                        print(outputs_[i], )

                    if verbose > 0:
                        print(predicteds_[i], outputs_[i], reward,
                              discounted_reward)

                print("fit model input.shape %s, output.shape %s" %
                      ([inputs_[i].shape
                        for i in range(len(inputs_))], outputs_.shape))

                np.set_printoptions(linewidth=200, suppress=True)
                print("currentTargetIndex:", env.currentTargetIndex)
Example #8
        if RENDER:
            env.render()
        action = RL.choose_action(observation)

        observation_, reward, done, info = env.step(action)

        RL.store_transition(observation, action, reward)

        if done:
            ep_rs_sum = sum(RL.ep_rs)

            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = True  # rendering
                time.sleep(2)
            print("episode:", i_episode, "  reward:", int(running_reward))

            vt = RL.learn()

            # if i_episode == 0:
            #     plt.plot(vt)  # plot the episode vt
            #     plt.xlabel('episode steps')
            #     plt.ylabel('normalized state-action value')
            #     plt.show()
            break

        observation = observation_
Example #9
class runPG():
    n_inputs = 4
    n_outputs = 4  # right and left for each finger
    # n_outputs = 8 # right, left and stop for each finger

    net = 0
    X = 0
    A = np.array([[-1, -1], [-1, 1], [1, -1], [1, 1], [0, -1], [0, 1], [-1, 0],
                  [1, 0]])

    mode = 5
    reward_mode = 2

    R = []

    gripper_closed = False
    stLearning = True
    possible_plot = False

    def __init__(self):
        rospy.init_node('runPG', anonymous=True)

        if self.mode == 5:
            self.n_inputs = 4
        if self.mode == 8:
            self.n_inputs = 8

        self.RL = PolicyGradient(
            n_actions=self.n_outputs,
            n_features=self.n_inputs,
            learning_rate=0.02,
            reward_decay=0.99,
            load_saved_net=False,
            # output_graph=True,
        )

        rospy.Subscriber('/RL/gripper_status', String,
                         self.callbackGripperStatus)
        rospy.Service('/RL/net', net_eval, self.EvalNet)
        rospy.Service('/RL/start_learning', Empty, self.start_learning)
        obs_srv = rospy.ServiceProxy('/RL/observation', observation)
        drop_srv = rospy.ServiceProxy('/RL/IsObjDropped', IsDropped)
        move_srv = rospy.ServiceProxy('/RL/MoveGripper', TargetAngles)
        open_srv = rospy.ServiceProxy('/RL/OpenGripper', Empty)
        close_srv = rospy.ServiceProxy('/RL/CloseGripper', Empty)

        rospy.sleep(3)
        o = open_srv()

        episode_count = 0
        rate = rospy.Rate(15)  # 15hz
        while not rospy.is_shutdown():

            if self.stLearning:
                ## Start episode ##
                episode_count += 1

                # Close gripper
                raw_input(
                    "Place object between fingers and press Enter to close gripper..."
                )
                close_srv()
                while not self.gripper_closed:
                    rate.sleep()

                raw_input("Remove table and press Enter to start episode...")

                # Get observation
                obs = np.array(obs_srv().state)
                self.VT = []
                while True:
                    # Choose action
                    action = self.RL.choose_action(obs)

                    # Act
                    suc = move_srv(self.A[action]).success
                    rospy.sleep(0.05)
                    rate.sleep()

                    if suc:
                        # Get observation
                        obs_ = np.array(obs_srv().state)
                        fail = drop_srv().dropped  # Check if dropped - end of episode
                    else:
                        # End episode if overload or angle limits reached
                        rospy.logerr(
                            '[RL] Failed to move gripper. Episode declared failed.'
                        )
                        fail = True

                    reward, done = self.transition_reward(obs_, fail)

                    self.RL.store_transition(obs, action, reward)

                    obs = obs_

                    if done:
                        ep_rs_sum = sum(self.RL.ep_rs)

                        if 'running_reward' not in globals():
                            running_reward = ep_rs_sum
                        else:
                            running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
                        print("*** episode: " + str(episode_count) +
                              ", episode reward: " + str(ep_rs_sum) +
                              ", running reward: " + str(int(running_reward)) +
                              " ***")

                        vt = self.RL.learn()
                        self.R.append(running_reward)
                        self.possible_plot = True

                        break

                    rate.sleep()
            elif self.possible_plot:
                self.plot_sav()
                self.possible_plot = False

            # Open gripper
            if self.gripper_closed:
                o = open_srv()
                rospy.sleep(0.2)
            # self.stLearning = False

            # print(obs_srv().state)

            # rospy.spin()
            rate.sleep()

    def plot_sav(self):
        plt.plot(range(len(self.R)), self.R)  # plot the episode vt
        plt.xlabel('episode steps')
        plt.ylabel('normalized state-action value')
        plt.show()

    def EvalNet(self, msg):
        a = 0
        return {'action': a}

    def callbackGripperStatus(self, msg):
        self.gripper_closed = msg.data == "closed"

    def start_learning(self, msg):
        self.stLearning = not self.stLearning

        return EmptyResponse()

    def transition_reward(self, obs, fail):

        # Keep moving as much as possible
        if self.reward_mode == 1:
            if fail:
                reward = 0.
            else:
                reward = 1.
            done = fail

        # Get to a certain coordinate
        if self.reward_mode == 2:
            if fail:
                reward = -3.
            else:
                reward = -1.
            done = fail

            if obs[0] > 135.:
                raw_input('Reached goal, x = %f.' % obs[0])
                reward = 5.
                done = True

        return reward, done
Example #10
def train(episode, rewardType=None):
    tf.reset_default_graph()
    number_of_players = 2
    number_of_pieces = 4
    # Load checkpoint
    load_version = 11
    save_version = load_version + 1
    #load_path = "output/weights/ludo/{}/ludo-v2.ckpt".format(load_version)
    load_path = None
    save_path = "/content/drive/My Drive/cse8673_project/output/weights/ludo/{}/ludo-v2.ckpt".format(
        rewardType)
    PG_dict = {}
    reward = -1000
    act = util.Action(number_of_players, number_of_pieces, reward)
    PG = PolicyGradient(
        n_x=(number_of_players * number_of_pieces) + 5,  #input layer size
        n_y=5,  #output layer size
        learning_rate=0.02,
        reward_decay=0.99,
        load_path=load_path,
        save_path=save_path,
        player_num=0,
        rewardType=rewardType)
    EPISODES = episode
    ghost_players = list(reversed(range(0, 4)))[:-number_of_players]
    players = list(reversed(range(0, 4)))[-number_of_players:]
    winner = None
    winnerCount = defaultdict(int)
    for episode in range(EPISODES):
        if episode % 500 == 0:
            print("episode : ", episode)
        g = ludopy.Game(ghost_players=ghost_players,\
             number_of_pieces=number_of_pieces)

        episode_reward = 0

        there_is_a_winner = False
        winner = None
        count = 0
        while True:
            count += 1
            for i in range(number_of_players):
                if i == 0:
                    (dice, move_pieces, player_pieces, enemy_pieces,
                     player_is_a_winner,
                     there_is_a_winner), player_i = g.get_observation()

                    action, random = act.getAction(PG, enemy_pieces,
                                                   player_pieces, move_pieces,
                                                   dice)

                    _, _, _, _, _, there_is_a_winner = g.answer_observation(
                        action)
                else:
                    action = act.getAction(move_pieces=move_pieces)

                if there_is_a_winner:
                    winner = player_i
                    winnerCount[player_i] += 1
                    break

            # this is where the agent learns
            if there_is_a_winner:
                if winner == 0:
                    # Flip the -1000 terminal penalty to +1000 when player 0 wins
                    PG.episode_rewards = [
                        i + 2000 if i == -1000 else i
                        for i in PG.episode_rewards
                    ]

                discounted_episode_rewards_norm = PG.learn(episode, 0, winner)
    return winnerCount, save_path
Example #11
    while True:
        if RENDER and i_episode>1000: env.render()

        action = RL.choose_action(observation)

        observation_, reward, done, info = env.step(action)

        RL.store_transition(observation, action, reward)

        if done:
            ep_rs_sum = sum(RL.ep_rs)

            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
            if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True     # rendering
            print("episode:", i_episode, "  reward:", int(running_reward))

            vt = RL.learn().numpy()

            if i_episode == 0:
                plt.plot(vt)    # plot the episode vt
                plt.xlabel('episode steps')
                plt.ylabel('normalized state-action value')
                plt.show()
            break

        observation = observation_
Example #12
class runPG():
    n_inputs = 4
    # n_outputs = 4 # right and left for each finger
    n_outputs = 8 # right, left and stop for each finger
    max_episodes = 1200
    max_steps = 2500

    net = 0
    X = 0
    A = np.array([[-1, -1], [1, -1], [-1, 1], [1, 1], [0, -1], [0, 1], [-1, 0], [1, 0]])

    mode = 5
    reward_mode = 3

    R = []
    g = np.array([-35.0, 104.0], dtype='f') # Goal

    gripper_closed = False
    stLearning = True # Enable learning
    possible_plot = False

    # For reward mode 3
    prev_dis2goal = 1e9


    def __init__(self):
        rospy.init_node('runPG', anonymous=True)

        if self.mode == 5:
            self.n_inputs = 4
        if self.mode == 8:
            self.n_inputs = 8
        
        self.RL = PolicyGradient(
            n_actions = self.n_outputs,
            n_features = self.n_inputs,
            learning_rate=0.001,
            reward_decay=0.98,
            load_saved_net=True,
            # output_graph=True,
        )

        rospy.Subscriber('/RL/gripper_status', String, self.callbackGripperStatus)
        rospy.Service('/RL/net', net_eval, self.EvalNet)
        rospy.Service('/RL/start_learning', Empty, self.start_learning)
        obs_srv = rospy.ServiceProxy('/RL/observation', observation)
        drop_srv = rospy.ServiceProxy('/RL/IsObjDropped', IsDropped)
        move_srv = rospy.ServiceProxy('/RL/MoveGripper', TargetAngles)
        reset_srv = rospy.ServiceProxy('/RL/ResetGripper', Empty)
        pub_goal = rospy.Publisher('/RL/Goal', Float32MultiArray, queue_size=10)

        gg = Float32MultiArray()
        gg.data = self.g

        episode_count = 0
        rate = rospy.Rate(100) # 100hz
        while not rospy.is_shutdown():
            
            if self.stLearning:
                ## Start episode ##
                episode_count += 1

                self.prev_dis2goal = 1e9

                # Set gripper
                reset_srv()
                while not self.gripper_closed:
                    rate.sleep()

                # Get observation
                obs = np.array(obs_srv().state)
                self.VT = []
                step = 0
                while True:
                    step += 1
                    print('[RL] Step %d in episode %d, distance to goal: %f.' % (step, episode_count, self.prev_dis2goal))
                    pub_goal.publish(gg)

                    # Choose action
                    action = self.RL.choose_action(obs)

                    # Act
                    suc = move_srv(self.A[action]).success
                    rospy.sleep(0.05)
                    rate.sleep()

                    if suc:
                        # Get observation
                        obs_ = np.array(obs_srv().state)
                        fail = drop_srv().dropped # Check if dropped - end of episode
                    else:
                        # End episode if overload or angle limits reached
                        rospy.logerr('[RL] Failed to move gripper. Episode declared failed.')
                        fail = True 

                    reward, done = self.transition_reward(obs_, fail)

                    self.RL.store_transition(obs, action, reward)

                    obs = obs_

                    if step > self.max_steps:
                        done = True

                    if done:
                        ep_rs_sum = sum(self.RL.ep_rs)

                        if 'running_reward' not in globals():
                            running_reward = ep_rs_sum
                        else:
                            running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
                        print("*** episode: " + str(episode_count) + ", episode reward: " + str(ep_rs_sum) + ", running reward: " + str(int(running_reward)) + " ***")

                        vt = self.RL.learn()
                        self.R.append(running_reward)
                        self.possible_plot = True

                        break

                    rate.sleep()
            elif self.possible_plot:
                self.plot_sav()
                self.possible_plot = False

            if self.max_episodes < episode_count:
                self.plot_sav()
                break

            rate.sleep()

    def plot_sav(self):
        plt.plot(range(len(self.R)),self.R)    # plot the episode vt
        plt.xlabel('episode steps')
        plt.ylabel('normalized state-action value')
        plt.show()

    def EvalNet(self, msg):
        a = 0
        return {'action': a}

    def callbackGripperStatus(self, msg):
        self.gripper_closed = msg.data == "closed"

    def start_learning(self, msg):
        self.stLearning = not self.stLearning

        return EmptyResponse()

    def transition_reward(self, obs, fail):

        # Keep moving as much as possible
        if self.reward_mode == 1:
            if fail:
                reward = 0.
            else:
                reward = 1.
            done = fail

        # Cross a line
        if self.reward_mode == 2:
            if fail:
                reward = -3.
            else:
                reward = -1.
            done = fail
            
            if obs[0] > 40.:
                print('Reached goal, x = %f.' % obs[0])
                reward = 5.
                done = True

        # Get to a certain coordinate
        if self.reward_mode == 3:
            
            d = np.linalg.norm(self.g-obs[:2])

            if fail or d > self.prev_dis2goal:
                reward = 0.
            else:
                reward = 1.
            done = fail
            
            if d < 5:
                print('Reached goal, (x,y) = (%f,%f).' % (obs[0],obs[1]))
                reward = 50.
                done = True
            
            self.prev_dis2goal = d

        return reward, done
Example #13
                  "\n")
            env.render()
            #print(acts)
            #print(PG.game_rewards)
            true_min_reward_so_far = min_reward_so_far
            print("==========================================")
        print("==========================================")
        print("lensd game_scores", len(game_scores))
        print("game_scores\n", game_scores)
        reward_mean = sum(game_scores) / P_GAMES
        game_scores = []
        print("==========================================")
        print("FINAL GAME OF BATCH: ", batch, " out of ", BATCHES)
        print("Training...")
        print("Game reward mean = ", reward_mean)
        print("Max Batch reward so far: ", true_max_reward_so_far)
        print("L = ", bd.L)
        print("H = ", bd.H)
        print("GAMES per BATCH:", P_GAMES)
        print("Learning rate: ", Learning_rate)
        print("Gamma: ", GAMMA)
        print("Neurons: ", PG.neurons_layer_1)

        env.render()
        # 4. Train neural network
        discounted_batch_rewards_norm = PG.learn()
        print("==========================================")
        print("VALIDATION BATCH: ", batch, "\n")
        bd.run_validation(PG, STEPS)
        print("==========================================")
Example #14
    while True:
        if RENDER: env.render()

        action = RL.choose_action(observation)

        observation_, reward, done, info = env.step(action)

        RL.store_transition(observation, action, reward)

        if done:
            ep_rs_sum = sum(RL.ep_rs)
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = True  # rendering

            print("episode:", i_episode, "  reward:", int(running_reward))

            vt = RL.learn()  # train

            if i_episode == 30:
                plt.plot(vt)  # plot the episode vt
                plt.xlabel('episode steps')
                plt.ylabel('normalized state-action value')
                plt.show()

            break

        observation = observation_