Example 1
def parameterTest():
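    """Sweep Dyna-Q hyperparameters on three random grid worlds.

    For each world, run Dyna-Q with 50, 70, and 100 model-training steps,
    each at epsilon 0.1, 0.2, and 0.3; evaluate every learned policy over
    10 rollouts and plot the average episode lengths as grouped bars.
    """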

    train50eps1_avgs = []
    train50eps2_avgs = []
    train50eps3_avgs = []
    train70eps1_avgs = []
    train70eps2_avgs = []
    train70eps3_avgs = []
    train100eps1_avgs = []
    train100eps2_avgs = []
    train100eps3_avgs = []

    training_steps = 10000000

    for i in range(0, 3):
        gridWorldModel = GridWorld(m,
                                   n,
                                   k,
                                   debug=False,
                                   gamma=1,
                                   no_stochastisity=False)

        Q1 = np.zeros((gridWorldModel.spec.nS, gridWorldModel.spec.nA))
        Q2 = np.zeros((gridWorldModel.spec.nS, gridWorldModel.spec.nA))
        Q3 = np.zeros((gridWorldModel.spec.nS, gridWorldModel.spec.nA))
        Q4 = np.zeros((gridWorldModel.spec.nS, gridWorldModel.spec.nA))
        Q5 = np.zeros((gridWorldModel.spec.nS, gridWorldModel.spec.nA))
        Q6 = np.zeros((gridWorldModel.spec.nS, gridWorldModel.spec.nA))
        Q7 = np.zeros((gridWorldModel.spec.nS, gridWorldModel.spec.nA))
        Q8 = np.zeros((gridWorldModel.spec.nS, gridWorldModel.spec.nA))
        Q9 = np.zeros((gridWorldModel.spec.nS, gridWorldModel.spec.nA))

        learning_rate = 0.1

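        # Dyna-Q with 50 model-training (planning) steps, epsilon 0.1 / 0.2 / 0.3.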
        q1, pi1, episode_steps1 = tabular_dyna_q(gridWorldModel,
                                                 Q1,
                                                 learning_rate,
                                                 training_steps,
                                                 50,
                                                 num_of_episodes=1000,
                                                 eps=0.1)
        q2, pi2, episode_steps2 = tabular_dyna_q(gridWorldModel,
                                                 Q2,
                                                 learning_rate,
                                                 training_steps,
                                                 50,
                                                 num_of_episodes=1000,
                                                 eps=0.2)
        q3, pi3, episode_steps3 = tabular_dyna_q(gridWorldModel,
                                                 Q3,
                                                 learning_rate,
                                                 training_steps,
                                                 50,
                                                 num_of_episodes=1000,
                                                 eps=0.3)

        #eps = range(len(episode_steps1))
        #plt.plot(eps, episode_steps1)
        #plt.plot(eps, episode_steps2)
        #plt.plot(eps, episode_steps3)
        #plt.xlabel('Episodes')
        #plt.ylabel('Steps')
        #plt.title('Steps per Episode')
        #plt.show()

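        # Same sweep with 70 model-training steps.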
        q4, pi4, episode_steps4 = tabular_dyna_q(gridWorldModel,
                                                 Q4,
                                                 learning_rate,
                                                 training_steps,
                                                 70,
                                                 num_of_episodes=1000,
                                                 eps=0.1)
        q5, pi5, episode_steps5 = tabular_dyna_q(gridWorldModel,
                                                 Q5,
                                                 learning_rate,
                                                 training_steps,
                                                 70,
                                                 num_of_episodes=1000,
                                                 eps=0.2)
        q6, pi6, episode_steps6 = tabular_dyna_q(gridWorldModel,
                                                 Q6,
                                                 learning_rate,
                                                 training_steps,
                                                 70,
                                                 num_of_episodes=1000,
                                                 eps=0.3)

        #eps = range(len(episode_steps4))
        #plt.plot(eps, episode_steps4)
        #plt.plot(eps, episode_steps5)
        #plt.plot(eps, episode_steps6)
        #plt.xlabel('Episodes')
        #plt.ylabel('Steps')
        #plt.title('Steps per Episode')
        #plt.show()

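        # Same sweep with 100 model-training steps.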
        q7, pi7, episode_steps7 = tabular_dyna_q(gridWorldModel,
                                                 Q7,
                                                 learning_rate,
                                                 training_steps,
                                                 100,
                                                 num_of_episodes=1000,
                                                 eps=0.1)
        q8, pi8, episode_steps8 = tabular_dyna_q(gridWorldModel,
                                                 Q8,
                                                 learning_rate,
                                                 training_steps,
                                                 100,
                                                 num_of_episodes=1000,
                                                 eps=0.2)
        q9, pi9, episode_steps9 = tabular_dyna_q(gridWorldModel,
                                                 Q9,
                                                 learning_rate,
                                                 training_steps,
                                                 100,
                                                 num_of_episodes=1000,
                                                 eps=0.3)

        #eps = range(len(episode_steps7))
        #plt.plot(eps, episode_steps7)
        #plt.plot(eps, episode_steps8)
        #plt.plot(eps, episode_steps9)
        #plt.xlabel('Episodes')
        #plt.ylabel('Steps')
        #plt.title('Steps per Episode')
        #plt.show()

        train50eps1_steps = 0
        train50eps2_steps = 0
        train50eps3_steps = 0
        train70eps1_steps = 0
        train70eps2_steps = 0
        train70eps3_steps = 0
        train100eps1_steps = 0
        train100eps2_steps = 0
        train100eps3_steps = 0

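        # Evaluate each learned policy: 10 rollouts from start cell m - 1 on
        # independent copies of the environment, accumulating step counts.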
        for _ in range(0, 10):
            #print("inst world model...")
            gridWorldModel.reset(start_cell=(m - 1))
            gw1 = copy.deepcopy(gridWorldModel)
            gw2 = copy.deepcopy(gridWorldModel)
            gw3 = copy.deepcopy(gridWorldModel)
            gw4 = copy.deepcopy(gridWorldModel)
            gw5 = copy.deepcopy(gridWorldModel)
            gw6 = copy.deepcopy(gridWorldModel)
            gw7 = copy.deepcopy(gridWorldModel)
            gw8 = copy.deepcopy(gridWorldModel)
            gw9 = copy.deepcopy(gridWorldModel)

            #visualizeGridValueFunc(gw)
            #print("exec sweep policy for episode...")
            train50eps1_steps += exec_policy_for_episode(gw1, pi1)
            train50eps2_steps += exec_policy_for_episode(gw2, pi2)
            train50eps3_steps += exec_policy_for_episode(gw3, pi3)
            train70eps1_steps += exec_policy_for_episode(gw4, pi4)
            train70eps2_steps += exec_policy_for_episode(gw5, pi5)
            train70eps3_steps += exec_policy_for_episode(gw6, pi6)
            train100eps1_steps += exec_policy_for_episode(gw7, pi7)
            train100eps2_steps += exec_policy_for_episode(gw8, pi8)
            train100eps3_steps += exec_policy_for_episode(gw9, pi9)
            #print("rl steps" + str(rl_steps))
            #print("sweep steps" + str(sweep_steps))
            # nn_tour_expected_steps += gw.graph.calc_path_cost(base_line_tour)

        train50eps1_avgs.append(train50eps1_steps / 10)
        train50eps2_avgs.append(train50eps2_steps / 10)
        train50eps3_avgs.append(train50eps3_steps / 10)
        train70eps1_avgs.append(train70eps1_steps / 10)
        train70eps2_avgs.append(train70eps2_steps / 10)
        train70eps3_avgs.append(train70eps3_steps / 10)
        train100eps1_avgs.append(train100eps1_steps / 10)
        train100eps2_avgs.append(train100eps2_steps / 10)
        train100eps3_avgs.append(train100eps3_steps / 10)

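    # Grouped bar chart: one group per grid-world run, nine bars covering the
    # planning-step / epsilon combinations.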
    experiment_nums = ('1', '2', '3')
    y_pos = np.arange(len(experiment_nums))

    bar_width = 0.075

    rects1 = plt.bar(y_pos,
                     train50eps1_avgs,
                     bar_width,
                     color='b',
                     label='train50eps.1')

    rects2 = plt.bar(y_pos + bar_width,
                     train50eps2_avgs,
                     bar_width,
                     color='g',
                     label='train50eps.2')

    rects3 = plt.bar(y_pos + 2 * bar_width,
                     train50eps3_avgs,
                     bar_width,
                     color='r',
                     label='train50eps.3')

    rects4 = plt.bar(y_pos + 3 * bar_width,
                     train70eps1_avgs,
                     bar_width,
                     color='b',
                     label='train70eps.1')

    rects5 = plt.bar(y_pos + 4 * bar_width,
                     train70eps2_avgs,
                     bar_width,
                     color='g',
                     label='train70eps.2')

    rects6 = plt.bar(y_pos + 5 * bar_width,
                     train70eps3_avgs,
                     bar_width,
                     color='r',
                     label='train70eps.3')

    rects7 = plt.bar(y_pos + 6 * bar_width,
                     train100eps1_avgs,
                     bar_width,
                     color='b',
                     label='train100eps.1')

    rects8 = plt.bar(y_pos + 7 * bar_width,
                     train100eps2_avgs,
                     bar_width,
                     color='g',
                     label='train100eps.2')

    rects9 = plt.bar(y_pos + 8 * bar_width,
                     train100eps3_avgs,
                     bar_width,
                     color='r',
                     label='train100eps.3')

    plt.xticks(y_pos + 4 * bar_width, experiment_nums)  # center labels under each nine-bar group
    plt.ylabel('Average Number of Steps')
    plt.xlabel('Experiment Number')
    plt.title('Average Number of Steps per Combination with Reward 20')
    plt.legend()
    plt.show()
Example 2
    # Initialize an 8x8 grid world with 2 items
    n = 8
    m = 8
    k = 2

    nn_avgs = []
    sweep_avgs = []
    dyna_avgs = []

    # Run for 10 different distributions. Train RL, and then compare on 100 episodes each.
    plot_learning_curve = True
    for i in range(0, 10):
        gridWorldModel = GridWorld(m,
                                   n,
                                   k,
                                   debug=False,
                                   gamma=1,
                                   no_stochastisity=False)
        #visualizeGridValueFunc(gridWorldModel)
        visualizeGridProbabilities(gridWorldModel, k, aggregate=True)

        # Testing
        # testRandomPolicy(gridWorldModel)
        eval_pi = testDynaQ(gridWorldModel, plot=plot_learning_curve)
        #parameterTest()
        (nn_avg, sweep_avg,
         dyna_avg) = compareToBaseLine(gridWorldModel, eval_pi, k)
        nn_avgs.append(nn_avg)
        sweep_avgs.append(sweep_avg)
        dyna_avgs.append(dyna_avg)
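        # Only plot the Dyna-Q learning curve for the first distribution.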
        plot_learning_curve = False
Example 3
params.MAX_MEM_SIZE = 50
params.NUM_BOX = 4
params.BRANCH_COUNT = 1

params.MAX_EPISODES = 200000
params.LR_ACTOR = .002
params.NORMALIZE_Q = False
params.ST = 10
params.IC_Lambda = 2.5

optimizer = tf.keras.optimizers.Adam(params.LR_ACTOR)
# optimizer = tfa.optimizers.SWA(optimizer, average_period=4)

env = GridWorld(max_episode=params.MAX_STEPS, max_branch_num=params.BRANCH_COUNT)

env.set_rewards(0, 1., 10., -.1)
env.max_length = 3

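# Note: the Adam optimizer above was built with the earlier LR_ACTOR value
# (.002); reassigning params.LR_ACTOR below does not change it.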
params.LR_ACTOR = .001
params.DISCOUNT_GAMMA = .2

class Memory(object):
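    # Minimal per-episode buffer for observations, actions, and rewards.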
    def __init__(self):
        self.ep_obs, self.ep_act, self.ep_rwd = [], [], []
Example 4
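        # Tail of a training callback: count finished episodes, flag a success
        # when the summed true reward exceeds 2, and log a running success rate.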
        ep += 1
        d = 0
        if (np.sum(_locals['true_reward']) > 2):
            d = 1
        if (np.sum(_locals['true_reward']) > 0):
            p = 2

        print(ep, avgSuccess.add(d))
        logging.info("episode : %d     average success rate : %.2f" %
                     (ep, avgSuccess.get()))

    n_steps += 1
    return True


# Create log dir
log_dir = "tmp/"
os.makedirs(log_dir, exist_ok=True)

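# Train A2C with an MLP policy on a single GridWorld wrapped in a DummyVecEnv.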
env = DummyVecEnv([lambda: GridWorld(50, max_branch_num=1) for _ in range(1)])
model = A2C(MlpPolicy,
            env,
            verbose=0,
            tensorboard_log=None,
            full_tensorboard_log=False,
            learning_rate=0.001,
            gamma=0.99)

time_steps = 1e8
model.learn(total_timesteps=int(time_steps), callback=callback)
Example 5
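# Hyperparameters, kept in a dotdict so they can be read as attributes.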
params = dotdict({})
params.ILP_VALUE = False
params.HARD_CHOICE = False
params.DBL_SOFTMAX = False
params.REMOVE_REP = False
params.RESET_RANDOM = False
params.MAX_EPISODES = 200000
params.MAX_STEPS = 50
params.EPS_TH = 0
update_freq = 10
params.MAX_MEM_SIZE = 50
params.NUM_BOX = 4
params.BRANCH_COUNT = 1

env = GridWorld(max_episode=params.MAX_STEPS,
                max_branch_num=params.BRANCH_COUNT)

env.set_rewards(0., 1, 10., -.01)
env.max_length = 3

params.LR_ACTOR = .0005
params.LR_VALUE = .01
params.DISCOUNT_GAMMA = .2

print(params)


# non-CNN version
def img_to_act(x, dim_out):

    state_in_x = tf.layers.dense(x / 255, 30 * COLOR_COUNT, tf.nn.relu,
Example 6
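# Compare a hand-made sweep policy against a nearest-neighbour baseline tour on the grid world.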
k = 2
#gw = GridWorld(m, n, k, debug=False)
#visualizeGridProbabilities(gw, k)

#Q = np.zeros((gridWorldModel._env_spec.nS,gridWorldModel._env_spec.nA))
#dyna_model_training_steps = 50
#learning_rate = 0.1
#q, pi = tabular_dyna_q(gridWorldModel, Q, learning_rate, training_steps, model_training_steps)

sweep_pi = policy.HandMadeSweepPolicy(4, m, n)
episodes_num = 100
start_state = 0
sweep_steps = 0
nn_tour_expected_steps = 0
#for i in tqdm(range(episodes_num)):
gw = GridWorld(m, n, k, debug=True)
visualizeGridProbabilities(gw, k, aggregate=True)
base_line_tour, nn_tour_expected_steps = gw.graph.get_approximate_best_path(
    start_vertex=m - 1)
print("nearest_neighbor_tour:" + str(base_line_tour))

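# Run the sweep policy for episodes_num episodes and accumulate its total step count.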
for i in range(0, episodes_num):
    print("inst world model...")
    gw.reset(start_cell=m - 1)
    visualizeGridValueFunc(gw)
    print("exec sweep policy for episode...")
    sweep_steps += exec_policy_for_episode(gw, sweep_pi)
    print("get nearest neighbor tour...")
    print("get nn tour cost...")
    #nn_tour_expected_steps += gw.graph.calc_path_cost(base_line_tour)