def greedy_policy():
    # Agent chooses actions with the greedy policy
    rewards = np.zeros((len(epsilons), num_sessions, num_trials))
    num_best = np.zeros((len(epsilons), num_sessions, num_trials))

    # GreedyPolicy takes no epsilon here; the loop simply repeats the
    # experiment once per entry in `epsilons` so the result arrays match
    # the other runs.
    for i in range(len(epsilons)):
        policy = GreedyPolicy()
        bandit = GaussianBandit(n)
        agent = Agent(n, policy, num_trials)
        env = Environment(bandit, agent, num_trials, num_sessions)
        rewards[i, :, :], num_best[i, :, :] = env.run()

    # average over all runs and sessions
    ave_reward = rewards.mean(axis=(0, 1))
    plt.plot(ave_reward)
    plt.title("Average Reward")
    plt.xlabel('Trial')
    plt.ylabel('Reward')
    plt.show()

    ave_percent_best = num_best.mean(axis=(0, 1))
    plt.plot(ave_percent_best)
    plt.title("Average Percent Best Option")
    plt.xlabel('Trial')
    plt.ylabel('Percent Best Option')
    plt.show()
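
None of the snippets on this page shows the GreedyPolicy class itself, and each repository defines its own interface. As a rough sketch of the behaviour Example 1 relies on, a bandit-style greedy policy only has to pick the arm with the highest current value estimate. The choose(agent) signature, the value_estimates attribute and the random tie-breaking below are illustrative assumptions, not code from any of these projects.

import numpy as np


class GreedyPolicy:
    """Illustrative greedy policy: always exploit the best-looking arm."""

    def choose(self, agent):
        estimates = np.asarray(agent.value_estimates)
        # break ties between equally good arms uniformly at random
        best = np.flatnonzero(estimates == estimates.max())
        return int(np.random.choice(best))
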
Example 2
def play(self, env):
    """
    Perform a greedy action (play the best move according to the agent).
    """
    av = self.get_av_pairs(env)
    a = GreedyPolicy().choose_action(av)[0]
    r, s_n = env.perform_action(a)
    return a, r, s_n
Example 3
def play(model_fn, color, start_board=None, sim=False, depth=3):

    env=Environment(start_board)
    pol=GreedyPolicy()
    with tf.Session() as sess:
        if model_fn is not None: 
            saver=tf.train.import_meta_graph(model_fn+'.meta')
            saver.restore(sess,model_fn)

        approx=Approximator(sess)
        a=[None,None]
        if sim:
            a[int(color)]=tdstem.TDStemPlayAgent(approx,depth=depth)
            a[int(not color)]=opt.OptimalAgent()
            
        else:
            a[int(not color)]=tdstem.TDStemPlayAgent(approx,depth=depth)
            a[int(color)]=tdstem.TDStemPlayAgent(approx,depth=depth)

        oa=opt.OptimalAgent()

        flag=False
        
        name=str(raw_input("What's your name? "))
        print "Let's play a game, %s!" %(str(name))

        while not flag:
            time.sleep(2)

            env.draw() 
            print 'DTM: {}'.format(np.abs(oa.approx.tb.probe_dtm(chess.Board.from_epd(env.current_state)[0])))

            if env.is_game_over():
                print env.result()
                flag=True

            else:
                print 'Evaluation: {}'.format(a[int(color)].get_av_pairs(env))
                print 'Optimal moves: {}'.format(oa.get_best_moves(env))
                start=time.time()

                if env.get_turn()==color:
                    if sim:
                        a[int(color)].play(env)
                    else:
                        # keep asking until a legal move is accepted
                        suc=False
                        while not suc:
                            m=str(raw_input('YOUR MOVE: '))
                            try:
                                env.perform_action(m)
                                suc=True
                            except:
                                print 'Illegal move, try again.'

                else:
                    a[int(not color)].play(env)
Example 4
def test_warehouse_03():
    env = make_test_warehouse_env_01()
    expected_value = None
    policy = GreedyPolicy()
    we.execute(env, policy)
    print('**' * 30)

    print('[Result]')
    print('Finish time clock value=', env.finish_time_clock,
          ':uncompleted orders=', len(env.available_orders))
Example 5
def test_warehouse_greedy(order_count):
    start = time.time()
    env = make_test_warehouse_env(order_count)  # based on 60 orders
    expected_value = None
    policy = GreedyPolicy()
    we.execute(env, policy)
    print('**' * 30)
    print('[Result]GreedyPolicy')
    print('Finish time clock value=', env.finish_time_clock,
          ':uncompleted orders=', len(env.available_orders))
    end = time.time()
    print('time', (end - start))
Example 6
def __init__(self, policy=GreedyPolicy()):
    Agent.__init__(self, policy)
    self.approx = OptimalApproximator()

def temp_replace_policy(self):
    if self.run_type is RunType.RAND_FILL:
        self.agent.currently_used_policy = RandomPolicy()
    if self.run_type is RunType.TEST:
        self.agent.currently_used_policy = GreedyPolicy()
Example 7
import data_utils
from mdp import MDP
from rewards import reward_func_linear  # Call it with stats to initialize
from env import Env
from q_learning import QLearningAlgo
from policy import EpsilonGreedyPolicy, GreedyPolicy, RandomPolicy

data = data_utils.Data(n=15)
mdp = MDP(data=data)
reward_func = reward_func_linear(data.statistics, verbose=False)
env = Env(reward_func=reward_func, mode='human')
# policy = EpsilonGreedyPolicy(action_space = mdp.action_space)
policy = RandomPolicy(action_space=mdp.action_space)
test_policy = GreedyPolicy(action_space=mdp.action_space)
algo = QLearningAlgo(env=env, mdp=mdp, policy=policy, discount=0.2)

algo.set_mode('train')
algo.fit(mode='train', epochs=4, remember=True)

algo.set_mode('test')
algo.test(mode='test', policy=test_policy)

algo.replay(batch_size=16, epochs=8)

algo.set_mode('test')
algo.test(mode='test', policy=test_policy)

# algo.test(mode = 'human', policy = test_policy)

Example 9
import numpy as np
import matplotlib.pyplot as plt

def main(N_bandits=10, epsilon=0.1):
    # =========================
    # Settings
    # =========================
    alpha = 1  # UCB1 exploration parameter
    tau = 0.1  # softmax temperature
    bandit_probs = np.ones(N_bandits) - 0.5  # bandit probabilities of success
    bandit_probs[1:] -= epsilon  # every arm except the first is worse by epsilon
    simulations = 100  # number of simulations to perform
    trials = 100000  # number of episodes per experiment
    save_fig = True  # if false -> plot, if true save as file in same directory
    save_format = ".pdf"  # ".pdf" or ".png"

    # =========================
    # Start multi-armed bandit simulation
    # ========================
    print(
        "Running multi-armed bandits with N_bandits = {} and agent epsilon = {}"
        .format(N_bandits, epsilon))
    # simulation-averaged reward and regret histories (4 agents)
    reward_history_avg = np.zeros((trials, 4))
    # action_history_sum = np.zeros((trials, N_bandits))  # sum action history
    regret_history_avg = np.zeros((trials, 4))

    for i in range(simulations):
        bandit = MultiArmedBandit(bandit_probs)  # initialize bandits
        agents = [
            Agent(bandit, GreedyPolicy(epsilon)),  # epsilon-Greedy
            Agent(bandit, SoftMaxPolicy(tau)),  # Softmax
            Agent(bandit, UCBPolicy(alpha)),  # UCB1
            BetaAgent(bandit, GreedyPolicy(0))  # Thompson Sampling
        ]

        for a, agent in enumerate(agents):
            (action_history, reward_history,
             regret_history) = Environment(agent, bandit, trials,
                                           epsilon)  # perform experiment

            if (i + 1) % (simulations / 20) == 0:
                print("Agent = {}".format(a + 1))
                print("[Experiment {}/{}]".format(i + 1, simulations))
                print("  bandit choice history = {}".format(action_history +
                                                            1))
                print("  average reward = {}".format(
                    np.sum(reward_history) / len(reward_history)))
                print("  cumulative regret = {}".format(
                    np.sum(regret_history) / len(regret_history)))
                print("")
            # Sum up experiment reward (later to be divided to represent an average)
            reward_history_avg[:, a] += reward_history
            regret_history_avg[:, a] += regret_history

        # # Sum up action history
        # for j, (a) in enumerate(action_history):
        #     action_history_sum[j][a] += 1

    reward_history_avg /= float(simulations)
    regret_history_avg /= float(simulations)

    # =========================
    # Plot regret history results
    # =========================
    plt.close()
    plt.plot(regret_history_avg)
    plt.legend(['Greedy', 'SoftMax', 'UCB1', 'TS'], loc='upper left')
    plt.xlabel("Episode number")
    plt.ylabel("Regret accumulated".format(simulations))
    plt.title(
        "Bandit regret history averaged, number of arms = {}, epsilon = {}".
        format(N_bandits, epsilon))
    ax = plt.gca()
    ax.set_xscale("log", nonposx='clip')
    plt.xlim([1, trials])
    if save_fig:
        output_file = "results/regrets_" + str(N_bandits) + "_" + str(
            epsilon) + save_format
        plt.savefig(output_file, bbox_inches="tight")
    else:
        plt.show()
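
Example 9 passes an exploration rate to the policy constructor (GreedyPolicy(epsilon) for the epsilon-greedy agent and GreedyPolicy(0) inside the Thompson-sampling BetaAgent), so that repository's GreedyPolicy clearly supports epsilon-greedy behaviour. Extending the earlier sketch with that parameter might look like the following; again, the constructor argument and the choose(agent) signature are illustrative assumptions rather than the project's actual code.

import numpy as np


class GreedyPolicy:
    """Illustrative epsilon-greedy policy; epsilon=0 is purely greedy."""

    def __init__(self, epsilon=0.0):
        self.epsilon = epsilon

    def choose(self, agent):
        estimates = np.asarray(agent.value_estimates)
        if np.random.random() < self.epsilon:
            # explore: pick any arm uniformly at random
            return int(np.random.randint(len(estimates)))
        # exploit: pick among the arms with the highest current estimate
        best = np.flatnonzero(estimates == estimates.max())
        return int(np.random.choice(best))
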
Example 10
import argparse

import numpy as np
import matplotlib.pyplot as plt

# BetaBernBandit, GreedyPolicy, ThompsonSampling and UCB come from the
# project's own modules (their imports are omitted in the original snippet).
parser = argparse.ArgumentParser()
parser.add_argument('--num-steps', type=int, default=1000)
parser.add_argument('--num-exp', type=int, default=10000)


def simulate_multi(policies, args):
    sample_rewards = []
    for p in policies:
        sample_reward = []
        for i in range(args.num_exp):
            if i % 20 == 0:
                print(i)
            sample_reward.append(p.run(args.num_steps))
        sample_rewards.append(sample_reward)
    plt.plot(np.mean(sample_rewards[0], axis=0), label='Greedy')
    plt.plot(np.mean(sample_rewards[1], axis=0), label='Thompson')
    plt.plot(np.mean(sample_rewards[2], axis=0), label='UCB')
    plt.legend()
    plt.xlabel('Steps')
    plt.ylabel('Mean Reward')
    plt.show()


if __name__ == '__main__':
    probs = [0.1, 0.2, 0.4, 0.6, 0.8, 0.9]
    args = parser.parse_args()
    bandit = BetaBernBandit(probs, len(probs))
    greedy = GreedyPolicy(eps=0.05, bandit=bandit)
    thompson = ThompsonSampling(bandit=bandit)
    ucb = UCB(bandit=bandit)
    simulate_multi([greedy, thompson, ucb], args)
Example 11
import gym

from policy import GreedyPolicy
from policy import UniformRandomPolicy
from memhelpers import NNMemStore
# AtariProcessor and HistoryStore come from the project's own preprocessing
# helpers (their imports are omitted in the original snippet).
IMAGE_SIZE = (84, 84)
HISTORY_LENGTH = 4

MEM_SIZE = 2000
INIT_MEM_RATIO = 0.5

env = gym.make('BreakoutDeterministic-v0')
observation = env.reset()
num_actions = env.action_space.n

atari_processor = AtariProcessor(IMAGE_SIZE)
history_store = HistoryStore(HISTORY_LENGTH, IMAGE_SIZE)
greedy_selector = GreedyPolicy()
random_selector = UniformRandomPolicy(num_actions)
episode_end_flag = False
mem_store = NNMemStore(MEM_SIZE, (84, 84, 4))
observation = env.reset()
state = atari_processor.state_for_mem(observation)
history_store.add_history(state)
i = 0
life = False
first_step = True
# take random actions until the episode ends
while not episode_end_flag:
    nn_input = history_store.get_history()
    action = random_selector.select_action()
    observation, reward, done, info = env.step(action)
    episode_end_flag = done
    state = atari_processor.state_for_mem(observation)
Example 12
def __init__(self, approx):
    # act greedily with respect to the supplied value approximator
    Agent.__init__(self, GreedyPolicy())
    self.approx = approx