def greedy_policy():
    # Agent chooses by the greedy policy.
    # Assumes module-level epsilons, num_sessions, num_trials, and n.
    rewards = np.zeros((len(epsilons), num_sessions, num_trials))
    num_best = np.zeros((len(epsilons), num_sessions, num_trials))
    for i in range(len(epsilons)):
        policy = GreedyPolicy()
        bandit = GaussianBandit(n)
        agent = Agent(n, policy, num_trials)
        env = Environment(bandit, agent, num_trials, num_sessions)
        rewards[i, :, :], num_best[i, :, :] = env.run()

        ave_reward = rewards[i, :, :].mean(axis=0)
        plt.plot(ave_reward)
        plt.title("Average Reward")
        plt.xlabel('Trial')
        plt.ylabel('Reward')
        plt.show()

        ave_percent_best = num_best[i, :, :].mean(axis=0)
        plt.plot(ave_percent_best)
        plt.title("Average Percent Best Option")
        plt.xlabel('Trial')
        plt.ylabel('Percent Best Option')
        plt.show()
def play(self, env):
    """Perform a greedy action (play the best move according to the agent)."""
    av = self.get_av_pairs(env)
    a = GreedyPolicy().choose_action(av)[0]
    r, s_n = env.perform_action(a)
    return a, r, s_n
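# GreedyPolicy above comes from the surrounding project and its definition is
# not shown. A minimal, self-contained sketch consistent with the
# choose_action(av) call; the (action, value) pair format is an assumption:

import numpy as np

class GreedyPolicy(object):
    """Hypothetical sketch: pick the action with the highest estimated value."""

    def choose_action(self, av_pairs):
        # av_pairs is assumed to be a sequence of (action, value) tuples as
        # returned by agent.get_av_pairs(env); return the best pair, so that
        # choose_action(av)[0] yields the greedy action.
        values = np.array([v for _, v in av_pairs])
        return av_pairs[int(np.argmax(values))]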
def play(model_fn, color, start_board=None, sim=False, depth=3):
    env = Environment(start_board)
    pol = GreedyPolicy()
    with tf.Session() as sess:
        if model_fn is not None:
            saver = tf.train.import_meta_graph(model_fn + '.meta')
            saver.restore(sess, model_fn)
        approx = Approximator(sess)
        a = [None, None]
        if sim:
            a[int(color)] = tdstem.TDStemPlayAgent(approx, depth=depth)
            a[int(not color)] = opt.OptimalAgent()
        else:
            a[int(not color)] = tdstem.TDStemPlayAgent(approx, depth=depth)
            a[int(color)] = tdstem.TDStemPlayAgent(approx, depth=depth)
        oa = opt.OptimalAgent()
        flag = False
        name = input("What's your name? ")
        print("Let's play a game, %s!" % name)
        while not flag:
            time.sleep(2)
            env.draw()
            print('DTM: {}'.format(np.abs(
                oa.approx.tb.probe_dtm(chess.Board.from_epd(env.current_state)[0]))))
            if env.is_game_over():
                print(env.result())
                flag = True
            else:
                print('Evaluation: {}'.format(a[int(color)].get_av_pairs(env)))
                print('Optimal moves: {}'.format(oa.get_best_moves(env)))
                start = time.time()
                if env.get_turn() == color:
                    if sim:
                        a[int(color)].play(env)
                    else:
                        suc = False
                        while not suc:
                            m = input('YOUR MOVE: ')
                            try:
                                env.perform_action(m)
                                suc = True
                            except Exception:
                                # Keep prompting until a legal move is entered.
                                print('Illegal move, try again.')
                else:
                    a[int(not color)].play(env)
def test_warehouse_03():
    env = make_test_warehouse_env_01()
    expected_value = None
    policy = GreedyPolicy()
    we.execute(env, policy)
    print('**' * 30)
    print('[Result]')
    print('Finish time clock value=', env.finish_time_clock,
          ':uncompleted orders=', len(env.available_orders))
def test_warehouse_greedy(order_count):
    start = time.time()
    env = make_test_warehouse_env(order_count)  # based on 60 orders
    expected_value = None
    policy = GreedyPolicy()
    we.execute(env, policy)
    print('**' * 30)
    print('[Result] GreedyPolicy')
    print('Finish time clock value=', env.finish_time_clock,
          ':uncompleted orders=', len(env.available_orders))
    end = time.time()
    print('time', (end - start))
def __init__(self, policy=GreedyPolicy()):
    Agent.__init__(self, policy)
    self.approx = OptimalApproximator()
def temp_replace_policy(self):
    if self.run_type is RunType.RAND_FILL:
        self.agent.currently_used_policy = RandomPolicy()
    if self.run_type is RunType.TEST:
        self.agent.currently_used_policy = GreedyPolicy()
import data_utils
import numpy as np
import matplotlib.pyplot as plt
from mdp import MDP
from rewards import reward_func_linear  # call it with stats to initialize
from env import Env
from q_learning import QLearningAlgo
from policy import EpsilonGreedyPolicy, GreedyPolicy, RandomPolicy

data = data_utils.Data(n=15)
mdp = MDP(data=data)
reward_func = reward_func_linear(data.statistics, verbose=False)
env = Env(reward_func=reward_func, mode='human')

# policy = EpsilonGreedyPolicy(action_space=mdp.action_space)
policy = RandomPolicy(action_space=mdp.action_space)
test_policy = GreedyPolicy(action_space=mdp.action_space)

algo = QLearningAlgo(env=env, mdp=mdp, policy=policy, discount=0.2)
algo.set_mode('train')
algo.fit(mode='train', epochs=4, remember=True)

algo.set_mode('test')
algo.test(mode='test', policy=test_policy)

algo.replay(batch_size=16, epochs=8)
algo.set_mode('test')
algo.test(mode='test', policy=test_policy)
# algo.test(mode='human', policy=test_policy)
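# The policy module imported above is not shown. A plausible minimal sketch of
# the GreedyPolicy(action_space=...) contract used as the test policy; the
# select_action name and q_values argument are assumptions, not the project's
# confirmed API:

import numpy as np

class GreedyPolicy(object):
    def __init__(self, action_space):
        self.action_space = action_space

    def select_action(self, q_values):
        # Exploit only: take the action whose Q-value is largest for the
        # current state (ties resolve to the lowest index via argmax).
        return self.action_space[int(np.argmax(q_values))]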
def main(N_bandits=10, epsilon=0.1):
    # =========================
    # Settings
    # =========================
    alpha = 1
    tao = 0.1
    # bandit probabilities of success
    bandit_probs = np.ones(N_bandits) - 0.5
    bandit_probs[1:] = bandit_probs[1:] - epsilon
    simulations = 100     # number of simulations to perform
    trials = 100000       # number of episodes per experiment
    save_fig = True       # if False -> plot, if True -> save as file in same directory
    save_format = ".pdf"  # ".pdf" or ".png"

    # =========================
    # Start multi-armed bandit simulation
    # =========================
    print("Running multi-armed bandits with N_bandits = {} and agent epsilon = {}"
          .format(N_bandits, epsilon))
    reward_history_avg = np.zeros((trials, 4))  # reward history, simulation-averaged
    # action_history_sum = np.zeros((trials, N_bandits))  # sum action history
    # regret history, simulation-averaged; 4 is the number of agents
    regret_history_avg = np.zeros((trials, 4))

    for i in range(simulations):
        bandit = MultiArmedBandit(bandit_probs)  # initialize bandits
        agents = [
            Agent(bandit, GreedyPolicy(epsilon)),  # epsilon-greedy
            Agent(bandit, SoftMaxPolicy(tao)),     # softmax
            Agent(bandit, UCBPolicy(alpha)),       # UCB1
            BetaAgent(bandit, GreedyPolicy(0)),    # Thompson sampling
        ]
        for a, agent in enumerate(agents):
            # perform experiment
            (action_history, reward_history,
             regret_history) = Environment(agent, bandit, trials, epsilon)
            if (i + 1) % (simulations / 20) == 0:
                print("Agent = {}".format(a + 1))
                print("[Experiment {}/{}]".format(i + 1, simulations))
                print("  bandit choice history = {}".format(action_history + 1))
                print("  average reward = {}".format(
                    np.sum(reward_history) / len(reward_history)))
                print("  cumulative regret = {}".format(
                    np.sum(regret_history) / len(regret_history)))
                print("")
            # Sum up experiment reward (later divided to represent an average)
            reward_history_avg[:, a] += reward_history
            regret_history_avg[:, a] += regret_history
        # # Sum up action history
        # for j, a in enumerate(action_history):
        #     action_history_sum[j][a] += 1

    reward_history_avg /= float(simulations)
    regret_history_avg /= float(simulations)

    # =========================
    # Plot regret history results
    # =========================
    plt.close()
    plt.plot(regret_history_avg)
    plt.legend(['Greedy', 'SoftMax', 'UCB1', 'TS'], loc='upper left')
    plt.xlabel("Episode number")
    plt.ylabel("Regret accumulated")
    plt.title("Bandit regret history averaged, number of arms = {}, epsilon = {}"
              .format(N_bandits, epsilon))
    ax = plt.gca()
    ax.set_xscale("log", nonpositive='clip')
    plt.xlim([1, trials])
    if save_fig:
        output_file = ("results/regrets_" + str(N_bandits) + "_" +
                       str(epsilon) + save_format)
        plt.savefig(output_file, bbox_inches="tight")
    else:
        plt.show()
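# In main() above, GreedyPolicy takes an epsilon argument, so it acts as an
# epsilon-greedy policy, and GreedyPolicy(0) degenerates to pure exploitation
# for the Thompson-sampling agent. A minimal sketch of that contract; the
# choose() name and agent.value_estimates attribute are assumptions:

import numpy as np

class GreedyPolicy(object):
    def __init__(self, epsilon=0.0):
        self.epsilon = epsilon

    def choose(self, agent):
        # With probability epsilon explore a uniformly random arm; otherwise
        # exploit the agent's current value estimates.
        if np.random.random() < self.epsilon:
            return np.random.randint(len(agent.value_estimates))
        return int(np.argmax(agent.value_estimates))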
parser.add_argument('--num-steps', type=int, default=1000)
parser.add_argument('--num-exp', type=int, default=10000)


def simulate_multi(policies, args):
    sample_rewards = []
    for p in policies:
        sample_reward = []
        for i in range(args.num_exp):
            if i % 20 == 0:
                print(i)
            sample_reward.append(p.run(args.num_steps))
        sample_rewards.append(sample_reward)
    plt.plot(np.mean(sample_rewards[0], axis=0), label='Greedy')
    plt.plot(np.mean(sample_rewards[1], axis=0), label='Thompson')
    plt.plot(np.mean(sample_rewards[2], axis=0), label='UCB')
    plt.legend()
    plt.xlabel('Steps')
    plt.ylabel('Mean Reward')
    plt.show()


if __name__ == '__main__':
    probs = [0.1, 0.2, 0.4, 0.6, 0.8, 0.9]
    args = parser.parse_args()
    bandit = BetaBernBandit(probs, len(probs))
    greedy = GreedyPolicy(eps=0.05, bandit=bandit)
    thompson = ThompsonSampling(bandit=bandit)
    ucb = UCB(bandit=bandit)
    simulate_multi([greedy, thompson, ucb], args)
from policy import GreedyPolicy
from policy import UniformRandomPolicy
from memhelpers import NNMemStore

IMAGE_SIZE = (84, 84)
HISTORY_LENGTH = 4
MEM_SIZE = 2000
INIT_MEM_RATIO = 0.5

env = gym.make('BreakoutDeterministic-v0')
observation = env.reset()
num_actions = env.action_space.n
atari_processor = AtariProcessor(IMAGE_SIZE)
history_store = HistoryStore(HISTORY_LENGTH, IMAGE_SIZE)
greedy_selector = GreedyPolicy()
random_selector = UniformRandomPolicy(num_actions)
episode_end_flag = False
mem_store = NNMemStore(MEM_SIZE, (84, 84, 4))

observation = env.reset()
state = atari_processor.state_for_mem(observation)
history_store.add_history(state)
i = 0
life = False
first_step = True
while not episode_end_flag:
    nn_input = history_store.get_history()
    action = random_selector.select_action()
    observation, reward, done, info = env.step(action)
    episode_end_flag = done
    state = atari_processor.state_for_mem(observation)
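# The while-loop above only warms up the replay memory with random actions.
# A hedged sketch of how the loop body might continue; mem_store.add, the
# q_values input, and the greedy handoff are assumptions modeled on common
# DQN warm-up code, not this project's confirmed API:
#
#     history_store.add_history(state)
#     mem_store.add((nn_input, action, reward, done))  # hypothetical signature
#     i += 1
#     if i >= MEM_SIZE * INIT_MEM_RATIO:
#         # switch from random warm-up to greedy selection over the
#         # network's Q-values for the current history stack
#         action = greedy_selector.select_action(q_values)
#     if done:
#         observation = env.reset()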
def __init__(self, approx):
    Agent.__init__(self, GreedyPolicy())
    self.approx = approx
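# Both __init__ overrides above delegate to an Agent base class that is not
# shown. A minimal sketch of the contract they rely on; the attribute name
# is an assumption:

class Agent(object):
    def __init__(self, policy):
        # Store the decision policy; subclasses attach their own value
        # approximator (self.approx) on top of this.
        self.policy = policy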