def main():
    env = gym.make('Taxi-v2')
    num_episodes = 20000

    ## Sarsa
    agent = Agent(method='Sarsa')
    sarsa_avg_rewards, sarsa_best_avg_reward = interact(
        env, agent, num_episodes=num_episodes)
    plot_performance(num_episodes, sarsa_avg_rewards, "Sarsa", disp_plot=True)

    ## Expected Sarsa
    agent = Agent(method='Expected Sarsa')
    exp_sarsa_avg_rewards, exp_sarsa_best_avg_reward = interact(
        env, agent, num_episodes=num_episodes)
    plot_performance(num_episodes, exp_sarsa_avg_rewards, "Expected Sarsa",
                     disp_plot=True)

    ## Q-Learning
    agent = Agent(method='Q-Learning')
    sarsamax_avg_rewards, sarsamax_best_avg_reward = interact(
        env, agent, num_episodes=num_episodes)
    plot_performance(num_episodes, sarsamax_avg_rewards,
                     "Sarsamax (Q-Learning)", disp_plot=True)

    ## All performances
    plot_all_performances(
        num_episodes,
        [sarsa_avg_rewards, exp_sarsa_avg_rewards, sarsamax_avg_rewards],
        title="Comparison of Temporal Difference control methods")
def objective(args):
    env = gym.make(f'Taxi-{c_args.taxi_version}')
    best_scores = []
    for i in range(c_args.n_iters):
        agent = Agent(algorithm=c_args.algo,
                      alpha=args[0],
                      start_epsilon=args[1],
                      epsilon_decay=args[2],
                      epsilon_cut=None if args[3][0] is None else args[3][1],
                      gamma=args[4])
        avg_rewards, best_avg_reward = interact(env, agent, print_logs=False)
        best_scores.append(best_avg_reward)
    # the tuner minimizes this value, so return the negative mean best score
    return -sum(best_scores) / len(best_scores)
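# A possible driver for the objective above, assuming the hyperopt library
# (the list-style `args` indexing matches a list search space). The parameter
# names and bounds below are illustrative assumptions, not from the source.
from hyperopt import fmin, hp, tpe

space = [
    hp.uniform('alpha', 0.01, 1.0),              # args[0]
    hp.uniform('start_epsilon', 0.5, 1.0),       # args[1]
    hp.uniform('epsilon_decay', 0.9, 0.99999),   # args[2]
    hp.choice('epsilon_cut', [(None,), ('cut', 0.01)]),  # args[3]
    hp.uniform('gamma', 0.8, 1.0),               # args[4]
]
best = fmin(objective, space, algo=tpe.suggest, max_evals=50)
print(best)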
import gym
from ddpg_agent import Agent
import numpy as np
from unityagents import UnityEnvironment
import matplotlib.pyplot as plt
from monitor import interact

env = UnityEnvironment(file_name="./Reacher_Linux_NoVis/Reacher.x86_64")

# reset env and extract state_dim and action_dim
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=True)[brain_name]
state_dim = env_info.vector_observations.shape[1]
action_dim = brain.vector_action_space_size

# number of agents
num_agents = len(env_info.agents)

agent = Agent(state_dim=state_dim, action_dim=action_dim,
              num_agents=num_agents, seed=np.random.randint(100))
scores = interact(env, brain_name, agent, num_agents)

# # plot the scores
# fig = plt.figure()
# ax = fig.add_subplot(111)
# plt.plot(np.arange(len(scores)), scores)
# plt.ylabel('Score')
# plt.xlabel('Episode #')
# plt.show()
# -1: lose one point for every step taken
# -10: dropping the passenger off at the wrong location
env = gym.make('Taxi-v2')

## First, inspect the environment
action_size = env.action_space.n
state_size = env.observation_space.n
print('Number of states: ', state_size)
print('Available actions: ', action_size)

agent = Agent()

# set hyperparameters
total_episodes = 20000

avg_rewards, best_avg_reward = interact(env, agent, total_episodes)

#Q = agent.Q

# ## Test
## fix a specific state
#env.reset()
#state = 122

## visualize
#env.render()
#print('==== start position ======')
#
#while True:
#    # pick the action with the highest Q-value -- (A0)
#    action = np.argmax(Q[state])
#    # take A0 to obtain R1, S1
from itertools import product

from agent import Agent
from monitor import interact
import gym
import numpy as np

env = gym.make('Taxi-v3')

epss = (0.0, 0.1, 0.2)
alphas = (0.05, 0.1, 0.2)
rewards = dict()
rewards_over_time = dict()

products = list(product(epss, alphas))
for i, (eps, alpha) in enumerate(products):
    print(f'{i}/{len(products)}: eps: {eps}, alpha: {alpha}')
    agent = Agent(eps=eps, alpha=alpha)
    avg_rewards, best_avg_reward = interact(env, agent, num_episodes=20_000)
    rewards[(eps, alpha)] = best_avg_reward
    rewards_over_time[(eps, alpha)] = avg_rewards
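# A minimal follow-up sketch: pick the (eps, alpha) pair with the highest
# best average reward from the sweep above.
best_eps, best_alpha = max(rewards, key=rewards.get)
print(f'best eps: {best_eps}, best alpha: {best_alpha}, '
      f'reward: {rewards[(best_eps, best_alpha)]}')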
import gym
from nav_agent import Agent
import numpy as np
from unityagents import UnityEnvironment
import matplotlib.pyplot as plt
from monitor import interact

env = UnityEnvironment(file_name="./Banana_Linux_NoVis/Banana.x86_64")

# reset env and extract state_dim and action_dim
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=True)[brain_name]
state_dim = len(env_info.vector_observations[0])
action_dim = brain.vector_action_space_size

agent = Agent(state_dim=state_dim, action_dim=action_dim, seed=0)
scores = interact(env, brain_name, agent)

# # plot the scores
# fig = plt.figure()
# ax = fig.add_subplot(111)
# plt.plot(np.arange(len(scores)), scores)
# plt.ylabel('Score')
# plt.xlabel('Episode #')
# plt.show()
def fitness_function(individual, parameter):
    # `parameter` is unused but kept to match the expected callback signature
    agent = Agent(epsilon=individual['epsilon'],
                  gamma=individual['gamma'],
                  epsilonreducer=individual['epsilonreducer'])
    avg_rewards, best_avg_reward = interact(env, agent)
    return best_avg_reward
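# A minimal hand-rolled evaluation loop for dict-style individuals, in case
# no GA framework is wired up (parameter ranges are illustrative assumptions).
import random

population = [{'epsilon': random.uniform(0.001, 0.1),
               'gamma': random.uniform(0.8, 1.0),
               'epsilonreducer': random.uniform(0.9, 0.99999)}
              for _ in range(10)]
scored = [(fitness_function(ind, None), ind) for ind in population]
best_score, best_individual = max(scored, key=lambda t: t[0])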
def objective(args):
    env = gym.make('Taxi-v2')
    agent = Agent(alpha=args['alpha'])
    avg_rewards, best_avg_reward = interact(env, agent)
    # negate: the tuner minimizes, but we want to maximize the reward
    return -1 * best_avg_reward
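# A possible driver, assuming hyperopt with a dict-style search space
# (the `args['alpha']` lookup above matches one). Bounds are assumptions.
from hyperopt import fmin, hp, tpe

best = fmin(objective,
            space={'alpha': hp.uniform('alpha', 0.01, 1.0)},
            algo=tpe.suggest,
            max_evals=50)
print(best)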
from agent import Agent
from monitor import interact
import gym
import numpy as np

env = gym.make('Taxi-v2')
agent = Agent()
avg_rewards, best_avg_reward = interact(env, agent)
from agent import Agent
from monitor import interact
from tqdm.auto import tqdm
import gym
import numpy as np

env = gym.make('Taxi-v3')

## Q-Learning
agent = Agent(alpha=0.1, gamma=0.99)
avg_rewards, best_avg_reward = interact(env, agent, is_qlearning=True)

## Expected Sarsa
# agent = Agent(epsilon=0.001, alpha=0.1, gamma=0.99)
# avg_rewards, best_avg_reward = interact(env, agent, is_qlearning=False)
from agent import Agent
from monitor import interact
import gym
import numpy as np

env = gym.make('Taxi-v2')
agent = Agent()
avg_rewards, best_avg_reward, samp_rewards = interact(env, agent)
from agent import Agent
from monitor import interact
import gym
import numpy as np

env = gym.make('Taxi-v2')
agent = Agent()
avg_rewards, best_avg_reward = interact(env, agent, num_episodes=100000,
                                        window=100)
from agent import Agent
from monitor import interact
import gym
import numpy as np
import time
import datetime

env = gym.make('Taxi-v2')
agent = Agent()

# set timer
tick = time.time()
avg_rewards, best_avg_reward, scores = interact(env, agent, 100000)
tock = time.time()
elapsed = tock - tick
print(str(datetime.timedelta(seconds=elapsed)))

for score in scores:
    print(score)
def interact_wrapper(epsilon, alpha, gamma):
    agent = Agent(epsilon=epsilon, alpha=alpha, gamma=gamma)
    avg_rewards, best_avg_reward = interact(env, agent)
    return best_avg_reward
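# A possible consumer, assuming the bayes-opt library: it maximizes the raw
# return value, which is why the wrapper does not negate the reward.
# The bounds below are illustrative assumptions.
from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(
    f=interact_wrapper,
    pbounds={'epsilon': (0.001, 0.1),
             'alpha': (0.01, 1.0),
             'gamma': (0.8, 1.0)},
    random_state=1)
optimizer.maximize(init_points=5, n_iter=25)
print(optimizer.max)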
from agent import Agent
from monitor import interact
import gym
import numpy as np

env = gym.make('Taxi-v2')
agent = Agent()
avg_rewards, best_avg_reward = interact(env, agent, num_episodes=50000)
from agent import Agent
from monitor import interact
import gym
import numpy as np

alpha_values = [0.5, 0.1, 0.05, 0.01]
num_episode_values = [20000]
gamma_values = [1, 0.8, 0.5]
epsilon_divisor_values = [4, 40, 400]

env = gym.make('Taxi-v2')

for alpha in alpha_values:
    for gamma in gamma_values:
        for epsilon_divisor in epsilon_divisor_values:
            for num_episode in num_episode_values:
                # create a fresh agent per configuration so learned Q-values
                # do not leak between runs
                agent = Agent()
                avg_rewards, best_avg_reward = interact(
                    env, agent, num_episode, 100, alpha, gamma,
                    epsilon_divisor)
                print(
                    'alpha: {}, gamma: {}, epsilon_divisor: {}, '
                    'num_episode: {}, reward: {}'.format(
                        alpha, gamma, epsilon_divisor, num_episode,
                        best_avg_reward))
only_KG = True

data = pd.DataFrame([], columns=['Agent', 'episode', 'reward'])
if only_KG:
    data_agent = pd.DataFrame(
        [], columns=['Agent', 'episode', 'greedy', 'mu', 'nu'])

n_episodes = 100000
for i in range(len(agent_classes)):
    agent = agent_classes[i](env)
    agent_name = agent_names[i]
    avg_rewards, best_avg_reward = interact(env, agent,
                                            num_episodes=n_episodes,
                                            window=n_episodes)

    data_new = pd.DataFrame(list(avg_rewards), columns=['reward'])
    data_new.loc[:, 'episode'] = range(0, len(list(avg_rewards)))
    data_new.loc[:, 'Agent'] = agent_name
    # DataFrame.append was removed in pandas 2.0; use pd.concat instead
    data = pd.concat([data, data_new], ignore_index=True)

    if only_KG:
        data_agent_new = pd.DataFrame(agent.greedy_choice, columns=['greedy'])
        data_agent_new.loc[:, 'episode'] = range(0, len(agent.greedy_choice))
        data_agent_new.loc[:, 'Agent'] = agent_name
        mu = [x[0] for x in agent.mu_vs_nu]
        nu = [x[1] for x in agent.mu_vs_nu]
        data_agent_new.loc[:, 'mu'] = mu
        data_agent_new.loc[:, 'nu'] = nu
        data_agent = pd.concat([data_agent, data_agent_new],
                               ignore_index=True)
def interact_function(epsilon, gamma, epsilonreducer):
    agent = Agent(epsilon=epsilon, gamma=gamma, epsilonreducer=epsilonreducer)
    avg_rewards, best_avg_reward = interact(env, agent)
    # negated because the optimizer minimizes, and we want to maximize this
    return -(best_avg_reward)
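# A possible consumer, assuming scikit-optimize: gp_minimize minimizes its
# objective, hence the negation above. Bounds are illustrative assumptions.
from skopt import gp_minimize
from skopt.space import Real

result = gp_minimize(
    lambda params: interact_function(*params),  # unpack the flat list
    dimensions=[Real(0.001, 0.2, name='epsilon'),
                Real(0.8, 1.0, name='gamma'),
                Real(0.9, 0.99999, name='epsilonreducer')],
    n_calls=30)
print(result.x, -result.fun)  # best parameters and best (un-negated) reward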
def interact_wrapper(decay_rate, alpha, gamma):
    agent = Agent(decay_rate, alpha, gamma)
    avg_rewards, best_avg_reward = interact(env, agent, 15000)
    return best_avg_reward
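# A minimal random-search driver for the wrapper above (bounds are
# illustrative assumptions; a sketch, not the source's tuning method).
import random

best_score, best_params = float('-inf'), None
for _ in range(20):
    params = (random.uniform(0.9, 0.99999),  # decay_rate
              random.uniform(0.01, 1.0),     # alpha
              random.uniform(0.8, 1.0))      # gamma
    score = interact_wrapper(*params)
    if score > best_score:
        best_score, best_params = score, params
print(best_params, best_score)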
import gym
from ma_ddgp_agent import maddpgagent
import numpy as np
from unityagents import UnityEnvironment
import matplotlib.pyplot as plt
from monitor import interact

env = UnityEnvironment(file_name="Tennis_Linux_NoVis/Tennis.x86_64", seed=0)

# reset env and extract state_dim and action_dim
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=True)[brain_name]
state_dim = env_info.vector_observations.shape[1]
action_dim = brain.vector_action_space_size

# number of agents
num_agents = len(env_info.agents)

agent = maddpgagent(state_dim=state_dim, action_dim=action_dim,
                    num_agents=num_agents, seed=0)
scores = interact(env, state_dim, brain_name, agent, num_agents)
def main():
    env = gym.make('Taxi-v3')
    agent = Agent()
    avg_rewards, best_avg_reward = interact(env, agent)
# Set seeds based on local seed and run sequence number
random.seed(i + local_seed)
np.random.seed(100 * i + local_seed)
env.seed(10000 * i + local_seed)
env.action_space.seed(1000000 * i + local_seed)

# Run the learning problem
agent = Agent(alpha=alpha, gamma=gamma, get_epsilon=epfunc,
              c1=c1, c2=c2, beta=beta)
avg_rewards, best_avg_reward = interact(env, agent, n_episodes,
                                        show_progress=10000, endline='\n')
best_avg_rewards.append(best_avg_reward)

# Monitor results after each run
print("\rRun {}/{}, average so far={}".format(
    i, nruns, sum(best_avg_rewards) / len(best_avg_rewards)))

print('\nLocal seed: ', local_seed)
print('Average: ', sum(best_avg_rewards) / len(best_avg_rewards))
print('Median: ', sorted(best_avg_rewards)[medsub])
np.array(sorted(best_avg_rewards))
# -*- coding: utf-8 -*-
"""
Created on Sun May 3 13:23:33 2020

@author: Srimanth Tenneti
"""
from agent import Agent
from monitor import interact
import gym
import numpy as np
import matplotlib.pyplot as plt

env = gym.make('Taxi-v3')  # Load the environment
agent = Agent()  # Create an agent instance
avg_rewards, best_avg_reward = interact(env, agent)  # Train the agent
def run(params):
    alpha, epsilon, gamma = params
    env = gym.make('Taxi-v2')
    agent = Agent(epsilon=epsilon, alpha=alpha, gamma=gamma)
    avg_rewards, best_avg_reward = interact(env, agent, 10000)
    return best_avg_reward
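# Because run() takes a single tuple, it maps cleanly over a parameter grid,
# e.g. in parallel with multiprocessing (grid values below are illustrative
# assumptions, not from the source).
from itertools import product
from multiprocessing import Pool

grid = list(product([0.05, 0.1, 0.2],  # alpha
                    [0.01, 0.1],       # epsilon
                    [0.9, 1.0]))       # gamma
with Pool() as pool:
    scores = pool.map(run, grid)
best_score, best_params = max(zip(scores, grid))
print(best_params, best_score)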
def save_obj(obj, name):
    with open('obj/' + name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def load_obj(name):
    with open('obj/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)


env = gym.make('Taxi-v2')
num_episodes = 10000
alphas = np.arange(0.1, 1, 0.1)
epsilons = np.arange(0.1, 1, 0.1)
algo_types = [AlgorithmType.QLearning, AlgorithmType.ExpectedSarsa]

# experiment_tag = 'alpha-.1-1-.1-eps-.1-1-.1'
# avg_rewards = defaultdict(dict)
# best_avg_rewards = defaultdict(dict)
# for t in algo_types:
#     for i, a in enumerate(alphas):
#         for e in epsilons:
#             avg_rewards[a][e], best_avg_rewards[a][e] = interact(
#                 env, Agent(algorithm_type=t, epsilon=e, alpha=a),
#                 num_episodes=num_episodes)
#             if i == len(alphas)-1:
#                 save_obj(avg_rewards, 'avg_rewards-' + experiment_tag + '-' + t.name)
#                 save_obj(best_avg_rewards, 'best_avg_rewards-' + experiment_tag + '-' + t.name)

avg_rewards, best_avg_rewards = interact(env, Agent(),
                                         num_episodes=num_episodes)
from agent import Agent
from monitor import interact
import gym
import numpy as np

env = gym.make('Taxi-v3')
agent = Agent()

avg_rewards, best_avg_reward = interact(env, agent)
#avg_rewards, best_avg_reward = interact(env, agent, 'SARSA_MAX')
#avg_rewards, best_avg_reward = interact(env, agent, 'EXPECTED_SARSA')
parser.add_argument("--port", default=52162) # Pass args args = parser.parse_args() if __name__ == '__main__': # Create environment env = gym.make('LunarLander-v2') env.seed(0) # Instantiate agent agent = Agent( state_size=8, # Box(-inf, inf, (8,), float32) action_size=4, # Discrete(4) buffer_size=args.buffer_size, batch_size=args.batch_size, gamma=args.gamma, tau=args.tau, lr=args.lr, update_every=args.update_every) # Interact with environment scores = interact(env, agent, n_episodes=args.n_episodes, max_t=args.max_t, eps_start=args.eps_start, eps_end=args.eps_end, eps_decay=args.eps_decay)