def main():
    parser = argparse.ArgumentParser(description=None)
    parser.add_argument('-b', '--base-dir', default='blackjack-1',
                        help='Set base dir.')
    parser.add_argument('-v', '--verbose', action='count', dest='verbosity',
                        default=0, help='Set verbosity.')
    args = parser.parse_args()

    if args.verbosity == 0:
        logger.setLevel(logging.INFO)
    elif args.verbosity >= 1:
        logger.setLevel(logging.DEBUG)

    num_episodes = 100000
    epsilon_decay = 8000
    policy, Q = learn(args.base_dir, num_episodes, epsilon_decay)
    final_average_return = score(policy)
    logger.info("final average returns: {}".format(final_average_return))
    plot_policy(
        policy,
        "diag_{}_{}_{}.png".format(num_episodes, epsilon_decay,
                                   final_average_return))
    return 0
def __init__(self, **options):
    self.options = self.defaults.copy()
    # self.options.update(options)
    self.world = World()
    self.players = {}
    self.live_players = 0
    self.turn = 0
    self.turn_order = []
    self.remaining = {}
    self.drafting = False
    self.finished = False
    # todo if log ....

    # Silence gym logging at level 40 (ERROR); the call differs by gym version.
    from gym import __version__ as gym_v
    if gym_v == "0.7.4" or gym_v == "0.7.3":
        logger.setLevel(40)
    else:
        # newer gym versions use set_level
        logger.set_level(40)
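# A small feature-detection sketch (an assumption, not part of the original
# class): instead of comparing exact gym version strings, the call can be
# chosen by checking what the logger object actually provides. This assumes a
# gym version that exposes `gym.logger`; 40 corresponds to the ERROR level in
# both gym and the stdlib logging module.
from gym import logger as gym_logger

if hasattr(gym_logger, 'set_level'):
    gym_logger.set_level(40)     # newer gym API
elif hasattr(gym_logger, 'setLevel'):
    gym_logger.setLevel(40)      # older gym API backed by a logging.Logger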
# the number of trials without falling over
win_trials = 10

# CartPole-v0 is considered solved if, for 100 consecutive trials,
# the cart pole has not fallen over and it has achieved an average
# reward of 195.0
# a reward of +1 is provided for every timestep the pole remains upright
win_reward = {'CartPole-v0': 195.0}

# stores the reward per episode
scores = deque(maxlen=win_trials)

logger.setLevel(logging.ERROR)

env = gym.make(args.env_id)
outdir = "/tmp/dqn-%s" % args.env_id
if args.ddqn:
    outdir = "/tmp/ddqn-%s" % args.env_id
if args.no_render:
    env = wrappers.Monitor(env, directory=outdir,
                           video_callable=False, force=True)
else:
    env = wrappers.Monitor(env, directory=outdir, force=True)
env.seed(0)
############################################
parser.add_argument('env_id', nargs='?', default='MountainCar-v0',
                    help='Select the environment to run')
args = parser.parse_args()

logger = logging.getLogger()
formatter = logging.Formatter('[%(asctime)s] %(message)s')
handler = logging.StreamHandler(sys.stderr)
handler.setFormatter(formatter)
logger.addHandler(handler)

# You can set the level to logging.DEBUG or logging.WARN if you
# want to change the amount of output.
logger.setLevel(logging.INFO)

env = gym.make(args.env_id)
outdir = '/tmp/' + 'qagent' + '-results'
env = wrappers.Monitor(env, outdir, write_upon_reset=True, force=True)
env.seed(0)

############################################
# CS482: This initial Q-table size should
# change to fit the number of actions (columns)
# and the number of observations (rows)
############################################
Q = np.zeros([300, env.action_space.n])
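# A minimal sketch (not part of the original) of how the 300-row table above
# might be indexed for MountainCar-v0: the observation (position, velocity) is
# continuous, so each dimension is bucketed into bins and the bin indices are
# flattened into a single row number. The helper name `obs_to_row` and the bin
# counts are assumptions, chosen so that 20 * 15 = 300 matches Q's row count.
POS_BINS, VEL_BINS = 20, 15

def obs_to_row(obs, low, high):
    """Map a continuous (position, velocity) observation to a Q-table row."""
    pos_idx = int(np.digitize(obs[0], np.linspace(low[0], high[0], POS_BINS - 1)))
    vel_idx = int(np.digitize(obs[1], np.linspace(low[1], high[1], VEL_BINS - 1)))
    return pos_idx * VEL_BINS + vel_idx

# usage with the environment created above:
# row = obs_to_row(env.reset(), env.observation_space.low, env.observation_space.high)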
import argparse
import logging
import sys
import time

# from evostra import EvolutionStrategy
from pytorch_es import EvolutionModule
from pytorch_es.utils.helpers import weights_init
import gym
from gym import logger as gym_logger
import numpy as np
from PIL import Image
import torch
from torch.autograd import Variable
import torch.nn as nn
import torchvision
from torchvision import transforms

gym_logger.setLevel(logging.CRITICAL)

parser = argparse.ArgumentParser()
parser.add_argument('-w', '--weights_path', type=str, required=True,
                    help='Path to save final weights')
parser.add_argument('-c', '--cuda', action='store_true',
                    help='Whether or not to use CUDA')
parser.set_defaults(cuda=False)
args = parser.parse_args()
"--slippery", help=help_, action='store_true') help_ = "Exploration only. For baseline." parser.add_argument("-e", "--explore", help=help_, action='store_true') help_ = "Sec of time delay in UI. Useful for viz in demo mode." parser.add_argument("-t", "--delay", help=help_, type=int) args = parser.parse_args() logger.setLevel(logger.INFO) # instantiate a gym environment (FrozenLake-v0) env = gym.make(args.env_id) # debug dir outdir = "/tmp/q-learning-%s" % args.env_id env = wrappers.Monitor(env, directory=outdir, force=True) env.seed(0) if not args.slippery: env.is_slippery = False if args.delay is not None: delay = args.delay else: delay = 0
import gym
import gym.wrappers
import gym.envs
import gym.spaces
import traceback
import logging

try:
    from gym import logger as gym_logger
    gym_logger.setLevel(logging.WARNING)
except Exception as e:
    traceback.print_exc()

import os
import os.path as osp

from rllab.envs.base import Env, Step
from rllab.core.serializable import Serializable
from rllab.spaces.box import Box
from rllab.spaces.discrete import Discrete
from rllab.spaces.product import Product
from rllab.misc import logger


def convert_gym_space(space):
    if isinstance(space, gym.spaces.Box):
        return Box(low=space.low, high=space.high)
    elif isinstance(space, gym.spaces.Discrete):
        return Discrete(n=space.n)
    elif isinstance(space, gym.spaces.Tuple):
        return Product([convert_gym_space(x) for x in space.spaces])
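# A minimal usage sketch (not in the original): convert_gym_space recursively
# maps gym spaces onto rllab's Box / Discrete / Product equivalents, so a gym
# environment's spaces can be handed to rllab components. The environment id
# below is only an example.
example_env = gym.make('CartPole-v0')
observation_space = convert_gym_space(example_env.observation_space)  # rllab Box
action_space = convert_gym_space(example_env.action_space)            # rllab Discrete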
def main():
    args = options.parse_args()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    gym_logger.setLevel(logging.CRITICAL)

    env_func = partial(get_env, args=args)
    env = get_env(args)
    reward_goal = get_goal(args)
    consecutive_goal_max = 10
    max_iteration = args.epoch

    all_rewards = []
    all_times = []
    all_totals = []

    for trial in range(args.n_trials):
        policy = policies.get_policy(args, env)

        if args.alg == 'ES':
            run_func = partial(envs.run_env_ES, policy=policy,
                               env_func=env_func)
            alg = ESModule(
                policy, run_func,
                population_size=args.population_size,  # HYPERPARAMETER
                sigma=args.sigma,  # HYPERPARAMETER
                learning_rate=args.lr,  # HYPERPARAMETER TODO: CHANGE
                threadcount=args.population_size)
        elif args.alg == 'PPO':
            run_func = partial(envs.run_env_PPO, env_func=env_func)  # TODO: update
            alg = PPOModule(
                policy, run_func,
                n_updates=args.n_updates,  # HYPERPARAMETER
                batch_size=args.batch_size,  # HYPERPARAMETER
                max_steps=args.max_steps,
                gamma=args.gamma,
                clip=args.clip,
                ent_coeff=args.ent_coeff,
                learning_rate=args.lr)  # TODO: CHANGE
        elif args.alg == 'ESPPO':
            run_func = partial(envs.run_env_PPO, env_func=env_func)
            alg = ESPPOModule(
                policy, run_func,
                population_size=args.population_size,  # HYPERPARAMETER
                sigma=args.sigma,  # HYPERPARAMETER
                n_updates=args.n_updates,  # HYPERPARAMETER
                batch_size=args.batch_size,  # HYPERPARAMETER
                max_steps=args.max_steps,
                gamma=args.gamma,
                clip=args.clip,
                ent_coeff=args.ent_coeff,
                n_seq=args.n_seq,
                ppo_learning_rate=args.ppo_lr,
                es_learning_rate=args.es_lr,
                threadcount=args.population_size)
        elif args.alg == 'MAXPPO':
            run_func = partial(envs.run_env_PPO, env_func=env_func)
            alg = MaxPPOModule(
                policy, run_func,
                population_size=args.population_size,  # HYPERPARAMETER
                sigma=args.sigma,  # HYPERPARAMETER
                n_updates=args.n_updates,  # HYPERPARAMETER
                batch_size=args.batch_size,  # HYPERPARAMETER
                max_steps=args.max_steps,
                gamma=args.gamma,
                clip=args.clip,
                ent_coeff=args.ent_coeff,
                n_seq=args.n_seq,
                ppo_learning_rate=args.ppo_lr,
                threadcount=args.population_size)
        elif args.alg == 'ALTPPO':
            run_func = partial(envs.run_env_PPO, env_func=env_func)
            alg = AltPPOModule(
                policy, run_func,
                population_size=args.population_size,  # HYPERPARAMETER
                sigma=args.sigma,  # HYPERPARAMETER
                n_updates=args.n_updates,  # HYPERPARAMETER
                batch_size=args.batch_size,  # HYPERPARAMETER
                max_steps=args.max_steps,
                gamma=args.gamma,
                clip=args.clip,
                ent_coeff=args.ent_coeff,
                n_alt=args.n_alt,
                es_learning_rate=args.es_lr,
                ppo_learning_rate=args.ppo_lr,
                threadcount=args.population_size)

        if args.render:
            with open(os.path.join(args.directory, 'weights.pkl'), 'rb') as fp:
                weights = pickle.load(fp)
            policy.load_state_dict(weights)
            if args.alg == 'ES':
                total_reward = run_func(weights, stochastic=False, render=True)
            else:
                total_reward = run_func(policy, stochastic=False, render=True,
                                        reward_only=True)
            print(f"Total rewards from episode: {total_reward}")
            return

        exp_dir = os.path.join(args.directory, alg.model_name)
        if not os.path.exists(exp_dir):
            os.makedirs(exp_dir)

        start = time.time()
        consecutive_goal_count = 0
        iteration = 0
        rewards = []
        while True:
            if iteration >= max_iteration:
                break
            weights = alg.step()
            if (iteration + 1) % 10 == 0:
                if args.alg == 'ES':
                    test_reward = run_func(weights, stochastic=False,
                                           render=False)
                else:
                    test_reward = run_func(policy, stochastic=False,
                                           render=False, reward_only=True)
                rewards.append(test_reward)
                print('iter %d. reward: %f' % (iteration + 1, test_reward))
                if consecutive_goal_max and reward_goal:
                    consecutive_goal_count = (consecutive_goal_count + 1
                                              if test_reward >= reward_goal
                                              else 0)
                    if consecutive_goal_count >= consecutive_goal_max:
                        break
            iteration += 1
        end = time.time() - start

        if args.alg == 'ES':
            total_reward = run_func(weights, stochastic=False, render=False)
        else:
            total_reward = run_func(policy, stochastic=False, render=False,
                                    reward_only=True)

        all_rewards.append(rewards)
        all_times.append(end)
        all_totals.append(total_reward)
        print(f"Reward from final weights: {total_reward}")
        print(f"Time to completion: {end}")

    # pad every trial's reward curve to the same length before averaging
    max_len = 0
    for rewards in all_rewards:
        if len(rewards) > max_len:
            max_len = len(rewards)
    for rewards in all_rewards:
        while len(rewards) < max_len:
            rewards.append(reward_goal)
        rewards = np.array(rewards)
    all_rewards = np.array(all_rewards)

    rewards_mean = np.mean(all_rewards, axis=0)
    rewards_std = np.std(all_rewards, axis=0)
    total_mean = np.mean(all_totals)
    time_mean = np.mean(all_times)

    plt.errorbar(np.arange(max_len), rewards_mean, yerr=rewards_std,
                 label='rewards')
    plt.legend(loc=4)
    plt.grid(True)
    plt.tight_layout()
    path = os.path.join(exp_dir, "rewards_plot.png")
    plt.savefig(path)
    plt.close()

    np.savetxt(os.path.join(exp_dir, 'rewards.txt'), rewards_mean)
    pickle.dump(weights, open(os.path.join(exp_dir, 'weights.pkl'), 'wb'))

    out_file = open(os.path.join(exp_dir, "results.txt"), 'w')
    print(f"Average rewards from final weights: {total_mean}")
    msg = f"Average rewards from final weights: {total_mean}"
    msg += "\n"
    print(f"Average time to completion: {time_mean}")
    msg += f"Average time to completion: {time_mean}"
    msg += "\n"
    print(f"Results saved at: {exp_dir}")
    out_file.write(msg)
    out_file.flush()
def main():
    gym_logger.setLevel(logging.CRITICAL)

    fmtr = argparse.ArgumentDefaultsHelpFormatter
    parser = argparse.ArgumentParser(formatter_class=fmtr)
    parser.add_argument('--env', default='Pendulum-v0',
                        help='Environment id')
    parser.add_argument('--pop', type=int, default=64,
                        help='Population size')
    parser.add_argument('--iter', type=int, default=400,
                        help='Iterations')
    parser.add_argument('--seed', type=int, required=False,
                        help='Seed for random number generator')
    parser.add_argument('--checkpoint', action='store_true',
                        help='Save at each new best')
    parser.add_argument('--sigma', type=float, default=0.1,
                        help='Sigma')
    parser.add_argument('--lr', type=float, default=0.001,
                        help='Learning rate')
    parser.add_argument('--target', type=float, default=None,
                        help='Reward target')
    parser.add_argument('--csg', type=int, default=10,
                        help='Consecutive goal stopping')
    args = parser.parse_args()

    np.random.seed(args.seed)
    if args.seed is not None:
        torch.manual_seed(args.seed)

    net = Actor(3, 1, 2, 64)
    target = -120

    def get_reward(weights, net):
        cloned_net = copy.deepcopy(net)
        _copy_weights_to_net(weights, cloned_net)
        return eval_net(cloned_net, args.env)

    partial_func = partial(get_reward, net=net)
    mother_parameters = list(net.parameters())

    es = EvolutionModule(mother_parameters,
                         partial_func,
                         population_size=args.pop,
                         sigma=args.sigma,
                         learning_rate=args.lr,
                         reward_goal=target,
                         consecutive_goal_stopping=args.csg,
                         save_name=(args.env if args.checkpoint else None))

    os.makedirs('models', exist_ok=True)

    final_weights, total_steps = es.run(args.iter, target)

    print('Total evaluations = %d' % total_steps)

    # Save final weights in a new network, along with environment name
    reward = partial_func(final_weights)[0]
    _copy_weights_to_net(final_weights, net)
    _save_net(net, args.env, reward)
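# `_copy_weights_to_net` is referenced above but not shown in this excerpt.
# A plausible sketch (an assumption, not the project's actual helper) copies
# each candidate weight tensor onto the matching network parameter in place:
import torch

def _copy_weights_to_net(weights, net):
    """Copy a list of tensors (one per parameter) into `net`, in place."""
    with torch.no_grad():
        for param, candidate in zip(net.parameters(), weights):
            param.copy_(candidate)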
def main():
    gym_logger.setLevel(logging.CRITICAL)

    fmtr = argparse.ArgumentDefaultsHelpFormatter
    parser = argparse.ArgumentParser(formatter_class=fmtr)
    parser.add_argument('--env', default='CartPole-v0',
                        help='Environment id')
    parser.add_argument('--cuda', action='store_true',
                        help='Whether or not to use CUDA')
    parser.add_argument('--pop', type=int, default=64,
                        help='Population size')
    parser.add_argument('--iter', type=int, default=400,
                        help='Iterations')
    parser.add_argument('--seed', type=int, required=False,
                        help='Seed for random number generator')
    parser.add_argument('--checkpoint', action='store_true',
                        help='Save at each new best')
    parser.add_argument('--sigma', type=float, default=0.1,
                        help='Sigma')
    parser.add_argument('--lr', type=float, default=0.001,
                        help='Learning rate')
    parser.add_argument('--target', type=float, default=None,
                        help='Reward target')
    parser.add_argument('--csg', type=int, default=10,
                        help='Consecutive goal stopping')
    args = parser.parse_args()

    cuda = False
    if args.cuda:
        if torch.cuda.is_available():
            cuda = True
        else:
            print('******* Sorry, CUDA not available *******')

    np.random.seed(args.seed)
    if args.seed is not None:
        torch.manual_seed(args.seed)

    # Run code in script named by environment
    code = open('./config/%s.py' % args.env).read()
    ldict = {}
    exec(code, globals(), ldict)
    net = ldict['net']

    # If target was specified on command line, use it; otherwise, check for
    # target in config file; if found, use it.
    target = (args.target if args.target is not None
              else ldict['target'] if 'target' in ldict
              else None)

    # Convert net to CUDA format if specified and available
    if cuda:
        net = net.cuda()

    def get_reward(weights, net):
        cloned_net = copy.deepcopy(net)
        _copy_weights_to_net(weights, cloned_net)
        return eval_net(cloned_net, args.env, seed=args.seed)

    partial_func = partial(get_reward, net=net)
    mother_parameters = list(net.parameters())

    es = EvolutionModule(mother_parameters,
                         partial_func,
                         population_size=args.pop,
                         sigma=args.sigma,
                         learning_rate=args.lr,
                         cuda=cuda,
                         reward_goal=target,
                         consecutive_goal_stopping=args.csg,
                         save_name=(args.env if args.checkpoint else None))

    os.makedirs('models', exist_ok=True)

    final_weights, total_steps = es.run(args.iter, target)

    print('Total evaluations = %d' % total_steps)

    # Save final weights in a new network, along with environment name
    reward = partial_func(final_weights)[0]
    _copy_weights_to_net(final_weights, net)
    _save_net(net, args.env, reward)
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=None)
    parser.add_argument('env_id', nargs='?', default='CartPole-v0',
                        help='Select the environment to run')
    args = parser.parse_args()

    win_trials = 100
    win_reward = {'CartPole-v0': 195.0}
    scores = deque(maxlen=win_trials)

    # You can set the level to logging.DEBUG or logging.WARN if you
    # want to change the amount of output.
    logger.setLevel(logging.ERROR)

    env = gym.make(args.env_id)
    outdir = "/tmp/dqn-%s" % args.env_id
    env = wrappers.Monitor(env, directory=outdir, force=True)
    env.seed(0)

    agent = DQNAgent(env.observation_space, env.action_space)

    episode_count = 3000
    state_size = env.observation_space.shape[0]
    batch_size = 4

    for i in range(episode_count):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
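        # A hedged sketch (not from the excerpt) of how the per-episode loop
        # typically continues for a DQN agent; the DQNAgent method names
        # `act`, `remember`, and `replay`, and the `memory` attribute, are
        # assumptions about the agent's interface.
        done = False
        total_reward = 0
        while not done:
            action = agent.act(state)                       # epsilon-greedy action
            next_state, reward, done, _ = env.step(action)  # advance the environment
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward
        # train on a sampled mini-batch of stored transitions
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)
        scores.append(total_reward)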
import gym.envs
import gym.spaces
import traceback
import logging

from gym.spaces import Box
from gym.spaces import Discrete
from gym.spaces import Tuple as Product
import gym.logger as monitor_logger

import os

from .env_base import Env, Step
from util.serializable import Serializable
import util.logger as logger

monitor_logger.setLevel(logging.WARNING)


def convert_gym_space(space):
    if isinstance(space, gym.spaces.Box):
        return Box(low=space.low, high=space.high, dtype='float32')  # float32 added
    elif isinstance(space, gym.spaces.Discrete):
        return Discrete(n=space.n)
    elif isinstance(space, gym.spaces.Tuple):
        return Product([convert_gym_space(x) for x in space.spaces])
    else:
        raise NotImplementedError


class CappedCubicVideoSchedule(object):
def main():
    parser = argparse.ArgumentParser(description=None)
    parser.add_argument('-b', '--base-dir', default='blackjack-1',
                        help='Set base dir.')
    parser.add_argument('-v', '--verbose', action='count', dest='verbosity',
                        default=0, help='Set verbosity.')
    args = parser.parse_args()

    if args.verbosity == 0:
        logger.setLevel(logging.INFO)
    elif args.verbosity >= 1:
        logger.setLevel(logging.DEBUG)

    num_episodes = 100000
    epsilon = 1
    policy, Q, Q_side, rewards, chips, side_bets, N = learn(
        args.base_dir, num_episodes, epsilon)
    rewards_, chips, num_episodes_score = score(policy, Q_side, num_episodes)

    win = 0
    lose = 0
    draw = 0
    for reward in rewards_:
        if reward >= 1:
            win += 1
        elif reward == 0:
            draw += 1
        elif reward <= 0:
            lose += 1
    print(win, num_episodes_score)
    win_rate = float(win) / num_episodes_score
    print(win, draw, lose, win_rate)
    print("win rate:", win_rate * 100,
          "draw rate:", float(draw) / num_episodes_score * 100,
          "lose rate:", float(lose) / num_episodes_score * 100)

    path = "C:/Users/drp3p/Desktop/honours_3/"
    dir_out = "{}_results".format(
        str(datetime.datetime.now()).replace(' ', '_').replace('.', ':').replace(
            ':', '-'))
    directory = './html/'
    filename = "file.html"
    file_path = os.path.join(directory, filename)
    os.mkdir(dir_out, 0o755)

    write = open("policy.txt", 'w')
    write.write(str(policy))
    write.close()

    write = open("val_func.txt", 'w')
    write.write(str(Q))
    write.close()

    write = open("N.txt", "w")
    write.write(str(N))
    write.close()

    # plt.yscale('log')
    plt.plot(rewards_)
    print("chips: ", chips)
    # plt.plot(rewards_)
    # plt.plot(Q_side)
    # plt.show()
    # logger.info("final average returns: {}".format(rewards_))
    plot_policy(policy, "diag_{}_{}_{}.png".format(num_episodes, epsilon, 0))

    write = open("side_bets.txt", 'w')
    write.write(str(side_bets))
    write.close()

    write = open("policy.txt", "w")
    return 0
""" Rllab implementation with a HACK. See comment in `GymEnv.__init__`. """ import gym import gym.wrappers import gym.envs import gym.spaces import traceback import logging try: from gym import logger as monitor_logger monitor_logger.setLevel(logging.WARNING) except Exception as e: traceback.print_exc() import os from gym import Env from garage.envs.base import Step from garage.core.serializable import Serializable from garage.spaces.box import Box from garage.spaces.discrete import Discrete from garage.spaces.tuple import Tuple from garage.misc import logger def convert_gym_space(space): if isinstance(space, gym.spaces.Box): return Box(low=space.low, high=space.high) elif isinstance(space, gym.spaces.Discrete): return Discrete(n=space.n) elif isinstance(space, gym.spaces.Tuple): return Tuple([convert_gym_space(x) for x in space.spaces])