Example 1
def main():
    parser = argparse.ArgumentParser(description=None)
    parser.add_argument('-b',
                        '--base-dir',
                        default='blackjack-1',
                        help='Set base dir.')
    parser.add_argument('-v',
                        '--verbose',
                        action='count',
                        dest='verbosity',
                        default=0,
                        help='Set verbosity.')
    args = parser.parse_args()

    if args.verbosity == 0:
        logger.setLevel(logging.INFO)
    elif args.verbosity >= 1:
        logger.setLevel(logging.DEBUG)

    num_episodes = 100000
    epsilon_decay = 8000

    policy, Q = learn(args.base_dir, num_episodes, epsilon_decay)

    final_average_return = score(policy)
    logger.info("final average returns: {}".format(final_average_return))

    plot_policy(
        policy, "diag_{}_{}_{}.png".format(num_episodes, epsilon_decay,
                                           final_average_return))

    return 0
Example 2
    def __init__(self, **options):
        self.options = self.defaults.copy()
        # self.options.update(options)

        self.world = World()

        self.players = {}
        self.live_players = 0

        self.turn = 0
        self.turn_order = []

        self.remaining = {}
        self.drafting = False
        self.finished = False
        # gym changed its logger API across versions: 0.7.3/0.7.4 expose a
        # stdlib Logger (setLevel); later releases expose set_level().
        from gym import __version__ as gym_v
        if gym_v in ("0.7.4", "0.7.3"):
            logger.setLevel(40)  # 40 == logging.ERROR
        else:
            logger.set_level(40)
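
Because gym's logger API changed across releases (a stdlib-style Logger with setLevel in the 0.7.x era, a module-level set_level afterwards), the version-string comparison above is brittle. A minimal sketch of a version-agnostic alternative, assuming only the attributes probed below:

import logging

def silence_gym_logger(level=logging.ERROR):
    # Best effort: try the newer module-level API first, then the older one.
    from gym import logger as gym_logger
    if hasattr(gym_logger, "set_level"):
        gym_logger.set_level(level)      # newer gym releases
    elif hasattr(gym_logger, "setLevel"):
        gym_logger.setLevel(level)       # 0.7.x-era stdlib Logger

With a helper like this, the version branch in __init__ above reduces to a single silence_gym_logger(logging.ERROR) call.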
Example 3
    # the number of trials without falling over
    win_trials = 10

    # the CartPole-v0 is considered solved if
    # for 100 consecutive trials, the cart pole has not
    # fallen over and it has achieved an average
    # reward of 195.0
    # a reward of +1 is provided for every timestep
    # the pole remains upright
    win_reward = {'CartPole-v0': 195.0}

    # stores the reward per episode
    scores = deque(maxlen=win_trials)

    logger.setLevel(logging.ERROR)
    env = gym.make(args.env_id)

    outdir = "/tmp/dqn-%s" % args.env_id
    if args.ddqn:
        outdir = "/tmp/ddqn-%s" % args.env_id

    if args.no_render:
        env = wrappers.Monitor(env,
                               directory=outdir,
                               video_callable=False,
                               force=True)
    else:
        env = wrappers.Monitor(env, directory=outdir, force=True)
    env.seed(0)
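
The comment block above states the solve criterion: a mean reward of 195.0 across consecutive trials, with +1 granted per upright timestep. A minimal sketch of how the scores deque and win_reward dict would typically feed that check inside the episode loop; episode and episode_reward are assumed loop variables, not part of the original:

    scores.append(episode_reward)
    mean_score = sum(scores) / len(scores)
    if len(scores) == win_trials and mean_score >= win_reward['CartPole-v0']:
        print("Solved in episode %d: mean reward %.1f over %d trials"
              % (episode, mean_score, win_trials))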
Example 4
    ############################################
    parser.add_argument('env_id',
                        nargs='?',
                        default='MountainCar-v0',
                        help='Select the environment to run')
    args = parser.parse_args()

    logger = logging.getLogger()
    formatter = logging.Formatter('[%(asctime)s] %(message)s')
    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    # You can set the level to logging.DEBUG or logging.WARN if you
    # want to change the amount of output.
    logger.setLevel(logging.INFO)

    env = gym.make(args.env_id)
    outdir = '/tmp/qagent-results'
    env = wrappers.Monitor(env, outdir, write_upon_reset=True, force=True)

    env.seed(0)

    ############################################
    # CS482: This initial Q-table size should
    # change to fit the number of actions (columns)
    # and the number of observations (rows)
    ############################################

    Q = np.zeros([300, env.action_space.n])
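
The CS482 comment says the Q-table should be sized by the number of observations (rows) and actions (columns). A hedged sketch of deriving that shape from the spaces; since MountainCar-v0's observation space is a continuous Box, the n_bins discretization below is an assumption, not part of the original:

    if isinstance(env.observation_space, gym.spaces.Discrete):
        # Discrete observations: one row per state.
        Q = np.zeros([env.observation_space.n, env.action_space.n])
    else:
        # Continuous observations must be bucketed first; one row per
        # combination of buckets (n_bins is a hypothetical choice).
        n_bins = 40
        n_states = n_bins ** env.observation_space.shape[0]
        Q = np.zeros([n_states, env.action_space.n])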
Example 5
import argparse
import logging
import sys
import time

# from evostra import EvolutionStrategy
from pytorch_es import EvolutionModule
from pytorch_es.utils.helpers import weights_init
import gym
from gym import logger as gym_logger
import numpy as np
from PIL import Image
import torch
from torch.autograd import Variable
import torch.nn as nn
import torchvision
from torchvision import transforms
gym_logger.setLevel(logging.CRITICAL)

parser = argparse.ArgumentParser()
parser.add_argument('-w',
                    '--weights_path',
                    type=str,
                    required=True,
                    help='Path to save final weights')
parser.add_argument('-c',
                    '--cuda',
                    action='store_true',
                    help='Whether or not to use CUDA')
parser.set_defaults(cuda=False)

args = parser.parse_args()
                        "--slippery",
                        help=help_,
                        action='store_true')
    help_ = "Exploration only. For baseline."
    parser.add_argument("-e",
                        "--explore",
                        help=help_,
                        action='store_true')
    help_ = "Sec of time delay in UI. Useful for viz in demo mode."
    parser.add_argument("-t",
                        "--delay",
                        help=help_,
                        type=int)
    args = parser.parse_args()

    logger.setLevel(logging.INFO)

    # instantiate a gym environment (FrozenLake-v0)
    env = gym.make(args.env_id)

    # debug dir
    outdir = "/tmp/q-learning-%s" % args.env_id
    env = wrappers.Monitor(env, directory=outdir, force=True)
    env.seed(0)
    if not args.slippery:
        env.is_slippery = False

    delay = args.delay if args.delay is not None else 0
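
Per its help text, --delay slows the UI for demo visibility. A minimal sketch of where the computed delay would typically be consumed; the loop structure and the random placeholder policy are assumptions, and time is assumed imported:

    state = env.reset()
    done = False
    while not done:
        env.render()
        time.sleep(delay)  # pause so the rendered frames are watchable
        action = env.action_space.sample()  # placeholder policy
        state, reward, done, info = env.step(action)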
Example 7
import gym
import gym.wrappers
import gym.envs
import gym.spaces
import traceback
import logging

try:
    from gym import logger as gym_logger

    gym_logger.setLevel(logging.WARNING)
except Exception as e:
    traceback.print_exc()

import os
import os.path as osp
from rllab.envs.base import Env, Step
from rllab.core.serializable import Serializable
from rllab.spaces.box import Box
from rllab.spaces.discrete import Discrete
from rllab.spaces.product import Product
from rllab.misc import logger


def convert_gym_space(space):
    if isinstance(space, gym.spaces.Box):
        return Box(low=space.low, high=space.high)
    elif isinstance(space, gym.spaces.Discrete):
        return Discrete(n=space.n)
    elif isinstance(space, gym.spaces.Tuple):
        return Product([convert_gym_space(x) for x in space.spaces])
    else:
        raise NotImplementedError
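
A short usage sketch for convert_gym_space, assuming a standard gym environment; the environment id and variable names are illustrative:

env = gym.make('CartPole-v0')
obs_space = convert_gym_space(env.observation_space)  # Box -> rllab Box
act_space = convert_gym_space(env.action_space)       # Discrete -> rllab Discrete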
Example 8
def main():
    args = options.parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    gym_logger.setLevel(logging.CRITICAL)
    env_func = partial(get_env, args=args)
    env = get_env(args)
    reward_goal = get_goal(args)
    consecutive_goal_max = 10
    max_iteration = args.epoch
    all_rewards = []
    all_times = []
    all_totals = []
    for trial in range(args.n_trials):
        policy = policies.get_policy(args, env)
        if args.alg == 'ES':
            run_func = partial(envs.run_env_ES,
                               policy=policy,
                               env_func=env_func)
            alg = ESModule(
                policy,
                run_func,
                population_size=args.population_size,  # HYPERPARAMETER
                sigma=args.sigma,  # HYPERPARAMETER
                learning_rate=args.lr,  # HYPERPARAMETER TODO:CHANGE
                threadcount=args.population_size)

        elif args.alg == 'PPO':
            run_func = partial(envs.run_env_PPO,
                               env_func=env_func)  # TODO: update
            alg = PPOModule(
                policy,
                run_func,
                n_updates=args.n_updates,  # HYPERPARAMETER
                batch_size=args.batch_size,  # HYPERPARAMETER
                max_steps=args.max_steps,
                gamma=args.gamma,
                clip=args.clip,
                ent_coeff=args.ent_coeff,
                learning_rate=args.lr)  # TODO: CHANGE

        elif args.alg == 'ESPPO':
            run_func = partial(envs.run_env_PPO, env_func=env_func)

            alg = ESPPOModule(
                policy,
                run_func,
                population_size=args.population_size,  # HYPERPARAMETER
                sigma=args.sigma,  # HYPERPARAMETER
                n_updates=args.n_updates,  # HYPERPARAMETER
                batch_size=args.batch_size,  # HYPERPARAMETER
                max_steps=args.max_steps,
                gamma=args.gamma,
                clip=args.clip,
                ent_coeff=args.ent_coeff,
                n_seq=args.n_seq,
                ppo_learning_rate=args.ppo_lr,
                es_learning_rate=args.es_lr,
                threadcount=args.population_size)

        elif args.alg == 'MAXPPO':
            run_func = partial(envs.run_env_PPO, env_func=env_func)

            alg = MaxPPOModule(
                policy,
                run_func,
                population_size=args.population_size,  # HYPERPARAMETER
                sigma=args.sigma,  # HYPERPARAMETER
                n_updates=args.n_updates,  # HYPERPARAMETER
                batch_size=args.batch_size,  # HYPERPARAMETER
                max_steps=args.max_steps,
                gamma=args.gamma,
                clip=args.clip,
                ent_coeff=args.ent_coeff,
                n_seq=args.n_seq,
                ppo_learning_rate=args.ppo_lr,
                threadcount=args.population_size)

        elif args.alg == 'ALTPPO':
            run_func = partial(envs.run_env_PPO, env_func=env_func)

            alg = AltPPOModule(
                policy,
                run_func,
                population_size=args.population_size,  # HYPERPARAMETER
                sigma=args.sigma,  # HYPERPARAMETER
                n_updates=args.n_updates,  # HYPERPARAMETER
                batch_size=args.batch_size,  # HYPERPARAMETER
                max_steps=args.max_steps,
                gamma=args.gamma,
                clip=args.clip,
                ent_coeff=args.ent_coeff,
                n_alt=args.n_alt,
                es_learning_rate=args.es_lr,
                ppo_learning_rate=args.ppo_lr,
                threadcount=args.population_size)

        if args.render:
            with open(os.path.join(args.directory, 'weights.pkl'), 'rb') as fp:
                weights = pickle.load(fp)
                policy.load_state_dict(weights)

            if args.alg == 'ES':
                total_reward = run_func(weights, stochastic=False, render=True)
            else:
                total_reward = run_func(policy,
                                        stochastic=False,
                                        render=True,
                                        reward_only=True)
            print(f"Total rewards from episode: {total_rewards}")
            return

        exp_dir = os.path.join(args.directory, alg.model_name)
        if not os.path.exists(exp_dir):
            os.makedirs(exp_dir)

        start = time.time()
        consecutive_goal_count = 0
        iteration = 0
        rewards = []
        while True:
            if iteration >= max_iteration:
                break
            weights = alg.step()
            if (iteration + 1) % 10 == 0:
                if args.alg == 'ES':
                    test_reward = run_func(weights,
                                           stochastic=False,
                                           render=False)
                else:
                    test_reward = run_func(policy,
                                           stochastic=False,
                                           render=False,
                                           reward_only=True)
                rewards.append(test_reward)
                print('iter %d. reward: %f' % (iteration + 1, test_reward))

                if consecutive_goal_max and reward_goal:
                    consecutive_goal_count = consecutive_goal_count + 1 if test_reward >= reward_goal else 0
                    if consecutive_goal_count >= consecutive_goal_max:
                        break
            iteration += 1
        end = time.time() - start
        if args.alg == 'ES':
            total_reward = run_func(weights, stochastic=False, render=False)
        else:
            total_reward = run_func(policy,
                                    stochastic=False,
                                    render=False,
                                    reward_only=True)
        all_rewards.append(rewards)
        all_times.append(end)
        all_totals.append(total_reward)
        print(f"Reward from final weights: {total_reward}")
        print(f"Time to completion: {end}")
    # Pad shorter reward curves with the goal value so the array is rectangular.
    max_len = max((len(rewards) for rewards in all_rewards), default=0)
    for rewards in all_rewards:
        while len(rewards) < max_len:
            rewards.append(reward_goal)
    all_rewards = np.array(all_rewards)
    rewards_mean = np.mean(all_rewards, axis=0)
    rewards_std = np.std(all_rewards, axis=0)
    total_mean = np.mean(all_totals)
    time_mean = np.mean(all_times)
    plt.errorbar(np.arange(max_len),
                 rewards_mean,
                 yerr=rewards_std,
                 label='rewards')
    plt.legend(loc=4)
    plt.grid(True)
    plt.tight_layout()
    path = os.path.join(exp_dir, "rewards_plot.png")
    plt.savefig(path)
    plt.close()
    np.savetxt(os.path.join(exp_dir, 'rewards.txt'), rewards_mean)
    with open(os.path.join(exp_dir, 'weights.pkl'), 'wb') as fp:
        pickle.dump(weights, fp)
    msg = f"Average rewards from final weights: {total_mean}\n"
    msg += f"Average time to completion: {time_mean}\n"
    print(f"Average rewards from final weights: {total_mean}")
    print(f"Average time to completion: {time_mean}")
    print(f"Results saved at: {exp_dir}")
    with open(os.path.join(exp_dir, "results.txt"), 'w') as out_file:
        out_file.write(msg)
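
The stopping condition inside the training loop counts consecutive evaluations at or above the goal and resets the streak on any miss. The same logic isolated as a small helper, purely as a sketch:

def goal_streak(test_reward, reward_goal, streak, patience=10):
    # Returns (stop, new_streak): stop after `patience` consecutive
    # evaluations that reach the goal; any miss resets the streak.
    streak = streak + 1 if test_reward >= reward_goal else 0
    return streak >= patience, streak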
Example 9
def main():

    gym_logger.setLevel(logging.CRITICAL)

    fmtr = argparse.ArgumentDefaultsHelpFormatter
    parser = argparse.ArgumentParser(formatter_class=fmtr)
    parser.add_argument('--env', default='Pendulum-v0', help='Environment id')
    parser.add_argument('--pop', type=int, default=64, help='Population size')
    parser.add_argument('--iter', type=int, default=400, help='Iterations')
    parser.add_argument('--seed',
                        type=int,
                        required=False,
                        help='Seed for random number generator')
    parser.add_argument('--checkpoint',
                        action='store_true',
                        help='Save at each new best')
    parser.add_argument('--sigma', type=float, default=0.1, help='Sigma')
    parser.add_argument('--lr',
                        type=float,
                        default=0.001,
                        help='Learning rate')
    parser.add_argument('--target',
                        type=float,
                        default=None,
                        help='Reward target')
    parser.add_argument('--csg',
                        type=int,
                        default=10,
                        help='Consecutive goal stopping')
    args = parser.parse_args()

    np.random.seed(args.seed)
    if args.seed is not None:
        torch.manual_seed(args.seed)

    net = Actor(3, 1, 2, 64)

    # Honor --target when given; otherwise fall back to the hard-coded -120.
    target = args.target if args.target is not None else -120

    def get_reward(weights, net):
        cloned_net = copy.deepcopy(net)
        _copy_weights_to_net(weights, cloned_net)
        return eval_net(cloned_net, args.env)

    partial_func = partial(get_reward, net=net)
    mother_parameters = list(net.parameters())

    es = EvolutionModule(mother_parameters,
                         partial_func,
                         population_size=args.pop,
                         sigma=args.sigma,
                         learning_rate=args.lr,
                         reward_goal=target,
                         consecutive_goal_stopping=args.csg,
                         save_name=(args.env if args.checkpoint else None))

    os.makedirs('models', exist_ok=True)

    final_weights, total_steps = es.run(args.iter, target)

    print('Total evaluations = %d' % total_steps)

    # Save final weights in a new network, along with environment name
    reward = partial_func(final_weights)[0]
    _copy_weights_to_net(final_weights, net)
    _save_net(net, args.env, reward)
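
_copy_weights_to_net is referenced but not shown. A plausible minimal version, assuming weights is an iterable of tensors ordered like net.parameters(); this is a sketch, not the author's implementation:

def _copy_weights_to_net(weights, net):
    # Copy each evolved tensor into the matching network parameter in place.
    for param, weight in zip(net.parameters(), weights):
        param.data.copy_(weight.data)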
Example 10
def main():

    gym_logger.setLevel(logging.CRITICAL)

    fmtr = argparse.ArgumentDefaultsHelpFormatter
    parser = argparse.ArgumentParser(formatter_class=fmtr)
    parser.add_argument('--env', default='CartPole-v0', help='Environment id')
    parser.add_argument('--cuda',
                        action='store_true',
                        help='Whether or not to use CUDA')
    parser.add_argument('--pop', type=int, default=64, help='Population size')
    parser.add_argument('--iter', type=int, default=400, help='Iterations')
    parser.add_argument('--seed',
                        type=int,
                        required=False,
                        help='Seed for random number generator')
    parser.add_argument('--checkpoint',
                        action='store_true',
                        help='Save at each new best')
    parser.add_argument('--sigma', type=float, default=0.1, help='Sigma')
    parser.add_argument('--lr',
                        type=float,
                        default=0.001,
                        help='Learning rate')
    parser.add_argument('--target',
                        type=float,
                        default=None,
                        help='Reward target')
    parser.add_argument('--csg',
                        type=int,
                        default=10,
                        help='Consecutive goal stopping')
    args = parser.parse_args()

    cuda = False

    if args.cuda:
        if torch.cuda.is_available():
            cuda = True
        else:
            print('******* Sorry, CUDA not available *******')

    np.random.seed(args.seed)
    if args.seed is not None:
        torch.manual_seed(args.seed)

    # Run code in script named by environment
    with open('./config/%s.py' % args.env) as f:
        code = f.read()
    ldict = {}
    exec(code, globals(), ldict)
    net = ldict['net']

    # If target was specified on command line, use it; otherwise, check for
    # target in config file; if found, use it.
    target = (args.target if args.target is not None else
              ldict['target'] if 'target' in ldict else None)

    # Convert net to CUDA format if specified and available
    if cuda:
        net = net.cuda()

    def get_reward(weights, net):
        cloned_net = copy.deepcopy(net)
        _copy_weights_to_net(weights, cloned_net)
        return eval_net(cloned_net, args.env, seed=args.seed)

    partial_func = partial(get_reward, net=net)
    mother_parameters = list(net.parameters())

    es = EvolutionModule(mother_parameters,
                         partial_func,
                         population_size=args.pop,
                         sigma=args.sigma,
                         learning_rate=args.lr,
                         cuda=cuda,
                         reward_goal=target,
                         consecutive_goal_stopping=args.csg,
                         save_name=(args.env if args.checkpoint else None))

    os.makedirs('models', exist_ok=True)

    final_weights, total_steps = es.run(args.iter, target)

    print('Total evaluations = %d' % total_steps)

    # Save final weights in a new network, along with environment name
    reward = partial_func(final_weights)[0]
    _copy_weights_to_net(final_weights, net)
    _save_net(net, args.env, reward)
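
Example 10 builds its network by exec-ing ./config/<env>.py and reading net (and, optionally, target) out of the resulting namespace. A minimal, hypothetical config/CartPole-v0.py consistent with that contract:

# config/CartPole-v0.py -- must define `net`; `target` is optional.
import torch.nn as nn

net = nn.Sequential(
    nn.Linear(4, 64),    # CartPole-v0 observations have 4 dimensions
    nn.ReLU(),
    nn.Linear(64, 2),    # two discrete actions
    nn.Softmax(dim=-1))

target = 195.0           # optional solve threshold picked up by the runner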
Example 11

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=None)
    parser.add_argument('env_id',
                        nargs='?',
                        default='CartPole-v0',
                        help='Select the environment to run')
    args = parser.parse_args()

    win_trials = 100
    win_reward = {'CartPole-v0': 195.0}
    scores = deque(maxlen=win_trials)

    # You can set the level to logging.DEBUG or logging.WARN if you
    # want to change the amount of output.
    logger.setLevel(logging.ERROR)

    env = gym.make(args.env_id)

    outdir = "/tmp/dqn-%s" % args.env_id
    env = wrappers.Monitor(env, directory=outdir, force=True)
    env.seed(0)
    agent = DQNAgent(env.observation_space, env.action_space)

    episode_count = 3000
    state_size = env.observation_space.shape[0]
    batch_size = 4

    for i in range(episode_count):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
Example 12
import gym.envs
import gym.spaces
import traceback
import logging
from gym.spaces import Box
from gym.spaces import Discrete
from gym.spaces import Tuple as Product
import gym.logger as monitor_logger
import os

from .env_base import Env, Step
from util.serializable import Serializable

import util.logger as logger

monitor_logger.setLevel(logging.WARNING)


def convert_gym_space(space):
    if isinstance(space, gym.spaces.Box):
        return Box(low=space.low, high=space.high,
                   dtype='float32')  # float32 added
    elif isinstance(space, gym.spaces.Discrete):
        return Discrete(n=space.n)
    elif isinstance(space, gym.spaces.Tuple):
        return Product([convert_gym_space(x) for x in space.spaces])
    else:
        raise NotImplementedError


class CappedCubicVideoSchedule(object):
Example 13
def main():
    parser = argparse.ArgumentParser(description=None)
    parser.add_argument('-b',
                        '--base-dir',
                        default='blackjack-1',
                        help='Set base dir.')
    parser.add_argument('-v',
                        '--verbose',
                        action='count',
                        dest='verbosity',
                        default=0,
                        help='Set verbosity.')
    args = parser.parse_args()

    if args.verbosity == 0:
        logger.setLevel(logging.INFO)
    elif args.verbosity >= 1:
        logger.setLevel(logging.DEBUG)

    num_episodes = 100000
    epsilon = 1

    policy, Q, Q_side, rewards, chips, side_bets, N = learn(
        args.base_dir, num_episodes, epsilon)

    rewards_, chips, num_episodes_score = score(policy, Q_side, num_episodes)

    win = 0
    lose = 0
    draw = 0
    for reward in rewards_:
        if reward >= 1:
            win += 1
        elif reward == 0:
            draw += 1
        elif reward <= 0:
            lose += 1
    print(win, num_episodes_score)
    win_rate = float(win) / num_episodes_score
    print(win, draw, lose, win_rate)
    print("win rate:", win_rate * 100, "draw rate:",
          float(draw) / num_episodes_score * 100, "lose rate:",
          float(lose) / num_episodes_score * 100)
    path = "C:/Users/drp3p/Desktop/honours_3/"
    dir_out = "{}_results".format(
        str(datetime.datetime.now()).replace(' ', '_').replace('.',
                                                               ':').replace(
                                                                   ':', '-'))

    directory = './html/'
    filename = "file.html"
    file_path = os.path.join(directory, filename)

    os.mkdir(dir_out, 0o755)  # mode must be octal
    write = open("policy.txt", 'w')
    write.write(str(policy))
    write.close()
    write = open("val_func.txt", 'w')
    write.write(str(Q))
    write.close()
    write = open("N.txt", "w")
    write.write(str(N))
    write.close()

    #plt.yscale('log')
    plt.plot(rewards_)
    print("chips: ", chips)
    #plt.plot(rewards_)

    #plt.plot(Q_side)

    #plt.show()
    #logger.info("final average returns: {}".format(rewards_))

    plot_policy(policy, "diag_{}_{}_{}.png".format(num_episodes, epsilon, 0))

    write = open("side_bets.txt", 'w')
    write.write(str(side_bets))
    write.close()

    write = open("policy.txt", "w")

    return 0
Example 14
""" Rllab implementation with a HACK. See comment in `GymEnv.__init__`. """
import gym
import gym.wrappers
import gym.envs
import gym.spaces
import traceback
import logging

try:
    from gym import logger as monitor_logger
    monitor_logger.setLevel(logging.WARNING)
except Exception as e:
    traceback.print_exc()

import os
from gym import Env
from garage.envs.base import Step
from garage.core.serializable import Serializable
from garage.spaces.box import Box
from garage.spaces.discrete import Discrete
from garage.spaces.tuple import Tuple
from garage.misc import logger


def convert_gym_space(space):
    if isinstance(space, gym.spaces.Box):
        return Box(low=space.low, high=space.high)
    elif isinstance(space, gym.spaces.Discrete):
        return Discrete(n=space.n)
    elif isinstance(space, gym.spaces.Tuple):
        return Tuple([convert_gym_space(x) for x in space.spaces])
    else:
        raise NotImplementedError