Example #1
def create_agent(agent_type, *args, **kwargs):
    agent_type = agent_type.lower()

    if agent_type == 'a2c':
        agent = A2C(*args, **kwargs)
    else:
        raise ValueError("Unknown agent type: %s" % agent_type)

    return agent
Example #2
def dyian_test(path):

    path_plot = path + "plot_pt\\"
    if not os.path.exists(path_plot):
        os.makedirs(path_plot)

    path_save = path + "save_pt\\"
    if not os.path.exists(path_save):
        os.makedirs(path_save)

    diayn_mod = build_diayn(2)
    diayn_mod.load(path + "save_diayn\\")
    mountaincar(diayn_mod, path_plot + "pretrained_trajectoires_0")

    pretrained = A2C.from_diayn(diayn_mod, 0)

    model = pretrained
    for k in range(0, 1):
        iter_ = 200
        model.train(iter_)
        model.plot_rewards(path_plot + "pretrained_rewards_" +
                           str((k + 1) * iter_))
        mountaincar_baseline(
            model, path_plot + "pretrained_trajectoires_" + str(
                (k + 1) * iter_))
        model.save(path_save)
Example #3
    def run(self):
        self.agent = A2C(self.id)

        def treatQueue():
            msg = self.conn.recv()
            if msg == "load":
                self.agent.load_model()
                print("Process " + str(self.id) +
                      " loaded the master (0) model.")

            elif msg[0] == "train_with_batchs":
                print("Master process is training ...")
                t0 = time.time()
                self.agent.train_with_batchs(msg[1])
                self.agent.save_model()
                print("Master process finished training. Time : " +
                      str(time.time() - t0) + " \n")
                self.conn.send("saved")

        while True:
            if (self.id != 0):
                batch_values = []
                batch_states = []
                batch_actions = []
                print("Process " + str(self.id) + " starts playing " +
                      str(self.n_games) + " games.")
                scores = []
                env = SnakeEnv()
                overall_data = 0
                for i in range(self.n_games):
                    state = env.init()
                    t = 0
                    lastScoring = -1
                    while True:
                        action = self.agent([state])
                        newState, reward, done = env.step(action)
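                        # snake scored: assign value 1 to every step since the last scoring step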
                        if (reward == 1):
                            for j in range(t - lastScoring):
                                batch_values.append(1)
                            lastScoring = t

                        batch_states.append([state])
                        batch_actions.append(action)
                        t += 1
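                        # episode over, or 100 steps without scoring: remaining unlabeled steps get value 0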
                        if (done or (t - lastScoring >= 100)):
                            for j in range(t - lastScoring - 1):
                                batch_values.append(0)
                            break
                        state = newState
                    scores.append(env.score)
                    overall_data += t

                    if (overall_data >= 10000):
                        break
                print("Process " + str(self.id) + " finished playing.")
                batch = (batch_states, batch_actions, batch_values)
                self.conn.send((np.mean(scores), batch))
            treatQueue()
Example #4
def plot_results(path):
    env = gym.make("MountainCar-v0")
    baseline = A2C(env, {"actor": [30, 30], "critic": [30, 30]}, gamma=0.99)
    baseline.load(path + "save_bl\\")

    diayn_mod = build_diayn(2)
    diayn_mod.load(path + "save_diayn\\")
    pretrained = A2C.from_diayn(diayn_mod, 0)
    pretrained.load(path + "save_pt\\")

    plt.figure()
    plt.plot(range(99, len(pretrained.rewards)),
             np.convolve(pretrained.rewards,
                         np.ones(100) / 100, "valid"),
             label="pretrained")
    plt.plot(range(99, len(baseline.rewards)),
             np.convolve(baseline.rewards,
                         np.ones(100) / 100, "valid"),
             label="baseline")
    plt.legend()
    # save before show, otherwise the saved figure may end up blank
    plt.savefig(path + "results")
    plt.show()
    plt.pause(1)
Example #5
    def __init__(self):
        # self.name = name
        action_mask = [1, 1, 1, 1]
        self.controlTLIds = traci.trafficlight.getIDList()
        self.controlTLIds = self.controlTLIds[0]
        self.phaseDefs = ['GrrrGrrr', 'rGrrrGrr', 'rrGrrrGr', 'rrrGrrrG']
        self.yelloPhases = ['yrrryrrr', 'ryrrryrr', 'rryrrryr', 'rrryrrry']
        self.detectorIDs = traci.inductionloop.getIDList()
        self.controlLanes = get_laneID(self.detectorIDs)
        self.reset()
        state_size = len(self.state)
        self.learner = A2C(state_size, action_mask)
        self.buffer_reset()
        return
Example #6
def baseline_test(path):
    path_plot = path + "plot_bl\\"
    if not os.path.exists(path_plot):
        os.makedirs(path_plot)

    path_save = path + "save_bl\\"
    if not os.path.exists(path_save):
        os.makedirs(path_save)

    env = gym.make("MountainCar-v0")
    baseline = A2C(env, {"actor": [30, 30], "critic": [30, 30]}, gamma=0.99)
    model = baseline
    for k in range(0, 1):
        iter_ = 200
        model.train(iter_)
        model.plot_rewards(path_plot + "baseline_rewards_" +
                           str((k + 1) * iter_))
        mountaincar_baseline(
            model, path_plot + "baseline_trajectoires_" + str((k + 1) * iter_))
        model.save(path_save)
Example #7
def main(flags):
    '''
        Runs an agent in an environment.
        params:
            flags: run configuration (accessed via attributes, e.g. flags.gamma)
    '''
    env = gym.make('CartPole-v0')

    agent = A2C(env,
                gamma=flags.gamma,
                lambd=flags.lambd,
                learning_rate=flags.learning_rate,
                num_units=flags.num_units,
                num_layers=flags.num_layers,
                update_frequency=flags.update_frequency)

    trainer = ActorCriticTrainer(env, agent, flags)
    rewards, lengths = trainer.train(flags.num_episodes, flags.max_steps)

    plot_results(rewards, lengths)
Example #8
def main():
    args = parser.parse_args()

    with open(args.config) as f:
        config = yaml.safe_load(f)

    set_seed(config['seed'])

    writer = None
    # Will ERROR if outdir already exists
    if not os.path.exists(config['outdir']):
        os.makedirs(config['outdir'])
        if config['use_tensorboard']:
            os.makedirs(os.path.join(config['outdir'], 'tensorboard'))
            writer = SummaryWriter(
                os.path.join(config['outdir'], 'tensorboard'))
        # save a copy of the config file
        shutil.copyfile(args.config,
                        os.path.join(config['outdir'], 'config.yaml'))
    else:
        print("ERROR: directory './{}' already exists!".format(
            config['outdir']))
        raise EnvironmentError

    logger = get_logger(config)

    # create environment
    env = make_atari_env(config['task'],
                         num_env=config['parallel_envs'],
                         seed=config['seed'])
    env = VecFrameStack(env, n_stack=config['state_frames'])

    # default device for torch tensors
    device = torch.device('cuda') if config['use_gpu'] else torch.device('cpu')

    # start training
    a2c = A2C(config, env, device, logger, writer)
    a2c.train()
Example #9
def main():
    a2c_config = A2CConfig()
    set_seed(a2c_config.seed)

    # initialize environment
    env = football_env.create_environment(
        env_name=a2c_config.env_name,
        representation="simple115",
        number_of_left_players_agent_controls=1,
        stacked=False,
        logdir="/tmp/football",
        write_goal_dumps=False,
        write_full_episode_dumps=False,
        render=False)

    # state and action space
    state_space_size = env.observation_space.shape[0]  # simple115 representation
    if a2c_config.forbid_actions:
        # restrict to a subset of the action space
        action_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 13, 15]
        action_space_size = len(action_list)
    else:
        action_list = list(range(env.action_space.n))  # default action space
        action_space_size = len(action_list)

    # initialize model
    model_config = FFNModelConfig(state_space_size=state_space_size,
                                  action_space_size=action_space_size)
    model = FFN(model_config)

    # TODO multiprocessing env
    a2c = A2C(env=env,
              model=model,
              a2c_config=a2c_config,
              action_list=action_list)
    a2c.learn()
Example #10
def main():
    map_name = args.map
    envs_num = args.envs
    max_windows = args.max_windows
    total_updates = args.iters
    env_args = dict(
        map_name=map_name,
        battle_net_map=False,
        players=[sc2_env.Agent(sc2_env.Race.terran)],
        agent_interface_format=sc2_env.parse_agent_interface_format(
            feature_screen=32,
            feature_minimap=32,
            rgb_screen=None,
            rgb_minimap=None,
            action_space=None,
            use_feature_units=False,
            use_raw_units=False),
        step_mul=8,
        game_steps_per_episode=None,
        disable_fog=False,
        visualize=False
    )
    vis_env_args = env_args.copy()
    vis_env_args['visualize'] = True
    num_vis = min(envs_num, max_windows)
    env_fns = [partial(make_sc2env, **vis_env_args)] * num_vis
    num_no_vis = envs_num - num_vis
    if num_no_vis > 0:
        env_fns.extend([partial(make_sc2env, **env_args)] * num_no_vis)
    envs = SubprocVecEnv(env_fns)
    # a random agent implementation, used for debugging
    '''agents=[]
    for i in range(envs_num):
        agent=RandomAgent()
        agents.append(agent)'''

    '''observation_spec = envs.observation_spec()
    action_spec = envs.action_spec()
    processor = pro(observation_spec)
    for agent,obs_spec,act_spec in zip(agents,observation_spec,action_spec):
        agent.setup(obs_spec[0],act_spec[0])
    try:
        while True:
            num_frames=0
            timesteps= envs.reset()
            for a in agents:
                a.reset()
            while True:
                num_frames+=1
                last_timesteps=timesteps
                actions= [agent.step(timestep) for agent,timestep in zip(agents,timesteps)]
                timesteps=envs.step(actions)
                obs=processor.preprocess_obs(timesteps)
                a=1
    except KeyboardInterrupt:
        pass'''
    while True:
        if args.algorithm == 'a2c':
            agent = A2C(envs, args)
        elif args.algorithm == 'ppo':
            agent = PPO(envs, args)
        agent.reset()
        if os.path.exists(args.load_model):
            agent.net.load_state_dict(torch.load(args.load_model))
        # try:
        while True:
            agent.train()
            if agent.sum_episode > total_updates:
                print("over############################\n\n\n")
                break

    #except :
        #print(agent.last_obs['available_actions'])

    envs.close()
Example #11
    torch.cuda.manual_seed_all(args.seed)
    torch.set_num_threads(1)
    device = torch.device('cuda' if args.cuda else 'cpu')
    # device = torch.device('cpu')
    env = gym.make(args.env_name)
    env.seed(args.seed)

    # reset env and preprocess to obtain the input shape (fed into the network)
    obs = torch.from_numpy(utils.preprocess(env.reset())).float().unsqueeze(0).unsqueeze(0)

    shared_ac = Policy(obs.shape)
    shared_ac.to(device)
    shared_ac.share_memory()


    agent = A2C(shared_ac, args)

    if args.cuda:  # needed to make CUDA work with multiprocessing, but very slow
        torch.multiprocessing.set_start_method('spawn')

    processes = []
    counter = mp.Value('i', 0)
    lock = mp.Lock()

    for idx in range(0, args.num_processes):
        # p = mp.Process(target=train, args=(agent, shared_ac, args, 'cpu', idx, counter, lock))
        p = mp.Process(target=train, args=(agent, shared_ac, args, device, idx, counter, lock))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
Example #12
def main(_):

    if FLAGS.debug:
        tf.config.experimental_run_functions_eagerly(True)

    with open(f"configs/{FLAGS.algo}.yaml") as file:
        kwargs = yaml.load(file, Loader=yaml.FullLoader)

    os.makedirs(FLAGS.logs_dir, exist_ok=True)

    tf.random.set_seed(FLAGS.seed)

    envs = make_vec_envs(FLAGS.env_name, FLAGS.seed, kwargs['num_processes'],
                         FLAGS.logs_dir)

    for gpu in tf.config.experimental.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)

    def get_obs():
        return envs.stackedobs

    def env_step(action):
        next_obs, reward, done, _ = envs.step(action)
        return next_obs, reward.astype(np.float32), done.astype(np.float32)

    batch_size = kwargs['num_steps'] * kwargs['num_processes']

    if FLAGS.algo == 'ppo':
        actor_critic = PPO((-1, *envs.observation_space.shape),
                           envs.action_space.n, FLAGS.entropy_coef,
                           FLAGS.value_loss_coef, FLAGS.gamma, **kwargs)
    else:
        del kwargs['num_processes']
        actor_critic = A2C((-1, *envs.observation_space.shape),
                           envs.action_space.n, FLAGS.entropy_coef,
                           FLAGS.value_loss_coef, FLAGS.gamma, **kwargs)

    num_updates = FLAGS.max_timesteps // batch_size

    val_loss, act_loss, ent_loss = 0, 0, 0

    hparam_str = utils.get_haram_str(env_name=FLAGS.env_name, seed=FLAGS.seed)
    writer = tf.summary.create_file_writer(
        os.path.join(FLAGS.save_dir, 'tb', hparam_str))
    writer.set_as_default()

    envs.reset()
    for i in tqdm(range(num_updates), unit_scale=batch_size, smoothing=0.1):

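        # linearly decay the learning rate towards zero over the remaining updates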
        actor_critic.set_learning_rate(kwargs['learning_rate'] *
                                       (1.0 - i / num_updates))

        value_loss, action_loss, entropy_loss = actor_critic.update(
            env_step, get_obs)

        val_loss += value_loss
        act_loss += action_loss
        ent_loss += entropy_loss

        if i % FLAGS.log_interval == 0 and i > 0:
            tf.summary.scalar("losses/value_loss",
                              val_loss / FLAGS.log_interval,
                              step=batch_size * i)
            tf.summary.scalar("losses/action_loss",
                              act_loss / FLAGS.log_interval,
                              step=batch_size * i)
            tf.summary.scalar("losses/entropy_loss",
                              ent_loss / FLAGS.log_interval,
                              step=batch_size * i)
            tf.summary.flush()

            val_loss = 0
            act_loss = 0
            ent_loss = 0
Example #13
                        help="Random seed for the environment.")
    parser.add_argument('--num_episodes',
                        type=int,
                        default=1,
                        help="Number of test episodes.")
    parser.add_argument('--stochastic',
                        action='store_true',
                        help="Use stochastic policy in testing.")
    parser.add_argument('--record',
                        action='store_true',
                        help="Record videos of test episodes.")
    parser.add_argument('--video_dir',
                        help="Directory to store recorded videos.")
    args = parser.parse_args()
    env = gym.make('LunarLander-v2')
    env.seed(args.seed)
    if args.record:
        env = gym.wrappers.Monitor(env, args.video_dir, force=True)
    if args.agent_type == 'reinforce':
        agent = Reinforce(env, 0)
    elif args.agent_type == 'a2c':
        agent = A2C(env, 0, args.n)
    else:
        print('Unknown agent type %s' % args.agent_type)
        exit(1)
    agent.model.load_state_dict(
        torch.load(args.model_path, map_location=lambda storage, loc: storage))
    stochastic = args.stochastic
    r_avg, r_std = agent.eval(args.num_episodes, stochastic=stochastic)
    print('Reward average %.6f std %.6f' % (r_avg, r_std))
Example #14
    ###############################
    # MAKE NET AND POLICY
    critic_net = FFNet(in_size=2, out_size=1)
    actor_net = FFNet(in_size=2, out_size=2)
    plc = None
    if train_config['policy'] == 'angular':
        plc = policy.AngularPolicy(actor_net, train_config['sigma'])
    elif train_config['policy'] == 'gauss':
        plc = policy.GaussianPolicy(actor_net, train_config['sigma'])
    else:
        raise RuntimeError('Not a valid policy: %s' % train_config['policy'])

    ###############################
    # CREATE ENVIRONMENT AND RUN
    algo = A2C(plc, critic_net, train_config['lr'], train_config['gamma'])

    sampler = sampler.BatchSampler(plc, **train_config)
    cumulative_rewards = np.array([]).reshape((0, 3))
    cur_update = 0
    finished_episodes = 0
    sampler.reset()

    while cur_update < train_config['num_updates']:
        batch, terminal = sampler.sample()
        algo.update(batch, terminal)
        cr = sampler.cumulative_reward

        # save cumulative rewards
        for i, t in enumerate(terminal):
            if t:
Example #15
from hyperparams import HyperParams
from a2c import A2C
import torch.multiprocessing as mp

if __name__ == "__main__":
    mp.set_start_method('forkserver')
    a2c_trainer = A2C()
    hyps = dict()
    hyps['exp_name'] = "pongbptt"
    hyps['env_type'] = "Pong-v0"
    hyps['model_type'] = 'conv'
    hyps['use_bptt'] = True
    hyps['entr_coef'] = .01
    hyps['entr_coef_low'] = .001
    hyps['decay_entr'] = True
    hyps['val_coef'] = .5
    hyps['lr'] = 5e-4
    hyps['lr_low'] = 1e-6
    hyps['decay_lr'] = True
    hyps['gamma'] = .98
    hyps['lambda_'] = .95
    hyps['n_tsteps'] = 32
    hyps['n_rollouts'] = 36
    hyps['n_envs'] = 13
    hyps['max_tsteps'] = 40000000
    hyps['n_frame_stack'] = 3
    hyps['optim_type'] = 'rmsprop'
    hyper_params = HyperParams(hyps)
    a2c_trainer.train(hyper_params.hyps)

Example #16
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_env',
                        type=str,
                        default="BanditTenArmedRandomRandom-v0",
                        help='env for meta-training')
    parser.add_argument('--train_eps',
                        type=int,
                        default=100,
                        help='training episodes per trial')
    parser.add_argument('--train_trial_n',
                        type=int,
                        default=1000,
                        help='number of trials during training')
    parser.add_argument('--seed', type=int, default=1, help='experiment seed')

    # Training Hyperparameters
    parser.add_argument('--hidden',
                        type=int,
                        default=48,
                        help='hidden layer dimensions')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.8,
                        help='discount factor')
    args = parser.parse_args()

    x, y, e = [], [], []
    for trial in range(1, args.train_trial_n + 1):
        env = gym.make(args.train_env)
        env._seed(args.seed)
        env.reset()

        # initialize algorithm at first iteration
        if trial == 1:
            action_dim = env.action_space.n
            input_dim = 3
            algo = A2C(session=get_session(),
                       policy_cls=LSTMPolicy,
                       input_dim=input_dim,
                       hidden_dim=args.hidden,
                       action_dim=action_dim,
                       scope='a2c')
        algo.reset()
        """
		what does the env.unwrapped do exactly?

		https://discuss.pytorch.org/t/in-the-official-q-learning-example-what-does-the-env-unwrapped-do-exactly/28695

		there is a core super class called gym.Env and there are other sub classes of this to implement different environments (CartPoleEnv, MountainCarEnv etc). This unwrapped property is used to get the underlying gym.Env object from other environments.
		"""

        save_iter = args.train_trial_n // 20
        tot_returns = []
        prop_reward = []
        tot_regret = []
        tot_subopt = []

        ep_X, ep_R, ep_A, ep_V, ep_D = [], [], [], [], []
        track_R = 0
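        # regret baseline: expected return from always pulling the best arm for the whole trial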
        track_regret = np.max(env.unwrapped.p_dist) * args.train_eps
        best_action = np.argmax(env.unwrapped.p_dist)
        num_suboptimal = 0
        action_hist = np.zeros(env.action_space.n)

        action = 0
        rew = 0
        # begin a trial
        for ep in range(args.train_eps):
            # run policy
            #print(action,rew, ep)
            algo_input = np.array([action, rew, ep])
            #print(algo_input)
            if len(algo_input.shape) <= 1:
                algo_input = algo_input[None]
            action, value = algo.get_actions(algo_input)
            new_obs, rew, done, info = env.step(action)
            track_R += rew
            num_suboptimal += int(action != best_action)
            action_hist[action] += 1
            if ep == 0:
                ep_X = algo_input
            else:
                ep_X = np.concatenate([ep_X, algo_input], axis=0)
            ep_A.append(action)
            ep_V.append(value)
            ep_R.append(rew)
            ep_D.append(done)

        # update policy
        ep_X = np.asarray(ep_X, dtype=np.float32)
        ep_R = np.asarray(ep_R, dtype=np.float32)
        ep_A = np.asarray(ep_A, dtype=np.int32)
        ep_V = np.squeeze(np.asarray(ep_V, dtype=np.float32))
        ep_D = np.asarray(ep_D, dtype=np.float32)
        last_value = value

        if ep_D[-1] == 0:
            disc_rew = discount_with_dones(
                ep_R.tolist() + [np.squeeze(last_value)],
                ep_D.tolist() + [0], args.gamma)[:-1]
        else:
            disc_rew = discount_with_dones(ep_R.tolist(), ep_D.tolist(),
                                           args.gamma)
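        # advantage = discounted return minus the critic's value estimate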
        ep_adv = disc_rew - ep_V
        prop_reward.append(track_R / track_regret)
        track_regret -= track_R

        train_info = algo.train(ep_X=ep_X, ep_A=ep_A, ep_R=ep_R, ep_adv=ep_adv)
        tot_returns.append(track_R)
        tot_regret.append(track_regret)
        tot_subopt.append(num_suboptimal)

        if trial % save_iter == 0 and trial != 0:
            print("Episode: {}".format(trial))
            print("MeanReward: {}".format(np.mean(tot_returns[-save_iter:])))
            print("StdReward: {}".format(np.std(tot_returns[-save_iter:])))
            print("MeanRegret: {}".format(np.mean(tot_regret[-save_iter:])))
            print("StdRegret: {}".format(np.std(tot_regret[-save_iter:])))
            print("NumSuboptimal: {}".format(np.mean(tot_subopt[-save_iter:])))
            cur_y = np.mean(prop_reward[-save_iter:])
            cur_e = np.std(prop_reward[-save_iter:])
            x.append(trial)
            y.append(cur_y)
            e.append(cur_e)
            print("MeanPropReward: {}".format(cur_y))
            print("StdPropReward: {}".format(cur_e))

    x = np.asarray(x, dtype=int)  # np.int was removed from NumPy; use the builtin
    y = np.asarray(y, dtype=np.float32)
    e = np.asarray(e, dtype=np.float32)
    # plt.errorbar(x, y, e)
    # plt.show()

    # database
    db = {}
    db['x'] = x
    db['y'] = y
    db['e'] = e

    file_name = args.train_env[:-3] + str(args.train_trial_n)
    pickle.dump(db, open(file_name + ".p", "wb"))
Example #17
runner = Runner(env_name, n_envs, pool)

# Create model and optimizer
action_dim = 2  # Pong specific number of possible actions
net = model.Model(runner.obs_shape, action_dim, batch_norm=batch_norm)
if torch.cuda.is_available():
    net = net.cuda()
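    # rebind the default tensor constructors so later allocations land on the GPU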
    torch.FloatTensor = torch.cuda.FloatTensor
    torch.LongTensor = torch.cuda.LongTensor
optimizer = optim.Adam(net.parameters(), lr=lr)

a2c = A2C(net,
          n_envs,
          pool,
          val_const=val_const,
          entropy_const=entropy_const,
          spatio_const=spatio_const,
          gamma=gamma,
          lambda_=lambda_,
          predict_spatio=predict_spatio)

if resume:
    net.load_state_dict(torch.load(net_save_file))
    optimizer.load_state_dict(torch.load(optim_save_file))
    logger = open(log_file, 'a+')
else:
    logger = open(log_file, 'w+')
    if batch_norm:
        logger.write("Batch Norm = True\n")
    else:
        logger.write("Batch Norm = False\n")
Example #18
import os

writer = SummaryWriter(os.path.join('runs', name_dir(config_enhanced)))

# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
# os.environ["CUDA_VISIBLE_DEVICES"] = config_enhanced["GPU_id"]

print("Current config_enhanced is:")
pprint(config_enhanced)
writer.add_text("config", str(config_enhanced))

env = CholeskyTaskGraph(**config_enhanced['env_settings'])
# env.reset()

# model = Net
# model = SimpleNet
model = ResNetG
# model = SimpleNetMax

agent = A2C(config_enhanced, env, model=model, writer=writer)

# rewards = Parallel(n_jobs=config_enhanced['num_cores'])(
#     delayed(wrap_non_picklable_objects(agent.training_batch))(config_enhanced['epochs'],
#                          config_enhanced['nbatch']) for i in range(config_enhanced['num_cores']))

agent.training_batch()

# TODO: evaluate test_mode and save if better than the previous best
# TODO: Transfer?
# TODO: load training batch during GPU training
Example #19
def main(
    _run,
    _log,
    num_env_steps,
    env_name,
    seed,
    algorithm,
    dummy_vecenv,
    time_limit,
    wrappers,
    save_dir,
    eval_dir,
    loss_dir,
    log_interval,
    save_interval,
    eval_interval,
):

    if loss_dir:
        loss_dir = path.expanduser(loss_dir.format(id=str(_run._id)))
        utils.cleanup_log_dir(loss_dir)
        writer = SummaryWriter(loss_dir)
    else:
        writer = None

    eval_dir = path.expanduser(eval_dir.format(id=str(_run._id)))
    save_dir = path.expanduser(save_dir.format(id=str(_run._id)))

    utils.cleanup_log_dir(eval_dir)
    utils.cleanup_log_dir(save_dir)

    torch.set_num_threads(1)
    envs = make_vec_envs(
        env_name,
        seed,
        dummy_vecenv,
        algorithm["num_processes"],
        time_limit,
        wrappers,
        algorithm["device"],
    )

    agents = [
        A2C(i, osp, asp)
        for i, (osp, asp) in enumerate(zip(envs.observation_space, envs.action_space))
    ]
    obs = envs.reset()

    for i in range(len(obs)):
        agents[i].storage.obs[0].copy_(obs[i])
        agents[i].storage.to(algorithm["device"])

    start = time.time()
    num_updates = (
        int(num_env_steps) // algorithm["num_steps"] // algorithm["num_processes"]
    )

    all_infos = deque(maxlen=10)

    for j in range(1, num_updates + 1):

        for step in range(algorithm["num_steps"]):
            # Sample actions
            with torch.no_grad():
                n_value, n_action, n_action_log_prob, n_recurrent_hidden_states = zip(
                    *[
                        agent.model.act(
                            agent.storage.obs[step],
                            agent.storage.recurrent_hidden_states[step],
                            agent.storage.masks[step],
                        )
                        for agent in agents
                    ]
                )
            # Observe reward and next obs
            obs, reward, done, infos = envs.step(n_action)
            # envs.envs[0].render()

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])

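            # bad_masks distinguish true episode ends from TimeLimit truncations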
            bad_masks = torch.FloatTensor(
                [
                    [0.0] if info.get("TimeLimit.truncated", False) else [1.0]
                    for info in infos
                ]
            )
            for i in range(len(agents)):
                agents[i].storage.insert(
                    obs[i],
                    n_recurrent_hidden_states[i],
                    n_action[i],
                    n_action_log_prob[i],
                    n_value[i],
                    reward[:, i].unsqueeze(1),
                    masks,
                    bad_masks,
                )

            for info in infos:
                if info:
                    all_infos.append(info)

        # value_loss, action_loss, dist_entropy = agent.update(rollouts)
        for agent in agents:
            agent.compute_returns()

        for agent in agents:
            loss = agent.update([a.storage for a in agents])
            for k, v in loss.items():
                if writer:
                    writer.add_scalar(f"agent{agent.agent_id}/{k}", v, j)

        for agent in agents:
            agent.storage.after_update()

        if j % log_interval == 0 and len(all_infos) > 1:
            squashed = _squash_info(all_infos)

            total_num_steps = (
                (j + 1) * algorithm["num_processes"] * algorithm["num_steps"]
            )
            end = time.time()
            _log.info(
                f"Updates {j}, num timesteps {total_num_steps}, FPS {int(total_num_steps / (end - start))}"
            )
            _log.info(
                f"Last {len(all_infos)} training episodes mean reward {squashed['episode_reward'].sum():.3f}"
            )

            for k, v in squashed.items():
                _run.log_scalar(k, v, j)
            all_infos.clear()

        if save_interval is not None and (
            j > 0 and j % save_interval == 0 or j == num_updates
        ):
            cur_save_dir = path.join(save_dir, f"u{j}")
            for agent in agents:
                save_at = path.join(cur_save_dir, f"agent{agent.agent_id}")
                os.makedirs(save_at, exist_ok=True)
                agent.save(save_at)
            archive_name = shutil.make_archive(cur_save_dir, "xztar", save_dir, f"u{j}")
            shutil.rmtree(cur_save_dir)
            _run.add_artifact(archive_name)

        if eval_interval is not None and (
            j > 0 and j % eval_interval == 0 or j == num_updates
        ):
            evaluate(
                agents, os.path.join(eval_dir, f"u{j}"),
            )
            videos = glob.glob(os.path.join(eval_dir, f"u{j}") + "/*.mp4")
            for i, v in enumerate(videos):
                _run.add_artifact(v, f"u{j}.{i}.mp4")
    envs.close()
Example #20
def train():
    """
    1. Process data.
    2. Train actor supervised in SL model
    3. Train critic supervised
    4. Train RL agent as a function of actor and critic weights.
    """
    # process data
    state_inputs, prev_order_inputs, prev_orders_game_labels, season_names, \
    supply_center_owners, board_dict_list = get_data("data/standard_no_press.jsonl", num_games=1)

    # train SL actor
    print("Training SL actor")
    actor_sl = SL_model(num_board_blocks=16, num_order_blocks=16)
    actor_sl.train(state_inputs, prev_order_inputs, prev_orders_game_labels, season_names, board_dict_list)

    # save actor weights
    print("Saving SL actor weights")
    weights_file = open("actor_weights.pickle", "wb+")
    pickle.dump(actor_sl.get_weights(), weights_file)
    weights_file.close()

    # train SL critic
    print("Training SL critic")
    critic_sl = CriticSL()
    critic_sl.train(state_inputs, supply_center_owners)

    # save critic weights
    print("Saving SL critic weights")
    weights_file = open("critic_weights.pickle","wb+")
    pickle.dump(critic_sl.get_weights(), weights_file)
    weights_file.close()

    # load actor, critic weights from SL
    print("Loading actor, critic weights ready for RL training")
    ### LOADING ACTOR DOESN'T WORK BECAUSE YOU NEED TO CALL IT ON SOMETHING FIRST ###
    ## see https://stackoverflow.com/questions/55719047/is-loading-in-eager-tensorflow-broken-right-now
    new_weights_file = open("sl_weights_50_chunks.pickle", "rb")
    new_weights_actor = pickle.load(new_weights_file)
    new_weights_file.close()

    actor_rl = ActorRL(num_board_blocks=16, num_order_blocks=16)
    # actor_rl.call(state_inputs[0], prev_order_inputs[0], season_names[0],board_dict_list[0],"AUSTRIA")

    ##########################################################################
    new_weights_file = open("critic_weights.pickle","rb")
    new_weights = pickle.load(new_weights_file)
    new_weights_file.close()

    critic_rl = CriticRL()
    # needed so that critic_rl sees the input shapes before its weights are set
    train_data = critic_sl.process_data(state_inputs, supply_center_owners)[0][0]
    set_rl_weights(new_weights, critic_rl, train_data)

    # Train RL A2C
    print("Training A2C")
    a2c = A2C(actor_rl, critic_rl)
    a2c.train(num_episodes=1)
    actor_rl.set_weights(new_weights_actor)
    a2c.train(num_episodes=1)

    # save actor/critic RL weights
    print("Saving RL actor/critic weights")
    weights_file = open("critic_rl_weights.pickle", "wb+")
    pickle.dump(critic_rl.get_weights(), weights_file)
    weights_file.close()

    weights_file = open("actor_rl_weights.pickle", "wb+")
    pickle.dump(actor_rl.get_weights(), weights_file)
    weights_file.close()
    print("Done!")
Example #21
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_env',
                        type=str,
                        default="MediumBandit-v0",
                        help='env for meta-training')
    parser.add_argument('--test_env',
                        type=str,
                        default="EasyBandit-v0",
                        help='env for meta-testing')
    parser.add_argument('--train_eps',
                        type=int,
                        default=int(2e4),
                        help='training episodes')
    parser.add_argument('--test_eps',
                        type=int,
                        default=300,
                        help='test episodes')
    parser.add_argument('--seed', type=int, default=1, help='experiment seed')

    # Training Hyperparameters
    parser.add_argument('--hidden',
                        type=int,
                        default=48,
                        help='hidden layer dimensions')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.8,
                        help='discount factor')
    args = parser.parse_args()

    env = gym.make(args.train_env)
    env.seed(args.seed)

    eval_env = gym.make(args.test_env)
    eval_env.seed(args.seed)

    algo = A2C(env=env,
               session=get_session(),
               policy_cls=LSTMPolicy,
               hidden_dim=args.hidden,
               action_dim=env.action_space.n,
               scope='a2c')

    save_iter = args.train_eps // 20
    average_returns = []
    average_regret = []
    average_subopt = []

    for ep in range(args.train_eps):
        obs = env.reset()
        done = False
        ep_X, ep_R, ep_A, ep_V, ep_D = [], [], [], [], []
        track_R = 0
        track_regret = np.max(env.unwrapped.probs) * env.unwrapped.n
        best_action = np.argmax(env.unwrapped.probs)
        num_suboptimal = 0
        action_hist = np.zeros(env.action_space.n)
        algo.reset()

        while not done:
            action, value = algo.get_actions(obs[None])
            new_obs, rew, done, info = env.step(action)
            track_R += rew
            num_suboptimal += int(action != best_action)
            action_hist[action] += 1

            ep_X.append(obs)
            ep_A.append(action)
            ep_V.append(value)
            ep_R.append(rew)
            ep_D.append(done)

            obs = new_obs
        _, last_value = algo.get_actions(obs[None])
        ep_X = np.asarray(ep_X, dtype=np.float32)
        ep_R = np.asarray(ep_R, dtype=np.float32)
        ep_A = np.asarray(ep_A, dtype=np.int32)
        ep_V = np.squeeze(np.asarray(ep_V, dtype=np.float32))
        ep_D = np.asarray(ep_D, dtype=np.float32)

        if ep_D[-1] == 0:
            disc_rew = discount_with_dones(
                ep_R.tolist() + [np.squeeze(last_value)],
                ep_D.tolist() + [0], args.gamma)[:-1]
        else:
            disc_rew = discount_with_dones(ep_R.tolist(), ep_D.tolist(),
                                           args.gamma)
        ep_adv = disc_rew - ep_V
        track_regret -= track_R

        train_info = algo.train(ep_X=ep_X, ep_A=ep_A, ep_R=ep_R, ep_adv=ep_adv)
        average_returns.append(track_R)
        average_regret.append(track_regret)
        average_subopt.append(num_suboptimal)

        if ep % save_iter == 0 and ep != 0:
            print("Episode: {}".format(ep))
            print("ActionHist: {}".format(action_hist))
            print("Probs: {}".format(env.unwrapped.probs))
            print("MeanReward: {}".format(np.mean(average_returns[-50:])))
            print("MeanRegret: {}".format(np.mean(average_regret[-50:])))
            print("NumSuboptimal: {}".format(np.mean(average_subopt[-50:])))

    print()
    test_regrets = []
    test_rewards = []
    for test_ep in range(args.test_eps):
        obs = eval_env.reset()
        algo.reset()
        done = False
        track_regret = np.max(eval_env.unwrapped.probs) * eval_env.unwrapped.n
        track_R = 0

        while not done:
            action, value = algo.get_actions(obs[None])
            new_obs, rew, done, info = eval_env.step(action)
            obs = new_obs
            track_R += rew

        test_regrets.append(track_regret - track_R)
        test_rewards.append(track_R)
    print('Mean Test Cumulative Regret: {}'.format(np.mean(test_regrets)))
    print('Mean Test Reward: {}'.format(np.mean(test_rewards)))
Example #22
def main():
    env = MyDoom()
    agent = A2C(unsup, envWrap, designHead, noReward)

    actions = [[True, False, False], [False, True, False], [False, False, True]]
    
    last_state = env.reset()
    last_features = agent.network.get_initial_features()  # reset lstm memory
    length = 0
    rewards = 0
    values = 0
    ep_bonus = 0
    life_bonus = 0
    timestep_limit = 524  # roughly 2100 / 4
    episodes = 0
    total_steps = 0

    f_loss = open('./logs/loss.txt', 'a')
    f_pred_loss = open('./logs/pred_loss.txt', 'a')
    f_reward = open('./logs/reward.txt', 'a')

    with tf.Session() as sess, sess.as_default():
        init_op(sess)

        while True:
            terminal_end = False
            rollout = PartialRollout(True)
            
            for _ in range(constants['ROLLOUT_MAXLEN']):
                # run policy
                fetched = agent.network.act(last_state, *last_features)
                action, value_, features = fetched[0], fetched[1], fetched[2:]

                # run environment: get action_index from sampled one-hot 'action'
                stepAct = action.argmax()
                
                # action repeat
                state, reward, terminal = env.skip_step(actions[stepAct])
                total_steps += 1
                if terminal: state = last_state

                if noReward:
                    reward = 0.

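                # intrinsic bonus from the action-prediction network (curiosity-style reward)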
                bonus = agent.ap_network.pred_bonus(last_state, state, action)
                curr_tuple = [last_state, action, reward, value_, terminal, last_features, bonus, state]
                life_bonus += bonus
                ep_bonus += bonus

                # collect the experience
                rollout.add(*curr_tuple)
                rewards += reward
                length += 1
                values += value_[0]

                last_state = state
                last_features = features

                if terminal or length >= timestep_limit:
                    # prints summary of each life if envWrap==True else each game
                    print("Episode %d finished. Sum of shaped rewards: %.2f. Length: %d. Bonus: %.4f." % (episodes ,rewards, length, life_bonus))
                    f_reward.write(str(total_steps) + "," + str(rewards) + "\n")
                    f_loss.flush()
                    f_pred_loss.flush()
                    f_reward.flush()
                    if (episodes % 100 == 0): env.make_gif("./video/" + str(episodes) + ".gif")
                    life_bonus = 0
                    length = 0
                    rewards = 0
                    terminal_end = True
                    last_features = agent.network.get_initial_features()  # reset lstm memory
                    last_state = env.reset()
                    episodes += 1

                if terminal_end:
                    break

            if not terminal_end:
                rollout.r = agent.network.value(last_state, *last_features)

            loss, pred_loss = agent.process(sess, rollout)
            f_loss.write(str(total_steps) + "," + str(loss) + "\n")
            f_pred_loss.write(str(total_steps) + "," + str(pred_loss) + "\n")

    env.close()
    f_reward.close()
    f_loss.close()
    f_pred_loss.close()
Example #23
import gym
from a2c import A2C

from utils.a2c_runner import vector_train
from utils.a2c_runner import evaluate

if __name__ == "__main__":
    env = gym.vector.make("CartPole-v1", num_envs=4, asynchronous=True)
    actor = A2C(env.single_observation_space, env.single_action_space)

    returns = vector_train(actor, env, 100000, 300)

    eval_env = gym.make("CartPole-v1")
    evaluate(actor, eval_env, 1, True)
Example #24
import gym
import torch

from a2c import A2C
from wrappers import RecordEpisodeStatistics, TimeLimit

path = "pretrained/rware-small-4ag"
env_name = "rware-small-4ag-v1"
time_limit = 500  # 25 for LBF

RUN_STEPS = 1500

env = gym.make(env_name)
env = TimeLimit(env, time_limit)
env = RecordEpisodeStatistics(env)

agents = [
    A2C(i, osp, asp, 0.1, 0.1, False, 1, 1, "cpu")
    for i, (osp,
            asp) in enumerate(zip(env.observation_space, env.action_space))
]

for agent in agents:
    agent.restore(path + f"/agent{agent.agent_id}")

obs = env.reset()

for i in range(RUN_STEPS):
    obs = [torch.from_numpy(o) for o in obs]
    _, actions, _, _ = zip(*[
        agent.model.act(obs[agent.agent_id], None, None) for agent in agents
    ])
    actions = [a.item() for a in actions]