Example #1
def play(env_name, seed, load_file, total_timesteps, n_cpu):
    np.set_printoptions(precision=5)
    def padding_obss(obss, dummy_obss):
        dummy_obss[0, :, :, :] = obss
        return dummy_obss
    # A trained LSTM model cannot change the number of envs,
    # so the observation is reshaped by padding with dummy data.
    dummy_obss = np.zeros((n_cpu, 64, 64, 4))
    env = SubprocVecEnv([make_env(env_name, 0, seed)])
    model = PPO2.load(load_file, verbose=1)
    obss = env.reset()
    obss = padding_obss(obss, dummy_obss)
    rewards_buf = []
    steps_buf = []
    # TODO: single
    for i in range(total_timesteps):
        actions, _states = model.predict(obss)
        actions = actions[0:1]
        obss, rewards, dones, infos = env.step(actions)
        obss = padding_obss(obss, dummy_obss)
        # env.render() # dummy
        if dones[0]:
            rewards_buf.append(infos[0]['episode']['r'])
            steps_buf.append(infos[0]['episode']['l'])
            line = np.array([np.mean(rewards_buf), np.std(rewards_buf), np.mean(steps_buf), np.std(steps_buf)])
            print(len(rewards_buf), line)
            obss = env.reset()
            obss = padding_obss(obss, dummy_obss)
    env.close()
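Note: several of these snippets call a make_env helper that is not shown. A minimal sketch of such a factory, assuming it follows the standard stable-baselines pattern (the argument names mirror the calls above; treat the body as an illustration, not the original helper):

import gym
from stable_baselines.common import set_global_seeds

def make_env(env_id, rank, seed=0):
    # Returns a thunk; SubprocVecEnv calls it inside each worker process.
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)  # give every worker its own seed
        return env
    set_global_seeds(seed)
    return _init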
Example #2
def main():
    #env = SubprocVecEnv([(lambda i=i: SwocGym(i+1, GameServicePath, i, fieldWidth=10, fieldHeight=10, saveEpisode=True)) for i in range(1)])
    env = SubprocVecEnv([
        (lambda i=i: MazeGym(mazeWidth=10, mazeHeight=10, nrWallsToRemove=10))
        for i in range(1)
    ])
    try:
        model = PPO2("MlpPolicy",
                     env,
                     verbose=1,
                     tensorboard_log='/home/ralph/swoc2019/log')
        if SaveFile.exists():
            print('loading...')
            model.load_parameters(SaveFile)
        else:
            print('Warning: No save file loaded')

        print('evaluating...', end='')
        obs = env.reset()
        totalRewards = None
        for i in range(100):
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            totalRewards = totalRewards + rewards if totalRewards is not None else rewards
            env.render()
            sleep(0.2)
        print(f'mean reward: {np.mean(totalRewards)}')

    except KeyboardInterrupt:
        print('closing...')
    finally:
        env.close()
    print('closed')
Example #3
def train():
    def callback(_locals, _globals):
        # Save model
        _locals['self'].save(MODEL_NAME)

    envs = [create_env_headless for _ in range(ENV_COUNT)]
    vec_envs = SubprocVecEnv(envs)
    model = PPO2('CnnPolicy',
                 vec_envs,
                 verbose=1,
                 ent_coef=0.0001,
                 n_steps=256)

    if not os.path.isfile(MODEL_NAME):
        model.save(MODEL_NAME)
        vec_envs.close()
        print("Run again to train")
    else:
        model.learn(total_timesteps=TIMESTEPS, callback=callback)
        model.save(MODEL_NAME)
        vec_envs.close()
        print("Training Done")

        # Evaluation
        print("Evaluation")
        vec_env = create_env_headless()
        vec_env = DummyVecEnv([lambda: vec_env])
        model = PPO2.load(MODEL_NAME)
        print(evaluate_policy(model, vec_env, n_eval_episodes=100))
        print(evaluate_policy(model, vec_env, n_eval_episodes=100))
        vec_env.close()
Example #4
def train_policy():
    ppo_config = {
        "gamma": 0.9988,
        "n_steps": 200,
        "ent_coef": 0,
        "learning_rate": 0.001,
        "vf_coef": 0.99,
        "max_grad_norm": 0.1,
        "lam": 0.95,
        "nminibatches": 5,
        "noptepochs": 100,
        "cliprange": 0.2,
        "tensorboard_log": log_relative_path
    }
    os.makedirs(log_relative_path)
    policy_kwargs = dict(act_fun=tf.nn.tanh, net_arch=[256, 128])
    env = SubprocVecEnv([_make_env(rank=i) for i in range(5)])
    model = PPO2(MlpPolicy,
                 env,
                 _init_setup_model=True,
                 policy_kwargs=policy_kwargs,
                 verbose=1,
                 **ppo_config)
    model.learn(total_timesteps=1000,
                tb_log_name="ppo2",
                reset_num_timesteps=False)
    model.save(os.path.join(log_relative_path, 'model'))
    env.env_method("save_world", log_relative_path)
    env.close()
    return
Example #5
def main():
    env = SubprocVecEnv([(lambda i=i: SwocGym(
        i + 1, GameServicePath, i, actionRepeat=4, oneTarget=True))
                         for i in range(4)])
    try:
        model = PPO2("MlpPolicy",
                     env,
                     verbose=1,
                     policy_kwargs={
                         'net_arch': [256, 256, 256, 128, 128, 128],
                         'act_fun': tf.nn.relu
                     },
                     n_steps=256,
                     ent_coef=0.0,
                     learning_rate=1e-5)
        if SaveFile.exists():
            print('loading...')
            model.load_parameters(SaveFile)
        else:
            print('Warning: No save file loaded')

        print('evaluating...', end='')
        totalRewards = evaluate(env, model)
        print(f'mean reward: {np.mean(totalRewards)}')

    except KeyboardInterrupt:
        print('closing...')
    finally:
        env.close()
    print('closed')
Example #6
def play(env_name, seed, load_file, total_timesteps, n_cpu):
    np.set_printoptions(precision=5)
    def padding_obss(obss, dummy_obss):
        dummy_obss[0, :, :, :] = obss
        return dummy_obss
    # In GUI mode the number of envs is reduced to 1 to limit GUI windows,
    # but a trained LSTM model cannot change the number of envs,
    # so the observation is reshaped by padding with dummy data.
    isGUI = env_name.find('GUI') != -1
    dummy_obss = np.zeros((n_cpu, 64, 64, 4)) if isGUI else None
    env = SubprocVecEnv([make_env(env_name, i, seed) for i in range(1 if isGUI else n_cpu)])
    model = PPO2.load(load_file, verbose=1)
    obss = env.reset()
    obss = padding_obss(obss, dummy_obss) if isGUI else obss
    rewards_buf = []
    steps_buf = []
    # TODO: single
    for i in range(total_timesteps):
        actions, _states = model.predict(obss)
        actions = actions[0:1] if isGUI else actions
        obss, rewards, dones, infos = env.step(actions)
        obss = padding_obss(obss, dummy_obss) if isGUI else obss
        # env.render() # dummy
        if dones.any():
            rewards_buf.extend([ info['episode']['r'] for info in infos if 'episode' in info ])
            steps_buf.extend([ info['episode']['l'] for info in infos if 'episode' in info ])
            line = np.array([np.mean(rewards_buf), np.std(rewards_buf), np.mean(steps_buf), np.std(steps_buf)])
            print(len(rewards_buf), line)
    env.close()
Example #7
def run_experiment(args):
    hyperparam_file = os.path.join(HYPERPARAM_DIR, args.agent + ".yml")
    with open(hyperparam_file) as f:
        hyperparams = yaml.safe_load(f)

    hyperparams = hyperparams[args.env]

    n_envs = hyperparams.pop("n_envs", 1)
    n_timesteps = int(hyperparams.pop("n_timesteps"))
    policy = hyperparams.pop("policy")
    normalize = hyperparams.pop("normalize", None)

    vecEnv = []
    for i in range(n_envs):
        # Bit of trickery here to avoid every lambda
        # referencing the same "i"
        vecEnv.append((lambda idx: lambda: create_env(args, idx))(i))

    if args.subprocenv:
        vecEnv = SubprocVecEnv(vecEnv)
    else:
        vecEnv = DummyVecEnv(vecEnv)

    # Handle learning rates
    # Taken from rl-zoo/train.py
    for key in ['learning_rate', 'cliprange', 'cliprange_vf']:
        if key not in hyperparams or args.agent == "dqn":
            continue
        if key == 'learning_rate' and args.agent == "a2c":
            continue
        if isinstance(hyperparams[key], str):
            schedule, initial_value = hyperparams[key].split('_')
            initial_value = float(initial_value)
            hyperparams[key] = linear_schedule(initial_value)
        elif isinstance(hyperparams[key], (float, int)):
            # Negative value: ignore (ex: for clipping)
            if hyperparams[key] < 0:
                continue
            hyperparams[key] = constfn(float(hyperparams[key]))

    if args.forced_cliprange is not None:
        hyperparams["cliprange"] = args.forced_cliprange

    agent_class = AVAILABLE_ALGORITHMS[args.agent]
    agent = agent_class(policy, vecEnv, verbose=1, **hyperparams)

    # Prepare callback
    checkpoint_dir = os.path.join(args.output, CHECKPOINT_DIR)
    os.makedirs(checkpoint_dir)
    # Note that save_freq is counted in number of agent step-calls,
    # not env step-calls.
    save_freq = n_timesteps // (args.num_snapshots * n_envs)

    checkpoint_callback = CheckpointCallback(save_freq, checkpoint_dir)

    agent.learn(total_timesteps=n_timesteps, callback=checkpoint_callback)

    vecEnv.close()
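The learning-rate handling above relies on linear_schedule and constfn helpers that are not defined in this snippet. A minimal sketch of the rl-zoo-style versions it appears to assume (names and signatures inferred from the calls above; treat this as an assumption, not the original code):

def constfn(val):
    # Constant schedule: ignores training progress and always returns val.
    def func(_):
        return val
    return func

def linear_schedule(initial_value):
    # stable-baselines passes a progress fraction that decays from 1 to 0,
    # so the returned value anneals linearly from initial_value to 0.
    def func(progress_remaining):
        return progress_remaining * initial_value
    return func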
Example #8
def train():
    n_cpu = os.cpu_count()
    env = SubprocVecEnv([lambda: DemoEnv() for i in range(n_cpu)])
    model = PPO2(MlpPolicy,
                 env,
                 verbose=1,
                 policy_kwargs={'net_arch': [dict(vf=[4], pi=[4])]})
    model.learn(total_timesteps=int(1e6))
    model.save("ppo2_DemoEnv")
    env.close()
    del model
Example #9
def run_training(config: Dict):
    """Runs training based on config passed in"""
    print("Run configuration:")
    print(config)
    seed(config['seed'])

    # read config
    hyperparameters = read_hyperparameters(config)
    graphs = graphs_from_args(config['graphs'])
    policy, policy_kwargs = policy_from_args(config, graphs)
    demands = demands_from_args(config, graphs)
    env_kwargs = env_kwargs_from_args(config)
    env_name = config['env_name']
    timesteps = config['timesteps']
    parallelism = config['parallelism']
    log_name = config['log_name']
    model_name = config['model_name']
    tensorboard_log = config['tensorboard_log']

    oblivious_routings = None

    # make env
    env = lambda: gym.make(env_name,
                           dm_sequence=demands,
                           graphs=graphs,
                           oblivious_routings=oblivious_routings,
                           **env_kwargs)
    vec_env = SubprocVecEnv([env for _ in range(parallelism)],
                            start_method="spawn")

    # make model
    model = PPO2(policy,
                 vec_env,
                 cliprange_vf=-1,
                 verbose=1,
                 policy_kwargs=policy_kwargs,
                 tensorboard_log=tensorboard_log,
                 **hyperparameters)

    # learn
    if env_name == 'ddr-iterative-v0':
        model.learn(total_timesteps=timesteps, tb_log_name=log_name)
    else:
        model.learn(total_timesteps=timesteps, tb_log_name=log_name)

    # save it
    model.save(model_name)

    # make sure everything stopped correctly
    vec_env.close()
Example #10
    def get_rewards(self,
                    skills=[],
                    train_total_timesteps=5000000,
                    eval_times=100,
                    eval_max_steps=10000,
                    model_save_name=None,
                    add_info={}):
        # def get_rewards(self, skills=[], train_total_timesteps=10, eval_times=10, eval_max_steps=10, model_save_name=None, add_info={}):
        """
        
        :param skills: (list) the availiable action sequence for agent 
        e.g [[0,2,2],[0,1,1]]
        :param train_total_timesteps: (int)total_timesteps to train 
        :param eval_times: (int)the evaluation times
        e.g eval_times=100, evalulate the policy by averageing the reward of 100 episode
        :param eval_max_steps: (int)maximum timesteps per episode when evaluate
        :param model_save_name: (str)specify the name of saved model (should not repeat)
        :param add_info: (dict) other information to log in log.txt
        """

        # env = SkillWrapper(self.env, skills=skills)
        if self.num_cpu > 1:
            env = SubprocVecEnv([
                self.make_env(self.env_creator, i, skills)
                for i in range(self.num_cpu)
            ])
        else:
            env = DummyVecEnv([lambda: self.env_creator()])
        model = self.model(self.policy, env, verbose=self.verbose)

        self.strat_time = time.time()
        print("start to train agent...")
        model.learn(total_timesteps=train_total_timesteps,
                    reset_num_timesteps=self.reset_num_timesteps)
        print("Finish train agent")

        if self.save_path is not None:
            if self.preserve_model > 0:
                self.save_model(model, model_save_name, skills=skills)

        # evaluate
        info = self.evaluate(env, model, eval_times, eval_max_steps)
        env.close()

        #log result
        info.update(add_info)
        self.log(info)

        self._serial_num = self._serial_num + 1
        return info["ave_score"], info["ave_action_reward"]
Example #11
class SimulatorModel(object):
    def __init__(self, _make_env_func, parallel_agents):
        """
        This class instantiates a dynamics model based on the pybullet simulator
        (i.e: simulates exactly the result of the actions), it can be used
        for reward tuning and verifying tasks..etc

        :param _make_env_func: (func) a function if called it will return a gym
                                      environment.
        :param parallel_agents: (int) number of parallel agents to siumulate
                                      to evaluate the actions.
        """
        self.parallel_agents = parallel_agents
        # SubprocVecEnv expects env-constructing callables; each worker calls its own.
        self.envs = SubprocVecEnv(
            [_make_env_func for _ in range(self.parallel_agents)])
        return

    def evaluate_trajectories(self, action_sequences):
        """
        A function to be called to evaluate the action sequences and return
        the corresponding reward for each sequence.

        :param action_sequences: (nd.array) actions to be evaluated
                                            (number of sequences, horizon length)
        :return: (nd.array) sum of rewards for each action sequence.
        """
        horizon_length = action_sequences.shape[1]
        num_of_particles = action_sequences.shape[0]
        rewards = np.zeros([num_of_particles])
        assert ((float(num_of_particles) / self.parallel_agents).is_integer())
        for j in range(0, num_of_particles, self.parallel_agents):
            self.envs.reset()
            total_reward = np.zeros([self.parallel_agents])
            for k in range(horizon_length):
                actions = action_sequences[j:j + self.parallel_agents, k]
                task_observations, current_reward, done, info = \
                    self.envs.step(actions)
                total_reward += current_reward
            rewards[j:j + self.parallel_agents] = total_reward
        return rewards

    def end_sim(self):
        """
        Closes the environments that were used for simulation.
        :return:
        """
        self.envs.close()
        return
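A sketch of how SimulatorModel might be driven, assuming a factory that builds a discrete-action gym environment and 4 parallel agents (the names and environment id are illustrative; the number of action sequences must be a multiple of parallel_agents, as the assert in evaluate_trajectories requires):

import gym
import numpy as np

def make_single_env():
    # Hypothetical factory: returns a fresh environment each time it is called.
    return gym.make('CartPole-v1')

sim = SimulatorModel(make_single_env, parallel_agents=4)
# 8 candidate sequences, horizon of 5 steps, discrete actions in {0, 1}.
action_sequences = np.random.randint(0, 2, size=(8, 5))
returns = sim.evaluate_trajectories(action_sequences)  # shape (8,)
print(returns)
sim.end_sim()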
Example #12
def train():
    def callback(_locals, _globals):
        global n_steps
        if (n_steps + 1) % 100 == 0:
            _locals['self'].save(MODEL_NAME)
        n_steps += 1

    envs = [create_env_headless_monitor for _ in range(ENV_COUNT)]
    envs = SubprocVecEnv(envs)

    model = PPO2('CnnPolicy', envs, verbose=1, ent_coef=0.0001, n_steps=512)
    model.save(MODEL_NAME)
    model.learn(total_timesteps=TIMESTEPS, callback=callback)
    model.save(MODEL_NAME)
    print("Training Done")
    envs.close()
Example #13
def _eval_model(model, env_id, ob_shape, num_eps, plot=False):
  test_env = SubprocVecEnv([make_env(env_id)])
  sharpe_ratios = []
  for episode in range(num_eps):
    # Padding zeros to the test env to match the shape of the training env.
    zero_completed_obs = np.zeros((NUM_CPU,) + ob_shape)
    zero_completed_obs[0, :] = test_env.reset()
    state = None
    for _ in range(L):
      action, state = model.predict(zero_completed_obs, state=state, deterministic=True)
      zero_completed_obs[0, :], reward, done, _ = test_env.env_method('step', action[0], indices=0)[0]
    sharpe_ratios.append(test_env.env_method('get_sharpe_ratio', indices=0)[0])
    if plot: test_env.env_method('render', indices=0)
  test_env.close()
  
  # Return the average sharpe ratio
  return sum(sharpe_ratios) / len(sharpe_ratios)
Example #14
def test():
    # Parallel environments
    n_cpu = 4
    env = SubprocVecEnv([lambda: RSEnv() for i in range(n_cpu)])

    model = A2C(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=600000, log_interval=10)

    model.save("sba2c")

    env = TestRSEnv()
    obs = env.reset()
    done = False
    while not done:
        action, _ = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        env.render()
    env.close()
Example #15
def main():
    #     env = SubprocVecEnv([(lambda i=i: SwocGym(i+1, GameServicePath, i, fieldWidth=10, fieldHeight=10)) for i in range(16)])
    env = SubprocVecEnv([
        (lambda i=i: MazeGym(mazeWidth=10, mazeHeight=10, nrWallsToRemove=60))
        for i in range(12)
    ])
    try:
        #model = PPO2("MlpPolicy", env, verbose=1, ent_coef=0.01, tensorboard_log='/home/ralph/swoc2019/log')
        #model = PPO2("MlpPolicy", env, verbose=1, policy_kwargs={'net_arch': [1024,1024,512,512,256,256,128,128,64,64], 'act_fun': tf.nn.relu},
        #               n_steps=64, ent_coef=0.01, learning_rate=1e-5, tensorboard_log='/home/ralph/swoc2019/log')
        model = PPO2("MlpPolicy",
                     env,
                     verbose=1,
                     policy_kwargs={
                         'net_arch':
                         [1024, 1024, 512, 512, 256, 256, 128, 128, 64, 64],
                         'act_fun':
                         tf.nn.relu
                     })
        if SaveFile.exists():
            print('loading...', end='')
            model.load_parameters(SaveFile)
            print('loaded!')
        else:
            # No weights loaded, so remove history
            with open(RewardsLog, 'w+') as file:
                file.write('')

        try:
            print('learning...')
            model.learn(total_timesteps=100000000, callback=callback)
        finally:
            print('saving...', end='')
            model.save(SaveFile)
            print('saved!')

    except KeyboardInterrupt:
        print('closing...', end='')
    finally:
        env.close()
    print('closed!')
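The callback passed to model.learn in this example is not shown. A minimal sketch of a stable-baselines 2 style callback consistent with this setup, assuming it appends the rolling mean episode reward to RewardsLog and periodically saves to SaveFile (the save interval and logging format are guesses):

n_calls = 0

def callback(_locals, _globals):
    # Invoked by PPO2 after each optimisation update.
    global n_calls
    n_calls += 1
    ep_info_buf = _locals['ep_info_buf']
    if len(ep_info_buf) > 0:
        mean_reward = sum(ep['r'] for ep in ep_info_buf) / len(ep_info_buf)
        with open(RewardsLog, 'a') as f:
            f.write('{}\n'.format(mean_reward))
    if n_calls % 100 == 0:
        _locals['self'].save(SaveFile)
    return True  # returning False would stop training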
Example #16
def learn(env_name, seed, load_file, save_file, tensorboard_log, total_timesteps, n_cpu):
    best_mean_reward = -np.inf
    best_mean_step = np.inf
    save_file = env_name if save_file is None else save_file
    best_save_file = save_file + BEST_SAVE_FILE_SUFFIX
    start_time = time.time()
    def callback(_locals, _globals):
        nonlocal best_mean_reward, best_mean_step
        t = (time.time() - start_time) / 3600.0
        print(f'hours: {t:.2f}')
        ep_info_buf = _locals['ep_info_buf']
        if len(ep_info_buf) < ep_info_buf.maxlen:
            return True
        mean_reward = np.mean([ ep_info['r'] for ep_info in ep_info_buf ])
        mean_step = np.mean([ ep_info['l'] for ep_info in ep_info_buf ])
        if mean_reward > best_mean_reward:
            best_mean_reward = mean_reward
            print('best_mean_reward:', best_mean_reward)
            print('saving new best model:', best_save_file)
            _locals['self'].save(best_save_file)
        if mean_step < best_mean_step:
            best_mean_step = mean_step
            print('best_mean_step:', best_mean_step)
        return True  # returning False would stop training
    # policy = CnnPolicy
    policy = CnnLstmPolicy
    # policy = CnnLnLstmPolicy
    print(env_name, policy)
    # Run this to enable SubprocVecEnv on Mac OS X.
    # export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES
    # see https://github.com/rtomayko/shotgun/issues/69#issuecomment-338401331
    env = SubprocVecEnv([make_env(env_name, i, seed) for i in range(n_cpu)])
    if load_file is not None:
        model = PPO2.load(load_file, env, verbose=1, tensorboard_log=tensorboard_log)
    else:
        model = PPO2(policy, env, verbose=1, tensorboard_log=tensorboard_log)
    model.learn(total_timesteps=total_timesteps, log_interval=5, callback=callback)
    print('saving model:', save_file)
    model.save(save_file)
    env.close()
Example #17
def learn(env_name, seed, load_path, save_path, tensorboard_log, total_timesteps, n_cpu):
    save_path = env_name if save_path is None else save_path
    checkpoint_callback = CheckpointCallback(save_freq=2000, save_path=save_path)
    eval_env = make_env(env_name, n_cpu, seed)()
    eval_callback = EvalCallback(eval_env, best_model_save_path=save_path+'/best', log_path=tensorboard_log, eval_freq=1000)
    callback = CallbackList([checkpoint_callback, eval_callback])

    policy = CnnPolicy
    # policy = CnnLstmPolicy
    # policy = CnnLnLstmPolicy
    print(env_name, policy)
    # Run this to enable SubprocVecEnv on Mac OS X.
    # export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES
    # see https://github.com/rtomayko/shotgun/issues/69#issuecomment-338401331
    env = SubprocVecEnv([make_env(env_name, i, seed) for i in range(n_cpu)])
    if load_path is not None:
        model = PPO2.load(load_path, env, verbose=1, tensorboard_log=tensorboard_log)
    else:
        model = PPO2(policy, env, verbose=1, tensorboard_log=tensorboard_log)
    model.learn(total_timesteps=total_timesteps, log_interval=5, callback=callback)
    print('saving model:', save_path+'/latest_model')
    model.save(save_path+'/latest_model')
    env.close()
Example #18
def main():
    env = SubprocVecEnv([(lambda i=i: SwocGym(
        i + 1, GameServicePath, i, actionRepeat=4, oneTarget=True))
                         for i in range(16)])
    try:
        model = PPO2("MlpPolicy",
                     env,
                     verbose=1,
                     policy_kwargs={
                         'net_arch': [256, 256, 256, 128, 128, 128],
                         'act_fun': tf.nn.relu
                     },
                     n_steps=32,
                     ent_coef=0.1,
                     learning_rate=1e-4,
                     tensorboard_log='/home/ralph/swoc2019/log')
        if SaveFile.exists():
            print('loading...')
            model.load_parameters(SaveFile)
        else:
            # No weights loaded, so remove history
            with open(RewardsLog, 'w+') as file:
                file.write('')

        try:
            print('learning...')
            model.learn(total_timesteps=100000000, callback=callback)
        finally:
            print('saving...')
            model.save(SaveFile)
            print('saved!')

    except KeyboardInterrupt:
        print('closing...')
    finally:
        env.close()
    print('closed')
Example #19
def train(exp_name, env_name, n_envs, **kwargs):
    # Train 10 runs
    for n in range(1, 11):  # PPO2_n
        # Configure logger
        log_folder = 'training_logs/' + exp_name + '_' + str(n) + '/'
        logger.configure(log_folder, ['csv'])

        print("[+] Starting training", n)
        env = SubprocVecEnv([
            make_env(log_folder, env_name, i, (n - 1) * 32)
            for i in range(n_envs)
        ])

        model = PPO2(
            policy=MlpPolicy,
            env=env,
            verbose=True,

            # Make it deterministic
            seed=32 * n,  # Fixed seed
            n_cpu_tf_sess=1,  # force deterministic results

            # Pass arguments
            **kwargs)
        model.learn(
            total_timesteps=int(250e3),
            log_interval=1,  # log each update
        )

        # Saving model
        os.makedirs("trained_models", exist_ok=True)
        model.save("trained_models/" + exp_name + "_" + str(n))

        env.close()
        del env
        del model
Example #20
        env = SubprocVecEnv([make_env(env_id, params) for _ in range(4)],
                            start_method='fork')
        policy_kwargs = dict()
        model = A2C('MlpLstmPolicy',
                    env,
                    learning_rate=1e-3,
                    verbose=1,
                    n_steps=64,
                    tensorboard_log="/tmp",
                    gamma=0.99,
                    policy_kwargs=policy_kwargs)
        model.learn(total_timesteps=int(params["steps"]))
        print("Done learning, saving model")
        model.save("agents/SBL_{}".format(params["ID"]))
        print("Saved model, closing env")
        env.close()
        print("Finished training with ID: {}".format(ID))
    else:
        #env_vec = SubprocVecEnv([make_env(env_id, params) for _ in range(4)], start_method='fork')
        env = env_id(params["env_list"],
                     max_n_envs=1,
                     specific_env_len=70,
                     s_len=150,
                     walls=True,
                     target_vel=params["target_vel"],
                     use_contacts=params["use_contacts"])

        print("Testing")
        policy_name = "QWZ"  # LX3: joints + contacts + yaw
        policy_path = 'agents/SBL_{}'.format(policy_name)
        model = A2C.load(policy_path)
Example #21
def main():
    args = get_args()
    log_dir = create_log_dir(args)
    if not args.test:
        writer = SummaryWriter(log_dir)
    else:
        writer = None

    SEED = 721
    if args.ram_obs or args.env == "slimevolley_v0":
        obs_type = 'ram'
    else:
        obs_type = 'rgb_image'
    env = make_env(
        args.env, SEED, obs_type=obs_type
    )  # TODO used for providing spaces info, can also modify SubprocVecEnv wrapper
    # https://stable-baselines.readthedocs.io/en/master/guide/vec_envs.html?highlight=multiprocessing
    envs = SubprocVecEnv([
        lambda: make_env(args.env, obs_type=obs_type)
        for _ in range(args.num_envs)
    ],
                         start_method='spawn')

    # envs.seed(np.random.randint(1000, size=args.num_envs).tolist())  # random seeding
    envs.seed(SEED)  # fix seeding
    state_spaces = env.observation_spaces
    action_spaces = env.action_spaces
    print('state_spaces: ', state_spaces, ',  action_spaces: ', action_spaces)

    learner_args = {'device': args.device}
    env.reset()
    agents = env.agents
    print('agents: ', agents)

    if args.train_both:
        fixed_agents = []
    else:
        fixed_agents = [
            'first_0'
        ]  # SlimeVolley: opponent is the first, the second agent is the learnable one

    if obs_type == 'ram':
        model = ParallelMultiPPODiscrete(args.num_envs, agents, state_spaces,
                                         action_spaces, 'MLP', fixed_agents,
                                         learner_args,
                                         **hyperparams).to(args.device)
    else:
        model = ParallelMultiPPODiscrete(args.num_envs, agents, state_spaces,
                                         action_spaces, 'CNN', fixed_agents,
                                         learner_args,
                                         **hyperparams).to(args.device)

    load_model(model, args)

    path = f"model/{args.env}/"
    os.makedirs(path, exist_ok=True)

    if args.fictitious:
        path = path + 'fictitious_'

    parallel_rollout(envs, model, writer, max_eps=max_eps, max_timesteps=max_timesteps, selfplay_interval=selfplay_interval,\
        render=args.render, model_path=path, against_baseline=args.against_baseline, selfplay=args.selfplay, \
        fictitious=args.fictitious, test=args.test)

    envs.close()
Example #22
def main():

    args = get_configuration()
    args.state_dim = util.get_state_dim(args)

    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir, exist_ok=True)

    if args.graph_embedding:

        class MyPolicy(EmbeddingPolicy):
            def __init__(self,
                         sess,
                         ob_space,
                         ac_space,
                         n_env,
                         n_steps,
                         n_batch,
                         reuse=True,
                         **_kwargs):
                super().__init__(sess,
                                 ob_space,
                                 ac_space,
                                 n_env,
                                 n_steps,
                                 n_batch,
                                 args,
                                 reuse=reuse,
                                 **_kwargs)
    else:

        class MyPolicy(EnigmaPolicy):
            def __init__(self,
                         sess,
                         ob_space,
                         ac_space,
                         n_env,
                         n_steps,
                         n_batch,
                         reuse=True,
                         **_kwargs):
                super().__init__(sess,
                                 ob_space,
                                 ac_space,
                                 n_env,
                                 n_steps,
                                 n_batch,
                                 args,
                                 reuse=reuse,
                                 **_kwargs)

    t0 = time.time()

    from mpi4py import MPI as mpi
    comm = mpi.COMM_WORLD
    rank = comm.Get_rank()
    all = comm.Get_size()

    gpus = os.environ["CUDA_VISIBLE_DEVICES"].split(',')
    gpu_count = len(gpus)
    gpu = gpus[rank % gpu_count]
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    print("My rank is {} out of {}, using GPU {}".format(rank, all, gpu))

    if args.model_type == "ppo2":
        from stable_baselines import PPO2 as PPO
        env = SubprocVecEnv([(lambda: ProofEnv.ProofEnv(args))
                             for _ in range(args.parallel_envs)
                             ])  #, start_method="spawn")
    elif args.model_type == "ppo1":
        args.parallel_envs = 1
        env = DummyVecEnv([lambda: ProofEnv.ProofEnv(args)])
        # from stable_baselines import PPO1 as PPO
        from ppo import PPO1 as PPO

    if args.saved_model is None:
        myPolicy = MyPolicy
        if args.model_type == "ppo2":
            model = PPO(
                policy=myPolicy,
                env=env,
                n_steps=args.actorbatch,
                # nminibatches=args.optim_stepsize,
                lam=0.95,
                gamma=args.gamma,
                noptepochs=4,
                ent_coef=args.entcoeff,
                learning_rate=lambda f: f * 2.5e-4,
                cliprange=lambda f: f * 0.1,
                verbose=1)
        elif args.model_type == "ppo1":
            model = PPO(myPolicy,
                        env,
                        verbose=2,
                        timesteps_per_actorbatch=args.actorbatch,
                        schedule=args.lr_schedule,
                        optim_stepsize=args.optim_stepsize,
                        entcoeff=args.entcoeff,
                        optim_batchsize=args.optim_batchsize,
                        gamma=args.gamma)
    else:
        print("Loading model from {}".format(args.saved_model))
        model = PPO.load(args.saved_model)
        model.set_env(env)

    counter = 0

    for ind in range(args.parallel_envs):
        env.env_method("set_model",
                       model,
                       indices=list(range(args.parallel_envs)))

    modelfiles = []
    for train_timestep, train_dir in zip(args.train_timesteps,
                                         args.train_dirs):
        problem_files = sorted(util.list_problems(train_dir))
        problem_files = util.split_list(problem_files, all)[rank]
        problem_files_splitted = util.split_list(problem_files,
                                                 args.parallel_envs,
                                                 extensible=False)

        if args.add_repeating_pretraining:
            for ind in range(args.parallel_envs):
                env.env_method("set_source",
                               problem_files_splitted[ind],
                               indices=[ind],
                               generator_type="repeating")
            # all_thread_timestep = train_timestep * all
            print("PRETRAINING")
            model.learn(total_timesteps=train_timestep)
            print("Pretraining on {} finished in {}".format(
                train_dir, util.format_time(time.time() - t0)))

        for ind in range(args.parallel_envs):
            env.env_method("set_source",
                           problem_files_splitted[ind],
                           indices=[ind])
        # all_thread_timestep = train_timestep * all
        model.learn(total_timesteps=train_timestep)

        modelfile = "{}/ppo1_fcop_train_{}".format(args.outdir, counter)
        modelfiles.append(modelfile)
        if rank == 0:
            model.save(modelfile)
            # logger.logkv("finished_train_problems", counter)
        counter += 1

        print("Training on {} finished in {}".format(
            train_dir, util.format_time(time.time() - t0)))
        statistics_list = env.get_attr("statistics",
                                       indices=list(range(args.parallel_envs)))
        blacklist_list = env.get_attr("blacklist",
                                      indices=list(range(args.parallel_envs)))
        for i, statistics in enumerate(statistics_list):
            print("ENV {} - {} - blacklist: {}\n".format(
                rank, i, blacklist_list[i])),
            util.print_problemdict(statistics, rank)

            # for f in statistics:
            #     statistics[f]["mcts"].display_tree([0])

        # util.print_problemdict(env.envs[0].statistics)

    if len(args.train_dirs) > 0 and len(
            args.train_timesteps) > 0:  # we did training
        print("We have finished training, rank {}".format(rank))

        # for p in problem_files:
        #     vis_policy.vis_policy(env.envs[0], model, p)

        env.close()
        del env
        del model

    # here we wait for everyone
    comm.Barrier()
    print("We have started evaluation, rank {}".format(rank))

    # evaluation without training
    if (args.saved_model is not None) and (len(
            args.train_dirs) == 0):  # no training, just evaluation
        modelfiles = [args.saved_model]

    for evaldir in args.evaldirs:
        for model_index, modelfile in enumerate(modelfiles):
            eval.eval_mpi(args, evaldir, modelfile, model_index)

            # here we wait for everyone
            comm.Barrier()
Example #23
    callbacks += 1
    if RENDER_TO_SCREEN:
        locals["self"].env.render()
    # Saves the model every 10000 calls
    if callbacks % 10000 == 0:
        locals['self'].save("models/" + folderName + "/" + runName + "-" +
                            str(callbacks))
    return True  # Returns true as false ends the training


n = 6
env_fns = [lambda: gym.make('gvgai-aliens-lvl0-v0') for _ in range(n)] + \
          [lambda: gym.make('gvgai-aliens-lvl1-v0') for _ in range(n)] + \
          [lambda: gym.make('gvgai-boulderdash-lvl0-v0') for _ in range(n)] + \
          [lambda: gym.make('gvgai-boulderdash-lvl1-v0') for _ in range(n)] + \
          [lambda: gym.make('gvgai-missilecommand-lvl0-v0') for _ in range(n)] + \
          [lambda: gym.make('gvgai-missilecommand-lvl1-v0') for _ in range(n)]

# multiprocess environment
n_cpu = multiprocessing.cpu_count()
venv = SubprocVecEnv(env_fns)
venv = EnvWrapper(venv, (128, 128, 3))  #(110, 300, 3)
model = A2C(ONet,
            venv,
            verbose=1,
            tensorboard_log="tensorboard/" + folderName + "/",
            n_steps=stepsUpdate)
model.learn(total_timesteps=int(1e8), tb_log_name=runName, callback=callback)
venv.close()
model.save("models/" + folderName + "/" + runName + "-Final")