Example #1
def Train():
    setup_utils.setup_and_load(use_cmd_line_args=False,
                               set_seed=3,
                               num_levels=1,
                               use_black_white=True,
                               frame_stack=4)
    # env=make("platform",num_envs=8)
    env = make("platform", num_envs=128)
    env = CourierWrapper(env, False)
    env = MyReward(env)
    env = StickAct(env, 0.5)
    env = VecMonitor(env)
    learning_rate = 5e-4
    clip_range = 0.2
    n_timesteps = int(1e8)
    hyperparams = {
        'nsteps': 256,
        'noptepochs': 4,
        'nminibatches': 8,
        'lr': learning_rate,
        'cliprange': clip_range,
        'vf_coef': 0.5,
        'ent_coef': 0.01
    }

    act = ppo2.learn(
        network=MyPolicy,
        env=env,
        total_timesteps=n_timesteps,
        **hyperparams,
        save_interval=100,
        log_interval=20,

        # value_network="copy"
    )
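
This snippet is shown without its imports. A minimal import sketch is below: setup_utils and make come from OpenAI's coinrun package, ppo2 and VecMonitor from baselines, while CourierWrapper, MyReward, StickAct and MyPolicy are not library classes, so the module they are imported from here (my_wrappers) is a hypothetical placeholder.

from coinrun import setup_utils, make
from baselines.common.vec_env.vec_monitor import VecMonitor
from baselines.ppo2 import ppo2
# The following are user-defined wrappers and a custom policy network;
# "my_wrappers" is a placeholder module name, not part of the original code.
from my_wrappers import CourierWrapper, MyReward, StickAct, MyPolicy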
Example #2
    def __call__(self, env_maker, seed=None, monitor_file=None):
        """
        :param env_maker: instance of roam_learning.robot_env.EnvMaker
        :param seed: int that is used to generate seeds for vectorized envs
        :param monitor_file: path to a .csv file for logging episode rewards, lengths, etc. of the vectorized envs
        :return: instance of either DummyVecEnv, SubprocVecEnv or ShmemVecEnv
        """
        # Create a list of env makers
        if seed is not None:
            assert isinstance(seed, int)
        env_makers = []
        for i in range(self.nenvs):
            env_makers += [deepcopy(env_maker)]
            if seed is not None:
                seed = hash_seed(seed)
                env_makers[i].set_seed(seed + i)

        # Create the vectorized envs
        envs = self.vec_env_wrapper(env_makers)

        # Monitor the envs before normalization
        if monitor_file is not None:
            envs = VecMonitor(envs, filename=monitor_file)
        if self.normalize_obs or self.normalize_ret:
            envs = VecNormalize(envs, ob=self.normalize_obs, ret=self.normalize_ret, use_tf=True)
        return envs
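
A minimal usage sketch for the factory above, assuming it is implemented as a class (here called VecEnvFactory, a hypothetical name) whose constructor sets the nenvs, vec_env_wrapper, normalize_obs and normalize_ret attributes referenced in __call__:

from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

# Hypothetical construction; only the __call__ signature above is given.
factory = VecEnvFactory(nenvs=8, vec_env_wrapper=SubprocVecEnv,
                        normalize_obs=True, normalize_ret=False)
# env_maker is an instance of roam_learning.robot_env.EnvMaker (see docstring).
envs = factory(env_maker, seed=0, monitor_file="logs/monitor.csv")
obs = envs.reset()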
Example #3
def Eval():
    def EnvFunc(iSeed):
        def InnerFunc():
            oEnv=Env()
            return oEnv
        return InnerFunc

    def linear_schedule(initial_value):
        # ppo2 calls this with the remaining-progress fraction (1.0 -> 0.0),
        # so the value decays linearly from initial_value to zero.
        def func(progress):
            return progress * initial_value
        return func

    learning_rate = linear_schedule(5e-4)
    clip_range = linear_schedule(0.2)
    n_timesteps = 0  # no training during evaluation; weights are restored via load_path
    hyperparams = {'nsteps': 256, 'noptepochs': 8, 'nminibatches': 4, 'lr': learning_rate, 'cliprange': clip_range,
                   'vf_coef': 0.5, 'ent_coef': 0.01}


    num_env = 1
    env = SubprocVecEnv([EnvFunc(i) for i in range(num_env)])
    env = VecNormalize(env, ob=True, ret=False)
    env = VecMonitor(env)

    act = ppo2.learn(
        network="mlp",
        env=env,
        total_timesteps=n_timesteps,
        save_interval=100,
        load_path="baselineLog/ppobaseliens-2019-06-05-17-38-15-168854/checkpoints/00300",
        **hyperparams,
        value_network="copy"
    )


    obs = env.reset()
    print("obs", obs.shape)
    iFrame = 0
    iReward = 0
    reward_list = deque(maxlen=100)  # requires: from collections import deque
    while True:  # bDone in the original was never updated, so loop until interrupted
        action = act.step(obs)[0]
        obs, reward, done, _ = env.step(action)
        iReward += reward[0]
        # time.sleep(0.01)
        # print("reward",reward)
        iFrame += 1
        # env.render()
        if done[0]:
            obs = env.reset()
            reward_list.append(iReward)
            print("done.................", iFrame, iReward,sum(reward_list)/len(reward_list))

            iFrame = 0
            iReward = 0
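
A quick check of the linear_schedule helper used above (repeated here at module level): baselines' ppo2 evaluates callable lr and cliprange values on the fraction of training remaining, which starts at 1.0 and decays toward 0.0, so the schedule decays linearly to zero.

def linear_schedule(initial_value):
    def func(progress):
        return progress * initial_value
    return func

schedule = linear_schedule(5e-4)
assert schedule(1.0) == 5e-4     # start of training: full learning rate
assert schedule(0.5) == 2.5e-4   # halfway through: half the learning rate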
Example #4
def Train():
    logdir = "baselineLog/ppo" + datetime.datetime.now().strftime("baseliens-%Y-%m-%d-%H-%M-%S-%f")
    logger.configure(logdir, ["tensorboard", "stdout"])


    def EnvFunc(iIndex):
        def InnerFunc():
            oEnv=Env(iIndex)
            return oEnv
        return InnerFunc

    def linear_schedule(initial_value):
        # ppo2 calls this with the remaining-progress fraction (1.0 -> 0.0),
        # so the value decays linearly from initial_value to zero.
        def func(progress):
            return progress * initial_value
        return func

    learning_rate = linear_schedule(3e-4)
    clip_range = linear_schedule(0.2)
    n_timesteps = int(1e8)
    hyperparams = {'nsteps': 1024, 'noptepochs': 10, 'nminibatches': 32, 'lr': learning_rate, 'cliprange': clip_range,
                   'vf_coef': 0.5, 'ent_coef': 0.0}


    num_env = 22
    env = SubprocVecEnv([EnvFunc(i) for i in range(num_env)])
    env = VecMonitor(env)
    env = VecNormalize(env, cliprew=5000., use_tf=True)

    act = ppo2.learn(
        network="mlp",
        env=env,
        total_timesteps=n_timesteps,
        save_interval=100,
        log_interval=4,
        # load_path="/tmp/openai-2019-05-30-11-53-14-660522/checkpoints/16000",
        **hyperparams,
        value_network="copy"
    )
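
To resume this run from a saved checkpoint instead of starting fresh, ppo2.learn accepts a load_path argument, as the commented-out line above and the Eval() example suggest. A hedged sketch with a placeholder checkpoint path:

act = ppo2.learn(
    network="mlp",
    env=env,
    total_timesteps=n_timesteps,
    save_interval=100,
    log_interval=4,
    load_path="baselineLog/<previous-run>/checkpoints/00100",  # placeholder path
    **hyperparams,
    value_network="copy"
)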
Example #5
    def __call__(self, n_envs=1, *, train=True):
        env_fn = EnvMaker(self.env_id)

        if (
            "AtariEnv" in gym.spec(self.env_id)._entry_point
            and "-ram-" not in self.env_id
        ):
            if n_envs == 1:
                vec_env = DummyVecEnv([env_fn])
            else:
                vec_env = ShmEnvPool(env_fn, n_envs=n_envs)
            vec_env = VecFrameStack(vec_env, 4)
        else:
            if n_envs == 1:
                vec_env = DummyVecEnv([env_fn])
            else:
                vec_env = EnvPool(env_fn, n_envs=n_envs)

        monitor_dir = os.path.join(
            logger.get_dir(), ("train" if train else "eval") + "_monitor"
        )
        os.makedirs(monitor_dir, exist_ok=True)
        vec_env = VecMonitor(vec_env, filename=monitor_dir)
        return vec_env
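
A brief usage sketch for this factory; the owning class name (EnvFactory) and the way env_id is supplied are assumptions, since only the __call__ signature above is given:

# Hypothetical construction and env id; adjust to the actual class.
factory = EnvFactory(env_id="PongNoFrameskip-v4")
train_env = factory(n_envs=8, train=True)   # Atari path: env pool + 4-frame stack
eval_env = factory(n_envs=1, train=False)   # single DummyVecEnv, separate monitor dir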
Example #6
from baselines import bench
from baselines import logger

from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env.vec_monitor import VecMonitor
from baselines.ppo2 import ppo2

BeraterEnv.BeraterEnv.showStep = False
BeraterEnv.BeraterEnv.showDone = True

print("--- PPO2 learn ---")

env = BeraterEnv.BeraterEnv(currentGraph)

wrapped_env = DummyVecEnv([lambda: BeraterEnv.BeraterEnv(currentGraph)])
monitored_env = VecMonitor(wrapped_env, log_dir)

# https://github.com/openai/baselines/blob/master/baselines/ppo2/ppo2.py
# https://github.com/openai/baselines/blob/master/baselines/common/models.py#L30
model = ppo2.learn(
    env=monitored_env,
    network='mlp',
    num_hidden=50,
    num_layers=2,
    ent_coef=0.01,
    total_timesteps=5000)

model.save('berater-ppo-v8.pkl')
monitored_env.close()
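
A hedged sketch of loading berater-ppo-v8.pkl back for inference, following the same pattern as the Eval() example above (total_timesteps=0 plus load_path); a fresh vectorized env is built because the training env was closed:

# Rebuild an identical single-env setup and restore the saved weights.
eval_env = VecMonitor(DummyVecEnv([lambda: BeraterEnv.BeraterEnv(currentGraph)]), log_dir)
model = ppo2.learn(
    env=eval_env,
    network='mlp',
    num_hidden=50,
    num_layers=2,
    total_timesteps=0,            # build the network only, no further training
    load_path='berater-ppo-v8.pkl')
obs = eval_env.reset()
actions, values, states, neglogps = model.step(obs)
eval_env.close()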

##################################################
Example #7
import gym
from gym.wrappers.flatten_observation import FlattenObservation
from baselines.ppo2.ppo2 import learn
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env.vec_monitor import VecMonitor

ENV = "FetchReach-v1"


def run_env():
    env = gym.make(ENV)
    env.reset()
    while True:
        action = env.action_space.sample()
        env.step(action)
        env.render()


def make_env():
    env = gym.make(ENV)
    env = FlattenObservation(env)
    return env


if __name__ == '__main__':

    nenvs = 4
    env_fns = [make_env for _ in range(nenvs)]
    env = VecMonitor(SubprocVecEnv(env_fns))
    learn(network='mlp', env=env, total_timesteps=int(1e5), log_interval=1)
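
FetchReach-v1 returns a Dict observation, which is why FlattenObservation is applied before the mlp network sees it. Below is a hedged rollout sketch that keeps the model returned by learn (the original call discards it) and evaluates it; four eval envs are used so the batch size matches the four training envs, since ppo2's act model is built with a fixed batch dimension.

model = learn(network='mlp', env=env, total_timesteps=int(1e5), log_interval=1)
eval_env = DummyVecEnv([make_env] * 4)   # match nenvs used during training
obs = eval_env.reset()
for _ in range(1000):
    actions = model.step(obs)[0]
    obs, rewards, dones, infos = eval_env.step(actions)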