Example #1
#         if mean_reward > best_mean_reward:
#             best_mean_reward = mean_reward
#             # Example for saving best model
#             print("Saving new best model")
#             _locals['self'].save(log_dir + 'best_model.pkl')
#         n_steps += 1
#         return False
#
# # Create log dir
# log_dir = "/tmp/"
# os.makedirs(log_dir, exist_ok=True)

# environment
# env = OsmoEnv()
# env = Monitor(env, log_dir, allow_early_resets=True)
env = DummyVecEnv([lambda: OsmoEnv()])

# parameters(for training)
tau = 0.1  # update rate for target model
gamma = 0.95  # discount factor for Q-values
# batch_size = NUMCONC*5+3    # size of batch
batch_size = 10
alr = 0.003  # actor learning rate
clr = 0.003  # critic learning rate

# noise(to better exploration)
n_actions = env.action_space.shape[-1]
param_noise = AdaptiveParamNoiseSpec()
# action_noise = None
# param_noise = None
action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                 sigma=float(0.5) * np.ones(n_actions))  # sigma value assumed; the original line is truncated here
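
# A minimal sketch (not part of the original snippet): wiring the pieces above into a DDPG model.
# The policy choice and the timestep budget are assumptions.
from stable_baselines import DDPG
from stable_baselines.ddpg.policies import MlpPolicy

model = DDPG(MlpPolicy, env, verbose=1, gamma=gamma, tau=tau, batch_size=batch_size,
             actor_lr=alr, critic_lr=clr,
             param_noise=param_noise, action_noise=action_noise)
model.learn(total_timesteps=100000)  # timestep budget assumed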
Example #2
#     obs, rewards, dones, info = env2.step(action)
#     if (i%50 == 0):
#     	print("LUNCH TIME:", obs)

# env2.close()

# env = gym.make('bot-v0', training = False)
# obs = env.reset()
# print("obs: ", obs)
# for i in range(1000):
# 	action = np.array([0,-10,0])
# 	obs,rewards,done,info=env.step(action)
# 	print("obs: ", obs)
# 	print("reward: ", rewards)
# 	if done ==True:
# 		obs = env.reset()
# 		print ("obs after reset: ", obs)

env2 = gym.make('bot-v0', training=False)
env2 = DummyVecEnv([lambda: env2])

model = PPO2.load('saved_model', env=env2)

obs = env2.reset()
for i in range(2000):
    action, _states = model.predict(obs)
    obs, rewards, done, info = env2.step(action)
    if i % 50 == 0:
        print("current observation:", obs)
    if done[0]:  # DummyVecEnv returns arrays of flags; use the single env's entry
        break
Example #3
    def get_rewards(self,
                    skills=[],
                    train_total_timesteps=100000,
                    eval_times=100,
                    eval_max_steps=1000,
                    model_save_name=None,
                    log_action_skill=True):
        # def get_rewards(self, skills=[], train_total_timesteps=10, eval_times=10, eval_max_steps=10, model_save_name=None, log_action_skill=True):
        """
        
        :param skills: (list) the available action sequences for the agent,
            e.g. [[0,2,2],[0,1,1]]
        :param train_total_timesteps: (int) total timesteps to train
        :param eval_times: (int) number of evaluation episodes,
            e.g. eval_times=100 evaluates the policy by averaging the reward over 100 episodes
        :param eval_max_steps: (int) maximum timesteps per episode during evaluation
        :param model_save_name: (str) name of the saved model (should not repeat)
        :param log_action_skill: (bool) whether to log how often each action/skill is selected
        """

        env = SkillWrapper(self.env, skills=skills)
        env = DummyVecEnv([lambda: env])
        model = self.model(self.policy, env, verbose=self.verbose)

        start_time = time.time()
        print("start to train agent...")
        model.learn(total_timesteps=train_total_timesteps)
        print("Finished training agent")

        if self.save_path is not None:
            if self.preserve_model > 0:
                self.save_model(model, model_save_name, skills=skills)

        #TODO evaluate
        #eval model
        info = OrderedDict()
        if log_action_skill:
            action_statistic = OrderedDict()
            for i in range(env.action_space.n):
                action_statistic[str(env.action_space[i])] = 0

        ep_reward = []
        ep_ave_reward = []
        print("start to eval agent...")
        for i in range(eval_times):
            obs = env.reset()
            total_reward = []
            for _ in range(eval_max_steps):
                action, _states = model.predict(obs)
                obs, rewards, dones, info_ = env.step(action)
                total_reward.append(rewards[0])

                if log_action_skill is True:
                    action_statistic[str(
                        env.action_space[action[0]])] = action_statistic[str(
                            env.action_space[action[0]])] + 1

                if dones[0]:
                    break

            ep_reward.append(sum(total_reward))
            ep_ave_reward.append(sum(total_reward) / len(total_reward))

        print("Finish eval agent")
        print("Elapsed: {} sec".format(round(time.time() - strat_time, 3)))
        ave_score = sum(ep_reward) / len(ep_reward)
        ave_action_reward = sum(ep_ave_reward) / len(ep_ave_reward)
        ave_score_std = round(np.std(np.array(ep_reward)), 3)

        # info.update({"ave_score":ave_score, "ave_score_std":ave_score_std, "ave_reward":ave_reward})
        info["ave_score"] = ave_score
        info["ave_score_std"] = ave_score_std
        info["ave_action_reward"] = ave_action_reward
        if log_action_skill:
            info.update(action_statistic)
        env.close()

        #log result
        self.log(info)

        self._serial_num = self._serial_num + 1
        return ave_score, ave_action_reward
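
    # A hypothetical usage sketch (the enclosing class is not shown in this snippet); the class name
    # and constructor arguments below are assumptions.
    # trainer = SkillEvaluator(env=gym.make("CartPole-v1"), model=PPO2, policy="MlpPolicy",
    #                          verbose=0, save_path=None)
    # ave_score, ave_action_reward = trainer.get_rewards(skills=[[0, 0], [1, 1]],
    #                                                    train_total_timesteps=50000,
    #                                                    eval_times=20)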
Example #4
from gym.wrappers import Monitor

from stable_baselines import DQN, DDPG
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.ddpg.policies import MlpPolicy

#
import time
from myenv import MyEnv

log = 'env/'
env1 = Monitor(MyEnv(8), log, force=True)
env = DummyVecEnv([lambda: env1])
# env = gym.make('CartPole-v1')
#
# # the noise objects for DDPG
# # n_actions = env.action_space.shape[-1]
# # param_noise = None
# # action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))
#
# model = DDPG(MlpPolicy, env, verbose=1, param_noise=param_noise, action_noise=action_noise)
model = DDPG(MlpPolicy, env, verbose=1, tensorboard_log='./log/')
model.learn(total_timesteps=10000)
model.save("ddpg_mountain")
# del model # remove to demonstrate saving and loading
# #
# model = DDPG.load("ddpg_mountain")
#
print('test')
obs = env.reset()
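
# A hedged sketch (the original snippet is truncated here) of a short test rollout with the
# trained DDPG model; the step budget is an assumption.
for _ in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)  # DummyVecEnv auto-resets finished episodes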
Example #5
    def test(self, model_epoch: int = 0, render_env: bool = True, render_report: bool = True, save_report: bool = False):
        train_provider, test_provider = self.data_provider.split_data_train_test(
            self.train_split_percentage)

        del train_provider

        history_data = test_provider.historical_ohlcv()
        history_data["Day"] = history_data["Date"].apply(
            lambda x: time.strftime("%Y-%m-%d", time.localtime(x)))
        history_data["Day"] = pd.to_datetime(history_data["Day"])
        history_data.sort_values(
            ['Day', 'Date'], ascending=[1, 0], inplace=True)
        grouped = history_data.groupby(['Day']).head(1)
        benchmark = grouped[["Day", "Close"]]
        benchmark.set_index('Day', drop=True, inplace=True)
        benchmark = benchmark.pct_change()[1:]
        self.logger.info(f"benchmark is:\n {benchmark}")

        init_envs = DummyVecEnv([make_env(test_provider)
                                 for _ in range(self.n_envs)])

        model_path = path.join(
            'data', 'agents', f'{self.study_name}__{model_epoch}.pkl')
        model = self.Model.load(model_path, env=init_envs)

        test_env = DummyVecEnv([make_env(test_provider) for _ in range(1)])

        self.logger.info(f'Testing model ({self.study_name}__{model_epoch})')

        zero_completed_obs = np.zeros(
            (self.n_envs,) + init_envs.observation_space.shape)
        zero_completed_obs[0, :] = test_env.reset()

        state = None
        rewards = []

        for _ in range(len(test_provider.data_frame)):
            action, state = model.predict(zero_completed_obs, state=state)
            obs, reward, done, info = test_env.step([action[0]])

            zero_completed_obs[0, :] = obs

            rewards.append(reward)

            if render_env:
                test_env.render(mode='human')

            if done:
                net_worths = pd.DataFrame({
                    'Date': info[0]['timestamps'],
                    'Balance': info[0]['net_worths'],
                })

                net_worths.set_index('Date', drop=True, inplace=True)
                returns = net_worths.pct_change()[1:]
                self.logger.info(f"returns.Balance is:\n {returns.Balance}")

                if render_report:
                    qs.plots.snapshot(
                        returns.Balance, title='RL Trader Performance')

                if save_report:
                    reports_path = path.join(
                        'data', 'reports', f'{self.study_name}__{model_epoch}.html')
                    try:
                        qs.reports.html(
                            returns.Balance, benchmark=benchmark.Close, output=reports_path)
                    except Exception as e:
                        self.logger.debug('catch exception: %s\n' % e)

        self.logger.info(
            f'Finished testing model ({self.study_name}__{model_epoch}): ${np.sum(rewards):.2f}')
Example #6
def main():
    args = arg_parser()

    # Create log dir
    home = str(os.environ['HOME'])
    tensorboard_log_dir = home + "/HDD/RA-L/tensorboard_log/"
    os.makedirs(tensorboard_log_dir,exist_ok=True)

    # Create result tmp dir
    figdir = "./fig/"
    os.makedirs(figdir,exist_ok=True)

    # Create ndarray save dir
    nd_dir = "./Data/CustomAnt/" + str(args.agent) + "/" # ランダムな脚が故障する環境での評価を格納するディレクトリ
    # nd_dir = "./Data/CustomAnt_2JointBroken/" + str(args.agent) + "/" # ランダムな関節2個が故障する環境での評価を格納するディレクトリ
    os.makedirs(nd_dir, exist_ok=True)

    # Create and wrap the environment 
    env1 = gym.make(config['env'])
    broken_env = ChangeJointRangeEnv(env1) # environment where a single leg fails
    # broken_env = Random2JointBrokenEnv(env1) # environment where two joints fail

    if args.video:
        broken_env = wrappers.Monitor(broken_env,'./videos/' + args.loaddir + "-" + datetime.datetime.now().isoformat(),force=True,video_callable=(lambda ep: ep % 1 == 0)) # for output video

    # broken_env = DummyVecEnv([lambda :broken_env]) # create a simple vectorized wrapper for multiple environments, calling each environment in turn within the current Python process
    env1 = DummyVecEnv([lambda : env1])

    # when the agent name is passed in via argparse
    agentName = []
    agentName.append(args.agent)


    plainData = []
    brokenData = []
    perror = []
    berror = []

    plt.figure()
    sns.set()
    # fig,ax = plt.subplots()
    for agent in agentName:
        brokenSeedAveReward = []

        load_dir = "./trained_agent_dir/" + agent + "/"

        # obtain the average reward for each seed, range(1,6)
        for seed in range(1,6):

            # create the PPO2 model (the agent that will be trained)
            trainedAnt = PPO2(MlpPolicy, env1, verbose=1, tensorboard_log=tensorboard_log_dir)

            # load the saved (trained) model: give only the zip file name and path, per seed
            trainedAnt = PPO2.load(load_dir + "trainedAnt" + "-seed" + str(seed)) 

            # set the seed
            trainedAnt.set_random_seed(seed+100)

            print("loaddir:",load_dir + "trainedAnt" + "-seed" + str(seed))

            broken_obs = broken_env.reset()

            broken_total_rewards = [] 
            rewards = 0
            forwards = 0
            ctrls = 0
            contacts = 0
            survives = 0

            # vary k from 0 to 1 in steps of 0.01
            for k in tqdm(range(0, 100)):
                # loop computing the reward in the failure environment (100)
                for episode in range(args.n_episodes):
                    # iteration of time steps, default is 1000 time steps
                    for i in range(1000):
                        # predict phase
                        action, _states = trainedAnt.predict(broken_obs)

                        # step phase
                        # when evaluating in the broken environment
                        broken_obs, reward, done, info = broken_env.step(action, k)
                        rewards += reward
                        forwards += info['reward_forward']
                        ctrls += info['reward_ctrl']
                        contacts += info['reward_contact']
                        survives += info['reward_survive']
                        
                        if done:
                            break

                    # store k and the reward at that point in k_gene
                    k_gene[seed-1][episode][k] = rewards

                    # store the value of each term of the reward function
                    reward_forward_map[seed-1][episode][k] = forwards
                    reward_ctrl_map[seed-1][episode][k] = ctrls
                    reward_contact_map[seed-1][episode][k] = contacts
                    reward_survive_map[seed-1][episode][k] = survives

                    # reset the environment
                    broken_obs = broken_env.reset()

                    # record the total reward, then reset
                    broken_total_rewards.append(rewards)
                    rewards = 0
                    forwards = 0
                    ctrls = 0
                    contacts = 0
                    survives = 0

            broken_reward_average1 = sum(broken_total_rewards)/len(broken_total_rewards)
            brokenSeedAveReward.append(broken_reward_average1)

            del trainedAnt 
        
        # holds the agent's average reward in the plain/broken environments
        broken_ave = sum(brokenSeedAveReward)/len(brokenSeedAveReward)
        brokenData.append(broken_ave)
        broken_error = np.std(brokenSeedAveReward,ddof=1)/np.sqrt(len(brokenSeedAveReward))
        berror.append(broken_error)

    brokenData = np.array(brokenData).flatten()
    berror = np.array(berror)

    

    # print(k_gene)
    for seed in range(1, 6):
        seed_gene = k_gene[seed-1,:,:]
        seed_gene = np.sum(seed_gene, axis=0)
        seed_gene = seed_gene/args.n_episodes # average reward; it was '/100' before.
        np.save(nd_dir + str(agentName[0]) + "_rewardForEachK" + "_seed=" + str(seed), seed_gene)

    # reshape the 2-D array for each reward-function term into a 1-D array and save it as .npy
    save_reward_map(map=reward_forward_map, save_path=nd_dir, agent_name=str(agentName[0]), save_name="_rewardForward", n_episodes=args.n_episodes)
    save_reward_map(map=reward_ctrl_map, save_path=nd_dir, agent_name=str(agentName[0]), save_name="_rewardCtrl", n_episodes=args.n_episodes)
    save_reward_map(map=reward_contact_map, save_path=nd_dir, agent_name=str(agentName[0]), save_name="_rewardContact", n_episodes=args.n_episodes)
    save_reward_map(map=reward_survive_map, save_path=nd_dir, agent_name=str(agentName[0]), save_name="_rewardSurvive", n_episodes=args.n_episodes)
Example #7
from stable_baselines import PPO1
from stable_baselines.common.policies import MlpPolicy  # assumed import; missing from the truncated snippet
from stable_baselines.common.vec_env import DummyVecEnv  # assumed import; missing from the truncated snippet
from particle_env_continuous import PrticleEnv
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

save_name = "model/part_circle_exp1"
epochs = 20

if __name__ == "__main__":

    # env = gym.make('CartPole-v1')
    env = DummyVecEnv([
        lambda: PrticleEnv(alpha=1,
                           beta=10,
                           win_thre=1,
                           max_timestep=256,
                           for_circle_traj=True)
    ])
    # Optional: PPO2 requires a vectorized environment to run
    # the env is now wrapped automatically when passing it to the constructor
    # env = DummyVecEnv([lambda: env])
    # env = VecNormalize(env, norm_obs=True, norm_reward=False,
    #                 clip_obs=10.)

    if not os.path.exists(save_name):
        os.makedirs(save_name)
    # model = PPO2(MlpPolicy, env, verbose=0,tensorboard_log="./ppo2_particle_tensorboard/",n_cpu_tf_sess=1)
    model = PPO1(MlpPolicy, env, verbose=0,
                 timesteps_per_actorbatch=256,
                 tensorboard_log=save_name)  # remaining keyword arguments are truncated in the original snippet
Example #8
import gym

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2

from env.BitcoinTradingEnv import BitcoinTradingEnv

import pandas as pd

train_df = pd.read_csv('./datasets/bot_train_ETHBTC_700_hour.csv')
train_df = train_df.sort_values('Date')

test_df = pd.read_csv('./datasets/bot_rollout_ETHBTC_700_hour.csv')
test_df = test_df.sort_values('Date')

train_env = DummyVecEnv([lambda: BitcoinTradingEnv(train_df, serial=True)])

model = PPO2(MlpPolicy, train_env, verbose=1, tensorboard_log="./tensorboard/")
model.learn(total_timesteps=5000)

test_env = DummyVecEnv([lambda: BitcoinTradingEnv(test_df, serial=True)])

obs = test_env.reset()
for i in range(50000):
    action, _states = model.predict(obs)
    obs, rewards, done, info = test_env.step(action)
    test_env.render(mode="human", title="BTC")

test_env.close()
Example #9
def main():

    # train the policy, then do some tests to get a sense of how it performs

    for arg in sys.argv:
        if arg.startswith('--job='):
            i = int(arg.split('--job=')[1]) - 1

    # pull in the encoder params
    p_dir = "./experiments/extra_train_exps/{}".format(i)
    proj = np.load(p_dir + "projectors.npz")
    proj = np.row_stack([v for k, v in proj.items()])
    proj = la.svd(proj, full_matrices=False)[2]
    enc_dim = proj.shape[0]
    weights = np.load(p_dir + "weights.npz")
    biases = np.load(p_dir + "biases.npz")
    weights = [v for k, v in weights.items()]
    biases = [v for k, v in biases.items()]

    saveload_path = "./experiments/extra_train_exps/{}".format(i)

    # train the model
    # try a few restarts, keep the best
    best_avg_perf = -np.inf
    perfs = []
    for j in range(5):
        # set up the environment
        env = TimeLimit(
            RestartablePendulumEnv(enc_dim=enc_dim),
            max_episode_steps=200)  # not sure effect of max_episode_steps
        env = EncoderWrapper(env, mlp_encoder, [weights, biases, proj])
        env = DummyVecEnv([lambda: env])
        pol = LinearPolicy_MLPCritic
        pol_args = dict(
            layers=[64, 64], layer_norm=False
        )  # this is the architecture for the critic in ddpg, doesn't specify policy

        model = train_policy_ddpg(env,
                                  pol,
                                  pol_args,
                                  300000,
                                  verbose=0,
                                  actor_lr=.5,
                                  critic_lr=.001)

        # clean up
        env.close()

        #model = DDPG.load(saveload_path+"model")

        # now let's test the model
        # specify the test task
        n_test_steps = 100

        # uniform grid over statespace (20 points)
        angs = np.linspace(-np.pi, np.pi, 5)[:-1]
        vels = np.linspace(-1, 1, 5)
        test_states = np.array(list(itertools.product(angs, vels)))
        n_test_states = len(angs) * len(vels)
        performance = np.zeros(n_test_states)

        # restart the env
        env = TimeLimit(RestartablePendulumEnv(), max_episode_steps=200)
        env = EncoderWrapper(env, mlp_encoder, [weights, biases, proj])

        # for each test state, start the env in the state, then run forward and collect rewards
        for k in range(n_test_states):
            obs = env.reset(state=test_states[k])
            rewards = []
            for _ in range(n_test_steps):
                action, _states = model.predict(obs)
                obs, reward, dones, info = env.step(action)
                rewards.append(reward)
                #env.render()
            performance[k] = np.array(rewards).mean()

        avg_perf = performance.mean()
        perfs.append(avg_perf)
        print("average performance of this model:{}".format(avg_perf))
        if avg_perf > best_avg_perf:
            best_avg_perf = avg_perf
            # specify the path to save the model

            model.save(saveload_path + "model")
            np.savetxt(saveload_path + "test_performance.txt", performance)

        # clean up and save results
        np.savetxt(saveload_path + "avg_per_runs.txt", np.array(perfs))
        env.close()
        del model
Example #10
def test_model_manipulation(request, model_class):
    """
    Test if the algorithm can be loaded and saved without any issues, the environment switching
    works and that the action prediction works

    :param model_class: (BaseRLModel) A model
    """
    try:
        env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

        # create and train
        model = model_class(policy="MlpPolicy", env=env)
        model.learn(total_timesteps=NUM_TIMESTEPS, seed=0)

        # predict and measure the acc reward
        acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            acc_reward += reward
        acc_reward = sum(acc_reward) / N_TRIALS

        # saving
        model_fname = './test_model_{}.zip'.format(request.node.name)
        model.save(model_fname)

        del model, env

        # loading
        model = model_class.load(model_fname)

        # changing environment (note: this can be done at loading)
        env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])
        model.set_env(env)

        # predict the same output before saving
        loaded_acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            loaded_acc_reward += reward
        loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS

        with pytest.warns(None) as record:
            act_prob = model.action_probability(obs)

        if model_class in [DDPG, SAC, TD3]:
            # check that only one warning was raised
            assert len(record) == 1, "No warning was raised for {}".format(
                model_class)
            assert act_prob is None, "Error: action_probability should be None for {}".format(
                model_class)
        else:
            assert act_prob[0].shape == (1, 1) and act_prob[1].shape == (1, 1), \
                "Error: action_probability not returning correct shape"

        # test action probability for given (obs, action) pair
        # must return zero and raise a warning or raise an exception if not defined
        env = model.get_env()
        obs = env.reset()
        observations = np.array([obs for _ in range(10)])
        observations = np.squeeze(observations)
        observations = observations.reshape((-1, 1))
        actions = np.array([env.action_space.sample() for _ in range(10)])

        if model_class in [DDPG, SAC, TD3]:
            with pytest.raises(ValueError):
                model.action_probability(observations, actions=actions)
        else:
            actions_probas = model.action_probability(observations,
                                                      actions=actions)
            assert actions_probas.shape == (len(actions),
                                            1), actions_probas.shape
            assert np.all(actions_probas >= 0), actions_probas
            actions_logprobas = model.action_probability(observations,
                                                         actions=actions,
                                                         logp=True)
            assert np.allclose(actions_probas,
                               np.exp(actions_logprobas)), (actions_probas,
                                                            actions_logprobas)

        # assert <15% diff
        assert abs(acc_reward - loaded_acc_reward) / max(acc_reward, loaded_acc_reward) < 0.15, \
            "Error: the prediction seems to have changed between loading and saving"

        # learn post loading
        model.learn(total_timesteps=100, seed=0)

        # validate no reset post learning
        # This test was failing from time to time for no good reason
        # other than bad luck
        # We should change this test
        # loaded_acc_reward = 0
        # set_global_seeds(0)
        # obs = env.reset()
        # for _ in range(N_TRIALS):
        #     action, _ = model.predict(obs)
        #     obs, reward, _, _ = env.step(action)
        #     loaded_acc_reward += reward
        # loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS
        # # assert <10% diff
        # assert abs(acc_reward - loaded_acc_reward) / max(acc_reward, loaded_acc_reward) < 0.1, \
        #     "Error: the prediction seems to have changed between pre learning and post learning"

        # predict new values
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, _, _, _ = env.step(action)

        # Free memory
        del model, env

    finally:
        model_fname = './test_model_{}.zip'.format(request.node.name)
        if os.path.exists(model_fname):
            os.remove(model_fname)
Example #11
from stable_baselines.common.policies import MlpPolicy  # assumed import; not shown in the original snippet
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2

from env.StockTradingEnv import StockTradingEnv

import pandas as pd

import pandas_datareader as pdr

ticker = 'AAPL'
start_date = '2018-01-01'

df = pdr.get_data_yahoo(ticker, start_date)
df['Date'] = df.index
df.index = range(len(df))
df = df.sort_values('Date')
df['Date'] = df['Date'].astype('str')

# The algorithms require a vectorized environment to run
env = DummyVecEnv([lambda: StockTradingEnv(df)])

model = PPO2(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=50)

obs = env.reset()
for i in range(len(df['Date'])):
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    print(rewards, done, info)
    env.render(title="AAPL")
Example #12
def main():
    """
    the main function
    it starts a droidbot according to the arguments given in cmd line
    """

    opts = parse_args()
    import os
    if not os.path.exists(opts.apk_path):
        print("APK does not exist.")
        return
    if not opts.output_dir and opts.cv_mode:
        print("To run in CV mode, you need to specify an output dir (using -o option).")

    if opts.distributed:
        if opts.distributed == "master":
            start_mode = "master"
        else:
            start_mode = "worker"
    else:
        start_mode = "normal"

    if start_mode == "master":
        droidmaster = DroidMaster(
            app_path=opts.apk_path,
            is_emulator=opts.is_emulator,
            output_dir=opts.output_dir,
            # env_policy=opts.env_policy,
            env_policy=env_manager.POLICY_NONE,
            policy_name=opts.input_policy,
            random_input=opts.random_input,
            script_path=opts.script_path,
            event_interval=opts.interval,
            timeout=opts.timeout,
            event_count=opts.count,
            cv_mode=opts.cv_mode,
            debug_mode=opts.debug_mode,
            keep_app=opts.keep_app,
            keep_env=opts.keep_env,
            profiling_method=opts.profiling_method,
            grant_perm=opts.grant_perm,
            enable_accessibility_hard=opts.enable_accessibility_hard,
            qemu_hda=opts.qemu_hda,
            qemu_no_graphic=opts.qemu_no_graphic,
            humanoid=opts.humanoid,
            ignore_ad=opts.ignore_ad,
            replay_output=opts.replay_output)
        droidmaster.start()
    else:
        droidbot = DroidBot(
            app_path=opts.apk_path,
            device_serial=opts.device_serial,
            is_emulator=opts.is_emulator,
            output_dir=opts.output_dir,
            # env_policy=opts.env_policy,
            env_policy=env_manager.POLICY_NONE,
            policy_name=opts.input_policy,
            random_input=opts.random_input,
            script_path=opts.script_path,
            event_interval=opts.interval,
            timeout=opts.timeout,
            event_count=opts.count,
            cv_mode=opts.cv_mode,
            debug_mode=opts.debug_mode,
            keep_app=opts.keep_app,
            keep_env=opts.keep_env,
            profiling_method=opts.profiling_method,
            grant_perm=opts.grant_perm,
            enable_accessibility_hard=opts.enable_accessibility_hard,
            master=opts.master,
            humanoid=opts.humanoid,
            ignore_ad=opts.ignore_ad,
            replay_output=opts.replay_output)

        droidbot.start()

    env = DummyVecEnv([lambda: droidbot_env.DroidBotEnv(droidbot)])
    start_time = time.time()
    env.reset()

    def events_so_state(env):
        events = env.envs[0].possible_events
        state_now = env.envs[0].device.get_current_state()
        event_ids = []
        probs = []

        for i, event in enumerate(events):
            event_str = str(type(event)) + '_' + event.get_event_str(state_now)
            if event_str in event_ids:
                1/0  # deliberately crash (ZeroDivisionError): duplicate event strings are unexpected
            if event:
                event_ids.append(event_str)
                probs.append(env.envs[0].events_probs[i])
        state = state_now.state_str
        probs = np.array(probs)
        return state, probs, event_ids

    state_function = {}
    num_iterations = 1000
    EPSILON = 0.1
    Q_TABLE = []
    transitions_matrix = None
    number_of_trans = []
    event_to_id = []
    max_number_of_actions = 50

    def check_state(state_id):
        nonlocal Q_TABLE
        nonlocal transitions_matrix
        nonlocal number_of_trans
        nonlocal event_to_id
        nonlocal state_function
        #print(state_id)
        if state_function.get(state_id) is None:
            if Q_TABLE == []:
                Q_TABLE = np.zeros((1, max_number_of_actions))
                transitions_matrix = np.zeros((1, max_number_of_actions, 1))
            else:
                Q_TABLE = np.concatenate([Q_TABLE, np.zeros((1, max_number_of_actions))], axis=0)
                transition_matrix_new = np.zeros((Q_TABLE.shape[0], max_number_of_actions, Q_TABLE.shape[0]))
                transition_matrix_new[:-1, :, :-1] = transitions_matrix
                transitions_matrix = transition_matrix_new
            event_to_id.append({})
            state_function[state_id] = Q_TABLE.shape[0] - 1
            Q_TABLE[-1][-1] = 1.0
            number_of_trans.append(np.zeros(max_number_of_actions))
        #print(state_function)
    state_pre, probs, event_ids = events_so_state(env)
    check_state(state_pre)
    state = state_function[state_pre]

    def make_decision(state_i, events):
        nonlocal Q_TABLE, event_to_id
        id_to_action = np.zeros((max_number_of_actions), dtype=np.int32) + 1000
        q_values = np.zeros(max_number_of_actions)
        probs_now = np.zeros(max_number_of_actions)

        for i, event in enumerate(events):
            if i == len(events) - 1:
                q_values[-1] = Q_TABLE[state_i][-1]
                id_to_action[-1] = min(len(events), max_number_of_actions) - 1
                continue
            if event_to_id[state_i].get(event) is None:
                if len(event_to_id[state_i]) >= max_number_of_actions - 1:
                    continue
                event_to_id[state_i][event] = int(len(list(event_to_id[state_i].keys())))
                Q_TABLE[state_i][event_to_id[state_i][event]] = 1.0
            q_values[event_to_id[state_i][event]] = Q_TABLE[state_i][event_to_id[state_i][event]]

            id_to_action[event_to_id[state_i][event]] = int(i)


        if np.random.rand() < EPSILON:
            action = max_number_of_actions - 1
            make_action = id_to_action[action]
        else:
            max_q = np.max(q_values)
            actions_argmax = np.arange(max_number_of_actions)[q_values >= max_q - 0.0001]
            probs_unnormed = 1/(np.arange(actions_argmax.shape[0]) + 1.)
            probs_unnormed /= np.sum(probs_unnormed)
            action = np.random.choice(actions_argmax)  # note: probs_unnormed is computed above but not used here
            make_action = id_to_action[action]
        return action, make_action

    for i_step in np.arange(num_iterations):
        action, make_action = make_decision(state, event_ids)
        print(state, action, make_action)
        env.step([make_action])
        new_state_pre, probs, event_ids = events_so_state(env)

        check_state(new_state_pre)
        new_state = state_function[new_state_pre]

        number_of_trans[state][action] += 1
        transitions_matrix[state, action] *= (number_of_trans[state][action] - 1)
        transitions_matrix[state, action, new_state] += 1
        transitions_matrix[state, action] /= number_of_trans[state][action]
        for _ in np.arange(10):
            for i in np.arange(max_number_of_actions):
                transitions = transitions_matrix[:, i, :]
                q_target = np.array([[np.max(Q_TABLE[i])] for i in np.arange(Q_TABLE.shape[0])])
                new_q_values = np.matmul(transitions, q_target) * 0.99
                good_states = np.sum(transitions, axis=1) > 0.5
                if True in good_states:
                    Q_TABLE[good_states, i] = new_q_values[good_states, 0]
                else:
                    continue
        for i in np.arange(Q_TABLE.shape[0]):
            print(Q_TABLE[i])
        if i_step%10==0:
            np.save('q_function', Q_TABLE)
            np.save('transition_function', transitions_matrix)
            with open('states.json', 'w') as f:
                json.dump(state_function, f)
        state = new_state
    1/0  # NOTE: raises ZeroDivisionError, so droidbot.stop() below is never reached
    droidbot.stop()
Example #13
from env import HillCartpole
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.sac.policies import MlpPolicy as SacMlpPolicy
from stable_baselines import SAC

e = HillCartpole(
    verbose=True,
    trig_observations=True,
    windup_penalty=0.1
)

e = DummyVecEnv([lambda: e])

# Each trial of 400 timesteps takes ~14 seconds. Thus, to run for 12 hours
# we would want ~3000 trials.

TRIALS = 3000

model = SAC(SacMlpPolicy, e, verbose=0, seed=0, tensorboard_log="./cartpole_tboard/")

try:
    model.learn(
        total_timesteps=400*TRIALS, tb_log_name='sac'
    )
except KeyboardInterrupt:
    print('Exiting early.')

e.reset()

print('Model learned, saving:')
model.save('sac_learned_windup_01')
Example #14
        os.makedirs(LOGDIR)

    MILLION = 1000000
    TRAIN_STEPS = args.n
    if TRAIN_STEPS is None:
        TRAIN_STEPS = 60 * MILLION

    N_ENVS = 6
    if args.debug:
        if args.backend in ["VAE_LSTM", "VAE1D_LSTM"]:
            # can't share encoder for VAE_LSTM as it contains RNN state internally
            shared_encoder = None
        env = DummyVecEnv([
            lambda: NavRepTrainEncodedEnv(args.backend,
                                          args.encoding,
                                          silent=True,
                                          scenario='train',
                                          gpu=not args.no_gpu,
                                          shared_encoder=shared_encoder)
        ] * N_ENVS)
    else:
        env = SubprocVecEnv([
            lambda: NavRepTrainEncodedEnv(args.backend,
                                          args.encoding,
                                          silent=True,
                                          scenario='train',
                                          gpu=not args.no_gpu)
        ] * N_ENVS,
                            start_method='spawn')
    eval_env = NavRepTrainEncodedEnv(args.backend,
                                     args.encoding,
                                     silent=True,
Example #15
        t1 = time.time()
        model.learn(total_timesteps=algo_config["iters"],
                    callback=checkpoint_callback)
        t2 = time.time()

        # Make tb run script inside tb dir
        if os.path.exists(os.path.join("tb", config["session_ID"])):
            copyfile("tb_runner.py",
                     os.path.join("tb", config["session_ID"], "tb_runner.py"))

        print("Training time: {}".format(t2 - t1))
        pprint(config)

        model.save("agents/{}_SB_policy".format(config["session_ID"]))
        env.save(stats_path)
        env.close()

    if args["test"] and socket.gethostname() != "goedel":
        stats_path = "agents/{}_vecnorm.pkl".format(
            args["test_agent_path"][:3])
        env_fun = my_utils.import_env(env_config["env_name"])
        #env = env_fun(config)  # Default, without normalization
        env = DummyVecEnv([lambda: env_fun(config)])
        #env = VecNormalize.load(stats_path, env)
        env.training = False
        env.norm_reward = False

        model = load_model(config)

        test_agent(env, model, deterministic=True)
Example #16
    del hyperparams['n_timesteps']

    # Create the environment and wrap it if necessary
    if is_atari:
        print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=args.seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif args.algo in ['dqn', 'ddpg']:
        if hyperparams.get('normalize', False):
            print("WARNING: normalization not supported yet for DDPG/DQN")
        env = gym.make(env_id)
        env.seed(args.seed)
    else:
        if n_envs == 1:
            env = DummyVecEnv([make_env(env_id, 0, args.seed)])
        else:
            env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
        if normalize:
            print("Normalizing input and return")
            env = VecNormalize(env)

    # Parse noise string for DDPG
    if args.algo == 'ddpg' and hyperparams.get('noise_type') is not None:
        noise_type = hyperparams['noise_type'].strip()
        noise_std = hyperparams['noise_std']
        n_actions = env.action_space.shape[0]
        if 'adaptive-param' in noise_type:
            hyperparams['param_noise'] = AdaptiveParamNoiseSpec(initial_stddev=noise_std,
                                                                desired_action_stddev=noise_std)
        elif 'normal' in noise_type:
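            # The snippet is truncated here; a hedged completion assuming the standard
            # NormalActionNoise pattern for DDPG (the hyperparams key and the import are assumptions):
            hyperparams['action_noise'] = NormalActionNoise(mean=np.zeros(n_actions),
                                                            sigma=noise_std * np.ones(n_actions))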
Example #17
            sql = 'COPY {} ({}) FROM STDIN WITH CSV'.format(
                table_name, columns)
            cur.copy_expert(sql=sql, file=s_buf)

    study = optuna.create_study(study_name='cartpol_optuna',
                                storage='sqlite:///params.db',
                                load_if_exists=True)
    model_params = get_model_params()
    print(model_params)

    n_cpu = 6

    env = SubprocVecEnv([make_env(i)
                         for i in range(n_cpu)])  # set the number of worker processes
    test_env = DummyVecEnv([make_envTest(i) for i in range(1)])

    # model = PPO2(MlpLnLstmPolicy, env, nminibatches=1, verbose=1, n_steps=49, tensorboard_log="./tensorboard_keep/",
    #              **model_params)
    model = PPO2.load("model/Windows/model_epoch_4.pkl",
                      nminibatches=n_cpu,
                      env=env,
                      verbose=1,
                      n_steps=49,
                      tensorboard_log="./tensorboard_keep_trade/")
    model.is_tb_set = True

    for n_epoch in range(0, 1):
        summary_writer = tf.compat.v1.summary.FileWriter(
            "./tensorboard_keep/" + "Keep_trade_test_" + str(n_epoch + 1))
        print('\x1b[6;30;42m' + '**************  Calculate epoch:', n_epoch,
Example #18
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--generate_pretrain",
        type=int,
        default=0,
        help="If true, launch an interface to generate an expert trajectory")

    parser.add_argument(
        "--train",
        type=int,
        default=1,
        help="True: training, False: using a trained model")

    parser.add_argument(
        "--algo",
        type=str,
        default="ppo2",
        help="The learning algorithm to be used (ppo2 or ddpg)")

    parser.add_argument(
        "--model",
        type=str,
        default="",
        help="The version name of the model")

    parser.add_argument(
        "--gui",
        type=int,
        default=1,
        help="Wether the GUI of the simulation should be used or not. 0 or 1")

    args = parser.parse_args()
    algo = args.algo.lower()

    try:
        assert args.gui == 0 or args.gui == 1
        assert algo == "ppo2" or algo == "ddpg"

    except AssertionError as e:
        print(str(e))
        return

    env = RobotEnv(gui=args.gui)
    vec_env = DummyVecEnv([lambda: env])

    # Generate an expert trajectory
    if args.generate_pretrain:
        pass
    
    # Train a model
    elif args.train == 1:
        while True:
            req = Request(
                "https://frightanic.com/goodies_content/docker-names.php",
                headers={'User-Agent': 'Mozilla/5.0'})

            webpage = str(urlopen(req).read())
            word = webpage.split("b\'")[1]
            word = word.split("\\")[0]
            word = word.replace(" ", "_")

            try:
                assert os.path.isfile(
                    "models/" + algo + "_throw_" + word + ".pkl")

            except AssertionError:
                break

        log_name = "./logs/throw/" + word

        if algo == "ppo2":
            # For recurrent policies, nminibatches should be a multiple of the 
            # nb of env used in parallel (so for LSTM, 1)
            model = PPO2(
                MlpLstmPolicy,
                vec_env,
                nminibatches=1,
                verbose=0,
                tensorboard_log=log_name)

        elif algo == "ddpg":
            action_noise = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(env.action_space.shape[-1]),
                sigma=float(0.5) * np.ones(env.action_space.shape[-1]))

            model = DDPG(
                stable_baselines.ddpg.LnMlpPolicy,
                env,
                verbose=0,
                param_noise=None,
                action_noise=action_noise,
                tensorboard_log=log_name)

        try:
            model.learn(total_timesteps=1000000)
        
        except KeyboardInterrupt:
            print("#---------------------------------#")
            print("Training \'" + word + "\' interrupted")
            print("#---------------------------------#")
            sys.exit(1)

        model.save("models/" + algo + "_throw_" + word)

    # Use a trained model
    else:
        if args.model == "":
            print("Specify the version of the model using --model")
            return

        if algo == "ppo2":
            model = PPO2.load("models/" + algo + "_throw_" + args.model)
        elif algo == "ddpg":
            model = DDPG.load("models/" + algo + "_throw_" + args.model)

        for test in range(10):
            dones = False
            obs = env.reset()

            while not dones:
                action, _states = model.predict(obs)
                obs, rewards, dones, info = env.step(action)

    time.sleep(2)
    env._termination()
Example #19
def test_executive():

    episodes = 50
    max_timesteps = 1000

    #Input elements for various trained policies
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '--expert_policy_dir',
        type=str)  # trained policy directory, not necessary for openloop
    arg_parser.add_argument('--type', type=str)  #ars, ppo1, ppo2, openloop
    arg_parser.add_argument('--mode',
                            type=str)  #linearbias, mlp, 3finger, 2finger
    arg_parser.add_argument('--render', type=str)  #on, off
    args = arg_parser.parse_args()
    mode_dict = {}
    mode_dict['mode'] = args.mode

    #Parsing parameters/file to reset environment for testing objects
    categ = ['shape', 'x', 'y', 'rot_z', 'len', 'width', 'height', 'radius']
    path = os.path.join(os.environ['GRASPING_RL'],
                        'evaluation/baseline_testset_calcs.csv')
    csv_file = pd.read_csv(path,
                           sep='\t',
                           header=None,
                           names=categ,
                           skiprows=1)
    parser = BaselineParser(csv_file)

    #Environment Creation
    env_id = 'ROAMHandGraspCube-v1'
    if "ppo" in args.type:
        env = VecNormalize(DummyVecEnv([lambda: gym.make(env_id)]),
                           norm_reward=False)
        env.set_attr('_max_episode_steps', max_timesteps)
        env.env_method('set_evaluation')
        env.load_running_average(args.expert_policy_dir)
    else:
        env = gym.make(env_id)
        env._max_episode_steps = max_timesteps
        env.env.set_evaluation()

    #Testing loop to evaluate grasps on 50 objects
    total_successes = np.zeros(episodes)
    for i in range(episodes):
        obs = env.reset()
        params = parser.get_testcase(i)

        if "ppo" in args.type:
            env.env_method('set_object', params)
            success = SuccessCriterion(env.get_attr('sim')[0])
        else:
            env.env.set_object(params)
            success = SuccessCriterion(env.env.sim)

        agent = MetaAgent(env=env,
                          load_dir=args.expert_policy_dir,
                          load_type=args.type,
                          **mode_dict)

        #Per episode simulation and evaluation
        success_array = np.zeros(max_timesteps)
        for j in range(max_timesteps):
            action = agent.act(obs)
            if args.type == 'openloop':
                env.env.sim.step()
            else:
                obs, reward, done, info = env.step(action)
            if args.render != 'off':
                env.render()
            success_array[j] = success.grasp_criteria()

        #Success Criterion Evaluation
        if np.sum((success_array)) >= 250:
            total_successes[i] = 1
            print("Baseline {} is a Success!".format(i), np.sum(
                (success_array)))
        else:
            total_successes[i] = 0
            print("Baseline {} is a Failure!".format(i), np.sum(
                (success_array)))

    return total_successes
Example #20
def test_model_manipulation(model_class):
    """
    Test if the algorithm can be loaded and saved without any issues, the environment switching
    works and that the action prediction works

    :param model_class: (BaseRLModel) A model
    """
    try:
        env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

        # create and train
        model = model_class(policy="MlpPolicy", env=env)
        model.learn(total_timesteps=NUM_TIMESTEPS, seed=0)

        # predict and measure the acc reward
        acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            acc_reward += reward
        acc_reward = sum(acc_reward) / N_TRIALS

        # saving
        model.save("./test_model")

        del model, env

        # loading
        model = model_class.load("./test_model")

        # changing environment (note: this can be done at loading)
        env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])
        model.set_env(env)

        # predict the same output before saving
        loaded_acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            loaded_acc_reward += reward
        loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS
        # assert <15% diff
        assert abs(acc_reward - loaded_acc_reward) / max(acc_reward, loaded_acc_reward) < 0.15, \
            "Error: the prediction seems to have changed between loading and saving"

        # learn post loading
        model.learn(total_timesteps=100, seed=0)

        # validate no reset post learning
        # This test was failing from time to time for no good reason
        # other than bad luck
        # We should change this test
        # loaded_acc_reward = 0
        # set_global_seeds(0)
        # obs = env.reset()
        # for _ in range(N_TRIALS):
        #     action, _ = model.predict(obs)
        #     obs, reward, _, _ = env.step(action)
        #     loaded_acc_reward += reward
        # loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS
        # # assert <10% diff
        # assert abs(acc_reward - loaded_acc_reward) / max(acc_reward, loaded_acc_reward) < 0.1, \
        #     "Error: the prediction seems to have changed between pre learning and post learning"

        # predict new values
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, _, _, _ = env.step(action)

        # Free memory
        del model, env

    finally:
        if os.path.exists("./test_model"):
            os.remove("./test_model")
Example #21
    def create_env(n_envs, eval_env=False, no_log=False):
        """
        Create the environment and wrap it if necessary
        :param n_envs: (int)
        :param eval_env: (bool) Whether is it an environment used for evaluation or not
        :param no_log: (bool) Do not log training when doing hyperparameter optim
            (issue with writing the same file)
        :return: (Union[gym.Env, VecEnv])
        """
        global hyperparams
        global env_kwargs

        # Do not log eval env (issue with writing the same file)
        log_dir = None if eval_env or no_log else save_path

        # Set initializer and action type for the environment; the standard implementation currently
        # does not support custom types, so pass them here (env_kwargs is global, so set them again on repeated calls)
        if "initializer" in env_kwargs.keys() and isinstance(
                env_kwargs["initializer"], int):
            if env_kwargs["initializer"] == 0:
                env_kwargs["initializer"] = RandomInitializer(
                    env_kwargs.pop("difficulty"))
            elif env_kwargs["initializer"] == 1:
                env_kwargs["initializer"] = CompletelyRandomInitializer()
            else:
                raise RuntimeError('Unsupported initializer "{}"'.format(
                    env_kwargs["initializer"]))

        if "action_type" in env_kwargs.keys() and isinstance(
                env_kwargs["action_type"], int):
            if env_kwargs["action_type"] == "POSITION":
                env_kwargs["action_type"] = ActionType.POSITION
            elif env_kwargs["action_type"] == "TORQUE":
                env_kwargs["action_type"] = ActionType.TORQUE
            elif env_kwargs["action_type"] == "TORQUE_AND_POSITION":
                env_kwargs["action_type"] = ActionType.TORQUE_AND_POSITION
            else:
                raise RuntimeError('Unsupported Action Type "{}"'.format(
                    env_kwargs["action_type"]))
        else:
            env_kwargs["action_type"] = ActionType.POSITION

        if is_atari:
            if args.verbose > 0:
                print("Using Atari wrapper")
            env = make_atari_env(env_id, num_env=n_envs, seed=args.seed)
            # Frame-stacking with 4 frames
            env = VecFrameStack(env, n_stack=4)
        elif algo_ in ['dqn', 'ddpg']:
            if hyperparams.get('normalize', False):
                print("WARNING: normalization not supported yet for DDPG/DQN")
            env = gym.make(env_id, **env_kwargs)
            env.seed(args.seed)
            if env_wrapper is not None:
                env = env_wrapper(env)
        else:
            if n_envs == 1:
                env = DummyVecEnv([
                    make_env(env_id,
                             0,
                             args.seed,
                             wrapper_class=env_wrapper,
                             log_dir=log_dir,
                             env_kwargs=env_kwargs)
                ])
            else:
                # env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
                # On most env, SubprocVecEnv does not help and is quite memory hungry
                env = DummyVecEnv([
                    make_env(env_id,
                             i,
                             args.seed,
                             log_dir=log_dir,
                             wrapper_class=env_wrapper,
                             env_kwargs=env_kwargs) for i in range(n_envs)
                ])
            if normalize:
                # Copy to avoid changing default values by reference
                local_normalize_kwargs = normalize_kwargs.copy()
                # Do not normalize reward for env used for evaluation
                if eval_env:
                    if len(local_normalize_kwargs) > 0:
                        local_normalize_kwargs['norm_reward'] = False
                    else:
                        local_normalize_kwargs = {'norm_reward': False}

                if args.verbose > 0:
                    if len(local_normalize_kwargs) > 0:
                        print("Normalization activated: {}".format(
                            local_normalize_kwargs))
                    else:
                        print("Normalizing input and reward")
                env = VecNormalize(env, **local_normalize_kwargs)

        # Optional Frame-stacking
        if hyperparams.get('frame_stack', False):
            n_stack = hyperparams['frame_stack']
            env = VecFrameStack(env, n_stack)
            print("Stacking {} frames".format(n_stack))
        if args.algo == 'her':
            # Wrap the env if need to flatten the dict obs
            if isinstance(env, VecEnv):
                env = _UnvecWrapper(env)
            env = HERGoalEnvWrapper(env)
        return env
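
    # A hedged usage sketch (not part of the original snippet): create_env closes over variables from
    # the enclosing scope (hyperparams, env_kwargs, args, env_id, ...), and is typically invoked as:
    # env = create_env(n_envs)                 # training environment
    # eval_env = create_env(1, eval_env=True)  # evaluation environment (reward normalization disabled)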
Example #22
import gym

from stable_baselines.common.policies import CnnPolicy
from stable_baselines.common.vec_env import DummyVecEnv, VecFrameStack
from stable_baselines import PPO2

# vectorized environment (DummyVecEnv runs the n_cpu copies sequentially in one process)
n_cpu = 4
env = DummyVecEnv([lambda: gym.make('CarRacing-v0') for i in range(n_cpu)])
env = VecFrameStack(env, n_stack=4)
model = PPO2(CnnPolicy, env, verbose=1, tensorboard_log='logs/')
model.learn(total_timesteps=10000000)
model.save("CarRacing_ppo")

del model  # remove to demonstrate saving and loading

model = PPO2.load("CarRacing_ppo")

# Enjoy trained agent
obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
Example #23
from stable_baselines import PPO2

import imageio
import numpy as np

from stable_baselines.common.policies import MlpPolicy, ActorCriticPolicy
from stable_baselines import A2C
from stable_baselines.her import HER
from stable_baselines.common.vec_env import DummyVecEnv

from active_env.envs import TwoBusEnv

#MountainCarContinuous-v0

powerenv = TwoBusEnv()
powerenv = DummyVecEnv([lambda: powerenv])

model = A2C(MlpPolicy, "CartPole-v1", verbose=1)
model1 = HER(ActorCriticPolicy, 'MountainCarContinuous-v0', verbose=1)
powermodel = A2C(MlpPolicy,
                 powerenv,
                 verbose=1,
                 tensorboard_log='C:\\Users\\vegar\\Dropbox\\Master\\logs')

model.learn(50000)

#model = HER(ActorCriticPolicy,'MountainCarContinuous-v0', verbose=1)
#model.learn(30000)

images = []
obs = model.env.reset()
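
# A hedged sketch (the original snippet is truncated here) of the usual imageio GIF-recording loop;
# the frame count, output filename and fps are assumptions.
img = model.env.render(mode='rgb_array')
for i in range(350):
    images.append(img)
    action, _ = model.predict(obs)
    obs, _, _, _ = model.env.step(action)
    img = model.env.render(mode='rgb_array')
imageio.mimsave('cartpole_a2c.gif',
                [np.array(img) for i, img in enumerate(images) if i % 2 == 0], fps=29)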
Example #24
def create_test_env(env_id,
                    n_envs=1,
                    is_atari=False,
                    stats_path=None,
                    seed=0,
                    log_dir='',
                    should_render=True,
                    hyperparams=None):
    """
    Create environment for testing a trained agent

    :param env_id: (str)
    :param n_envs: (int) number of processes
    :param is_atari: (bool)
    :param stats_path: (str) path to folder containing saved running averages
    :param seed: (int) Seed for random number generator
    :param log_dir: (str) Where to log rewards
    :param should_render: (bool) For Pybullet env, display the GUI
    :param env_wrapper: (type) A subclass of gym.Wrapper to wrap the original
                        env with
    :param hyperparams: (dict) Additional hyperparams (ex: n_stack)
    :return: (gym.Env)
    """
    # HACK to save logs
    if log_dir is not None:
        os.environ["OPENAI_LOG_FORMAT"] = 'csv'
        os.environ["OPENAI_LOGDIR"] = os.path.abspath(log_dir)
        os.makedirs(log_dir, exist_ok=True)
        logger.configure()

    # Create the environment and wrap it if necessary
    env_wrapper = get_wrapper_class(hyperparams)
    if 'env_wrapper' in hyperparams.keys():
        del hyperparams['env_wrapper']

    if is_atari:
        print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif n_envs > 1:
        # start_method = 'spawn' for thread safe
        env = SubprocVecEnv([
            make_env(env_id, i, seed, log_dir, wrapper_class=env_wrapper)
            for i in range(n_envs)
        ])
    # Pybullet envs does not follow gym.render() interface
    elif "Bullet" in env_id:
        spec = gym.envs.registry.env_specs[env_id]
        try:
            class_ = load(spec.entry_point)
        except AttributeError:
            # Backward compatibility with gym
            class_ = load(spec._entry_point)
        # HACK: force SubprocVecEnv for Bullet env that does not
        # have a render argument
        render_name = None
        use_subproc = 'renders' not in inspect.getfullargspec(
            class_.__init__).args
        if not use_subproc:
            render_name = 'renders'
        # Dev branch of pybullet
        # use_subproc = use_subproc and 'render' not in inspect.getfullargspec(class_.__init__).args
        # if not use_subproc and render_name is None:
        #     render_name = 'render'

        # Create the env, with the original kwargs, and the new ones overriding them if needed
        def _init():
            # TODO: fix for pybullet locomotion envs
            env = class_(**{**spec._kwargs}, **{render_name: should_render})
            env.seed(0)
            if log_dir is not None:
                env = Monitor(env,
                              os.path.join(log_dir, "0"),
                              allow_early_resets=True)
            return env

        if use_subproc:
            env = SubprocVecEnv([
                make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper)
            ])
        else:
            env = DummyVecEnv([_init])
    else:
        env = DummyVecEnv(
            [make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper)])

    # Load saved stats for normalizing input and rewards
    # And optionally stack frames
    if stats_path is not None:
        if hyperparams['normalize']:
            print("Loading running average")
            print("with params: {}".format(hyperparams['normalize_kwargs']))
            env = VecNormalize(env,
                               training=False,
                               **hyperparams['normalize_kwargs'])
            env.load_running_average(stats_path)

        n_stack = hyperparams.get('frame_stack', 0)
        if n_stack > 0:
            print("Stacking {} frames".format(n_stack))
            env = VecFrameStack(env, n_stack)
    return env
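
# A hedged usage sketch (hypothetical env id, stats path and hyperparams; not
# from the original file): building an evaluation env that reuses the saved
# VecNormalize statistics and renders the GUI.
# test_env = create_test_env('HalfCheetahBulletEnv-v0',
#                            n_envs=1,
#                            stats_path='logs/ppo2/HalfCheetahBulletEnv-v0_1/',
#                            seed=0,
#                            log_dir=None,
#                            should_render=True,
#                            hyperparams={'normalize': True,
#                                         'normalize_kwargs': {'norm_reward': False}})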
Example #25
0
    parser.add_argument('--result_name', type=str, default='stabilize_highway', help='Name of saved model')
    args = parser.parse_args()
    model = run_model(args.num_cpus, args.rollout_size, args.num_steps)
    # Save the model to a desired folder and then delete it to demonstrate loading
    if not os.path.exists(os.path.realpath(os.path.expanduser('~/baseline_results'))):
        os.makedirs(os.path.realpath(os.path.expanduser('~/baseline_results')))
    path = os.path.realpath(os.path.expanduser('~/baseline_results'))
    save_path = os.path.join(path, args.result_name)
    print('Saving the trained model!')
    model.save(save_path)
    # dump the flow params
    with open(os.path.join(path, args.result_name) + '.json', 'w') as outfile:
        json.dump(flow_params, outfile, cls=FlowParamsEncoder, sort_keys=True, indent=4)
    del model
    del flow_params

    # Replay the result by loading the model
    print('Loading the trained model and testing it out!')
    model = PPO2.load(save_path)
    flow_params = get_flow_params(os.path.join(path, args.result_name) + '.json')
    flow_params['sim'].render = True
    env_constructor = env_constructor(params=flow_params, version=0)()
    env = DummyVecEnv([lambda: env_constructor])  # The algorithms require a vectorized environment to run
    obs = env.reset()
    reward = 0
    for i in range(flow_params['env'].horizon):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        reward += rewards
    print('the final reward is {}'.format(reward))
Example #26
0
def main():
    parser = argparse.ArgumentParser(description='PPO baseline implementation')
    parser.add_argument('-e',
                        '--experiment',
                        type=str,
                        default='ppo_test',
                        help='name of experiment')
    parser.add_argument('-w',
                        '--env',
                        type=str,
                        default='Shepherd-v0',
                        help='name of gym environment')
    parser.add_argument('-m',
                        '--mode',
                        type=str,
                        default='train',
                        help='mode to run experiment')
    parser.add_argument('-p',
                        '--policy',
                        type=str,
                        default='mlp',
                        help='type of policy network')
    parser.add_argument('-t',
                        '--timesteps',
                        type=int,
                        default=10000,
                        help='number of timesteps to train')
    parser.add_argument('-d',
                        '--datapath',
                        type=str,
                        default='../data',
                        help='path to save results')
    args = parser.parse_args()

    mode = args.mode
    env_name = args.env
    policy = args.policy
    data_path = args.datapath
    timesteps = args.timesteps
    experiment = args.experiment

    exp_path = '{}/{}'.format(data_path, experiment)
    log_path = '{}/log_{}'.format(exp_path, timesteps)
    model_path = '{}/model_{}'.format(exp_path, timesteps)

    env = gym.make(env_name)
    env = shepherd_gym.wrappers.SamplerWrapper(env,
                                               demo_path='../data/curriculum',
                                               increment_freq=250)
    env = DummyVecEnv([lambda: env])

    if policy == 'mlp':
        policy_type = MlpPolicy
    else:
        policy_type = MlpLstmPolicy

    model = PPO2(policy_type,
                 env,
                 verbose=1,
                 tensorboard_log=log_path,
                 nminibatches=1)

    if mode == 'train':
        model.learn(total_timesteps=timesteps)
        model.save(model_path)
    else:
        # PPO2.load is a classmethod that returns a new model,
        # so rebind the name instead of calling load() on the instance
        model = PPO2.load(model_path, env=env)

    env.render()
    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs)
        obs, _, _, _ = env.step(action)
        env.render()

    # complete simulation
    env.close()
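
# A hedged sketch, not part of the original script: when a recurrent policy is
# chosen (the MlpLstmPolicy branch above), the LSTM state and done mask have to
# be threaded through predict() explicitly, e.g.:
# obs = env.reset()
# state, done = None, [False]
# for _ in range(1000):
#     action, state = model.predict(obs, state=state, mask=done)
#     obs, _, done, _ = env.step(action)
#     env.render()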
Example #27
0
        "body_style": "donkey",
        "body_rgb": (128, 128, 128),
        "car_name": "me",
        "font_size": 100,
        "racer_name": "PPO",
        "country": "USA",
        "bio": "Learning to drive w PPO RL",
        "guid": str(uuid.uuid4()),
        "max_cte": 10,
    }

    if args.test:

        # Make an environment to test our trained policy
        env = gym.make(args.env_name, conf=conf)
        env = DummyVecEnv([lambda: env])

        model = PPO2.load("ppo_donkey")

        obs = env.reset()
        for i in range(1000):
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            env.render()

        print("done testing")

    else:

        #make gym env
        env = gym.make(args.env_name, conf=conf)
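        # Assumed continuation (hedged): the snippet is cut off here. A plausible
        # training branch that matches the "ppo_donkey" name loaded above; the
        # CnnPolicy choice and timestep budget are guesses, not the author's code.
        env = DummyVecEnv([lambda: env])
        model = PPO2("CnnPolicy", env, verbose=1)
        model.learn(total_timesteps=10000)
        model.save("ppo_donkey")
        print("done training")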
Example #28
0
import gym
import json
import datetime as dt

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2

from env.SingleMaze import newMaze

import pandas as pd
walls = {2, 5, 12, 13}
#env = DummyVecEnv([lambda: TestWorld(3,5,walls)])
env = DummyVecEnv([lambda: newMaze(5)])
model = PPO2(MlpPolicy, env, verbose=1).learn(total_timesteps=20000)
#model.learn(total_timesteps=20000)

obs = env.reset()
for i in range(2000):
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    env.render()
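
# A hedged follow-up sketch, not in the original snippet: a deterministic
# evaluation loop that tallies episode returns (DummyVecEnv auto-resets, so
# `done[0]` marks an episode boundary).
# episode_returns, current_return = [], 0.0
# obs = env.reset()
# for _ in range(2000):
#     action, _states = model.predict(obs, deterministic=True)
#     obs, rewards, done, info = env.step(action)
#     current_return += rewards[0]
#     if done[0]:
#         episode_returns.append(current_return)
#         current_return = 0.0
# print("mean episode return:", sum(episode_returns) / max(len(episode_returns), 1))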
Example #29
0
import gym

from stable_baselines.common.policies import MlpPolicy
from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv,VecCheckNan
import numpy as np
from os import listdir
from os.path import isfile, join
from stable_baselines.gail import ExpertDataset
from generate_pretraindata import generate_pretraindata_heuristics
# Using only one expert trajectory
# you can specify `traj_limitation=-1` for using the whole dataset

is_box_space = False
dirname = "D:\\4-System\\rusty\\"
filename="400000_heutistic_pretrain_"
filename += "box" if is_box_space else "discrete"
dataset = ExpertDataset(expert_path=dirname + filename+'.npz', batch_size=1,sequential_preprocessing=True)
# pretrain_data = generate_pretraindata_heuristics(int(5e4), int(100e5), is_box_space)
# dataset = ExpertDataset(traj_data=pretrain_data, batch_size=1,sequential_preprocessing=True)
np.seterr(all='raise')

env = gym.make("rustybox-v0" if is_box_space else "rustydiscrete-v0")

env.max_invalid_tries = 7
env = VecCheckNan(DummyVecEnv([lambda: env]))

# Instantiate the agent
model = PPO2('MlpPolicy', env, nminibatches=1)

model.pretrain(dataset, n_epochs=1)
# Save the agent
model.save("models/pretrain/"+filename)
Example #30
0
    n_timesteps = args.n_timesteps
else:
    n_timesteps = int(hyperparams['n_timesteps'])
del hyperparams['n_timesteps']

normalize = False
normalize_kwargs = {}
if 'normalize' in hyperparams.keys():
    normalize = hyperparams['normalize']
    if isinstance(normalize, str):
        normalize_kwargs = eval(normalize)
        normalize = True
    del hyperparams['normalize']

if not args.teleop:
    env = DummyVecEnv([make_env(args.seed, vae=vae, teleop=args.teleop)])
else:
    env = make_env(args.seed,
                   vae=vae,
                   teleop=args.teleop,
                   n_stack=hyperparams.get('frame_stack', 1))()

if normalize:
    # 'normalize' was already removed from hyperparams above, so check the flag directly
    if args.algo in ['ddpg']:
        print("WARNING: normalization not supported yet for DDPG")
    else:
        print("Normalizing input and return")
        env = VecNormalize(env, **normalize_kwargs)

# Optional Frame-stacking
n_stack = 1
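
# Assumed continuation (hedged): the fragment ends here in the original. The
# "Optional Frame-stacking" comment above is typically followed by something
# like this (VecFrameStack assumed to be imported elsewhere in the file):
# if hyperparams.get('frame_stack', False):
#     n_stack = hyperparams['frame_stack']
#     env = VecFrameStack(env, n_stack)
#     print("Stacking {} frames".format(n_stack))
#     del hyperparams['frame_stack']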