def test_identity():
    with tf.Graph().as_default():
        env = IdentityEnv(10)
        random.seed(0)
        tf.set_random_seed(0)
        param_noise = False
        model = deepq.models.mlp([32])
        act = deepq.learn(
            env,
            q_func=model,
            lr=1e-3,
            max_timesteps=10000,
            buffer_size=50000,
            exploration_fraction=0.1,
            exploration_final_eps=0.02,
            print_freq=10,
            param_noise=param_noise,
        )
        tf.set_random_seed(0)
        N_TRIALS = 1000
        sum_rew = 0
        obs = env.reset()
        for i in range(N_TRIALS):
            obs, rew, done, _ = env.step(act([obs]))
            sum_rew += rew
        assert sum_rew > 0.9 * N_TRIALS

def main():
    logger.configure()
    env = make_atari('PongNoFrameskip-v4')
    env = bench.Monitor(env, logger.get_dir())
    env = deepq.wrap_atari_dqn(env)

    model = deepq.learn(
        env,
        "conv_only",
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=True,
        lr=1e-4,
        total_timesteps=int(1e7),
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
    )

    model.save('pong_model.pkl')
    env.close()

def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6)
    parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    parser.add_argument('--checkpoint-freq', type=int, default=10000)
    parser.add_argument('--checkpoint-path', type=str, default=None)
    args = parser.parse_args()

    logger.configure()
    set_global_seeds(args.seed)
    env = make_atari(args.env)
    env = bench.Monitor(env, logger.get_dir())
    env = deepq.wrap_atari_dqn(env)
    model = deepq.models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=bool(args.dueling),
    )

    deepq.learn(
        env,
        q_func=model,
        lr=1e-4,
        max_timesteps=args.num_timesteps,
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=bool(args.prioritized),
        prioritized_replay_alpha=args.prioritized_replay_alpha,
        checkpoint_freq=args.checkpoint_freq,
        checkpoint_path=args.checkpoint_path,
    )

    env.close()

def main(): env = gym.make("CartPole-v0") act = deepq.learn(env, network='mlp', total_timesteps=0, load_path="cartpole_model.pkl") while True: obs, done = env.reset(), False episode_rew = 0 while not done: env.render() obs, rew, done, _ = env.step(act(obs[None])[0]) episode_rew += rew print("Episode reward", episode_rew)
def main():
    env = CartPoleBulletEnv(renders=False)
    model = deepq.models.mlp([64])
    act = deepq.learn(
        env,
        q_func=model,
        lr=1e-3,
        max_timesteps=100000,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        print_freq=10,
        callback=callback,
    )
    print("Saving model to cartpole_model.pkl")
    act.save("cartpole_model.pkl")

def main(): env = gym.make("CartPole-v0") act = deepq.learn( env, network='mlp', lr=1e-3, total_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, print_freq=10, callback=callback ) print("Saving model to cartpole_model.pkl") act.save("cartpole_model.pkl")
def main(): env = gym.make("Wavefollower-v0") model = deepq.models.mlp([64,64]) act = deepq.learn( env, q_func=model, lr=1e-3, max_timesteps=2500000, buffer_size=50000, exploration_fraction=0.4, exploration_final_eps=0.02, print_freq=1 ) print("Saving model to wavefollower_model.pkl") act.save("wavefollower_model.pkl")
def main():
    env = KukaGymEnv(renders=False, isDiscrete=True)
    model = deepq.models.mlp([64])
    act = deepq.learn(
        env,
        q_func=model,
        lr=1e-3,
        max_timesteps=10000000,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        print_freq=10,
        callback=callback,
    )
    print("Saving model to kuka_model.pkl")
    act.save("kuka_model.pkl")

def main(): env = gym.make("MountainCar-v0") # Enabling layer_norm here is import for parameter space noise! act = deepq.learn( env, network=models.mlp(num_hidden=64, num_layers=1), lr=1e-3, total_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.1, print_freq=10, param_noise=True ) print("Saving model to mountaincar_model.pkl") act.save("mountaincar_model.pkl")
def main(): env = gym.make("MountainCar-v0") act = deepq.learn( env, network=models.mlp(num_layers=1, num_hidden=64), total_timesteps=0, load_path='mountaincar_model.pkl' ) while True: obs, done = env.reset(), False episode_rew = 0 while not done: env.render() obs, rew, done, _ = env.step(act(obs[None])[0]) episode_rew += rew print("Episode reward", episode_rew)
def main():
    env = KukaCamGymEnv(renders=False, isDiscrete=True)
    model = deepq.models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=False,
    )
    act = deepq.learn(
        env,
        q_func=model,
        lr=1e-3,
        max_timesteps=10000000,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        print_freq=10,
        callback=callback,
    )
    print("Saving model to kuka_cam_model.pkl")
    act.save("kuka_cam_model.pkl")

def main(): env = gym.make("MountainCar-v0") # Enabling layer_norm here is import for parameter space noise! model = deepq.models.mlp([64], layer_norm=True) act = deepq.learn( env, q_func=model, lr=1e-3, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.1, print_freq=10, param_noise=True ) print("Saving model to mountaincar_model.pkl") act.save("mountaincar_model.pkl")
def main(learning_rate, trainmaxsteps, nsimudays, npricedays):
    # Reset the graph to avoid conflicts with existing parameters
    # (not recommended if you need to reuse parameters).
    tf.reset_default_graph()
    basetraindaysets = [
        3, 7, 12, 33, 43, 62, 69, 80, 91, 97, 98, 108, 116, 123, 126, 136,
        144, 153, 161, 174, 192, 199, 225, 230, 234, 247, 261, 274, 281, 287,
        295, 305, 313, 320, 327, 332, 345, 348, 357, 350, 360
    ]
    basedatasetlen = len(basetraindaysets)
    selectdays = basetraindaysets[dataset_start:basedatasetlen:dataset_interval]
    selectdaysfortrain = []
    for iday in selectdays:
        selectdaysfortrain.append(iday)
        selectdaysfortrain.append(iday + 1)
        selectdaysfortrain.append(iday + 2)
    startday = 3
    #nsimudays = 1
    #npricedays = 1
    print('---------------selectdaysfortrain: ---------------')
    print(selectdaysfortrain)

    env = SimpleBatterySimEnv(Lmpfile, batteryEtini, startday, nsimudays,
                              npricedays, selectdaysfortrain)
    model = deepq.models.mlp([256, 256])
    act = deepq.learn(
        env,
        q_func=model,
        lr=learning_rate,
        max_timesteps=trainmaxsteps,
        buffer_size=50000,
        checkpoint_freq=100,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        print_freq=10,
        callback=callback,
    )
    print("Saving final model to %s_lr_%s_%dw.pkl" %
          (model_name, str(learning_rate), int(trainmaxsteps / 10000)))
    act.save(savedModel + "/" + model_name +
             "_lr_%s_%dw.pkl" % (str(learning_rate), int(trainmaxsteps / 10000)))

def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # TODO: changed to BeamRider since it gives larger rewards, making progress easy to see
    parser.add_argument('--env', help='environment ID', default='BeamRiderNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    # TODO: set to false because the code was complaining
    parser.add_argument('--prioritized', type=int, default=0)
    # TODO: set to false for code simplicity
    parser.add_argument('--dueling', type=int, default=0)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    args = parser.parse_args()

    logger.configure("./log/BeamRider")  # TODO: log results under BeamRider
    set_global_seeds(args.seed)
    env = make_atari(args.env)
    env = bench.Monitor(env, logger.get_dir())
    env = deepq.wrap_atari_dqn(env)
    model = deepq.models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=bool(args.dueling),
        nbins=1000,  # TODO: number of bins
    )

    act = deepq.learn(
        env,
        q_func=model,
        lr=1e-4,
        max_timesteps=args.num_timesteps,
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=bool(args.prioritized),
        min_Val=-50,  # TODO: min value of Q values
        max_Val=50,   # TODO: max value of Q values
        nbins=1000,   # TODO: number of bins
    )
    # act.save("pong_model.pkl") XXX
    env.close()

def main(args): env_name = "CartPole-v0" env = CartPoleEnv(max_ep_len=args.ep_len, seed=args.seed, append=False) model = deepq.models.mlp([64]) max_timesteps = args.steps act = deepq.learn( env, env_name=env_name, q_func=model, lr=args.lr, max_timesteps=max_timesteps, buffer_size=50000, exploration_fraction=0.0001, exploration_final_eps=0.02, print_freq=1, callback=callback, eval=False, )
def main(): env = gym.make("CartPole-v0") model = deepq.models.mlp([64]) act = deepq.learn( env, q_func=model, lr=1e-3, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, print_freq=10, callback=callback ) print("Saving model to CartPole_model.pkl") act.save("CartPole_model.pkl") if __name__ == '__main__': main()
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v0')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--bufferSize', type=int, default=10000)
    parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    parser.add_argument('--learningRate', type=float, default=5e-4)
    parser.add_argument('--epsStart', type=float, default=1.0)
    parser.add_argument('--epsEnd', type=float, default=.05)
    parser.add_argument('--learningStart', type=int, default=int(1000))
    parser.add_argument('--targetNetworkUpdate', type=int, default=int(500))
    args = parser.parse_args()

    logger.configure()
    set_global_seeds(args.seed)
    env = make_atari(args.env)
    env = bench.Monitor(env, logger.get_dir())
    env = deepq.wrap_atari_dqn(env)
    model = deepq.models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=bool(args.dueling),
    )

    act = deepq.learn(
        env,
        q_func=model,
        lr=args.learningRate,
        max_timesteps=args.num_timesteps,
        buffer_size=args.bufferSize,
        exploration_fraction=args.epsStart,
        exploration_final_eps=args.epsEnd,
        train_freq=4,
        learning_starts=args.learningStart,
        target_network_update_freq=args.targetNetworkUpdate,
        gamma=0.99,
        prioritized_replay=bool(args.prioritized),
    )
    act.save()
    env.close()

def compare_exploration_p():
    # Probability of ignoring action
    ignore_probs = [0, .01, .03, .05, .1, .2, .3]
    empirical_metrics = {}
    for p in ignore_probs:
        steps = int(3e4)
        # Create environment with this ignore prob
        env = GridworldEnv(p_ignore=p)
        # Train policy on this environment
        with tf.Graph().as_default():
            policy = deepq.learn(
                env,
                network='mlp',
                lr=1e-3,
                total_timesteps=int(steps),
                buffer_size=int(2e4),
                exploration_fraction=.1,
                exploration_final_eps=0,
                print_freq=100,
                num_layers=2,
                num_hidden=64,
                activation=tf.nn.relu,
            )
            policy.save('policy_ignore_p_{}.pkl'.format(p))
            # Run trained policy on validation set and collect metrics
            episodes = collect_metrics(policy)
        all_eps_metrics = [ep_metrics(ep_states) for ep_states in episodes]
        mac_rate, avg_ep_len = metrics(all_eps_metrics)
        empirical_metrics[p] = {'mac_rate': mac_rate, 'avg_ep_len': avg_ep_len}
        tf.reset_default_graph()
    with open('metrics.pkl', 'wb') as f:
        pickle.dump(empirical_metrics, f)
    return empirical_metrics

def main():
    args = setup_utils.setup_and_load()
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)
    main_utils.setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101

    env = main_utils.Scalarize(main_utils.make_general_env(1, seed=rank))
    print("load path:")
    print("{}/saved_models/{}.pkl".format(Config.SAVE_PATH, Config.RUN_ID))
    act = deepq.learn(
        env,
        network="conv_only",
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        total_timesteps=0,
        load_path="{}/saved_models/{}.pkl".format(Config.SAVE_PATH, Config.RUN_ID)
        # load_path="{}/ckpts/{}/model".format(Config.SAVE_PATH, Config.RUN_ID)
    )

    num_episodes = 500
    # while True:
    episode_rew_ls = []
    for i in range(num_episodes):
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            if Config.RENDER:
                env.render()
            obs, rew, done, _ = env.step(act(obs[None])[0])
            episode_rew += rew
        episode_rew_ls.append(episode_rew)
        print("Episode reward", episode_rew)
    print("Avg episode reward", np.mean(episode_rew_ls))
    print("Std episode reward", np.std(episode_rew_ls))

def main():
    env = Env(64, 64)
    env = WarpFrame(env)
    env = ScaledFloatFrame(env)
    env = FrameStack(env, 1)

    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--dueling', type=int, default=0)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    args = parser.parse_args()

    logger.configure()
    model = deepq.models.cnn_to_mlp(
        convs=[(32, 8, 4), (32, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=bool(args.dueling),
    )
    act = deepq.learn(
        env,
        q_func=model,
        lr=1e-4,
        max_timesteps=args.num_timesteps,
        buffer_size=10000,
        exploration_fraction=0.25,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=bool(args.prioritized),
        restore=True,
    )

    for _ in range(100):
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            sleep(0.01)
            env.render()
            action = act(np.array(obs)[None])[0]
            obs, rew, done, _ = env.step(action)
            episode_rew += rew
            # print(action, rew)
        print("Episode reward", episode_rew)

def main():
    global episode_rewards
    env = gym.make("LunarLander-v2")
    max_timesteps_env = env.env._spec.__dict__['tags']['wrapper_config.TimeLimit.max_episode_steps']
    model = deepq.models.mlp([256, 128], activation_fn=tf.nn.tanh)
    act = deepq.learn(
        env,
        lr=1e-5,
        q_func=model,
        target_network_update_freq=1,
        batch_size=32,
        max_timesteps=max_timesteps_env * 10000,
        buffer_size=500,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        print_freq=10,
        callback=callback,
    )
    print('done')
    return episode_rewards

def main(): env = gym.make("imgreg_train-v4") data_path = 'data/KLA/train/3.h5' env.loadData(data_path) model = deepq.models.cnn_to_mlp([(16, 8, 4), (32, 8, 4), (64, 4, 2), (32, 3, 1)], [256]) act = deepq.learn(env, q_func=model, lr=1e-3, max_timesteps=50000, checkpoint_freq=1000, buffer_size=50000, exploration_fraction=0.3, exploration_final_eps=0.02, print_freq=10, gamma=0.95, batch_size=32, load_model='models/KLA/2.pkl') print("Saving model") act.save("models/KLA/3.pkl")
def main(args): """ train and save the DeepQ model, for the mountain car problem :param args: (ArgumentParser) the input arguments """ env = gym.make("MountainCar-v0") # Enabling layer_norm here is import for parameter space noise! model = deepq.models.mlp([64], layer_norm=True) act = deepq.learn(env, q_func=model, learning_rate=1e-3, max_timesteps=args.max_timesteps, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.1, print_freq=10, param_noise=True) print("Saving model to mountaincar_model.pkl") act.save("mountaincar_model.pkl")
def main(): env = gym.make("PongNoFrameskip-v4") env = deepq.wrap_atari_dqn(env) model = deepq.learn( env, "conv_only", convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], hiddens=[256], dueling=True, total_timesteps=0 ) while True: obs, done = env.reset(), False episode_rew = 0 while not done: env.render() obs, rew, done, _ = env.step(model(obs[None])[0]) episode_rew += rew print("Episode reward", episode_rew)
def play():
    engine_configuration_channel = EngineConfigurationChannel()
    # Set the time scale to 10x
    engine_configuration_channel.set_configuration_parameters(time_scale=10.0)
    unity_env = UnityEnvironment("./ml-agents/Project/PushBlock",
                                 side_channels=[engine_configuration_channel])
    env = UnityToGymWrapper(unity_env, 0, flatten_branched=True)

    # Load the model
    model = deepq.learn(env, "mlp", total_timesteps=0, load_path="./model")

    obs = env.reset()
    obs = np.expand_dims(np.array(obs), axis=0)
    while True:
        action, _, _, _ = model.step(tf.constant(obs))
        action = action[0].numpy()
        obs, rew, done, _ = env.step(action)
        if done:
            obs = env.reset()
        # re-add the batch dimension before the next model.step call
        obs = np.expand_dims(np.array(obs), axis=0)

def main(learning_rate):
    # Reset the graph to avoid conflicts with existing parameters
    # (not recommended if you need to reuse parameters).
    tf.reset_default_graph()
    env = PowerDynSimEnv(case_files_array, dyn_config_file, rl_config_file, java_port)
    model = deepq.models.mlp([128, 128])
    act = deepq.learn(
        env,
        q_func=model,
        lr=learning_rate,
        max_timesteps=900000,
        buffer_size=50000,
        checkpoint_freq=1000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        print_freq=10,
        callback=callback,
    )
    print("Saving final model to power_model_multistep498_508_lr_%s_90w.pkl" % (str(learning_rate)))
    act.save("power_model_multistep498_508_lr_%s_90w.pkl" % (str(learning_rate)))

def main(): env = gym.make("Pendulum-v0") model = deepq.models.mlp([256,256]) exp_name = 'half_up' act = deepq.learn( env, q_func=model, lr=1e-4, max_timesteps=350000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, print_freq=10, exp_name=exp_name, callback=callback ) print("Saving model to pendulum_model.pkl") act.save("pendulum_model_{}.pkl".format(exp_name))
def main(): env = gym.make("PongNoFrameskip-v3") env = ScaledFloatFrame(wrap_dqn(env)) model = deepq.models.cnn_to_mlp(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], hiddens=[256], dueling=True) act = deepq.learn(env, q_func=model, lr=1e-4, max_timesteps=2000000, buffer_size=10000, exploration_fraction=0.1, exploration_final_eps=0.01, train_freq=4, learning_starts=10000, target_network_update_freq=1000, gamma=0.99, prioritized_replay=True) act.save("pong_model.pkl") env.close()
def main():
    env = gym.make('State-Based-MDP-Navigation-2d-Map0-Goal0-KnownGoalPosition-v0')
    #env = gym.make('Image-Based-Navigation-2d-Map0-Goal0-v0')
    env.action_space = spaces.Discrete(100)
    model = deepq.models.mlp([64])
    act = deepq.learn(
        env,
        q_func=model,
        lr=1e-3,
        max_timesteps=100000,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        print_freq=1,
        callback=callback,
    )
    print("Saving model to cartpole_model.pkl")
    act.save("cartpole_model.pkl")

def main():
    # create the environment
    env = gym.make("balancebot-v0")  # <-- this we need to create
    # create the learning agent
    model = deepq.models.mlp([16, 16])
    # train the agent on the environment
    act = deepq.learn(
        env,
        q_func=model,
        lr=1e-3,
        max_timesteps=200000,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        print_freq=10,
        callback=callback,
    )
    # save trained model
    act.save("balance.pkl")

def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='snake-single-v0')
    parser.add_argument('--seed', help='Random seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6)
    parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    parser.add_argument('--checkpoint-freq', type=int, default=10000)
    parser.add_argument('--checkpoint-path', type=str, default='./single-dqn/')
    args = parser.parse_args()

    # make_session first argument: num of cpus
    with U.make_session(8):
        env = gym.make(args.env)
        env = FrameStack(env, 4)
        print("observation space is ", env.observation_space)
        print("action space is ", env.action_space)
        model = deepq.models.cnn_to_mlp(
            convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
            hiddens=[512],
            dueling=bool(args.dueling),
        )
        act = deepq.learn(
            env,
            q_func=model,
            lr=1e-4,
            max_timesteps=10000000,
            buffer_size=50000,
            train_freq=4,
            exploration_fraction=0.1,
            exploration_final_eps=0.02,
            gamma=0.99,
            print_freq=10,
            checkpoint_freq=args.checkpoint_freq,
            checkpoint_path=args.checkpoint_path,
            param_noise=True,
        )
        act.save("../models/single-dqn/single_dqn_model_final.pkl")

def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env_id', help='environment ID', default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=0)
    #parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    args = parser.parse_args()

    logger.configure()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)
    env = bench.Monitor(env, logger.get_dir())
    env.seed(args.seed)

    v_func = deepq.models.mlp(hiddens=[200, 200])
    l_func = deepq.models.mlp(hiddens=[200, 200])
    mu_func = deepq.models.mlp(hiddens=[200, 200])
    stddev = 0.3
    nb_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                sigma=float(stddev) * np.ones(nb_actions))
    act = deepq.learn(
        env,
        models=[mu_func, v_func, l_func],
        action_noise=action_noise,
        lr=1e-3,
        max_timesteps=args.num_timesteps,
        buffer_size=100000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=bool(args.prioritized),
    )
    # act.save("pong_model.pkl") XXX
    env.close()

def main(): env = gym.make("imgreg_train-v3") data_path = 'data/train.h5' env.loadData(data_path) model = deepq.models.cnn_to_mlp([(16, 8, 4), (16, 4, 1), (32, 4, 2)], [256]) act = deepq.learn( env, q_func=model, lr = 1e-3, max_timesteps = 5000000, checkpoint_freq = 1000, buffer_size = 50000, exploration_fraction = 0.2, exploration_final_eps = 0.02, print_freq = 10, gamma = 0.95, batch_size = 64, load_model = None ) print("Saving model") act.save("models/iter_6.1.pkl")
def play(self):
    config = self.config
    env = self.get_player()
    model = deepq.learn(
        env,
        config['MODEL']['TYPE'],
        **config['MODEL']['ARGS'],
        **config['LOAD_PATH'],
        dueling=config['DUELING'],
        total_timesteps=0,
    )

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            env.render()
            obs, rew, done, _ = env.step(model(obs[None])[0])
            episode_rew += rew
        print("Episode reward", episode_rew)

def main(env_name, seed, exp_name):
    data_dir = osp.join(
        osp.dirname(osp.dirname(osp.dirname(osp.dirname(osp.dirname(osp.abspath(__file__)))))),
        'spinup_data',
        datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S_") + exp_name)
    logger.configure(dir=data_dir)
    env = gym.make(env_name)
    act = deepq.learn(
        env,
        network='mlp',
        seed=seed,
        lr=1e-3,
        total_timesteps=100000,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        print_freq=10,
    )
    print("Saving model to cartpole_model.pkl")
    act.save("cartpole_model.pkl")

def main(args): """ train and save the DeepQ model, for the cartpole problem :param args: (ArgumentParser) the input arguments """ env = gym.make("CartPole-v0") model = deepq.models.mlp([64]) act = deepq.learn( env, q_func=model, learning_rate=1e-3, max_timesteps=args.max_timesteps, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, print_freq=10, callback=callback ) print("Saving model to cartpole_model.pkl") act.save("cartpole_model.pkl")
def main(): env = gym.make("imgreg_train-v5") data_paths = ['data/train/1.h5', 'data/train/2.h5', 'data/train/3.h5', 'data/train/4.h5', 'data/train/5.h5'] env.loadData(data_paths) model = deepq.models.cnn_to_mlp([(16, 8, 4), (32, 4, 2), (32, 3, 1)], [256]) act = deepq.learn( env, q_func=model, lr = 1e-3, max_timesteps = 100000, checkpoint_freq = 1000, buffer_size = 10000, exploration_fraction = 0.3, exploration_final_eps = 0.02, print_freq = 10, gamma = 0.95, batch_size = 64, load_model = None ) print("Saving model") act.save("models/2.1.pkl")
def main(): env = gym.make("AirSimCarEnv-v0") model = deepq.models.mlp([64], layer_norm=True) print("\n======= Training session starts for DQN Car =======") act = deepq.learn( env, q_func=model, lr=1e-3, max_timesteps=100000, buffer_size=50000, exploration_fraction=1.0, #0.1, exploration_final_eps=0.02, print_freq=10, param_noise=True, checkpoint_freq=2, learning_starts=5, callback=callback) trainedModel = "car.pkl" print("\nSaving model to", trainedModel) act.save(trainedModel)
def learn(self):
    def callback(lcl, _glb):
        is_solved = lcl['t'] > 100 and sum(lcl['episode_rewards'][-101:-1]) / 100 >= 199
        return is_solved

    self.logger.configure()
    config = self.config
    env = self.get_player(train=True)
    model = deepq.learn(
        env,
        config['MODEL']['TYPE'],
        **config['MODEL']['ARGS'],
        **config['LOAD_PATH'],
        dueling=config['DUELING'],
        lr=config['LEARNING_RATE'],
        total_timesteps=config['TOTAL_TIMESTEPS'],
        buffer_size=config['BUFFER_SIZE'],
        exploration_fraction=config['EXPLORATION_FRACTION'],
        exploration_final_eps=config['EXPLORATION_FINAL_EPS'],
        train_freq=config['TRAIN_FREQ'],
        learning_starts=config['NO_OP_STEPS'],
        target_network_update_freq=config['TARGET_UPDATE_FREQ'],
        gamma=config['GAMMA'],
        seed=config['SEED'],
        batch_size=config['BATCH_SIZE'],
        print_freq=config['PRINT_FREQ'],
        checkpoint_freq=config['CHECKPOINT_FREQ'],
        checkpoint_path=config['CHECKPOINT_PATH_PREFIX'],
        prioritized_replay=config['PRIORITIZED_REPLAY'],
        prioritized_replay_alpha=config['PRIORITIZED_REPLAY_ALPHA'],
        prioritized_replay_beta0=config['PRIORITIZED_REPLAY_BETA'],
        prioritized_replay_beta_iters=config['PRIORITIZED_REPLAY_BETA_ITERS'],
        prioritized_replay_eps=config['PRIORITIZED_REPLAY_EPS'],
        param_noise=config['PARAM_NOISE'],
        callback=callback,
    )
    model.save(config['CHECKPOINT_PATH_PREFIX'] + config['ENV_NAME'] + '.pkl')
    env.close()

def train():
    set_global_seeds(args.seed)
    directory = os.path.join(
        args.log_dir,
        '_'.join([args.env, datetime.datetime.now().strftime("%m%d%H%M")]))
    if not os.path.exists(directory):
        os.makedirs(directory)
    else:
        raise ValueError("The directory already exists...", directory)
    json.dump(vars(args), open(os.path.join(directory, 'learning_prop.json'), 'w'))

    env = gym.make(args.env)
    with tf.device(args.device):
        model = deepq.models.mlp([64])
        act, records = deepq.learn(
            env,
            q_func=model,
            lr=args.learning_rate,
            max_timesteps=args.nb_train_steps,
            buffer_size=args.buffer_size,
            exploration_fraction=args.eps_fraction,
            exploration_final_eps=args.eps_min,
            print_freq=10,
            checkpoint_freq=int(args.nb_train_steps / 10),
            learning_starts=args.nb_warmup_steps,
            gamma=args.gamma,
            callback=None,  #callback,
            epoch_steps=args.nb_epoch_steps,
            gpu_memory=args.gpu_memory,
            directory=directory,
            double_q=args.double_q,
            nb_test_steps=args.nb_test_steps,
        )
        print("Saving model to model.pkl")
        act.save(os.path.join(directory, "model.pkl"))
    plot(records, directory)

def main():
    parser = argsparser()
    args = parser.parse_args()
    logger.configure(dir=args.log_dir)
    env = gym.make(args.env_id)
    env.seed(args.seed)
    set_global_seeds(args.seed)
    model = deepq.models.mlp([64])
    act = deepq.learn(
        env,
        q_func=model,
        lr=1e-3,
        max_timesteps=1000000,
        buffer_size=50000,
        exploration_fraction=0.01,
        exploration_final_eps=0.02,
        print_freq=10,
        #callback=callback
    )
    print("Saving model to cartpole_model.pkl")
    act.save("cartpole_model.pkl")

def main():
    # Load orderbook
    orderbook = Orderbook()
    orderbook.loadFromEvents('ob-1-small.tsv')

    env = gym.make("ctc-executioner-v0")
    env.configure(orderbook)

    model = deepq.models.cnn_to_mlp(convs=[(1, 10, 20)], hiddens=[200])
    act = deepq.learn(
        env,
        q_func=model,
        lr=1e-4,
        max_timesteps=100000,
        buffer_size=5000,
        exploration_fraction=0.1,
        exploration_final_eps=0.1,
        target_network_update_freq=1,
        print_freq=10,
    )
    print("Saving model as ctc-executioner-v0.pkl")
    act.save("ctc-executioner-v0.pkl")

def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--num-timesteps', type=int, default=int(3 * 10e6))
    args = parser.parse_args()

    logger.configure()
    set_global_seeds(args.seed)

    import time
    current_milli_time = lambda: int(round(time.time() * 1000))

    env = Env(64, 44)
    env = WarpFrame(env)
    env = ScaledFloatFrame(env)
    model = deepq.models.cnn_to_mlp(
        convs=[(16, 8, 4), (16, 4, 2), (32, 3, 1)],
        hiddens=[256],
        dueling=bool(args.dueling),
    )
    act = deepq.learn(
        env,
        q_func=model,
        lr=5e-4,
        max_timesteps=args.num_timesteps,
        buffer_size=100000,
        exploration_fraction=0.05,
        exploration_final_eps=0.01,
        train_freq=2,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        print_freq=30,
        checkpoint_freq=200000,
        prioritized_replay=bool(args.prioritized),
    )
    act.save("draw_model.pkl")
    env.close()

def main(): env = gym.make("PongNoFrameskip-v4") env = ScaledFloatFrame(wrap_dqn(env)) model = deepq.models.cnn_to_mlp( convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], hiddens=[256], dueling=True ) act = deepq.learn( env, q_func=model, lr=1e-4, max_timesteps=2000000, buffer_size=10000, exploration_fraction=0.1, exploration_final_eps=0.01, train_freq=4, learning_starts=10000, target_network_update_freq=1000, gamma=0.99, prioritized_replay=True ) act.save("pong_model.pkl") env.close()
env = gym.make("CartPole-v0") # # set up the logger # logdir = '/tmp/experiments/discrete/DQN/' # logger.configure(os.path.abspath(logdir)) # print("logger.get_dir(): ", logger.get_dir() and os.path.join(logger.get_dir())) # models = [[64], [64,64], [128,128], [256,256]] models = [[64], [128], [64,64], [128,128], [256,256]] for m in models: g = tf.Graph() with g.as_default(): # tf.reset_default_graph() act = deepq.learn( env, q_func=deepq.models.mlp(m), lr=1e-3, max_timesteps=10000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, print_freq=10, callback=callback, outdir="/tmp/experiments/discrete/DQN/"+str(m) ) act.save("models/cartpole_model_DQN_"+str(m)+".pkl") # print("Saving model to cartpole_model.pkl") # act.save("cartpole_model.pkl")
import gym

from baselines import deepq

env = gym.make("MountainCar-v0")
# env = gym.make("MountainCarContinuous-v0")
print(env.action_space.n)

# Enabling layer_norm here is important for parameter space noise!
model = deepq.models.mlp([64], layer_norm=True)

act = deepq.learn(
    env,
    q_func=model,
    lr=1e-3,
    max_timesteps=100000,
    buffer_size=50000,
    exploration_fraction=0.1,
    exploration_final_eps=0.1,
    print_freq=10,
    param_noise=False,
)
print("Saving model to mountaincar_model.pkl")
act.save("mountaincar_model.pkl")
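
# A usage sketch for the model saved above, mirroring the enjoy loops earlier
# in this file and assuming the old-style deepq.load helper:
def enjoy():
    env = gym.make("MountainCar-v0")
    act = deepq.load("mountaincar_model.pkl")
    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            env.render()
            obs, rew, done, _ = env.step(act(obs[None])[0])
            episode_rew += rew
        print("Episode reward", episode_rew)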