def main():
    """Reproduce DiscreteCQL on an Atari offline dataset."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=str, default='breakout-medium-v0')
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--gpu', type=int)
    args = parser.parse_args()

    dataset, env = d3rlpy.datasets.get_atari(args.dataset)

    # make the run reproducible
    d3rlpy.seed(args.seed)
    env.seed(args.seed)

    # hold out 20% of episodes for the offline evaluation scorers
    _, test_episodes = train_test_split(dataset, test_size=0.2)

    algo = d3rlpy.algos.DiscreteCQL(
        optim_factory=d3rlpy.models.optimizers.AdamFactory(eps=1e-2 / 32),
        scaler='pixel',
        n_frames=4,  # frame stacking for Atari observations
        q_func_factory='qr',
        use_gpu=args.gpu,
    )

    metrics = {
        'environment': d3rlpy.metrics.evaluate_on_environment(env, epsilon=0.001),
        'value_scale': d3rlpy.metrics.average_value_estimation_scorer,
    }

    algo.fit(
        dataset.episodes,
        eval_episodes=test_episodes,
        n_steps=50000000,
        n_steps_per_epoch=10000,
        scorers=metrics,
        experiment_name=f"DiscreteCQL_{args.dataset}_{args.seed}",
    )
def main():
    """Train DiscreteCQL on an Atari offline dataset (epoch-based API)."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=str, default='breakout-medium-v0')
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--gpu', action='store_true')
    args = parser.parse_args()

    d3rlpy.seed(args.seed)

    dataset, env = d3rlpy.datasets.get_atari(args.dataset)

    # hold out 20% of episodes for offline evaluation metrics
    _, test_episodes = train_test_split(dataset, test_size=0.2)

    algo = d3rlpy.algos.DiscreteCQL(
        optim_factory=d3rlpy.models.optimizers.AdamFactory(eps=1e-2 / 32),
        scaler='pixel',
        n_frames=4,  # frame stacking for Atari observations
        q_func_factory='qr',
        use_gpu=args.gpu,
    )

    algo.fit(
        dataset.episodes,
        eval_episodes=test_episodes,
        n_epochs=2000,
        scorers={
            'env': d3rlpy.metrics.scorer.evaluate_on_environment(env, epsilon=0.001),
            'value_scale': d3rlpy.metrics.scorer.average_value_estimation_scorer,
        },
    )
def main(args):
    """Train COMBO on a pybullet offline dataset (dynamics model + policy)."""
    dataset, env = get_pybullet(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    # wrap the GPU id only when one was supplied
    device = None if args.gpu is None else Device(args.gpu)

    # stage 1: fit the ensemble dynamics model that COMBO rolls out
    dynamics = ProbabilisticEnsembleDynamics(use_gpu=device)
    dynamics.fit(
        train_episodes,
        eval_episodes=test_episodes,
        n_steps=100000,
        scorers={
            "obs_error": dynamics_observation_prediction_error_scorer,
            "reward_error": dynamics_reward_prediction_error_scorer,
        },
    )

    # stage 2: model-based offline RL on top of the learned dynamics
    algo = COMBO(q_func_factory=args.q_func, dynamics=dynamics, use_gpu=device)
    algo.fit(
        train_episodes,
        eval_episodes=test_episodes,
        n_steps=1000000,
        scorers={
            'environment': evaluate_on_environment(env),
            'td_error': td_error_scorer,
            'discounted_advantage': discounted_sum_of_advantage_scorer,
            'value_scale': average_value_estimation_scorer,
            'value_std': value_estimation_std_scorer,
            'action_diff': continuous_action_diff_scorer,
        },
    )
def main():
    """Train BEAR on a d4rl offline dataset (epoch-based API)."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=str, default='hopper-medium-v0')
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--gpu', action='store_true')
    args = parser.parse_args()

    d3rlpy.seed(args.seed)

    dataset, env = d3rlpy.datasets.get_d4rl(args.dataset)

    _, test_episodes = train_test_split(dataset, test_size=0.2)

    # VAE imitator used by BEAR's behavior model
    vae_encoder = d3rlpy.models.encoders.VectorEncoderFactory([750, 750])

    # halfcheetah tasks use a gaussian MMD kernel, everything else laplacian
    kernel = 'gaussian' if 'halfcheetah' in env.unwrapped.spec.id.lower() else 'laplacian'

    # NOTE(review): newer d3rlpy releases name this parameter ``mmd_kernel``
    # (see sibling scripts) — confirm ``kernel_type`` matches the installed version.
    algo = d3rlpy.algos.BEAR(
        imitator_encoder_factory=vae_encoder,
        temp_learning_rate=0.0,
        initial_temperature=1e-20,  # near-zero fixed temperature
        kernel_type=kernel,
        use_gpu=args.gpu,
    )

    algo.fit(
        dataset.episodes,
        eval_episodes=test_episodes,
        n_epochs=2000,
        scorers={
            'environment': d3rlpy.metrics.scorer.evaluate_on_environment(env),
            'value_scale': d3rlpy.metrics.scorer.average_value_estimation_scorer,
        },
    )
def main(args):
    """Train AWAC on a pybullet offline dataset."""
    dataset, env = get_pybullet(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    # wrap the GPU id only when one was supplied
    device = None if args.gpu is None else Device(args.gpu)

    # identical 4x256 MLP for both actor and critic
    shared_encoder = VectorEncoderFactory(hidden_units=[256, 256, 256, 256])

    algo = AWAC(
        actor_encoder_factory=shared_encoder,
        critic_encoder_factory=shared_encoder,
        q_func_factory=args.q_func,
        use_gpu=device,
    )

    algo.fit(
        train_episodes,
        eval_episodes=test_episodes,
        n_epochs=1000,
        scorers={
            'environment': evaluate_on_environment(env),
            'td_error': td_error_scorer,
            'discounted_advantage': discounted_sum_of_advantage_scorer,
            'value_scale': average_value_estimation_scorer,
            'value_std': value_estimation_std_scorer,
            'action_diff': continuous_action_diff_scorer,
        },
    )
def main():
    """Reproduce CQL on a d4rl-style dataset (step-based API)."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=str, default='hopper-medium-v0')
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--gpu', type=int)
    args = parser.parse_args()

    dataset, env = d3rlpy.datasets.get_dataset(args.dataset)

    # make the run reproducible
    d3rlpy.seed(args.seed)
    env.seed(args.seed)

    _, test_episodes = train_test_split(dataset, test_size=0.2)

    # 3x256 MLP shared by actor and critic
    mlp_encoder = d3rlpy.models.encoders.VectorEncoderFactory([256, 256, 256])

    algo = d3rlpy.algos.CQL(
        actor_encoder_factory=mlp_encoder,
        critic_encoder_factory=mlp_encoder,
        alpha_learning_rate=0.0,  # keep alpha fixed (no lagrangian tuning)
        use_gpu=args.gpu,
    )

    algo.fit(
        dataset.episodes,
        eval_episodes=test_episodes,
        n_steps=500000,
        n_steps_per_epoch=1000,
        save_interval=10,
        scorers={
            'environment': d3rlpy.metrics.evaluate_on_environment(env),
            'value_scale': d3rlpy.metrics.average_value_estimation_scorer,
        },
        experiment_name=f"CQL_{args.dataset}_{args.seed}",
    )
def main():
    """Reproduce BCQ on a d4rl-style dataset."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=str, default='hopper-medium-v0')
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--gpu', type=int)
    args = parser.parse_args()

    d3rlpy.seed(args.seed)

    dataset, env = d3rlpy.datasets.get_dataset(args.dataset)

    _, test_episodes = train_test_split(dataset, test_size=0.2)

    # wide VAE imitator, smaller actor/critic networks
    vae_encoder = d3rlpy.models.encoders.VectorEncoderFactory([750, 750])
    rl_encoder = d3rlpy.models.encoders.VectorEncoderFactory([400, 300])

    algo = d3rlpy.algos.BCQ(
        actor_encoder_factory=rl_encoder,
        critic_encoder_factory=rl_encoder,
        imitator_encoder_factory=vae_encoder,
        use_gpu=args.gpu,
    )

    algo.fit(
        dataset.episodes,
        eval_episodes=test_episodes,
        n_steps=500000,
        n_steps_per_epoch=1000,
        save_interval=10,
        scorers={
            'environment': d3rlpy.metrics.scorer.evaluate_on_environment(env),
            'value_scale': d3rlpy.metrics.scorer.average_value_estimation_scorer,
        },
        experiment_name=f"BCQ_{args.dataset}_{args.seed}",
    )
def main():
    """Train SAC online on a gym continuous-control environment."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='Hopper-v2')
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--gpu', action='store_true')
    args = parser.parse_args()

    d3rlpy.seed(args.seed)

    # separate instances for training and evaluation
    env = gym.make(args.env)
    eval_env = gym.make(args.env)

    # soft actor-critic with the usual MuJoCo hyperparameters
    algo = d3rlpy.algos.SAC(
        batch_size=256,
        actor_learning_rate=3e-4,
        critic_learning_rate=3e-4,
        temp_learning_rate=3e-4,
        use_gpu=args.gpu,
    )

    # 1M-transition FIFO replay buffer
    buffer = d3rlpy.online.buffers.ReplayBuffer(maxlen=1000000, env=env)

    algo.fit_online(
        env,
        buffer,
        eval_env=eval_env,
        n_steps=1000000,
        n_steps_per_epoch=10000,
        update_interval=1,
        update_start_step=1000,  # warm up the buffer before learning
    )
def main():
    """Train CQL on a d4rl dataset (epoch-based API)."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=str, default='hopper-medium-v0')
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--gpu', action='store_true')
    args = parser.parse_args()

    d3rlpy.seed(args.seed)

    dataset, env = d3rlpy.datasets.get_d4rl(args.dataset)

    _, test_episodes = train_test_split(dataset, test_size=0.2)

    # 3x256 MLP shared by actor and critic
    mlp_encoder = d3rlpy.models.encoders.VectorEncoderFactory([256, 256, 256])

    algo = d3rlpy.algos.CQL(
        actor_encoder_factory=mlp_encoder,
        critic_encoder_factory=mlp_encoder,
        alpha_learning_rate=0.0,  # keep alpha fixed (no lagrangian tuning)
        use_gpu=args.gpu,
    )

    algo.fit(
        dataset.episodes,
        eval_episodes=test_episodes,
        n_epochs=2000,
        scorers={
            'environment': d3rlpy.metrics.scorer.evaluate_on_environment(env),
            'value_scale': d3rlpy.metrics.scorer.average_value_estimation_scorer,
        },
    )
def main(args):
    """Behavior-clone a discrete policy from an Atari offline dataset."""
    dataset, env = get_atari(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    algo = DiscreteBC(
        n_frames=4,  # frame stacking
        scaler='pixel',
        use_gpu=args.gpu,
    )

    algo.fit(
        train_episodes,
        eval_episodes=test_episodes,
        n_epochs=100,
        scorers={'environment': evaluate_on_environment(env, epsilon=0.05)},
    )
def main(args):
    """Behavior-clone a discrete Atari policy (older constructor-epochs API)."""
    dataset, env = get_atari(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    # wrap the GPU id only when one was supplied
    device = None if args.gpu is None else Device(args.gpu)

    # older API: n_epochs is set on the constructor rather than fit()
    algo = DiscreteBC(
        n_epochs=100,
        scaler='pixel',
        use_batch_norm=False,
        use_gpu=device,
    )

    algo.fit(
        train_episodes,
        eval_episodes=test_episodes,
        scorers={'environment': evaluate_on_environment(env, epsilon=0.05)},
    )
def main(args):
    """Behavior-clone a continuous policy from a pybullet offline dataset."""
    dataset, env = get_pybullet(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    # wrap the GPU id only when one was supplied
    device = None if args.gpu is None else Device(args.gpu)

    # older API: n_epochs is set on the constructor rather than fit()
    algo = BC(n_epochs=100, use_gpu=device)

    algo.fit(
        train_episodes,
        eval_episodes=test_episodes,
        scorers={
            'environment': evaluate_on_environment(env),
            'action_diff': continuous_action_diff_scorer,
        },
    )
def main():
    """Train a quantile-regression DQN online on an Atari environment."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--gpu', action='store_true')
    args = parser.parse_args()

    # wrapped Atari environments: one for training, one for evaluation
    env = d3rlpy.envs.Atari(gym.make(args.env))
    eval_env = d3rlpy.envs.Atari(gym.make(args.env), is_eval=True)

    # make the run reproducible
    d3rlpy.seed(args.seed)
    env.seed(args.seed)
    eval_env.seed(args.seed)

    algo = d3rlpy.algos.DQN(
        batch_size=32,
        learning_rate=5e-5,
        optim_factory=d3rlpy.models.optimizers.AdamFactory(eps=1e-2 / 32),
        target_update_interval=10000 // 4,  # counted in gradient steps
        q_func_factory=d3rlpy.models.q_functions.QRQFunctionFactory(n_quantiles=200),
        scaler='pixel',
        n_frames=4,  # frame stacking
        use_gpu=args.gpu,
    )

    # 1M-transition FIFO replay buffer
    buffer = d3rlpy.online.buffers.ReplayBuffer(maxlen=1000000, env=env)

    # epsilon-greedy exploration annealed over the first 1M steps
    explorer = d3rlpy.online.explorers.LinearDecayEpsilonGreedy(
        start_epsilon=1.0,
        end_epsilon=0.01,
        duration=1000000,
    )

    algo.fit_online(
        env,
        buffer,
        explorer,
        eval_env=eval_env,
        eval_epsilon=0.001,
        n_steps=50000000,
        n_steps_per_epoch=100000,
        update_interval=4,  # one gradient step every 4 environment steps
        update_start_step=50000,
    )
def main(args):
    """Train AWR on a pybullet offline dataset (older constructor-epochs API)."""
    dataset, env = get_pybullet(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    # wrap the GPU id only when one was supplied
    device = None if args.gpu is None else Device(args.gpu)

    algo = AWR(n_epochs=100, use_gpu=device)

    algo.fit(
        train_episodes,
        eval_episodes=test_episodes,
        scorers={
            'environment': evaluate_on_environment(env),
            'td_error': td_error_scorer,
            'value_scale': average_value_estimation_scorer,
            'action_diff': continuous_action_diff_scorer,
        },
    )
def main(args):
    """Train SAC offline on a pybullet dataset (older constructor-epochs API)."""
    dataset, env = get_pybullet(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    # wrap the GPU id only when one was supplied
    device = None if args.gpu is None else Device(args.gpu)

    algo = SAC(n_epochs=100, q_func_type=args.q_func_type, use_gpu=device)

    algo.fit(
        train_episodes,
        eval_episodes=test_episodes,
        scorers={
            'environment': evaluate_on_environment(env),
            'td_error': td_error_scorer,
            'discounted_advantage': discounted_sum_of_advantage_scorer,
            'value_scale': average_value_estimation_scorer,
            'value_std': value_estimation_std_scorer,
            'action_diff': continuous_action_diff_scorer,
        },
    )
def main():
    """Reproduce PLAS-with-perturbation on a d4rl-style dataset."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=str, default='hopper-medium-v0')
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--gpu', type=int)
    args = parser.parse_args()

    dataset, env = d3rlpy.datasets.get_dataset(args.dataset)

    # make the run reproducible
    d3rlpy.seed(args.seed)
    env.seed(args.seed)

    _, test_episodes = train_test_split(dataset, test_size=0.2)

    # medium-replay datasets use a smaller VAE imitator
    env_id = env.unwrapped.spec.id.lower()
    vae_units = [128, 128] if 'medium-replay' in env_id else [750, 750]
    vae_encoder = d3rlpy.models.encoders.VectorEncoderFactory(vae_units)

    rl_encoder = d3rlpy.models.encoders.VectorEncoderFactory([400, 300])

    algo = d3rlpy.algos.PLASWithPerturbation(
        actor_encoder_factory=rl_encoder,
        critic_encoder_factory=rl_encoder,
        imitator_encoder_factory=vae_encoder,
        use_gpu=args.gpu,
    )

    algo.fit(
        dataset.episodes,
        eval_episodes=test_episodes,
        n_steps=500000,
        n_steps_per_epoch=1000,
        save_interval=10,
        scorers={
            'environment': d3rlpy.metrics.evaluate_on_environment(env),
            'value_scale': d3rlpy.metrics.average_value_estimation_scorer,
        },
        experiment_name=f"PLASWithPerturbation_{args.dataset}_{args.seed}",
    )
def main(args):
    """Train DQN offline on an Atari dataset."""
    dataset, env = get_atari(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    algo = DQN(
        n_frames=4,  # frame stacking
        q_func_type=args.q_func_type,
        scaler='pixel',
        use_gpu=args.gpu,
    )

    algo.fit(
        train_episodes,
        eval_episodes=test_episodes,
        n_epochs=100,
        scorers={
            'environment': evaluate_on_environment(env, epsilon=0.05),
            'td_error': td_error_scorer,
            'discounted_advantage': discounted_sum_of_advantage_scorer,
            'value_scale': average_value_estimation_scorer,
        },
    )
def main():
    """Reproduce BEAR on a d4rl-style dataset (step-based API)."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=str, default='hopper-medium-v0')
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--gpu', type=int)
    args = parser.parse_args()

    dataset, env = d3rlpy.datasets.get_dataset(args.dataset)

    # make the run reproducible
    d3rlpy.seed(args.seed)
    env.seed(args.seed)

    _, test_episodes = train_test_split(dataset, test_size=0.2)

    # VAE imitator used by BEAR's behavior model
    vae_encoder = d3rlpy.models.encoders.VectorEncoderFactory([750, 750])

    # halfcheetah tasks use a gaussian MMD kernel, everything else laplacian
    kernel = 'gaussian' if 'halfcheetah' in env.unwrapped.spec.id.lower() else 'laplacian'

    algo = d3rlpy.algos.BEAR(
        imitator_encoder_factory=vae_encoder,
        temp_learning_rate=0.0,
        initial_temperature=1e-20,  # near-zero fixed temperature
        mmd_kernel=kernel,
        use_gpu=args.gpu,
    )

    algo.fit(
        dataset.episodes,
        eval_episodes=test_episodes,
        n_steps=500000,
        n_steps_per_epoch=1000,
        save_interval=10,
        scorers={
            'environment': d3rlpy.metrics.evaluate_on_environment(env),
            'value_scale': d3rlpy.metrics.average_value_estimation_scorer,
        },
        experiment_name=f"BEAR_{args.dataset}_{args.seed}",
    )
def main(args):
    """Train DQN offline on an Atari dataset (older constructor-epochs API)."""
    dataset, env = get_atari(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    # wrap the GPU id only when one was supplied
    device = None if args.gpu is None else Device(args.gpu)

    algo = DQN(
        n_epochs=100,
        q_func_type=args.q_func_type,
        scaler='pixel',
        use_batch_norm=False,
        use_gpu=device,
    )

    algo.fit(
        train_episodes,
        eval_episodes=test_episodes,
        scorers={
            'environment': evaluate_on_environment(env, epsilon=0.05),
            'td_error': td_error_scorer,
            'discounted_advantage': discounted_sum_of_advantage_scorer,
            'value_scale': average_value_estimation_scorer,
        },
    )
def main():
    """Reproduce MOPO: fit a dynamics ensemble, then train the policy on it."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=str, default='hopper-medium-v0')
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--gpu', type=int)
    args = parser.parse_args()

    # dataset loaded without masks
    dataset, env = d3rlpy.datasets.get_dataset(args.dataset)

    # make the run reproducible
    d3rlpy.seed(args.seed)
    env.seed(args.seed)

    _, test_episodes = train_test_split(dataset, test_size=0.2)

    # stage 1: probabilistic ensemble dynamics model
    dynamics_encoder = d3rlpy.models.encoders.VectorEncoderFactory(
        hidden_units=[200, 200, 200, 200],
        activation='swish',
    )
    dynamics_optim = d3rlpy.models.optimizers.AdamFactory(weight_decay=2.5e-5)
    dynamics = d3rlpy.dynamics.ProbabilisticEnsembleDynamics(
        encoder_factory=dynamics_encoder,
        optim_factory=dynamics_optim,
        learning_rate=1e-3,
        n_ensembles=5,
        use_gpu=args.gpu,
    )
    dynamics.fit(
        dataset.episodes,
        eval_episodes=test_episodes,
        n_steps=100000,
        scorers={
            "obs_error": dynamics_observation_prediction_error_scorer,
            "rew_error": dynamics_reward_prediction_error_scorer,
        },
    )

    # per-dataset hyperparameters with a (5, 1) fallback
    if args.dataset in PARAMETER_TABLE:
        rollout_horizon, lam = PARAMETER_TABLE[args.dataset]
    else:
        rollout_horizon, lam = 5, 1

    # stage 2: MOPO policy optimization against the learned dynamics
    algo = d3rlpy.algos.MOPO(
        dynamics=dynamics,
        rollout_horizon=rollout_horizon,
        lam=lam,
        use_gpu=args.gpu,
    )
    algo.fit(
        dataset.episodes,
        eval_episodes=test_episodes,
        n_steps=500000,
        n_steps_per_epoch=1000,
        save_interval=10,
        scorers={
            "environment": d3rlpy.metrics.evaluate_on_environment(env),
            'value_scale': d3rlpy.metrics.average_value_estimation_scorer,
        },
        experiment_name=f"MOPO_{args.dataset}_{args.seed}",
    )
def main():
    """Reproduce COMBO: fit a dynamics ensemble, then train the policy on it."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=str, default='hopper-medium-v0')
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--gpu', type=int)
    args = parser.parse_args()

    # dataset loaded without masks
    dataset, env = d3rlpy.datasets.get_dataset(args.dataset)

    # make the run reproducible
    d3rlpy.seed(args.seed)
    env.seed(args.seed)

    _, test_episodes = train_test_split(dataset, test_size=0.2)

    # stage 1: probabilistic ensemble dynamics model
    dynamics_encoder = d3rlpy.models.encoders.VectorEncoderFactory(
        hidden_units=[200, 200, 200, 200],
        activation='swish',
    )
    dynamics_optim = d3rlpy.models.optimizers.AdamFactory(weight_decay=2.5e-5)
    dynamics = d3rlpy.dynamics.ProbabilisticEnsembleDynamics(
        encoder_factory=dynamics_encoder,
        optim_factory=dynamics_optim,
        learning_rate=1e-3,
        n_ensembles=5,
        use_gpu=args.gpu,
    )
    dynamics.fit(
        dataset.episodes,
        eval_episodes=test_episodes,
        n_steps=100000,
        scorers={
            "obs_error": dynamics_observation_prediction_error_scorer,
            "rew_error": dynamics_reward_prediction_error_scorer,
        },
    )

    # per-dataset conservative weight; the order of these checks matters:
    # 'halfcheetah' and 'medium-expert' must be tested before plain 'medium'
    if 'halfcheetah' in args.dataset:
        conservative_weight = 0.5
    elif 'medium-expert' in args.dataset:
        conservative_weight = 5.0
    elif 'random' in args.dataset or 'medium-replay' in args.dataset:
        conservative_weight = 1.0 if 'hopper' in args.dataset else 0.5
    elif 'medium' in args.dataset:
        conservative_weight = 5.0
    else:
        conservative_weight = 1.0

    # walker2d is trained with smaller learning rates
    if 'walker2d' in args.dataset:
        critic_learning_rate, actor_learning_rate = 1e-4, 1e-5
    else:
        critic_learning_rate, actor_learning_rate = 3e-4, 1e-4

    # stage 2: COMBO policy optimization against the learned dynamics
    mlp_encoder = d3rlpy.models.encoders.VectorEncoderFactory([256, 256, 256])
    algo = d3rlpy.algos.COMBO(
        dynamics=dynamics,
        actor_encoder_factory=mlp_encoder,
        critic_encoder_factory=mlp_encoder,
        actor_learning_rate=actor_learning_rate,
        critic_learning_rate=critic_learning_rate,
        temp_learning_rate=actor_learning_rate,  # temperature shares the actor lr
        conservative_weight=conservative_weight,
        use_gpu=args.gpu,
    )
    algo.fit(
        dataset.episodes,
        eval_episodes=test_episodes,
        n_steps=500000,
        n_steps_per_epoch=1000,
        save_interval=10,
        scorers={
            "environment": d3rlpy.metrics.evaluate_on_environment(env),
            'value_scale': d3rlpy.metrics.average_value_estimation_scorer,
        },
        experiment_name=f"COMBO_{args.dataset}_{args.seed}",
    )