# --- AWAC reproduction script ---
import d3rlpy

from sklearn.model_selection import train_test_split

from d3rlpy.algos import AWAC
from d3rlpy.datasets import get_pybullet
from d3rlpy.gpu import Device
from d3rlpy.models.encoders import VectorEncoderFactory
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.metrics.scorer import td_error_scorer
from d3rlpy.metrics.scorer import discounted_sum_of_advantage_scorer
from d3rlpy.metrics.scorer import average_value_estimation_scorer
from d3rlpy.metrics.scorer import value_estimation_std_scorer
from d3rlpy.metrics.scorer import continuous_action_diff_scorer


def main(args):
    dataset, env = get_pybullet(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    device = None if args.gpu is None else Device(args.gpu)

    # deeper encoder than the default, shared by actor and critic
    encoder_factory = VectorEncoderFactory(hidden_units=[256, 256, 256, 256])

    awac = AWAC(actor_encoder_factory=encoder_factory,
                critic_encoder_factory=encoder_factory,
                q_func_factory=args.q_func,
                use_gpu=device)

    awac.fit(train_episodes,
             eval_episodes=test_episodes,
             n_epochs=1000,
             scorers={
                 'environment': evaluate_on_environment(env),
                 'td_error': td_error_scorer,
                 'discounted_advantage': discounted_sum_of_advantage_scorer,
                 'value_scale': average_value_estimation_scorer,
                 'value_std': value_estimation_std_scorer,
                 'action_diff': continuous_action_diff_scorer
             })
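# NOTE: the source shows only main(); the argparse entry point is omitted.
# A minimal sketch, inferred from the attributes accessed on args -- the
# flag names and defaults here are assumptions, not from the source. The
# reproduction scripts below follow the same pattern, with --q-func-type
# in place of --q-func for the SAC script.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=str, default='hopper-bullet-mixed-v0')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--q-func', type=str, default='mean',
                        choices=['mean', 'qr', 'iqn', 'fqf'])
    parser.add_argument('--gpu', type=int)
    args = parser.parse_args()
    main(args)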
# --- COMBO reproduction script: fit a probabilistic ensemble dynamics
# model first, then plug it into model-based conservative offline RL ---
import d3rlpy

from sklearn.model_selection import train_test_split

from d3rlpy.algos import COMBO
from d3rlpy.datasets import get_pybullet
from d3rlpy.dynamics import ProbabilisticEnsembleDynamics
from d3rlpy.gpu import Device
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.metrics.scorer import td_error_scorer
from d3rlpy.metrics.scorer import discounted_sum_of_advantage_scorer
from d3rlpy.metrics.scorer import average_value_estimation_scorer
from d3rlpy.metrics.scorer import value_estimation_std_scorer
from d3rlpy.metrics.scorer import continuous_action_diff_scorer
from d3rlpy.metrics.scorer import dynamics_observation_prediction_error_scorer
from d3rlpy.metrics.scorer import dynamics_reward_prediction_error_scorer


def main(args):
    dataset, env = get_pybullet(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    device = None if args.gpu is None else Device(args.gpu)

    # stage 1: train the dynamics model used to generate synthetic rollouts
    dynamics = ProbabilisticEnsembleDynamics(use_gpu=device)
    dynamics.fit(train_episodes,
                 eval_episodes=test_episodes,
                 n_steps=100000,
                 scorers={
                     'obs_error': dynamics_observation_prediction_error_scorer,
                     'reward_error': dynamics_reward_prediction_error_scorer,
                 })

    # stage 2: train COMBO on real and model-generated transitions
    combo = COMBO(q_func_factory=args.q_func,
                  dynamics=dynamics,
                  use_gpu=device)
    combo.fit(train_episodes,
              eval_episodes=test_episodes,
              n_steps=1000000,
              scorers={
                  'environment': evaluate_on_environment(env),
                  'td_error': td_error_scorer,
                  'discounted_advantage': discounted_sum_of_advantage_scorer,
                  'value_scale': average_value_estimation_scorer,
                  'value_std': value_estimation_std_scorer,
                  'action_diff': continuous_action_diff_scorer
              })
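# a sketch, not from the source: since the dynamics model is the expensive
# first stage, it could be persisted at the end of main() for reuse across
# COMBO runs with d3rlpy's generic model serialization (the file name is a
# placeholder), and restored later via from_json()/load_model(), mirroring
# the commented-out CQL example further below:
#
#     dynamics.save_model('dynamics_model.pt')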
# --- BC reproduction script ---
import d3rlpy

from sklearn.model_selection import train_test_split

from d3rlpy.algos import BC
from d3rlpy.datasets import get_pybullet
from d3rlpy.gpu import Device
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.metrics.scorer import continuous_action_diff_scorer


def main(args):
    dataset, env = get_pybullet(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    device = None if args.gpu is None else Device(args.gpu)

    bc = BC(n_epochs=100, use_gpu=device)

    bc.fit(train_episodes,
           eval_episodes=test_episodes,
           scorers={
               'environment': evaluate_on_environment(env),
               'action_diff': continuous_action_diff_scorer
           })
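# a sketch, not from the source: at the end of main(), the cloned policy
# could be exported as TorchScript for deployment via save_policy()
# (the file name is a placeholder):
#
#     bc.save_policy('bc_policy.pt')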
# --- AWR reproduction script ---
import d3rlpy

from sklearn.model_selection import train_test_split

from d3rlpy.algos import AWR
from d3rlpy.datasets import get_pybullet
from d3rlpy.gpu import Device
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.metrics.scorer import td_error_scorer
from d3rlpy.metrics.scorer import average_value_estimation_scorer
from d3rlpy.metrics.scorer import continuous_action_diff_scorer


def main(args):
    dataset, env = get_pybullet(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    device = None if args.gpu is None else Device(args.gpu)

    awr = AWR(n_epochs=100, use_gpu=device)

    awr.fit(train_episodes,
            eval_episodes=test_episodes,
            scorers={
                'environment': evaluate_on_environment(env),
                'td_error': td_error_scorer,
                'value_scale': average_value_estimation_scorer,
                'action_diff': continuous_action_diff_scorer
            })
# --- SAC reproduction script (trained offline on the logged dataset) ---
import d3rlpy

from sklearn.model_selection import train_test_split

from d3rlpy.algos import SAC
from d3rlpy.datasets import get_pybullet
from d3rlpy.gpu import Device
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.metrics.scorer import td_error_scorer
from d3rlpy.metrics.scorer import discounted_sum_of_advantage_scorer
from d3rlpy.metrics.scorer import average_value_estimation_scorer
from d3rlpy.metrics.scorer import value_estimation_std_scorer
from d3rlpy.metrics.scorer import continuous_action_diff_scorer


def main(args):
    dataset, env = get_pybullet(args.dataset)

    d3rlpy.seed(args.seed)

    train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

    device = None if args.gpu is None else Device(args.gpu)

    sac = SAC(n_epochs=100, q_func_type=args.q_func_type, use_gpu=device)

    sac.fit(train_episodes,
            eval_episodes=test_episodes,
            scorers={
                'environment': evaluate_on_environment(env),
                'td_error': td_error_scorer,
                'discounted_advantage': discounted_sum_of_advantage_scorer,
                'value_scale': average_value_estimation_scorer,
                'value_std': value_estimation_std_scorer,
                'action_diff': continuous_action_diff_scorer
            })
# --- CQL training + FQE off-policy evaluation example ---
from sklearn.model_selection import train_test_split

from d3rlpy.datasets import get_pybullet
from d3rlpy.algos import CQL
from d3rlpy.ope import FQE
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.metrics.scorer import initial_state_value_estimation_scorer
from d3rlpy.metrics.scorer import soft_opc_scorer

dataset, env = get_pybullet('hopper-bullet-mixed-v0')

train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

# train algorithm
cql = CQL(n_epochs=100, use_gpu=True)
cql.fit(train_episodes,
        eval_episodes=test_episodes,
        scorers={
            'environment': evaluate_on_environment(env),
            'init_value': initial_state_value_estimation_scorer,
            'soft_opc': soft_opc_scorer(600)
        })

# or load the trained model
# cql = CQL.from_json('<path-to-json>/params.json')
# cql.load_model('<path-to-model>/model.pt')

# evaluate the trained policy with FQE (the original call was truncated in
# the source; the remaining arguments below are a best-guess completion)
fqe = FQE(algo=cql,
          n_epochs=200,
          q_func_factory='qr',
          learning_rate=1e-4,
          use_gpu=True)

# fitting FQE on the same split with the OPE scorers is the usual pattern
fqe.fit(train_episodes,
        eval_episodes=test_episodes,
        scorers={
            'init_value': initial_state_value_estimation_scorer,
            'soft_opc': soft_opc_scorer(600)
        })
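# a sketch, not from the source: d3rlpy scorers are plain functions of
# (algo, episodes), so the fitted FQE can also be queried directly -- the
# estimated initial state value is a handy single-number summary for
# comparing candidate policies
init_value = initial_state_value_estimation_scorer(fqe, test_episodes)
print('estimated initial state value:', init_value)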
# --- AWAC offline pretraining + online fine-tuning example ---
from sklearn.model_selection import train_test_split

from d3rlpy.algos import AWAC
from d3rlpy.datasets import get_pybullet
from d3rlpy.online.buffers import ReplayBuffer
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.metrics.scorer import discounted_sum_of_advantage_scorer
from d3rlpy.metrics.scorer import average_value_estimation_scorer

# prepare dataset and environment
dataset, env = get_pybullet('hopper-bullet-random-v0')
_, eval_env = get_pybullet('hopper-bullet-random-v0')

train_episodes, test_episodes = train_test_split(dataset)

# setup algorithm
awac = AWAC(encoder_params={'hidden_units': [256, 256, 256, 256]},
            use_gpu=True)

# pretrain offline
awac.fit(train_episodes[:10000],
         eval_episodes=test_episodes,
         n_epochs=30,
         scorers={
             'environment': evaluate_on_environment(env),
             'advantage': discounted_sum_of_advantage_scorer,
             'value_scale': average_value_estimation_scorer
         })

# fine-tune online, seeding the replay buffer with the offline episodes
# (the call was truncated in the source; the keyword arguments below are a
# best-guess completion)
buffer = ReplayBuffer(1000000, env, train_episodes[:10000])
awac.fit_online(env,
                buffer,
                eval_env=eval_env,
                n_epochs=100)
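# a sketch, not from the source: the fine-tuned policy can then be exported
# as TorchScript for deployment via save_policy() (the file name is a
# placeholder)
awac.save_policy('awac_finetuned.pt')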