def experiment(variant): expl_env = gym.make("CartPole-v0") eval_env = gym.make("CartPole-v0") obs_dim = expl_env.observation_space.low.size action_dim = eval_env.action_space.n qf = Mlp(hidden_sizes=[32, 32], input_size=obs_dim, output_size=action_dim) target_qf = Mlp(hidden_sizes=[32, 32], input_size=obs_dim, output_size=action_dim) qf_criterion = nn.MSELoss() eval_policy = ArgmaxDiscretePolicy(qf) expl_policy = PolicyWrappedWithExplorationStrategy( EpsilonGreedy(expl_env.action_space), eval_policy) eval_path_collector = MdpPathCollector(eval_env, eval_policy) expl_path_collector = MdpPathCollector(expl_env, expl_policy) trainer = DQNTrainer(qf=qf, target_qf=target_qf, qf_criterion=qf_criterion, **variant["trainer_kwargs"]) replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_env) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant["algorithm_kwargs"]) algorithm.to(ptu.device) algorithm.train()
def experiment(variant):
    expl_env = gym.make('GoalGridworld-v0')
    eval_env = gym.make('GoalGridworld-v0')
    obs_dim = expl_env.observation_space.spaces['observation'].low.size
    goal_dim = expl_env.observation_space.spaces['desired_goal'].low.size
    action_dim = expl_env.action_space.n

    # Goal-conditioned Q-networks: the observation and goal are concatenated.
    qf = FlattenMlp(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    target_qf = FlattenMlp(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    eval_policy = ArgmaxDiscretePolicy(qf)
    exploration_strategy = EpsilonGreedy(action_space=expl_env.action_space)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=exploration_strategy,
        policy=eval_policy,
    )
    replay_buffer = ObsDictRelabelingBuffer(
        env=eval_env,
        **variant['replay_buffer_kwargs'],
    )
    observation_key = 'observation'
    desired_goal_key = 'desired_goal'
    eval_path_collector = GoalConditionedPathCollector(
        eval_env,
        eval_policy,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )
    expl_path_collector = GoalConditionedPathCollector(
        expl_env,
        expl_policy,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )
    trainer = DQNTrainer(qf=qf, target_qf=target_qf, **variant['trainer_kwargs'])
    # Wrap the DQN trainer with HER so sampled batches are goal-relabeled.
    trainer = HERTrainer(trainer)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
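# Configuration sketch (illustrative, not from the original source): the HER
# setup above pulls "replay_buffer_kwargs", "trainer_kwargs", and "algo_kwargs"
# from `variant`. The keys below follow rlkit's ObsDictRelabelingBuffer / HER
# examples; treat the specific values as assumptions.
her_dqn_variant = dict(
    replay_buffer_kwargs=dict(
        max_size=int(1e6),
        # Fraction of minibatch goals relabeled with goals achieved later in the
        # same rollout, vs. freshly sampled environment goals.
        fraction_goals_rollout_goals=0.2,
        fraction_goals_env_goals=0.0,
        observation_key='observation',
        desired_goal_key='desired_goal',
        achieved_goal_key='achieved_goal',
    ),
    trainer_kwargs=dict(discount=0.99, learning_rate=1e-3),
    algo_kwargs=dict(
        num_epochs=100,
        num_eval_steps_per_epoch=1000,
        num_trains_per_train_loop=1000,
        num_expl_steps_per_train_loop=1000,
        min_num_steps_before_training=1000,
        max_path_length=50,
        batch_size=128,
    ),
)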
def experiment(variant):
    # NOTE: this snippet assumes the environments (expl_env, eval_env, symb_env),
    # symbolic_action_space, hierarchical_rollout, and the dimensions
    # (obs_dim, channels, action_dim) are constructed earlier; that setup is
    # elided here.
    qf = CNN(
        input_width=obs_dim,
        input_height=obs_dim,
        input_channels=channels,
        output_size=action_dim,
        kernel_sizes=[8, 4],
        n_channels=[16, 32],
        strides=[4, 2],
        paddings=[0, 0],
        hidden_sizes=[256],
    )
    target_qf = CNN(
        input_width=obs_dim,
        input_height=obs_dim,
        input_channels=channels,
        output_size=action_dim,
        kernel_sizes=[8, 4],
        n_channels=[16, 32],
        strides=[4, 2],
        paddings=[0, 0],
        hidden_sizes=[256],
    )
    qf_criterion = nn.MSELoss()
    eval_learner_policy = ArgmaxDiscretePolicy(qf)
    expl_learner_policy = PolicyWrappedWithExplorationStrategy(
        AnnealedEpsilonGreedy(symbolic_action_space, anneal_rate=variant["anneal_rate"]),
        eval_learner_policy,
    )
    eval_policy = LearnPlanPolicy(eval_learner_policy)
    expl_policy = LearnPlanPolicy(expl_learner_policy)
    eval_path_collector = MdpPathCollector(eval_env, eval_policy, rollout=hierarchical_rollout)
    expl_path_collector = MdpPathCollector(expl_env, expl_policy, rollout=hierarchical_rollout)
    trainer = DQNTrainer(
        qf=qf,
        target_qf=target_qf,
        qf_criterion=qf_criterion,
        **variant["trainer_kwargs"],
    )
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], symb_env)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    # Select a different success_function for different tasks.
    expl_env = GymCraftingEnv(state_obs=True, few_obj=True, success_function=eval_eatbread)
    eval_env = GymCraftingEnv(state_obs=True, few_obj=True, success_function=eval_eatbread)
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    qf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=action_dim,
    )
    target_qf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=action_dim,
    )
    qf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space),
        eval_policy,
    )
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, expl_policy)
    trainer = DQNTrainer(
        qf=qf,
        target_qf=target_qf,
        qf_criterion=qf_criterion,
        **variant['trainer_kwargs'],
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant): """Run the experiment.""" eval_env = gym.make('CartPole-v0') obs_dim = eval_env.observation_space.low.size action_dim = eval_env.action_space.n # Collect data. print('Collecting data...') data = [] while len(data) < variant['offline_data_size']: done = False s = eval_env.reset() while not done: a = np.random.randint(action_dim) n, r, done, _ = eval_env.step(a) one_hot_a = np.zeros(action_dim) one_hot_a[a] = 1 data.append((s, one_hot_a, r, n, done)) s = n if len(data) == variant['offline_data_size']: break qf = Mlp( hidden_sizes=[32, 32], input_size=obs_dim, output_size=action_dim, ) target_qf = Mlp( hidden_sizes=[32, 32], input_size=obs_dim, output_size=action_dim, ) qf_criterion = nn.MSELoss() eval_policy = ArgmaxDiscretePolicy(qf) eval_path_collector = MdpPathCollector( eval_env, eval_policy, ) trainer = DQNTrainer( qf=qf, target_qf=target_qf, qf_criterion=qf_criterion, **variant['trainer_kwargs'] ) offline_data = OfflineDataStore(data=data,) algorithm = TorchOfflineRLAlgorithm( trainer=trainer, evaluation_env=eval_env, evaluation_data_collector=eval_path_collector, offline_data=offline_data, **variant['algorithm_kwargs'] ) algorithm.to(ptu.device) algorithm.train()
def experiment(variant):
    args = getArgs()
    # expl_env = NormalizedBoxEnv(environment(args))
    expl_env = environment(args, 'dqn')
    eval_env = environment(args, 'dqn')
    # expl_env.render()
    obs_dim = expl_env.get_obsdim()
    action_dim = expl_env.action_space.n

    qf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=action_dim,
    )
    target_qf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=action_dim,
    )
    qf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space),
        eval_policy,
    )
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, expl_policy)
    trainer = DQNTrainer(
        qf=qf,
        target_qf=target_qf,
        qf_criterion=qf_criterion,
        **variant['trainer_kwargs'],
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    # common.initialise(variant)
    setup_logger("name-of-experiment", variant=variant)
    ptu.set_gpu_mode(True)

    expl_env = gym.make(variant["env_name"], seed=5)
    eval_env = gym.make(variant["env_name"], seed=5)

    ANCILLARY_GOAL_SIZE = 16
    # Size of embedding (UVFA/multihead) for the goal-space direction passed to the controller.
    SYMBOLIC_ACTION_SIZE = 12
    GRID_SIZE = 31

    action_dim = ANCILLARY_GOAL_SIZE
    symbolic_action_space = gym.spaces.Discrete(ANCILLARY_GOAL_SIZE)
    symb_env = gym.make(variant["env_name"])
    symb_env.action_space = symbolic_action_space

    (
        obs_shape,
        obs_space,
        action_space,
        n,
        mlp,
        channels,
        fc_input,
    ) = common.get_spaces(expl_env)

    qf = Mlp(
        input_size=n,
        output_size=action_dim,
        hidden_sizes=[256, 256],
        init_w=variant["init_w"],
        b_init_value=variant["b_init_value"],
    )
    target_qf = Mlp(
        input_size=n,
        output_size=action_dim,
        hidden_sizes=[256, 256],
        init_w=variant["init_w"],
        b_init_value=variant["b_init_value"],
    )

    planner = ENHSPPlanner()

    # NOTE: hard-coded checkpoint paths from the original experiment setup.
    # collect
    filepath = "/home/achester/anaconda3/envs/goal-gen/.guild/runs/e77c75eed02e4b38a0a308789fbfcbd8/data/params.pkl"  # collect
    with open(filepath, "rb") as openfile:
        while True:
            try:
                policies = pickle.load(openfile)
            except EOFError:
                break
    loaded_collect_policy = policies["exploration/policy"]
    loaded_collect_policy.rnn_hxs = loaded_collect_policy.rnn_hxs[0].unsqueeze(0)

    eval_collect = CraftController(loaded_collect_policy, n=GRID_SIZE)
    expl_collect = CraftController(loaded_collect_policy, n=GRID_SIZE)

    # other
    # filepath = "/home/achester/anaconda3/envs/goal-gen/.guild/runs/cf5c31afe0724acd8f6398d77a80443e/data/params.pkl"  # other (RC 28)
    filepath = "/home/achester/anaconda3/envs/goal-gen/.guild/runs/4989f4bcbadb4ac58c3668c068d63225/data/params.pkl"  # other (RC 55)
    # filepath = "/home/achester/Documents/misc/craft-model/params.pkl"
    with open(filepath, "rb") as openfile:
        while True:
            try:
                policies = pickle.load(openfile)
            except EOFError:
                break
    loaded_other_policy = policies["exploration/policy"]
    loaded_other_policy.rnn_hxs = loaded_other_policy.rnn_hxs[0].unsqueeze(0)

    eval_other = CraftController(loaded_other_policy, n=GRID_SIZE)
    expl_other = CraftController(loaded_other_policy, n=GRID_SIZE)

    eval_controller = PretrainedController([eval_collect, eval_other])
    expl_controller = PretrainedController([expl_collect, expl_other])

    function_env = gym.make(variant["env_name"])

    qf_criterion = nn.MSELoss()
    if variant["softmax"]:
        eval_learner = SoftmaxDiscretePolicy(qf, variant["temperature"])
    else:
        eval_learner = ArgmaxDiscretePolicy(qf)

    expl_learner = PolicyWrappedWithExplorationStrategy(
        LinearEpsilonGreedy(symbolic_action_space, anneal_schedule=variant["anneal_schedule"]),
        eval_learner,
    )

    eval_policy = LearnPlanPolicy(
        eval_learner,
        planner,
        eval_controller,
        num_processes=1,
        vectorised=False,
        env=function_env,
    )
    expl_policy = LearnPlanPolicy(
        expl_learner,
        planner,
        expl_controller,
        num_processes=1,
        vectorised=False,
        env=function_env,
    )

    eval_path_collector = IntermediatePathCollector(
        eval_env,
        eval_policy,
        rollout=intermediate_rollout,
        gamma=1,
        render=variant["render"],
        single_plan_discounting=variant["trainer_kwargs"]["single_plan_discounting"],
        experience_interval=variant["experience_interval"],
    )
    expl_path_collector = IntermediatePathCollector(
        expl_env,
        expl_policy,
        rollout=intermediate_rollout,
        gamma=variant["trainer_kwargs"]["discount"],
        render=variant["render"],
        single_plan_discounting=variant["trainer_kwargs"]["single_plan_discounting"],
        experience_interval=variant["experience_interval"],
    )

    if variant["double_dqn"]:
        trainer = DoubleDQNTrainer(
            qf=qf,
            target_qf=target_qf,
            qf_criterion=qf_criterion,
            **variant["trainer_kwargs"],
        )
    else:
        trainer = DQNTrainer(
            qf=qf,
            target_qf=target_qf,
            qf_criterion=qf_criterion,
            **variant["trainer_kwargs"],
        )

    replay_buffer = PlanReplayBuffer(variant["replay_buffer_size"], symb_env)
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant): setup_logger("name-of-experiment", variant=variant) ptu.set_gpu_mode(True) expl_env = gym.make(variant["env_name"]) eval_env = gym.make(variant["env_name"]) obs_dim = expl_env.observation_space.image.shape[1] channels = expl_env.observation_space.image.shape[0] action_dim = SYMBOLIC_ACTION_COUNT symbolic_action_space = gym.spaces.Discrete(SYMBOLIC_ACTION_COUNT) symb_env = gym.make(variant["env_name"]) symb_env.action_space = symbolic_action_space qf = CNN( input_width=obs_dim, input_height=obs_dim, input_channels=channels, output_size=action_dim, kernel_sizes=[8, 4], n_channels=[16, 32], strides=[4, 2], paddings=[0, 0], hidden_sizes=[256], ) target_qf = CNN( input_width=obs_dim, input_height=obs_dim, input_channels=channels, output_size=action_dim, kernel_sizes=[8, 4], n_channels=[16, 32], strides=[4, 2], paddings=[0, 0], hidden_sizes=[256], ) qf_criterion = nn.MSELoss() eval_policy = LearnPlanPolicy(None) expl_policy = LearnPlanPolicy(None) eval_path_collector = MdpPathCollector(eval_env, eval_policy, rollout=hierarchical_rollout, render=variant["render"]) expl_path_collector = MdpPathCollector(expl_env, expl_policy, rollout=hierarchical_rollout, render=variant["render"]) trainer = DQNTrainer(qf=qf, target_qf=target_qf, qf_criterion=qf_criterion, **variant["trainer_kwargs"]) replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], symb_env) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant["algorithm_kwargs"]) algorithm.to(ptu.device) algorithm.train()
def experiment(variant):
    fov, delta, num_ch = 13, 3, 3
    expl_env = EnvBrainbow(
        '0:data/brainbow/training_sample.tif',
        coord_interval=2, img_mean=128, img_stddev=33,
        num_ch=3, fov=fov, delta=delta, seed=0,
    )
    eval_env = EnvBrainbow(
        '0:data/brainbow/training_sample.tif',
        coord_interval=2, img_mean=128, img_stddev=33,
        num_ch=3, fov=fov, delta=delta, seed=0,
    )
    obs_dim = expl_env.observation_space.low.shape  # (13, 13, 3)
    action_dim = eval_env.action_space.n  # 2

    kernel_sizes = [3, 3, 3]
    n_channels = [32, 64, 64]
    strides = [1, 1, 1]
    paddings = [0, 0, 0]
    hidden_sizes = [512]

    qf = CNN(
        input_width=fov,
        input_height=fov,
        input_channels=num_ch,
        output_size=action_dim,
        kernel_sizes=kernel_sizes,
        n_channels=n_channels,
        strides=strides,
        paddings=paddings,
        hidden_sizes=hidden_sizes,
        batch_norm_conv=True,
        batch_norm_fc=False,
    )
    target_qf = CNN(
        input_width=fov,
        input_height=fov,
        input_channels=num_ch,
        output_size=action_dim,
        kernel_sizes=kernel_sizes,
        n_channels=n_channels,
        strides=strides,
        paddings=paddings,
        hidden_sizes=hidden_sizes,
        batch_norm_conv=True,
        batch_norm_fc=False,
    )
    print(qf)
    qf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space),
        eval_policy,
    )
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, expl_policy)
    trainer = DQNTrainer(
        qf=qf,
        target_qf=target_qf,
        qf_criterion=qf_criterion,
        **variant['trainer_kwargs'],
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
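# Shape check (added for clarity, not part of the original code): with a
# 13x13x3 field of view and three 3x3, stride-1, padding-0 convolutions, the
# spatial size shrinks 13 -> 11 -> 9 -> 7, so the flattened feature vector
# entering the 512-unit hidden layer has 7 * 7 * 64 = 3136 elements.
def conv_out_size(size, kernels, strides, paddings):
    for k, s, p in zip(kernels, strides, paddings):
        size = (size + 2 * p - k) // s + 1
    return size

assert conv_out_size(13, [3, 3, 3], [1, 1, 1], [0, 0, 0]) == 7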
def experiment(doodad_config, variant):
    from rlkit.core import logger
    from rlkit.launchers.launcher_util import setup_logger
    print("doodad_config.base_log_dir: ", doodad_config.base_log_dir)
    from datetime import datetime
    timestamp = datetime.now().strftime('%Y_%m_%d_%H_%M_%S_%f')
    setup_logger(
        'wrapped_' + variant['env'],
        variant=variant,
        log_dir=doodad_config.base_log_dir + "/smirl/" + variant['exp_name'] + "/" + timestamp + "/",
    )
    if variant["log_comet"]:
        try:
            comet_logger = Experiment(
                api_key=launchers.config.COMET_API_KEY,
                project_name=launchers.config.COMET_PROJECT_NAME,
                workspace=launchers.config.COMET_WORKSPACE,
            )
            logger.set_comet_logger(comet_logger)
            comet_logger.set_name(str(variant['env']) + "_" + str(variant['exp_name']))
            print("variant: ", variant)
            variant['comet_key'] = comet_logger.get_key()
            comet_logger.log_parameters(variant)
            print(comet_logger)
        except Exception as inst:
            print("Not tracking training via comet.ml")
            print("Error: ", inst)

    import gym
    import torch
    from torch import nn as nn
    import rlkit.torch.pytorch_util as ptu
    from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
    from rlkit.exploration_strategies.base import \
        PolicyWrappedWithExplorationStrategy
    from rlkit.policies.argmax import ArgmaxDiscretePolicy
    from rlkit.torch.dqn.dqn import DQNTrainer
    from rlkit.data_management.env_replay_buffer import EnvReplayBuffer
    from rlkit.samplers.data_collector import MdpPathCollector
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    from surprise.utils.rendering_algorithm import TorchBatchRLRenderAlgorithm
    from surprise.envs.tetris.tetris import TetrisEnv
    from surprise.wrappers.obsresize import (
        ResizeObservationWrapper,
        RenderingObservationWrapper,
        SoftResetWrapper,
    )
    import pdb

    base_env = get_env(variant)
    base_env2 = get_env(variant)
    print("GPU_BUS_Index", variant["GPU_BUS_Index"])
    if torch.cuda.is_available() and doodad_config.use_gpu:
        print("Using the GPU for learning")
        # ptu.set_gpu_mode(True, gpu_id=doodad_config.gpu_id)
        ptu.set_gpu_mode(True, gpu_id=variant["GPU_BUS_Index"])
    else:
        print("NOT Using the GPU for learning")

    # base_env2 = RenderingObservationWrapper(base_env2)
    expl_env, network = add_wrappers(base_env, variant, device=ptu.device)
    eval_env, _ = add_wrappers(base_env2, variant, device=ptu.device, eval=True, network=network)
    if "vae_wrapper" in variant["wrappers"]:
        eval_env._network = base_env._network

    obs_dim = expl_env.observation_space.low.shape
    print("Final obs dim", obs_dim)
    action_dim = eval_env.action_space.n
    print("Action dimension: ", action_dim)

    qf, target_qf = get_network(variant["network_args"], obs_dim, action_dim)
    qf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(qf)
    if "prob_random_action" in variant:
        expl_policy = PolicyWrappedWithExplorationStrategy(
            EpsilonGreedy(
                expl_env.action_space,
                prob_random_action=variant["prob_random_action"],
                prob_end=variant["prob_end"],
                steps=variant["steps"],
            ),
            eval_policy,
        )
    else:
        expl_policy = PolicyWrappedWithExplorationStrategy(
            EpsilonGreedy(expl_env.action_space, prob_random_action=0.8, prob_end=0.05),
            eval_policy,
        )
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
        render_kwargs=variant['render_kwargs'],
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    trainer = DQNTrainer(
        qf=qf,
        target_qf=target_qf,
        qf_criterion=qf_criterion,
        **variant['trainer_kwargs'],
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    algorithm = TorchBatchRLRenderAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant): setup_logger("name-of-experiment", variant=variant) ptu.set_gpu_mode(True) expl_env = gym.make(variant["env_name"]) eval_env = gym.make(variant["env_name"]) # OLD - Taxi image env # if isinstance(expl_env.observation_space, Json): # expl_env = BoxWrapper(expl_env) # eval_env = BoxWrapper(eval_env) # # obs_shape = expl_env.observation_space.image.shape # obs_shape = expl_env.observation_space.shape # if len(obs_shape) == 3 and obs_shape[2] in [1, 3]: # convert WxHxC into CxWxH # expl_env = TransposeImage(expl_env, op=[2, 0, 1]) # eval_env = TransposeImage(eval_env, op=[2, 0, 1]) # obs_shape = expl_env.observation_space.shape # channels, obs_width, obs_height = obs_shape # action_dim = eval_env.action_space.n # qf = CNN( # input_width=obs_width, # input_height=obs_height, # input_channels=channels, # output_size=action_dim, # kernel_sizes=[8, 4], # n_channels=[16, 32], # strides=[4, 2], # paddings=[0, 0], # hidden_sizes=[256], # ) # target_qf = CNN( # input_width=obs_width, # input_height=obs_height, # input_channels=channels, # output_size=action_dim, # kernel_sizes=[8, 4], # n_channels=[16, 32], # strides=[4, 2], # paddings=[0, 0], # hidden_sizes=[256], # ) ( obs_shape, obs_space, action_space, n, mlp, channels, fc_input, ) = common.get_spaces(expl_env) qf = Mlp( input_size=n, output_size=action_space.n, hidden_sizes=[256, 256], init_w=variant["init_w"], b_init_value=variant["b_init_value"], ) target_qf = Mlp( input_size=n, output_size=action_space.n, hidden_sizes=[256, 256], init_w=variant["init_w"], b_init_value=variant["b_init_value"], ) qf_criterion = nn.MSELoss() if variant["softmax"]: eval_policy = SoftmaxDiscretePolicy(qf, variant["temperature"]) else: eval_policy = ArgmaxDiscretePolicy(qf) expl_policy = PolicyWrappedWithExplorationStrategy( LinearEpsilonGreedy(action_space, anneal_schedule=variant["anneal_schedule"]), eval_policy, ) eval_path_collector = MdpPathCollector(eval_env, eval_policy, render=variant["render"]) expl_path_collector = MdpPathCollector(expl_env, expl_policy, render=variant["render"]) trainer = DQNTrainer(qf=qf, target_qf=target_qf, qf_criterion=qf_criterion, **variant["trainer_kwargs"]) replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_env) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant["algorithm_kwargs"]) algorithm.to(ptu.device) algorithm.train()