class PytorchRLBaseline:
    def init(self, context: Context):
        context.info('init()')
        self.image_processor = DTPytorchWrapper()
        self.action_processor = ActionWrapper(FakeWrap())
        from model import DDPG
        self.check_gpu_available(context)
        self.model = DDPG(state_dim=self.image_processor.shape, action_dim=2, max_action=1, net_type="cnn")
        self.current_image = np.zeros((640, 480, 3))
        self.model.load("model", directory="./models")

    def check_gpu_available(self, context: Context):
        import torch
        available = torch.cuda.is_available()
        req = os.environ.get('AIDO_REQUIRE_GPU', None)
        context.info(f'torch.cuda.is_available = {available!r} AIDO_REQUIRE_GPU = {req!r}')
        context.info('init()')
        if available:
            i = torch.cuda.current_device()
            count = torch.cuda.device_count()
            name = torch.cuda.get_device_name(i)
            context.info(f'device {i} of {count}; name = {name!r}')
        else:
            if req is not None:
                msg = 'I need a GPU; bailing.'
                context.error(msg)
                raise RuntimeError(msg)

    def on_received_seed(self, data: int):
        np.random.seed(data)

    def on_received_episode_start(self, context: Context, data: EpisodeStart):
        context.info(f'Starting episode "{data.episode_name}".')

    def on_received_observations(self, data: DB20Observations):
        camera: JPGImage = data.camera
        obs = jpg2rgb(camera.jpg_data)
        self.current_image = self.image_processor.preprocess(obs)

    def compute_action(self, observation):
        action = self.model.predict(observation)
        return self.action_processor.action(action.astype(float))

    def on_received_get_commands(self, context: Context):
        pwm_left, pwm_right = self.compute_action(self.current_image)
        pwm_left = float(np.clip(pwm_left, -1, +1))
        pwm_right = float(np.clip(pwm_right, -1, +1))
        grey = RGB(0.0, 0.0, 0.0)
        led_commands = LEDSCommands(grey, grey, grey, grey, grey)
        pwm_commands = PWMCommands(motor_left=pwm_left, motor_right=pwm_right)
        commands = DB20Commands(pwm_commands, led_commands)
        context.write('commands', commands)

    def finish(self, context: Context):
        context.info('finish()')
def __init__(self, load_model=False, model_path=None):
    self.preprocessor = DTPytorchWrapper()
    self.model = DDPG(state_dim=self.preprocessor.shape, action_dim=2, max_action=1, net_type="cnn")
    self.current_image = np.zeros((640, 480, 3))
    if load_model:
        fp = model_path if model_path else "model"
        self.model.load(fp, "models", for_inference=True)
class PytorchRLTemplateAgent:
    def __init__(self, load_model=False, model_path=None):
        logger.info('PytorchRLTemplateAgent init')
        self.preprocessor = DTPytorchWrapper()
        self.model = DDPG(state_dim=self.preprocessor.shape, action_dim=2, max_action=1, net_type="cnn")
        self.current_image = np.zeros((640, 480, 3))
        if load_model:
            logger.info('PytorchRLTemplateAgent loading models')
            fp = model_path if model_path else "model"
            self.model.load(fp, "models", for_inference=True)
        logger.info('PytorchRLTemplateAgent init complete')

    def init(self, context: Context):
        context.info('init()')

    def on_received_seed(self, data: int):
        np.random.seed(data)

    def on_received_episode_start(self, context: Context, data: EpisodeStart):
        context.info(f'Starting episode "{data.episode_name}".')

    def on_received_observations(self, data: Duckiebot1Observations):
        camera: JPGImage = data.camera
        obs = jpg2rgb(camera.jpg_data)
        self.current_image = self.preprocessor.preprocess(obs)

    def compute_action(self, observation):
        # if observation.shape != self.preprocessor.transposed_shape:
        #     observation = self.preprocessor.preprocess(observation)
        action = self.model.predict(observation)
        return action.astype(float)

    def on_received_get_commands(self, context: Context):
        pwm_left, pwm_right = self.compute_action(self.current_image)
        pwm_left = float(np.clip(pwm_left, -1, +1))
        pwm_right = float(np.clip(pwm_right, -1, +1))
        grey = RGB(0.0, 0.0, 0.0)
        led_commands = LEDSCommands(grey, grey, grey, grey, grey)
        pwm_commands = PWMCommands(motor_left=pwm_left, motor_right=pwm_right)
        commands = Duckiebot1Commands(pwm_commands, led_commands)
        context.write('commands', commands)

    def finish(self, context: Context):
        context.info('finish()')
def __init__(self, obs_space, action_space, ram, writer, device, args):
    """
    :param obs_space: observation space of the environment
    :param action_space: action space of the environment
    :param ram: replay memory buffer object
    """
    self.state_dim = obs_space.shape[0]
    self.action_dim = action_space.shape[0]
    self.action_high = action_space.high
    self.action_low = action_space.low
    self.ram = ram
    self.iter = 1
    self.steps = 0
    self.gamma = args.gamma
    self.batch_size = args.batch_size
    self.tau = args.tau
    self.decay_rate = args.decay_rate
    self.eps_start = args.eps_start
    self.eps_end = args.eps_end
    self.eps_decay = args.eps_decay
    self.start_step = args.start_learning
    self.device = device
    self.noise = utils.OrnsteinUhlenbeckActionNoise(self.action_dim)
    self.writer = writer
    self.args = args

    # init networks
    target_net = DDPG(obs_space.shape, self.action_dim, args).to(device)
    learn_net = DDPG(obs_space.shape, self.action_dim, args).to(device)
    utils.hard_update(target_net, learn_net)
    self.AC = learn_net
    self.AC_T = target_net
    self.actor_optimizer = torch.optim.Adam(self.AC.actor.policyNet.parameters(), args.lr_a)
    self.critic_optimizer = torch.optim.Adam(self.AC.critic.parameters(), args.lr_c)
    self.actor = self.AC.actor
    self.target_actor = self.AC_T.actor
    self.critic = self.AC.critic
    self.target_critic = self.AC_T.critic
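The utils.hard_update(target_net, learn_net) call above presumably copies the learner's weights into the target network once at start-up, and self.tau is then typically used for Polyak (soft) target updates during training. A minimal sketch of both helpers, assuming plain torch.nn.Module networks; the soft-update helper is illustrative, not taken from this repository's utils module:

import torch

def hard_update(target: torch.nn.Module, source: torch.nn.Module) -> None:
    # Copy every parameter of `source` into `target` (done once at initialisation).
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(s_param.data)

def soft_update(target: torch.nn.Module, source: torch.nn.Module, tau: float) -> None:
    # Polyak averaging: target <- tau * source + (1 - tau) * target.
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)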
def solve(params, cis):
    # Python has dynamic typing; the line below can help IDEs with autocompletion.
    assert isinstance(cis, ChallengeInterfaceSolution)
    # After this, cis. will provide autocompletion in some IDEs (e.g. PyCharm).
    cis.info('Creating model.')
    # You have logging capabilities through the solution interface (cis);
    # the info you log can be retrieved from your submission files.

    # We get the environment from the Evaluation Engine.
    cis.info('Making environment')
    env = gym.make(params['env'])

    # === BEGIN SUBMISSION ===
    # If you created custom wrappers, you also need to copy them into this folder.
    from wrappers import NormalizeWrapper, ImgWrapper, ActionWrapper, ResizeWrapper
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    # to make the images pytorch-conv-compatible
    env = ImgWrapper(env)
    env = ActionWrapper(env)
    # You ONLY need this wrapper if you trained your policy on [speed, steering angle]
    # instead of [left speed, right speed].
    env = SteeringToWheelVelWrapper(env)

    # You have to make sure that you wrap at least the actions and observations
    # in the same way as during training, so that your model receives the same kind
    # of input it was trained on (for example, if your model was trained on grayscale
    # images and here you _don't_ convert to grayscale too, your model won't work).

    # HERE YOU NEED TO CREATE THE POLICY NETWORK, THE SAME AS IN THE TRAINING CODE.
    # If you aren't using the DDPG baseline code, make sure to copy your model into
    # the model.py file and that it has a model.predict(state) method.
    from model import DDPG

    model = DDPG(state_dim=env.observation_space.shape, action_dim=2, max_action=1, net_type="cnn")

    try:
        model.load("model", "models")

        # === END SUBMISSION ===

        # Then we make sure we have a connection with the environment and it is ready to go.
        cis.info('Reset environment')
        observation = env.reset()

        # While there is no signal of completion (simulation done), we run predictions
        # for a number of episodes; don't worry, we have control over this part.
        while True:
            # We pass the observation to our model and get an action in return.
            action = model.predict(observation)
            # We tell the environment to perform this action and get some info back, OpenAI Gym style.
            observation, reward, done, info = env.step(action)
            # Here you may want to compute some stats, like how much reward you are getting;
            # note that this reward may not be associated with the challenge score.

            # It is important to check for this flag: the Evaluation Engine will let us know
            # when we should finish. If we are not careful with this, the Evaluation Engine
            # will kill our container and we will get no score from this submission.
            if 'simulation_done' in info:
                cis.info('simulation_done received.')
                break
            if done:
                cis.info('Episode done; calling reset()')
                env.reset()
    finally:
        # Release CPU/GPU resources; let's be friendly with other users that may need them.
        cis.info('Releasing resources')
        try:
            model.close()
        except:
            msg = 'Could not call model.close():\n%s' % traceback.format_exc()
            cis.error(msg)
    cis.info('Graceful exit of solve()')
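The custom wrappers imported above are not shown. In the Duckietown DDPG baseline they are small gym wrappers roughly along these lines; the shapes and implementation details here are illustrative assumptions, not the exact baseline code:

import gym
import numpy as np
from PIL import Image

class ResizeWrapper(gym.ObservationWrapper):
    # Downscale camera frames so the CNN input stays small (target size is illustrative).
    def __init__(self, env, shape=(120, 160, 3)):
        super().__init__(env)
        self.shape = shape
        self.observation_space = gym.spaces.Box(0, 255, shape, dtype=np.uint8)

    def observation(self, obs):
        img = Image.fromarray(obs).resize((self.shape[1], self.shape[0]))
        return np.array(img)

class NormalizeWrapper(gym.ObservationWrapper):
    # Scale pixel values to [0, 1].
    def observation(self, obs):
        return obs.astype(np.float32) / 255.0

class ImgWrapper(gym.ObservationWrapper):
    # HWC -> CHW, the layout PyTorch convolutions expect.
    def observation(self, obs):
        return obs.transpose(2, 0, 1)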
gamma = 0.99   # fraction of the critic value used as the target Q value (discount factor)
var = 3.0      # variance of the action-exploration noise

if __name__ == '__main__':
    # Create environment
    env = gym.make('Pendulum-v0').unwrapped
    n_state = env.observation_space.shape[0]   # dimensionality of the state
    n_action = env.action_space.shape[0]       # dimensionality of the action
    a_limit = env.action_space.high[0]         # largest possible value of the continuous action

    # Create network
    net = DDPG(n_state=n_state, n_action=n_action, a_limit=a_limit,
               model_folder=model_folder, memory_size=memory_size,
               batch_size=batch_size, tau=tau, gamma=gamma, var=var)
    net.load()

    # Train
    reward_list = []
    for i in range(episode):
        s = env.reset()
        total_reward = 0
        for j in range(max_iter):
            # env.render()
            a = net.chooseAction(s)
            s_, r, finish, info = env.step(a)
def master_loop(env):
    logger = logging.getLogger()
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fileHandler = logging.FileHandler('./log/test.log')
    fileHandler.setFormatter(formatter)
    logger.addHandler(fileHandler)
    logger.setLevel(logging.INFO)

    s_dim = env.get_s_dim()
    a_dim = env.get_a_dim()
    a_high = env.get_a_high()
    a_low = env.get_a_low()
    # print(a_bound)
    print("s_dim: {}, a_dim: {}, a_high: {}, a_low: {}".format(s_dim, a_dim, a_high, a_low))

    ddpg = DDPG(a_dim, s_dim, a_high, a_low,
                lr_a=LR_A, lr_c=LR_C, gamma=GAMMA, tau=TAU,
                rpm_size=MEMORY_CAPACITY, batch_size=BATCH_SIZE)

    status = MPI.Status()
    start_time = time.time()
    reset_time = time.time()
    total_eps = 0
    total_step = 0
    n_step = 0
    n_eps = 0
    max_reward = -9999
    max_reward_rank = 0

    ddpg.load()

    while total_eps < MAX_EPISODES:
        data = comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status)
        source = status.Get_source()
        tag = status.Get_tag()
        if tag == REQ_ACTION:
            # action = env.action_space.sample()
            action = ddpg.choose_action(data)
            comm.send((action, total_eps, total_step), dest=source, tag=RSP_ACTION)
        elif tag == OBS_DATA:
            n_step += 1
            total_step += 1
            (s, a, r, s_, done, ep_reward, ep_step) = data
            is_done = 0.0
            if done:
                is_done = 1.0
            ddpg.store_transition(s, a, r, s_, is_done)
            if ddpg.pointer > LEARN_START and total_step % 3 == 0:
                ddpg.learn()
            if done:
                total_eps += 1
                if ep_reward > max_reward:
                    max_reward = ep_reward
                    max_reward_rank = source
                s = "eps: {:>8}, worker: {:>3}, ep_reward:{:7.4f}, max:{:7.4f}/{:>3}, step:{:4}".format(
                    total_eps, source, ep_reward, max_reward, max_reward_rank, ep_step)
                # print(s)
                logging.info(s)
                if total_eps % 500 == 0:
                    ddpg.save(total_eps)
                    interval = time.time() - reset_time
                    s = "# total_step: {:>8}, total_eps: {:>6}, eps/min: {:>6}, frame/sec: {:>6}".format(
                        total_step, total_eps, n_eps / interval * 60, n_step / interval)
                    # print(s)
                    logging.info(s)
                    n_step = 0
                    n_eps = 0
                    reset_time = time.time()
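The matching worker side is not shown here. Under the message protocol implied above (REQ_ACTION asks the master for an action, OBS_DATA reports a transition), a worker loop could look roughly like this hypothetical sketch, reusing the same tag constants and comm object:

def worker_loop(env, comm):
    # Hypothetical counterpart to master_loop(): each worker runs its own environment
    # and asks the master (rank 0) for actions computed by the shared DDPG policy.
    while True:
        s = env.reset()
        ep_reward, ep_step, done = 0.0, 0, False
        while not done:
            # Request an action for the current state.
            comm.send(s, dest=0, tag=REQ_ACTION)
            action, total_eps, total_step = comm.recv(source=0, tag=RSP_ACTION)
            s_, r, done, _ = env.step(action)
            ep_reward += r
            ep_step += 1
            # Report the transition so the master can store it and learn from it.
            comm.send((s, action, r, s_, done, ep_reward, ep_step), dest=0, tag=OBS_DATA)
            s = s_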
if not os.path.exists(param_path):
    print("Creating parameter folder")
    os.makedirs(param_path)
if not os.path.exists(log_path):
    print("Creating log folder")
    os.makedirs(log_path)

env = make_env(env_name)
obs_ls = env.reset()  # initial observations

global_input_size = 0
for cv in obs_ls:
    global_input_size += len(cv)
for action_space in env.action_space:
    global_input_size += action_space.n

# Initialise the models
agent_models = [DDPG(str(i), len(obs_ls[i]), env.action_space[i].n, global_input_size, MEM_LEN, LEARNING_RATE)
                for i in range(len(env.world.agents))]
target_models = [DDPG(str(i), len(obs_ls[i]), env.action_space[i].n, global_input_size, MEM_LEN, LEARNING_RATE)
                 for i in range(len(env.world.agents))]
for idx, model in enumerate(target_models):
    model.load_state_dict(agent_models[idx].state_dict())

if LOAD_KEY:
    for idx, model in enumerate(agent_models):
        if idx == 0:
            check_point = torch.load('./param/DDPGagent0_listener_5000.pkl')
        else:
            check_point = torch.load('./param/DDPGagent1_listener_5m000.pkl')
        model.load_state_dict(check_point)

for epo_i in range(MAX_EPOCH):
    obs_ls = env.reset()
max_iter = 200
model_folder = './model'
var = 0.0      # variance of the action-exploration noise

if __name__ == '__main__':
    # Create environment
    env = gym.make('Pendulum-v0').unwrapped
    n_state = env.observation_space.shape[0]   # dimensionality of the state
    n_action = env.action_space.shape[0]       # dimensionality of the action
    a_limit = env.action_space.high[0]         # largest possible value of the continuous action

    # Create network
    net = DDPG(
        n_state=n_state,
        n_action=n_action,
        a_limit=a_limit,
        model_folder=model_folder,
        var=var
    )
    net.load()

    # Train
    reward_list = []
    for i in range(episode):
        s = env.reset()
        total_reward = 0
        for j in range(max_iter):
            env.render()
            a = net.chooseAction(s)
            s_, r, finish, info = env.step(a)
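Both Pendulum snippets break off right after env.step(); the inner loop of such a script usually continues roughly as follows. This is an illustrative continuation using only the variables already defined above, not code from the original file:

            # Illustrative continuation of the inner loop (assumed, not from the original file):
            total_reward += r
            s = s_
            if finish:
                break
        reward_list.append(total_reward)
        print('Episode {}: total reward = {:.2f}'.format(i, total_reward))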
class PytorchRLTemplateAgent:
    def __init__(self):
        pass

    def init(self, context: Context, load_model=False, model_path=None):
        self.check_gpu_available(context)
        logger.info('PytorchRLTemplateAgent init')
        self.preprocessor = DTPytorchWrapper()
        self.model = DDPG(state_dim=self.preprocessor.shape, action_dim=2, max_action=1, net_type="cnn")
        self.current_image = np.zeros((640, 480, 3))
        if load_model:
            logger.info('PytorchRLTemplateAgent loading models')
            fp = model_path if model_path else "model"
            self.model.load(fp, "models", for_inference=True)
        logger.info('PytorchRLTemplateAgent init complete')

    def check_gpu_available(self, context: Context):
        available = torch.cuda.is_available()
        req = os.environ.get('AIDO_REQUIRE_GPU', None)
        context.info(f'torch.cuda.is_available = {available!r} AIDO_REQUIRE_GPU = {req!r}')
        context.info('init()')
        if available:
            i = torch.cuda.current_device()
            count = torch.cuda.device_count()
            name = torch.cuda.get_device_name(i)
            context.info(f'device {i} of {count}; name = {name!r}')
        else:
            if req is not None:
                msg = 'I need a GPU; bailing.'
                context.error(msg)
                raise Exception(msg)

    def on_received_seed(self, data: int):
        np.random.seed(data)

    def on_received_episode_start(self, context: Context, data: EpisodeStart):
        context.info(f'Starting episode "{data.episode_name}".')

    def on_received_observations(self, data: DB20Observations):
        camera: JPGImage = data.camera
        obs = jpg2rgb(camera.jpg_data)
        self.current_image = self.preprocessor.preprocess(obs)

    def compute_action(self, observation):
        # if observation.shape != self.preprocessor.transposed_shape:
        #     observation = self.preprocessor.preprocess(observation)
        action = self.model.predict(observation)
        return action.astype(float)

    def on_received_get_commands(self, context: Context):
        pwm_left, pwm_right = self.compute_action(self.current_image)
        pwm_left = float(np.clip(pwm_left, -1, +1))
        pwm_right = float(np.clip(pwm_right, -1, +1))
        grey = RGB(0.0, 0.0, 0.0)
        led_commands = LEDSCommands(grey, grey, grey, grey, grey)
        pwm_commands = PWMCommands(motor_left=pwm_left, motor_right=pwm_right)
        commands = DB20Commands(pwm_commands, led_commands)
        context.write('commands', commands)

    def finish(self, context: Context):
        context.info('finish()')
class PytorchRLTemplateAgent:
    def __init__(self, load_model: bool, model_path: Optional[str]):
        self.load_model = load_model
        self.model_path = model_path

    def init(self, context: Context):
        self.check_gpu_available(context)
        logger.info("PytorchRLTemplateAgent init")
        from model import DDPG
        self.preprocessor = DTPytorchWrapper()
        self.model = DDPG(state_dim=self.preprocessor.shape, action_dim=2, max_action=1, net_type="cnn")
        self.current_image = np.zeros((640, 480, 3))
        if self.load_model:
            logger.info("Pytorch Template Agent loading models")
            fp = self.model_path if self.model_path else "model"
            self.model.load(fp, "models", for_inference=True)
        logger.info("PytorchRLTemplateAgent init complete")

    def check_gpu_available(self, context: Context):
        import torch
        available = torch.cuda.is_available()
        context.info(f"torch.cuda.is_available = {available!r}")
        context.info("init()")
        if available:
            i = torch.cuda.current_device()
            count = torch.cuda.device_count()
            name = torch.cuda.get_device_name(i)
            context.info(f"device {i} of {count}; name = {name!r}")
        else:
            no_hardware_GPU_available(context)

    def on_received_seed(self, data: int):
        np.random.seed(data)

    def on_received_episode_start(self, context: Context, data: EpisodeStart):
        context.info(f'Starting episode "{data.episode_name}".')

    def on_received_observations(self, data: DB20Observations):
        camera: JPGImage = data.camera
        obs = jpg2rgb(camera.jpg_data)
        self.current_image = self.preprocessor.preprocess(obs)

    def compute_action(self, observation):
        # if observation.shape != self.preprocessor.transposed_shape:
        #     observation = self.preprocessor.preprocess(observation)
        action = self.model.predict(observation)
        return action.astype(float)

    def on_received_get_commands(self, context: Context):
        pwm_left, pwm_right = self.compute_action(self.current_image)
        pwm_left = float(np.clip(pwm_left, -1, +1))
        pwm_right = float(np.clip(pwm_right, -1, +1))
        grey = RGB(0.0, 0.0, 0.0)
        led_commands = LEDSCommands(grey, grey, grey, grey, grey)
        pwm_commands = PWMCommands(motor_left=pwm_left, motor_right=pwm_right)
        commands = DB20Commands(pwm_commands, led_commands)
        context.write("commands", commands)

    def finish(self, context: Context):
        context.info("finish()")
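In the AIDO submission templates, an agent class like this is normally registered as a protocol node in a small main entry point. The sketch below assumes the wrap_direct and protocol_agent_DB20 names from the aido_schemas package; exact names can differ between template versions:

# Assumed imports from the AIDO template; verify against your aido_schemas version.
from aido_schemas import protocol_agent_DB20, wrap_direct

def main():
    # Hand the agent to the node wrapper, which drives the on_received_* callbacks above.
    agent = PytorchRLTemplateAgent(load_model=True, model_path=None)
    wrap_direct(node=agent, protocol=protocol_agent_DB20)

if __name__ == '__main__':
    main()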
test_set.append((user,
                 list(test_user.loc[i:i + 9, 'itemId']),
                 test_user.loc[i + 10, 'itemId'],
                 test_user.loc[i + 9, 'timestamp'] - test_user.loc[i + 8, 'timestamp'],
                 float(test_user.loc[i + 10, 'reward']),
                 float(test_user.loc[i + 10, 'objective1']),
                 float(test_user.loc[i + 10, 'objective2'])))

train_set = train_set[:len(train_set) // batch_size * batch_size]
test_set = test_set[:len(test_set) // batch_size * batch_size]

start_time = time.time()
gpu_options = tf.GPUOptions(allow_growth=True)
with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    primary_network = DDPG(hidden_size, 'primary_network')
    target_network = DDPG(hidden_size, 'target_network')
    model = Reinforce_Model(user_count, item_count, hidden_size, batch_size, primary_network, target_network)
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())
    print('Objective1_Value: %.4f\t Objective2_Value: %.4f\t' % evaluate(sess, model, train_set))
    sys.stdout.flush()
    lr = 1
    start_time = time.time()
    last_auc = 0.0
    for epoch in range(100):
class HER(pl.LightningModule):
    def __init__(self, hparams):
        super(HER, self).__init__()
        self.hparams = hparams

        self.test_env = make_env(hparams, render=self.hparams.render_test)
        sample_obs = self.test_env.observation_space['observation'].sample()
        sample_goal = self.test_env.observation_space['achieved_goal'].sample()

        # HARD CODED VALUES FOR Bullet-HRL
        action_limits, state_limits = get_env_boundaries()
        action_offset, action_bounds, action_clip_low, action_clip_high = action_limits

        state_shape = sample_obs.shape[0]
        action_shape = self.test_env.action_space.shape[0]
        goal_shape = sample_goal.shape[0]
        self.action_clips = (action_clip_low, action_clip_high)

        self.model = DDPG(params=self.hparams,
                          obs_size=state_shape,
                          goal_size=goal_shape,
                          act_size=action_shape,
                          action_clips=(action_clip_low, action_clip_high),
                          action_bounds=action_bounds,
                          action_offset=action_offset)
        self.model.actor.share_memory()
        self.model.critic.share_memory()

        self.state_normalizer = Normalizer(state_shape, default_clip_range=self.hparams.clip_range)
        self.goal_normalizer = Normalizer(goal_shape, default_clip_range=self.hparams.clip_range)

        self.replay_buffer = SharedReplayBuffer(self.hparams.buffer_size, state_shape, action_shape, goal_shape)

    def log_func(self, d):
        self.log_dict(d, on_step=True, prog_bar=True)

    def collate_fn(self, batch):
        return collate.default_convert(batch)

    def __dataloader(self) -> DataLoader:
        dataset = RLDataset(self.replay_buffer, self.hparams.batch_size,
                            self.hparams.n_batches, self.hparams.replay_initial)
        dataloader = DataLoader(dataset=dataset,
                                collate_fn=self.collate_fn,
                                batch_size=1,
                                num_workers=1,
                                pin_memory=True)
        return dataloader

    def train_dataloader(self):
        return self.__dataloader()

    def __testloader(self):
        testset = TestDataset(hparams=self.hparams,
                              test_env=self.test_env,
                              model=self.model,
                              state_normalizer=self.state_normalizer,
                              goal_normalizer=self.goal_normalizer)
        testloader = DataLoader(dataset=testset, batch_size=1)
        return testloader

    def val_dataloader(self):
        return self.__testloader()

    def configure_optimizers(self):
        return [self.model.crt_opt, self.model.act_opt], []

    def training_step(self, batch, batch_idx, optimizer_idx):
        states_v, actions_v, next_states_v, rewards_v, dones_mask, goals_v = batch[0]
        norm_states_v = self.state_normalizer.normalize(states_v)
        norm_goals_v = self.goal_normalizer.normalize(goals_v)

        if optimizer_idx == 0:
            norm_next_states_v = self.state_normalizer.normalize(next_states_v)
            # train critic
            q_v = self.model.critic(norm_states_v, norm_goals_v, actions_v)
            with torch.no_grad():
                next_act_v = self.model.tgt_act_net(norm_next_states_v, norm_goals_v)
                q_next_v = self.model.tgt_crt_net(norm_next_states_v, norm_goals_v, next_act_v)
                q_next_v[dones_mask] = 0.0
                q_ref_v = rewards_v.unsqueeze(dim=-1) + q_next_v * self.hparams.gamma
                # clip the q value
                clip_return = 1 / (1 - self.hparams.gamma)
                q_ref_v = torch.clamp(q_ref_v, -clip_return, 0)
            critic_loss_v = F.mse_loss(q_v, q_ref_v.detach())
            tqdm_dict = {'critic_loss': critic_loss_v}
            self.log_dict(tqdm_dict, prog_bar=True)
            return critic_loss_v

        elif optimizer_idx == 1:
            # train actor
            self.model.actor.offset.requires_grad = False
            self.model.actor.action_bounds.requires_grad = False
            cur_actions_v = self.model.actor(norm_states_v, norm_goals_v)
            actor_loss_v = -self.model.critic(norm_states_v, norm_goals_v, cur_actions_v).mean()
            actor_loss_v += ((cur_actions_v - self.model.actor.offset) /
                             self.model.actor.action_bounds).pow(2).mean()
            tqdm_dict = {'actor_loss': actor_loss_v}
            self.log_dict(tqdm_dict, prog_bar=True)
            if batch_idx % self.hparams.sync_batches == 0:
                self.model.alpha_sync(self.hparams.polyak)
            return actor_loss_v

    def validation_step(self, batch, batch_idx):
        to_log = dict()
        for k, v in batch.items():
            to_log[k] = v.detach().cpu().numpy()
        to_log['epoch_nr'] = int(self.current_epoch)
        if self.logger is not None:
            self.logger.experiment.log(to_log)
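The alpha_sync call in training_step is the model's target-network update; in DDPG-style code it is typically a Polyak average of the online actor/critic into their targets. A sketch of such a method, assuming the attribute names used in this snippet (actor, critic, tgt_act_net, tgt_crt_net); the real implementation may differ:

def alpha_sync(self, alpha: float) -> None:
    # Polyak averaging: target <- alpha * target + (1 - alpha) * online.
    assert 0.0 < alpha < 1.0
    for online, target in ((self.actor, self.tgt_act_net), (self.critic, self.tgt_crt_net)):
        tgt_state = target.state_dict()
        for name, param in online.state_dict().items():
            tgt_state[name] = alpha * tgt_state[name] + (1.0 - alpha) * param
        target.load_state_dict(tgt_state)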