temp_act[action] = 1.
actions.append(temp_act)

# Store reward
values.append(reward)

# Time step
t += 1

# Get intrinsic reward
if t <= mem_size:
    states_mem.append(state)
    actions_mem.append(temp_act)
    next_states_mem.append(next_state)
    if t == mem_size:
        pre_z_mean, pre_z_std = tuc.dump_z_mean_std(
            state, temp_act[np.newaxis, :])
    intrinsic_reward_1 = 0
    intrinsic_reward_2 = 0
else:
    tuc.train_tuc(np.vstack(states_mem),
                  np.vstack(next_states_mem),
                  np.vstack(actions_mem))
    z_mean, z_std = tuc.dump_z_mean_std(state, temp_act[np.newaxis, :])
    intrinsic_reward_1 = KL_divergence(pre_z_mean, pre_z_std,
                                       z_mean, z_std) / mem_size
    intrinsic_reward_2 = tuc.dump_regret(state, action)
    pre_z_mean = z_mean
    pre_z_std = z_std
    # Update memory
    states_mem.append(state)
    actions_mem.append(temp_act)
    next_states_mem.append(next_state)
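# --- Illustrative sketch, not from the original listing: a closed-form KL
# divergence between two diagonal Gaussians, matching the signature
# KL_divergence(pre_z_mean, pre_z_std, z_mean, z_std) assumed above.
# Computes KL(N(mu_0, std_0^2) || N(mu_1, std_1^2)) summed over the
# latent dimensions:
#   sum_i [ 0.5 * log(var_1 / var_0)
#           + (var_0 + (mu_0 - mu_1)^2) / (2 * var_1) - 0.5 ]
import numpy as np

def KL_divergence(mu_0, std_0, mu_1, std_1, eps=1e-8):
    var_0 = np.square(std_0) + eps  # eps guards against zero variance
    var_1 = np.square(std_1) + eps
    return np.sum(0.5 * np.log(var_1 / var_0)
                  + (var_0 + np.square(mu_0 - mu_1)) / (2. * var_1)
                  - 0.5)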
# find the max bounding box in the region image
new_iou = find_max_bounding_box(gt_masks, region_mask,
                                classes_gt_objects, CLASS_OBJECT)
reward = get_reward_movement(iou, new_iou)
iou = new_iou

# update model-based module
action_np_vec = np.zeros([1, 6])
action_np_vec[0, action - 1] = 1.
action_vec = torch.from_numpy(action_np_vec).float().cuda()
actions_matrix.append(action_vec)
if step == 0:
    pre_mean, pre_std = TUC_dynamic.dump_z_mean_std(state, action_vec)
TUC_dynamic.train_enc_dec(state, next_state, action_vec)
if step > 0:
    mean, std = TUC_dynamic.dump_z_mean_std(state, action_vec)
    intrinsic_reward = TUC_dynamic.dump_exploration_reward(
        pre_mean, pre_std, mean, std)
    if i > 0:
        penalty = TUC_dynamic.dump_regret(state, action - 1)
    # print(intrinsic_reward, penalty)
    # print(ratio_1 * ((epochs - epoch) / epochs) * intrinsic_reward
    #       - ratio_2 * (epoch / epochs) * penalty)

# Store the transition in memory
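# --- Illustrative sketch, an assumption based on the commented-out print
# above: how the extrinsic movement reward could be combined with the
# annealed intrinsic terms. The exploration bonus is weighted by
# (epochs - epoch) / epochs so it decays over training, while the regret
# penalty ramps up with epoch / epochs; ratio_1 and ratio_2 are the
# weighting hyperparameters named in the listing.
total_reward = (reward
                + ratio_1 * ((epochs - epoch) / epochs) * intrinsic_reward
                - ratio_2 * (epoch / epochs) * penalty)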