class GoalNaiveTransporterAgent(TransporterAgent):
  """Naive goal-conditioned agent: stacks current and goal images through vanilla Transport."""

  def __init__(self, name, task, n_rotations=36):
    super().__init__(name, task, n_rotations)

    # Double the channel dimension so the vanilla Transport module can take
    # the current image stacked with the goal image.
    t_shape = (self.in_shape[0], self.in_shape[1],
               int(self.in_shape[2] * 2))
    self.attention = Attention(
        in_shape=self.in_shape,
        n_rotations=1,
        preprocess=utils.preprocess)
    self.transport = Transport(
        in_shape=t_shape,
        n_rotations=self.n_rotations,
        crop_size=self.crop_size,
        preprocess=utils.preprocess)

  def get_image(self, obs):
    """Stack color and height images into a single input image."""

    # if self.use_goal_image:
    #   colormap_g, heightmap_g = utils.get_fused_heightmap(goal, configs)
    #   goal_image = self.concatenate_c_h(colormap_g, heightmap_g)
    #   input_image = np.concatenate((input_image, goal_image), axis=2)
    #   assert input_image.shape[2] == 12, input_image.shape

    # Get color and height maps from RGB-D images. Note: get_fused_heightmap
    # uses the camera intrinsics/extrinsics to fuse the multi-view
    # observations into an orthographic, top-down colormap and heightmap.
    cmap, hmap = utils.get_fused_heightmap(
        obs, self.cam_config, self.bounds, self.pix_size)
    # Debug visualization of the fused top-down colormap:
    # import cv2
    # cv2.imshow('cmap', cmap)
    # cv2.waitKey(0)
    img = np.concatenate((cmap,
                          hmap[Ellipsis, None],
                          hmap[Ellipsis, None],
                          hmap[Ellipsis, None]), axis=2)
    assert img.shape == self.in_shape, img.shape
    return img

  def get_sample(self, dataset, augment=True):
    """Get a dataset sample.

    Args:
      dataset: a ravens.Dataset (train or validation).
      augment: if True, perform data augmentation.

    Returns:
      A tuple of training data: (input_image, p0, p0_theta, p1, p1_theta,
      goal_image). Unlike the stacked `use_goal_image` variant, the goal
      image is returned separately here; stacking it with the current image,
      if desired, should be done outside this method.
    """
    # Sample both a current and a goal observation.
    (obs, act, _, _), (gobs, _, _, _) = dataset.sample()
    # Debug visualization of current and goal observations:
    # import cv2
    # cv2.imshow('obs', obs['color'][0, :, :, :3])
    # cv2.waitKey(0)
    # cv2.imshow('goal', gobs['color'][0, :, :, :3])
    # cv2.waitKey(0)
    img = self.get_image(obs)
    gimg = self.get_image(gobs)

    # Get training labels from data sample. Rotations are reduced to the yaw
    # (z-axis) angle in the top-down view, and the place rotation is
    # expressed relative to the pick rotation.
    p0_xyz, p0_xyzw = act['pose0']
    p1_xyz, p1_xyzw = act['pose1']
    p0 = utils.xyz_to_pix(p0_xyz, self.bounds, self.pix_size)
    p0_theta = -np.float32(utils.quatXYZW_to_eulerXYZ(p0_xyzw)[2])
    p1 = utils.xyz_to_pix(p1_xyz, self.bounds, self.pix_size)
    p1_theta = -np.float32(utils.quatXYZW_to_eulerXYZ(p1_xyzw)[2])
    p1_theta = p1_theta - p0_theta
    p0_theta = 0

    # Data augmentation. Note the goal image receives an independent random
    # perturbation; the pixel labels are only updated from the current image.
    if augment:
      img, _, (p0, p1), _ = utils.perturb(img, [p0, p1])
      gimg, _, _, _ = utils.perturb(gimg, [p0, p1])
    return img, p0, p0_theta, p1, p1_theta, gimg
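
  # Illustrative check of one sampled datum (hypothetical usage, assuming a
  # ravens.Dataset that yields paired current/goal observations; not part of
  # the original API):
  #
  #   agent = GoalNaiveTransporterAgent('naive-goal', 'block-insertion')
  #   img, p0, p0_theta, p1, p1_theta, gimg = agent.get_sample(dataset)
  #   assert img.shape == agent.in_shape and gimg.shape == agent.in_shape
  #   # p0/p1 are (row, col) pick/place pixels; p0_theta is fixed to 0 and
  #   # p1_theta is the place yaw expressed relative to the pick yaw.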

  def train(self, dataset, writer=None):
    """Train on a dataset sample for 1 iteration.

    Args:
      dataset: a ravens.Dataset.
      writer: a TF summary writer (for tensorboard).
    """
    tf.keras.backend.set_learning_phase(1)

    # The learning problem: predict a pick pose and a place pose from the
    # current image observation, conditioned on the goal image.
    img, p0, p0_theta, p1, p1_theta, gimg = self.get_sample(dataset)

    # Get training losses.
    step = self.total_steps + 1
    loss0 = self.attention.train(img, p0, p0_theta)
    if isinstance(self.transport, Attention):
      loss1 = self.transport.train(img, p1, p1_theta)
    else:
      # Stack current and goal images along the channel axis for the
      # vanilla Transport module.
      img_stk = np.concatenate([img, gimg], axis=-1)
      loss1 = self.transport.train(img_stk, p0, p1, p1_theta)
    with writer.as_default():
      sc = tf.summary.scalar
      sc('train_loss/attention', loss0, step)
      sc('train_loss/transport', loss1, step)
    print(f'Train Iter: {step} Loss: {loss0:.4f} {loss1:.4f}')
    self.total_steps = step

    # TODO(andyzeng): cleanup goal-conditioned model.
    # if self.use_goal_image:
    #   half = int(input_image.shape[2] / 2)
    #   img_curr = input_image[:, :, :half]  # ignore goal portion
    #   loss0 = self.attention.train(img_curr, p0, p0_theta)
    # else:
    #   loss0 = self.attention.train(input_image, p0, p0_theta)
    # if isinstance(self.transport, Attention):
    #   loss1 = self.transport.train(input_image, p1, p1_theta)
    # elif isinstance(self.transport, TransportGoal):
    #   half = int(input_image.shape[2] / 2)
    #   img_curr = input_image[:, :, :half]
    #   img_goal = input_image[:, :, half:]
    #   loss1 = self.transport.train(img_curr, img_goal, p0, p1, p1_theta)
    # else:
    #   loss1 = self.transport.train(input_image, p0, p1, p1_theta)

  def validate(self, dataset, writer=None):  # pylint: disable=unused-argument
    """Test on a validation dataset for 10 iterations."""
    print('Skipping validation.')
    # tf.keras.backend.set_learning_phase(0)
    # n_iter = 10
    # loss0, loss1 = 0, 0
    # for i in range(n_iter):
    #   img, p0, p0_theta, p1, p1_theta = self.get_sample(dataset, False)
    #   # Get validation losses. Do not backpropagate.
    #   loss0 += self.attention.train(img, p0, p0_theta, backprop=False)
    #   if isinstance(self.transport, Attention):
    #     loss1 += self.transport.train(img, p1, p1_theta, backprop=False)
    #   else:
    #     loss1 += self.transport.train(img, p0, p1, p1_theta, backprop=False)
    # loss0 /= n_iter
    # loss1 /= n_iter
    # with writer.as_default():
    #   sc = tf.summary.scalar
    #   sc('test_loss/attention', loss0, self.total_steps)
    #   sc('test_loss/transport', loss1, self.total_steps)
    # print(f'Validation Loss: {loss0:.4f} {loss1:.4f}')

  def act(self, obs, info=None, goal=None):  # pylint: disable=unused-argument
    """Run inference and return best action given visual observations."""
    tf.keras.backend.set_learning_phase(0)

    # Get heightmaps from RGB-D images. `goal` is expected to be an
    # (obs, act, reward, info) tuple; only the observation is used.
    img = self.get_image(obs)
    gobs = goal[0]
    gimg = self.get_image(gobs)

    # Attention model forward pass.
    pick_conf = self.attention.forward(img)
    argmax = np.argmax(pick_conf)
    argmax = np.unravel_index(argmax, shape=pick_conf.shape)
    p0_pix = argmax[:2]
    p0_theta = argmax[2] * (2 * np.pi / pick_conf.shape[2])

    # Transport model forward pass. Stack current and goal images along the
    # channel axis, matching the input used at training time.
    img_stk = np.concatenate([img, gimg], axis=-1)
    place_conf = self.transport.forward(img_stk, p0_pix)
    argmax = np.argmax(place_conf)
    argmax = np.unravel_index(argmax, shape=place_conf.shape)
    p1_pix = argmax[:2]
    p1_theta = argmax[2] * (2 * np.pi / place_conf.shape[2])

    # Pixels to end effector poses.
    hmap = img[:, :, 3]
    p0_xyz = utils.pix_to_xyz(p0_pix, hmap, self.bounds, self.pix_size)
    p1_xyz = utils.pix_to_xyz(p1_pix, hmap, self.bounds, self.pix_size)
    p0_xyzw = utils.eulerXYZ_to_quatXYZW((0, 0, -p0_theta))
    p1_xyzw = utils.eulerXYZ_to_quatXYZW((0, 0, -p1_theta))
    return {
        'pose0': (np.asarray(p0_xyz), np.asarray(p0_xyzw)),
        'pose1': (np.asarray(p1_xyz), np.asarray(p1_xyzw))
    }
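
  # A minimal inference sketch (assumed setup: `env` follows the ravens
  # Environment API, and the goal is passed as an (obs, act, reward, info)
  # tuple, matching what dataset.sample() returns):
  #
  #   agent = GoalNaiveTransporterAgent('naive-goal', 'block-insertion')
  #   agent.load(n_iter=40000)
  #   obs = env.reset()
  #   action = agent.act(obs, goal=(goal_obs, None, None, None))
  #   obs, reward, done, info = env.step(action)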

  # TODO(andyzeng): cleanup goal-conditioned model.
  # Make a goal image if needed, and for consistency stack with input.
  # if self.use_goal_image:
  #   cmap_g, hmap_g = utils.get_fused_heightmap(goal, self.cam_config)
  #   goal_image = self.concatenate_c_h(cmap_g, hmap_g)
  #   input_image = np.concatenate((input_image, goal_image), axis=2)
  #   assert input_image.shape[2] == 12, input_image.shape
  # if self.use_goal_image:
  #   half = int(input_image.shape[2] / 2)
  #   input_only = input_image[:, :, :half]  # ignore goal portion
  #   pick_conf = self.attention.forward(input_only)
  # else:
  # if isinstance(self.transport, TransportGoal):
  #   half = int(input_image.shape[2] / 2)
  #   img_curr = input_image[:, :, :half]
  #   img_goal = input_image[:, :, half:]
  #   place_conf = self.transport.forward(img_curr, img_goal, p0_pix)

  def load(self, n_iter):
    """Load pre-trained models."""
    print(f'Loading pre-trained model at {n_iter} iterations.')
    attention_fname = 'attention-ckpt-%d.h5' % n_iter
    transport_fname = 'transport-ckpt-%d.h5' % n_iter
    attention_fname = os.path.join(self.models_dir, attention_fname)
    transport_fname = os.path.join(self.models_dir, transport_fname)
    self.attention.load(attention_fname)
    self.transport.load(transport_fname)
    self.total_steps = n_iter

  def save(self):
    """Save models."""
    if not tf.io.gfile.exists(self.models_dir):
      tf.io.gfile.makedirs(self.models_dir)
    attention_fname = 'attention-ckpt-%d.h5' % self.total_steps
    transport_fname = 'transport-ckpt-%d.h5' % self.total_steps
    attention_fname = os.path.join(self.models_dir, attention_fname)
    transport_fname = os.path.join(self.models_dir, transport_fname)
    self.attention.save(attention_fname)
    self.transport.save(transport_fname)
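
# A minimal training-loop sketch for GoalNaiveTransporterAgent. This is an
# assumed setup (dataset path and iteration counts are illustrative),
# following the ravens conventions used elsewhere in this repo:
#
#   dataset = Dataset(os.path.join('data', 'block-insertion-train'))
#   agent = GoalNaiveTransporterAgent('naive-goal', 'block-insertion')
#   writer = tf.summary.create_file_writer(os.path.join('logs', agent.name))
#   for _ in range(40000):
#     agent.train(dataset, writer=writer)
#     if agent.total_steps % 1000 == 0:
#       agent.save()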


class Form2FitAgent:
  """Form2Fit agent (https://form2fit.github.io/)."""

  def __init__(self, name, task):
    self.name = name
    self.task = task
    self.total_iter = 0
    self.num_rotations = 24
    self.descriptor_dim = 16
    self.pixel_size = 0.003125
    self.input_shape = (320, 160, 6)
    self.camera_config = cameras.RealSenseD415.CONFIG
    self.models_dir = os.path.join('checkpoints', self.name)
    self.bounds = np.array([[0.25, 0.75], [-0.5, 0.5], [0, 0.28]])

    self.pick_model = Attention(
        input_shape=self.input_shape,
        num_rotations=1,
        preprocess=self.preprocess,
        lite=True)
    self.place_model = Attention(
        input_shape=self.input_shape,
        num_rotations=1,
        preprocess=self.preprocess,
        lite=True)
    self.match_model = Matching(
        input_shape=self.input_shape,
        descriptor_dim=self.descriptor_dim,
        num_rotations=self.num_rotations,
        preprocess=self.preprocess,
        lite=True)

  def train(self, dataset, num_iter, writer, validation_dataset=None):
    """Train on dataset for a specific number of iterations."""
    del validation_dataset

    for i in range(num_iter):
      obs, act, _ = dataset.random_sample()

      # Get heightmap from RGB-D images.
      configs = act['camera_config']
      colormap, heightmap = self.get_heightmap(obs, configs)

      # Get training labels from data sample. As with Transporter, the place
      # rotation label is expressed relative to the pick rotation.
      pose0, pose1 = act['params']['pose0'], act['params']['pose1']
      p0_position, p0_rotation = pose0[0], pose0[1]
      p0 = utils.xyz_to_pix(p0_position, self.bounds, self.pixel_size)
      p0_theta = -np.float32(utils.quatXYZW_to_eulerXYZ(p0_rotation)[2])
      p1_position, p1_rotation = pose1[0], pose1[1]
      p1 = utils.xyz_to_pix(p1_position, self.bounds, self.pixel_size)
      p1_theta = -np.float32(utils.quatXYZW_to_eulerXYZ(p1_rotation)[2])
      p1_theta = p1_theta - p0_theta
      p0_theta = 0

      # Concatenate color with depth images.
      input_image = np.concatenate(
          (colormap, heightmap[Ellipsis, None],
           heightmap[Ellipsis, None], heightmap[Ellipsis, None]),
          axis=2)

      # Do data augmentation (perturb rotation and translation).
      input_image, _, rounded_pixels, _ = utils.perturb(input_image, [p0, p1])
      p0, p1 = rounded_pixels

      # Compute training losses.
      loss0 = self.pick_model.train(input_image, p0, theta=0)
      loss1 = self.place_model.train(input_image, p1, theta=0)
      loss2 = self.match_model.train(input_image, p0, p1, theta=p1_theta)
      with writer.as_default():
        tf.summary.scalar(
            'pick_loss',
            self.pick_model.metric.result(),
            step=self.total_iter + i)
        tf.summary.scalar(
            'place_loss',
            self.place_model.metric.result(),
            step=self.total_iter + i)
        tf.summary.scalar(
            'match_loss',
            self.match_model.metric.result(),
            step=self.total_iter + i)
      print(f'Train Iter: {self.total_iter + i} '
            f'Loss: {loss0:.4f} {loss1:.4f} {loss2:.4f}')

    self.total_iter += num_iter
    self.save()
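
  # A minimal training sketch (assumed setup; the task name is illustrative,
  # and `dataset` follows the same conventions as the Transporter agents
  # above):
  #
  #   agent = Form2FitAgent('form2fit', 'kitting')
  #   writer = tf.summary.create_file_writer(os.path.join('logs', agent.name))
  #   agent.train(dataset, num_iter=1000, writer=writer)  # also saves models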

  def act(self, obs, info):
    """Run inference and return best action given visual observations."""
    del info
    act = {'camera_config': self.camera_config, 'primitive': None}
    if not obs:
      return act

    # Get heightmap from RGB-D images.
    colormap, heightmap = self.get_heightmap(obs, self.camera_config)

    # Concatenate color with depth images.
    input_image = np.concatenate(
        (colormap, heightmap[Ellipsis, None],
         heightmap[Ellipsis, None], heightmap[Ellipsis, None]),
        axis=2)

    # Get top-k pixels from pick and place heatmaps.
    k = 100
    pick_heatmap = self.pick_model.forward(
        input_image, apply_softmax=True).squeeze()
    place_heatmap = self.place_model.forward(
        input_image, apply_softmax=True).squeeze()
    descriptors = np.float32(self.match_model.forward(input_image))

    # V4: blur both heatmaps, take the argmax pick pixel and the strongest
    # place peak, then pick the rotation whose descriptor best matches the
    # pick descriptor.
    pick_heatmap = cv2.GaussianBlur(pick_heatmap, (49, 49), 0)
    place_heatmap = cv2.GaussianBlur(place_heatmap, (49, 49), 0)
    pick_topk = np.int32(
        np.unravel_index(
            np.argsort(pick_heatmap.reshape(-1))[-k:], pick_heatmap.shape)).T
    pick_pixel = pick_topk[-1, :]
    from skimage.feature import peak_local_max  # pylint: disable=g-import-not-at-top
    place_peaks = peak_local_max(place_heatmap, num_peaks=1)
    distances = np.ones((place_peaks.shape[0], self.num_rotations)) * 10
    pick_descriptor = descriptors[0, pick_pixel[0],
                                  pick_pixel[1], :].reshape(1, -1)
    for i in range(place_peaks.shape[0]):
      peak = place_peaks[i, :]
      place_descriptors = descriptors[:, peak[0], peak[1], :]
      distances[i, :] = np.linalg.norm(
          place_descriptors - pick_descriptor, axis=1)
    ibest = np.unravel_index(np.argmin(distances), shape=distances.shape)
    p0_pixel = pick_pixel
    p0_theta = 0
    p1_pixel = place_peaks[ibest[0], :]
    p1_theta = ibest[1] * (2 * np.pi / self.num_rotations)

    # # V3
    # pick_heatmap = cv2.GaussianBlur(pick_heatmap, (49, 49), 0)
    # place_heatmap = cv2.GaussianBlur(place_heatmap, (49, 49), 0)
    # pick_topk = np.int32(
    #     np.unravel_index(
    #         np.argsort(pick_heatmap.reshape(-1))[-k:], pick_heatmap.shape)).T
    # place_topk = np.int32(
    #     np.unravel_index(
    #         np.argsort(place_heatmap.reshape(-1))[-k:],
    #         place_heatmap.shape)).T
    # pick_pixel = pick_topk[-1, :]
    # place_pixel = place_topk[-1, :]
    # pick_descriptor = descriptors[0, pick_pixel[0],
    #                               pick_pixel[1], :].reshape(1, -1)
    # place_descriptor = descriptors[:, place_pixel[0], place_pixel[1], :]
    # distances = np.linalg.norm(place_descriptor - pick_descriptor, axis=1)
    # irotation = np.argmin(distances)
    # p0_pixel = pick_pixel
    # p0_theta = 0
    # p1_pixel = place_pixel
    # p1_theta = irotation * (2 * np.pi / self.num_rotations)

    # # V2
    # pick_topk = np.int32(
    #     np.unravel_index(
    #         np.argsort(pick_heatmap.reshape(-1))[-k:], pick_heatmap.shape)).T
    # place_topk = np.int32(
    #     np.unravel_index(
    #         np.argsort(place_heatmap.reshape(-1))[-k:],
    #         place_heatmap.shape)).T
    # pick_pixel = pick_topk[-1, :]
    # pick_descriptor = descriptors[0, pick_pixel[0],
    #                               pick_pixel[1], :].reshape(1, 1, 1, -1)
    # distances = np.linalg.norm(descriptors - pick_descriptor, axis=3)
    # distances = np.transpose(distances, [1, 2, 0])
    # max_distance = int(np.round(np.max(distances)))
    # for i in range(self.num_rotations):
    #   distances[:, :, i] = cv2.circle(distances[:, :, i],
    #                                   (pick_pixel[1], pick_pixel[0]), 50,
    #                                   max_distance, -1)
    # ibest = np.unravel_index(np.argmin(distances), shape=distances.shape)
    # p0_pixel = pick_pixel
    # p0_theta = 0
    # p1_pixel = ibest[:2]
    # p1_theta = ibest[2] * (2 * np.pi / self.num_rotations)

    # # V1
    # pick_topk = np.int32(
    #     np.unravel_index(
    #         np.argsort(pick_heatmap.reshape(-1))[-k:], pick_heatmap.shape)).T
    # place_topk = np.int32(
    #     np.unravel_index(
    #         np.argsort(place_heatmap.reshape(-1))[-k:],
    #         place_heatmap.shape)).T
    # distances = np.zeros((k, k, self.num_rotations))
    # for ipick in range(k):
    #   pick_descriptor = descriptors[0, pick_topk[ipick, 0],
    #                                 pick_topk[ipick, 1], :].reshape(1, -1)
    #   for iplace in range(k):
    #     place_descriptors = descriptors[:, place_topk[iplace, 0],
    #                                     place_topk[iplace, 1], :]
    #     distances[ipick, iplace, :] = np.linalg.norm(
    #         place_descriptors - pick_descriptor, axis=1)
    # ibest = np.unravel_index(np.argmin(distances), shape=distances.shape)
    # p0_pixel = pick_topk[ibest[0], :]
    # p0_theta = 0
    # p1_pixel = place_topk[ibest[1], :]
    # p1_theta = ibest[2] * (2 * np.pi / self.num_rotations)
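
    # Note on the descriptor matching above (descriptive comment, inferred
    # from the shapes used): `descriptors` is (num_rotations, H, W,
    # descriptor_dim), one dense descriptor map per candidate place rotation.
    # The chosen rotation index is the one minimizing the L2 distance between
    # the pick-pixel descriptor and the place-peak descriptor.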

    # Pixels to end effector poses.
    p0_position = utils.pix_to_xyz(p0_pixel, heightmap, self.bounds,
                                   self.pixel_size)
    p1_position = utils.pix_to_xyz(p1_pixel, heightmap, self.bounds,
                                   self.pixel_size)
    p0_rotation = utils.eulerXYZ_to_quatXYZW((0, 0, -p0_theta))
    p1_rotation = utils.eulerXYZ_to_quatXYZW((0, 0, -p1_theta))

    act['primitive'] = 'pick_place'
    if self.task == 'sweeping':
      act['primitive'] = 'sweep'
    elif self.task == 'pushing':
      act['primitive'] = 'push'
    params = {
        'pose0': (np.asarray(p0_position), np.asarray(p0_rotation)),
        'pose1': (np.asarray(p1_position), np.asarray(p1_rotation))
    }
    act['params'] = params
    return act

  #-------------------------------------------------------------------------
  # Helper Functions
  #-------------------------------------------------------------------------

  def preprocess(self, image):
    """Pre-process input image (subtract mean, divide by std)."""
    color_mean = 0.18877631
    depth_mean = 0.00509261
    color_std = 0.07276466
    depth_std = 0.00903967
    image[:, :, :3] = (image[:, :, :3] / 255 - color_mean) / color_std
    image[:, :, 3:] = (image[:, :, 3:] - depth_mean) / depth_std
    return image

  def get_heightmap(self, obs, configs):
    """Reconstruct orthographic colormaps and heightmaps from RGB-D views."""
    heightmaps, colormaps = utils.reconstruct_heightmaps(
        obs['color'], obs['depth'], configs, self.bounds, self.pixel_size)
    colormaps = np.float32(colormaps)
    heightmaps = np.float32(heightmaps)

    # Fuse maps from different views: average each pixel over the views
    # that actually observed it.
    valid = np.sum(colormaps, axis=3) > 0
    repeat = np.sum(valid, axis=0)
    repeat[repeat == 0] = 1
    colormap = np.sum(colormaps, axis=0) / repeat[Ellipsis, None]
    colormap = np.uint8(np.round(colormap))
    heightmap = np.sum(heightmaps, axis=0) / repeat
    return colormap, heightmap

  def load(self, num_iter):
    """Load pre-trained models."""
    pick_fname = 'pick-ckpt-%d.h5' % num_iter
    place_fname = 'place-ckpt-%d.h5' % num_iter
    match_fname = 'match-ckpt-%d.h5' % num_iter
    pick_fname = os.path.join(self.models_dir, pick_fname)
    place_fname = os.path.join(self.models_dir, place_fname)
    match_fname = os.path.join(self.models_dir, match_fname)
    self.pick_model.load(pick_fname)
    self.place_model.load(place_fname)
    self.match_model.load(match_fname)
    self.total_iter = num_iter

  def save(self):
    """Save models."""
    if not tf.io.gfile.exists(self.models_dir):
      tf.io.gfile.makedirs(self.models_dir)
    pick_fname = 'pick-ckpt-%d.h5' % self.total_iter
    place_fname = 'place-ckpt-%d.h5' % self.total_iter
    match_fname = 'match-ckpt-%d.h5' % self.total_iter
    pick_fname = os.path.join(self.models_dir, pick_fname)
    place_fname = os.path.join(self.models_dir, place_fname)
    match_fname = os.path.join(self.models_dir, match_fname)
    self.pick_model.save(pick_fname)
    self.place_model.save(place_fname)
    self.match_model.save(match_fname)
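
# A minimal end-to-end inference sketch for Form2FitAgent (assumed setup;
# `env` is hypothetical and follows the ravens Environment API):
#
#   agent = Form2FitAgent('form2fit-kitting', 'kitting')
#   agent.load(num_iter=40000)
#   obs = env.reset()
#   act = agent.act(obs, info=None)
#   # act['primitive'] names the motion primitive; act['params']['pose0']
#   # and ['pose1'] hold the pick and place poses as (xyz, quaternion_xyzw).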