class VIBE_Demo(nn.Module):
    def __init__(
            self,
            seqlen,
            batch_size=64,
            n_layers=1,
            hidden_size=2048,
            add_linear=False,
            bidirectional=False,
            use_residual=True,
            pretrained=osp.join(VIBE_DATA_DIR, 'spin_model_checkpoint.pth.tar'),
    ):
        super(VIBE_Demo, self).__init__()

        self.seqlen = seqlen
        self.batch_size = batch_size

        self.encoder = TemporalEncoder(
            n_layers=n_layers,
            hidden_size=hidden_size,
            bidirectional=bidirectional,
            add_linear=add_linear,
            use_residual=use_residual,
        )

        self.hmr = hmr()
        checkpoint = torch.load(pretrained)
        self.hmr.load_state_dict(checkpoint['model'], strict=False)

        # regressor can predict cam, pose and shape params in an iterative way
        self.regressor = Regressor()

        if pretrained and os.path.isfile(pretrained):
            pretrained_dict = torch.load(pretrained)['model']
            self.regressor.load_state_dict(pretrained_dict, strict=False)
            print(f'=> loaded pretrained model from \'{pretrained}\'')

    def forward(self, input, J_regressor=None):
        # input size N x T x C x H x W (cropped frames, not features)
        batch_size, seqlen, nc, h, w = input.shape

        feature = self.hmr.feature_extractor(input.reshape(-1, nc, h, w))

        feature = feature.reshape(batch_size, seqlen, -1)
        feature = self.encoder(feature)
        feature = feature.reshape(-1, feature.size(-1))

        smpl_output = self.regressor(feature, J_regressor=J_regressor)

        for s in smpl_output:
            s['theta'] = s['theta'].reshape(batch_size, seqlen, -1)
            s['verts'] = s['verts'].reshape(batch_size, seqlen, -1, 3)
            s['kp_2d'] = s['kp_2d'].reshape(batch_size, seqlen, -1, 2)
            s['kp_3d'] = s['kp_3d'].reshape(batch_size, seqlen, -1, 3)
            s['rotmat'] = s['rotmat'].reshape(batch_size, seqlen, -1, 3, 3)

        return smpl_output
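# --- Usage sketch (not from the original source) ----------------------------------
# A minimal example of running VIBE_Demo on a dummy clip. Assumptions: the SPIN
# checkpoint referenced above exists on disk, the HMR backbone takes 224x224 crops,
# and the batch/sequence sizes are illustrative only.

import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
demo = VIBE_Demo(seqlen=16).to(device).eval()

frames = torch.randn(2, 16, 3, 224, 224, device=device)  # N x T x C x H x W
with torch.no_grad():
    smpl_output = demo(frames)

# The regressor returns a list of dicts; the last entry holds the final iterate.
last = smpl_output[-1]
print(last['theta'].shape)   # (2, 16, 85): camera (3) + pose (72) + shape (10)
print(last['verts'].shape)   # (2, 16, 6890, 3): SMPL mesh vertices per frame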
def get_regressor_output(features):
    from lib.models.spin import Regressor

    batch_size, seqlen = features.shape[:2]
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    model = Regressor().to(device)
    smpl = SMPL(SMPL_MODEL_DIR).to(device)

    pretrained = torch.load('models/model_best.pth.tar',
                            map_location=torch.device('cpu'))['gen_state_dict']

    new_pretrained_dict = {}
    for k, v in pretrained.items():
        if 'regressor' in k:
            # strip the 'regressor.' prefix so keys match the bare Regressor
            new_pretrained_dict[k[10:]] = v
            # adapt mean theta to new batch size
            if 'mean_theta' in k:
                del new_pretrained_dict[k[10:]]
    model.load_state_dict(new_pretrained_dict, strict=False)

    features = features.reshape(batch_size * seqlen, -1)
    features = features.to(device)

    theta = model(features)[-1]

    cam = theta[:, 0:3].contiguous()
    pose = theta[:, 3:75].contiguous()
    shape = theta[:, 75:].contiguous()

    pred_output = smpl(betas=shape,
                       body_pose=pose[:, 3:],
                       global_orient=pose[:, :3],
                       pose2rot=True)
    verts = pred_output.vertices  # , _, _ = smpl(pose, shape)

    verts = verts.reshape(batch_size, seqlen, -1, 3)
    cam = cam.reshape(batch_size, seqlen, -1)

    return verts, cam
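# --- Usage sketch (not from the original source) ----------------------------------
# Hypothetical call to get_regressor_output. Assumptions: `features` are per-frame
# backbone features with SPIN's 2048-dim output, stacked as (batch, seqlen, 2048),
# and the checkpoint path 'models/model_best.pth.tar' hard-coded above exists.

features = torch.randn(4, 16, 2048)
verts, cam = get_regressor_output(features)
print(verts.shape)  # (4, 16, 6890, 3): SMPL vertices per frame
print(cam.shape)    # (4, 16, 3): weak-perspective camera per frame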
class VIBE_Demo(nn.Module):
    def __init__(self,
                 seqlen,
                 batch_size=64,
                 n_layers=1,
                 hidden_size=2048,
                 pretrained='data/vibe_data/spin_model_checkpoint.pth.tar',
                 add_linear=False,
                 bidirectional=False,
                 attention=False,
                 attention_cfg=None,
                 use_residual=True,
                 disable_temporal=False):
        super(VIBE_Demo, self).__init__()

        self.seqlen = seqlen
        self.batch_size = batch_size
        self.disable_temporal = disable_temporal

        if attention:
            cfg = attention_cfg
            self.encoder = TemporalEncoderWAttention(
                hidden_size=hidden_size,
                bidirectional=bidirectional,
                add_linear=add_linear,
                attention_size=cfg.SIZE,
                attention_layers=cfg.LAYERS,
                attention_dropout=cfg.DROPOUT,
                use_residual=use_residual,
            )
        else:
            self.encoder = TemporalEncoder(
                n_layers=n_layers,
                hidden_size=hidden_size,
                bidirectional=bidirectional,
                add_linear=add_linear,
                use_residual=use_residual,
            )

        self.hmr = hmr()
        if torch.cuda.is_available():
            checkpoint = torch.load(pretrained)
        else:
            checkpoint = torch.load(pretrained, map_location=torch.device('cpu'))
        self.hmr.load_state_dict(checkpoint['model'], strict=False)

        # regressor can predict cam, pose and shape params in an iterative way
        self.regressor = Regressor()

        if pretrained and os.path.isfile(pretrained):
            if torch.cuda.is_available():
                pretrained_dict = torch.load(pretrained)['model']
            else:
                pretrained_dict = torch.load(
                    pretrained, map_location=torch.device('cpu'))['model']
            self.regressor.load_state_dict(pretrained_dict, strict=False)
            print(f'=> loaded pretrained model from \'{pretrained}\'')

    def forward(self, input, J_regressor=None):
        # input size N x T x C x H x W (cropped frames, not features)
        batch_size, seqlen, nc, h, w = input.shape

        feature = self.hmr.feature_extractor(input.reshape(-1, nc, h, w))

        if not self.disable_temporal:
            feature = feature.reshape(batch_size, seqlen, -1)
            feature = self.encoder(feature)

        feature = feature.reshape(-1, feature.size(-1))

        smpl_output = self.regressor(feature, J_regressor=J_regressor)

        for s in smpl_output:
            s['theta'] = s['theta'].reshape(batch_size, seqlen, -1)
            s['verts'] = s['verts'].reshape(batch_size, seqlen, -1, 3)
            s['kp_2d'] = s['kp_2d'].reshape(batch_size, seqlen, -1, 2)
            s['kp_3d'] = s['kp_3d'].reshape(batch_size, seqlen, -1, 3)
            s['rotmat'] = s['rotmat'].reshape(batch_size, seqlen, -1, 3, 3)

        return smpl_output
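# --- Construction sketch (not from the original source) ---------------------------
# The attention branch above only reads SIZE, LAYERS and DROPOUT from attention_cfg,
# so any attribute-style object works as a config; the values below are illustrative
# assumptions, not the project's defaults.

from types import SimpleNamespace

attn_cfg = SimpleNamespace(SIZE=1024, LAYERS=1, DROPOUT=0.1)
demo_attn = VIBE_Demo(seqlen=16, attention=True, attention_cfg=attn_cfg)

# disable_temporal=True skips the temporal encoder, so each frame is regressed
# directly from its HMR feature (a per-frame baseline for ablations).
demo_frame = VIBE_Demo(seqlen=16, disable_temporal=True)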
class VIBE(nn.Module):
    def __init__(self,
                 seqlen,
                 batch_size=64,
                 n_layers=1,
                 hidden_size=2048,
                 pretrained='data/vibe_data/spin_model_checkpoint.pth.tar',
                 add_linear=False,
                 bidirectional=False,
                 attention=False,
                 attention_cfg=None,
                 use_residual=True,
                 use_6d=True,
                 disable_temporal=False):
        super(VIBE, self).__init__()

        self.seqlen = seqlen
        self.batch_size = batch_size
        self.disable_temporal = disable_temporal

        if attention:
            cfg = attention_cfg
            self.encoder = TemporalEncoderWAttention(
                hidden_size=hidden_size,
                bidirectional=bidirectional,
                add_linear=add_linear,
                attention_size=cfg.SIZE,
                attention_layers=cfg.LAYERS,
                attention_dropout=cfg.DROPOUT,
                use_residual=use_residual,
            )
        else:
            self.encoder = TemporalEncoder(
                n_layers=n_layers,
                hidden_size=hidden_size,
                bidirectional=bidirectional,
                add_linear=add_linear,
                use_residual=use_residual,
            )

        # regressor can predict cam, pose and shape params in an iterative way
        self.regressor = Regressor(use_6d=use_6d)

        if pretrained and os.path.isfile(pretrained):
            pretrained_dict = torch.load(pretrained)['model']
            if not use_6d:
                del pretrained_dict['decpose.weight']
                del pretrained_dict['decpose.bias']
                del pretrained_dict['fc1.weight']
                del pretrained_dict['fc1.bias']
            self.regressor.load_state_dict(pretrained_dict, strict=False)
            print(f'=> loaded pretrained model from \'{pretrained}\'')

    def forward(self, input, J_regressor=None):
        # input size NTF
        batch_size, seqlen = input.shape[:2]

        if self.disable_temporal:
            feature = input.reshape(-1, input.size(-1))
        else:
            feature = self.encoder(input)
            feature = feature.reshape(-1, feature.size(-1))

        smpl_output = self.regressor(feature, J_regressor=J_regressor)

        for s in smpl_output:
            s['theta'] = s['theta'].reshape(batch_size, seqlen, -1)
            s['verts'] = s['verts'].reshape(batch_size, seqlen, -1, 3)
            s['kp_2d'] = s['kp_2d'].reshape(batch_size, seqlen, -1, 2)
            s['kp_3d'] = s['kp_3d'].reshape(batch_size, seqlen, -1, 3)
            s['rotmat'] = s['rotmat'].reshape(batch_size, seqlen, -1, 3, 3)

        return smpl_output
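# --- Usage sketch (not from the original source) ----------------------------------
# Unlike VIBE_Demo, VIBE consumes pre-extracted per-frame features rather than raw
# frames. Assumptions: 2048-dim features (SPIN backbone) and illustrative sizes.
# use_6d presumably toggles the 6D rotation parameterization of the pose head; when
# it is False, the incompatible decpose/fc1 weights are dropped from the checkpoint
# before loading, as handled in __init__ above.

vibe = VIBE(seqlen=16, batch_size=32)
feats = torch.randn(32, 16, 2048)   # N x T x F features from a frozen backbone
out = vibe(feats)                   # list of dicts with shapes (32, 16, ...)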
class TCMR(nn.Module):
    def __init__(
            self,
            seqlen,
            batch_size=64,
            n_layers=1,
            hidden_size=2048,
            pretrained=osp.join(BASE_DATA_DIR, 'spin_model_checkpoint.pth.tar'),
    ):
        super(TCMR, self).__init__()

        self.seqlen = seqlen
        self.batch_size = batch_size

        self.encoder = TemporalEncoder(
            seq_len=seqlen,
            n_layers=n_layers,
            hidden_size=hidden_size
        )

        # regressor can predict cam, pose and shape params in an iterative way
        self.regressor = Regressor()

        if pretrained and os.path.isfile(pretrained):
            pretrained_dict = torch.load(pretrained)['model']
            self.regressor.load_state_dict(pretrained_dict, strict=False)
            print(f'=> loaded pretrained model from \'{pretrained}\'')

    def forward(self, input, is_train=False, J_regressor=None):
        # input size NTF
        batch_size, seqlen = input.shape[:2]

        feature, scores = self.encoder(input, is_train=is_train)
        feature = feature.reshape(-1, feature.size(-1))

        smpl_output = self.regressor(feature, is_train=is_train, J_regressor=J_regressor)

        if not is_train:
            for s in smpl_output:
                s['theta'] = s['theta'].reshape(batch_size, -1)
                s['verts'] = s['verts'].reshape(batch_size, -1, 3)
                s['kp_2d'] = s['kp_2d'].reshape(batch_size, -1, 2)
                s['kp_3d'] = s['kp_3d'].reshape(batch_size, -1, 3)
                s['rotmat'] = s['rotmat'].reshape(batch_size, -1, 3, 3)
                s['scores'] = scores
        else:
            repeat_num = 3
            for s in smpl_output:
                s['theta'] = s['theta'].reshape(batch_size, repeat_num, -1)
                s['verts'] = s['verts'].reshape(batch_size, repeat_num, -1, 3)
                s['kp_2d'] = s['kp_2d'].reshape(batch_size, repeat_num, -1, 2)
                s['kp_3d'] = s['kp_3d'].reshape(batch_size, repeat_num, -1, 3)
                s['rotmat'] = s['rotmat'].reshape(batch_size, repeat_num, -1, 3, 3)
                s['scores'] = scores

        return smpl_output, scores
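# --- Evaluation sketch (not from the original source) -----------------------------
# Assumptions: 2048-dim features and seqlen=16. At test time (is_train=False) the
# regressor emits one prediction per input window (presumably the mid frame), which
# is why the eval branch above reshapes without a time axis; at train time there are
# repeat_num=3 predictions per window.

tcmr = TCMR(seqlen=16)
feats = torch.randn(8, 16, 2048)
smpl_output, scores = tcmr(feats, is_train=False)
print(smpl_output[-1]['theta'].shape)  # (8, 85)
print(smpl_output[-1]['verts'].shape)  # (8, 6890, 3)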