def __init__(self, configs):
    """Create the encoder and the two generator heads.

    Args:
        configs: Object handed to ``unpack`` together with the key names
            ``'encoder_configs'``, ``'reconstruct_configs'`` and
            ``'predict_configs'``; the three values it yields configure the
            encoder, the reconstructor and the predictor respectively.
    """
    super(MMNIST_ConvLSTM, self).__init__()

    section_names = [
        'encoder_configs', 'reconstruct_configs', 'predict_configs'
    ]
    encoder_conf, reconstruct_conf, predict_conf = unpack(
        configs, section_names)

    # One ConvLSTM encoder feeding two Generator heads.
    self.encoder = ConvLSTM(encoder_conf)
    self.reconstructor = Generator(reconstruct_conf)
    self.predictor = Generator(predict_conf)
def __init__(self, num_classes=61, mem_size=512, no_cam=False):
    """Set up the backbone, the two attention vectors, the ConvLSTM and the head.

    Args:
        num_classes: Output size of the final linear layer.
        mem_size: Channel count of the ConvLSTM state and input size of the
            final linear layer.
        no_cam: Stored as ``self.noCam``.
    """
    super(NewAttentionModelBi, self).__init__()

    def _new_attention_vector():
        # (1, 512) float tensor filled by normal_(0, .05), moved to CUDA.
        samples = torch.FloatTensor(512).normal_(0, .05)
        return Variable(samples.unsqueeze(0).cuda())

    # Plain configuration attributes.
    self._custom_train_mode = True
    self.mem_size = mem_size
    self.noCam = no_cam
    self.num_classes = num_classes

    # Sub-modules and tensors are created in the same order as before so the
    # random stream and the module registration order are unchanged.
    self.resnet = resnet34(pretrained=True, noBN=True)
    self.attention_rgb = _new_attention_vector()
    self.attention_flow = _new_attention_vector()
    self.lstm_cell = ConvLSTM(1024, mem_size)
    self.avgpool = nn.AvgPool2d(7)
    self.dropout = nn.Dropout(0.7)
    self.fc = nn.Linear(mem_size, num_classes)
    self.classifier = nn.Sequential(self.dropout, self.fc)
def __init__(self,
             num_classes=61,
             mem_size=512,
             no_cam=False,
             enable_motion_segmentation=False):
    """Set up the backbone, the ConvLSTM, the classifier and the MS block.

    Args:
        num_classes: Output size of the final linear layer.
        mem_size: Channel count of the ConvLSTM state and input size of the
            final linear layer.
        no_cam: Stored as ``self.noCam``.
        enable_motion_segmentation: Stored as
            ``self.enable_motion_segmentation``.
    """
    super(AttentionModel, self).__init__()

    # Plain configuration attributes.
    self._custom_train_mode = True
    self.enable_motion_segmentation = enable_motion_segmentation
    self.mem_size = mem_size
    self.noCam = no_cam
    self.num_classes = num_classes

    # Sub-modules, registered in the same order as before.
    backbone = resnet34(pretrained=True, noBN=True)
    self.resnet = backbone
    # Alias of the backbone's final fully-connected weight.
    self.weight_softmax = backbone.fc.weight
    self.lstm_cell = ConvLSTM(512, mem_size)
    self.avgpool = nn.AvgPool2d(7)
    self.dropout = nn.Dropout(0.7)
    self.fc = nn.Linear(mem_size, num_classes)
    self.classifier = nn.Sequential(self.dropout, self.fc)
    self.motion_segmentation = MotionSegmentationBlock()
def __init__(self, in_dim, ae_en_h_dims, ae_de_h_dims, conv_lstm_in_size,
             conv_lstm_in_dim, conv_lstm_h_dim, conv_lstm_kernel_sizes,
             conv_lstm_n_layers, fc_in_dim, fc_h_dims, fc_out_dim, **kwargs):
    """Assemble the mask layer, auto-encoder, ConvLSTM branches and FC head.

    Args:
        in_dim: Passed to ``MaskNet`` (as both size arguments) and to
            ``AutoEncoder``; also the length of the two index lists that
            form the mask.
        ae_en_h_dims: Forwarded to ``AutoEncoder`` as ``en_h_dims``.
        ae_de_h_dims: Forwarded to ``AutoEncoder`` as ``de_h_dims``.
        conv_lstm_in_size: Forwarded to every ``ConvLSTM`` as ``in_size``.
        conv_lstm_in_dim: Forwarded to every ``ConvLSTM`` as ``in_dim``.
        conv_lstm_h_dim: Forwarded to every ``ConvLSTM`` as ``h_dim``.
        conv_lstm_kernel_sizes: Iterable of kernel sizes; one ``ConvLSTM``
            with a square ``(k, k)`` kernel is created per entry.
        conv_lstm_n_layers: Forwarded to every ``ConvLSTM`` as ``num_layers``.
        fc_in_dim: Forwarded to ``FC`` as ``in_dim``.
        fc_h_dims: Forwarded to ``FC`` as ``h_dims``.
        fc_out_dim: Forwarded to ``FC`` as ``out_dim``.
        **kwargs: Optional settings read here:
            ``device`` (default ``'cpu'``), ``mask_thre`` (default 0.0001),
            ``ae_pretrain_weight`` (required state dict), ``if_trainable``,
            ``conv_lstm_batch_first`` (default True), ``conv_lstm_bias``
            (default True), ``only_last_state`` (default True) and
            ``fc_p_dropout`` (default 0.1).

    Raises:
        ValueError: If ``ae_pretrain_weight`` is missing or ``None``.
    """
    super(DeepAP, self).__init__()
    self.device = kwargs.get('device', 'cpu')

    # ---- masked layer ----
    mask = [list(range(in_dim)), list(range(in_dim))]
    self.mask_layer = MaskNet(in_dim, in_dim, mask, device=self.device)
    self.mask_thre = kwargs.get('mask_thre', 0.0001)

    # ---- auto-encoder layer ----
    self.ae = AutoEncoder(in_dim=in_dim,
                          en_h_dims=ae_en_h_dims,
                          de_h_dims=ae_de_h_dims)
    pretrained_state = kwargs.get('ae_pretrain_weight')
    if pretrained_state is None:
        raise ValueError('AutoEncoder not pretrained.')
    self.ae.load_state_dict(pretrained_state)

    # Freeze or unfreeze the whole auto-encoder. Iterating over
    # ``parameters()`` covers every tensor; reaching for a single
    # ``self.ae.weight`` attribute would fail on a module that does not
    # define one and would leave the remaining parameters trainable.
    # ``bool(...)`` is used because ``requires_grad`` only accepts booleans.
    ae_trainable = bool(kwargs.get('if_trainable'))
    for param in self.ae.parameters():
        param.requires_grad = ae_trainable

    # ---- conv_lstm layers: one branch per kernel size ----
    batch_first = kwargs.get('conv_lstm_batch_first', True)
    use_bias = kwargs.get('conv_lstm_bias', True)
    only_last_state = kwargs.get('only_last_state', True)
    self.conv_lstm_list = nn.ModuleList()
    for kernel in conv_lstm_kernel_sizes:
        self.conv_lstm_list.append(
            ConvLSTM(in_size=conv_lstm_in_size,
                     in_dim=conv_lstm_in_dim,
                     h_dim=conv_lstm_h_dim,
                     kernel_size=(kernel, kernel),
                     num_layers=conv_lstm_n_layers,
                     batch_first=batch_first,
                     bias=use_bias,
                     only_last_state=only_last_state,
                     device=self.device))

    # ---- fully-connected layer ----
    # Per the original author's note, fc_in_dim is expected to equal
    # n_conv_lstm * conv_lstm_h_dim; this is not checked here.
    self.fc = FC(in_dim=fc_in_dim,
                 h_dims=fc_h_dims,
                 out_dim=fc_out_dim,
                 p_dropout=kwargs.get('fc_p_dropout', 0.1))
class AttentionModel(nn.Module):
    """ResNet-34 backbone with CAM-based attention, a ConvLSTM and a classifier.

    The model supports a custom training mode (``True``, ``'stage1'``,
    ``'stage2'`` or ``False``) that is forwarded to the backbone and the
    ConvLSTM, and an optional motion-segmentation branch.
    """

    def __init__(self,
                 num_classes=61,
                 mem_size=512,
                 no_cam=False,
                 enable_motion_segmentation=False):
        """Create all sub-modules.

        Args:
            num_classes: Output size of the final linear layer.
            mem_size: Channel count of the ConvLSTM state and input size of
                the final linear layer.
            no_cam: When true, ``forward`` feeds the raw backbone features to
                the ConvLSTM instead of the attention-weighted ones.
            enable_motion_segmentation: When true, ``forward`` also runs the
                motion-segmentation block on every frame.
        """
        super(AttentionModel, self).__init__()
        self.num_classes = num_classes
        self.noCam = no_cam
        self.mem_size = mem_size
        self.enable_motion_segmentation = enable_motion_segmentation
        self.resnet = resnet34(pretrained=True, noBN=True)
        # Alias of the backbone's final fully-connected weight, used to
        # compute the class activation map in ``forward``.
        self.weight_softmax = self.resnet.fc.weight
        self.lstm_cell = ConvLSTM(512, mem_size)
        self.avgpool = nn.AvgPool2d(7)
        self.dropout = nn.Dropout(0.7)
        self.fc = nn.Linear(mem_size, self.num_classes)
        self.classifier = nn.Sequential(self.dropout, self.fc)
        self.motion_segmentation = MotionSegmentationBlock()
        self._custom_train_mode = True

    def train(self, mode=True):
        """Switch the training mode.

        Args:
            mode: One of ``True``, ``'stage2'``, ``'stage1'`` or ``False``.

        Returns:
            ``self``, like ``nn.Module.train``, so that ``model.train()`` and
            ``model.eval()`` can be chained or re-assigned.

        Raises:
            ValueError: If ``mode`` is not one of the accepted values.
        """
        valid_modes = (True, 'stage2', 'stage1', False)
        if mode not in valid_modes:
            # The accepted values mix booleans and strings, so they must be
            # converted before joining.
            raise ValueError('Invalid modes, correct values are: ' +
                             ' '.join(map(str, valid_modes)))

        self._custom_train_mode = mode

        # Only run the full (default) training switch when mode == True.
        super().train(mode == True)  # noqa: E712 - mode may be a string

        # The backbone and the ConvLSTM receive the raw mode value.
        self.resnet.train(mode)
        self.lstm_cell.train(mode)
        if mode == 'stage2' or mode == True:  # noqa: E712
            self.motion_segmentation.train(True)
        if mode != False:  # noqa: E712
            self.classifier.train(True)
        return self

    def get_training_parameters(self, name='all'):
        """Select the parameters to optimise and set their ``requires_grad``.

        Args:
            name: ``'all'`` for main plus motion-segmentation parameters,
                ``'main'`` for backbone/ConvLSTM/classifier parameters,
                ``'ms'`` for motion-segmentation parameters only.

        Returns:
            The requested list of parameters, or ``None`` for any other
            value of ``name``.
        """
        train_params = []
        train_params_ms = []

        # First disable gradients everywhere, then re-enable them only for
        # the parameters that are actually trained.
        for params in self.parameters():
            params.requires_grad = False

        # Each sub-object is responsible for enabling gradients on the
        # parameters it returns.
        train_params += self.resnet.get_training_parameters()
        train_params += self.lstm_cell.get_training_parameters()

        # The last layer is trained at every stage, except when the model is
        # not in training mode.
        if self._custom_train_mode != False:  # noqa: E712
            for params in self.classifier.parameters():
                params.requires_grad = True
                train_params += [params]

        train_params_ms = self.motion_segmentation.get_training_parameters()

        if name == 'all':
            return train_params + train_params_ms
        elif name == 'main':
            return train_params
        elif name == 'ms':
            return train_params_ms
        return None

    def load_weights(self, file_path):
        """Load a state dict from ``file_path``.

        The file may contain either the state dict itself or a dictionary
        holding it under the ``'model_state_dict'`` key.
        """
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        model_dict = torch.load(file_path)
        if 'model_state_dict' in model_dict:
            self.load_state_dict(model_dict['model_state_dict'])
        else:
            self.load_state_dict(model_dict)

    def forward(self, inputVariable):
        """Run the model over a sequence of frames.

        Args:
            inputVariable: Tensor whose dimension 0 is iterated frame by
                frame and whose dimension 1 is used as the batch size.

        Returns:
            A dict with ``'classifications'`` (classifier output),
            ``'ms_feats'`` (per-frame motion-segmentation output, or ``None``
            when that branch is disabled) and ``'lstm_feats'`` (pooled final
            ConvLSTM state fed to the classifier).
        """
        seq_len = inputVariable.size(0)
        batch = inputVariable.size(1)

        # Zero-initialised ConvLSTM state pair; the 7x7 spatial size and the
        # CUDA placement are hard-coded.
        state = (Variable(torch.zeros((batch, self.mem_size, 7, 7)).cuda()),
                 Variable(torch.zeros((batch, self.mem_size, 7, 7)).cuda()))

        ms_feats = None
        if self.enable_motion_segmentation:
            ms_feats = Variable(torch.zeros(seq_len, batch, 49 * 2).cuda())

        for t in range(seq_len):
            logit, feature_conv, feature_convNBN = self.resnet(
                inputVariable[t])
            bz, nc, h, w = feature_conv.size()
            feature_conv1 = feature_conv.view(bz, nc, h * w)

            # Class activation map of the top-scoring class of each sample.
            _, idxs = logit.sort(1, True)
            class_idx = idxs[:, 0]
            cam = torch.bmm(self.weight_softmax[class_idx].unsqueeze(1),
                            feature_conv1)
            attentionMAP = F.softmax(cam.squeeze(1), dim=1)
            attentionMAP = attentionMAP.view(attentionMAP.size(0), 1, 7, 7)
            attentionFeat = feature_convNBN * attentionMAP.expand_as(
                feature_conv)

            if self.enable_motion_segmentation:
                ms_feats[t] = self.motion_segmentation(feature_convNBN)

            if self.noCam:
                state = self.lstm_cell(feature_convNBN, state)
            else:
                state = self.lstm_cell(attentionFeat, state)

        feats1 = self.avgpool(state[1]).view(state[1].size(0), -1)
        feats = self.classifier(feats1)
        return {
            'classifications': feats,
            'ms_feats': ms_feats,
            'lstm_feats': feats1
        }

    def get_class_activation_id(self, inputVariable):
        """Return the first output (the logits) of the backbone."""
        logit, _, _ = self.resnet(inputVariable)
        return logit

    def get_cam_visualisation(self, input_pil_image, preprocess_for_viz,
                              preprocess_for_model):
        """Delegate to the module-level ``get_cam_visualisation`` helper."""
        return get_cam_visualisation(self.resnet, self.weight_softmax,
                                     input_pil_image, preprocess_for_viz,
                                     preprocess_for_model)
class NewAttentionModelBi(nn.Module):
    """Two-stream (RGB + flow) attention model on a shared ResNet-34 backbone.

    Each stream is weighted by its own attention vector; the two feature maps
    are concatenated along the channel dimension and fed to a ConvLSTM whose
    final state is pooled and classified.
    """

    def __init__(self, num_classes=61, mem_size=512, no_cam=False):
        """Create all sub-modules and the two attention vectors.

        Args:
            num_classes: Output size of the final linear layer.
            mem_size: Channel count of the ConvLSTM state and input size of
                the final linear layer.
            no_cam: When true, the attention weighting is skipped and the raw
                backbone features are used.
        """
        super(NewAttentionModelBi, self).__init__()
        self.num_classes = num_classes
        self.noCam = no_cam
        self.mem_size = mem_size
        self.resnet = resnet34(pretrained=True, noBN=True)
        # (1, 512) tensors filled by normal_(0, .05) and moved to CUDA.
        # NOTE(review): these are plain tensors, not nn.Parameter, so they
        # are not part of parameters() or state_dict(); confirm whether they
        # should be saved with checkpoints.
        self.attention_rgb = Variable(
            (torch.FloatTensor(512).normal_(0, .05)).unsqueeze(0).cuda())
        self.attention_flow = Variable(
            (torch.FloatTensor(512).normal_(0, .05)).unsqueeze(0).cuda())
        self.lstm_cell = ConvLSTM(1024, mem_size)
        self.avgpool = nn.AvgPool2d(7)
        self.dropout = nn.Dropout(0.7)
        self.fc = nn.Linear(mem_size, self.num_classes)
        self.classifier = nn.Sequential(self.dropout, self.fc)
        self._custom_train_mode = True

    def train(self, mode=True):
        """Switch the training mode.

        Args:
            mode: One of ``True``, ``'stage2'``, ``'stage1'`` or ``False``.

        Returns:
            ``self``, like ``nn.Module.train``, so that ``model.train()`` and
            ``model.eval()`` can be chained or re-assigned.

        Raises:
            ValueError: If ``mode`` is not one of the accepted values.
        """
        valid_modes = (True, 'stage2', 'stage1', False)
        if mode not in valid_modes:
            # The accepted values mix booleans and strings, so they must be
            # converted before joining.
            raise ValueError('Invalid modes, correct values are: ' +
                             ' '.join(map(str, valid_modes)))

        self._custom_train_mode = mode

        # Only run the full (default) training switch when mode == True.
        super().train(mode == True)  # noqa: E712 - mode may be a string

        # The backbone and the ConvLSTM receive the raw mode value.
        self.resnet.train(mode)
        self.lstm_cell.train(mode)
        if mode != False:  # noqa: E712
            self.classifier.train(True)
        return self

    def get_training_parameters(self):
        """Return the tensors to optimise and set their ``requires_grad``.

        Gradients are first disabled on every registered parameter; the
        backbone and the ConvLSTM then report their own trainable parameters.
        Unless the custom mode is ``False``, the classifier parameters and the
        two attention vectors are enabled and appended as well.
        """
        train_params = []
        for params in self.parameters():
            params.requires_grad = False

        train_params += self.resnet.get_training_parameters()
        train_params += self.lstm_cell.get_training_parameters()

        if self._custom_train_mode != False:  # noqa: E712
            for params in self.classifier.parameters():
                params.requires_grad = True
                train_params += [params]

            self.attention_rgb.requires_grad = True
            train_params += [self.attention_rgb]
            self.attention_flow.requires_grad = True
            train_params += [self.attention_flow]

        return train_params

    def load_weights(self, file_path):
        """Load a state dict from ``file_path``.

        The file may contain either the state dict itself or a dictionary
        holding it under the ``'model_state_dict'`` key.
        """
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        model_dict = torch.load(file_path)
        if 'model_state_dict' in model_dict:
            self.load_state_dict(model_dict['model_state_dict'])
        else:
            self.load_state_dict(model_dict)

    def get_resnet_output_feats(self, resnet, attention, input_frames):
        """Run ``resnet`` on one time step and apply the attention vector.

        Args:
            resnet: Callable returning ``(logit, feature_conv,
                feature_convNBN)`` for ``input_frames``.
            attention: Tensor whose row 0 is repeated once per sample and
                multiplied with the flattened feature map.
            input_frames: Batch of frames for a single time step.

        Returns:
            ``feature_convNBN`` unchanged when ``self.noCam`` is true,
            otherwise ``feature_convNBN`` scaled by the softmax attention map.
        """
        logit, feature_conv, feature_convNBN = resnet(input_frames)
        if self.noCam:
            return feature_convNBN

        bz, nc, h, w = feature_conv.size()
        feature_conv1 = feature_conv.view(bz, nc, h * w)
        # Indexing with a list of zeros repeats row 0 once per sample.
        cam = torch.bmm(
            attention[[0] * input_frames.size(0)].unsqueeze(1),
            feature_conv1)
        attentionMAP = F.softmax(cam.squeeze(1), dim=1)
        attentionMAP = attentionMAP.view(attentionMAP.size(0), 1, 7, 7)
        attentionFeat = feature_convNBN * attentionMAP.expand_as(feature_conv)
        return attentionFeat

    def forward(self, rgb_frames, flow_frames):
        """Classify a pair of frame sequences.

        Args:
            rgb_frames: Tensor whose dimension 0 is iterated step by step and
                whose dimension 1 is used as the batch size.
            flow_frames: Tensor indexed with the same step index as
                ``rgb_frames``.

        Returns:
            A dict with the single key ``'classifications'``.
        """
        batch = rgb_frames.size(1)

        # Zero-initialised ConvLSTM state pair; the 7x7 spatial size and the
        # CUDA placement are hard-coded.
        state = (Variable(torch.zeros((batch, self.mem_size, 7, 7)).cuda()),
                 Variable(torch.zeros((batch, self.mem_size, 7, 7)).cuda()))

        for t in range(rgb_frames.size(0)):
            rgb_feats = self.get_resnet_output_feats(self.resnet,
                                                     self.attention_rgb,
                                                     rgb_frames[t])
            flow_feats = self.get_resnet_output_feats(self.resnet,
                                                      self.attention_flow,
                                                      flow_frames[t])
            # Concatenate both streams along the channel dimension.
            state = self.lstm_cell(torch.cat((rgb_feats, flow_feats), dim=1),
                                   state)

        feats1 = self.avgpool(state[1]).view(state[1].size(0), -1)
        feats = self.classifier(feats1)
        return {'classifications': feats}
# Fixture sizes for the three-layer model built below.
num_layers = 3
batch_size = 3

# Layer 0 consumes the raw input channels; layers 1 and 2 consume the hidden
# channels of the layer before them and are configured identically.
cell_conf_l0 = pack(
    [h_c, active_func, in_c, in_h, in_w, kernel_size, DEBUG],
    ConvLSTMCell.get_init_keys(),
)
hidden_cell_args = [h_c, active_func, h_c, in_h, in_w, kernel_size, DEBUG]
cell_conf_l1 = pack(list(hidden_cell_args), ConvLSTMCell.get_init_keys())
cell_conf_l2 = pack(list(hidden_cell_args), ConvLSTMCell.get_init_keys())
cell_configs = [cell_conf_l0, cell_conf_l1, cell_conf_l2]

# Encoder, reconstructor and predictor all share the same layer layout.
stack_args = [num_layers, cell_configs]
encoder_configs = pack(list(stack_args), ConvLSTM.get_init_keys())
reconstruct_configs = pack(list(stack_args), ConvLSTM.get_init_keys())
predict_configs = pack(list(stack_args), ConvLSTM.get_init_keys())

model_configs = pack(
    [encoder_configs, reconstruct_configs, predict_configs], _KEYS)
model = MMNIST_ConvLSTM(model_configs)

# Random input and target sequences of identical shape.
sample_shape = (batch_size, time_steps, in_c, in_h, in_w)
x_train = to_var(torch.randn(*sample_shape))
x_predict = to_var(torch.randn(*sample_shape))

data = pack([x_train, x_predict, None], ['x_train', 'x_predict', 'states'])
configs = pack([True, 6], ['use_gt', 'max_steps'])
def MMNIST_CONV_LSTM(extra_info):
    """Build the ``HParams`` bundle for the 'MMNIST_CONV_LSTM' model.

    Args:
        extra_info: Accepted for interface compatibility; not read here.

    Returns:
        An ``HParams`` instance holding the loader, model, optimizer and
        main-loop settings plus the random seed.
    """
    _KEYS = ['encoder_configs', 'reconstruct_configs', 'predict_configs']

    # Shared cell settings.
    activation = nn.Tanh()
    height = 64
    width = 64
    kernel = 5
    debug = True
    depth = 3

    def make_cell(hidden_channels, input_channels):
        # One ConvLSTMCell config; only the channel counts vary per layer.
        return pack(
            [hidden_channels, activation, input_channels, height, width,
             kernel, debug],
            ConvLSTMCell.get_init_keys())

    # Channel layout per layer: 1 -> 16 -> 8 -> 8.
    layer_cells = [make_cell(16, 1), make_cell(8, 16), make_cell(8, 8)]

    # Encoder, reconstructor and predictor share the same layer layout.
    encoder_section = pack([depth, layer_cells], ConvLSTM.get_init_keys())
    reconstruct_section = pack([depth, layer_cells], ConvLSTM.get_init_keys())
    predict_section = pack([depth, layer_cells], ConvLSTM.get_init_keys())

    model_section = pack(
        [encoder_section, reconstruct_section, predict_section], _KEYS)
    model_section['name'] = 'MMNIST_CONV_LSTM'

    train_loader = {
        'file_addr': './data/mmnist_train.npy',
        'batch_size': 32,
        'shuffle': True,
        'num_workers': 2
    }
    val_loader = {
        'file_addr': './data/mmnist_val.npy',
        'batch_size': 16,
        'shuffle': False,
        'num_workers': 2
    }
    test_loader = {
        'file_addr': './data/mmnist_test.npy',
        'batch_size': 16,
        'shuffle': False,
        'num_workers': 2
    }

    run_folder = 'mmnist_convLSTM'
    main_section = {
        'clip': 0.25,
        'num_epochs': 60,
        'halve_every': 10,
        'log_dir': './logs/%s' % run_folder,
        'save_dir': './checkpoints/%s' % run_folder
    }
    optimizer_section = {
        'lr': 1e-4,
        'optim_alg': 'RMSprop',
        'weight_decay': 0.9,
        'momentum': 0
    }

    return HParams(trainloader_info=train_loader,
                   valloader_info=val_loader,
                   testloader_info=test_loader,
                   model_info=model_section,
                   optimizer_info=optimizer_section,
                   main_info=main_section,
                   seed=666)