import torch
import torch.nn as nn
from torchvision import models


def vgg16(input_size=(3, 224, 224), num_classes=1000, pretrained=None):
    model = models.vgg16(pretrained=pretrained)
    # Project-local helper that exposes pretrainedmodels-style attributes
    # (e.g. _features) on the torchvision model.
    model = add_instances_to_torchvisionmodel(model)
    # Swap the first conv layer if the input has a non-default channel count.
    if input_size != (3, 224, 224):
        model._features[0] = nn.Conv2d(input_size[0], 64, kernel_size=(3, 3),
                                       stride=(1, 1), padding=(1, 1))
    model.input_size = input_size
    # Probe the feature extractor with a dummy batch to infer the flattened size.
    test_tensor = torch.randn((1, input_size[0], input_size[1], input_size[2]))
    x = model._features(test_tensor)
    x = x.view(x.size(0), -1)
    # Shrink the hidden classifier width when the feature vector is small.
    second_out_features = min(4096, x.shape[1])
    model.linear0 = nn.Linear(in_features=x.shape[1],
                              out_features=second_out_features, bias=True)
    model.linear1 = nn.Linear(in_features=second_out_features,
                              out_features=second_out_features, bias=True)
    model.last_linear = nn.Linear(in_features=second_out_features,
                                  out_features=num_classes, bias=True)
    return model
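# Usage sketch (not from the original source): constructing the adapted
# VGG-16 for a non-default input size. The 3x64x64 shape and 10-class head
# are hypothetical example values; the classifier widths are re-derived from
# the probe tensor inside vgg16().
small_vgg = vgg16(input_size=(3, 64, 64), num_classes=10)
print(small_vgg.linear0.in_features)  # flattened conv output for a 64x64 input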
def classification_model_vgg(**kwargs):
    base_model = pretrainedmodels.vgg16()
    return ClassificationModelVGG(base_model,
                                  base_model_features=512,
                                  nb_features=6,
                                  base_model_l1_outputs=64,
                                  **kwargs)
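# Sketch (not from the original source) of why base_model_features is 512:
# pretrainedmodels' VGG-16 exposes its convolutional stack as `_features`,
# which ends in a 512-channel map for a 224x224 input.
import torch
import pretrainedmodels

backbone = pretrainedmodels.vgg16()
with torch.no_grad():
    fmap = backbone._features(torch.randn(1, 3, 224, 224))
print(fmap.shape)  # expected: torch.Size([1, 512, 7, 7])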
def fix_frame_extract(frame_path, feats_path, frames_num, model, video_name):
    # load model
    C, H, W = 3, 224, 224
    if model == 'resnet152':
        model = pretrainedmodels.resnet152(pretrained='imagenet')
    elif model == 'vgg16':
        model = pretrainedmodels.vgg16(pretrained='imagenet')
    elif model == 'inception_v4':
        C, H, W = 3, 299, 299
        model = pretrainedmodels.inceptionv4(pretrained='imagenet')
    else:
        # Fail fast instead of falling through with `model` still a string.
        raise ValueError('unsupported model: %s' % model)
    # Drop the classifier so the network outputs feature vectors.
    model.last_linear = utils.Identity()
    model = model.to(device)
    model.eval()
    load_image_fn = utils.LoadTransformImage(model)

    # load data
    img_list = sorted(frame_path.glob('*.jpg'))
    # Sample exactly `frames_num` frames, evenly spaced over the video.
    samples_ix = np.linspace(0, len(img_list) - 1, frames_num).astype(int)
    img_list = [img_list[i] for i in samples_ix]

    # build tensor
    imgs = torch.zeros([len(img_list), C, H, W])
    for i in range(len(img_list)):
        imgs[i] = load_image_fn(img_list[i])
    imgs = imgs.to(device)
    with torch.no_grad():
        feats = model(imgs)
    feats = feats.cpu().numpy()

    # save
    np.save(os.path.join(feats_path, video_name + ".npy"), feats)
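# Usage sketch (not from the original source): the paths, frame count, and
# video name below are illustrative. frame_path must be a pathlib.Path, since
# the function calls .glob() on it.
from pathlib import Path

fix_frame_extract(Path('frames/video0'), 'feats', frames_num=40,
                  model='resnet152', video_name='video0')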
def __init__(self,
             dropout_rate_first: float = 0.2,
             dropout_rate_second: float = 0.1,
             mlp_hidden_dim: int = 128,
             n_mlp_layers: int = 1,
             out_channels: int = 1,
             device: str = 'cuda',
             model_type: str = 'effnet',
             pretrained: bool = True,
             custom_config: Optional[Mapping[str, Any]] = None):
    super().__init__()
    if model_type == 'vgg16':
        pretrained = 'imagenet' if pretrained else None
        self.base_model = pretrainedmodels.vgg16(pretrained=pretrained)
        # Replace the whole fully connected head with identities so the
        # backbone emits raw features.
        self.base_model.linear0 = nn.Identity()
        self.base_model.relu0 = nn.Identity()
        self.base_model.dropout0 = nn.Identity()
        self.base_model.linear1 = nn.Identity()
        self.base_model.relu1 = nn.Identity()
        self.base_model.dropout1 = nn.Identity()
        self.base_model.last_linear = nn.Identity()
        nn_embed_size = 8192
    elif model_type == 'effnet':
        self.base_model = geffnet.tf_efficientnet_b1_ns(pretrained=pretrained)
        self.base_model.classifier = nn.Identity()
        nn_embed_size = 1280
    elif model_type == 'resnet50':
        pretrained = 'imagenet' if pretrained else None
        self.base_model = pretrainedmodels.resnet50(pretrained=pretrained)
        self.base_model.last_linear = nn.Identity()
        nn_embed_size = 2048
    elif model_type == 'custom':
        custom_config = {} if custom_config is None else custom_config
        self.base_model = CustomCNN(**custom_config)
        nn_embed_size = 64
    else:
        raise ValueError(f'{model_type} is invalid model_type')

    self.emb_drop = nn.Dropout(dropout_rate_first)
    self.mlp_layres = []
    for i in range(n_mlp_layers):
        # The first MLP layer consumes the backbone embedding; the rest are
        # hidden-to-hidden.
        in_mlp_dim = nn_embed_size if i == 0 else mlp_hidden_dim
        self.mlp_layres.append(
            nn.Sequential(nn.Linear(in_mlp_dim, mlp_hidden_dim),
                          nn.ELU(),
                          nn.Dropout(dropout_rate_second)))
    self.mlp_layres = nn.ModuleList(self.mlp_layres)
    self.classifier = nn.Linear(mlp_hidden_dim, out_channels)
    self.to(device)
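# Sketch (assumption): a forward pass consistent with the modules defined in
# the __init__ above. The class's real forward method is not shown in this
# snippet, so this only illustrates how the pieces compose.
def forward(self, x):
    emb = self.base_model(x)   # backbone with its head replaced by Identity
    emb = self.emb_drop(emb)
    for layer in self.mlp_layres:
        emb = layer(emb)
    return self.classifier(emb)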
def __init__(self, use_gpu: bool = True):
    super().__init__(use_gpu)
    self.use_gpu = use_gpu
    self.cnn = vgg16()
    # Chain ToPILImage with the model's own preprocessing so raw tensors can
    # be fed in directly.
    self.trans = utils.TransformImage(self.cnn)
    self.trans = transforms.Compose([transforms.ToPILImage(), self.trans])
    if use_gpu:
        self.cnn = self.cnn.cuda()
    self.cnn.eval()
    # Freeze the backbone; it serves only as a fixed feature extractor.
    for param in self.cnn.parameters():
        param.requires_grad = False
def __init__(self, use_gpu: bool = True, transform: bool = True):
    super().__init__()
    self.cnn = pretrainedmodels.vgg16()
    self.tf_image = utils.TransformImage(self.cnn)
    self.transform = transform
    self.use_gpu = use_gpu
    if self.use_gpu:
        self.cnn = self.cnn.cuda()
    self.cnn.eval()
    # Freeze all weights: the extractor is inference-only.
    for param in self.cnn.parameters():
        param.requires_grad = False
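# Usage sketch (not from the original source), mirroring the two frozen
# extractors above: pretrainedmodels' VGG-16 exposes a features() method that
# returns the 4096-dim activations of the fully connected stack, so no
# gradient tracking is needed at inference time.
import torch
import pretrainedmodels

cnn = pretrainedmodels.vgg16()
cnn.eval()
with torch.no_grad():
    feats = cnn.features(torch.randn(2, 3, 224, 224))
print(feats.shape)  # expected: torch.Size([2, 4096])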
def generate_2D_model(opt):
    if opt['model'] == 'inception_v3':
        C, H, W = 3, 299, 299
        model = pretrainedmodels.inceptionv3(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif opt['model'] == 'vgg16':
        C, H, W = 3, 224, 224
        model = pretrainedmodels.vgg16(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif opt['model'] == 'vgg19':
        C, H, W = 3, 224, 224
        model = pretrainedmodels.vgg19(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif opt['model'] == 'resnet50':
        C, H, W = 3, 224, 224
        model = pretrainedmodels.resnet50(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif opt['model'] == 'resnet101':
        C, H, W = 3, 224, 224
        model = pretrainedmodels.resnet101(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif opt['model'] == 'resnet152':
        C, H, W = 3, 224, 224
        model = pretrainedmodels.resnet152(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif opt['model'] == 'inception_v4':
        C, H, W = 3, 299, 299
        model = pretrainedmodels.inceptionv4(num_classes=1000,
                                             pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif opt['model'] == 'nasnet':
        C, H, W = 3, 331, 331
        model = pretrainedmodels.nasnetalarge(num_classes=1001,
                                              pretrained='imagenet+background')
        load_image_fn = utils.LoadTransformImage(model)
    else:
        # Fail fast instead of falling through with `model` undefined.
        raise ValueError("doesn't support %s" % (opt['model']))
    # Replace the classifier with an identity so the model outputs features.
    model.last_linear = utils.Identity()
    model = nn.DataParallel(model)
    # if opt['saved_model'] != '':
    #     model.load_state_dict(torch.load(opt['saved_model']), strict=False)
    model = model.cuda()
    return model
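# Usage sketch (not from the original source): the opt dict mirrors the keys
# read above; 'resnet152' is an arbitrary illustrative choice. A CUDA device
# is required, since generate_2D_model calls .cuda().
feature_model = generate_2D_model({'model': 'resnet152'})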
def extract_feats(frame_path, feats_path, interval, model, video_name):
    """Extract features from the frames of one video.

    :param frame_path: path of frames
    :param feats_path: path to store results
    :param interval: (int) sampling stride used when picking frames
    :param model: name of model
    :param video_name: name used for the saved .npy file
    :return: None
    """
    # load model
    C, H, W = 3, 224, 224
    if model == 'resnet152':
        model = pretrainedmodels.resnet152(pretrained='imagenet')
    elif model == 'vgg16':
        model = pretrainedmodels.vgg16(pretrained='imagenet')
    elif model == 'inception_v4':
        C, H, W = 3, 299, 299
        model = pretrainedmodels.inceptionv4(pretrained='imagenet')
    else:
        raise ValueError('unsupported model: %s' % model)
    # Drop the classifier so the network outputs feature vectors.
    model.last_linear = utils.Identity()
    model = model.to(device)
    model.eval()
    load_image_fn = utils.LoadTransformImage(model)

    # load data
    img_list = sorted(frame_path.glob('*.jpg'))
    # Sample every `interval`-th frame.
    samples_ix = np.arange(0, len(img_list), interval)
    img_list = [img_list[int(i)] for i in samples_ix]

    # build tensor
    imgs = torch.zeros([len(img_list), C, H, W])
    for i in range(len(img_list)):
        imgs[i] = load_image_fn(img_list[i])
    imgs = imgs.to(device)
    with torch.no_grad():
        feats = model(imgs)
    feats = feats.cpu().numpy()

    # save
    np.save(os.path.join(feats_path, video_name + ".npy"), feats)
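# Side-by-side sketch (not from the original source) of the two sampling
# strategies: extract_feats uses a fixed stride, while fix_frame_extract
# draws a fixed number of evenly spaced frames.
import numpy as np

n_frames = 10
print(np.arange(0, n_frames, 3))                    # stride 3 -> [0 3 6 9]
print(np.linspace(0, n_frames - 1, 4).astype(int))  # 4 samples -> [0 3 6 9]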
"--saved_model", dest="saved_model", type=str, default='', help='the pretrained CNN model you want to use to extract_feats') args = parser.parse_args() os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu params = vars(args) if params['model'] == 'inception_v3': C, H, W = 3, 299, 299 model = pretrainedmodels.inceptionv3(pretrained='imagenet') load_image_fn = utils.LoadTransformImage(model) elif params['model'] == 'vgg16': C, H, W = 3, 224, 224 model = pretrainedmodels.vgg16(pretrained='imagenet') load_image_fn = utils.LoadTransformImage(model) elif params['model'] == 'vgg19': C, H, W = 3, 224, 224 model = pretrainedmodels.vgg19(pretrained='imagenet') load_image_fn = utils.LoadTransformImage(model) elif params['model'] == 'resnet50': C, H, W = 3, 224, 224 model = pretrainedmodels.resnet50(pretrained='imagenet') load_image_fn = utils.LoadTransformImage(model) elif params['model'] == 'resnet101': C, H, W = 3, 224, 224 model = pretrainedmodels.resnet101(pretrained='imagenet') load_image_fn = utils.LoadTransformImage(model) elif params['model'] == 'resnet152': C, H, W = 3, 224, 224
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--video_dir',
                        type=str,
                        default='../msrvtt_2017/train-video',
                        help='The video dir that one would like to extract audio files from')
    parser.add_argument('--output_dir',
                        type=str,
                        default='../msrvtt_2017/preprocessed',
                        help='The file output directory')
    parser.add_argument('--output_channels',
                        type=int,
                        default=1,
                        help='The number of output audio channels, default to 1')
    parser.add_argument('--output_frequency',
                        type=int,
                        default=16000,
                        help='The output audio frequency in Hz, default to 16000')
    parser.add_argument('--band_width',
                        type=int,
                        default=160,
                        help='Bandwidth used to sample the audio (in kbps), default to 160')
    parser.add_argument('--model',
                        type=str,
                        default='resnet152',
                        help='The pretrained model to use for extracting image features, default to resnet152')
    parser.add_argument('--gpu',
                        type=str,
                        default='0',
                        help='The CUDA_VISIBLE_DEVICES argument, default to 0')
    parser.add_argument('--n_frame_steps',
                        type=int,
                        default=80,
                        help='The number of frames to extract from a single video')
    opt = parser.parse_args()
    opt = vars(opt)

    if not os.path.exists(opt['output_dir']):
        os.mkdir(opt['output_dir'])

    # Extract audio from the videos, then split it into clips.
    vToA(opt)
    split_audio(opt)

    # Clean up the intermediate .wav files.
    print('cleaning up original .wav files...')
    for file in os.listdir(opt['output_dir']):
        if file.endswith('.wav'):
            os.remove(os.path.join(opt['output_dir'], file))

    os.environ['CUDA_VISIBLE_DEVICES'] = opt['gpu']
    if opt['model'] == 'resnet152':
        C, H, W = 3, 224, 224
        model = pretrainedmodels.resnet152(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif opt['model'] == 'inception_v3':
        C, H, W = 3, 299, 299
        model = pretrainedmodels.inceptionv3(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif opt['model'] == 'vgg16':
        C, H, W = 3, 224, 224
        model = pretrainedmodels.vgg16(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    else:
        # Fail fast instead of falling through with `model` undefined.
        raise ValueError('The image model is not supported')
    model.last_linear = utils.Identity()
    model = nn.DataParallel(model)
    model = model.cuda()
    extract_image_feats(opt, model, load_image_fn)
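# Invocation sketch (not from the original source): the script name
# preprocess.py is hypothetical, and the flag values are illustrative.
#   python preprocess.py --video_dir ../msrvtt_2017/train-video \
#       --output_dir ../msrvtt_2017/preprocessed --model vgg16 --gpu 0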
import pretrainedmodels
import torchvision.datasets as dset
from pretrainedmodels import utils
from torch.utils.data import DataLoader

from file_path_manager import FilePathManager

extractor = pretrainedmodels.vgg16()
captions = dset.CocoCaptions(
    root=FilePathManager.resolve('data/train'),
    annFile=FilePathManager.resolve('data/annotations/captions_train2017.json'),
    # transform=None)
    transform=utils.TransformImage(extractor))
batch_size = 1
dataloader = DataLoader(captions,
                        batch_size=batch_size,
                        shuffle=True,
                        num_workers=1)
# CocoCaptions yields (image, captions) pairs.
for img, caps in dataloader:
    print(f"size: {len(caps)}, {caps}")