Example #1
import torch
import torch.nn as nn
from torchvision import models
# add_instances_to_torchvisionmodel is a project-local helper, not shown here
def vgg16(input_size=(3, 224, 224), num_classes=1000, pretrained=None):
    model = models.vgg16(pretrained=pretrained)
    model = add_instances_to_torchvisionmodel(model)
    if input_size != (3, 224, 224):
        model._features[0] = nn.Conv2d(input_size[0],
                                       64,
                                       kernel_size=(3, 3),
                                       stride=(1, 1),
                                       padding=(1, 1))
        model.input_size = input_size
    # probe the flattened feature size with a dummy forward pass
    test_tensor = torch.randn(1, *input_size)
    x = model._features(test_tensor)
    x = x.view(x.size(0), -1)
    second_out_features = 4096
    if x.shape[1] < 4096:
        second_out_features = x.shape[1]
    model.linear0 = nn.Linear(in_features=x.shape[1],
                              out_features=second_out_features,
                              bias=True)
    model.linear1 = nn.Linear(in_features=second_out_features,
                              out_features=second_out_features,
                              bias=True)
    model.last_linear = nn.Linear(in_features=second_out_features,
                                  out_features=num_classes,
                                  bias=True)
    return model
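
A minimal usage sketch for the factory above (hypothetical sizes; it assumes the imports shown and that add_instances_to_torchvisionmodel is importable from the surrounding project):

model = vgg16(input_size=(3, 128, 128), num_classes=10)
feats = model._features(torch.randn(1, 3, 128, 128))
print(feats.view(1, -1).shape)  # this flattened width is what model.linear0 expects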
Example #2
def classification_model_vgg(**kwargs):
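    # ClassificationModelVGG is a project-local wrapper (not shown here);
    # 512 matches the channel depth of VGG16's final conv feature map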
    base_model = pretrainedmodels.vgg16()
    return ClassificationModelVGG(base_model,
                                  base_model_features=512,
                                  nb_features=6,
                                  base_model_l1_outputs=64,
                                  **kwargs)
Example #3
import os
import numpy as np
import torch
import pretrainedmodels
from pretrainedmodels import utils

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def fix_frame_extract(frame_path, feats_path, frames_num, model, video_name):
    # load model
    C, H, W = 3, 224, 224
    if model == 'resnet152':
        model = pretrainedmodels.resnet152(pretrained='imagenet')
    elif model == 'vgg16':
        model = pretrainedmodels.vgg16(pretrained='imagenet')
    elif model == 'inception_v4':
        C, H, W = 3, 299, 299
        model = pretrainedmodels.inceptionv4(pretrained='imagenet')
    else:
        raise ValueError('unsupported model: %s' % model)
    model.last_linear = utils.Identity()
    model = model.to(device)
    model.eval()
    load_image_fn = utils.LoadTransformImage(model)

    # load data
    img_list = sorted(frame_path.glob('*.jpg'))
    # get index
    samples_ix = np.linspace(0, len(img_list) - 1, frames_num).astype(int)
    img_list = [img_list[i] for i in samples_ix]
    # build tensor
    imgs = torch.zeros([len(img_list), C, H, W])
    for i in range(len(img_list)):
        img = load_image_fn(img_list[i])
        imgs[i] = img
    imgs = imgs.to(device)
    with torch.no_grad():
        feats = model(imgs)
    feats = feats.cpu().numpy()
    # save
    np.save(os.path.join(feats_path, video_name + ".npy"), feats)
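
A hedged invocation sketch; frame_path being a pathlib.Path is inferred from the .glob call above, and the paths below are placeholders:

from pathlib import Path
fix_frame_extract(Path('frames/video0'), 'feats', frames_num=40,
                  model='vgg16', video_name='video0')
# writes feats/video0.npy with shape (40, 4096) for the VGG16 backbone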
Example #4
    def __init__(self,
                 dropout_rate_first: float = 0.2,
                 dropout_rate_second: float = 0.1,
                 mlp_hidden_dim: int = 128,
                 n_mlp_layers: int = 1,
                 out_channels: int = 1,
                 device: str = 'cuda',
                 model_type: str = 'effnet',
                 pretrained: bool = True,
                 custom_config: Optional[Mapping[str, Any]] = None):
        super().__init__()

        if model_type == 'vgg16':
            pretrained = 'imagenet' if pretrained else None
            self.base_model = pretrainedmodels.vgg16(pretrained=pretrained)
            self.base_model.linear0 = nn.Identity()
            self.base_model.relu0 = nn.Identity()
            self.base_model.dropout0 = nn.Identity()
            self.base_model.linear1 = nn.Identity()
            self.base_model.relu1 = nn.Identity()
            self.base_model.dropout1 = nn.Identity()
            self.base_model.last_linear = nn.Identity()
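            # 8192 = 512 * 4 * 4, which suggests roughly 128x128 inputs
            # (inferred from the value; not stated in this snippet)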
            nn_embed_size = 8192
        elif model_type == 'effnet':
            self.base_model = geffnet.tf_efficientnet_b1_ns(
                pretrained=pretrained)
            self.base_model.classifier = nn.Identity()
            nn_embed_size = 1280
        elif model_type == 'resnet50':
            pretrained = 'imagenet' if pretrained else None
            self.base_model = pretrainedmodels.resnet50(pretrained=pretrained)
            self.base_model.last_linear = nn.Identity()
            nn_embed_size = 2048
        elif model_type == 'custom':
            custom_config = {} if custom_config is None else custom_config
            self.base_model = CustomCNN(**custom_config)
            nn_embed_size = 64
        else:
            raise ValueError(f'{model_type} is an invalid model_type')

        self.emb_drop = nn.Dropout(dropout_rate_first)

        # stack of MLP blocks: Linear -> ELU -> Dropout
        self.mlp_layers = []
        for i in range(n_mlp_layers):
            if i == 0:
                in_mlp_dim = nn_embed_size
            else:
                in_mlp_dim = mlp_hidden_dim
            self.mlp_layers.append(
                nn.Sequential(nn.Linear(in_mlp_dim, mlp_hidden_dim), nn.ELU(),
                              nn.Dropout(dropout_rate_second)))
        self.mlp_layers = nn.ModuleList(self.mlp_layers)

        self.classifier = nn.Linear(mlp_hidden_dim, out_channels)
        self.to(device)
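
The forward pass is not part of this snippet; a minimal sketch consistent with the modules defined above (an assumption, not the project's actual code):

    def forward(self, x):
        emb = self.emb_drop(self.base_model(x))  # backbone embedding
        for layer in self.mlp_layers:
            emb = layer(emb)                     # hidden MLP blocks
        return self.classifier(emb)              # final logits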
Example #5
    def __init__(self, use_gpu: bool = True):
        super().__init__(use_gpu)
        self.use_gpu = use_gpu
        self.cnn = vgg16()  # presumably pretrainedmodels.vgg16, imported at module level
        self.trans = utils.TransformImage(self.cnn)
        self.trans = transforms.Compose([transforms.ToPILImage(), self.trans])
        if use_gpu:
            self.cnn = self.cnn.cuda()
        self.cnn.eval()
        # freeze the backbone: it is used as a fixed feature extractor
        for param in self.cnn.parameters():
            param.requires_grad = False
Example #6
    def __init__(self, use_gpu: bool = True, transform: bool = True):
        super().__init__()

        self.cnn = pretrainedmodels.vgg16()
        self.tf_image = utils.TransformImage(self.cnn)
        self.transform = transform
        self.use_gpu = use_gpu

        if self.use_gpu:
            self.cnn = self.cnn.cuda()
        self.cnn.eval()

        for param in self.cnn.parameters():
            param.requires_grad = False
Example #7
def generate_2D_model(opt):
    if opt['model'] == 'inception_v3':
        C, H, W = 3, 299, 299
        model = pretrainedmodels.inceptionv3(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif opt['model'] == 'vgg16':
        C, H, W = 3, 224, 224
        model = pretrainedmodels.vgg16(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif opt['model'] == 'vgg19':
        C, H, W = 3, 224, 224
        model = pretrainedmodels.vgg19(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif opt['model'] == 'resnet50':
        C, H, W = 3, 224, 224
        model = pretrainedmodels.resnet50(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif opt['model'] == 'resnet101':
        C, H, W = 3, 224, 224
        model = pretrainedmodels.resnet101(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif opt['model'] == 'resnet152':
        C, H, W = 3, 224, 224
        model = pretrainedmodels.resnet152(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif opt['model'] == 'inception_v4':
        C, H, W = 3, 299, 299
        model = pretrainedmodels.inceptionv4(num_classes=1000,
                                             pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif opt['model'] == 'nasnet':
        C, H, W = 3, 331, 331
        model = pretrainedmodels.nasnetalarge(num_classes=1001,
                                              pretrained='imagenet+background')
        load_image_fn = utils.LoadTransformImage(model)
    else:
        raise ValueError("doesn't support %s" % opt['model'])

    model.last_linear = utils.Identity()
    model = nn.DataParallel(model)
    # if opt['saved_model'] != '':
    #     model.load_state_dict(torch.load(opt['saved_model']), strict=False)
    model = model.cuda()
    return model
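
A minimal call sketch (hypothetical opt dict; assumes the pretrainedmodels/utils imports used above and an available GPU):

opt = {'model': 'vgg16'}
model = generate_2D_model(opt)
feats = model(torch.randn(2, 3, 224, 224).cuda())
print(feats.shape)  # torch.Size([2, 4096]): FC-7 activations, since last_linear is Identity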
Example #8
import os
import numpy as np
import torch
import pretrainedmodels
from pretrainedmodels import utils

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def extract_feats(frame_path, feats_path, interval, model, video_name):
    """
    extract feature from frames of one video
    :param video_name:
    :param model: name of model
    :param frame_path: path of frames
    :param feats_path: path to store results
    :param interval: (str) The interval when extract frames from videos
    :return: None
    """
    # load model
    C, H, W = 3, 224, 224
    if model == 'resnet152':
        model = pretrainedmodels.resnet152(pretrained='imagenet')
    elif model == 'vgg16':
        model = pretrainedmodels.vgg16(pretrained='imagenet')
    elif model == 'inception_v4':
        C, H, W = 3, 299, 299
        model = pretrainedmodels.inceptionv4(pretrained='imagenet')
    else:
        raise ValueError('unsupported model: %s' % model)
    model.last_linear = utils.Identity()
    model = model.to(device)
    model.eval()
    load_image_fn = utils.LoadTransformImage(model)

    # load data
    img_list = sorted(frame_path.glob('*.jpg'))
    # get index
    samples_ix = np.arange(0, len(img_list), interval)
    img_list = [img_list[int(i)] for i in samples_ix]
    # build tensor
    imgs = torch.zeros([len(img_list), C, H, W])
    for i in range(len(img_list)):
        img = load_image_fn(img_list[i])
        imgs[i] = img
    imgs = imgs.to(device)
    with torch.no_grad():
        feats = model(imgs)
    feats = feats.cpu().numpy()
    # save
    np.save(os.path.join(feats_path, video_name + ".npy"), feats)
Example #9
    parser.add_argument(
        "--saved_model",
        dest="saved_model",
        type=str,
        default='',
        help='the pretrained CNN model you want to use to extract_feats')

    args = parser.parse_args()
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    params = vars(args)
    if params['model'] == 'inception_v3':
        C, H, W = 3, 299, 299
        model = pretrainedmodels.inceptionv3(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif params['model'] == 'vgg16':
        C, H, W = 3, 224, 224
        model = pretrainedmodels.vgg16(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif params['model'] == 'vgg19':
        C, H, W = 3, 224, 224
        model = pretrainedmodels.vgg19(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif params['model'] == 'resnet50':
        C, H, W = 3, 224, 224
        model = pretrainedmodels.resnet50(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif params['model'] == 'resnet101':
        C, H, W = 3, 224, 224
        model = pretrainedmodels.resnet101(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif params['model'] == 'resnet152':
        C, H, W = 3, 224, 224
        model = pretrainedmodels.resnet152(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
Example #10
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--video_dir',
        type=str,
        default='../msrvtt_2017/train-video',
        help='The video dir that one would like to extract audio file from')
    parser.add_argument('--output_dir',
                        type=str,
                        default='../msrvtt_2017/preprocessed',
                        help='The file output directory')
    parser.add_argument(
        '--output_channels',
        type=int,
        default=1,
        help='The number of output audio channels, default to 1')
    parser.add_argument(
        '--output_frequency',
        type=int,
        default=16000,
        help='The output audio frequency in Hz, default to 16000')
    parser.add_argument(
        '--band_width',
        type=int,
        default=160,
        help=
        'Bandwidth specified to sample the audio (unit in kbps), default to 160'
    )
    parser.add_argument(
        '--model',
        type=str,
        default='resnet152',
        help=
        'The pretrained model to use for extracting image features, default to resnet152'
    )
    parser.add_argument('--gpu',
                        type=str,
                        default='0',
                        help='The CUDA_VISIBLE_DEVICES argument, default to 0')
    parser.add_argument(
        '--n_frame_steps',
        type=int,
        default=80,
        help='The number of frames to extract from a single video')
    opt = parser.parse_args()
    opt = vars(opt)

    if not os.path.exists(opt['output_dir']):
        os.mkdir(opt['output_dir'])
    vToA(opt)
    split_audio(opt)
    print('cleaning up original .wav files...')
    for file in os.listdir(opt['output_dir']):
        if file.endswith('.wav'):
            os.remove(os.path.join(opt['output_dir'], file))

    os.environ['CUDA_VISIBLE_DEVICES'] = opt['gpu']
    if opt['model'] == 'resnet152':
        C, H, W = 3, 224, 224
        model = pretrainedmodels.resnet152(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif opt['model'] == 'inception_v3':
        C, H, W = 3, 299, 299
        model = pretrainedmodels.inceptionv3(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif opt['model'] == 'vgg16':
        C, H, W = 3, 224, 224
        model = pretrainedmodels.vgg16(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    else:
        raise ValueError('The image model is not supported')

    model.last_linear = utils.Identity()
    model = nn.DataParallel(model)

    model = model.cuda()
    extract_image_feats(opt, model, load_image_fn)
Example #11
import pretrainedmodels
import torchvision.datasets as dset
from pretrainedmodels import utils
from torch.utils.data import DataLoader

from file_path_manager import FilePathManager

extractor = pretrainedmodels.vgg16()

captions = dset.CocoCaptions(
    root=FilePathManager.resolve('data/train'),
    annFile=FilePathManager.resolve(
        'data/annotations/captions_train2017.json'),
    # transform=None)
    transform=utils.TransformImage(extractor))
batch_size = 1
dataloader = DataLoader(captions,
                        batch_size=batch_size,
                        shuffle=True,
                        num_workers=1)
for imgs, caps in dataloader:  # each batch is (image_tensor, captions)
    print(f"size: {len(caps)}, {caps}")