def extract_feats(args):
    """Build a pretrained 2D-CNN feature extractor and run prepro_feats on it.

    Args:
        args: dict-like parameters; reads key 'model' naming the backbone
            ('inception_v3', 'resnet152', or 'inception_v4').

    Raises:
        ValueError: if args['model'] is not a supported backbone.
    """
    params = args
    if params['model'] == 'inception_v3':
        C, H, W = 3, 299, 299
        model = pretrainedmodels.inceptionv3(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif params['model'] == 'resnet152':
        C, H, W = 3, 224, 224
        model = pretrainedmodels.resnet152(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif params['model'] == 'inception_v4':
        C, H, W = 3, 299, 299
        model = pretrainedmodels.inceptionv4(num_classes=1000, pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    else:
        # Original only printed here and then crashed with a NameError on the
        # undefined `model`; fail fast with a clear error instead.
        raise ValueError("doesn't support %s" % (params['model']))
    # Strip the classifier so forward() emits pooled CNN features.
    model.last_linear = utils.Identity()
    model = nn.DataParallel(model)
    model = model.cuda()
    prepro_feats.extract_feats(params, model, load_image_fn)
def generate_C2D_model(opt):
    """Create a pretrained 2D-CNN backbone for frame-feature extraction.

    Args:
        opt: options namespace; reads opt.c2d_model_name, opt.no_cuda,
            opt.device.

    Returns:
        (load_image_fn, model, (C, H, W)) — the model's classifier is replaced
        with an identity so it emits pooled features; (C, H, W) is the
        expected input size.

    Raises:
        ValueError: for an unsupported opt.c2d_model_name.
    """
    if opt.c2d_model_name == 'inception_v3':
        C, H, W = 3, 299, 299
        model = pretrainedmodels.inceptionv3(num_classes=1000, pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif opt.c2d_model_name == 'resnet152':
        C, H, W = 3, 224, 224
        model = pretrainedmodels.resnet152(num_classes=1000, pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif opt.c2d_model_name == 'inception_v4':
        C, H, W = 3, 299, 299
        model = pretrainedmodels.inceptionv4(num_classes=1000, pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif opt.c2d_model_name == 'inceptionresnetv2':
        C, H, W = 3, 299, 299
        model = pretrainedmodels.inceptionresnetv2(num_classes=1000, pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    else:
        # Original had no else branch, so an unknown name crashed with a
        # NameError on `model` below; raise a clear error instead.
        raise ValueError("doesn't support %s" % opt.c2d_model_name)
    model.last_linear = utils.Identity()
    if not opt.no_cuda:
        model = model.to(opt.device)
    return load_image_fn, model, (C, H, W)
def inceptionv3():
    """Return an ImageNet-pretrained Inception-v3 with a 4-class head."""
    nb_classes = 4
    net = pretrainedmodels.inceptionv3(1000, pretrained='imagenet')
    # my_inception_v3 = torchvision.models.inception_v3(pretrained=True)
    in_dim = net.last_linear.in_features  # =2048
    net.last_linear = nn.Linear(in_dim, nb_classes)
    return net
def __init__(self, use_dropout=False):
    """Wrap a pretrained Inception-v3 as a feature body.

    use_dropout: stored flag only — presumably consumed by forward()
        elsewhere in the class (not visible here).
    """
    super(InceptionV3Body, self).__init__()
    self.use_dropout = use_dropout
    backbone = pretrainedmodels.inceptionv3(pretrained='imagenet')
    self.model = backbone
    self.features = backbone.features
    # Width of the final (classifier) layer's input.
    self.input_size = list(backbone.children())[-1].in_features
def inceptionv3(input_size=(299, 299, 3), num_classes=1000, pretrained=None):
    """Build an Inception-v3, optionally adapting the stem to a new input size.

    Args:
        input_size: expected input size tuple; default (299, 299, 3).
        num_classes: size of the classifier output (was ignored before;
            now forwarded to the constructor).
        pretrained: pretrained-weights spec forwarded to the constructor
            (was ignored before).

    Returns:
        The constructed model (the original fell off the end and returned
        None).
    """
    model = models.inceptionv3(num_classes=num_classes, pretrained=pretrained)
    if input_size != (299, 299, 3):
        # NOTE(review): input_size[0] is used as in_channels, which assumes a
        # (C, H, W)-style tuple even though the default reads as (H, W, C) —
        # confirm against callers.
        model.features[0].conv = nn.Conv2d(input_size[0], 32, kernel_size=3,
                                           stride=2, bias=False)
    return model
def __init__(self, use_gpu: bool = True, transform: bool = True):
    """Frozen Inception-v3 extractor: 2048-d global + 64x2048 region features."""
    super().__init__()
    print('USING InceptionV3Extractor')
    backbone = pretrainedmodels.inceptionv3()
    self.cnn = backbone
    self.tf_image = utils.TransformImage(backbone)
    self.transform = transform
    self.use_gpu = use_gpu
    if self.use_gpu:
        self.cnn = self.cnn.cuda()
    self.cnn.eval()
    self.features_size = 2048
    self.regions_count = 64
    self.regions_features_size = 2048
    # Inference-only: freeze every backbone parameter.
    for weight in self.cnn.parameters():
        weight.requires_grad = False
def generate_2D_model(opt):
    """Build a pretrained 2D CNN with its classifier stripped, on CUDA.

    Args:
        opt: dict with key 'model' naming the backbone.

    Returns:
        A DataParallel-wrapped CUDA model whose last_linear is an Identity.

    Raises:
        ValueError: when opt['model'] is not supported (original printed and
        then crashed with a NameError on the undefined `model`).
    """
    # (input H/W, constructor) per supported backbone; C is always 3.
    backbones = {
        'inception_v3': (299, lambda: pretrainedmodels.inceptionv3(pretrained='imagenet')),
        'vgg16': (224, lambda: pretrainedmodels.vgg16(pretrained='imagenet')),
        'vgg19': (224, lambda: pretrainedmodels.vgg19(pretrained='imagenet')),
        'resnet50': (224, lambda: pretrainedmodels.resnet50(pretrained='imagenet')),
        'resnet101': (224, lambda: pretrainedmodels.resnet101(pretrained='imagenet')),
        'resnet152': (224, lambda: pretrainedmodels.resnet152(pretrained='imagenet')),
        'inception_v4': (299, lambda: pretrainedmodels.inceptionv4(num_classes=1000, pretrained='imagenet')),
        'nasnet': (331, lambda: pretrainedmodels.nasnetalarge(num_classes=1001, pretrained='imagenet+background')),
    }
    if opt['model'] not in backbones:
        raise ValueError("doesn't support %s" % (opt['model']))
    size, build = backbones[opt['model']]
    C, H, W = 3, size, size
    model = build()
    load_image_fn = utils.LoadTransformImage(model)
    # Replace the classifier with an identity so forward() emits features.
    model.last_linear = utils.Identity()
    model = nn.DataParallel(model)
    # if opt['saved_model'] != '':
    #     model.load_state_dict(torch.load(opt['saved_model']), strict=False)
    model = model.cuda()
    return model
def get_model(model_name='resnext101_32x16d_wsl', n_classes=6, raw=False):
    """Build a pretrained backbone and resize its classifier to n_classes.

    model_name: 'eff*' EfficientNet variants, 'resnext101_32x*wsl' hub
        models, or 'inceptionv3'.
    raw: when True, swap the stem conv for single-channel input (only
        supported for effnet b2/b4, resnext, and inceptionv3).
    """
    if model_name.startswith('eff'):
        net = EfficientNet.from_pretrained(model_name)
        if net is None:
            raise Exception('failed to instantiate model: ' + model_name)
        if model_name.endswith(('b1', 'b0')):
            feat_dim = 1280
            if raw:
                raise Exception('Checkout for other effnet types the channel size out last output')
        elif model_name.endswith('b2'):
            if raw:
                net._conv_stem = Conv2d(1, 32, kernel_size=3, stride=2, bias=False)
                # torch.nn.init.xavier_normal_(net._conv_stem.weight)
            feat_dim = 1408
        elif model_name.endswith('b4'):
            if raw:
                net._conv_stem = Conv2d(1, 48, kernel_size=3, stride=2, bias=False, padding=(0, 1))
                # torch.nn.init.xavier_normal_(net._conv_stem.weight)
            feat_dim = 1792
        else:
            raise Exception('Checkout for other effnet types the channel size out last output')
        net._fc = torch.nn.Linear(feat_dim, n_classes)
        return net

    if model_name.startswith('resnext101_32x') and model_name.endswith('wsl'):
        net = torch.hub.load('facebookresearch/WSL-Images', model_name)
        if raw:
            net.conv1 = torch.nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        net.fc = torch.nn.Linear(2048, n_classes)
        return net

    if model_name == 'inceptionv3':
        net = inceptionv3()
        net.last_linear = torch.nn.Linear(2048, n_classes)
        if raw:
            net.Conv2d_1a_3x3 = BasicConv2d(1, 32, kernel_size=3, stride=2)
        return net

    raise Exception('failed to instantiate model: ' + model_name)
def main(args):
    """Train a MIL multi-label classifier over pretrained CNN features.

    Args:
        args: parsed argparse Namespace (model, coco_labels, batch_size,
            learning-rate / decay settings, checkpoint_path, ...).

    Raises:
        ValueError: if args.model is not one of the supported backbones.
    """
    global C, H, W
    coco_labels = json.load(open(args.coco_labels))
    num_classes = coco_labels['num_classes']
    if args.model == 'inception_v3':
        C, H, W = 3, 299, 299
        model = pretrainedmodels.inceptionv3(pretrained='imagenet')
    elif args.model == 'resnet152':
        C, H, W = 3, 224, 224
        model = pretrainedmodels.resnet152(pretrained='imagenet')
    elif args.model == 'inception_v4':
        C, H, W = 3, 299, 299
        model = pretrainedmodels.inceptionv4(num_classes=1000, pretrained='imagenet')
    else:
        # Original indexed the Namespace (args['model'] -> TypeError) and then
        # fell through to a NameError on `model`; raise a clear error instead.
        raise ValueError("doesn't support %s" % args.model)
    load_image_fn = utils.LoadTransformImage(model)
    dim_feats = model.last_linear.in_features
    model = MILModel(model, dim_feats, num_classes)
    model = model.cuda()
    dataset = CocoDataset(coco_labels)
    dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)
    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate,
                           weight_decay=args.weight_decay)
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=args.learning_rate_decay_every,
        gamma=args.learning_rate_decay_rate)
    crit = nn.MultiLabelSoftMarginLoss()
    if not os.path.isdir(args.checkpoint_path):
        os.mkdir(args.checkpoint_path)
    train(dataloader, model, crit, optimizer, exp_lr_scheduler, load_image_fn, args)
def getNetwork(args):
    """Resolve args.net_type (plus args.depth for VGG/ResNet) to a network.

    Returns:
        (net, file_name) where file_name is the checkpoint base name.
        Exits the process (sys.exit(1)) on an unknown type or VGG depth.
    """
    net_type = args.net_type
    if net_type == 'alexnet':
        return models.alexnet(pretrained=args.finetune), 'alexnet'
    if net_type == 'vggnet':
        vgg_builders = {11: models.vgg11, 13: models.vgg13,
                        16: models.vgg16, 19: models.vgg19}
        builder = vgg_builders.get(args.depth)
        if builder is None:
            print(
                'Error : VGGnet should have depth of either [11, 13, 16, 19]')
            sys.exit(1)
        return builder(pretrained=args.finetune), 'vgg-%s' % (args.depth)
    if net_type == 'squeezenet':
        return models.squeezenet1_0(pretrained=args.finetune), 'squeeze'
    if net_type == 'resnet':
        return resnet(args.finetune, args.depth), 'resnet-%s' % (args.depth)
    if net_type == 'inception':
        return (pretrainedmodels.inceptionv3(num_classes=1000, pretrained='imagenet'),
                'inception-v3')
    if net_type == 'xception':
        return (pretrainedmodels.xception(num_classes=1000, pretrained='imagenet'),
                'xception')
    print(
        'Error : Network should be either [alexnet / squeezenet / vggnet / resnet]'
    )
    sys.exit(1)
# NOTE(review): this fragment starts mid-call — the matching
# parser.add_argument( opener for the first help= keyword below lies outside
# the visible region.
                    help='how many frames to sampler per video')
parser.add_argument("--video_path", dest='video_path', type=str,
                    default='data/MSR-VTT_Lite/Train_Val_Video',
                    help='path to video dataset')
parser.add_argument("--model", dest="model", type=str, default='resnet152',
                    help='the CNN model you want to use to extract_feats')
args = parser.parse_args()
params = vars(args)
# Pick the backbone plus its matching input size and image-loading transform.
if params['model'] == 'inception_v3':
    C, H, W = 3, 299, 299
    model = pretrainedmodels.inceptionv3(pretrained='imagenet')
    load_image_fn = utils.LoadTransformImage(model)
elif params['model'] == 'resnet152':
    C, H, W = 3, 224, 224
    model = pretrainedmodels.resnet152(pretrained='imagenet')
    load_image_fn = utils.LoadTransformImage(model)
elif params['model'] == 'inception_v4':
    C, H, W = 3, 299, 299
    model = pretrainedmodels.inceptionv4(num_classes=1000, pretrained='imagenet')
    load_image_fn = utils.LoadTransformImage(model)
else:
    # NOTE(review): prints but does not raise — any later use of `model`
    # would hit a NameError for unsupported names.
    print("doesn't support %s" % (params['model']))
def InceptionV3(num_classes, pretrained=False):
    """Inception-v3 with adaptive average pooling and a num_classes head.

    Args:
        num_classes: size of the replacement classifier.
        pretrained: load ImageNet weights when True; random init otherwise.

    Returns:
        The configured model.
    """
    # Original only assigned `model` when pretrained was True, so
    # pretrained=False crashed with a NameError; build the net in both cases.
    model = inceptionv3(pretrained='imagenet' if pretrained else None)
    model.avg_pool = nn.AdaptiveAvgPool2d(1)
    model.last_linear = nn.Linear(2048, num_classes)
    return model
def main():
    """Extract audio from videos, split it, then extract CNN image features.

    Command-line driven: parses paths/audio settings, runs vToA + split_audio,
    deletes the intermediate .wav files, builds the chosen pretrained CNN
    (classifier stripped) with its image-loading transform, and runs
    extract_image_feats.

    Raises:
        ValueError: if the requested image model is not supported.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--video_dir', type=str, default='../msrvtt_2017/train-video',
        help='The video dir that one would like to extract audio file from')
    parser.add_argument('--output_dir', type=str,
                        default='../msrvtt_2017/preprocessed',
                        help='The file output directory')
    parser.add_argument(
        '--output_channels', type=int, default=1,
        help='The number of output audio channels, default to 1')
    parser.add_argument(
        '--output_frequency', type=int, default=16000,
        help='The output audio frequency in Hz, default to 16000')
    parser.add_argument(
        '--band_width', type=int, default=160,
        help=
        'Bandwidth specified to sample the audio (unit in kbps), default to 160'
    )
    parser.add_argument(
        '--model', type=str, default='resnet152',
        help=
        'The pretrained model to use for extracting image features, default to resnet152'
    )
    parser.add_argument('--gpu', type=str, default='0',
                        help='The CUDA_VISIBLE_DEVICES argument, default to 0')
    parser.add_argument(
        '--n_frame_steps', type=int, default=80,
        help='The number of frames to extract from a single video')
    opt = parser.parse_args()
    opt = vars(opt)

    if not os.path.exists(opt['output_dir']):
        os.mkdir(opt['output_dir'])
    vToA(opt)
    split_audio(opt)

    # Remove the intermediate full-length .wav files left behind by vToA.
    # (Original shadowed the builtin `dir` here.)
    print('cleaning up original .wav files...')
    out_dir = opt['output_dir']
    for fname in os.listdir(out_dir):
        if fname.endswith('.wav'):
            os.remove(os.path.join(out_dir, fname))

    # Must be set before the model below creates a CUDA context.
    os.environ['CUDA_VISIBLE_DEVICES'] = opt['gpu']
    if opt['model'] == 'resnet152':
        C, H, W = 3, 224, 224
        model = pretrainedmodels.resnet152(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif opt['model'] == 'inception_v3':
        C, H, W = 3, 299, 299
        model = pretrainedmodels.inceptionv3(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif opt['model'] == 'vgg16':
        C, H, W = 3, 224, 224
        model = pretrainedmodels.vgg16(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    else:
        # Original printed and then fell through to a NameError on `model`.
        raise ValueError('The image model is not supported')
    model.last_linear = utils.Identity()
    model = nn.DataParallel(model)
    model = model.cuda()
    extract_image_feats(opt, model, load_image_fn)
parser.add_argument('--gpu_id', type=int, default=0, help='the gpu id to use')
args = parser.parse_args()
# Prefer the requested GPU when CUDA is available, else fall back to CPU.
args.device = torch.device(
    'cuda:' + str(args.gpu_id) if torch.cuda.is_available() else 'cpu')
# Features are written under a per-model subdirectory.
args.output_dir = os.path.join(args.output_dir, args.model)
os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
params = vars(args)
# Pick the backbone plus its matching input size and image-loading transform.
if params['model'] == 'inception_v3':
    C, H, W = 3, 299, 299
    model = pretrainedmodels.inceptionv3(num_classes=1000, pretrained='imagenet')
    load_image_fn = utils.LoadTransformImage(model)
elif params['model'] == 'resnet152':
    C, H, W = 3, 224, 224
    model = pretrainedmodels.resnet152(num_classes=1000, pretrained='imagenet')
    load_image_fn = utils.LoadTransformImage(model)
elif params['model'] == 'inception_v4':
    C, H, W = 3, 299, 299
    model = pretrainedmodels.inceptionv4(num_classes=1000, pretrained='imagenet')
    load_image_fn = utils.LoadTransformImage(model)
# NOTE(review): chunk is truncated mid-branch — the body of this elif lies
# outside the visible region.
elif params['model'] == 'inceptionresnetv2':
def Model_builder(configer):
    """Build the classification model named in configer and place it on GPU(s).

    Args:
        configer: project config object; reads model name / pretrained flag /
            parallelism flags from configer.model, the class count from
            configer.dataset_cfg, and GPU ids from configer.train_cfg.

    Returns:
        The model with its head resized to the dataset's class count, wrapped
        in DataParallel / DistributedDataParallel if configured, moved to
        `device` (a module-level global — not defined in this block).

    Raises:
        ImportError: for an unsupported model name.
    """
    model_name = configer.model['name']
    No_classes = configer.dataset_cfg["id_cfg"]["num_classes"]
    model_pretrained = configer.model['pretrained']
    model_dataparallel = configer.model["DataParallel"]
    model_gpu_replica = configer.model["Multi_GPU_replica"]
    gpu_ids = configer.train_cfg["gpu"]

    # One constructor per supported backbone. All pretrainedmodels nets expose
    # a `last_linear` head, so the head swap below is uniform — this replaces
    # eleven copy-pasted elif branches.
    backbones = {
        "Inceptionv3": PM.inceptionv3,
        "Xception": PM.xception,
        "VGG_19": PM.vgg19,
        "Resnet18": PM.resnet18,
        "Resnet50": PM.resnet50,
        "Resnet101": PM.resnet101,
        "Resnet152": PM.resnet152,
        "Resnet34": PM.resnet34,
        "Densenet121": PM.densenet121,
        "ResNeXt101-32": PM.resnext101_32x4d,
        "ResNeXt101-64": PM.resnext101_64x4d,
    }

    if model_name in backbones:
        model = backbones[model_name](num_classes=1000,
                                      pretrained=model_pretrained)
        d = model.last_linear.in_features
        model.last_linear = nn.Linear(d, No_classes)
    elif model_name == "MobilenetV2":
        model = MobileNetV2(n_class=No_classes)
    else:
        raise ImportError("Model Architecture not supported")

    # Performing Data Parallelism if configured
    if model_dataparallel:
        model = torch.nn.DataParallel(model.to(device), device_ids=gpu_ids)
    elif model_gpu_replica:
        # Bug fix: rank must be < world_size; the original passed rank=1 with
        # world_size=1, which torch.distributed rejects.
        torch.distributed.init_process_group(backend='nccl',
                                             world_size=1,
                                             rank=0)
        model = torch.nn.DistributedDataParallel(model.to(device),
                                                 device_ids=gpu_ids)
    else:
        model = model.to(device)

    print('---------- Model Loaded')
    return model
# NOTE(review): fragment starts mid-expression — the opening of this dict
# comprehension (e.g. `dataloaders = {`) lies outside the visible region.
    x: DataLoader(image_datasets[x], batch_size=opt.batch_size, shuffle=True,
                  **kwargs)
    for x in mode
}
class_names = image_datasets['train'].classes
dataset_size = {x: len(image_datasets[x]) for x in mode}
print('#training images \n')
print(dataset_size)
# define my net and criterion optimizer
my_inception_v3 = pretrainedmodels.inceptionv3(1000, pretrained='imagenet')
#my_inception_v3 = torchvision.models.inception_v3(pretrained=True)
dim_feats = my_inception_v3.last_linear.in_features  # =2048
nb_classes = 4
my_inception_v3.last_linear = nn.Linear(dim_feats, nb_classes)
# NOTE(review): .to(device) right after .cuda() is redundant unless `device`
# differs from the default CUDA device — confirm intended device handling.
my_inception_v3 = nn.DataParallel(my_inception_v3).cuda()
my_inception_v3 = my_inception_v3.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(my_inception_v3.parameters(), lr=opt.lr, momentum=0.9,
                      weight_decay=5e-4)