type=float, default=1.0, help="Margin paramater for triplet loss") parser.add_argument("--input-length", "-L", type=int, default=2048, help="number of input frames", choices=[1024, 2048]) args = parser.parse_args() print(args) # Setup loaders, models and loss train_loader = torch.utils.data.DataLoader(dataloaders.ImageCaptionDataset( args.data_train, audio_conf={'target_length': args.input_length}), batch_size=args.batch_size, shuffle=True, num_workers=8, pin_memory=True) val_loader = torch.utils.data.DataLoader(dataloaders.ImageCaptionDataset( args.data_val, audio_conf={'target_length': args.input_length}, image_conf={'center_crop': True}), batch_size=args.batch_size, shuffle=False, num_workers=8, pin_memory=True) audio_model = models.DaveNet(embedding_dim=args.input_length)
help="audio model architecture", choices=["Davenet"]) parser.add_argument("--image-model", type=str, default="VGG16", help="image model architecture", choices=["VGG16"]) parser.add_argument("--pretrained-image-model", action="store_true", dest="pretrained_image_model", help="Use an image network pretrained on ImageNet") parser.add_argument("--margin", type=float, default=1.0, help="Margin paramater for triplet loss") parser.add_argument("--input-length", "-L", type=int, default=2048, help="number of input frames", choices=[1024, 2048]) args = parser.parse_args() print(args) # Setup loaders, models and loss train_loader = torch.utils.data.DataLoader( dataloaders.ImageCaptionDataset(args.data_train, audio_conf={'target_length': args.input_length}, image_conf={'center_crop': True}), batch_size=args.batch_size, shuffle=True, num_workers=8, pin_memory=True) val_loader = torch.utils.data.DataLoader( dataloaders.ImageCaptionDataset(args.data_val, audio_conf={'target_length': args.input_length}, image_conf={'center_crop': True}), batch_size=args.batch_size, shuffle=False, num_workers=8, pin_memory=True) audio_model = models.ConvX3AudioNet(input_length=args.input_length) image_model = models.VGG16() if bool(args.train_path): audio_model.load_state_dict(torch.load("%s/models/best_audio_model.pth" % args.train_path), strict=False) criterion = DotLoss() # Set up the optimizer
args.cuda = not args.no_cuda and torch.cuda.is_available() resume = args.resume if args.resume: assert(bool(args.exp_dir)) with open("%s/args.pkl" % args.exp_dir, "rb") as f: args = pickle.load(f) args.resume = resume print(args) para = {"num_workers":8, "pin_memory":True} if args.cuda else {} train_loader = torch.utils.data.DataLoader( dataloaders.ImageCaptionDataset(args.data_train), batch_size=args.batch_size, shuffle=True, **para) val_loader = torch.utils.data.DataLoader( dataloaders.ImageCaptionDataset(args.data_val, image_conf={'center_crop':True}), batch_size=args.batch_size, shuffle=False, **para) audio_model = models.Davenet() image_model = models.VGG16(pretrained=args.pretrained_image_model) if not bool(args.exp_dir): print("exp_dir not specified, automatically creating one...") now = datetime.datetime.now(dateutil.tz.tzlocal()) timestamp = now.strftime('%Y_%m_%d_%H_%M_%S') args.exp_dir = "exp/Data-%s/AudioModel-%s_ImageModel-%s_Optim-%s_LR-%s_Epochs-%s_%s" % ( os.path.basename(args.data_train), args.audio_model, args.image_model, args.optim,
help="matchmap similarity function", choices=["SISA", "MISA", "SIMA"]) args = parser.parse_args() resume = args.resume if args.resume: assert (bool(args.exp_dir)) with open("%s/args.pkl" % args.exp_dir, "rb") as f: args = pickle.load(f) args.resume = resume print(args) train_loader = torch.utils.data.DataLoader(dataloaders.ImageCaptionDataset( args.data_train), batch_size=args.batch_size, shuffle=True, num_workers=8, pin_memory=True) val_loader = torch.utils.data.DataLoader(dataloaders.ImageCaptionDataset( args.data_val, image_conf={'center_crop': True}), batch_size=args.batch_size, shuffle=False, num_workers=8, pin_memory=True) # Pick right model based on string input models_module = __import__("models") audio_class = getattr(models_module, args.audio_model)