Example #1
# `logging`, `os`, `D`, `M`, `root` and `seed` are assumed to be defined
# earlier in the script; the train loader mirrors the val loader below.
data = dict(train=D.NewmanRatner_loader(split='train',
                                        register="ADS",
                                        root=root,
                                        batch_size=16,
                                        shuffle=True),
            val=D.NewmanRatner_loader(split='val',
                                      register="ADS",
                                      root=root,
                                      batch_size=16,
                                      shuffle=False))

config = dict(SpeechEncoder=dict(conv=dict(in_channels=39,
                                           out_channels=64,
                                           kernel_size=6,
                                           stride=2,
                                           padding=0,
                                           bias=False),
                                 rnn=dict(input_size=64,
                                          hidden_size=1024,
                                          num_layers=4,
                                          bidirectional=True,
                                          dropout=0),
                                 att=dict(in_size=2048, hidden_size=128)),
              ImageEncoder=dict(linear=dict(in_size=768, out_size=2 * 1024),
                                norm=True),
              margin_size=0.2)

logging.info('Building model')
net = M.SpeechImage(config)
run_config = dict(max_lr=2 * 1e-4, epochs=50, seed=seed)

logging.info('Training')
folder = os.path.dirname(os.path.abspath(__file__))
M.experiment(net, data, run_config, folder=folder)
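The sizes in this config are coupled: the attention block's in_size (2048) is twice the RNN's hidden_size because the encoder is bidirectional, and the ImageEncoder's out_size (2 * 1024) must equal that same value so that speech and image embeddings share one space. Example #3 below makes the coupling explicit through args.hidden_size_factor. A minimal sketch of that invariant, applied to the config above; check_embedding_sizes is a hypothetical helper, not part of the library:

def check_embedding_sizes(config):
    # Verify that the speech and image embedding sizes agree.
    rnn = config['SpeechEncoder']['rnn']
    num_directions = 2 if rnn['bidirectional'] else 1
    speech_dim = num_directions * rnn['hidden_size']  # 2 * 1024 = 2048
    assert config['SpeechEncoder']['att']['in_size'] == speech_dim
    assert config['ImageEncoder']['linear']['out_size'] == speech_dim

check_embedding_sizes(config)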
Example #2
import logging

# Assumed project imports: in platalea, the dataset loaders and the
# SpeechImage model typically live in these modules.
import platalea.dataset as D
import platalea.basic as M

logging.basicConfig(level=logging.INFO)

logging.info('Loading data')
data = dict(train=D.flickr8k_loader(split='train', batch_size=32,
                                    shuffle=True),
            val=D.flickr8k_loader(split='val', batch_size=32, shuffle=False))
D.Flickr8KData.init_vocabulary(data['train'].dataset)

config = dict(SpeechEncoder=dict(conv=dict(in_channels=39,
                                           out_channels=64,
                                           kernel_size=6,
                                           stride=2,
                                           padding=0,
                                           bias=False),
                                 rnn=dict(input_size=64,
                                          hidden_size=1024,
                                          num_layers=4,
                                          bidirectional=True,
                                          dropout=0),
                                 att=dict(in_size=2048, hidden_size=128)),
              ImageEncoder=dict(linear=dict(in_size=2048, out_size=2 * 1024),
                                norm=True),
              margin_size=0.2)

logging.info('Building model')
net = M.SpeechImage(config)
run_config = dict(max_lr=2 * 1e-4, epochs=config_args.epochs)

logging.info('Training')
M.experiment(net, data, run_config)
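config_args is never defined in this snippet; it presumably comes from command-line parsing earlier in the script. A minimal sketch of how it could be produced, assuming a single --epochs flag (the flag name and default are guesses, not platalea's actual CLI):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--epochs', type=int, default=32,
                    help='number of training epochs')
config_args = parser.parse_args()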
Example #3
# `logging`, `datetime`, `D`, `M`, `args` and `data` are assumed to be
# defined earlier in the script (imports, argument parsing, data loading);
# the conv block matches the one used in the other examples.
config = dict(SpeechEncoder=dict(conv=dict(in_channels=39,
                                           out_channels=64,
                                           kernel_size=6,
                                           stride=2,
                                           padding=0,
                                           bias=False),
                                 rnn=dict(input_size=64,
                                          hidden_size=args.hidden_size_factor,
                                          num_layers=4,
                                          bidirectional=True,
                                          dropout=0),
                                 att=dict(in_size=2 * args.hidden_size_factor,
                                          hidden_size=128)),
              ImageEncoder=dict(linear=dict(in_size=2048,
                                            out_size=2 *
                                            args.hidden_size_factor),
                                norm=True),
              margin_size=0.2)

logging.info('Building model')
net = M.SpeechImage(config)
run_config = dict(
    max_lr=args.cyclic_lr_max,
    min_lr=args.cyclic_lr_min,
    epochs=args.epochs,
    l2_regularization=args.l2_regularization,
)

logging.info('Training')
old_time = datetime.datetime.now()
logging.info(f'Start of training - {old_time}')
M.experiment(net, data, run_config, wandb_mode='disabled')
new_time = datetime.datetime.now()
logging.info(f'End of training - {new_time}')
diff_time = new_time - old_time
logging.info(f'Total duration: {diff_time}')
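Here every hyperparameter is read from args, which is likewise undefined in the snippet. A sketch of the corresponding argument parsing; the default for --cyclic_lr_max matches the max_lr of 2e-4 used in Examples #1 and #2, while the other defaults are placeholders:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--hidden_size_factor', type=int, default=1024)
parser.add_argument('--cyclic_lr_max', type=float, default=2e-4)
parser.add_argument('--cyclic_lr_min', type=float, default=1e-6)
parser.add_argument('--epochs', type=int, default=32)
parser.add_argument('--l2_regularization', type=float, default=0.0)
args = parser.parse_args()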
Example #4
import datetime
import logging
import random

import torch

# Assumed project imports: in platalea, the dataset loaders and the
# SpeechImage model typically live in these modules.
import platalea.dataset as D
import platalea.basic as M


def train(args):
    # Setting general configuration
    torch.manual_seed(args.seed)
    random.seed(args.seed)

    # Logging the arguments
    logging.info('Arguments: {}'.format(args))

    logging.info('Loading data')

    if args.dataset_name == 'flickr8k':
        data = dict(train=D.flickr8k_loader(
            args.flickr8k_root,
            args.flickr8k_meta,
            args.flickr8k_language,
            args.audio_features_fn,
            split='train',
            batch_size=32,
            shuffle=True,
            downsampling_factor=args.downsampling_factor),
                    val=D.flickr8k_loader(args.flickr8k_root,
                                          args.flickr8k_meta,
                                          args.flickr8k_language,
                                          args.audio_features_fn,
                                          split='val',
                                          batch_size=32,
                                          shuffle=False))
    elif args.dataset_name == "spokencoco":
        data = dict(train=D.spokencoco_loader(
            args.spokencoco_root,
            args.spokencoco_meta,
            args.audio_features_fn,
            split='train',
            batch_size=32,
            shuffle=True,
            downsampling_factor=args.downsampling_factor,
            debug=args.debug),
                    val=D.spokencoco_loader(args.spokencoco_root,
                                            args.spokencoco_meta,
                                            args.audio_features_fn,
                                            split='val',
                                            batch_size=32,
                                            shuffle=False,
                                            debug=args.debug))
    else:
        raise ValueError(
            "dataset_name should be in ['flickr8k', 'spokencoco']")

    config = dict(
        SpeechEncoder=dict(conv=dict(in_channels=39,
                                     out_channels=64,
                                     kernel_size=6,
                                     stride=2,
                                     padding=0,
                                     bias=False),
                           rnn=dict(input_size=64,
                                    hidden_size=args.hidden_size_factor,
                                    num_layers=4,
                                    bidirectional=True,
                                    dropout=0),
                           att=dict(in_size=2 * args.hidden_size_factor,
                                    hidden_size=128)),
        ImageEncoder=dict(linear=dict(in_size=2048,
                                      out_size=2 * args.hidden_size_factor),
                          norm=True),
        margin_size=0.2)

    logging.info('Building model')
    net = M.SpeechImage(config)
    run_config = dict(
        max_lr=args.cyclic_lr_max,
        min_lr=args.cyclic_lr_min,
        epochs=args.epochs,
        l2_regularization=args.l2_regularization,
    )

    logging.info('Training')
    old_time = datetime.datetime.now()
    logging.info(f'Start of training - {old_time}')
    M.experiment(net, data, run_config, wandb_mode='disabled')
    new_time = datetime.datetime.now()
    logging.info(f'End of training - {new_time}')
    diff_time = new_time - old_time
    logging.info(f'Total duration: {diff_time}')
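A sketch of a command-line driver for this train function. The flag names mirror the args.* attributes the function reads; all defaults and file names are assumptions:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--dataset_name', default='flickr8k',
                        choices=['flickr8k', 'spokencoco'])
    # Dataset locations and feature files (hypothetical defaults).
    parser.add_argument('--flickr8k_root', default='data/flickr8k')
    parser.add_argument('--flickr8k_meta', default='dataset.json')
    parser.add_argument('--flickr8k_language', default='eng')
    parser.add_argument('--spokencoco_root', default='data/spokencoco')
    parser.add_argument('--spokencoco_meta', default='SpokenCOCO_train.json')
    parser.add_argument('--audio_features_fn', default='mfcc_features.memmap')
    parser.add_argument('--downsampling_factor', type=float, default=None)
    parser.add_argument('--debug', action='store_true')
    # Training hyperparameters, as sketched after Example #3.
    parser.add_argument('--hidden_size_factor', type=int, default=1024)
    parser.add_argument('--cyclic_lr_max', type=float, default=2e-4)
    parser.add_argument('--cyclic_lr_min', type=float, default=1e-6)
    parser.add_argument('--epochs', type=int, default=32)
    parser.add_argument('--l2_regularization', type=float, default=0.0)
    train(parser.parse_args())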