Example 1
            split='train', batch_size=batch_size, shuffle=True,
            downsampling_factor=ds_factor_text),
        val=D.flickr8k_loader(split='val', batch_size=batch_size))
else:
    data_asr = data

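# Either load a previously trained ASR/SLT model from disk or build one and
# train it from scratch.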
if args.asr_model_dir:
    net = torch.load(os.path.join(args.asr_model_dir, 'net.best.pt'))
else:
    logging.info('Building ASR/SLT model')
    config = M1.get_default_config()
    net = M1.SpeechTranscriber(config)
    run_config = dict(max_norm=2.0, max_lr=2 * 1e-4, epochs=32)
    logging.info('Training ASR/SLT')
    if data_asr['train'].dataset.is_slt():
        M1.experiment(net, data_asr, run_config, slt=True)
        copy_best('.', 'result.json', 'asr.best.pt', experiment_type='slt')
    else:
        M1.experiment(net, data_asr, run_config)
        copy_best('.', 'result.json', 'asr.best.pt', experiment_type='asr')
    copyfile('result.json', 'result_asr.json')
    net = torch.load('asr.best.pt')

logging.info('Extracting ASR/SLT transcriptions')
for set_name in ['train', 'val']:
    ds = data[set_name].dataset
    hyp_asr, ref_asr = extract_trn(net, ds, use_beam_decoding=True)
    # Replace the original transcriptions with the ASR/SLT output
    for i in range(len(hyp_asr)):
        item = ds.split_data[i]
        if item[2] == ref_asr[i]:
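# The snippet is cut off here. A plausible body for the check above (hypothetical,
# not taken from the original source) would swap the reference transcription for
# the ASR/SLT hypothesis; the 3-tuple layout of ds.split_data is an assumption
# made only for illustration:
#
#     ds.split_data[i] = (item[0], item[1], hyp_asr[i])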
Example 2
                                num_layers=6,
                                bidirectional=True,
                                dropout=dropout),
                       rnn_layer_type=nn.GRU),
    TextDecoder=dict(
        emb=dict(num_embeddings=fd.vocabulary_size(),
                 embedding_dim=hidden_size),
        drop=dict(p=dropout),
        att=dict(in_size_enc=hidden_size * 2,
                 in_size_state=hidden_size,
                 hidden_size=hidden_size),
        rnn=dict(input_size=hidden_size * 3,
                 hidden_size=hidden_size,
                 num_layers=1,
                 dropout=dropout),
        out=dict(in_features=hidden_size * 3,
                 out_features=fd.vocabulary_size()),
        rnn_layer_type=nn.GRU,
        max_output_length=400,  # max length for flickr annotations is 199
        sos_id=fd.get_token_id(fd.sos),
        eos_id=fd.get_token_id(fd.eos),
        pad_id=fd.get_token_id(fd.pad)),
    inverse_transform_fn=fd.get_label_encoder().inverse_transform)
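# Note on the sizes above: the speech encoder (configured in the truncated block at
# the top of this example) is bidirectional, so its outputs are hidden_size * 2 wide,
# which is where att.in_size_enc comes from.  The decoder RNN presumably consumes the
# token embedding concatenated with the attention context, hence
# input_size = hidden_size + hidden_size * 2 = hidden_size * 3.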

logging.info('Building model')
net = M.SpeechTranscriber(config)
run_config = dict(max_norm=2.0, max_lr=2 * 1e-4, epochs=32, opt='adam')

logging.info('Training')
M.experiment(net, data, run_config)
Example 3
args.enable_help()
args.parse()

# General configuration: seed the RNGs for reproducibility
torch.manual_seed(args.seed)
random.seed(args.seed)

# Logging the arguments
logging.info('Arguments: {}'.format(args))


batch_size = 8

logging.info('Loading data')
data = dict(
    train=D.flickr8k_loader(
        args.flickr8k_root, args.flickr8k_meta, args.flickr8k_language,
        args.audio_features_fn, split='train', batch_size=batch_size,
        shuffle=True, downsampling_factor=args.downsampling_factor),
    val=D.flickr8k_loader(
        args.flickr8k_root, args.flickr8k_meta, args.flickr8k_language,
        args.audio_features_fn, split='val', batch_size=batch_size,
        shuffle=False))

logging.info('Building model')
net = M.SpeechTranscriber(M.get_default_config())
run_config = dict(max_norm=2.0, max_lr=2 * 1e-4, epochs=args.epochs)

logging.info('Training')
M.experiment(net, data, run_config, slt=data['train'].dataset.is_slt())
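# is_slt() presumably indicates whether the dataset carries translations (SLT)
# rather than verbatim transcriptions (ASR); Example 1 above uses the same
# distinction to choose between its SLT and ASR training branches.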