Example #1
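This snippet sweeps over audio downsampling factors, training one text-image model per factor and archiving each run's results. It relies on names defined earlier in its source file; the sketch below shows plausible definitions (the module paths, feature file name, and all values are assumptions, not taken from the snippet):

import logging
import pickle
from shutil import copyfile

import platalea.dataset as D      # assumed: provides flickr8k_loader, Flickr8KData
import platalea.text_image as M   # assumed: provides TextImage, get_default_config

batch_size = 32                     # assumed hyperparameter
feature_fname = 'mfcc_features.pt'  # assumed precomputed-feature file
factors = [1, 2, 4, 8, 16]          # assumed downsampling factors to sweep
lz = len(str(max(factors)))         # zero-pad width for result-file suffixes
# copy_best is a project-local helper; a sketch of it follows this example.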
for ds_factor in factors:
    logging.info('Loading data')
    data = dict(train=D.flickr8k_loader(split='train',
                                        batch_size=batch_size,
                                        shuffle=True,
                                        feature_fname=feature_fname,
                                        downsampling_factor=ds_factor),
                val=D.flickr8k_loader(split='val',
                                      batch_size=batch_size,
                                      shuffle=False,
                                      feature_fname=feature_fname))
    # Initialise the vocabulary from the training split
    fd = D.Flickr8KData
    fd.init_vocabulary(data['train'].dataset)

    # Saving config
    pickle.dump(
        dict(feature_fname=feature_fname,
             label_encoder=fd.get_label_encoder(),
             language='en'), open('config.pkl', 'wb'))

    logging.info('Building model')
    net = M.TextImage(M.get_default_config())
    run_config = dict(max_lr=2 * 1e-4, epochs=32)

    logging.info('Training')
    M.experiment(net, data, run_config)
    # Archive this run's results and best checkpoint under a zero-padded suffix
    suffix = str(ds_factor).zfill(lz)
    res_fname = 'result_{}.json'.format(suffix)
    copyfile('result.json', res_fname)
    copy_best(res_fname, 'net_{}.best.pt'.format(ds_factor))
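copy_best itself is not shown in any of these examples. Judging from how it is called, it plausibly scans the per-epoch records in the result file and copies the best epoch's checkpoint under the given name; the sketch below rests on that assumption, and the record keys ('epoch', 'wer', 'recall') are guesses:

import json
from shutil import copyfile

def copy_best(result_fname, net_fname, experiment_type='retrieval'):
    # Assumes one JSON record per line (one per epoch) and that training
    # saved a checkpoint named net.{epoch}.pt alongside each record.
    results = [json.loads(line) for line in open(result_fname)]
    if experiment_type == 'asr':
        best = min(results, key=lambda r: r['wer'])     # lower error is better
    else:
        best = max(results, key=lambda r: r['recall'])  # higher recall is better
    copyfile('net.{}.pt'.format(best['epoch']), net_fname)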
Example #2
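This snippet comes from a pipeline experiment: it either loads a pretrained ASR model or trains one, then swaps the dataset's gold transcriptions for the ASR output so a downstream stage trains on recognized text. Its first line is cut off in the original and is reconstructed below from the config-saving pattern of Examples #1 and #4. A sketch of the names it relies on (module paths are assumptions; args, data, fd, feature_fname, ds_factor, and lz are defined earlier in the source file):

import logging
import os
import pickle
from shutil import copyfile

import torch
import platalea.asr as M1   # assumed: provides SpeechTranscriber, experiment
# extract_trn and copy_best are project-local helpers (copy_best is sketched
# after Example #1; extract_trn decodes hypotheses and references for a set).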
    # Saving config (the opening of this call is truncated in the original;
    # reconstructed here from the identical pattern in Examples #1 and #4)
    pickle.dump(
        dict(feature_fname=feature_fname,
             label_encoder=fd.get_label_encoder(),
             language='en'), open('config.pkl', 'wb'))

    # Reuse a pretrained ASR model when a directory is given; otherwise train one
    if args.asr_model_dir:
        net = torch.load(os.path.join(args.asr_model_dir, 'net.best.pt'))
    else:
        logging.info('Building ASR model')
        config = M1.get_default_config()
        net = M1.SpeechTranscriber(config)
        run_config = dict(max_norm=2.0, max_lr=2 * 1e-4, epochs=32, opt='adam')
        logging.info('Training ASR')
        M1.experiment(net, data, run_config)
        suffix = str(ds_factor).zfill(lz)
        res_fname = 'result_asr_{}.json'.format(suffix)
        copyfile('result.json', res_fname)
        net_fname = 'asr_{}.best.pt'.format(ds_factor)
        copy_best(res_fname, net_fname, experiment_type='asr')
        net = torch.load(net_fname)

    logging.info('Extracting ASR transcriptions')
    for set_name in ['train', 'val']:
        ds = data[set_name].dataset
        hyp_asr, ref_asr = extract_trn(net, ds, use_beam_decoding=True)
        # Replacing original transcriptions with ASR's output
        for i in range(len(hyp_asr)):
            item = ds.split_data[i]
            if item[2] == ref_asr[i]:
                ds.split_data[i] = (item[0], item[1], hyp_asr[i])
            else:
                msg = ("Extracted reference #{} ({}) doesn't match "
                       "dataset's one ({}) for {} set.")
                # The snippet is cut off here; presumably the message is logged:
                logging.warning(msg.format(i, ref_asr[i], item[2], set_name))
Example #3
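This snippet opens midway through the configuration of a speech transcriber: the head of the config dict (including the encoder section) is cut off in the original, and only the decoder's embedding, attention, RNN, and output settings remain. A sketch of the names the visible part relies on (module paths and values are assumptions):

import logging
from shutil import copyfile

import torch.nn as nn
import platalea.asr as M    # assumed: provides SpeechTranscriber, experiment
import platalea.dataset as D

fd = D.Flickr8KData         # supplies vocabulary size and special-token ids
hidden_size = 1024          # assumed hyperparameters
dropout = 0.0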
                     embedding_dim=hidden_size),
            drop=dict(p=dropout),
            att=dict(in_size_enc=hidden_size * 2,
                     in_size_state=hidden_size,
                     hidden_size=hidden_size),
            rnn=dict(input_size=hidden_size * 3,
                     hidden_size=hidden_size,
                     num_layers=1,
                     dropout=dropout),
            out=dict(in_features=hidden_size * 3,
                     out_features=fd.vocabulary_size()),
            rnn_layer_type=nn.GRU,
            max_output_length=400,  # max length for flickr annotations is 199
            sos_id=fd.get_token_id(fd.sos),
            eos_id=fd.get_token_id(fd.eos),
            pad_id=fd.get_token_id(fd.pad)),
        inverse_transform_fn=fd.get_label_encoder().inverse_transform)

    logging.info('Building model')
    net = M.SpeechTranscriber(config)
    run_config = dict(max_norm=2.0, max_lr=2 * 1e-4, epochs=32, opt='adam')

    logging.info('Training')
    M.experiment(net, data, run_config)
    suffix = str(ds_factor).zfill(lz)
    res_fname = 'result_{}.json'.format(suffix)
    copyfile('result.json', res_fname)
    copy_best(res_fname,
              'net_{}.best.pt'.format(ds_factor),
              experiment_type='asr')
Example #4
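Same pipeline as Example #2, but at module level, without the downsampling loop, and saving language='jp' in the config, i.e. the Japanese-caption variant of the dataset. The args object presumably comes from argparse; a minimal sketch (only the flag name is taken from the snippet, the rest is assumed):

import argparse
import logging
import os
import pickle
from shutil import copyfile

import torch
import platalea.asr as M1   # assumed module path

parser = argparse.ArgumentParser()
parser.add_argument('--asr_model_dir', default=None,
                    help='directory holding a pretrained net.best.pt')
args = parser.parse_args()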
    # Saving config ('jp' marks the Japanese-caption variant)
    pickle.dump(
        dict(feature_fname=feature_fname,
             label_encoder=fd.get_label_encoder(),
             language='jp'), open('config.pkl', 'wb'))

# Reuse a pretrained ASR model when a directory is given; otherwise train one
if args.asr_model_dir:
    net = torch.load(os.path.join(args.asr_model_dir, 'net.best.pt'))
else:
    logging.info('Building ASR model')
    config = M1.get_default_config()
    net = M1.SpeechTranscriber(config)
    run_config = dict(max_norm=2.0, max_lr=2 * 1e-4, epochs=32, opt='adam')
    logging.info('Training ASR')
    M1.experiment(net, data, run_config)
    copyfile('result.json', 'result_asr.json')
    copy_best('result_asr.json', 'asr.best.pt', experiment_type='asr')
    net = torch.load('asr.best.pt')

logging.info('Extracting ASR transcriptions')
for set_name in ['train', 'val']:
    ds = data[set_name].dataset
    hyp_asr, ref_asr = extract_trn(net, ds, use_beam_decoding=True)
    # Replacing original transcriptions with ASR's output
    for i in range(len(hyp_asr)):
        item = ds.split_data[i]
        if item[2] == ref_asr[i]:
            ds.split_data[i] = (item[0], item[1], hyp_asr[i])
        else:
            msg = ("Extracted reference #{} ({}) doesn't match "
                   "dataset's one ({}) for {} set.")
            # The snippet is cut off here; presumably the message is logged:
            logging.warning(msg.format(i, ref_asr[i], item[2], set_name))