for ds_factor in factors:
    logging.info('Loading data')
    data = dict(
        train=D.flickr8k_loader(split='train', batch_size=batch_size,
                                shuffle=True, feature_fname=feature_fname,
                                downsampling_factor=ds_factor),
        val=D.flickr8k_loader(split='val', batch_size=batch_size,
                              shuffle=False, feature_fname=feature_fname))
    fd = D.Flickr8KData
    fd.init_vocabulary(data['train'].dataset)

    # Saving config
    pickle.dump(dict(feature_fname=feature_fname,
                     label_encoder=fd.get_label_encoder(),
                     language='en'),
                open('config.pkl', 'wb'))

    logging.info('Building model')
    net = M.TextImage(M.get_default_config())
    run_config = dict(max_lr=2 * 1e-4, epochs=32)

    logging.info('Training')
    M.experiment(net, data, run_config)
    suffix = str(ds_factor).zfill(lz)
    res_fname = 'result_{}.json'.format(suffix)
    copyfile('result.json', res_fname)
    copy_best(res_fname, 'net_{}.best.pt'.format(ds_factor))
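# Hedged sketch: copy_best (a repo utility whose implementation is not shown
# here) is assumed to scan the per-epoch results file and copy the best
# epoch's checkpoint to a stable name. The hypothetical stand-in below
# illustrates that behaviour, assuming result.json holds one JSON record per
# line with an 'epoch' field, a recall dict for retrieval runs and a 'wer'
# dict for ASR runs; the checkpoint naming scheme is likewise an assumption.
import json
from shutil import copyfile


def copy_best_sketch(result_fname, net_fname, experiment_type='retrieval'):
    results = [json.loads(line) for line in open(result_fname)]
    if experiment_type == 'asr':
        # ASR: lower word error rate is better
        best = min(results, key=lambda r: r['wer']['WER'])
    else:
        # Retrieval: higher recall@10 is better
        best = max(results, key=lambda r: r['recall']['10'])
    # Assumed checkpoint naming scheme: net.<epoch>.pt
    copyfile('net.{}.pt'.format(best['epoch']), net_fname)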
    pickle.dump(dict(feature_fname=feature_fname,
                     label_encoder=fd.get_label_encoder(),
                     language='en'),
                open('config.pkl', 'wb'))

    if args.asr_model_dir:
        net = torch.load(os.path.join(args.asr_model_dir, 'net.best.pt'))
    else:
        logging.info('Building ASR model')
        config = M1.get_default_config()
        net = M1.SpeechTranscriber(config)
        run_config = dict(max_norm=2.0, max_lr=2 * 1e-4, epochs=32,
                          opt='adam')
        logging.info('Training ASR')
        M1.experiment(net, data, run_config)
        suffix = str(ds_factor).zfill(lz)
        res_fname = 'result_asr_{}.json'.format(suffix)
        copyfile('result.json', res_fname)
        net_fname = 'asr_{}.best.pt'.format(ds_factor)
        copy_best(res_fname, net_fname, experiment_type='asr')
        net = torch.load(net_fname)

    logging.info('Extracting ASR transcriptions')
    for set_name in ['train', 'val']:
        ds = data[set_name].dataset
        hyp_asr, ref_asr = extract_trn(net, ds, use_beam_decoding=True)
        # Replacing original transcriptions with ASR's output
        for i in range(len(hyp_asr)):
            item = ds.split_data[i]
            if item[2] == ref_asr[i]:
                ds.split_data[i] = (item[0], item[1], hyp_asr[i])
            else:
                msg = "Extracted reference #{} ({}) doesn't match " \
                      "dataset's one ({}) for {} set."
                logging.warning(msg.format(i, ref_asr[i], item[2], set_name))
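# Note on the check above: extract_trn is expected to return hypotheses and
# references in dataset order, so comparing each returned reference against
# the stored transcription (item[2]) guards against index misalignment
# before the caption is overwritten with the ASR hypothesis.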
                     embedding_dim=hidden_size),
            drop=dict(p=dropout),
            att=dict(in_size_enc=hidden_size * 2,
                     in_size_state=hidden_size,
                     hidden_size=hidden_size),
            rnn=dict(input_size=hidden_size * 3,
                     hidden_size=hidden_size,
                     num_layers=1,
                     dropout=dropout),
            out=dict(in_features=hidden_size * 3,
                     out_features=fd.vocabulary_size()),
            rnn_layer_type=nn.GRU,
            max_output_length=400,  # max length for flickr annotations is 199
            sos_id=fd.get_token_id(fd.sos),
            eos_id=fd.get_token_id(fd.eos),
            pad_id=fd.get_token_id(fd.pad)),
        inverse_transform_fn=fd.get_label_encoder().inverse_transform)

    logging.info('Building model')
    net = M.SpeechTranscriber(config)
    run_config = dict(max_norm=2.0, max_lr=2 * 1e-4, epochs=32, opt='adam')

    logging.info('Training')
    M.experiment(net, data, run_config)
    suffix = str(ds_factor).zfill(lz)
    res_fname = 'result_{}.json'.format(suffix)
    copyfile('result.json', res_fname)
    copy_best(res_fname, 'net_{}.best.pt'.format(ds_factor),
              experiment_type='asr')
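# Hedged note: max_norm in run_config is presumably consumed by the training
# loop for gradient clipping, along the lines of
# torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm) applied after
# each backward pass, while opt='adam' selects the optimizer.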
pickle.dump(dict(feature_fname=feature_fname,
                 label_encoder=fd.get_label_encoder(),
                 language='jp'),
            open('config.pkl', 'wb'))

if args.asr_model_dir:
    net = torch.load(os.path.join(args.asr_model_dir, 'net.best.pt'))
else:
    logging.info('Building ASR model')
    config = M1.get_default_config()
    net = M1.SpeechTranscriber(config)
    run_config = dict(max_norm=2.0, max_lr=2 * 1e-4, epochs=32, opt='adam')
    logging.info('Training ASR')
    M1.experiment(net, data, run_config)
    copyfile('result.json', 'result_asr.json')
    copy_best('result_asr.json', 'asr.best.pt', experiment_type='asr')
    net = torch.load('asr.best.pt')

logging.info('Extracting ASR transcriptions')
for set_name in ['train', 'val']:
    ds = data[set_name].dataset
    hyp_asr, ref_asr = extract_trn(net, ds, use_beam_decoding=True)
    # Replacing original transcriptions with ASR's output
    for i in range(len(hyp_asr)):
        item = ds.split_data[i]
        if item[2] == ref_asr[i]:
            ds.split_data[i] = (item[0], item[1], hyp_asr[i])
        else:
            msg = "Extracted reference #{} ({}) doesn't match " \
                  "dataset's one ({}) for {} set."
            logging.warning(msg.format(i, ref_asr[i], item[2], set_name))
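# Usage sketch (assumption): downstream evaluation scripts can reload the
# pickled configuration to recover the feature file name and the label
# encoder, e.g.:
#
#     with open('config.pkl', 'rb') as f:
#         saved_config = pickle.load(f)
#     label_encoder = saved_config['label_encoder']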