def main():
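    # build the ptbs dictionary from the M2-format file given in the config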
    # load config
    parser = argparse.ArgumentParser(description='Building ptbs dictionary')
    parser = load_arguments(parser)
    args = vars(parser.parse_args())
    config = validate_config(args)

    path_m2 = config['path_m2']
    generate_ptbs(path_m2)
Example No. 2
def main():
    # load config
    parser = argparse.ArgumentParser(description='Corrupting corpus')
    parser = load_arguments(parser)
    args = vars(parser.parse_args())
    config = validate_config(args)

    path_input = config['input']
    path_output = config['output']
    corrupt_corpus(path_input, path_output)
def main():
    # load config
    parser = argparse.ArgumentParser(description='Building vocab')
    parser = load_arguments(parser)
    args = vars(parser.parse_args())
    config = validate_config(args)

    path_src = config['input']
    path_src_no_punct = config['output']
    print('2')
    building_vocab(path_src, path_src_no_punct)
def main():
    # load config
    parser = argparse.ArgumentParser(description='Building vocab')
    parser = load_arguments(parser)
    args = vars(parser.parse_args())
    config = validate_config(args)

    path_vocab_src = config['path_vocab_src']
    vocab_path_out = config['vocab_path_out']
    print('2')
    building_vocab(path_vocab_src, vocab_path_out)
def main():
    # load config
    parser = argparse.ArgumentParser(description='Converting M2 to parallel src/tgt files')
    parser = load_arguments(parser)
    args = vars(parser.parse_args())
    config = validate_config(args)

    path_m2 = config['path_m2']
    path_src = config['path_src']
    path_tgt = config['path_tgt']
    print('2')
    m2_to_parallel(path_m2, path_src, path_tgt, False, True)
Example No. 6
def main():
    # load config
    parser = argparse.ArgumentParser(
        description='filtering using a language model')
    parser = load_arguments(parser)
    args = vars(parser.parse_args())
    config = validate_config(args)

    src = config['src']
    tgt = config['tgt']
    threshold = config['threshold']
    lm = config['lm']
    process(src, tgt, lm, threshold)
Example No. 7
def main():

	# load config
	parser = argparse.ArgumentParser(description='Seq2seq Evaluation')
	parser = load_arguments(parser)
	args = vars(parser.parse_args())
	config = validate_config(args)

	# load src-tgt pair
	test_path_src = config['test_path_src']
	test_path_tgt = config['test_path_tgt']
	path_vocab_src = config['path_vocab_src']
	path_vocab_tgt = config['path_vocab_tgt']
	test_path_out = config['test_path_out']
	load_dir = config['load']
	max_seq_len = config['max_seq_len']
	batch_size = config['batch_size']
	beam_width = config['beam_width']
	use_gpu = config['use_gpu']
	seqrev = config['seqrev']
	use_type = config['use_type']

	if not os.path.exists(test_path_out):
		os.makedirs(test_path_out)
	config_save_dir = os.path.join(test_path_out, 'eval.cfg')
	save_config(config, config_save_dir)

	# set test mode: 1 = translate; 2 = plot
	MODE = config['eval_mode']

	# check device:
	device = check_device(use_gpu)
	print('device: {}'.format(device))

	# load test_set
	test_set = Dataset(test_path_src, test_path_tgt,
						path_vocab_src, path_vocab_tgt,
						seqrev=seqrev,
						max_seq_len=max_seq_len,
						batch_size=batch_size,
						use_gpu=use_gpu,
						use_type=use_type)
	print('Testset loaded')
	sys.stdout.flush()

	# run eval
	if MODE == 1:
		translate(test_set, load_dir, test_path_out, use_gpu,
			max_seq_len, beam_width, device, seqrev=seqrev)
def main():

    # import pdb; pdb.set_trace()
    # load config
    parser = argparse.ArgumentParser(description='LAS + NMT Training')
    parser = load_arguments(parser)
    args = vars(parser.parse_args())
    config = validate_config(args)

    # set random seed
    if config['random_seed'] is not None:
        set_global_seeds(config['random_seed'])

    # record config
    if not os.path.isabs(config['save']):
        config_save_dir = os.path.join(os.getcwd(), config['save'])
    if not os.path.exists(config['save']):
        os.makedirs(config['save'])

    # resume or not
    if config['load'] is not None and config['load_mode'] == 'resume':
        config_save_dir = os.path.join(config['save'], 'model-cont.cfg')
    else:
        config_save_dir = os.path.join(config['save'], 'model.cfg')
    save_config(config, config_save_dir)

    loss_coeff = {}
    loss_coeff['nll_asr'] = config['loss_nll_asr_coeff']
    loss_coeff['nll_mt'] = config['loss_nll_mt_coeff']
    loss_coeff['nll_st'] = config['loss_nll_st_coeff']

    # construct trainer
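    # pick the trainer class by mode, e.g. Trainer_ST when config['mode'] == 'ST'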
    Trainer = globals()['Trainer_{}'.format(config['mode'])]
    t = Trainer(expt_dir=config['save'],
                load_dir=config['load'],
                load_mode=config['load_mode'],
                load_freeze=config['load_freeze'],
                batch_size=config['batch_size'],
                minibatch_partition=config['minibatch_partition'],
                checkpoint_every=config['checkpoint_every'],
                print_every=config['print_every'],
                learning_rate=config['learning_rate'],
                learning_rate_init=config['learning_rate_init'],
                lr_warmup_steps=config['lr_warmup_steps'],
                eval_with_mask=config['eval_with_mask'],
                use_gpu=config['use_gpu'],
                gpu_id=config['gpu_id'],
                max_grad_norm=config['max_grad_norm'],
                max_count_no_improve=config['max_count_no_improve'],
                max_count_num_rollback=config['max_count_num_rollback'],
                keep_num=config['keep_num'],
                normalise_loss=config['normalise_loss'],
                loss_coeff=loss_coeff)

    # vocab
    path_vocab_src = config['path_vocab_src']
    path_vocab_tgt = config['path_vocab_tgt']

    # ----- 3WAY -----
    train_set = None
    dev_set = None
    mode = config['mode']
    if 'ST' in mode:
        # load train set
        if config['st_train_path_src']:
            t.logger.info(' -- load ST train set -- ')
            train_path_src = config['st_train_path_src']
            train_path_tgt = config['st_train_path_tgt']
            train_acous_path = config['st_train_acous_path']
            train_set = Dataset(path_src=train_path_src,
                                path_tgt=train_path_tgt,
                                path_vocab_src=path_vocab_src,
                                path_vocab_tgt=path_vocab_tgt,
                                use_type=config['use_type'],
                                acous_path=train_acous_path,
                                seqrev=config['seqrev'],
                                acous_norm=config['las_acous_norm'],
                                acous_norm_path=config['st_acous_norm_path'],
                                acous_max_len=config['las_acous_max_len'],
                                max_seq_len_src=config['max_seq_len_src'],
                                max_seq_len_tgt=config['max_seq_len_tgt'],
                                batch_size=config['batch_size'],
                                data_ratio=config['st_data_ratio'],
                                use_gpu=config['use_gpu'],
                                mode='ST',
                                logger=t.logger)

            vocab_size_enc = len(train_set.vocab_src)
            vocab_size_dec = len(train_set.vocab_tgt)
            src_word2id = train_set.src_word2id
            tgt_word2id = train_set.tgt_word2id
            src_id2word = train_set.src_id2word
            tgt_id2word = train_set.tgt_id2word

        # load dev set
        if config['st_dev_path_src']:
            t.logger.info(' -- load ST dev set -- ')
            dev_path_src = config['st_dev_path_src']
            dev_path_tgt = config['st_dev_path_tgt']
            dev_acous_path = config['st_dev_acous_path']
            dev_set = Dataset(path_src=dev_path_src,
                              path_tgt=dev_path_tgt,
                              path_vocab_src=path_vocab_src,
                              path_vocab_tgt=path_vocab_tgt,
                              use_type=config['use_type'],
                              acous_path=dev_acous_path,
                              acous_norm_path=config['st_acous_norm_path'],
                              acous_max_len=config['las_acous_max_len'],
                              seqrev=config['seqrev'],
                              acous_norm=config['las_acous_norm'],
                              max_seq_len_src=config['max_seq_len_src'],
                              max_seq_len_tgt=config['max_seq_len_tgt'],
                              batch_size=config['batch_size'],
                              use_gpu=config['use_gpu'],
                              mode='ST',
                              logger=t.logger)
        else:
            dev_set = None

    # ----- ASR -----
    asr_train_set = None
    asr_dev_set = None
    if 'ASR' in mode:
        # load train set
        if config['asr_train_path_src']:
            t.logger.info(' -- load ASR train set -- ')
            asr_train_path_src = config['asr_train_path_src']
            asr_train_acous_path = config['asr_train_acous_path']
            asr_train_set = Dataset(
                path_src=asr_train_path_src,
                path_tgt=None,
                path_vocab_src=path_vocab_src,
                path_vocab_tgt=path_vocab_tgt,
                use_type=config['use_type'],
                acous_path=asr_train_acous_path,
                acous_norm_path=config['asr_train_acous_norm_path'],
                seqrev=config['seqrev'],
                acous_norm=config['las_acous_norm'],
                acous_max_len=config['las_acous_max_len'],
                max_seq_len_src=config['max_seq_len_src'],
                max_seq_len_tgt=config['max_seq_len_tgt'],
                batch_size=config['batch_size'],
                data_ratio=config['asr_data_ratio'],
                use_gpu=config['use_gpu'],
                mode='ASR',
                logger=t.logger)

            vocab_size_enc = len(asr_train_set.vocab_src)
            vocab_size_dec = len(asr_train_set.vocab_tgt)
            src_word2id = asr_train_set.src_word2id
            tgt_word2id = asr_train_set.tgt_word2id
            src_id2word = asr_train_set.src_id2word
            tgt_id2word = asr_train_set.tgt_id2word

        # load dev set
        if config['asr_dev_path_src']:
            t.logger.info(' -- load ASR dev set -- ')
            asr_dev_path_src = config['asr_dev_path_src']
            asr_dev_acous_path = config['asr_dev_acous_path']
            asr_dev_set = Dataset(
                path_src=asr_dev_path_src,
                path_tgt=None,
                path_vocab_src=path_vocab_src,
                path_vocab_tgt=path_vocab_tgt,
                use_type=config['use_type'],
                acous_path=asr_dev_acous_path,
                acous_norm_path=config['asr_dev_acous_norm_path'],
                acous_max_len=config['las_acous_max_len'],
                seqrev=config['seqrev'],
                acous_norm=config['las_acous_norm'],
                max_seq_len_src=config['max_seq_len_src'],
                max_seq_len_tgt=config['max_seq_len_tgt'],
                batch_size=config['batch_size'],
                use_gpu=config['use_gpu'],
                mode='ASR',
                logger=t.logger)
        else:
            asr_dev_set = None

    # ----- MT -----
    mt_train_set = None
    mt_dev_set = None
    if 'MT' in mode:
        # load train set
        if config['mt_train_path_src']:
            t.logger.info(' -- load MT train set -- ')
            mt_train_path_src = config['mt_train_path_src']
            mt_train_path_tgt = config['mt_train_path_tgt']
            mt_train_set = Dataset(path_src=mt_train_path_src,
                                   path_tgt=mt_train_path_tgt,
                                   path_vocab_src=path_vocab_src,
                                   path_vocab_tgt=path_vocab_tgt,
                                   use_type=config['use_type'],
                                   acous_path=None,
                                   acous_norm_path=None,
                                   seqrev=config['seqrev'],
                                   acous_norm=config['las_acous_norm'],
                                   acous_max_len=config['las_acous_max_len'],
                                   max_seq_len_src=config['max_seq_len_src'],
                                   max_seq_len_tgt=config['max_seq_len_tgt'],
                                   batch_size=config['batch_size'],
                                   data_ratio=config['mt_data_ratio'],
                                   use_gpu=config['use_gpu'],
                                   mode='MT',
                                   logger=t.logger)

            vocab_size_enc = len(mt_train_set.vocab_src)
            vocab_size_dec = len(mt_train_set.vocab_tgt)
            src_word2id = mt_train_set.src_word2id
            tgt_word2id = mt_train_set.tgt_word2id
            src_id2word = mt_train_set.src_id2word
            tgt_id2word = mt_train_set.tgt_id2word

        # load dev set
        if config['mt_dev_path_src']:
            t.logger.info(' -- load MT dev set -- ')
            mt_dev_path_src = config['mt_dev_path_src']
            mt_dev_path_tgt = config['mt_dev_path_tgt']
            mt_dev_set = Dataset(path_src=mt_dev_path_src,
                                 path_tgt=mt_dev_path_tgt,
                                 path_vocab_src=path_vocab_src,
                                 path_vocab_tgt=path_vocab_tgt,
                                 use_type=config['use_type'],
                                 acous_path=None,
                                 acous_norm_path=None,
                                 acous_max_len=config['las_acous_max_len'],
                                 seqrev=config['seqrev'],
                                 acous_norm=config['las_acous_norm'],
                                 max_seq_len_src=config['max_seq_len_src'],
                                 max_seq_len_tgt=config['max_seq_len_tgt'],
                                 batch_size=config['batch_size'],
                                 use_gpu=config['use_gpu'],
                                 mode='MT',
                                 logger=t.logger)
        else:
            mt_dev_set = None

    # collect all datasets
    train_sets = {}
    dev_sets = {}
    train_sets['st'] = train_set
    train_sets['asr'] = asr_train_set
    train_sets['mt'] = mt_train_set
    dev_sets['st'] = dev_set
    dev_sets['asr'] = asr_dev_set
    dev_sets['mt'] = mt_dev_set

    # device
    device = check_device(config['use_gpu'])
    t.logger.info('device:{}'.format(device))

    # construct nmt model
    seq2seq = Seq2seq(
        vocab_size_enc,
        vocab_size_dec,
        share_embedder=config['share_embedder'],
        enc_embedding_size=config['embedding_size_enc'],
        dec_embedding_size=config['embedding_size_dec'],
        load_embedding_src=config['load_embedding_src'],
        load_embedding_tgt=config['load_embedding_tgt'],
        num_heads=config['num_heads'],
        dim_model=config['dim_model'],
        dim_feedforward=config['dim_feedforward'],
        enc_layers=config['enc_layers'],
        dec_layers=config['dec_layers'],
        embedding_dropout=config['embedding_dropout'],
        dropout=config['dropout'],
        max_seq_len_src=config['max_seq_len_src'],
        max_seq_len_tgt=config['max_seq_len_tgt'],
        act=config['act'],
        enc_word2id=src_word2id,
        dec_word2id=tgt_word2id,
        enc_id2word=src_id2word,
        dec_id2word=tgt_id2word,
        transformer_type=config['transformer_type'],
        enc_emb_proj=config['enc_emb_proj'],
        dec_emb_proj=config['dec_emb_proj'],
        #
        acous_dim=config['las_acous_dim'],
        acous_hidden_size=config['las_acous_hidden_size'],
        #
        mode=config['mode'],
        load_mode=config['load_mode'])
    seq2seq = seq2seq.to(device=device)

    # run training
    seq2seq = t.train(train_sets,
                      seq2seq,
                      num_epochs=config['num_epochs'],
                      dev_sets=dev_sets,
                      grab_memory=config['grab_memory'])
Example No. 9
def main():

	# load config
	parser = argparse.ArgumentParser(description='Seq2seq Evaluation')
	parser = load_arguments(parser)
	args = vars(parser.parse_args())
	config = validate_config(args)

	# load src-tgt pair
	test_path_src = config['test_path_src']
	test_path_tgt = config['test_path_tgt'] # dummy
	if test_path_tgt is None:
		test_path_tgt = test_path_src
	test_path_out = config['test_path_out']
	load_dir = config['load']
	max_seq_len = config['max_seq_len']
	batch_size = config['batch_size']
	beam_width = config['beam_width']
	use_gpu = config['use_gpu']
	seqrev = config['seqrev']
	use_type = config['use_type']

	if not os.path.exists(test_path_out):
		os.makedirs(test_path_out)
	config_save_dir = os.path.join(test_path_out, 'eval.cfg')
	save_config(config, config_save_dir)

	# set test mode: 1 = translate (free-running); 2 = batched translate; 3 = plot attention; 4 = teacher-forced translate
	MODE = config['eval_mode']
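	# plotting mode: cap sequence length, decode one sentence at a time on CPU with beam width 1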
	if MODE == 3:
		max_seq_len = 32
		batch_size = 1
		beam_width = 1
		use_gpu = False

	# check device:
	device = check_device(use_gpu)
	print('device: {}'.format(device))

	# load model
	latest_checkpoint_path = load_dir
	resume_checkpoint = Checkpoint.load(latest_checkpoint_path)
	model = resume_checkpoint.model.to(device)
	vocab_src = resume_checkpoint.input_vocab
	vocab_tgt = resume_checkpoint.output_vocab
	print('Model dir: {}'.format(latest_checkpoint_path))
	print('Model loaded')

	# load test_set
	test_set = Dataset(test_path_src, test_path_tgt,
						vocab_src_list=vocab_src, vocab_tgt_list=vocab_tgt,
						seqrev=seqrev,
						max_seq_len=max_seq_len,
						batch_size=batch_size,
						use_gpu=use_gpu,
						use_type=use_type)
	print('Test dir: {}'.format(test_path_src))
	print('Testset loaded')
	sys.stdout.flush()

	# run eval
	if MODE == 1: # FR
		translate(test_set, model, test_path_out, use_gpu,
			max_seq_len, beam_width, device, seqrev=seqrev)

	if MODE == 2:
		translate_batch(test_set, model, test_path_out, use_gpu,
			max_seq_len, beam_width, device, seqrev=seqrev)

	elif MODE == 3:
		# plotting
		att_plot(test_set, model, test_path_out, use_gpu,
			max_seq_len, beam_width, device)

	elif MODE == 4: # TF
		translate_tf(test_set, model, test_path_out, use_gpu,
			max_seq_len, beam_width, device, seqrev=seqrev)
Example No. 10
def main():

	# load config
	parser = argparse.ArgumentParser(description='Seq2seq Evaluation')
	parser = load_arguments(parser)
	args = vars(parser.parse_args())
	config = validate_config(args)

	# load src-tgt pair
	test_path_src = config['test_path_src']
	test_path_tgt = test_path_src
	test_path_out = config['test_path_out']
	load_dir = config['load']
	max_seq_len = config['max_seq_len']
	batch_size = config['batch_size']
	beam_width = config['beam_width']
	use_gpu = config['use_gpu']
	seqrev = config['seqrev']
	use_type = config['use_type']

	# set test mode: 1 = translate; 2 = output posteriors; 3 = save combined checkpoint
	MODE = config['eval_mode']
	if MODE != 3:
		if not os.path.exists(test_path_out):
			os.makedirs(test_path_out)
		config_save_dir = os.path.join(test_path_out, 'eval.cfg')
		save_config(config, config_save_dir)

	# check device:
	device = check_device(use_gpu)
	print('device: {}'.format(device))

	# load model
	latest_checkpoint_path = load_dir
	resume_checkpoint = Checkpoint.load(latest_checkpoint_path)
	model = resume_checkpoint.model.to(device)
	vocab_src = resume_checkpoint.input_vocab
	vocab_tgt = resume_checkpoint.output_vocab
	print('Model dir: {}'.format(latest_checkpoint_path))
	print('Model loaded')

	# combine model
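	# combine_weights is assumed to merge/average parameters from the checkpoints listed under combine_path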
	if config['combine_path'] is not None:
		model = combine_weights(config['combine_path'])

	# load test_set
	test_set = Dataset(test_path_src, test_path_tgt,
						vocab_src_list=vocab_src, vocab_tgt_list=vocab_tgt,
						seqrev=seqrev,
						max_seq_len=900,
						batch_size=batch_size,
						use_gpu=use_gpu,
						use_type=use_type)
	print('Test dir: {}'.format(test_path_src))
	print('Testset loaded')
	sys.stdout.flush()

	# run eval
	if MODE == 1:
		translate(test_set, model, test_path_out, use_gpu,
			max_seq_len, beam_width, device, seqrev=seqrev)

	elif MODE == 2: # output posterior
		translate_logp(test_set, model, test_path_out, use_gpu,
			max_seq_len, device, seqrev=seqrev)

	elif MODE == 3: # save combined model
		ckpt = Checkpoint(model=model,
				   optimizer=None, epoch=0, step=0,
				   input_vocab=test_set.vocab_src,
				   output_vocab=test_set.vocab_tgt)
		saved_path = ckpt.save_customise(
			os.path.join(config['combine_path'].strip('/')+'-combine','combine'))
		log_ckpts(config['combine_path'], config['combine_path'].strip('/')+'-combine')
		print('saving at {} ... '.format(saved_path))
Example No. 11
def main():

	# load config
	parser = argparse.ArgumentParser(description='Seq2seq Training')
	parser = load_arguments(parser)
	args = vars(parser.parse_args())
	config = validate_config(args)

	# set random seed
	if config['random_seed'] is not None:
		set_global_seeds(config['random_seed'])

	# record config
	if not os.path.isabs(config['save']):
		config_save_dir = os.path.join(os.getcwd(), config['save'])
	if not os.path.exists(config['save']):
		os.makedirs(config['save'])

	# loading old models
	if config['load']:
		print('loading {} ...'.format(config['load']))
		config_save_dir = os.path.join(config['save'], 'model-cont.cfg')
	else:
		config_save_dir = os.path.join(config['save'], 'model.cfg')
	save_config(config, config_save_dir)

	# construct trainer
	t = Trainer(expt_dir=config['save'],
					load_dir=config['load'],
					load_mode=config['load_mode'],
					batch_size=config['batch_size'],
					checkpoint_every=config['checkpoint_every'],
					print_every=config['print_every'],
					eval_mode=config['eval_mode'],
					eval_metric=config['eval_metric'],
					learning_rate=config['learning_rate'],
					learning_rate_init=config['learning_rate_init'],
					lr_warmup_steps=config['lr_warmup_steps'],
					eval_with_mask=config['eval_with_mask'],
					use_gpu=config['use_gpu'],
					gpu_id=config['gpu_id'],
					max_grad_norm=config['max_grad_norm'],
					max_count_no_improve=config['max_count_no_improve'],
					max_count_num_rollback=config['max_count_num_rollback'],
					keep_num=config['keep_num'],
					normalise_loss=config['normalise_loss'],
					minibatch_split=config['minibatch_split']
					)

	# load train set
	train_path_src = config['train_path_src']
	train_path_tgt = config['train_path_tgt']
	path_vocab_src = config['path_vocab_src']
	path_vocab_tgt = config['path_vocab_tgt']
	train_set = Dataset(train_path_src, train_path_tgt,
		path_vocab_src=path_vocab_src, path_vocab_tgt=path_vocab_tgt,
		seqrev=config['seqrev'],
		max_seq_len=config['max_seq_len'],
		batch_size=config['batch_size'],
		data_ratio=config['data_ratio'],
		use_gpu=config['use_gpu'],
		logger=t.logger,
		use_type=config['use_type'],
		use_type_src=config['use_type_src'])

	vocab_size_enc = len(train_set.vocab_src)
	vocab_size_dec = len(train_set.vocab_tgt)

	# load dev set
	if config['dev_path_src'] and config['dev_path_tgt']:
		dev_path_src = config['dev_path_src']
		dev_path_tgt = config['dev_path_tgt']
		dev_set = Dataset(dev_path_src, dev_path_tgt,
			path_vocab_src=path_vocab_src, path_vocab_tgt=path_vocab_tgt,
			seqrev=config['seqrev'],
			max_seq_len=config['max_seq_len'],
			batch_size=config['batch_size'],
			use_gpu=config['use_gpu'],
			logger=t.logger,
			use_type=config['use_type'],
			use_type_src=config['use_type_src'])
	else:
		dev_set = None

	# construct model
	seq2seq = Seq2seq(vocab_size_enc, vocab_size_dec,
					share_embedder=config['share_embedder'],
					enc_embedding_size=config['embedding_size_enc'],
					dec_embedding_size=config['embedding_size_dec'],
					load_embedding_src=config['load_embedding_src'],
					load_embedding_tgt=config['load_embedding_tgt'],
					num_heads=config['num_heads'],
					dim_model=config['dim_model'],
					dim_feedforward=config['dim_feedforward'],
					enc_layers=config['enc_layers'],
					dec_layers=config['dec_layers'],
					embedding_dropout=config['embedding_dropout'],
					dropout=config['dropout'],
					max_seq_len=config['max_seq_len'],
					act=config['act'],
					enc_word2id=train_set.src_word2id,
					dec_word2id=train_set.tgt_word2id,
					enc_id2word=train_set.src_id2word,
					dec_id2word=train_set.tgt_id2word,
					transformer_type=config['transformer_type'])

	# import pdb; pdb.set_trace()
	t.logger.info("total #parameters:{}".format(sum(p.numel() for p in
		seq2seq.parameters() if p.requires_grad)))

	device = check_device(config['use_gpu'])
	t.logger.info('device: {}'.format(device))
	seq2seq = seq2seq.to(device=device)

	# run training
	seq2seq = t.train(train_set, seq2seq, num_epochs=config['num_epochs'],
		dev_set=dev_set, grab_memory=config['grab_memory'])
Example No. 12
def main():

    # load config
    parser = argparse.ArgumentParser(description='PyTorch Seq2Seq-DD Training')
    parser = load_arguments(parser)
    args = vars(parser.parse_args())
    config = validate_config(args)

    # record config
    if not os.path.isabs(config['save']):
        config_save_dir = os.path.join(os.getcwd(), config['save'])
    if not os.path.exists(config['save']):
        os.makedirs(config['save'])

    # check device:
    if config['use_gpu'] and torch.cuda.is_available():
        global device
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print('device: {}'.format(device))

    # resume or not
    if config['load']:
        resume = True
        print('resuming {} ...'.format(config['load']))
        config_save_dir = os.path.join(config['save'], 'model-cont.cfg')
    else:
        resume = False
        config_save_dir = os.path.join(config['save'], 'model.cfg')
    save_config(config, config_save_dir)

    # load train set
    train_path_src = config['train_path_src']
    train_path_tgt = config['train_path_tgt']
    path_vocab_src = config['path_vocab_src']
    path_vocab_tgt = config['path_vocab_tgt']
    train_attkey_path = config['train_attkey_path']
    train_set = Dataset(train_path_src,
                        train_path_tgt,
                        path_vocab_src,
                        path_vocab_tgt,
                        attkey_path=train_attkey_path,
                        seqrev=config['seqrev'],
                        max_seq_len=config['max_seq_len'],
                        batch_size=config['batch_size'],
                        use_gpu=config['use_gpu'])

    vocab_size_enc = len(train_set.vocab_src)
    vocab_size_dec = len(train_set.vocab_tgt)

    # load dev set
    if config['dev_path_src'] and config['dev_path_tgt']:
        dev_path_src = config['dev_path_src']
        dev_path_tgt = config['dev_path_tgt']
        dev_attkey_path = config['dev_attkey_path']
        dev_set = Dataset(dev_path_src,
                          dev_path_tgt,
                          path_vocab_src,
                          path_vocab_tgt,
                          attkey_path=dev_attkey_path,
                          seqrev=config['seqrev'],
                          max_seq_len=config['max_seq_len'],
                          batch_size=config['batch_size'],
                          use_gpu=config['use_gpu'])
    else:
        dev_set = None

    # construct model
    seq2seq = Seq2Seq(vocab_size_enc,
                      vocab_size_dec,
                      embedding_size_enc=config['embedding_size_enc'],
                      embedding_size_dec=config['embedding_size_dec'],
                      embedding_dropout=config['embedding_dropout'],
                      hidden_size_enc=config['hidden_size_enc'],
                      num_bilstm_enc=config['num_bilstm_enc'],
                      num_unilstm_enc=config['num_unilstm_enc'],
                      hidden_size_dec=config['hidden_size_dec'],
                      num_unilstm_dec=config['num_unilstm_dec'],
                      hidden_size_att=config['hidden_size_att'],
                      hidden_size_shared=config['hidden_size_shared'],
                      dropout=config['dropout'],
                      residual=config['residual'],
                      batch_first=config['batch_first'],
                      max_seq_len=config['max_seq_len'],
                      batch_size=config['batch_size'],
                      load_embedding_src=config['load_embedding_src'],
                      load_embedding_tgt=config['load_embedding_tgt'],
                      src_word2id=train_set.src_word2id,
                      tgt_word2id=train_set.tgt_word2id,
                      src_id2word=train_set.src_id2word,
                      att_mode=config['att_mode'],
                      hard_att=config['hard_att'],
                      use_gpu=config['use_gpu'],
                      additional_key_size=config['additional_key_size'],
                      ptr_net=config['ptr_net'],
                      use_bpe=config['use_bpe'])

    if config['use_gpu']:
        seq2seq = seq2seq.cuda()

    # construct trainer
    t = Trainer(expt_dir=config['save'],
                load_dir=config['load'],
                batch_size=config['batch_size'],
                random_seed=config['random_seed'],
                checkpoint_every=config['checkpoint_every'],
                print_every=config['print_every'],
                learning_rate=config['learning_rate'],
                eval_with_mask=config['eval_with_mask'],
                scheduled_sampling=config['scheduled_sampling'],
                teacher_forcing_ratio=config['teacher_forcing_ratio'],
                use_gpu=config['use_gpu'],
                max_grad_norm=config['max_grad_norm'],
                ddatt_loss_weight=config['ddatt_loss_weight'],
                ddattcls_loss_weight=config['ddattcls_loss_weight'],
                att_scale_up=config['att_scale_up'])

    # run training
    seq2seq = t.train(train_set,
                      seq2seq,
                      num_epochs=config['num_epochs'],
                      resume=resume,
                      dev_set=dev_set)
Example No. 13
def main():

    # import pdb; pdb.set_trace()
    # load config
    warnings.filterwarnings("ignore")
    parser = argparse.ArgumentParser(
        description='PyTorch Seq2Seq Joint Training')
    parser = load_arguments(parser)
    args = vars(parser.parse_args())
    config = validate_config(args)

    # record config
    if not os.path.isabs(config['save']):
        config_save_dir = os.path.join(os.getcwd(), config['save'])
    if not os.path.exists(config['save']):
        os.makedirs(config['save'])

    # check device:
    if config['use_gpu'] and torch.cuda.is_available():
        global device
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print('device: {}'.format(device))

    # set random seed
    if config['random_seed'] is not None:
        set_global_seeds(config['random_seed'])

    # resume or not
    if config['load']:
        print('resuming {} ...'.format(config['load']))
        config_save_dir = os.path.join(config['save'], 'model-cont.cfg')
    elif config['restart']:
        print('restarting from {} ...'.format(config['restart']))
        config_save_dir = os.path.join(config['save'], 'model-restart.cfg')
    else:
        config_save_dir = os.path.join(config['save'], 'model.cfg')
    save_config(config, config_save_dir)

    # load vocabulary
    path_vocab_src = config['path_vocab_src']
    path_vocab_tgt = config['path_vocab_tgt']
    assert config['load_embedding_src'] == config['load_embedding_tgt'], \
     'src tgt embeddings are different'
    load_embedding = config['load_embedding_src'] is not None
    print('load embedding: {}'.format(load_embedding))

    # load dataset
    time_st1 = time.time()
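    # 'separate' loads independent DD and GEC datasets; 'end2end' loads a single joint DDGEC dataset (with an extra path_flt file)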
    if config['train_mode'] == 'separate':

        ddgec_train_set = None
        ddgec_dev_set = None

        # load train set
        dd_train_path_src = config['dd_train_path_src']
        dd_train_path_tgt = config['dd_train_path_tgt']
        dd_train_tsv_path = config['dd_train_tsv_path']
        dd_train_set = Dataset(dd_train_path_src,
                               dd_train_path_tgt,
                               path_vocab_src,
                               path_vocab_tgt,
                               seqrev=config['seqrev'],
                               tsv_path=dd_train_tsv_path,
                               set_type='dd',
                               max_seq_len=config['max_seq_len'],
                               batch_size=config['batch_size'],
                               use_gpu=config['use_gpu'])
        gec_train_path_src = config['gec_train_path_src']
        gec_train_path_tgt = config['gec_train_path_tgt']
        gec_train_tsv_path = config['gec_train_tsv_path']
        gec_train_set = Dataset(gec_train_path_src,
                                gec_train_path_tgt,
                                path_vocab_src,
                                path_vocab_tgt,
                                seqrev=config['seqrev'],
                                tsv_path=gec_train_tsv_path,
                                set_type='gec',
                                max_seq_len=config['max_seq_len'],
                                batch_size=config['batch_size'],
                                use_gpu=config['use_gpu'])
        # note: vocab size of dd / gec should agree
        vocab_size_enc = len(dd_train_set.vocab_src)
        vocab_size_dec = len(dd_train_set.vocab_tgt)
        assert vocab_size_enc == vocab_size_dec, \
         'mismatch vocab size: {} - {}'.format(vocab_size_enc, vocab_size_dec)
        vocab_size = vocab_size_enc
        src_word2id = gec_train_set.src_word2id
        tgt_word2id = gec_train_set.tgt_word2id
        src_id2word = gec_train_set.src_id2word

        # load dev set
        if config['dd_dev_path_src'] and config['dd_dev_path_tgt']:
            dd_dev_path_src = config['dd_dev_path_src']
            dd_dev_path_tgt = config['dd_dev_path_tgt']
            dd_dev_tsv_path = config['dd_dev_tsv_path']
            dd_dev_set = Dataset(dd_dev_path_src,
                                 dd_dev_path_tgt,
                                 path_vocab_src,
                                 path_vocab_tgt,
                                 seqrev=config['seqrev'],
                                 tsv_path=dd_dev_tsv_path,
                                 set_type='dd',
                                 max_seq_len=config['max_seq_len'],
                                 batch_size=config['batch_size'],
                                 use_gpu=config['use_gpu'])
        else:
            dd_dev_set = None
        if config['gec_dev_path_src'] and config['gec_dev_path_tgt']:
            gec_dev_path_src = config['gec_dev_path_src']
            gec_dev_path_tgt = config['gec_dev_path_tgt']
            gec_dev_tsv_path = config['gec_dev_tsv_path']
            gec_dev_set = Dataset(gec_dev_path_src,
                                  gec_dev_path_tgt,
                                  path_vocab_src,
                                  path_vocab_tgt,
                                  seqrev=config['seqrev'],
                                  tsv_path=gec_dev_tsv_path,
                                  set_type='gec',
                                  max_seq_len=config['max_seq_len'],
                                  batch_size=config['batch_size'],
                                  use_gpu=config['use_gpu'])
        else:
            gec_dev_set = None

    elif config['train_mode'] == 'end2end':

        dd_train_set = None
        gec_train_set = None
        dd_dev_set = None
        gec_dev_set = None

        ddgec_train_path_src = config['ddgec_train_path_src']
        ddgec_train_path_tgt = config['ddgec_train_path_tgt']
        ddgec_train_path_flt = config['ddgec_train_path_flt']
        ddgec_train_tsv_path = config['ddgec_train_tsv_path']
        ddgec_train_set = Dataset(ddgec_train_path_src,
                                  ddgec_train_path_tgt,
                                  path_vocab_src,
                                  path_vocab_tgt,
                                  seqrev=config['seqrev'],
                                  tsv_path=ddgec_train_tsv_path,
                                  set_type='ddgec',
                                  max_seq_len=config['max_seq_len'],
                                  batch_size=config['batch_size'],
                                  use_gpu=config['use_gpu'],
                                  path_flt=ddgec_train_path_flt)

        vocab_size_enc = len(ddgec_train_set.vocab_src)
        vocab_size_dec = len(ddgec_train_set.vocab_tgt)
        assert vocab_size_enc == vocab_size_dec, \
         'mismatch vocab size: {} - {}'.format(vocab_size_enc, vocab_size_dec)
        vocab_size = vocab_size_enc
        src_word2id = ddgec_train_set.src_word2id
        tgt_word2id = ddgec_train_set.tgt_word2id
        src_id2word = ddgec_train_set.src_id2word

        if config['ddgec_dev_path_src'] and config[
                'ddgec_dev_path_tgt'] and config['ddgec_dev_path_flt']:
            ddgec_dev_path_src = config['ddgec_dev_path_src']
            ddgec_dev_path_tgt = config['ddgec_dev_path_tgt']
            ddgec_dev_path_flt = config['ddgec_dev_path_flt']
            ddgec_dev_tsv_path = config['ddgec_dev_tsv_path']
            ddgec_dev_set = Dataset(ddgec_dev_path_src,
                                    ddgec_dev_path_tgt,
                                    path_vocab_src,
                                    path_vocab_tgt,
                                    seqrev=config['seqrev'],
                                    tsv_path=ddgec_dev_tsv_path,
                                    set_type='ddgec',
                                    max_seq_len=config['max_seq_len'],
                                    batch_size=config['batch_size'],
                                    use_gpu=config['use_gpu'],
                                    path_flt=ddgec_dev_path_flt)
        else:
            ddgec_dev_set = None

    else:
        assert False, 'Not implemented mode {}'.format(config['train_mode'])

    time_st2 = time.time()
    print('data loading time: {}'.format(time_st2 - time_st1))

    # construct model
    seq2seq = Seq2Seq(
        vocab_size_enc,
        vocab_size_dec,
        embedding_size_enc=config['embedding_size_enc'],
        embedding_size_dec=config['embedding_size_dec'],
        embedding_dropout=config['embedding_dropout'],
        hidden_size_enc=config['hidden_size_enc'],
        hidden_size_dec=config['hidden_size_dec'],
        num_bilstm_enc=config['num_bilstm_enc'],
        num_unilstm_enc=config['num_unilstm_enc'],
        dd_num_unilstm_dec=config['dd_num_unilstm_dec'],
        dd_hidden_size_att=config['dd_hidden_size_att'],
        dd_att_mode=config['dd_att_mode'],
        dd_additional_key_size=config['dd_additional_key_size'],
        gec_num_bilstm_dec=config['gec_num_bilstm_dec'],
        gec_num_unilstm_dec_preatt=config['gec_num_unilstm_dec_preatt'],
        gec_num_unilstm_dec_pstatt=config['gec_num_unilstm_dec_pstatt'],
        gec_hidden_size_att=config['gec_hidden_size_att'],
        gec_att_mode=config['gec_att_mode'],
        shared_embed=config['shared_embed'],
        dropout=config['dropout'],
        residual=config['residual'],
        batch_first=config['batch_first'],
        max_seq_len=config['max_seq_len'],
        batch_size=config['batch_size'],
        load_embedding_src=config['load_embedding_src'],
        load_embedding_tgt=config['load_embedding_tgt'],
        src_word2id=src_word2id,
        tgt_word2id=tgt_word2id,
        src_id2word=src_id2word,
        hard_att=config['hard_att'],
        add_discriminator=config['add_discriminator'],
        dloss_coeff=config['dloss_coeff'],
        use_gpu=config['use_gpu'],
        ptr_net=config['ptr_net'],
        connect_type=config['connect_type'],
        dd_classifier=config['dd_classifier']).to(device)

    time_st3 = time.time()
    print('model init time: {}'.format(time_st3 - time_st2))

    # construct trainer
    t = Trainer(expt_dir=config['save'],
                load_dir=config['load'],
                restart_dir=config['restart'],
                batch_size=config['batch_size'],
                random_seed=config['random_seed'],
                checkpoint_every=config['checkpoint_every'],
                print_every=config['print_every'],
                learning_rate=config['learning_rate'],
                eval_with_mask=config['eval_with_mask'],
                scheduled_sampling=config['scheduled_sampling'],
                teacher_forcing_ratio=config['teacher_forcing_ratio'],
                use_gpu=config['use_gpu'],
                ddreg=config['ddreg'],
                max_grad_norm=config['max_grad_norm'],
                loss_shift=config['loss_shift'],
                max_count_no_improve=config['max_count_no_improve'],
                max_count_num_rollback=config['max_count_num_rollback'],
                train_mode=config['train_mode'],
                gec_acc_weight=config['gec_acc_weight'],
                gec_loss_weight=config['gec_loss_weight'],
                dd_loss_weight=config['dd_loss_weight'],
                ddatt_loss_weight=config['ddatt_loss_weight'],
                ddattcls_loss_weight=config['ddattcls_loss_weight'],
                att_scale_up=config['att_scale_up'],
                save_schedule=config['save_schedule'])

    # run training
    t.train_mode = config['train_mode']
    if config['train_mode'] == 'separate':
        seq2seq = t.train_separate(dd_train_set,
                                   gec_train_set,
                                   seq2seq,
                                   num_epochs=config['num_epochs'],
                                   dd_dev_set=dd_dev_set,
                                   gec_dev_set=gec_dev_set)
    elif config['train_mode'] == 'end2end':
        seq2seq = t.train_end2end(ddgec_train_set,
                                  seq2seq,
                                  num_epochs=config['num_epochs'],
                                  ddgec_dev_set=ddgec_dev_set)
Example No. 14
File: train.py Project: EdieLu/LAS
def main():

	# import pdb; pdb.set_trace()
	# load config
	parser = argparse.ArgumentParser(description='LAS Training')
	parser = load_arguments(parser)
	args = vars(parser.parse_args())
	config = validate_config(args)

	# set random seed
	if config['random_seed'] is not None:
		set_global_seeds(config['random_seed'])

	# record config
	if not os.path.isabs(config['save']):
		config_save_dir = os.path.join(os.getcwd(), config['save'])
	if not os.path.exists(config['save']):
		os.makedirs(config['save'])

	# resume or not
	if config['load'] is not None:
		config_save_dir = os.path.join(config['save'], 'model-cont.cfg')
	else:
		config_save_dir = os.path.join(config['save'], 'model.cfg')
	save_config(config, config_save_dir)

	# construct trainer
	t = Trainer(expt_dir=config['save'],
					load_dir=config['load'],
					batch_size=config['batch_size'],
					minibatch_partition=config['minibatch_partition'],
					checkpoint_every=config['checkpoint_every'],
					print_every=config['print_every'],
					learning_rate=config['learning_rate'],
					eval_with_mask=config['eval_with_mask'],
					scheduled_sampling=config['scheduled_sampling'],
					teacher_forcing_ratio=config['teacher_forcing_ratio'],
					use_gpu=config['use_gpu'],
					max_grad_norm=config['max_grad_norm'],
					max_count_no_improve=config['max_count_no_improve'],
					max_count_num_rollback=config['max_count_num_rollback'],
					keep_num=config['keep_num'],
					normalise_loss=config['normalise_loss'])

	# vocab
	path_vocab_src = config['path_vocab_src']

	# load train set
	train_path_src = config['train_path_src']
	train_acous_path = config['train_acous_path']
	train_set = Dataset(train_path_src, path_vocab_src=path_vocab_src,
		use_type=config['use_type'],
		acous_path=train_acous_path,
		seqrev=config['seqrev'],
		acous_norm=config['acous_norm'],
		acous_norm_path=config['acous_norm_path'],
		max_seq_len=config['max_seq_len'],
		batch_size=config['batch_size'],
		acous_max_len=config['acous_max_len'],
		use_gpu=config['use_gpu'],
		logger=t.logger)

	vocab_size = len(train_set.vocab_src)

	# load dev set
	if config['dev_path_src']:
		dev_path_src = config['dev_path_src']
		dev_acous_path = config['dev_acous_path']
		dev_set = Dataset(dev_path_src, path_vocab_src=path_vocab_src,
			use_type=config['use_type'],
			acous_path=dev_acous_path,
			acous_norm_path=config['acous_norm_path'],
			seqrev=config['seqrev'],
			acous_norm=config['acous_norm'],
			max_seq_len=config['max_seq_len'],
			batch_size=config['batch_size'],
			acous_max_len=config['acous_max_len'],
			use_gpu=config['use_gpu'],
			logger=t.logger)
	else:
		dev_set = None

	# construct model
	las_model = LAS(vocab_size,
					embedding_size=config['embedding_size'],
					acous_hidden_size=config['acous_hidden_size'],
					acous_att_mode=config['acous_att_mode'],
					hidden_size_dec=config['hidden_size_dec'],
					hidden_size_shared=config['hidden_size_shared'],
					num_unilstm_dec=config['num_unilstm_dec'],
					#
					acous_dim=config['acous_dim'],
					acous_norm=config['acous_norm'],
					spec_aug=config['spec_aug'],
					batch_norm=config['batch_norm'],
					enc_mode=config['enc_mode'],
					use_type=config['use_type'],
					#
					embedding_dropout=config['embedding_dropout'],
					dropout=config['dropout'],
					residual=config['residual'],
					batch_first=config['batch_first'],
					max_seq_len=config['max_seq_len'],
					load_embedding=config['load_embedding'],
					word2id=train_set.src_word2id,
					id2word=train_set.src_id2word,
					use_gpu=config['use_gpu'])

	device = check_device(config['use_gpu'])
	t.logger.info('device:{}'.format(device))
	las_model = las_model.to(device=device)

	# run training
	las_model = t.train(
		train_set, las_model, num_epochs=config['num_epochs'], dev_set=dev_set)
Example No. 15
def main():

	# load config
	parser = argparse.ArgumentParser(description='Seq2seq Training')
	parser = load_arguments(parser)
	args = vars(parser.parse_args())
	config = validate_config(args)

	# set random seed
	if config['random_seed'] is not None:
		set_global_seeds(config['random_seed'])

	# record config
	if not os.path.isabs(config['save']):
		config_save_dir = os.path.join(os.getcwd(), config['save'])
	if not os.path.exists(config['save']):
		os.makedirs(config['save'])

	# resume or not
	if config['load']:
		resume = True
		print('resuming {} ...'.format(config['load']))
		config_save_dir = os.path.join(config['save'], 'model-cont.cfg')
	else:
		resume = False
		config_save_dir = os.path.join(config['save'], 'model.cfg')
	save_config(config, config_save_dir)

	# construct trainer
	t = Trainer(expt_dir=config['save'],
					load_dir=config['load'],
					batch_size=config['batch_size'],
					checkpoint_every=config['checkpoint_every'],
					print_every=config['print_every'],
					learning_rate=config['learning_rate'],
					eval_with_mask=config['eval_with_mask'],
					scheduled_sampling=config['scheduled_sampling'],
					teacher_forcing_ratio=config['teacher_forcing_ratio'],
					use_gpu=config['use_gpu'],
					max_grad_norm=config['max_grad_norm'],
					max_count_no_improve=config['max_count_no_improve'],
					max_count_num_rollback=config['max_count_num_rollback'],
					keep_num=config['keep_num'],
					normalise_loss=config['normalise_loss'],
					minibatch_split=config['minibatch_split'])

	# load train set
	train_path_src = config['train_path_src']
	train_path_tgt = config['train_path_tgt']
	path_vocab_src = config['path_vocab_src']
	path_vocab_tgt = config['path_vocab_tgt']
	train_set = Dataset(train_path_src, train_path_tgt,
		path_vocab_src=path_vocab_src, path_vocab_tgt=path_vocab_tgt,
		seqrev=config['seqrev'],
		max_seq_len=config['max_seq_len'],
		batch_size=config['batch_size'],
		use_gpu=config['use_gpu'],
		logger=t.logger,
		use_type=config['use_type'])

	vocab_size_enc = len(train_set.vocab_src)
	vocab_size_dec = len(train_set.vocab_tgt)

	# load dev set
	if config['dev_path_src'] and config['dev_path_tgt']:
		dev_path_src = config['dev_path_src']
		dev_path_tgt = config['dev_path_tgt']
		dev_set = Dataset(dev_path_src, dev_path_tgt,
			path_vocab_src=path_vocab_src, path_vocab_tgt=path_vocab_tgt,
			seqrev=config['seqrev'],
			max_seq_len=config['max_seq_len'],
			batch_size=config['batch_size'],
			use_gpu=config['use_gpu'],
			logger=t.logger,
			use_type=config['use_type'])
	else:
		dev_set = None

	# construct model
	seq2seq = Seq2seq(vocab_size_enc, vocab_size_dec,
					share_embedder=config['share_embedder'],
					embedding_size_enc=config['embedding_size_enc'],
					embedding_size_dec=config['embedding_size_dec'],
					embedding_dropout=config['embedding_dropout'],
					hidden_size_enc=config['hidden_size_enc'],
					num_bilstm_enc=config['num_bilstm_enc'],
					num_unilstm_enc=config['num_unilstm_enc'],
					hidden_size_dec=config['hidden_size_dec'],
					num_unilstm_dec=config['num_unilstm_dec'],
					hidden_size_att=config['hidden_size_att'],
					hidden_size_shared=config['hidden_size_shared'],
					dropout=config['dropout'],
					residual=config['residual'],
					batch_first=config['batch_first'],
					max_seq_len=config['max_seq_len'],
					load_embedding_src=config['load_embedding_src'],
					load_embedding_tgt=config['load_embedding_tgt'],
					src_word2id=train_set.src_word2id,
					tgt_word2id=train_set.tgt_word2id,
					src_id2word=train_set.src_id2word,
					tgt_id2word=train_set.tgt_id2word,
					att_mode=config['att_mode'])


	device = check_device(config['use_gpu'])
	t.logger.info('device:{}'.format(device))
	seq2seq = seq2seq.to(device=device)

	# run training
	seq2seq = t.train(train_set, seq2seq,
		num_epochs=config['num_epochs'], resume=resume, dev_set=dev_set)
def main():

	# load config
	parser = argparse.ArgumentParser(description='PyTorch Seq2Seq Evaluation')
	parser = load_arguments(parser)
	args = vars(parser.parse_args())
	config = validate_config(args)
	config_save_dir = os.path.join(config['load'], 'eval.cfg')
	save_config(config, config_save_dir)

	# check device:
	if config['use_gpu'] and torch.cuda.is_available():
		global device
		device = torch.device('cuda')
	else:
		device = torch.device('cpu')
	print('device: {}'.format(device))

	# load src-tgt pair
	test_path_src = config['test_path_src']
	test_path_tgt = config['test_path_tgt']
	path_vocab_src = config['path_vocab_src']
	path_vocab_tgt = config['path_vocab_tgt']
	test_path_out = config['test_path_out']
	test_attkey_path = config['test_attkey_path']
	load_dir = config['load']
	max_seq_len = config['max_seq_len']
	batch_size = config['batch_size']
	beam_width = config['beam_width']
	use_gpu = config['use_gpu']
	seqrev = config['seqrev']
	print('attkey dir: {}'.format(test_attkey_path))
	print('reverse seq: {}'.format(seqrev))
	print('use gpu: {}'.format(use_gpu))

	if not os.path.exists(test_path_out):
		os.makedirs(test_path_out)
	config_save_dir = os.path.join(test_path_out, 'eval.cfg')
	save_config(config, config_save_dir)

	# set test mode
	MODE = config['eval_mode']
	if MODE == 3:
		max_seq_len = 32
		batch_size = 1
		beam_width = 1
		use_gpu = False

	# load test_set
	test_set = Dataset(test_path_src, test_path_tgt,
						path_vocab_src, path_vocab_tgt,
						attkey_path=test_attkey_path,seqrev=seqrev,
						max_seq_len=max_seq_len, batch_size=batch_size,
						use_gpu=use_gpu)
	print('Testset loaded')
	sys.stdout.flush()

	# run eval
	if MODE == 1:
		# run evaluation
		# print("use gpu: {}".format(config['use_gpu']))
		accuracy = evaluate(test_set, load_dir, test_path_out,
			use_gpu, max_seq_len, beam_width, seqrev=seqrev)
		print(accuracy)

	elif MODE == 2:
		translate(test_set, load_dir, test_path_out,
			use_gpu, max_seq_len, beam_width, seqrev=seqrev)

	elif MODE == 3:
		# plotting
		att_plot(test_set, load_dir, test_path_out, use_gpu, max_seq_len, beam_width)
Example No. 17
def main():

    # load config
    parser = argparse.ArgumentParser(description='Evaluation')
    parser = load_arguments(parser)
    args = vars(parser.parse_args())
    config = validate_config(args)

    # load src-tgt pair
    test_path_src = config['test_path_src']
    test_path_tgt = config['test_path_tgt']
    if test_path_tgt is None:
        test_path_tgt = test_path_src

    test_path_out = config['test_path_out']
    test_acous_path = config['test_acous_path']
    acous_norm_path = config['acous_norm_path']

    load_dir = config['load']
    max_seq_len = config['max_seq_len']
    batch_size = config['batch_size']
    beam_width = config['beam_width']
    use_gpu = config['use_gpu']
    seqrev = config['seqrev']
    use_type = config['use_type']

    # set test mode
    MODE = config['eval_mode']
    if MODE != 2:
        if not os.path.exists(test_path_out):
            os.makedirs(test_path_out)
        config_save_dir = os.path.join(test_path_out, 'eval.cfg')
        save_config(config, config_save_dir)

    # check device:
    device = check_device(use_gpu)
    print('device: {}'.format(device))

    # load model
    latest_checkpoint_path = load_dir
    resume_checkpoint = Checkpoint.load(latest_checkpoint_path)
    model = resume_checkpoint.model.to(device)
    vocab_src = resume_checkpoint.input_vocab
    vocab_tgt = resume_checkpoint.output_vocab
    print('Model dir: {}'.format(latest_checkpoint_path))
    print('Model loaded')

    # combine model
    if config['combine_path'] is not None:
        model = combine_weights(config['combine_path'])
    # import pdb; pdb.set_trace()

    # load test_set
    test_set = Dataset(
        path_src=test_path_src,
        path_tgt=test_path_tgt,
        vocab_src_list=vocab_src,
        vocab_tgt_list=vocab_tgt,
        use_type=use_type,
        acous_path=test_acous_path,
        seqrev=seqrev,
        acous_norm=config['acous_norm'],
        acous_norm_path=config['acous_norm_path'],
        acous_max_len=6000,  # max 50k for mustc trainset
        max_seq_len_src=900,
        max_seq_len_tgt=900,  # max 2.5k for mustc trainset
        batch_size=batch_size,
        mode='ST',
        use_gpu=use_gpu)

    print('Test dir: {}'.format(test_path_src))
    print('Testset loaded')
    sys.stdout.flush()

    # '{AE|ASR|MT|ST}-{REF|HYP}'
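    # e.g. 'ST-REF' -> gen_mode='ST', history='REF'; a bare 'ST' defaults history to 'HYP'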
    if len(config['gen_mode'].split('-')) == 2:
        gen_mode = config['gen_mode'].split('-')[0]
        history = config['gen_mode'].split('-')[1]
    elif len(config['gen_mode'].split('-')) == 1:
        gen_mode = config['gen_mode']
        history = 'HYP'

    # add external language model
    lm_mode = config['lm_mode']

    # run eval:
    if MODE == 1:
        translate(test_set,
                  model,
                  test_path_out,
                  use_gpu,
                  max_seq_len,
                  beam_width,
                  device,
                  seqrev=seqrev,
                  gen_mode=gen_mode,
                  lm_mode=lm_mode,
                  history=history)

    elif MODE == 2:  # save combined model
        ckpt = Checkpoint(model=model,
                          optimizer=None,
                          epoch=0,
                          step=0,
                          input_vocab=test_set.vocab_src,
                          output_vocab=test_set.vocab_tgt)
        saved_path = ckpt.save_customise(
            os.path.join(config['combine_path'].strip('/') + '-combine',
                         'combine'))
        log_ckpts(config['combine_path'],
                  config['combine_path'].strip('/') + '-combine')
        print('saving at {} ... '.format(saved_path))

    elif MODE == 3:
        plot_emb(test_set, model, test_path_out, use_gpu, max_seq_len, device)

    elif MODE == 4:
        gather_emb(test_set, model, test_path_out, use_gpu, max_seq_len,
                   device)

    elif MODE == 5:
        compute_kl(test_set, model, test_path_out, use_gpu, max_seq_len,
                   device)
Example No. 18
def main():

    # load config
    parser = argparse.ArgumentParser(description='PyTorch LAS DD Evaluation')
    parser = load_arguments(parser)
    args = vars(parser.parse_args())
    config = validate_config(args)
    config_save_dir = os.path.join(config['load'], 'eval.cfg')

    # check device:
    if config['use_gpu'] and torch.cuda.is_available():
        global device
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print('device: {}'.format(device))

    # load src-tgt pair
    test_path_src = config['test_path_src']
    test_path_tgt = config['test_path_tgt']
    path_vocab_src = config['path_vocab_src']
    path_vocab_tgt = config['path_vocab_tgt']
    test_path_out = config['test_path_out']
    test_tsv_path = config['test_tsv_path']
    test_acous_path = config['test_acous_path']

    load_dir = config['load']
    max_seq_len = config['max_seq_len']
    batch_size = config['batch_size']
    beam_width = config['beam_width']
    use_gpu = config['use_gpu']
    seqrev = config['seqrev']
    print('reverse seq: {}'.format(seqrev))
    print('use gpu: {}'.format(use_gpu))

    if not os.path.exists(test_path_out):
        os.makedirs(test_path_out)
    config_save_dir = os.path.join(test_path_out, 'eval.cfg')
    save_config(config, config_save_dir)

    # set test mode: 3 = DEBUG; 4 = PLOT
    MODE = config['eval_mode']
    if MODE == 3 or MODE == 4 or MODE == 6:
        max_seq_len = 32
        batch_size = 1
        beam_width = 1
        use_gpu = False

    # load test_set
    test_set = Dataset(test_path_src,
                       test_path_tgt,
                       path_vocab_src,
                       path_vocab_tgt,
                       use_type=config['use_type'],
                       seqrev=config['seqrev'],
                       add_acous=config['add_acous'],
                       acous_path=test_acous_path,
                       acous_norm=config['acous_norm'],
                       tsv_path=test_tsv_path,
                       keep_filler=config['keep_filler'],
                       tag_rev=config['tag_rev'],
                       add_timestamp=config['add_times'],
                       timestamp_path=config['test_times_path'],
                       max_seq_len=max_seq_len,
                       batch_size=batch_size,
                       use_gpu=use_gpu)
    print('Testset loaded')
    sys.stdout.flush()

    # run eval
    if MODE == 2:
        if config['add_acous']:
            translate_acous(test_set,
                            load_dir,
                            test_path_out,
                            use_gpu,
                            max_seq_len,
                            beam_width,
                            seqrev=seqrev)
        else:
            translate(test_set,
                      load_dir,
                      test_path_out,
                      use_gpu,
                      max_seq_len,
                      beam_width,
                      seqrev=seqrev)

    elif MODE == 5:
        # debug for beam search
        debug_beam_search(test_set, load_dir, use_gpu, max_seq_len, beam_width)

    elif MODE == 6:
        # plotting las attn
        acous_att_plot(test_set, load_dir, test_path_out, use_gpu, max_seq_len,
                       beam_width)