def test_ConditionalLSTM_dot():
    """Train, sample from and score an LSTM / ConditionalLSTM model using 'dot' attention.

    The test overrides the default test parameters, rebuilds the dataset,
    trains a model and then runs ``sample_ensemble`` (with and without
    n-best lists) and ``score_corpus`` on the validation split.
    """
    params = load_tests_params()

    # Current test params: single-layered bidirectional LSTM encoder,
    # ConditionalLSTM decoder, 'dot' attention.
    params['BIDIRECTIONAL_ENCODER'] = True
    params['N_LAYERS_ENCODER'] = 1
    params['BIDIRECTIONAL_DEEP_ENCODER'] = True
    params['ENCODER_RNN_TYPE'] = 'LSTM'
    params['DECODER_RNN_TYPE'] = 'ConditionalLSTM'
    params['N_LAYERS_DECODER'] = 1
    params['ATTENTION_MODE'] = 'dot'

    params['REBUILD_DATASET'] = True
    dataset = build_dataset(params)
    # Vocabulary sizes are taken from the dataset that was just built.
    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]

    # NOTE: the attention mode is appended directly after the decoder hidden
    # size (no separator); the name is kept unchanged on purpose.
    params['MODEL_NAME'] = \
        params['TASK_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '_' + params['MODEL_TYPE'] + \
        '_src_emb_' + str(params['SOURCE_TEXT_EMBEDDING_SIZE']) + \
        '_bidir_' + str(params['BIDIRECTIONAL_ENCODER']) + \
        '_enc_' + params['ENCODER_RNN_TYPE'] + '_*' + str(params['N_LAYERS_ENCODER']) + '_' + str(
            params['ENCODER_HIDDEN_SIZE']) + \
        '_dec_' + params['DECODER_RNN_TYPE'] + '_*' + str(params['N_LAYERS_DECODER']) + '_' + str(
            params['DECODER_HIDDEN_SIZE']) + params['ATTENTION_MODE'] + \
        '_deepout_' + '_'.join([layer[0] for layer in params['DEEP_OUTPUT_LAYERS']]) + \
        '_trg_emb_' + str(params['TARGET_TEXT_EMBEDDING_SIZE']) + \
        '_' + params['OPTIMIZER'] + '_' + str(params['LR'])
    params['STORE_PATH'] = os.path.join(K.backend() + '_test_train_models', params['MODEL_NAME'])

    # Test several NMT-Keras utilities: train, sample, sample_ensemble, score_corpus...
    print("Training model")
    train_model(params)
    # Presumably makes the following utilities reload the epoch-1 model; verify against train_model.
    params['RELOAD'] = 1
    print("Done")

    # Validation files for the source and target languages.
    val_source = os.path.join(params['DATA_ROOT_PATH'], params['TEXT_FILES']['val'] + params['SRC_LAN'])
    val_target = os.path.join(params['DATA_ROOT_PATH'], params['TEXT_FILES']['val'] + params['TRG_LAN'])

    # The ArgumentParser is only used as an attribute container here:
    # nothing is parsed from the command line.
    parser = argparse.ArgumentParser('Parser for unit testing')
    parser.dataset = os.path.join(
        params['DATASET_STORE_PATH'],
        'Dataset_' + params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '.pkl')
    parser.text = val_source
    parser.splits = ['val']
    # os.path.join keeps these paths portable and matches the other tests.
    parser.config = os.path.join(params['STORE_PATH'], 'config.pkl')
    parser.models = [os.path.join(params['STORE_PATH'], 'epoch_' + str(1))]
    parser.verbose = 0
    parser.dest = None
    parser.source = val_source
    parser.target = val_target
    parser.weights = []
    parser.glossary = None

    for n_best in [True, False]:
        parser.n_best = n_best
        print("Sampling with n_best = %s " % str(n_best))
        sample_ensemble(parser, params)
        print("Done")

    print("Scoring corpus")
    score_corpus(parser, params)
    print("Done")
def test_unk_replace_1():
    """Train, sample from and score a model with POS_UNK enabled and HEURISTIC set to 1.

    After training, ``sample_ensemble`` is run on the validation split with
    and without n-best lists, followed by ``score_corpus``.
    """
    params = load_tests_params()

    # Settings applied before the dataset is built.
    for key, value in (
            ('REBUILD_DATASET', True),
            ('INPUT_VOCABULARY_SIZE', 0),
            ('OUTPUT_VOCABULARY_SIZE', 50),
            ('POS_UNK', True),
            ('HEURISTIC', 1),
            ('ALIGN_FROM_RAW', True),
    ):
        params[key] = value

    ds = build_dataset(params)
    # NOTE: no explicit params['MAPPING'] file is configured by this test.
    # The vocabulary sizes set above are replaced by the ones of the built dataset.
    params['INPUT_VOCABULARY_SIZE'] = ds.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = ds.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]

    name_parts = [
        params['TASK_NAME'], '_', params['SRC_LAN'], params['TRG_LAN'], '_', params['MODEL_TYPE'],
        '_src_emb_', str(params['SOURCE_TEXT_EMBEDDING_SIZE']),
        '_bidir_', str(params['BIDIRECTIONAL_ENCODER']),
        '_enc_', params['ENCODER_RNN_TYPE'], '_*', str(params['N_LAYERS_ENCODER']),
        '_', str(params['ENCODER_HIDDEN_SIZE']),
        '_dec_', params['DECODER_RNN_TYPE'], '_*', str(params['N_LAYERS_DECODER']),
        '_', str(params['DECODER_HIDDEN_SIZE']),
        '_deepout_', '_'.join([layer[0] for layer in params['DEEP_OUTPUT_LAYERS']]),
        '_trg_emb_', str(params['TARGET_TEXT_EMBEDDING_SIZE']),
        '_', params['OPTIMIZER'], '_', str(params['LR']),
    ]
    params['MODEL_NAME'] = ''.join(name_parts)
    params['STORE_PATH'] = os.path.join(K.backend() + '_test_train_models', params['MODEL_NAME'])

    # Test several NMT-Keras utilities: train, sample, sample_ensemble, score_corpus...
    print("Training model")
    train_model(params)
    params['RELOAD'] = 1
    print("Done")

    # The ArgumentParser instance only carries attributes; nothing is parsed.
    parser = argparse.ArgumentParser('Parser for unit testing')
    settings = {
        'dataset': os.path.join(
            params['DATASET_STORE_PATH'],
            'Dataset_' + params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '.pkl'),
        'text': os.path.join(params['DATA_ROOT_PATH'], params['TEXT_FILES']['val'] + params['SRC_LAN']),
        'splits': ['val'],
        'config': os.path.join(params['STORE_PATH'], 'config.pkl'),
        'models': [os.path.join(params['STORE_PATH'], 'epoch_' + str(1))],
        'verbose': 0,
        'dest': None,
        'source': os.path.join(params['DATA_ROOT_PATH'], params['TEXT_FILES']['val'] + params['SRC_LAN']),
        'target': os.path.join(params['DATA_ROOT_PATH'], params['TEXT_FILES']['val'] + params['TRG_LAN']),
        'weights': [],
        'glossary': None,
    }
    for attr, value in settings.items():
        setattr(parser, attr, value)

    for use_n_best in (True, False):
        parser.n_best = use_n_best
        print("Sampling with n_best = %s " % str(use_n_best))
        sample_ensemble(parser, params)
        print("Done")

    print("Scoring corpus")
    score_corpus(parser, params)
    print("Done")
def load_transformer_test_params():
    """Return the default test parameters, overridden for a small Transformer model.

    :return: The dictionary from ``load_tests_params`` with the Transformer
             settings below applied.
    """
    model_size = 8
    overrides = (
        ('MODEL_TYPE', 'Transformer'),
        ('N_LAYERS_ENCODER', 2),
        ('N_LAYERS_DECODER', 2),
        ('MULTIHEAD_ATTENTION_ACTIVATION', 'relu'),
        ('MODEL_SIZE', model_size),
        # The feed-forward size is four times the model size.
        ('FF_SIZE', model_size * 4),
        ('N_HEADS', 2),
        ('REBUILD_DATASET', True),
        ('OPTIMIZED_SEARCH', True),
        ('POS_UNK', False),
    )

    params = load_tests_params()
    for key, value in overrides:
        params[key] = value
    return params
def test_sampling_maxlikelihood():
    """Train a model while sampling with the 'max_likelihood' strategy during training.

    Only ``train_model`` is exercised; no sampling or scoring utilities are
    called afterwards.
    """
    params = load_tests_params()

    for key, value in (
            ('REBUILD_DATASET', True),
            ('INPUT_VOCABULARY_SIZE', 550),
            ('OUTPUT_VOCABULARY_SIZE', 550),
            ('POS_UNK', True),
            ('HEURISTIC', 0),
            ('ALIGN_FROM_RAW', True),
            # Sampling params: Show some samples during training.
            ('SAMPLE_ON_SETS', ['train', 'val']),
            ('N_SAMPLES', 10),
            ('START_SAMPLING_ON_EPOCH', 0),
            ('SAMPLE_EACH_UPDATES', 50),
            ('SAMPLING', 'max_likelihood'),
    ):
        params[key] = value

    ds = build_dataset(params)
    # The vocabulary sizes set above are replaced by the ones of the built dataset.
    params['INPUT_VOCABULARY_SIZE'] = ds.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = ds.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]

    name_parts = [
        params['TASK_NAME'], '_', params['SRC_LAN'], params['TRG_LAN'], '_', params['MODEL_TYPE'],
        '_src_emb_', str(params['SOURCE_TEXT_EMBEDDING_SIZE']),
        '_bidir_', str(params['BIDIRECTIONAL_ENCODER']),
        '_enc_', params['ENCODER_RNN_TYPE'], '_*', str(params['N_LAYERS_ENCODER']),
        '_', str(params['ENCODER_HIDDEN_SIZE']),
        '_dec_', params['DECODER_RNN_TYPE'], '_*', str(params['N_LAYERS_DECODER']),
        '_', str(params['DECODER_HIDDEN_SIZE']),
        '_deepout_', '_'.join([layer[0] for layer in params['DEEP_OUTPUT_LAYERS']]),
        '_trg_emb_', str(params['TARGET_TEXT_EMBEDDING_SIZE']),
        '_', params['OPTIMIZER'], '_', str(params['LR']),
    ]
    params['MODEL_NAME'] = ''.join(name_parts)
    params['STORE_PATH'] = os.path.join(K.backend() + '_test_train_models', params['MODEL_NAME'])

    # Test several NMT-Keras utilities: train, sample, sample_ensemble, score_corpus...
    print("Training model")
    train_model(params)
    print("Done")
def test_transformer():
    """Train, sample from and score a small Transformer model with tied embeddings.

    The test builds the dataset, trains the model, runs ``sample_ensemble``
    (with and without n-best lists) and ``score_corpus`` on the validation
    split, and finally calls ``clean_dirs``.
    """
    # Current test params: Transformer (shared settings come from the helper),
    # plus tied embeddings.
    params = load_transformer_test_params()
    params['TIED_EMBEDDINGS'] = True

    dataset = build_dataset(params)
    # Vocabulary sizes are taken from the dataset that was just built.
    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[
        params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[
        params['OUTPUTS_IDS_DATASET'][0]]
    params['MODEL_NAME'] = \
        params['TASK_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '_' + params['MODEL_TYPE'] + \
        '_model_size_' + str(params['MODEL_SIZE']) + \
        '_ff_size_' + str(params['FF_SIZE']) + \
        '_num_heads_' + str(params['N_HEADS']) + \
        '_encoder_blocks_' + str(params['N_LAYERS_ENCODER']) + \
        '_decoder_blocks_' + str(params['N_LAYERS_DECODER']) + \
        '_deepout_' + '_'.join([layer[0] for layer in params['DEEP_OUTPUT_LAYERS']]) + \
        '_' + params['OPTIMIZER'] + '_' + str(params['LR'])
    # Derive the store path from the model name, as the other tests do.
    # Without this, the test keeps whatever STORE_PATH load_tests_params
    # returned, which does not correspond to the model name built above.
    params['STORE_PATH'] = os.path.join(K.backend() + '_test_train_models', params['MODEL_NAME'])

    # Test several NMT-Keras utilities: train, sample, sample_ensemble, score_corpus...
    print("Training model")
    train_model(params)
    # Presumably makes the following utilities reload the epoch-1 model; verify against train_model.
    params['RELOAD'] = 1
    print("Done")

    # Validation files for the source and target languages.
    val_source = os.path.join(
        params['DATA_ROOT_PATH'], params['TEXT_FILES']['val'] + params['SRC_LAN'])
    val_target = os.path.join(
        params['DATA_ROOT_PATH'], params['TEXT_FILES']['val'] + params['TRG_LAN'])

    # The ArgumentParser is only used as an attribute container here:
    # nothing is parsed from the command line.
    parser = argparse.ArgumentParser('Parser for unit testing')
    parser.dataset = os.path.join(
        params['DATASET_STORE_PATH'],
        'Dataset_' + params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '.pkl')
    parser.text = val_source
    parser.splits = ['val']
    # os.path.join keeps these paths portable.
    parser.config = os.path.join(params['STORE_PATH'], 'config.pkl')
    parser.models = [os.path.join(params['STORE_PATH'], 'epoch_' + str(1))]
    parser.verbose = 0
    parser.dest = None
    parser.source = val_source
    parser.target = val_target
    parser.weights = []
    parser.glossary = None

    for n_best in [True, False]:
        parser.n_best = n_best
        print("Sampling with n_best = %s " % str(n_best))
        sample_ensemble(parser, params)
        print("Done")

    print("Scoring corpus")
    score_corpus(parser, params)
    print("Done")

    clean_dirs(params)