def test_text_features_none(): params = load_tests_params() # Current test params: params['INPUTS_TYPES_DATASET'] = ['text', 'text'] params['OUTPUTS_TYPES_DATASET'] = ['text'] params['REBUILD_DATASET'] = True dataset = build_dataset(params) params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]] params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]] params['MODEL_NAME'] = \ params['TASK_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '_' + params['MODEL_TYPE'] + \ '_src_emb_' + str(params['SOURCE_TEXT_EMBEDDING_SIZE']) + \ '_bidir_' + str(params['BIDIRECTIONAL_ENCODER']) + \ '_enc_' + params['ENCODER_RNN_TYPE'] + '_*' + str(params['N_LAYERS_ENCODER']) + '_' + str( params['ENCODER_HIDDEN_SIZE']) + \ '_dec_' + params['DECODER_RNN_TYPE'] + '_*' + str(params['N_LAYERS_DECODER']) + '_' + str( params['DECODER_HIDDEN_SIZE']) + \ '_deepout_' + '_'.join([layer[0] for layer in params['DEEP_OUTPUT_LAYERS']]) + \ '_trg_emb_' + str(params['TARGET_TEXT_EMBEDDING_SIZE']) + \ '_' + params['OPTIMIZER'] + '_' + str(params['LR']) params['STORE_PATH'] = K.backend() + '_test_train_models/' + params['MODEL_NAME'] + '/' # Test several NMT-Keras utilities: train, sample, sample_ensemble... print ("Training model") train_model(params) params['RELOAD'] = 1 print ("Done") parser = argparse.ArgumentParser('Parser for unit testing') parser.dataset = params['DATASET_STORE_PATH'] + '/Dataset_' + params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '.pkl' parser.text = params['DATA_ROOT_PATH'] + '/' + params['TEXT_FILES']['val'] + params['SRC_LAN'] parser.splits = ['val'] parser.config = params['STORE_PATH'] + '/config.pkl' parser.models = [params['STORE_PATH'] + '/epoch_' + str(1)] parser.verbose = 0 parser.dest = None parser.source = params['DATA_ROOT_PATH'] + '/' + params['TEXT_FILES']['val'] + params['SRC_LAN'] parser.target = params['DATA_ROOT_PATH'] + '/' + params['TEXT_FILES']['val'] + params['TRG_LAN'] parser.weights = [] parser.glossary = None for n_best in [True, False]: parser.n_best = n_best print ("Sampling with n_best = %s " % str(n_best)) sample_ensemble(parser, params) print ("Done")
def test_NMT_Unidir_GRU_ConditionalLSTM(): params = load_tests_params() # Current test params: Single layered GRU - ConditionalLSTM params['BIDIRECTIONAL_ENCODER'] = False params['N_LAYERS_ENCODER'] = 1 params['BIDIRECTIONAL_DEEP_ENCODER'] = False params['ENCODER_RNN_TYPE'] = 'GRU' params['DECODER_RNN_TYPE'] = 'ConditionalLSTM' params['N_LAYERS_DECODER'] = 1 params['REBUILD_DATASET'] = True dataset = build_dataset(params) params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]] params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]] params['MODEL_NAME'] = \ params['TASK_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '_' + params['MODEL_TYPE'] + \ '_src_emb_' + str(params['SOURCE_TEXT_EMBEDDING_SIZE']) + \ '_bidir_' + str(params['BIDIRECTIONAL_ENCODER']) + \ '_enc_' + params['ENCODER_RNN_TYPE'] + '_*' + str(params['N_LAYERS_ENCODER']) + '_' + str( params['ENCODER_HIDDEN_SIZE']) + \ '_dec_' + params['DECODER_RNN_TYPE'] + '_*' + str(params['N_LAYERS_DECODER']) + '_' + str( params['DECODER_HIDDEN_SIZE']) + \ '_deepout_' + '_'.join([layer[0] for layer in params['DEEP_OUTPUT_LAYERS']]) + \ '_trg_emb_' + str(params['TARGET_TEXT_EMBEDDING_SIZE']) + \ '_' + params['OPTIMIZER'] + '_' + str(params['LR']) params['STORE_PATH'] = K.backend() + '_test_train_models/' + params['MODEL_NAME'] + '/' # Test several NMT-Keras utilities: train, sample, sample_ensemble, score_corpus... train_model(params) params['RELOAD'] = 2 apply_NMT_model(params) parser = argparse.ArgumentParser('Parser for unit testing') parser.dataset = params['DATASET_STORE_PATH'] + '/Dataset_' + params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '.pkl' parser.text = params['DATA_ROOT_PATH'] + '/' + params['TEXT_FILES']['val'] + params['SRC_LAN'] parser.splits = ['val'] parser.config = params['STORE_PATH'] + '/config.pkl' parser.models = [params['STORE_PATH'] + '/epoch_' + str(2)] parser.verbose = 0 parser.dest = None parser.source = params['DATA_ROOT_PATH'] + '/' + params['TEXT_FILES']['val'] + params['SRC_LAN'] parser.target = params['DATA_ROOT_PATH'] + '/' + params['TEXT_FILES']['val'] + params['TRG_LAN'] parser.weights = [] for n_best in [True, False]: parser.n_best = n_best sample_ensemble(parser, params) score_corpus(parser, params)
def test_keep_n_captions():
    params = load_parameters()
    params['REBUILD_DATASET'] = True
    ds = build_dataset(params)
    len_splits = {'train': 9900, 'val': 100, 'test': 2996}
    for splits in [[], None, ['val'], ['val', 'test']]:
        prepare_references(ds, 1, n=1, set_names=splits)
        if splits is not None:
            for split in splits:
                len_split = len_splits[split]
                assert eval('ds.len_' + split) == len_split
                assert eval('all(ds.loaded_' + split + ')')
                assert len(eval('ds.X_' + split + str([params['INPUTS_IDS_DATASET'][0]]))) == len_split
                assert len(eval('ds.Y_' + split + str([params['OUTPUTS_IDS_DATASET'][0]]))) == len_split
def build(params):
    ds = build_dataset(params)
    params['OUTPUT_VOCABULARY_SIZE'] = ds.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]
    vocab = ds.vocabulary[params['OUTPUTS_IDS_DATASET'][0]]['idx2words']
    # We only want the model for decoding
    video_model = VideoDesc_Model(params,
                                  type=params['MODEL_TYPE'],
                                  verbose=0,
                                  model_name=params['MODEL_NAME'],
                                  vocabularies=ds.vocabulary,
                                  store_path=params['STORE_PATH'],
                                  set_optimizer=False)
    return ds, vocab, video_model
def apply_VQA_model(params):
    """
    Function for using a previously trained model for sampling.
    """
    ########### Load data
    dataset = build_dataset(params)
    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]
    ###########

    ########### Load model
    vqa = loadModel(params['STORE_PATH'], params['RELOAD'])
    vqa.setOptimizer()
    ###########

    ########### Apply sampling
    for s in params["EVAL_ON_SETS"]:
        # Apply model predictions
        params_prediction = {'batch_size': params['BATCH_SIZE'],
                             'n_parallel_loaders': params['PARALLEL_LOADERS'],
                             'predict_on_sets': [s]}
        predictions = vqa.predictNet(dataset, params_prediction)[s]

        # Convert predictions into sentences
        vocab = dataset.vocabulary[params['OUTPUTS_IDS_DATASET'][0]]['idx2words']
        predictions = vqa.decode_predictions(predictions,
                                             1,  # always set temperature to 1
                                             vocab,
                                             params['SAMPLING'],
                                             verbose=params['VERBOSE'])

        # Store result
        filepath = vqa.model_path + '/' + s + '_sampling.txt'  # results file
        if params['SAMPLING_SAVE_MODE'] == 'list':
            list2file(filepath, predictions)
        elif params['SAMPLING_SAVE_MODE'] == 'vqa':
            exec('question_ids = dataset.X_' + s + '["' + params['INPUTS_IDS_DATASET'][0] + '_ids"]')
            list2vqa(filepath, predictions, question_ids)
def test_build_datset():
    params = load_parameters()
    for verbose in range(2):
        params['REBUILD_DATASET'] = True
        params['VERBOSE'] = verbose
        ds = build_dataset(params)
        assert isinstance(ds, Dataset)
        len_splits = [('train', 9900), ('val', 100), ('test', 2996)]
        for split, len_split in len_splits:
            assert eval('ds.len_' + split) == len_split
            assert eval('all(ds.loaded_' + split + ')')
            assert len(eval('ds.X_' + split + str([params['INPUTS_IDS_DATASET'][0]]))) == len_split
            assert len(eval('ds.Y_' + split + str([params['OUTPUTS_IDS_DATASET'][0]]))) == len_split
def test_build_datset(self):
    params = load_parameters()
    params['REBUILD_DATASET'] = True
    params['DATASET_STORE_PATH'] = './'
    ds = build_dataset(params)
    self.assertIsInstance(ds, Dataset)
    len_splits = [('train', 9900), ('val', 100), ('test', 2996)]
    for split, len_split in len_splits:
        self.assertEqual(eval('ds.len_' + split), len_split)
        self.assertTrue(eval('all(ds.loaded_' + split + ')'))
        self.assertEqual(len(eval('ds.X_' + split + str([params['INPUTS_IDS_DATASET'][0]]))), len_split)
        self.assertEqual(len(eval('ds.Y_' + split + str([params['OUTPUTS_IDS_DATASET'][0]]))), len_split)
def test_keep_n_captions():
    params = load_parameters()
    params['REBUILD_DATASET'] = True
    params['DATASET_STORE_PATH'] = './'
    ds = build_dataset(params)
    len_splits = {'train': 9900, 'val': 100, 'test': 2996}
    for splits in [[], None, ['val'], ['val', 'test']]:
        keep_n_captions(ds, 1, n=1, set_names=splits)
        if splits is not None:
            for split in splits:
                len_split = len_splits[split]
                assert eval('ds.len_' + split) == len_split
                assert eval('all(ds.loaded_' + split + ')')
                assert len(eval('ds.X_' + split + str([params['INPUTS_IDS_DATASET'][0]]))) == len_split
                assert len(eval('ds.Y_' + split + str([params['OUTPUTS_IDS_DATASET'][0]]))) == len_split


if __name__ == '__main__':
    pytest.main([__file__])
def test_update_dataset_from_file():
    params = load_parameters()
    for rebuild_dataset in [True, False]:
        params['REBUILD_DATASET'] = rebuild_dataset
        params['DATASET_STORE_PATH'] = './'
        for splits in [[], None, ['val']]:
            ds = build_dataset(params)
            assert isinstance(ds, Dataset)
            for output_text_filename in [None, params['DATA_ROOT_PATH'] + params['TEXT_FILES']['test'] + params['TRG_LAN']]:
                for remove_outputs in [True, False]:
                    for compute_state_below in [True, False]:
                        for recompute_references in [True, False]:
                            ds2 = update_dataset_from_file(copy.deepcopy(ds),
                                                           params['DATA_ROOT_PATH'] + params['TEXT_FILES']['test'] + params['SRC_LAN'],
                                                           params,
                                                           splits=splits,
                                                           output_text_filename=output_text_filename,
                                                           remove_outputs=remove_outputs,
                                                           compute_state_below=compute_state_below,
                                                           recompute_references=recompute_references)
                            assert isinstance(ds2, Dataset)

    # Final check: We update the val set with the test data. We check that dimensions match.
    split = 'val'
    len_test = 2996
    ds2 = update_dataset_from_file(copy.deepcopy(ds),
                                   params['DATA_ROOT_PATH'] + params['TEXT_FILES']['test'] + params['SRC_LAN'],
                                   params,
                                   splits=[split],
                                   output_text_filename=params['DATA_ROOT_PATH'] + params['TEXT_FILES']['test'] + params['TRG_LAN'],
                                   remove_outputs=False,
                                   compute_state_below=True,
                                   recompute_references=True)
    assert isinstance(ds2, Dataset)
    assert eval('ds2.len_' + split) == len_test
    assert eval('all(ds2.loaded_' + split + ')')
    assert len(eval('ds2.X_' + split + str([params['INPUTS_IDS_DATASET'][0]]))) == len_test
    assert len(eval('ds2.Y_' + split + str([params['OUTPUTS_IDS_DATASET'][0]]))) == len_test


if __name__ == '__main__':
    pytest.main([__file__])
def test_build(self): params = load_parameters() params['DATASET_STORE_PATH'] = './' params['REBUILD_DATASET'] = True dataset = build_dataset(params) params['INPUT_VOCABULARY_SIZE'] = \ dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]] params['OUTPUT_VOCABULARY_SIZE'] = \ dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]] for encoder_rnn_type in ['LSTM', 'GRU']: for decoder_rnn_type in [ 'LSTM', 'GRU', 'ConditionalLSTM', 'ConditionalGRU' ]: params['ENCODER_RNN_TYPE'] = encoder_rnn_type params['DECODER_RNN_TYPE'] = decoder_rnn_type for n_layers in range(2): params['N_LAYERS_DECODER'] = n_layers params['N_LAYERS_ENCODER'] = n_layers nmt_model = \ TranslationModel(params, model_type=params['MODEL_TYPE'], verbose=params['VERBOSE'], model_name=params['MODEL_NAME'], vocabularies=dataset.vocabulary, store_path=params['STORE_PATH'], clear_dirs=False) self.assertIsInstance(nmt_model, Model_Wrapper) # Check Inputs inputMapping = dict() for i, id_in in enumerate(params['INPUTS_IDS_DATASET']): pos_source = dataset.ids_inputs.index(id_in) id_dest = nmt_model.ids_inputs[i] inputMapping[id_dest] = pos_source nmt_model.setInputsMapping(inputMapping) outputMapping = dict() for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']): pos_target = dataset.ids_outputs.index(id_out) id_dest = nmt_model.ids_outputs[i] outputMapping[id_dest] = pos_target nmt_model.setOutputsMapping(outputMapping) return True
def test_sampling_maxlikelihood():
    params = load_tests_params()
    params['REBUILD_DATASET'] = True
    params['INPUT_VOCABULARY_SIZE'] = 550
    params['OUTPUT_VOCABULARY_SIZE'] = 550
    params['POS_UNK'] = True
    params['HEURISTIC'] = 0
    params['ALIGN_FROM_RAW'] = True

    # Sampling params: Show some samples during training.
    params['SAMPLE_ON_SETS'] = ['train', 'val']
    params['N_SAMPLES'] = 10
    params['START_SAMPLING_ON_EPOCH'] = 0
    params['SAMPLE_EACH_UPDATES'] = 50
    params['SAMPLING'] = 'max_likelihood'

    dataset = build_dataset(params)
    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]

    params['MODEL_NAME'] = \
        params['TASK_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '_' + params['MODEL_TYPE'] + \
        '_src_emb_' + str(params['SOURCE_TEXT_EMBEDDING_SIZE']) + \
        '_bidir_' + str(params['BIDIRECTIONAL_ENCODER']) + \
        '_enc_' + params['ENCODER_RNN_TYPE'] + '_*' + str(params['N_LAYERS_ENCODER']) + '_' + str(params['ENCODER_HIDDEN_SIZE']) + \
        '_dec_' + params['DECODER_RNN_TYPE'] + '_*' + str(params['N_LAYERS_DECODER']) + '_' + str(params['DECODER_HIDDEN_SIZE']) + \
        '_deepout_' + '_'.join([layer[0] for layer in params['DEEP_OUTPUT_LAYERS']]) + \
        '_trg_emb_' + str(params['TARGET_TEXT_EMBEDDING_SIZE']) + \
        '_' + params['OPTIMIZER'] + '_' + str(params['LR'])
    params['STORE_PATH'] = os.path.join(K.backend() + '_test_train_models', params['MODEL_NAME'])

    # Test several NMT-Keras utilities: train, sample, sample_ensemble, score_corpus...
    print("Training model")
    train_model(params)
    print("Done")
def apply_model(params):
    """
    Function for using a previously trained model for sampling.
    """
    ########### Load data
    dataset = build_dataset(params)
    # Keep original image size if IMAGE_CROPPING == False
    if not params['IMAGE_CROPPING']:
        dataset.img_size_crop = dataset.img_size
    ###########

    ########### Load model
    model = loadModel(params['STORE_PATH'], params['RELOAD'],
                      custom_objects={"AttentionComplex": AttentionComplex})
    model.setOptimizer()
    ###########

    ########### Apply sampling
    callbacks = buildCallbacks(params, model, dataset)
    callbacks[0].evaluate(params['RELOAD'], 'epoch')
def apply_model(params):
    """
    Function for using a previously trained model for predicting.
    """
    ########### Load data
    dataset = build_dataset(params)
    ###########

    ########### Load model
    ing_model = loadModel(params['STORE_PATH'], params['RELOAD'])
    ing_model.setOptimizer()
    ###########

    ########### Apply sampling
    callbacks = buildCallbacks(params, ing_model, dataset)
    callbacks[0].evaluate(params['RELOAD'], 'epoch')
def train_model(params, load_dataset=None):
    """
    Training function. Sets the training parameters from params. Build or loads the model and launches the training.
    :param params: Dictionary of network hyperparameters.
    :param load_dataset: Load dataset from file or build it from the parameters.
    :return: None
    """
    check_params(params)

    if params['RELOAD'] > 0:
        logging.info('Resuming training.')
        # Load data
        if load_dataset is None:
            if params['REBUILD_DATASET']:
                logging.info('Rebuilding dataset.')
                dataset = build_dataset(params)
            else:
                logging.info('Updating dataset.')
                dataset = loadDataset(params['DATASET_STORE_PATH'] + '/Dataset_' + params['DATASET_NAME'] +
                                      '_' + params['SRC_LAN'] + params['TRG_LAN'] + '.pkl')
                params['EPOCH_OFFSET'] = params['RELOAD'] if params['RELOAD_EPOCH'] else \
                    int(params['RELOAD'] * params['BATCH_SIZE'] / dataset.len_train)
                for split, filename in params['TEXT_FILES'].items():
                    dataset = update_dataset_from_file(dataset,
                                                       params['DATA_ROOT_PATH'] + '/' + filename + params['SRC_LAN'],
                                                       params,
                                                       splits=list([split]),
                                                       output_text_filename=params['DATA_ROOT_PATH'] + '/' + filename + params['TRG_LAN'],
                                                       remove_outputs=False,
                                                       compute_state_below=True,
                                                       recompute_references=True)
                dataset.name = params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN']
                saveDataset(dataset, params['DATASET_STORE_PATH'])
        else:
            logging.info('Reloading and using dataset.')
            dataset = loadDataset(load_dataset)
    else:
        # Load data
        if load_dataset is None:
            dataset = build_dataset(params)
        else:
            dataset = loadDataset(load_dataset)

    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]

    # Build model
    set_optimizer = True if params['RELOAD'] == 0 else False
    clear_dirs = True if params['RELOAD'] == 0 else False

    # build new model
    nmt_model = TranslationModel(params,
                                 model_type=params['MODEL_TYPE'],
                                 verbose=params['VERBOSE'],
                                 model_name=params['MODEL_NAME'],
                                 vocabularies=dataset.vocabulary,
                                 store_path=params['STORE_PATH'],
                                 set_optimizer=set_optimizer,
                                 clear_dirs=clear_dirs)

    # Define the inputs and outputs mapping from our Dataset instance to our model
    inputMapping = dict()
    for i, id_in in enumerate(params['INPUTS_IDS_DATASET']):
        pos_source = dataset.ids_inputs.index(id_in)
        id_dest = nmt_model.ids_inputs[i]
        inputMapping[id_dest] = pos_source
    nmt_model.setInputsMapping(inputMapping)

    outputMapping = dict()
    for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']):
        pos_target = dataset.ids_outputs.index(id_out)
        id_dest = nmt_model.ids_outputs[i]
        outputMapping[id_dest] = pos_target
    nmt_model.setOutputsMapping(outputMapping)

    if params['RELOAD'] > 0:
        nmt_model = updateModel(nmt_model, params['STORE_PATH'], params['RELOAD'],
                                reload_epoch=params['RELOAD_EPOCH'])
        nmt_model.setParams(params)
        nmt_model.setOptimizer()
        if params.get('EPOCH_OFFSET') is None:
            params['EPOCH_OFFSET'] = params['RELOAD'] if params['RELOAD_EPOCH'] else \
                int(params['RELOAD'] * params['BATCH_SIZE'] / dataset.len_train)

    # Store configuration as pkl
    dict2pkl(params, params['STORE_PATH'] + '/config')

    # Callbacks
    callbacks = buildCallbacks(params, nmt_model, dataset)

    # Training
    total_start_time = timer()

    logger.debug('Starting training!')
    training_params = {'n_epochs': params['MAX_EPOCH'],
                       'batch_size': params['BATCH_SIZE'],
                       'homogeneous_batches': params['HOMOGENEOUS_BATCHES'],
                       'maxlen': params['MAX_OUTPUT_TEXT_LEN'],
                       'joint_batches': params['JOINT_BATCHES'],
                       'lr_decay': params.get('LR_DECAY', None),  # LR decay parameters
                       'reduce_each_epochs': params.get('LR_REDUCE_EACH_EPOCHS', True),
                       'start_reduction_on_epoch': params.get('LR_START_REDUCTION_ON_EPOCH', 0),
                       'lr_gamma': params.get('LR_GAMMA', 0.9),
                       'lr_reducer_type': params.get('LR_REDUCER_TYPE', 'linear'),
                       'lr_reducer_exp_base': params.get('LR_REDUCER_EXP_BASE', 0),
                       'lr_half_life': params.get('LR_HALF_LIFE', 50000),
                       'epochs_for_save': params['EPOCHS_FOR_SAVE'],
                       'verbose': params['VERBOSE'],
                       'eval_on_sets': params['EVAL_ON_SETS_KERAS'],
                       'n_parallel_loaders': params['PARALLEL_LOADERS'],
                       'extra_callbacks': callbacks,
                       'reload_epoch': params['RELOAD'],
                       'epoch_offset': params.get('EPOCH_OFFSET', 0),
                       'data_augmentation': params['DATA_AUGMENTATION'],
                       'patience': params.get('PATIENCE', 0),  # early stopping parameters
                       'metric_check': params.get('STOP_METRIC', None) if params.get('EARLY_STOP', False) else None,
                       'eval_on_epochs': params.get('EVAL_EACH_EPOCHS', True),
                       'each_n_epochs': params.get('EVAL_EACH', 1),
                       'start_eval_on_epoch': params.get('START_EVAL_ON_EPOCH', 0),
                       'tensorboard': params.get('TENSORBOARD', False),
                       'tensorboard_params': {'log_dir': params.get('LOG_DIR', 'tensorboard_logs'),
                                              'histogram_freq': params.get('HISTOGRAM_FREQ', 0),
                                              'batch_size': params.get('TENSORBOARD_BATCH_SIZE', params['BATCH_SIZE']),
                                              'write_graph': params.get('WRITE_GRAPH', True),
                                              'write_grads': params.get('WRITE_GRADS', False),
                                              'write_images': params.get('WRITE_IMAGES', False),
                                              'embeddings_freq': params.get('EMBEDDINGS_FREQ', 0),
                                              'embeddings_layer_names': params.get('EMBEDDINGS_LAYER_NAMES', None),
                                              'embeddings_metadata': params.get('EMBEDDINGS_METADATA', None),
                                              'label_word_embeddings_with_vocab': params.get('LABEL_WORD_EMBEDDINGS_WITH_VOCAB', False),
                                              'word_embeddings_labels': params.get('WORD_EMBEDDINGS_LABELS', None),
                                              }
                       }
    nmt_model.trainNet(dataset, training_params)

    total_end_time = timer()
    time_difference = total_end_time - total_start_time
    logging.info('In total is {0:.2f}s = {1:.2f}m'.format(time_difference, time_difference / 60.0))
def apply_NMT_model(params, load_dataset=None):
    """
    Sample from a previously trained model.
    :param params: Dictionary of network hyperparameters.
    :param load_dataset: Load dataset from file or build it from the parameters.
    :return: None
    """
    # Load data
    if load_dataset is None:
        dataset = build_dataset(params)
    else:
        dataset = loadDataset(load_dataset)
    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]

    # Load model
    nmt_model = loadModel(params['STORE_PATH'], params['RELOAD'], reload_epoch=params['RELOAD_EPOCH'])

    # Evaluate training
    extra_vars = {'language': params.get('TRG_LAN', 'en'),
                  'n_parallel_loaders': params['PARALLEL_LOADERS'],
                  'tokenize_f': eval('dataset.' + params['TOKENIZATION_METHOD']),
                  'detokenize_f': eval('dataset.' + params['DETOKENIZATION_METHOD']),
                  'apply_detokenization': params['APPLY_DETOKENIZATION'],
                  'tokenize_hypotheses': params['TOKENIZE_HYPOTHESES'],
                  'tokenize_references': params['TOKENIZE_REFERENCES'],
                  }

    input_text_id = params['INPUTS_IDS_DATASET'][0]
    vocab_x = dataset.vocabulary[input_text_id]['idx2words']
    vocab_y = dataset.vocabulary[params['OUTPUTS_IDS_DATASET'][0]]['idx2words']

    if params['BEAM_SEARCH']:
        extra_vars['beam_size'] = params.get('BEAM_SIZE', 6)
        extra_vars['state_below_index'] = params.get('BEAM_SEARCH_COND_INPUT', -1)
        extra_vars['maxlen'] = params.get('MAX_OUTPUT_TEXT_LEN_TEST', 30)
        extra_vars['optimized_search'] = params.get('OPTIMIZED_SEARCH', True)
        extra_vars['model_inputs'] = params['INPUTS_IDS_MODEL']
        extra_vars['model_outputs'] = params['OUTPUTS_IDS_MODEL']
        extra_vars['dataset_inputs'] = params['INPUTS_IDS_DATASET']
        extra_vars['dataset_outputs'] = params['OUTPUTS_IDS_DATASET']
        extra_vars['normalize_probs'] = params.get('NORMALIZE_SAMPLING', False)
        extra_vars['search_pruning'] = params.get('SEARCH_PRUNING', False)
        extra_vars['alpha_factor'] = params.get('ALPHA_FACTOR', 1.0)
        extra_vars['coverage_penalty'] = params.get('COVERAGE_PENALTY', False)
        extra_vars['length_penalty'] = params.get('LENGTH_PENALTY', False)
        extra_vars['length_norm_factor'] = params.get('LENGTH_NORM_FACTOR', 0.0)
        extra_vars['coverage_norm_factor'] = params.get('COVERAGE_NORM_FACTOR', 0.0)
        extra_vars['state_below_maxlen'] = -1 if params.get('PAD_ON_BATCH', True) \
            else params.get('MAX_OUTPUT_TEXT_LEN', 50)
        extra_vars['pos_unk'] = params['POS_UNK']
        extra_vars['output_max_length_depending_on_x'] = params.get('MAXLEN_GIVEN_X', True)
        extra_vars['output_max_length_depending_on_x_factor'] = params.get('MAXLEN_GIVEN_X_FACTOR', 3)
        extra_vars['output_min_length_depending_on_x'] = params.get('MINLEN_GIVEN_X', True)
        extra_vars['output_min_length_depending_on_x_factor'] = params.get('MINLEN_GIVEN_X_FACTOR', 2)
        extra_vars['attend_on_output'] = params.get('ATTEND_ON_OUTPUT', 'transformer' in params['MODEL_TYPE'].lower())

        if params['POS_UNK']:
            extra_vars['heuristic'] = params['HEURISTIC']
            if params['HEURISTIC'] > 0:
                extra_vars['mapping'] = dataset.mapping

    for s in params["EVAL_ON_SETS"]:
        extra_vars[s] = dict()
        extra_vars[s]['references'] = dataset.extra_variables[s][params['OUTPUTS_IDS_DATASET'][0]]

    callback_metric = PrintPerformanceMetricOnEpochEndOrEachNUpdates(nmt_model,
                                                                     dataset,
                                                                     gt_id=params['OUTPUTS_IDS_DATASET'][0],
                                                                     metric_name=params['METRICS'],
                                                                     set_name=params['EVAL_ON_SETS'],
                                                                     batch_size=params['BATCH_SIZE'],
                                                                     each_n_epochs=params['EVAL_EACH'],
                                                                     extra_vars=extra_vars,
                                                                     reload_epoch=params['RELOAD'],
                                                                     is_text=True,
                                                                     input_text_id=input_text_id,
                                                                     save_path=nmt_model.model_path,
                                                                     index2word_y=vocab_y,
                                                                     index2word_x=vocab_x,
                                                                     sampling_type=params['SAMPLING'],
                                                                     beam_search=params['BEAM_SEARCH'],
                                                                     start_eval_on_epoch=params['START_EVAL_ON_EPOCH'],
                                                                     write_samples=True,
                                                                     write_type=params['SAMPLING_SAVE_MODE'],
                                                                     eval_on_epochs=params['EVAL_EACH_EPOCHS'],
                                                                     save_each_evaluation=False,
                                                                     verbose=params['VERBOSE'])

    callback_metric.evaluate(params['RELOAD'],
                             counter_name='epoch' if params['EVAL_EACH_EPOCHS'] else 'update')
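# --- Hypothetical usage sketch (not from the original sources) ---
# A minimal driver for the two utilities above, assuming a load_parameters()
# helper (as used by the tests in this section) and a 'MODE' entry in params
# that selects between training and sampling. The flag names are illustrative.
if __name__ == "__main__":
    params = load_parameters()
    if params.get('MODE', 'training') == 'training':
        train_model(params)
    else:
        apply_NMT_model(params)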
def test_unk_replace_0(): params = load_tests_params() params['REBUILD_DATASET'] = True params['INPUT_VOCABULARY_SIZE'] = 0 params['OUTPUT_VOCABULARY_SIZE'] = 50 params['POS_UNK'] = True params['HEURISTIC'] = 0 params['ALIGN_FROM_RAW'] = True dataset = build_dataset(params) params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[ params['INPUTS_IDS_DATASET'][0]] params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[ params['OUTPUTS_IDS_DATASET'][0]] params['MODEL_NAME'] = \ params['TASK_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '_' + params['MODEL_TYPE'] + \ '_src_emb_' + str(params['SOURCE_TEXT_EMBEDDING_SIZE']) + \ '_bidir_' + str(params['BIDIRECTIONAL_ENCODER']) + \ '_enc_' + params['ENCODER_RNN_TYPE'] + '_*' + str(params['N_LAYERS_ENCODER']) + '_' + str( params['ENCODER_HIDDEN_SIZE']) + \ '_dec_' + params['DECODER_RNN_TYPE'] + '_*' + str(params['N_LAYERS_DECODER']) + '_' + str( params['DECODER_HIDDEN_SIZE']) + \ '_deepout_' + '_'.join([layer[0] for layer in params['DEEP_OUTPUT_LAYERS']]) + \ '_trg_emb_' + str(params['TARGET_TEXT_EMBEDDING_SIZE']) + \ '_' + params['OPTIMIZER'] + '_' + str(params['LR']) # Test several NMT-Keras utilities: train, sample, sample_ensemble, score_corpus... print("Training model") train_model(params) params['RELOAD'] = 1 print("Done") parser = argparse.ArgumentParser('Parser for unit testing') parser.dataset = os.path.join( params['DATASET_STORE_PATH'], 'Dataset_' + params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '.pkl') parser.text = os.path.join(params['DATA_ROOT_PATH'], params['TEXT_FILES']['val'] + params['SRC_LAN']) parser.splits = ['val'] parser.config = os.path.join(params['STORE_PATH'], 'config.pkl') parser.models = [os.path.join(params['STORE_PATH'], 'epoch_' + str(1))] parser.verbose = 0 parser.dest = None parser.source = os.path.join( params['DATA_ROOT_PATH'], params['TEXT_FILES']['val'] + params['SRC_LAN']) parser.target = os.path.join( params['DATA_ROOT_PATH'], params['TEXT_FILES']['val'] + params['TRG_LAN']) parser.weights = [] parser.glossary = None for n_best in [True, False]: parser.n_best = n_best print("Sampling with n_best = %s " % str(n_best)) sample_ensemble(parser, params) print("Done") print("Scoring corpus") score_corpus(parser, params) print("Done") clean_dirs(params)
def train_model(params): """ Training function. Sets the training parameters from params. Build or loads the model and launches the training. :param params: Dictionary of network hyperparameters. :return: None """ if params['RELOAD'] > 0: logging.info('Resuming training.') check_params(params) # Load data dataset = build_dataset(params) params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[ params['OUTPUTS_IDS_DATASET'][0]] # Build model if (params['RELOAD'] == 0): # build new model video_model = VideoDesc_Model(params, type=params['MODEL_TYPE'], verbose=params['VERBOSE'], model_name=params['MODEL_NAME'], vocabularies=dataset.vocabulary, store_path=params['STORE_PATH']) dict2pkl(params, params['STORE_PATH'] + '/config') # Define the inputs and outputs mapping from our Dataset instance to our model inputMapping = dict() for i, id_in in enumerate(params['INPUTS_IDS_DATASET']): if len(video_model.ids_inputs) > i: pos_source = dataset.ids_inputs.index(id_in) id_dest = video_model.ids_inputs[i] inputMapping[id_dest] = pos_source video_model.setInputsMapping(inputMapping) outputMapping = dict() for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']): if len(video_model.ids_outputs) > i: pos_target = dataset.ids_outputs.index(id_out) id_dest = video_model.ids_outputs[i] outputMapping[id_dest] = pos_target video_model.setOutputsMapping(outputMapping) else: # resume from previously trained model video_model = loadModel(params['STORE_PATH'], params['RELOAD']) video_model.setOptimizer() ########### ########### Callbacks callbacks = buildCallbacks(params, video_model, dataset) ########### ########### Training total_start_time = timer() logger.debug('Starting training!') training_params = { 'n_epochs': params['MAX_EPOCH'], 'batch_size': params['BATCH_SIZE'], 'homogeneous_batches': params['HOMOGENEOUS_BATCHES'], 'maxlen': params['MAX_OUTPUT_TEXT_LEN'], 'lr_decay': params['LR_DECAY'], 'lr_gamma': params['LR_GAMMA'], 'epochs_for_save': params['EPOCHS_FOR_SAVE'], 'verbose': params['VERBOSE'], 'eval_on_sets': params['EVAL_ON_SETS_KERAS'], 'n_parallel_loaders': params['PARALLEL_LOADERS'], 'extra_callbacks': callbacks, 'reload_epoch': params['RELOAD'], 'epoch_offset': params['RELOAD'], 'data_augmentation': params['DATA_AUGMENTATION'], 'patience': params.get('PATIENCE', 0), 'metric_check': params.get('STOP_METRIC', None) } video_model.trainNet(dataset, training_params) total_end_time = timer() time_difference = total_end_time - total_start_time logging.info('In total is {0:.2f}s = {1:.2f}m'.format( time_difference, time_difference / 60.0))
def apply_Video_model(params):
    """
    Function for using a previously trained model for sampling.
    """
    ########### Load data
    dataset = build_dataset(params)
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]
    ###########

    ########### Load model
    video_model = loadModel(params['STORE_PATH'], params['RELOAD'])
    video_model.setOptimizer()
    ###########

    ########### Apply sampling
    extra_vars = dict()
    extra_vars['tokenize_f'] = eval('dataset.' + params['TOKENIZATION_METHOD'])
    extra_vars['language'] = params.get('TRG_LAN', 'en')

    for s in params["EVAL_ON_SETS"]:
        # Apply model predictions
        params_prediction = {'max_batch_size': params['BATCH_SIZE'],
                             'n_parallel_loaders': params['PARALLEL_LOADERS'],
                             'predict_on_sets': [s]}

        # Convert predictions into sentences
        vocab = dataset.vocabulary[params['OUTPUTS_IDS_DATASET'][0]]['idx2words']

        if params['BEAM_SEARCH']:
            params_prediction['beam_size'] = params['BEAM_SIZE']
            params_prediction['maxlen'] = params['MAX_OUTPUT_TEXT_LEN_TEST']
            params_prediction['optimized_search'] = params['OPTIMIZED_SEARCH']
            params_prediction['model_inputs'] = params['INPUTS_IDS_MODEL']
            params_prediction['model_outputs'] = params['OUTPUTS_IDS_MODEL']
            params_prediction['dataset_inputs'] = params['INPUTS_IDS_DATASET']
            params_prediction['dataset_outputs'] = params['OUTPUTS_IDS_DATASET']
            params_prediction['normalize_probs'] = params['NORMALIZE_SAMPLING']
            params_prediction['alpha_factor'] = params['ALPHA_FACTOR']
            predictions = video_model.predictBeamSearchNet(dataset, params_prediction)[s]
            predictions = video_model.decode_predictions_beam_search(predictions,
                                                                     vocab,
                                                                     verbose=params['VERBOSE'])
        else:
            predictions = video_model.predictNet(dataset, params_prediction)[s]
            predictions = video_model.decode_predictions(predictions,
                                                         1,  # always set temperature to 1
                                                         vocab,
                                                         params['SAMPLING'],
                                                         verbose=params['VERBOSE'])

        # Store result
        filepath = video_model.model_path + '/' + s + '_sampling.pred'  # results file
        if params['SAMPLING_SAVE_MODE'] == 'list':
            list2file(filepath, predictions)
        else:
            raise Exception('Only "list" is allowed in "SAMPLING_SAVE_MODE"')

        # Evaluate if any metric in params['METRICS']
        for metric in params['METRICS']:
            logging.info('Evaluating on metric ' + metric)
            filepath = video_model.model_path + '/' + s + '_sampling.' + metric  # results file

            # Evaluate on the chosen metric
            extra_vars[s] = dict()
            extra_vars[s]['references'] = dataset.extra_variables[s][params['OUTPUTS_IDS_DATASET'][0]]
            metrics = evaluation.select[metric](pred_list=predictions,
                                                verbose=1,
                                                extra_vars=extra_vars,
                                                split=s)

            # Print results to file
            with open(filepath, 'w') as f:
                header = ''
                line = ''
                for metric_ in sorted(metrics):
                    value = metrics[metric_]
                    header += metric_ + ','
                    line += str(value) + ','
                f.write(header + '\n')
                f.write(line + '\n')
            logging.info('Done evaluating on metric ' + metric)
def train_model(params): """ Main function """ if (params['RELOAD'] > 0): logging.info('Resuming training.') ########### Load data dataset = build_dataset(params) ########### ########### Build model if params['REUSE_MODEL_NAME'] is not None and params[ 'REUSE_MODEL_RELOAD'] > 0: ing_model = loadModel(params['REUSE_MODEL_NAME'], params['REUSE_MODEL_RELOAD']) ing_model.setName(model_name=params['MODEL_NAME'], store_path=params['STORE_PATH']) ing_model.changeClassifier(params, last_layer=params['LAST_LAYER']) ing_model.updateLogger(force=True) elif (params['RELOAD'] == 0): # build new model ing_model = Ingredients_Model(params, type=params['MODEL_TYPE'], verbose=params['VERBOSE'], model_name=params['MODEL_NAME'], store_path=params['STORE_PATH']) # Define the inputs and outputs mapping from our Dataset instance to our model ing_model.setInputsMapping(params['INPUTS_MAPPING']) ing_model.setOutputsMapping(params['OUTPUTS_MAPPING']) else: # resume from previously trained model ing_model = loadModel(params['STORE_PATH'], params['RELOAD']) # Update optimizer either if we are loading or building a model ing_model.params = params ing_model.setOptimizer() ########### ########### Callbacks callbacks = buildCallbacks(params, ing_model, dataset) ########### ########### Training total_start_time = timer() logger.debug('Starting training!') training_params = { 'normalize': True, 'normalization_type': '(-1)-1', 'mean_substraction': False, 'n_epochs': params['MAX_EPOCH'], 'batch_size': params['BATCH_SIZE'], 'lr_decay': params['LR_DECAY'], 'lr_gamma': params['LR_GAMMA'], 'epochs_for_save': params['EPOCHS_FOR_SAVE'], 'verbose': params['VERBOSE'], 'n_parallel_loaders': params['PARALLEL_LOADERS'], 'extra_callbacks': callbacks, 'reload_epoch': params['RELOAD'], 'epoch_offset': params['RELOAD'], 'data_augmentation': params['DATA_AUGMENTATION'], 'patience': params['PATIENCE'], 'metric_check': params['STOP_METRIC'] } ing_model.trainNet(dataset, training_params) total_end_time = timer() time_difference = total_end_time - total_start_time logging.info('Total time spent {0:.2f}s = {1:.2f}m'.format( time_difference, time_difference / 60.0))
def train_model(params): """ Training function. Sets the training parameters from params. Build or loads the model and launches the training. :param params: Dictionary of network hyperparameters. :return: None """ if params['RELOAD'] > 0: logging.info('Resuming training.') check_params(params) ########### Load data dataset = build_dataset(params) if not '-vidtext-embed' in params['DATASET_NAME']: params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[ params['OUTPUTS_IDS_DATASET'][0]] else: params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[ params['INPUTS_IDS_DATASET'][1]] ########### ########### Build model if params['MODE'] == 'finetuning': # video_model = loadModel(params['PRE_TRAINED_MODEL_STORE_PATHS'], params['RELOAD']) video_model = VideoDesc_Model(params, type=params['MODEL_TYPE'], verbose=params['VERBOSE'], model_name=params['MODEL_NAME'] + '_reloaded', vocabularies=dataset.vocabulary, store_path=params['STORE_PATH'], set_optimizer=False, clear_dirs=False) video_model = updateModel(video_model, params['RELOAD_PATH'], params['RELOAD'], reload_epoch=False) video_model.setParams(params) # Define the inputs and outputs mapping from our Dataset instance to our model inputMapping = dict() for i, id_in in enumerate(params['INPUTS_IDS_DATASET']): if len(video_model.ids_inputs) > i: pos_source = dataset.ids_inputs.index(id_in) id_dest = video_model.ids_inputs[i] inputMapping[id_dest] = pos_source video_model.setInputsMapping(inputMapping) outputMapping = dict() for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']): if len(video_model.ids_outputs) > i: pos_target = dataset.ids_outputs.index(id_out) id_dest = video_model.ids_outputs[i] outputMapping[id_dest] = pos_target video_model.setOutputsMapping(outputMapping) video_model.setOptimizer() params['MAX_EPOCH'] += params['RELOAD'] else: if params['RELOAD'] == 0 or params[ 'LOAD_WEIGHTS_ONLY']: # build new model video_model = VideoDesc_Model(params, type=params['MODEL_TYPE'], verbose=params['VERBOSE'], model_name=params['MODEL_NAME'], vocabularies=dataset.vocabulary, store_path=params['STORE_PATH'], set_optimizer=True) dict2pkl(params, params['STORE_PATH'] + '/config') # Define the inputs and outputs mapping from our Dataset instance to our model inputMapping = dict() for i, id_in in enumerate(params['INPUTS_IDS_DATASET']): if len(video_model.ids_inputs) > i: pos_source = dataset.ids_inputs.index(id_in) id_dest = video_model.ids_inputs[i] inputMapping[id_dest] = pos_source video_model.setInputsMapping(inputMapping) outputMapping = dict() for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']): if len(video_model.ids_outputs) > i: pos_target = dataset.ids_outputs.index(id_out) id_dest = video_model.ids_outputs[i] outputMapping[id_dest] = pos_target video_model.setOutputsMapping(outputMapping) # Only load weights from pre-trained model if params['LOAD_WEIGHTS_ONLY'] and params['RELOAD'] > 0: for i in range(0, len(params['RELOAD'])): old_model = loadModel( params['PRE_TRAINED_MODEL_STORE_PATHS'][i], params['RELOAD'][i]) video_model = transferWeights(old_model, video_model, params['LAYERS_MAPPING'][i]) video_model.setOptimizer() params['RELOAD'] = 0 else: # resume from previously trained model video_model = loadModel(params['PRE_TRAINED_MODEL_STORE_PATHS'], params['RELOAD']) video_model.params['LR'] = params['LR'] video_model.setOptimizer() if video_model.model_path != params['STORE_PATH']: video_model.setName(params['MODEL_NAME'], models_path=params['STORE_PATH'], clear_dirs=False) # Update optimizer either if we are loading or 
building a model video_model.params = params video_model.setOptimizer() ########### ########### Test model saving/loading functions # saveModel(video_model, params['RELOAD']) # video_model = loadModel(params['STORE_PATH'], params['RELOAD']) ########### ########### Callbacks callbacks = buildCallbacks(params, video_model, dataset) ########### ########### Training total_start_time = timer() logger.debug('Starting training!') training_params = { 'n_epochs': params['MAX_EPOCH'], 'batch_size': params['BATCH_SIZE'], 'homogeneous_batches': params['HOMOGENEOUS_BATCHES'], 'maxlen': params['MAX_OUTPUT_TEXT_LEN'], 'lr_decay': params['LR_DECAY'], 'lr_gamma': params['LR_GAMMA'], 'epochs_for_save': params['EPOCHS_FOR_SAVE'], 'verbose': params['VERBOSE'], 'eval_on_sets': params['EVAL_ON_SETS_KERAS'], 'n_parallel_loaders': params['PARALLEL_LOADERS'], 'extra_callbacks': callbacks, 'reload_epoch': params['RELOAD'], 'epoch_offset': params['RELOAD'], 'data_augmentation': params['DATA_AUGMENTATION'], 'patience': params.get('PATIENCE', 0), # early stopping parameters 'metric_check': params.get('STOP_METRIC', None), 'eval_on_epochs': params.get('EVAL_EACH_EPOCHS', True), 'each_n_epochs': params.get('EVAL_EACH', 1), 'start_eval_on_epoch': params.get('START_EVAL_ON_EPOCH', 0) } video_model.trainNet(dataset, training_params) total_end_time = timer() time_difference = total_end_time - total_start_time logging.info('In total is {0:.2f}s = {1:.2f}m'.format( time_difference, time_difference / 60.0))
def test_train(): params = load_parameters() params['REBUILD_DATASET'] = True params['DATASET_STORE_PATH'] = './' dataset = build_dataset(params) params['INPUT_VOCABULARY_SIZE'] = \ dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]] params['OUTPUT_VOCABULARY_SIZE'] = \ dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]] params['SOURCE_TEXT_EMBEDDING_SIZE'] = 2 params['TARGET_TEXT_EMBEDDING_SIZE'] = 2 params['ENCODER_HIDDEN_SIZE'] = 2 params['DECODER_HIDDEN_SIZE'] = 2 params['ATTENTION_SIZE'] = 2 params['SKIP_VECTORS_HIDDEN_SIZE'] = 2 params['DEEP_OUTPUT_LAYERS'] = [('linear', 2)] params['STORE_PATH'] = './' nmt_model = \ TranslationModel(params, model_type=params['MODEL_TYPE'], verbose=params['VERBOSE'], model_name=params['MODEL_NAME'], vocabularies=dataset.vocabulary, store_path=params['STORE_PATH'], clear_dirs=False) # Check Inputs inputMapping = dict() for i, id_in in enumerate(params['INPUTS_IDS_DATASET']): pos_source = dataset.ids_inputs.index(id_in) id_dest = nmt_model.ids_inputs[i] inputMapping[id_dest] = pos_source nmt_model.setInputsMapping(inputMapping) outputMapping = dict() for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']): pos_target = dataset.ids_outputs.index(id_out) id_dest = nmt_model.ids_outputs[i] outputMapping[id_dest] = pos_target nmt_model.setOutputsMapping(outputMapping) callbacks = buildCallbacks(params, nmt_model, dataset) training_params = { 'n_epochs': 1, 'batch_size': 50, 'homogeneous_batches': False, 'maxlen': 10, 'joint_batches': params['JOINT_BATCHES'], 'lr_decay': params['LR_DECAY'], 'lr_gamma': params['LR_GAMMA'], 'epochs_for_save': 1, 'verbose': params['VERBOSE'], 'eval_on_sets': params['EVAL_ON_SETS_KERAS'], 'n_parallel_loaders': params['PARALLEL_LOADERS'], 'extra_callbacks': callbacks, 'reload_epoch': 0, 'epoch_offset': 0, 'data_augmentation': False, 'patience': 1, # early stopping parameters 'metric_check': 'Bleu_4', 'eval_on_epochs': True, 'each_n_epochs': 1, 'start_eval_on_epoch': 0 } nmt_model.trainNet(dataset, training_params) return True
def train_model(params): """ Training function. Sets the training parameters from params. Build or loads the model and launches the training. :param params: Dictionary of network hyperparameters. :return: None """ if params['RELOAD'] > 0: logging.info('Resuming training.') # Load data dataset = build_dataset(params) params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]] # Build model if (params['RELOAD'] == 0): # build new model video_model = Captioning_Model(params, model_type=params['MODEL_TYPE'], verbose=params['VERBOSE'], model_name=params['MODEL_NAME'], vocabularies=dataset.vocabulary, store_path=params['STORE_PATH']) dict2pkl(params, params['STORE_PATH'] + '/config') # Define the inputs and outputs mapping from our Dataset instance to our model inputMapping = dict() for i, id_in in enumerate(params['INPUTS_IDS_DATASET']): if len(video_model.ids_inputs) > i: pos_source = dataset.ids_inputs.index(id_in) id_dest = video_model.ids_inputs[i] inputMapping[id_dest] = pos_source video_model.setInputsMapping(inputMapping) outputMapping = dict() for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']): if len(video_model.ids_outputs) > i: pos_target = dataset.ids_outputs.index(id_out) id_dest = video_model.ids_outputs[i] outputMapping[id_dest] = pos_target video_model.setOutputsMapping(outputMapping) else: # resume from previously trained model video_model = loadModel(params['STORE_PATH'], params['RELOAD']) video_model.setOptimizer() # Callbacks callbacks = buildCallbacks(params, video_model, dataset) # Training total_start_time = timer() logger.debug('Starting training!') training_params = {'n_epochs': params['MAX_EPOCH'], 'batch_size': params['BATCH_SIZE'], 'homogeneous_batches': params['HOMOGENEOUS_BATCHES'], 'maxlen': params['MAX_OUTPUT_TEXT_LEN'], 'joint_batches': params['JOINT_BATCHES'], 'lr_decay': params.get('LR_DECAY', None), # LR decay parameters 'initial_lr': params.get('LR', 1.0), 'reduce_each_epochs': params.get('LR_REDUCE_EACH_EPOCHS', True), 'start_reduction_on_epoch': params.get('LR_START_REDUCTION_ON_EPOCH', 0), 'lr_gamma': params.get('LR_GAMMA', 0.9), 'lr_reducer_type': params.get('LR_REDUCER_TYPE', 'linear'), 'lr_reducer_exp_base': params.get('LR_REDUCER_EXP_BASE', 0), 'lr_half_life': params.get('LR_HALF_LIFE', 50000), 'lr_warmup_exp': params.get('WARMUP_EXP', -1.5), 'min_lr': params.get('MIN_LR', 1e-9), 'epochs_for_save': params['EPOCHS_FOR_SAVE'], 'verbose': params['VERBOSE'], 'eval_on_sets': params['EVAL_ON_SETS_KERAS'], 'n_parallel_loaders': params['PARALLEL_LOADERS'], 'extra_callbacks': callbacks, 'reload_epoch': params['RELOAD'], 'epoch_offset': params.get('EPOCH_OFFSET', 0), 'data_augmentation': params['DATA_AUGMENTATION'], 'patience': params.get('PATIENCE', 0), # early stopping parameters 'metric_check': params.get('STOP_METRIC', None) if params.get('EARLY_STOP', False) else None, 'eval_on_epochs': params.get('EVAL_EACH_EPOCHS', True), 'each_n_epochs': params.get('EVAL_EACH', 1), 'start_eval_on_epoch': params.get('START_EVAL_ON_EPOCH', 0), 'tensorboard': params.get('TENSORBOARD', False), 'n_gpus': params.get('N_GPUS', 1), 'tensorboard_params': {'log_dir': params.get('LOG_DIR', 'tensorboard_logs'), 'histogram_freq': params.get('HISTOGRAM_FREQ', 0), 'batch_size': params.get('TENSORBOARD_BATCH_SIZE', params['BATCH_SIZE']), 'write_graph': params.get('WRITE_GRAPH', True), 'write_grads': params.get('WRITE_GRADS', False), 'write_images': params.get('WRITE_IMAGES', False), 'embeddings_freq': params.get('EMBEDDINGS_FREQ', 0), 
'embeddings_layer_names': params.get('EMBEDDINGS_LAYER_NAMES', None), 'embeddings_metadata': params.get('EMBEDDINGS_METADATA', None), 'label_word_embeddings_with_vocab': params.get('LABEL_WORD_EMBEDDINGS_WITH_VOCAB', False), 'word_embeddings_labels': params.get('WORD_EMBEDDINGS_LABELS', None), } } video_model.trainNet(dataset, training_params) total_end_time = timer() time_difference = total_end_time - total_start_time logging.info('In total is {0:.2f}s = {1:.2f}m'.format(time_difference, time_difference / 60.0))
def apply_Feature_Extractor_model(params, dataset=None, extractor_model=None):
    """
    Function for using a previously trained model for sampling.
    """
    ########### Load data
    if dataset is None:
        dataset = build_dataset(params)

    ########### Load model
    if extractor_model is None and params['RELOAD'] > 0:
        extractor_model = loadModel(params['STORE_PATH'], params['RELOAD'])
    else:
        extractor_model = Feature_Extractor(params,
                                            type=params['MODEL_TYPE'],
                                            verbose=params['VERBOSE'],
                                            model_name=params['MODEL_NAME'],
                                            store_path=params['STORE_PATH'])
        # Define the inputs and outputs mapping from our Dataset instance to our model
        inputMapping = dict()
        for i, id_in in enumerate(params['INPUTS_IDS_DATASET']):
            if len(extractor_model.ids_inputs) > i:
                pos_source = dataset.ids_inputs.index(id_in)
                id_dest = extractor_model.ids_inputs[i]
                inputMapping[id_dest] = pos_source
        extractor_model.setInputsMapping(inputMapping)

    ########### Apply sampling
    extra_vars = dict()
    for s in params["EVAL_ON_SETS"]:
        # Apply model predictions
        params_prediction = {'batch_size': params['BATCH_SIZE'],
                             'n_parallel_loaders': params['PARALLEL_LOADERS'],
                             'predict_on_sets': [s],
                             'verbose': 0}
        logging.info("<<< Predicting outputs of " + s + " set >>>")
        if params['SAMPLING_SAVE_MODE'] == 'list':
            filepath = extractor_model.model_path + '/' + s + '_sampling.pred'  # results file
            list2file(filepath, [], permission='w')
        start_time = time.time()
        eta = -1
        mode = 'w'
        for n_sample in range(0, eval('dataset.len_' + s), params.get('PREDICTION_STEP', 100)):
            params_prediction['init_sample'] = n_sample
            params_prediction['final_sample'] = min(n_sample + params.get('PREDICTION_STEP', 100),
                                                    eval('dataset.len_' + s))
            predictions = extractor_model.predictNet(dataset, params_prediction)[s]

            # Store result
            if params['SAMPLING_SAVE_MODE'] == 'list':
                filepath = extractor_model.model_path + '/' + s + '_sampling.pred'  # results file
                list2file(filepath, predictions, permission='a')
            elif params['SAMPLING_SAVE_MODE'] == 'npy':
                filepath = extractor_model.model_path + '/' + s + '_' + params.get('MODEL_TYPE', '') + '_features.npy'
                numpy2file(filepath, predictions, permission=mode)
            elif params['SAMPLING_SAVE_MODE'] == 'hdf5':
                filepath = extractor_model.model_path + '/' + s + '_' + params.get('MODEL_TYPE', '') + '_features.hdf5'
                numpy2hdf5(filepath, predictions, permission=mode)
            else:
                raise Exception('Only "list", "npy" or "hdf5" are allowed in "SAMPLING_SAVE_MODE"')
            mode = 'a'
            sys.stdout.write('\r')
            sys.stdout.write("\t Processed %d/%d - ETA: %ds " % (n_sample, eval('dataset.len_' + s), int(eta)))
            sys.stdout.flush()
            eta = (eval('dataset.len_' + s) - n_sample) * (time.time() - start_time) / max(n_sample, 1)
def apply_NMT_model(params, load_dataset=None): """ Sample from a previously trained model. :param params: Dictionary of network hyperparameters. :param load_dataset: Load dataset from file or build it from the parameters. :return: None """ pred_vocab = params.get('PRED_VOCAB', None) if pred_vocab is not None: dataset_voc = loadDataset(params['PRED_VOCAB']) dataset = build_dataset(params, dataset_voc.vocabulary, dataset_voc.vocabulary_len) else: dataset = build_dataset(params) # Load data #if load_dataset is None: # dataset = build_dataset(params) #else: # dataset = loadDataset(load_dataset) #params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]] #params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]] #vocab_y = dataset.vocabulary[params['INPUTS_IDS_DATASET'][1]]['idx2words'] params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[ params['INPUTS_IDS_DATASET'][0]] params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len['target_text'] # Load model #nmt_model = loadModel(params['STORE_PATH'], params['RELOAD'], reload_epoch=params['RELOAD_EPOCH']) nmt_model = TranslationModel(params, model_type=params['MODEL_TYPE'], verbose=params['VERBOSE'], model_name=params['MODEL_NAME'], set_optimizer=False, vocabularies=dataset.vocabulary, store_path=params['STORE_PATH'], trainable_pred=True, trainable_est=True, weights_path=None) nmt_model = updateModel(nmt_model, params['STORE_PATH'], params['RELOAD'], reload_epoch=params['RELOAD_EPOCH']) nmt_model.setParams(params) nmt_model.setOptimizer() inputMapping = dict() for i, id_in in enumerate(params['INPUTS_IDS_DATASET']): pos_source = dataset.ids_inputs.index(id_in) id_dest = nmt_model.ids_inputs[i] inputMapping[id_dest] = pos_source nmt_model.setInputsMapping(inputMapping) outputMapping = dict() for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']): pos_target = dataset.ids_outputs.index(id_out) id_dest = nmt_model.ids_outputs[i] outputMapping[id_dest] = pos_target nmt_model.setOutputsMapping(outputMapping) nmt_model.setOptimizer() for s in params["EVAL_ON_SETS"]: # Evaluate training extra_vars = { 'language': params.get('TRG_LAN', 'en'), 'n_parallel_loaders': params['PARALLEL_LOADERS'], 'tokenize_f': eval('dataset.' + params['TOKENIZATION_METHOD']), 'detokenize_f': eval('dataset.' 
+ params['DETOKENIZATION_METHOD']), 'apply_detokenization': params['APPLY_DETOKENIZATION'], 'tokenize_hypotheses': params['TOKENIZE_HYPOTHESES'], 'tokenize_references': params['TOKENIZE_REFERENCES'] } #vocab = dataset.vocabulary[params['OUTPUTS_IDS_DATASET'][0]]['idx2words'] #vocab = dataset.vocabulary[params['INPUTS_IDS_DATASET'][1]]['idx2words'] extra_vars[s] = dict() if not params.get('NO_REF', False): extra_vars[s]['references'] = dataset.extra_variables[s][ params['OUTPUTS_IDS_DATASET'][0]] #input_text_id = None #vocab_src = None input_text_id = params['INPUTS_IDS_DATASET'][0] vocab_x = dataset.vocabulary[input_text_id]['idx2words'] vocab_y = dataset.vocabulary[params['INPUTS_IDS_DATASET'] [1]]['idx2words'] if params['BEAM_SEARCH']: extra_vars['beam_size'] = params.get('BEAM_SIZE', 6) extra_vars['state_below_index'] = params.get( 'BEAM_SEARCH_COND_INPUT', -1) extra_vars['maxlen'] = params.get('MAX_OUTPUT_TEXT_LEN_TEST', 30) extra_vars['optimized_search'] = params.get( 'OPTIMIZED_SEARCH', True) extra_vars['model_inputs'] = params['INPUTS_IDS_MODEL'] extra_vars['model_outputs'] = params['OUTPUTS_IDS_MODEL'] extra_vars['dataset_inputs'] = params['INPUTS_IDS_DATASET'] extra_vars['dataset_outputs'] = params['OUTPUTS_IDS_DATASET'] extra_vars['normalize_probs'] = params.get('NORMALIZE_SAMPLING', False) extra_vars['search_pruning'] = params.get('SEARCH_PRUNING', False) extra_vars['alpha_factor'] = params.get('ALPHA_FACTOR', 1.0) extra_vars['coverage_penalty'] = params.get( 'COVERAGE_PENALTY', False) extra_vars['length_penalty'] = params.get('LENGTH_PENALTY', False) extra_vars['length_norm_factor'] = params.get( 'LENGTH_NORM_FACTOR', 0.0) extra_vars['coverage_norm_factor'] = params.get( 'COVERAGE_NORM_FACTOR', 0.0) extra_vars['pos_unk'] = params['POS_UNK'] extra_vars['output_max_length_depending_on_x'] = params.get( 'MAXLEN_GIVEN_X', True) extra_vars['output_max_length_depending_on_x_factor'] = params.get( 'MAXLEN_GIVEN_X_FACTOR', 3) extra_vars['output_min_length_depending_on_x'] = params.get( 'MINLEN_GIVEN_X', True) extra_vars['output_min_length_depending_on_x_factor'] = params.get( 'MINLEN_GIVEN_X_FACTOR', 2) if params['POS_UNK']: extra_vars['heuristic'] = params['HEURISTIC'] input_text_id = params['INPUTS_IDS_DATASET'][0] vocab_src = dataset.vocabulary[input_text_id]['idx2words'] if params['HEURISTIC'] > 0: extra_vars['mapping'] = dataset.mapping callback_metric = PrintPerformanceMetricOnEpochEndOrEachNUpdates( nmt_model, dataset, gt_id=params['OUTPUTS_IDS_DATASET'][0], metric_name=params['METRICS'], set_name=params['EVAL_ON_SETS'], batch_size=params['BATCH_SIZE'], each_n_epochs=params['EVAL_EACH'], extra_vars=extra_vars, reload_epoch=params['RELOAD'], is_text=True, input_text_id=input_text_id, save_path=nmt_model.model_path, index2word_y=vocab_y, index2word_x=vocab_x, sampling_type=params['SAMPLING'], beam_search=params['BEAM_SEARCH'], start_eval_on_epoch=params['START_EVAL_ON_EPOCH'], write_samples=True, write_type=params['SAMPLING_SAVE_MODE'], eval_on_epochs=params['EVAL_EACH_EPOCHS'], save_each_evaluation=False, verbose=params['VERBOSE'], no_ref=params['NO_REF']) callback_metric.evaluate( params['RELOAD'], counter_name='epoch' if params['EVAL_EACH_EPOCHS'] else 'update')
def train_model(params, load_dataset=None):
    """
    Training function. Sets the training parameters from params. Builds or loads the model and launches the training.
    :param params: Dictionary of network hyperparameters.
    :param load_dataset: Load dataset from file or build it from the parameters.
    :return: None
    """
    if params['RELOAD'] > 0:
        logging.info('Resuming training.')
    check_params(params)

    # Load data
    if load_dataset is None:
        dataset = build_dataset(params)
    else:
        dataset = loadDataset(load_dataset)
    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]

    # Build model
    if params['RELOAD'] == 0:  # build new model
        nmt_model = TranslationModel(params,
                                     model_type=params['MODEL_TYPE'],
                                     verbose=params['VERBOSE'],
                                     model_name=params['MODEL_NAME'],
                                     vocabularies=dataset.vocabulary,
                                     store_path=params['STORE_PATH'])
        dict2pkl(params, params['STORE_PATH'] + '/config')

        # Define the inputs and outputs mapping from our Dataset instance to our model
        inputMapping = dict()
        for i, id_in in enumerate(params['INPUTS_IDS_DATASET']):
            pos_source = dataset.ids_inputs.index(id_in)
            id_dest = nmt_model.ids_inputs[i]
            inputMapping[id_dest] = pos_source
        nmt_model.setInputsMapping(inputMapping)

        outputMapping = dict()
        for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']):
            pos_target = dataset.ids_outputs.index(id_out)
            id_dest = nmt_model.ids_outputs[i]
            outputMapping[id_dest] = pos_target
        nmt_model.setOutputsMapping(outputMapping)

    else:  # resume from a previously trained model
        nmt_model = TranslationModel(params,
                                     model_type=params['MODEL_TYPE'],
                                     verbose=params['VERBOSE'],
                                     model_name=params['MODEL_NAME'],
                                     vocabularies=dataset.vocabulary,
                                     store_path=params['STORE_PATH'],
                                     set_optimizer=False,
                                     clear_dirs=False)

        # Define the inputs and outputs mapping from our Dataset instance to our model
        inputMapping = dict()
        for i, id_in in enumerate(params['INPUTS_IDS_DATASET']):
            pos_source = dataset.ids_inputs.index(id_in)
            id_dest = nmt_model.ids_inputs[i]
            inputMapping[id_dest] = pos_source
        nmt_model.setInputsMapping(inputMapping)

        outputMapping = dict()
        for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']):
            pos_target = dataset.ids_outputs.index(id_out)
            id_dest = nmt_model.ids_outputs[i]
            outputMapping[id_dest] = pos_target
        nmt_model.setOutputsMapping(outputMapping)

        nmt_model = updateModel(nmt_model, params['STORE_PATH'], params['RELOAD'], reload_epoch=params['RELOAD_EPOCH'])
        nmt_model.setParams(params)
        nmt_model.setOptimizer()
        params['EPOCH_OFFSET'] = params['RELOAD'] if params['RELOAD_EPOCH'] else \
            int(params['RELOAD'] * params['BATCH_SIZE'] / dataset.len_train)

    # Callbacks
    callbacks = buildCallbacks(params, nmt_model, dataset)

    # Training
    total_start_time = timer()
    logger.debug('Starting training!')
    training_params = {'n_epochs': params['MAX_EPOCH'],
                       'batch_size': params['BATCH_SIZE'],
                       'homogeneous_batches': params['HOMOGENEOUS_BATCHES'],
                       'maxlen': params['MAX_OUTPUT_TEXT_LEN'],
                       'joint_batches': params['JOINT_BATCHES'],
                       'lr_decay': params['LR_DECAY'],
                       'lr_gamma': params['LR_GAMMA'],
                       'epochs_for_save': params['EPOCHS_FOR_SAVE'],
                       'verbose': params['VERBOSE'],
                       'eval_on_sets': params['EVAL_ON_SETS_KERAS'],
                       'n_parallel_loaders': params['PARALLEL_LOADERS'],
                       'extra_callbacks': callbacks,
                       'reload_epoch': params['RELOAD'],
                       'epoch_offset': params.get('EPOCH_OFFSET', 0),
                       'data_augmentation': params['DATA_AUGMENTATION'],
                       'patience': params.get('PATIENCE', 0),  # early stopping parameters
                       'metric_check': params.get('STOP_METRIC', None) if params.get('EARLY_STOP', False) else None,
                       'eval_on_epochs': params.get('EVAL_EACH_EPOCHS', True),
                       'each_n_epochs': params.get('EVAL_EACH', 1),
                       'start_eval_on_epoch': params.get('START_EVAL_ON_EPOCH', 0)
                       }
    nmt_model.trainNet(dataset, training_params)

    total_end_time = timer()
    time_difference = total_end_time - total_start_time
    logging.info('In total is {0:.2f}s = {1:.2f}m'.format(time_difference, time_difference / 60.0))
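# --- Hedged usage sketch (not part of the original source) ---
# A minimal driver for the train_model() defined above, assuming the
# load_parameters() config helper used by the tests in this file is importable.
# The keys tweaked below are only illustrative.
if __name__ == '__main__':
    params = load_parameters()        # assumed config helper returning the hyperparameter dict
    params['REBUILD_DATASET'] = True  # rebuild the Dataset object from the raw text files
    params['RELOAD'] = 0              # 0 = train from scratch; >0 resumes from that checkpoint
    train_model(params, load_dataset=None)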
def train_model(params): """ Main function """ if(params['RELOAD'] > 0): logging.info('Resuming training.') check_params(params) ########### Load data dataset = build_dataset(params) # Keep original images size if IMAGE_CROPPING == False if not params['IMAGE_CROPPING']: dataset.img_size_crop = dataset.img_size ########### ########### Build model if(params['RELOAD'] == 0): # build new model cnn_model = Segmentation_Model(params, type=params['MODEL_TYPE'], verbose=params['VERBOSE'], model_name=params['MODEL_NAME'], store_path=params['STORE_PATH']) # Define the inputs and outputs mapping from our Dataset instance to our model cnn_model.setInputsMapping(params['INPUTS_MAPPING']) cnn_model.setOutputsMapping(params['OUTPUTS_MAPPING']) # Save initial untrained model and try to load it again saveModel(cnn_model, 0) cnn_model=loadModel(params['STORE_PATH'], 0, custom_objects={"AttentionComplex": AttentionComplex, 'WeightedMerge': WeightedMerge}) cnn_model.params = params cnn_model.setOptimizer() else: # resume from previously trained model cnn_model = loadModel(params['STORE_PATH'], params['RELOAD'], custom_objects={"AttentionComplex": AttentionComplex}) cnn_model.model_path = params['STORE_PATH'] cnn_model.params = params cnn_model.setOptimizer() ########### # Test model save/load saveModel(cnn_model, 0) cnn_model = loadModel(params['STORE_PATH'], 0, custom_objects={"AttentionComplex": AttentionComplex}) cnn_model.setOptimizer() ########### Callbacks callbacks = buildCallbacks(params, cnn_model, dataset) ########### ########### Training total_start_time = timer() logger.debug('Starting training!') training_params = {'n_epochs': params['MAX_EPOCH'], 'batch_size': params['BATCH_SIZE'], 'lr_decay': params['LR_DECAY'], 'lr_gamma': params['LR_GAMMA'], 'epochs_for_save': params['EPOCHS_FOR_SAVE'], 'verbose': params['VERBOSE'], 'eval_on_sets': params['EVAL_ON_SETS_KERAS'], 'n_parallel_loaders': params['PARALLEL_LOADERS'], 'extra_callbacks': callbacks, 'reload_epoch': params['RELOAD'], 'epoch_offset': params['RELOAD'], 'data_augmentation': params['DATA_AUGMENTATION'], 'shuffle': params['SHUFFLE_TRAIN'], 'patience': params['PATIENCE'], 'metric_check': params['STOP_METRIC'], 'patience_check_split': params['PATIENCE_SPLIT'], 'normalize': params['NORMALIZE'], 'normalization_type': params['NORMALIZATION_TYPE'], 'mean_substraction': params['MEAN_SUBSTRACTION'], 'class_weights': params['OUTPUTS_IDS_DATASET'][0] if params['DISCARD_CLASSES'] or params['WEIGHT_CLASSES'] else None, } cnn_model.trainNet(dataset, training_params) total_end_time = timer() time_difference = total_end_time - total_start_time logging.info('In total is {0:.2f}s = {1:.2f}m'.format(time_difference, time_difference / 60.0))
def main(params):
    """
    Main function
    """
    if params['RELOAD'] > 0:
        logging.info('Resuming training.')
    check_params(params)

    ########### Load data
    dataset = build_dataset(params)
    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]
    ###########

    ########### Build model
    if params['RELOAD'] == 0:  # build new model
        vqa = VQA_Model(params,
                        type=params['MODEL_TYPE'],
                        verbose=params['VERBOSE'],
                        model_name=params['MODEL_NAME'],
                        vocabularies=dataset.vocabulary,
                        store_path=params['STORE_PATH'])

        # Define the inputs and outputs mapping from our Dataset instance to our model
        inputMapping = dict()
        for i, id_in in enumerate(params['INPUTS_IDS_DATASET']):
            pos_source = dataset.ids_inputs.index(id_in)
            id_dest = vqa.ids_inputs[i]
            inputMapping[id_dest] = pos_source
        vqa.setInputsMapping(inputMapping)

        outputMapping = dict()
        for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']):
            pos_target = dataset.ids_outputs.index(id_out)
            id_dest = vqa.ids_outputs[i]
            outputMapping[id_dest] = pos_target
        vqa.setOutputsMapping(outputMapping)
    else:  # resume from a previously trained model
        vqa = loadModel(params['STORE_PATH'], params['RELOAD'])
        vqa.setOptimizer()
    ###########

    ########### Callbacks
    callbacks = buildCallbacks(params, vqa, dataset)
    ###########

    ########### Training
    total_start_time = timer()
    logger.debug('Starting training!')
    training_params = {'n_epochs': params['MAX_EPOCH'],
                       'batch_size': params['BATCH_SIZE'],
                       'lr_decay': params['LR_DECAY'],
                       'lr_gamma': params['LR_GAMMA'],
                       'epochs_for_save': params['EPOCHS_FOR_SAVE'],
                       'verbose': params['VERBOSE'],
                       'eval_on_sets': params['EVAL_ON_SETS'],
                       'n_parallel_loaders': params['PARALLEL_LOADERS'],
                       'extra_callbacks': callbacks,
                       'reload_epoch': params['RELOAD']}
    vqa.trainNet(dataset, training_params)

    total_end_time = timer()
    time_difference = total_end_time - total_start_time
    logging.info('In total is {0:.2f}s = {1:.2f}m'.format(time_difference, time_difference / 60.0))
def test_train_and_load(self):
    if theano.config.device == 'gpu':
        def test_train():
            params = load_parameters()
            params['REBUILD_DATASET'] = True
            params['DATASET_STORE_PATH'] = './'
            dataset = build_dataset(params)
            params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
            params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]
            params['SOURCE_TEXT_EMBEDDING_SIZE'] = 2
            params['TARGET_TEXT_EMBEDDING_SIZE'] = 2
            params['ENCODER_HIDDEN_SIZE'] = 2
            params['DECODER_HIDDEN_SIZE'] = 2
            params['ATTENTION_SIZE'] = 2
            params['SKIP_VECTORS_HIDDEN_SIZE'] = 2
            params['DEEP_OUTPUT_LAYERS'] = [('linear', 2)]
            params['STORE_PATH'] = './'
            nmt_model = TranslationModel(params,
                                         model_type=params['MODEL_TYPE'],
                                         verbose=params['VERBOSE'],
                                         model_name=params['MODEL_NAME'],
                                         vocabularies=dataset.vocabulary,
                                         store_path=params['STORE_PATH'],
                                         clear_dirs=False)

            # Check Inputs
            inputMapping = dict()
            for i, id_in in enumerate(params['INPUTS_IDS_DATASET']):
                pos_source = dataset.ids_inputs.index(id_in)
                id_dest = nmt_model.ids_inputs[i]
                inputMapping[id_dest] = pos_source
            nmt_model.setInputsMapping(inputMapping)

            outputMapping = dict()
            for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']):
                pos_target = dataset.ids_outputs.index(id_out)
                id_dest = nmt_model.ids_outputs[i]
                outputMapping[id_dest] = pos_target
            nmt_model.setOutputsMapping(outputMapping)

            callbacks = buildCallbacks(params, nmt_model, dataset)
            training_params = {'n_epochs': 1,
                               'batch_size': 50,
                               'homogeneous_batches': False,
                               'maxlen': 10,
                               'joint_batches': params['JOINT_BATCHES'],
                               'lr_decay': params['LR_DECAY'],
                               'lr_gamma': params['LR_GAMMA'],
                               'epochs_for_save': 1,
                               'verbose': params['VERBOSE'],
                               'eval_on_sets': params['EVAL_ON_SETS_KERAS'],
                               'n_parallel_loaders': params['PARALLEL_LOADERS'],
                               'extra_callbacks': callbacks,
                               'reload_epoch': 0,
                               'epoch_offset': 0,
                               'data_augmentation': False,
                               'patience': 1,  # early stopping parameters
                               'metric_check': 'Bleu_4',
                               'eval_on_epochs': True,
                               'each_n_epochs': 1,
                               'start_eval_on_epoch': 0
                               }
            nmt_model.trainNet(dataset, training_params)
            return True

        test_train()

        params = load_parameters()
        params['REBUILD_DATASET'] = True
        params['DATASET_STORE_PATH'] = './'
        dataset = build_dataset(params)
        params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
        params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]

        # Load model
        nmt_model = loadModel('./', 1, reload_epoch=True)
        nmt_model.setOptimizer()

        for s in ['val']:
            # Evaluate training
            extra_vars = {'language': params.get('TRG_LAN', 'en'),
                          'n_parallel_loaders': params['PARALLEL_LOADERS'],
                          'tokenize_f': eval('dataset.' + params['TOKENIZATION_METHOD']),
                          'detokenize_f': eval('dataset.' + params['DETOKENIZATION_METHOD']),
                          'apply_detokenization': params['APPLY_DETOKENIZATION'],
                          'tokenize_hypotheses': params['TOKENIZE_HYPOTHESES'],
                          'tokenize_references': params['TOKENIZE_REFERENCES']
                          }
            vocab = dataset.vocabulary[params['OUTPUTS_IDS_DATASET'][0]]['idx2words']
            extra_vars[s] = dict()
            extra_vars[s]['references'] = dataset.extra_variables[s][params['OUTPUTS_IDS_DATASET'][0]]
            input_text_id = None
            vocab_src = None
            if params['BEAM_SIZE']:
                extra_vars['beam_size'] = params.get('BEAM_SIZE', 6)
                extra_vars['state_below_index'] = params.get('BEAM_SEARCH_COND_INPUT', -1)
                extra_vars['maxlen'] = params.get('MAX_OUTPUT_TEXT_LEN_TEST', 30)
                extra_vars['optimized_search'] = params.get('OPTIMIZED_SEARCH', True)
                extra_vars['model_inputs'] = params['INPUTS_IDS_MODEL']
                extra_vars['model_outputs'] = params['OUTPUTS_IDS_MODEL']
                extra_vars['dataset_inputs'] = params['INPUTS_IDS_DATASET']
                extra_vars['dataset_outputs'] = params['OUTPUTS_IDS_DATASET']
                extra_vars['normalize_probs'] = params.get('NORMALIZE_SAMPLING', False)
                extra_vars['alpha_factor'] = params.get('ALPHA_FACTOR', 1.0)
                extra_vars['coverage_penalty'] = params.get('COVERAGE_PENALTY', False)
                extra_vars['length_penalty'] = params.get('LENGTH_PENALTY', False)
                extra_vars['length_norm_factor'] = params.get('LENGTH_NORM_FACTOR', 0.0)
                extra_vars['coverage_norm_factor'] = params.get('COVERAGE_NORM_FACTOR', 0.0)
                extra_vars['pos_unk'] = params['POS_UNK']

                if params['POS_UNK']:
                    extra_vars['heuristic'] = params['HEURISTIC']
                    input_text_id = params['INPUTS_IDS_DATASET'][0]
                    vocab_src = dataset.vocabulary[input_text_id]['idx2words']
                    if params['HEURISTIC'] > 0:
                        extra_vars['mapping'] = dataset.mapping

            callback_metric = PrintPerformanceMetricOnEpochEndOrEachNUpdates(
                nmt_model,
                dataset,
                gt_id=params['OUTPUTS_IDS_DATASET'][0],
                metric_name=params['METRICS'],
                set_name=params['EVAL_ON_SETS'],
                batch_size=params['BATCH_SIZE'],
                each_n_epochs=params['EVAL_EACH'],
                extra_vars=extra_vars,
                reload_epoch=1,
                is_text=True,
                input_text_id=input_text_id,
                save_path=nmt_model.model_path,
                index2word_y=vocab,
                index2word_x=vocab_src,
                sampling_type=params['SAMPLING'],
                beam_search=params['BEAM_SEARCH'],
                start_eval_on_epoch=0,
                write_samples=True,
                write_type=params['SAMPLING_SAVE_MODE'],
                eval_on_epochs=params['EVAL_EACH_EPOCHS'],
                save_each_evaluation=False,
                verbose=params['VERBOSE'])

            callback_metric.evaluate(
                1,
                counter_name='epoch' if params['EVAL_EACH_EPOCHS'] else 'update')
        return True
    else:
        pass
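# --- Hedged alternative (not part of the original source) ---
# The device check above makes test_train_and_load a silent no-op on CPU. With
# the standard unittest machinery the same gate can be expressed as a skip,
# which shows up in the test report instead of passing silently. Sketch,
# assuming the method lives on a unittest.TestCase subclass:
import unittest
import theano

requires_gpu = unittest.skipUnless(theano.config.device == 'gpu',
                                   'requires a GPU Theano device')
# usage: decorate the test method with @requires_gpu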
def train_model(params, weights_dict, load_dataset=None, trainable_pred=True, trainable_est=True, weights_path=None):
    """
    Training function. Sets the training parameters from params. Builds or loads the model and launches the training.
    :param params: Dictionary of network hyperparameters.
    :param weights_dict: Dictionary of layer weights; layers found in it are initialised from it before training and the trained weights are written back afterwards.
    :param load_dataset: Load dataset from file or build it from the parameters.
    :param trainable_pred: Whether the predictor component passed to TranslationModel is trainable.
    :param trainable_est: Whether the estimator component passed to TranslationModel is trainable.
    :param weights_path: Optional path to pre-trained weights passed to TranslationModel.
    :return: None
    """
    check_params(params)
    if params['RELOAD'] > 0:
        logging.info('Resuming training.')
        # Load data
        if load_dataset is None:
            if params['REBUILD_DATASET']:
                logging.info('Rebuilding dataset.')
                pred_vocab = params.get('PRED_VOCAB', None)
                if pred_vocab is not None:
                    dataset_voc = loadDataset(params['PRED_VOCAB'])
                    dataset = build_dataset(params, dataset_voc.vocabulary, dataset_voc.vocabulary_len)
                else:
                    dataset = build_dataset(params)
            else:
                logging.info('Updating dataset.')
                dataset = loadDataset(params['DATASET_STORE_PATH'] + '/Dataset_' + params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '.pkl')
                for split, filename in params['TEXT_FILES'].items():
                    dataset = update_dataset_from_file(dataset,
                                                       params['DATA_ROOT_PATH'] + '/' + filename + params['SRC_LAN'],
                                                       params,
                                                       splits=list([split]),
                                                       output_text_filename=params['DATA_ROOT_PATH'] + '/' + filename + params['TRG_LAN'],
                                                       remove_outputs=False,
                                                       compute_state_below=True,
                                                       recompute_references=True)
                    dataset.name = params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN']
                saveDataset(dataset, params['DATASET_STORE_PATH'])
        else:
            logging.info('Reloading and using dataset.')
            dataset = loadDataset(load_dataset)
    else:
        # Load data
        if load_dataset is None:
            pred_vocab = params.get('PRED_VOCAB', None)
            if pred_vocab is not None:
                dataset_voc = loadDataset(params['PRED_VOCAB'])
                # for the testing phase, handle model vocabulary differences
                # dataset_voc.vocabulary['target_text'] = dataset_voc.vocabulary['target']
                # dataset_voc.vocabulary_len['target_text'] = dataset_voc.vocabulary_len['target']
                dataset = build_dataset(params, dataset_voc.vocabulary, dataset_voc.vocabulary_len)
            else:
                dataset = build_dataset(params)
        else:
            dataset = loadDataset(load_dataset)

    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
    # params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET_FULL'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len['target_text']

    # Build model
    if params['RELOAD'] == 0:  # build new model
        nmt_model = TranslationModel(params,
                                     model_type=params['MODEL_TYPE'],
                                     verbose=params['VERBOSE'],
                                     model_name=params['MODEL_NAME'],
                                     vocabularies=dataset.vocabulary,
                                     store_path=params['STORE_PATH'],
                                     trainable_pred=trainable_pred,
                                     trainable_est=trainable_est,
                                     clear_dirs=True,
                                     weights_path=weights_path)

        # Define the inputs and outputs mapping from our Dataset instance to our model
        inputMapping = dict()
        for i, id_in in enumerate(params['INPUTS_IDS_DATASET']):
            pos_source = dataset.ids_inputs.index(id_in)
            id_dest = nmt_model.ids_inputs[i]
            inputMapping[id_dest] = pos_source
        nmt_model.setInputsMapping(inputMapping)

        outputMapping = dict()
        for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']):
            pos_target = dataset.ids_outputs.index(id_out)
            id_dest = nmt_model.ids_outputs[i]
            outputMapping[id_dest] = pos_target
        nmt_model.setOutputsMapping(outputMapping)

    else:  # resume from a previously trained model
        nmt_model = TranslationModel(params,
                                     model_type=params['MODEL_TYPE'],
                                     verbose=params['VERBOSE'],
                                     model_name=params['MODEL_NAME'],
                                     vocabularies=dataset.vocabulary,
                                     store_path=params['STORE_PATH'],
                                     set_optimizer=False,
                                     trainable_pred=trainable_pred,
                                     trainable_est=trainable_est,
                                     weights_path=weights_path)

        # Define the inputs and outputs mapping from our Dataset instance to our model
        inputMapping = dict()
        for i, id_in in enumerate(params['INPUTS_IDS_DATASET']):
            pos_source = dataset.ids_inputs.index(id_in)
            id_dest = nmt_model.ids_inputs[i]
            inputMapping[id_dest] = pos_source
        nmt_model.setInputsMapping(inputMapping)

        outputMapping = dict()
        for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']):
            pos_target = dataset.ids_outputs.index(id_out)
            id_dest = nmt_model.ids_outputs[i]
            outputMapping[id_dest] = pos_target
        nmt_model.setOutputsMapping(outputMapping)

        nmt_model = updateModel(nmt_model, params['STORE_PATH'], params['RELOAD'], reload_epoch=params['RELOAD_EPOCH'])
        nmt_model.setParams(params)
        nmt_model.setOptimizer()
        params['EPOCH_OFFSET'] = params['RELOAD'] if params['RELOAD_EPOCH'] else \
            int(params['RELOAD'] * params['BATCH_SIZE'] / dataset.len_train)

    # Store configuration as pkl
    dict2pkl(params, params['STORE_PATH'] + '/config')

    # Callbacks
    callbacks = buildCallbacks(params, nmt_model, dataset)

    # Training
    total_start_time = timer()
    logger.debug('Starting training!')
    training_params = {'n_epochs': params['MAX_EPOCH'],
                       'batch_size': params['BATCH_SIZE'],
                       'homogeneous_batches': params['HOMOGENEOUS_BATCHES'],
                       'maxlen': params['MAX_OUTPUT_TEXT_LEN'],
                       'joint_batches': params['JOINT_BATCHES'],
                       'lr_decay': params.get('LR_DECAY', None),  # LR decay parameters
                       'reduce_each_epochs': params.get('LR_REDUCE_EACH_EPOCHS', True),
                       'start_reduction_on_epoch': params.get('LR_START_REDUCTION_ON_EPOCH', 0),
                       'lr_gamma': params.get('LR_GAMMA', 0.9),
                       'lr_reducer_type': params.get('LR_REDUCER_TYPE', 'linear'),
                       'lr_reducer_exp_base': params.get('LR_REDUCER_EXP_BASE', 0),
                       'lr_half_life': params.get('LR_HALF_LIFE', 50000),
                       'epochs_for_save': params['EPOCHS_FOR_SAVE'],
                       'verbose': params['VERBOSE'],
                       'eval_on_sets': params['EVAL_ON_SETS_KERAS'],
                       'n_parallel_loaders': params['PARALLEL_LOADERS'],
                       'extra_callbacks': callbacks,
                       'reload_epoch': params['RELOAD'],
                       'epoch_offset': params.get('EPOCH_OFFSET', 0),
                       'data_augmentation': params['DATA_AUGMENTATION'],
                       'patience': params.get('PATIENCE', 0),  # early stopping parameters
                       'metric_check': params.get('STOP_METRIC', None) if params.get('EARLY_STOP', False) else None,
                       'eval_on_epochs': params.get('EVAL_EACH_EPOCHS', True),
                       'each_n_epochs': params.get('EVAL_EACH', 1),
                       'start_eval_on_epoch': params.get('START_EVAL_ON_EPOCH', 0)
                       }

    if weights_dict is not None:
        for layer in nmt_model.model.layers:
            if layer.name in weights_dict:
                layer.set_weights(weights_dict[layer.name])

    nmt_model.trainNet(dataset, training_params)

    if weights_dict is not None:
        for layer in nmt_model.model.layers:
            weights_dict[layer.name] = layer.get_weights()

    total_end_time = timer()
    time_difference = total_end_time - total_start_time
    logging.info('In total is {0:.2f}s = {1:.2f}m'.format(time_difference, time_difference / 60.0))
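# --- Hedged usage sketch (not part of the original source) ---
# The train_model() above reads layer weights from weights_dict before training
# and writes the trained weights back afterwards, so a shared dict can carry
# weights across successive training stages. predictor_params and
# estimator_params below are placeholders for two configuration dicts; only the
# weights_dict handling mirrors the code above.
shared_weights = {}
train_model(predictor_params, shared_weights, trainable_pred=True, trainable_est=False)
train_model(estimator_params, shared_weights, trainable_pred=False, trainable_est=True)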
def test_transformer():
    params = load_tests_params()

    # Current test params: Transformer
    params['MODEL_TYPE'] = 'Transformer'
    params['N_LAYERS_ENCODER'] = 2
    params['N_LAYERS_DECODER'] = 2
    params['MULTIHEAD_ATTENTION_ACTIVATION'] = 'relu'
    params['MODEL_SIZE'] = 8
    params['FF_SIZE'] = params['MODEL_SIZE'] * 4
    params['N_HEADS'] = 2

    params['REBUILD_DATASET'] = True
    params['OPTIMIZED_SEARCH'] = True
    params['POS_UNK'] = False
    dataset = build_dataset(params)

    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]

    params['MODEL_NAME'] = \
        params['TASK_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '_' + params['MODEL_TYPE'] + \
        '_model_size_' + str(params['MODEL_SIZE']) + \
        '_ff_size_' + str(params['FF_SIZE']) + \
        '_num_heads_' + str(params['N_HEADS']) + \
        '_encoder_blocks_' + str(params['N_LAYERS_ENCODER']) + \
        '_decoder_blocks_' + str(params['N_LAYERS_DECODER']) + \
        '_deepout_' + '_'.join([layer[0] for layer in params['DEEP_OUTPUT_LAYERS']]) + \
        '_' + params['OPTIMIZER'] + '_' + str(params['LR'])

    params['STORE_PATH'] = K.backend() + '_test_train_models/' + params['MODEL_NAME'] + '/'

    # Test several NMT-Keras utilities: train, sample, sample_ensemble, score_corpus...
    print("Training model")
    train_model(params)
    params['RELOAD'] = 1
    print("Done")

    parser = argparse.ArgumentParser('Parser for unit testing')
    parser.dataset = params['DATASET_STORE_PATH'] + '/Dataset_' + params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '.pkl'
    parser.text = params['DATA_ROOT_PATH'] + '/' + params['TEXT_FILES']['val'] + params['SRC_LAN']
    parser.splits = ['val']
    parser.config = params['STORE_PATH'] + '/config.pkl'
    parser.models = [params['STORE_PATH'] + '/epoch_' + str(1)]
    parser.verbose = 0
    parser.dest = None
    parser.source = params['DATA_ROOT_PATH'] + '/' + params['TEXT_FILES']['val'] + params['SRC_LAN']
    parser.target = params['DATA_ROOT_PATH'] + '/' + params['TEXT_FILES']['val'] + params['TRG_LAN']
    parser.weights = []
    parser.glossary = None

    for n_best in [True, False]:
        parser.n_best = n_best
        print("Sampling with n_best = %s " % str(n_best))
        sample_ensemble(parser, params)
        print("Done")

    print("Scoring corpus")
    score_corpus(parser, params)
    print("Done")
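# --- Hedged note (not part of the original source) ---
# The tests above attach attributes directly to an argparse.ArgumentParser and
# hand it to sample_ensemble()/score_corpus(), which only read those attributes.
# A plain argparse.Namespace works just as well and makes the intent clearer;
# the values below simply mirror the assignments in test_transformer().
from argparse import Namespace

def build_sampling_args(params, epoch=1):
    """Assemble the attribute bag expected by sample_ensemble()/score_corpus()."""
    val_src = params['DATA_ROOT_PATH'] + '/' + params['TEXT_FILES']['val'] + params['SRC_LAN']
    val_trg = params['DATA_ROOT_PATH'] + '/' + params['TEXT_FILES']['val'] + params['TRG_LAN']
    return Namespace(
        dataset=params['DATASET_STORE_PATH'] + '/Dataset_' + params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '.pkl',
        text=val_src,
        splits=['val'],
        config=params['STORE_PATH'] + '/config.pkl',
        models=[params['STORE_PATH'] + '/epoch_' + str(epoch)],
        verbose=0,
        dest=None,
        source=val_src,
        target=val_trg,
        weights=[],
        glossary=None,
        n_best=False)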
def test_build_dataset(self):
    params = load_parameters()
    params['REBUILD_DATASET'] = True
    params['DATASET_STORE_PATH'] = './'
    ds = build_dataset(params)
    self.assertIsInstance(ds, Dataset)
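# --- Hedged usage sketch (not part of the original source) ---
# test_build_dataset(self) and the other self-based tests above are written as
# unittest.TestCase methods (the enclosing TestCase class is assumed to be
# defined elsewhere in the file). The standard runner then is simply:
import unittest

if __name__ == '__main__':
    unittest.main()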