def test_update_dataset_from_file(): params = load_parameters() for rebuild_dataset in [True, False]: params['REBUILD_DATASET'] = rebuild_dataset params['DATASET_STORE_PATH'] = '.' for splits in [[], None, ['val']]: ds = build_dataset(params) assert isinstance(ds, Dataset) for output_text_filename in [ None, os.path.join( params['DATA_ROOT_PATH'], params['TEXT_FILES']['test'] + params['TRG_LAN']) ]: for remove_outputs in [True, False]: for compute_state_below in [True, False]: for recompute_references in [True, False]: ds2 = update_dataset_from_file( copy.deepcopy(ds), os.path.join( params['DATA_ROOT_PATH'], params['TEXT_FILES']['test'] + params['SRC_LAN']), params, splits=splits, output_text_filename=output_text_filename, remove_outputs=remove_outputs, compute_state_below=compute_state_below, recompute_references=recompute_references) assert isinstance(ds2, Dataset) # Final check: We update the val set with the test data. We check that dimensions match. split = 'val' len_test = 2996 ds2 = update_dataset_from_file( copy.deepcopy(ds), params['DATA_ROOT_PATH'] + params['TEXT_FILES']['test'] + params['SRC_LAN'], params, splits=[split], output_text_filename=os.path.join( params['DATA_ROOT_PATH'], params['TEXT_FILES']['test'] + params['TRG_LAN']), remove_outputs=False, compute_state_below=True, recompute_references=True) assert isinstance(ds2, Dataset) assert eval('ds2.len_' + split) == len_test assert eval('all(ds2.loaded_' + split + ')') assert len(eval('ds2.X_' + split + str([params['INPUTS_IDS_DATASET'][0]]))) == len_test assert len(eval('ds2.Y_' + split + str([params['OUTPUTS_IDS_DATASET'][0]]))) == len_test if __name__ == '__main__': pytest.main([__file__])
def test_update_dataset_from_file(self): params = load_parameters() params['REBUILD_DATASET'] = True params['DATASET_STORE_PATH'] = './' ds = build_dataset(params) self.assertIsInstance(ds, Dataset) for splits in [[], ['val']]: for output_text_filename in [ None, params['DATA_ROOT_PATH'] + params['TEXT_FILES']['test'] + params['TRG_LAN'] ]: for remove_outputs in [True, False]: for compute_state_below in [True, False]: for recompute_references in [True, False]: ds2 = update_dataset_from_file( copy.deepcopy(ds), params['DATA_ROOT_PATH'] + params['TEXT_FILES']['test'] + params['SRC_LAN'], params, splits=splits, output_text_filename=output_text_filename, remove_outputs=remove_outputs, compute_state_below=compute_state_below, recompute_references=recompute_references) self.assertIsInstance(ds2, Dataset) # Final check: We update the val set with the test data. We check that dimensions match. split = 'val' len_test = 2996 ds2 = update_dataset_from_file( copy.deepcopy(ds), params['DATA_ROOT_PATH'] + params['TEXT_FILES']['test'] + params['SRC_LAN'], params, splits=[split], output_text_filename=params['DATA_ROOT_PATH'] + params['TEXT_FILES']['test'] + params['TRG_LAN'], remove_outputs=False, compute_state_below=True, recompute_references=True) self.assertIsInstance(ds2, Dataset) self.assertEqual(eval('ds2.len_' + split), len_test) self.assertTrue(eval('all(ds2.loaded_' + split + ')')) self.assertEqual( len(eval('ds2.X_' + split + str([params['INPUTS_IDS_DATASET'][0]]))), len_test) self.assertEqual( len( eval('ds2.Y_' + split + str([params['OUTPUTS_IDS_DATASET'][0]]))), len_test)
def score_corpus(args, params): print "Using an ensemble of %d models" % len(args.models) models = [loadModel(m, -1, full_path=True) for m in args.models] dataset = loadDataset(args.dataset) if args.source is not None: dataset = update_dataset_from_file(dataset, args.source, params, splits=args.splits, output_text_filename=args.target, compute_state_below=True) params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]] params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]] # Apply scoring extra_vars = dict() extra_vars['tokenize_f'] = eval('dataset.' + params['TOKENIZATION_METHOD']) for s in args.splits: # Apply model predictions params_prediction = {'max_batch_size': params['BATCH_SIZE'], 'n_parallel_loaders': params['PARALLEL_LOADERS'], 'predict_on_sets': [s]} if params['BEAM_SEARCH']: params_prediction['beam_size'] = params['BEAM_SIZE'] params_prediction['maxlen'] = params['MAX_OUTPUT_TEXT_LEN_TEST'] params_prediction['optimized_search'] = params['OPTIMIZED_SEARCH'] params_prediction['model_inputs'] = params['INPUTS_IDS_MODEL'] params_prediction['model_outputs'] = params['OUTPUTS_IDS_MODEL'] params_prediction['dataset_inputs'] = params['INPUTS_IDS_DATASET'] params_prediction['dataset_outputs'] = params['OUTPUTS_IDS_DATASET'] params_prediction['normalize_probs'] = params.get('NORMALIZE_SAMPLING', False) params_prediction['alpha_factor'] = params.get('ALPHA_FACTOR', 1.0) params_prediction['coverage_penalty'] = params.get('COVERAGE_PENALTY', False) params_prediction['length_penalty'] = params.get('LENGTH_PENALTY', False) params_prediction['length_norm_factor'] = params.get('LENGTH_NORM_FACTOR', 0.0) params_prediction['coverage_norm_factor'] = params.get('COVERAGE_NORM_FACTOR', 0.0) params_prediction['pos_unk'] = params.get('POS_UNK', False) params_prediction['state_below_maxlen'] = -1 if params.get('PAD_ON_BATCH', True) \ else params.get('MAX_OUTPUT_TEXT_LEN', 50) params_prediction['output_max_length_depending_on_x'] = params.get('MAXLEN_GIVEN_X', True) params_prediction['output_max_length_depending_on_x_factor'] = params.get('MAXLEN_GIVEN_X_FACTOR', 3) params_prediction['output_min_length_depending_on_x'] = params.get('MINLEN_GIVEN_X', True) params_prediction['output_min_length_depending_on_x_factor'] = params.get('MINLEN_GIVEN_X_FACTOR', 2) beam_searcher = BeamSearchEnsemble(models, dataset, params_prediction, verbose=args.verbose) scores = beam_searcher.scoreNet()[s] # Store result if args.dest is not None: filepath = args.dest # results file if params['SAMPLING_SAVE_MODE'] == 'list': list2file(filepath, scores) elif params['SAMPLING_SAVE_MODE'] == 'numpy': numpy2file(filepath, scores) else: raise Exception('The sampling mode ' + params['SAMPLING_SAVE_MODE'] + ' is not currently supported.') else: print scores
k, v = arg.split('=') except ValueError: print 'Overwritten arguments must have the form key=Value. \n Currently are: %s' % str( args.changes) exit(1) try: params[k] = ast.literal_eval(v) except ValueError: params[k] = v except ValueError: print 'Error processing arguments: (', k, ",", v, ")" exit(2) dataset = loadDataset(args.dataset) dataset = update_dataset_from_file(dataset, args.text, params, splits=args.splits, remove_outputs=True) params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[ params['INPUTS_IDS_DATASET'][0]] params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[ params['OUTPUTS_IDS_DATASET'][0]] # For converting predictions into sentences index2word_y = dataset.vocabulary[params['OUTPUTS_IDS_DATASET'] [0]]['idx2words'] if params.get('APPLY_DETOKENIZATION', False): detokenize_function = eval('dataset.' + params['DETOKENIZATION_METHOD'])
def train_model(params, load_dataset=None): """ Training function. Sets the training parameters from params. Build or loads the model and launches the training. :param params: Dictionary of network hyperparameters. :param load_dataset: Load dataset from file or build it from the parameters. :return: None """ check_params(params) if params['RELOAD'] > 0: logging.info('Resuming training.') # Load data if load_dataset is None: if params['REBUILD_DATASET']: logging.info('Rebuilding dataset.') dataset = build_dataset(params) else: logging.info('Updating dataset.') dataset = loadDataset(params['DATASET_STORE_PATH'] + '/Dataset_' + params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '.pkl') params['EPOCH_OFFSET'] = params['RELOAD'] if params['RELOAD_EPOCH'] else \ int(params['RELOAD'] * params['BATCH_SIZE'] / dataset.len_train) for split, filename in params['TEXT_FILES'].iteritems(): dataset = update_dataset_from_file( dataset, params['DATA_ROOT_PATH'] + '/' + filename + params['SRC_LAN'], params, splits=list([split]), output_text_filename=params['DATA_ROOT_PATH'] + '/' + filename + params['TRG_LAN'], remove_outputs=False, compute_state_below=True, recompute_references=True) dataset.name = params['DATASET_NAME'] + '_' + params[ 'SRC_LAN'] + params['TRG_LAN'] saveDataset(dataset, params['DATASET_STORE_PATH']) else: logging.info('Reloading and using dataset.') dataset = loadDataset(load_dataset) else: # Load data if load_dataset is None: dataset = build_dataset(params) else: dataset = loadDataset(load_dataset) params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[ params['INPUTS_IDS_DATASET'][0]] params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[ params['OUTPUTS_IDS_DATASET'][0]] # Build model set_optimizer = True if params['RELOAD'] == 0 else False clear_dirs = True if params['RELOAD'] == 0 else False # build new model nmt_model = TranslationModel(params, model_type=params['MODEL_TYPE'], verbose=params['VERBOSE'], model_name=params['MODEL_NAME'], vocabularies=dataset.vocabulary, store_path=params['STORE_PATH'], set_optimizer=set_optimizer, clear_dirs=clear_dirs) # Define the inputs and outputs mapping from our Dataset instance to our model inputMapping = dict() for i, id_in in enumerate(params['INPUTS_IDS_DATASET']): pos_source = dataset.ids_inputs.index(id_in) id_dest = nmt_model.ids_inputs[i] inputMapping[id_dest] = pos_source nmt_model.setInputsMapping(inputMapping) outputMapping = dict() for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']): pos_target = dataset.ids_outputs.index(id_out) id_dest = nmt_model.ids_outputs[i] outputMapping[id_dest] = pos_target nmt_model.setOutputsMapping(outputMapping) if params['RELOAD'] > 0: nmt_model = updateModel(nmt_model, params['STORE_PATH'], params['RELOAD'], reload_epoch=params['RELOAD_EPOCH']) nmt_model.setParams(params) nmt_model.setOptimizer() if params.get('EPOCH_OFFSET') is None: params['EPOCH_OFFSET'] = params['RELOAD'] if params['RELOAD_EPOCH'] else \ int(params['RELOAD'] * params['BATCH_SIZE'] / dataset.len_train) # Store configuration as pkl dict2pkl(params, params['STORE_PATH'] + '/config') # Callbacks callbacks = buildCallbacks(params, nmt_model, dataset) # Training total_start_time = timer() logger.debug('Starting training!') training_params = { 'n_epochs': params['MAX_EPOCH'], 'batch_size': params['BATCH_SIZE'], 'homogeneous_batches': params['HOMOGENEOUS_BATCHES'], 'maxlen': params['MAX_OUTPUT_TEXT_LEN'], 'joint_batches': params['JOINT_BATCHES'], 'lr_decay': params.get('LR_DECAY', None), # LR decay parameters 'reduce_each_epochs': params.get('LR_REDUCE_EACH_EPOCHS', True), 'start_reduction_on_epoch': params.get('LR_START_REDUCTION_ON_EPOCH', 0), 'lr_gamma': params.get('LR_GAMMA', 0.9), 'lr_reducer_type': params.get('LR_REDUCER_TYPE', 'linear'), 'lr_reducer_exp_base': params.get('LR_REDUCER_EXP_BASE', 0), 'lr_half_life': params.get('LR_HALF_LIFE', 50000), 'epochs_for_save': params['EPOCHS_FOR_SAVE'], 'verbose': params['VERBOSE'], 'eval_on_sets': params['EVAL_ON_SETS_KERAS'], 'n_parallel_loaders': params['PARALLEL_LOADERS'], 'extra_callbacks': callbacks, 'reload_epoch': params['RELOAD'], 'epoch_offset': params.get('EPOCH_OFFSET', 0), 'data_augmentation': params['DATA_AUGMENTATION'], 'patience': params.get('PATIENCE', 0), # early stopping parameters 'metric_check': params.get('STOP_METRIC', None) if params.get('EARLY_STOP', False) else None, 'eval_on_epochs': params.get('EVAL_EACH_EPOCHS', True), 'each_n_epochs': params.get('EVAL_EACH', 1), 'start_eval_on_epoch': params.get('START_EVAL_ON_EPOCH', 0), 'tensorboard': params.get('TENSORBOARD', False), 'tensorboard_params': { 'log_dir': params.get('LOG_DIR', 'tensorboard_logs'), 'histogram_freq': params.get('HISTOGRAM_FREQ', 0), 'batch_size': params.get('TENSORBOARD_BATCH_SIZE', params['BATCH_SIZE']), 'write_graph': params.get('WRITE_GRAPH', True), 'write_grads': params.get('WRITE_GRADS', False), 'write_images': params.get('WRITE_IMAGES', False), 'embeddings_freq': params.get('EMBEDDINGS_FREQ', 0), 'embeddings_layer_names': params.get('EMBEDDINGS_LAYER_NAMES', None), 'embeddings_metadata': params.get('EMBEDDINGS_METADATA', None), 'label_word_embeddings_with_vocab': params.get('LABEL_WORD_EMBEDDINGS_WITH_VOCAB', False), 'word_embeddings_labels': params.get('WORD_EMBEDDINGS_LABELS', None), } } nmt_model.trainNet(dataset, training_params) total_end_time = timer() time_difference = total_end_time - total_start_time logging.info('In total is {0:.2f}s = {1:.2f}m'.format( time_difference, time_difference / 60.0))
def sample_ensemble(args, params): from data_engine.prepare_data import update_dataset_from_file from keras_wrapper.model_ensemble import BeamSearchEnsemble from keras_wrapper.cnn_model import loadModel from keras_wrapper.dataset import loadDataset from keras_wrapper.utils import decode_predictions_beam_search logging.info("Using an ensemble of %d models" % len(args.models)) models = [loadModel(m, -1, full_path=True) for m in args.models] dataset = loadDataset(args.dataset) dataset = update_dataset_from_file(dataset, args.text, params, splits=args.splits, remove_outputs=True) params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[ params['INPUTS_IDS_DATASET'][0]] params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[ params['OUTPUTS_IDS_DATASET'][0]] # For converting predictions into sentences index2word_y = dataset.vocabulary[params['OUTPUTS_IDS_DATASET'] [0]]['idx2words'] if params.get('APPLY_DETOKENIZATION', False): detokenize_function = eval('dataset.' + params['DETOKENIZATION_METHOD']) params_prediction = dict() params_prediction['max_batch_size'] = params.get('BATCH_SIZE', 20) params_prediction['n_parallel_loaders'] = params.get('PARALLEL_LOADERS', 1) params_prediction['beam_size'] = params.get('BEAM_SIZE', 6) params_prediction['maxlen'] = params.get('MAX_OUTPUT_TEXT_LEN_TEST', 100) params_prediction['optimized_search'] = params['OPTIMIZED_SEARCH'] params_prediction['model_inputs'] = params['INPUTS_IDS_MODEL'] params_prediction['model_outputs'] = params['OUTPUTS_IDS_MODEL'] params_prediction['dataset_inputs'] = params['INPUTS_IDS_DATASET'] params_prediction['dataset_outputs'] = params['OUTPUTS_IDS_DATASET'] params_prediction['search_pruning'] = params.get('SEARCH_PRUNING', False) params_prediction['normalize_probs'] = params.get('NORMALIZE_SAMPLING', False) params_prediction['alpha_factor'] = params.get('ALPHA_FACTOR', 1.0) params_prediction['coverage_penalty'] = params.get('COVERAGE_PENALTY', False) params_prediction['length_penalty'] = params.get('LENGTH_PENALTY', False) params_prediction['length_norm_factor'] = params.get( 'LENGTH_NORM_FACTOR', 0.0) params_prediction['coverage_norm_factor'] = params.get( 'COVERAGE_NORM_FACTOR', 0.0) params_prediction['pos_unk'] = params.get('POS_UNK', False) params_prediction['state_below_maxlen'] = -1 if params.get('PAD_ON_BATCH', True) \ else params.get('MAX_OUTPUT_TEXT_LEN', 50) params_prediction['output_max_length_depending_on_x'] = params.get( 'MAXLEN_GIVEN_X', True) params_prediction['output_max_length_depending_on_x_factor'] = params.get( 'MAXLEN_GIVEN_X_FACTOR', 3) params_prediction['output_min_length_depending_on_x'] = params.get( 'MINLEN_GIVEN_X', True) params_prediction['output_min_length_depending_on_x_factor'] = params.get( 'MINLEN_GIVEN_X_FACTOR', 2) params_prediction['attend_on_output'] = params.get( 'ATTEND_ON_OUTPUT', 'transformer' in params['MODEL_TYPE'].lower()) heuristic = params.get('HEURISTIC', 0) mapping = None if dataset.mapping == dict() else dataset.mapping model_weights = args.weights if model_weights is not None and model_weights != []: assert len(model_weights) == len( models ), 'You should give a weight to each model. You gave %d models and %d weights.' % ( len(models), len(model_weights)) model_weights = map(lambda x: float(x), model_weights) if len(model_weights) > 1: logger.info('Giving the following weights to each model: %s' % str(model_weights)) for s in args.splits: # Apply model predictions params_prediction['predict_on_sets'] = [s] beam_searcher = BeamSearchEnsemble(models, dataset, params_prediction, model_weights=model_weights, n_best=args.n_best, verbose=args.verbose) if args.n_best: predictions, n_best = beam_searcher.predictBeamSearchNet()[s] else: predictions = beam_searcher.predictBeamSearchNet()[s] n_best = None if params_prediction['pos_unk']: samples = predictions[0] alphas = predictions[1] sources = [ x.strip() for x in open(args.text, 'r').read().split('\n') ] sources = sources[:-1] if len(sources[-1]) == 0 else sources else: samples = predictions alphas = None heuristic = None sources = None predictions = decode_predictions_beam_search(samples, index2word_y, alphas=alphas, x_text=sources, heuristic=heuristic, mapping=mapping, verbose=args.verbose) # Apply detokenization function if needed if params.get('APPLY_DETOKENIZATION', False): predictions = map(detokenize_function, predictions) if args.n_best: n_best_predictions = [] for i, (n_best_preds, n_best_scores, n_best_alphas) in enumerate(n_best): n_best_sample_score = [] for n_best_pred, n_best_score, n_best_alpha in zip( n_best_preds, n_best_scores, n_best_alphas): pred = decode_predictions_beam_search( [n_best_pred], index2word_y, alphas=[n_best_alpha] if params_prediction['pos_unk'] else None, x_text=[sources[i]] if params_prediction['pos_unk'] else None, heuristic=heuristic, mapping=mapping, verbose=args.verbose) # Apply detokenization function if needed if params.get('APPLY_DETOKENIZATION', False): pred = map(detokenize_function, pred) n_best_sample_score.append([i, pred, n_best_score]) n_best_predictions.append(n_best_sample_score) # Store result if args.dest is not None: filepath = args.dest # results file if params.get('SAMPLING_SAVE_MODE', 'list'): list2file(filepath, predictions) if args.n_best: nbest2file(filepath + '.nbest', n_best_predictions) else: raise Exception( 'Only "list" is allowed in "SAMPLING_SAVE_MODE"') else: list2stdout(predictions) if args.n_best: logging.info('Storing n-best sentences in ./' + s + '.nbest') nbest2file('./' + s + '.nbest', n_best_predictions) logging.info('Sampling finished')
def score_corpus(args, params): """ Use one or several translation models for scoring source--target pairs- :param argparse.Namespace args: Arguments given to the method: * dataset: Dataset instance with data. * source: Text file with source sentences. * target: Text file with target sentences. * splits: Splits to sample. Should be already included in the dataset object. * dest: Output file to save scores. * weights: Weight given to each model in the ensemble. You should provide the same number of weights than models. By default, it applies the same weight to each model (1/N). * verbose: Be verbose or not. * config: Config .pkl for loading the model configuration. If not specified, hyperparameters are read from config.py. * models: Path to the models. :param dict params: parameters of the translation model. """ from data_engine.prepare_data import update_dataset_from_file from keras_wrapper.dataset import loadDataset from keras_wrapper.cnn_model import loadModel from keras_wrapper.model_ensemble import BeamSearchEnsemble logging.info("Using an ensemble of %d models" % len(args.models)) models = [loadModel(m, -1, full_path=True) for m in args.models] dataset = loadDataset(args.dataset) dataset = update_dataset_from_file(dataset, args.source, params, splits=args.splits, output_text_filename=args.target, compute_state_below=True) params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[ params['INPUTS_IDS_DATASET'][0]] params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[ params['OUTPUTS_IDS_DATASET'][0]] # Apply scoring extra_vars = dict() extra_vars['tokenize_f'] = eval('dataset.' + params['TOKENIZATION_METHOD']) model_weights = args.weights if model_weights is not None and model_weights != []: assert len(model_weights) == len( models ), 'You should give a weight to each model. You gave %d models and %d weights.' % ( len(models), len(model_weights)) model_weights = map(float, model_weights) if len(model_weights) > 1: logger.info('Giving the following weights to each model: %s' % str(model_weights)) for s in args.splits: # Apply model predictions params_prediction = { 'max_batch_size': params['BATCH_SIZE'], 'n_parallel_loaders': params['PARALLEL_LOADERS'], 'predict_on_sets': [s] } if params['BEAM_SEARCH']: params_prediction['beam_size'] = params['BEAM_SIZE'] params_prediction['maxlen'] = params['MAX_OUTPUT_TEXT_LEN_TEST'] params_prediction['optimized_search'] = params['OPTIMIZED_SEARCH'] params_prediction['model_inputs'] = params['INPUTS_IDS_MODEL'] params_prediction['model_outputs'] = params['OUTPUTS_IDS_MODEL'] params_prediction['dataset_inputs'] = params['INPUTS_IDS_DATASET'] params_prediction['dataset_outputs'] = params[ 'OUTPUTS_IDS_DATASET'] params_prediction['normalize_probs'] = params.get( 'NORMALIZE_SAMPLING', False) params_prediction['alpha_factor'] = params.get('ALPHA_FACTOR', 1.0) params_prediction['coverage_penalty'] = params.get( 'COVERAGE_PENALTY', False) params_prediction['length_penalty'] = params.get( 'LENGTH_PENALTY', False) params_prediction['length_norm_factor'] = params.get( 'LENGTH_NORM_FACTOR', 0.0) params_prediction['coverage_norm_factor'] = params.get( 'COVERAGE_NORM_FACTOR', 0.0) params_prediction['pos_unk'] = params.get('POS_UNK', False) params_prediction['state_below_maxlen'] = -1 if params.get('PAD_ON_BATCH', True) \ else params.get('MAX_OUTPUT_TEXT_LEN', 50) params_prediction['output_max_length_depending_on_x'] = params.get( 'MAXLEN_GIVEN_X', True) params_prediction[ 'output_max_length_depending_on_x_factor'] = params.get( 'MAXLEN_GIVEN_X_FACTOR', 3) params_prediction['output_min_length_depending_on_x'] = params.get( 'MINLEN_GIVEN_X', True) params_prediction[ 'output_min_length_depending_on_x_factor'] = params.get( 'MINLEN_GIVEN_X_FACTOR', 2) params_prediction['attend_on_output'] = params.get( 'ATTEND_ON_OUTPUT', 'transformer' in params['MODEL_TYPE'].lower()) beam_searcher = BeamSearchEnsemble(models, dataset, params_prediction, model_weights=model_weights, verbose=args.verbose) scores = beam_searcher.scoreNet()[s] # Store result if args.dest is not None: filepath = args.dest # results file if params['SAMPLING_SAVE_MODE'] == 'list': list2file(filepath, scores) elif params['SAMPLING_SAVE_MODE'] == 'numpy': numpy2file(filepath, scores) else: raise Exception('The sampling mode ' + params['SAMPLING_SAVE_MODE'] + ' is not currently supported.') else: print(scores)
def sample_ensemble(args, params): """ Use several translation models for obtaining predictions from a source text file. :param argparse.Namespace args: Arguments given to the method: * dataset: Dataset instance with data. * text: Text file with source sentences. * splits: Splits to sample. Should be already included in the dataset object. * dest: Output file to save scores. * weights: Weight given to each model in the ensemble. You should provide the same number of weights than models. By default, it applies the same weight to each model (1/N). * n_best: Write n-best list (n = beam size). * config: Config .pkl for loading the model configuration. If not specified, hyperparameters are read from config.py. * models: Path to the models. * verbose: Be verbose or not. :param params: parameters of the translation model. """ from data_engine.prepare_data import update_dataset_from_file from keras_wrapper.model_ensemble import BeamSearchEnsemble from keras_wrapper.cnn_model import loadModel from keras_wrapper.dataset import loadDataset from keras_wrapper.utils import decode_predictions_beam_search logger.info("Using an ensemble of %d models" % len(args.models)) models = [loadModel(m, -1, full_path=True) for m in args.models] dataset = loadDataset(args.dataset) dataset = update_dataset_from_file(dataset, args.text, params, splits=args.splits, remove_outputs=True) params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]] params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]] # For converting predictions into sentences index2word_y = dataset.vocabulary[params['OUTPUTS_IDS_DATASET'][0]]['idx2words'] if params.get('APPLY_DETOKENIZATION', False): detokenize_function = eval('dataset.' + params['DETOKENIZATION_METHOD']) params_prediction = dict() params_prediction['max_batch_size'] = params.get('BATCH_SIZE', 20) params_prediction['n_parallel_loaders'] = params.get('PARALLEL_LOADERS', 1) params_prediction['beam_size'] = params.get('BEAM_SIZE', 6) params_prediction['maxlen'] = params.get('MAX_OUTPUT_TEXT_LEN_TEST', 100) params_prediction['optimized_search'] = params['OPTIMIZED_SEARCH'] params_prediction['model_inputs'] = params['INPUTS_IDS_MODEL'] params_prediction['model_outputs'] = params['OUTPUTS_IDS_MODEL'] params_prediction['dataset_inputs'] = params['INPUTS_IDS_DATASET'] params_prediction['dataset_outputs'] = params['OUTPUTS_IDS_DATASET'] params_prediction['search_pruning'] = params.get('SEARCH_PRUNING', False) params_prediction['normalize_probs'] = params.get('NORMALIZE_SAMPLING', False) params_prediction['alpha_factor'] = params.get('ALPHA_FACTOR', 1.0) params_prediction['coverage_penalty'] = params.get('COVERAGE_PENALTY', False) params_prediction['length_penalty'] = params.get('LENGTH_PENALTY', False) params_prediction['length_norm_factor'] = params.get('LENGTH_NORM_FACTOR', 0.0) params_prediction['coverage_norm_factor'] = params.get('COVERAGE_NORM_FACTOR', 0.0) params_prediction['pos_unk'] = params.get('POS_UNK', False) params_prediction['state_below_maxlen'] = -1 if params.get('PAD_ON_BATCH', True) \ else params.get('MAX_OUTPUT_TEXT_LEN', 50) params_prediction['output_max_length_depending_on_x'] = params.get('MAXLEN_GIVEN_X', True) params_prediction['output_max_length_depending_on_x_factor'] = params.get('MAXLEN_GIVEN_X_FACTOR', 3) params_prediction['output_min_length_depending_on_x'] = params.get('MINLEN_GIVEN_X', True) params_prediction['output_min_length_depending_on_x_factor'] = params.get('MINLEN_GIVEN_X_FACTOR', 2) params_prediction['attend_on_output'] = params.get('ATTEND_ON_OUTPUT', 'transformer' in params['MODEL_TYPE'].lower()) params_prediction['glossary'] = params.get('GLOSSARY', None) heuristic = params.get('HEURISTIC', 0) mapping = None if dataset.mapping == dict() else dataset.mapping model_weights = args.weights if args.glossary is not None: glossary = pkl2dict(args.glossary) elif params_prediction['glossary'] is not None: glossary = pkl2dict(params_prediction['glossary']) else: glossary = None if model_weights: assert len(model_weights) == len( models), 'You should give a weight to each model. You gave %d models and %d weights.' % ( len(models), len(model_weights)) model_weights = list(map(float, model_weights)) if len(model_weights) > 1: logger.info('Giving the following weights to each model: %s' % str(model_weights)) for s in args.splits: # Apply model predictions params_prediction['predict_on_sets'] = [s] beam_searcher = BeamSearchEnsemble(models, dataset, params_prediction, model_weights=model_weights, n_best=args.n_best, verbose=args.verbose) predictions = beam_searcher.predictBeamSearchNet()[s] samples = predictions['samples'] alphas = predictions['alphas'] if params_prediction['pos_unk'] else None if params_prediction['pos_unk']: sources = [x.strip() for x in open(args.text, 'r').read().split('\n')] sources = sources[:-1] if len(sources[-1]) == 0 else sources else: sources = None decoded_predictions = decode_predictions_beam_search(samples, index2word_y, glossary=glossary, alphas=alphas, x_text=sources, heuristic=heuristic, mapping=mapping, verbose=args.verbose) # Apply detokenization function if needed if params.get('APPLY_DETOKENIZATION', False): decoded_predictions = list(map(detokenize_function, decoded_predictions)) if args.n_best: n_best_predictions = [] for i, (n_best_preds, n_best_scores, n_best_alphas) in enumerate(predictions['n_best']): n_best_sample_score = [] for n_best_pred, n_best_score, n_best_alpha in zip(n_best_preds, n_best_scores, n_best_alphas): pred = decode_predictions_beam_search([n_best_pred], index2word_y, glossary=glossary, alphas=[n_best_alpha] if params_prediction[ 'pos_unk'] else None, x_text=[sources[i]] if params_prediction['pos_unk'] else None, heuristic=heuristic, mapping=mapping, verbose=args.verbose) # Apply detokenization function if needed if params.get('APPLY_DETOKENIZATION', False): pred = list(map(detokenize_function, pred)) n_best_sample_score.append([i, pred, n_best_score]) n_best_predictions.append(n_best_sample_score) # Store result if args.dest is not None: filepath = args.dest # results file if params.get('SAMPLING_SAVE_MODE', 'list'): list2file(filepath, decoded_predictions) if args.n_best: nbest2file(filepath + '.nbest', n_best_predictions) else: raise Exception('Only "list" is allowed in "SAMPLING_SAVE_MODE"') else: list2stdout(decoded_predictions) if args.n_best: logger.info('Storing n-best sentences in ./' + s + '.nbest') nbest2file('./' + s + '.nbest', n_best_predictions) logger.info('Sampling finished')
args = parse_args() models = args.models print "Using an ensemble of %d models" % len(args.models) models = [loadModel(m, -1, full_path=True) for m in args.models] if args.config is None: print "Reading parameters from config.py" params = load_parameters() else: print "Loading parameters from %s" % str(args.config) params = pkl2dict(args.config) dataset = loadDataset(args.dataset) if args.source is not None: dataset = update_dataset_from_file(dataset, args.source, params, splits=args.splits, output_text_filename=args.target, compute_state_below=True) params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[ params['INPUTS_IDS_DATASET'][0]] params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[ params['OUTPUTS_IDS_DATASET'][0]] # Apply scoring extra_vars = dict() extra_vars['tokenize_f'] = eval('dataset.' + params['TOKENIZATION_METHOD']) for s in args.splits: # Apply model predictions params_prediction = { 'max_batch_size': params['BATCH_SIZE'], 'n_parallel_loaders': params['PARALLEL_LOADERS'],
def score_corpus(args, params): from data_engine.prepare_data import update_dataset_from_file from keras_wrapper.dataset import loadDataset from keras_wrapper.cnn_model import loadModel from keras_wrapper.model_ensemble import BeamSearchEnsemble logging.info("Using an ensemble of %d models" % len(args.models)) models = [loadModel(m, -1, full_path=True) for m in args.models] dataset = loadDataset(args.dataset) dataset = update_dataset_from_file(dataset, args.source, params, splits=args.splits, output_text_filename=args.target, compute_state_below=True) params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[ params['INPUTS_IDS_DATASET'][0]] params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[ params['OUTPUTS_IDS_DATASET'][0]] # Apply scoring extra_vars = dict() extra_vars['tokenize_f'] = eval('dataset.' + params['TOKENIZATION_METHOD']) model_weights = args.weights if model_weights is not None and model_weights != []: assert len(model_weights) == len( models ), 'You should give a weight to each model. You gave %d models and %d weights.' % ( len(models), len(model_weights)) model_weights = map(lambda x: float(x), model_weights) if len(model_weights) > 1: logger.info('Giving the following weights to each model: %s' % str(model_weights)) for s in args.splits: # Apply model predictions params_prediction = { 'max_batch_size': params['BATCH_SIZE'], 'n_parallel_loaders': params['PARALLEL_LOADERS'], 'predict_on_sets': [s] } if params['BEAM_SEARCH']: params_prediction['beam_size'] = params['BEAM_SIZE'] params_prediction['maxlen'] = params['MAX_OUTPUT_TEXT_LEN_TEST'] params_prediction['optimized_search'] = params['OPTIMIZED_SEARCH'] params_prediction['model_inputs'] = params['INPUTS_IDS_MODEL'] params_prediction['model_outputs'] = params['OUTPUTS_IDS_MODEL'] params_prediction['dataset_inputs'] = params['INPUTS_IDS_DATASET'] params_prediction['dataset_outputs'] = params[ 'OUTPUTS_IDS_DATASET'] params_prediction['normalize_probs'] = params.get( 'NORMALIZE_SAMPLING', False) params_prediction['alpha_factor'] = params.get('ALPHA_FACTOR', 1.0) params_prediction['coverage_penalty'] = params.get( 'COVERAGE_PENALTY', False) params_prediction['length_penalty'] = params.get( 'LENGTH_PENALTY', False) params_prediction['length_norm_factor'] = params.get( 'LENGTH_NORM_FACTOR', 0.0) params_prediction['coverage_norm_factor'] = params.get( 'COVERAGE_NORM_FACTOR', 0.0) params_prediction['pos_unk'] = params.get('POS_UNK', False) params_prediction['state_below_maxlen'] = -1 if params.get('PAD_ON_BATCH', True) \ else params.get('MAX_OUTPUT_TEXT_LEN', 50) params_prediction['output_max_length_depending_on_x'] = params.get( 'MAXLEN_GIVEN_X', True) params_prediction[ 'output_max_length_depending_on_x_factor'] = params.get( 'MAXLEN_GIVEN_X_FACTOR', 3) params_prediction['output_min_length_depending_on_x'] = params.get( 'MINLEN_GIVEN_X', True) params_prediction[ 'output_min_length_depending_on_x_factor'] = params.get( 'MINLEN_GIVEN_X_FACTOR', 2) params_prediction['attend_on_output'] = params.get( 'ATTEND_ON_OUTPUT', 'transformer' in params['MODEL_TYPE'].lower()) beam_searcher = BeamSearchEnsemble(models, dataset, params_prediction, model_weights=model_weights, verbose=args.verbose) scores = beam_searcher.scoreNet()[s] # Store result if args.dest is not None: filepath = args.dest # results file if params['SAMPLING_SAVE_MODE'] == 'list': list2file(filepath, scores) elif params['SAMPLING_SAVE_MODE'] == 'numpy': numpy2file(filepath, scores) else: raise Exception('The sampling mode ' + params['SAMPLING_SAVE_MODE'] + ' is not currently supported.') else: print(scores)
def train_model(params, weights_dict, load_dataset=None, trainable_pred=True, trainable_est=True, weights_path=None): """ Training function. Sets the training parameters from params. Build or loads the model and launches the training. :param params: Dictionary of network hyperparameters. :param load_dataset: Load dataset from file or build it from the parameters. :return: None """ check_params(params) if params['RELOAD'] > 0: logging.info('Resuming training.') # Load data if load_dataset is None: if params['REBUILD_DATASET']: logging.info('Rebuilding dataset.') pred_vocab = params.get('PRED_VOCAB', None) if pred_vocab is not None: dataset_voc = loadDataset(params['PRED_VOCAB']) dataset = build_dataset(params, dataset_voc.vocabulary, dataset_voc.vocabulary_len) else: dataset = build_dataset(params) else: logging.info('Updating dataset.') dataset = loadDataset(params['DATASET_STORE_PATH'] + '/Dataset_' + params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '.pkl') for split, filename in params['TEXT_FILES'].iteritems(): dataset = update_dataset_from_file( dataset, params['DATA_ROOT_PATH'] + '/' + filename + params['SRC_LAN'], params, splits=list([split]), output_text_filename=params['DATA_ROOT_PATH'] + '/' + filename + params['TRG_LAN'], remove_outputs=False, compute_state_below=True, recompute_references=True) dataset.name = params['DATASET_NAME'] + '_' + params[ 'SRC_LAN'] + params['TRG_LAN'] saveDataset(dataset, params['DATASET_STORE_PATH']) else: logging.info('Reloading and using dataset.') dataset = loadDataset(load_dataset) else: # Load data if load_dataset is None: pred_vocab = params.get('PRED_VOCAB', None) if pred_vocab is not None: dataset_voc = loadDataset(params['PRED_VOCAB']) # for the testing pharse handle model vocab differences #dataset_voc.vocabulary['target_text'] = dataset_voc.vocabulary['target'] #dataset_voc.vocabulary_len['target_text'] = dataset_voc.vocabulary_len['target'] dataset = build_dataset(params, dataset_voc.vocabulary, dataset_voc.vocabulary_len) else: dataset = build_dataset(params) else: dataset = loadDataset(load_dataset) params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[ params['INPUTS_IDS_DATASET'][0]] #params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET_FULL'][0]] params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len['target_text'] # Build model if params['RELOAD'] == 0: # build new model nmt_model = TranslationModel(params, model_type=params['MODEL_TYPE'], verbose=params['VERBOSE'], model_name=params['MODEL_NAME'], vocabularies=dataset.vocabulary, store_path=params['STORE_PATH'], trainable_pred=trainable_pred, trainable_est=trainable_est, clear_dirs=True, weights_path=weights_path) # Define the inputs and outputs mapping from our Dataset instance to our model inputMapping = dict() for i, id_in in enumerate(params['INPUTS_IDS_DATASET']): pos_source = dataset.ids_inputs.index(id_in) id_dest = nmt_model.ids_inputs[i] inputMapping[id_dest] = pos_source nmt_model.setInputsMapping(inputMapping) outputMapping = dict() for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']): pos_target = dataset.ids_outputs.index(id_out) id_dest = nmt_model.ids_outputs[i] outputMapping[id_dest] = pos_target nmt_model.setOutputsMapping(outputMapping) else: # resume from previously trained model nmt_model = TranslationModel(params, model_type=params['MODEL_TYPE'], verbose=params['VERBOSE'], model_name=params['MODEL_NAME'], vocabularies=dataset.vocabulary, store_path=params['STORE_PATH'], set_optimizer=False, trainable_pred=trainable_pred, trainable_est=trainable_est, weights_path=weights_path) # Define the inputs and outputs mapping from our Dataset instance to our model inputMapping = dict() for i, id_in in enumerate(params['INPUTS_IDS_DATASET']): pos_source = dataset.ids_inputs.index(id_in) id_dest = nmt_model.ids_inputs[i] inputMapping[id_dest] = pos_source nmt_model.setInputsMapping(inputMapping) outputMapping = dict() for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']): pos_target = dataset.ids_outputs.index(id_out) id_dest = nmt_model.ids_outputs[i] outputMapping[id_dest] = pos_target nmt_model.setOutputsMapping(outputMapping) nmt_model = updateModel(nmt_model, params['STORE_PATH'], params['RELOAD'], reload_epoch=params['RELOAD_EPOCH']) nmt_model.setParams(params) nmt_model.setOptimizer() params['EPOCH_OFFSET'] = params['RELOAD'] if params['RELOAD_EPOCH'] else \ int(params['RELOAD'] * params['BATCH_SIZE'] / dataset.len_train) # Store configuration as pkl dict2pkl(params, params['STORE_PATH'] + '/config') # Callbacks callbacks = buildCallbacks(params, nmt_model, dataset) # Training total_start_time = timer() logger.debug('Starting training!') training_params = { 'n_epochs': params['MAX_EPOCH'], 'batch_size': params['BATCH_SIZE'], 'homogeneous_batches': params['HOMOGENEOUS_BATCHES'], 'maxlen': params['MAX_OUTPUT_TEXT_LEN'], 'joint_batches': params['JOINT_BATCHES'], 'lr_decay': params.get('LR_DECAY', None), # LR decay parameters 'reduce_each_epochs': params.get('LR_REDUCE_EACH_EPOCHS', True), 'start_reduction_on_epoch': params.get('LR_START_REDUCTION_ON_EPOCH', 0), 'lr_gamma': params.get('LR_GAMMA', 0.9), 'lr_reducer_type': params.get('LR_REDUCER_TYPE', 'linear'), 'lr_reducer_exp_base': params.get('LR_REDUCER_EXP_BASE', 0), 'lr_half_life': params.get('LR_HALF_LIFE', 50000), 'epochs_for_save': params['EPOCHS_FOR_SAVE'], 'verbose': params['VERBOSE'], 'eval_on_sets': params['EVAL_ON_SETS_KERAS'], 'n_parallel_loaders': params['PARALLEL_LOADERS'], 'extra_callbacks': callbacks, 'reload_epoch': params['RELOAD'], 'epoch_offset': params.get('EPOCH_OFFSET', 0), 'data_augmentation': params['DATA_AUGMENTATION'], 'patience': params.get('PATIENCE', 0), # early stopping parameters 'metric_check': params.get('STOP_METRIC', None) if params.get('EARLY_STOP', False) else None, 'eval_on_epochs': params.get('EVAL_EACH_EPOCHS', True), 'each_n_epochs': params.get('EVAL_EACH', 1), 'start_eval_on_epoch': params.get('START_EVAL_ON_EPOCH', 0) } if weights_dict is not None: for layer in nmt_model.model.layers: if layer.name in weights_dict: layer.set_weights(weights_dict[layer.name]) nmt_model.trainNet(dataset, training_params) if weights_dict is not None: for layer in nmt_model.model.layers: weights_dict[layer.name] = layer.get_weights() total_end_time = timer() time_difference = total_end_time - total_start_time logging.info('In total is {0:.2f}s = {1:.2f}m'.format( time_difference, time_difference / 60.0))