def generate_feature_lists(root_dir, features_dir, features, lists_dir,
                           list_suffix, feature_extension, replace_extension,
                           splits, verbose):
    """
    :param root_dir: Base working directory.
    :param features_dir: Directory for storing the features.
    :param features: Features name.
    :param lists_dir: Directory (under --root-dir) containing the lists splitting the dataset.
    :param list_suffix: Suffix of the lists splitting the features. Will be preceded by each of the options given in splits.
    :param feature_extension: File extension of the features.
    :param replace_extension: Remove this number of characters from the feature names. Set to 4 for removing extensions such as '.png' or '.jpg' and replacing them by feature_extension.
    :param splits: Splits to create.
    :return:
    """
    create_dir_if_not_exists(root_dir + '/' + lists_dir + '/' + features)
    path_features = features_dir + '/' + features
    print("Storing features in:", root_dir + '/' + lists_dir + '/' + features)
    for split in splits:
        print('Processing split', split)
        ids = file2list(root_dir + '/' + lists_dir + '/' + split + list_suffix)
        if replace_extension > 0:
            new_ids = [path_features + '/' + split + '/' +
                       sample_id[:-replace_extension] + feature_extension
                       for sample_id in ids]
        else:
            new_ids = [path_features + '/' + split + '/' +
                       sample_id + feature_extension
                       for sample_id in ids]
        list2file(root_dir + '/' + lists_dir + '/' + features + '/' +
                  split + '_list_features.txt', new_ids)
    print('Done!')
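# A minimal usage sketch for generate_feature_lists. All paths, the feature
# name and the split names below are hypothetical; the helpers it calls
# (create_dir_if_not_exists, file2list, list2file) are assumed importable,
# e.g. from keras_wrapper.extra.read_write.
generate_feature_lists(root_dir='/data/my_dataset',      # hypothetical
                       features_dir='features',
                       features='resnet152',             # hypothetical feature name
                       lists_dir='lists',
                       list_suffix='_ids.txt',
                       feature_extension='.npy',
                       replace_extension=4,              # strips a 4-char suffix such as '.jpg'
                       splits=['train', 'val', 'test'],
                       verbose=1)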
def evaluate(self, epoch, counter_name='epoch', logs=None):
    """
    Evaluation function. Works for evaluators external to Keras.
    Computes the predictions according to the configuration and evaluates them, storing the results.

    :param epoch: Current epoch or update.
    :param counter_name: 'epoch' or 'update', string used for logging.
    :param logs:
    :return:
    """
    if logs is None:
        logs = {}
    # Change inputs and outputs mappings for evaluation
    self.changeInOutMappings()

    # Evaluate on each set separately
    all_metrics = []
    for s in self.set_name:
        # Apply model predictions
        if self.beam_search:
            params_prediction = {'max_batch_size': self.batch_size,
                                 'n_parallel_loaders': self.extra_vars.get('n_parallel_loaders', 1),
                                 'predict_on_sets': [s],
                                 'beam_batch_size': self.beam_batch_size if self.beam_batch_size is not None else self.batch_size,
                                 'pos_unk': False,
                                 'normalize': self.normalize,
                                 'normalization_type': self.normalization_type,
                                 'max_eval_samples': self.max_eval_samples}
            params_prediction.update(checkDefaultParamsBeamSearch(self.extra_vars))
            predictions_all = self.model_to_eval.predictBeamSearchNet(self.ds, params_prediction)[s]
        else:
            orig_size = self.extra_vars.get('eval_orig_size', False)
            params_prediction = {'batch_size': self.batch_size,
                                 'n_parallel_loaders': self.extra_vars.get('n_parallel_loaders', 1),
                                 'predict_on_sets': [s],
                                 'normalize': self.normalize,
                                 'normalization_type': self.normalization_type,
                                 'max_eval_samples': self.max_eval_samples,
                                 'model_name': self.model_name}
            # Convert predictions
            postprocess_fun = None
            if self.is_3DLabel:
                postprocess_fun = [self.ds.convert_3DLabels_to_bboxes,
                                   self.extra_vars[s]['references_orig_sizes']]
            elif orig_size:
                postprocess_fun = [self.ds.resize_semantic_output,
                                   self.extra_vars[s]['eval_orig_size_id']]
            predictions_all = self.model_to_eval.predictNet(self.ds, params_prediction,
                                                            postprocess_fun=postprocess_fun)[s]

        # Single-output model
        if not self.gt_pos or self.gt_pos == 0 or len(self.gt_pos) == 1:
            if len(predictions_all) != 2:
                predictions_all = [predictions_all]
            gt_positions = [0]
        # Multi-output model
        else:
            gt_positions = self.gt_pos

        # Select each output to evaluate separately
        for gt_pos, type_out, these_metrics, gt_id, write_type, index2word_y, index2word_x in zip(
                gt_positions, self.output_types, self.metric_name, self.gt_id,
                self.write_type, self.index2word_y, self.index2word_x):

            predictions = predictions_all[gt_pos]
            prediction_costs = None
            if self.verbose > 0:
                print('')
                logger.info('Prediction output ' + str(gt_pos) + ': ' + str(gt_id) + ' (' + str(type_out) + ')')

            # Postprocess outputs of type text
            if type_out == 'text':
                samples = predictions['samples']
                prediction_costs = predictions['costs']
                alphas = None
                sources = None
                if params_prediction.get('pos_unk', False):
                    alphas = predictions['alphas']
                    if eval('self.ds.loaded_raw_' + s + '[0]'):
                        sources = predictions['sources']
                    else:
                        sources = []
                        for preds in predictions['sources']:
                            for src in preds[self.input_text_id]:
                                sources.append(src)
                        sources = decode_predictions_beam_search(sources,
                                                                 index2word_x,
                                                                 pad_sequences=True,
                                                                 verbose=self.verbose)
                if self.out_pred_idx is not None:
                    samples = samples[self.out_pred_idx]
                # Convert predictions into sentences
                if self.beam_search:
                    decoded_predictions = decode_predictions_beam_search(samples,
                                                                         index2word_y,
                                                                         glossary=self.extra_vars.get('glossary', None),
                                                                         alphas=alphas,
                                                                         x_text=sources,
                                                                         heuristic=self.extra_vars.get('heuristic', 0),
                                                                         mapping=self.extra_vars.get('mapping', None),
                                                                         verbose=self.verbose)
                else:
                    probs = predictions
                    decoded_predictions = decode_predictions(predictions,
                                                             1,  # always set temperature to 1
                                                             index2word_y,
                                                             self.sampling_type,
                                                             verbose=self.verbose)
                # Apply detokenization function if needed
                if self.extra_vars.get('apply_detokenization', False):
                    decoded_predictions = list(map(self.extra_vars['detokenize_f'], decoded_predictions))

            # Postprocess outputs of type binary
            elif type_out == 'binary':
                decoded_predictions = decode_multilabel(predictions,
                                                        index2word_y,
                                                        min_val=self.min_pred_multilabel[gt_pos],
                                                        verbose=self.verbose)
                # Prepare references
                y_split = getattr(self.ds, 'Y_' + s)
                y_raw = y_split[gt_id]
                self.extra_vars[gt_pos][s]['references'] = self.ds.loadBinary(y_raw, gt_id)

            # Postprocess outputs of type 3DLabel
            elif type_out == '3DLabel':
                self.extra_vars[gt_pos][s] = dict()
                y_split = getattr(self.ds, 'Y_' + s)
                ref = y_split[gt_id]
                [ref, original_sizes] = self.ds.convert_GT_3DLabels_to_bboxes(ref)
                self.extra_vars[gt_pos][s]['references'] = ref
                self.extra_vars[gt_pos][s]['references_orig_sizes'] = original_sizes

            # Postprocess outputs of type 3DSemanticLabel
            elif type_out == '3DSemanticLabel':
                self.extra_vars[gt_pos]['eval_orig_size'] = self.eval_orig_size
                self.extra_vars[gt_pos][s] = dict()
                y_split = getattr(self.ds, 'Y_' + s)
                ref = y_split[gt_id]
                if self.eval_orig_size:
                    old_crop = copy.deepcopy(self.ds.img_size_crop)
                    self.ds.img_size_crop = copy.deepcopy(self.ds.img_size)
                    self.extra_vars[gt_pos][s]['eval_orig_size_id'] = np.array([gt_id] * len(ref))
                ref = self.ds.load_GT_3DSemanticLabels(ref, gt_id)
                if self.eval_orig_size:
                    self.ds.img_size_crop = copy.deepcopy(old_crop)
                self.extra_vars[gt_pos][s]['references'] = ref

            # Other output data types
            else:
                y_split = getattr(self.ds, 'Y_' + s)
                self.extra_vars[gt_pos][s]['references'] = y_split[gt_id]

            # Store predictions
            if self.write_samples:
                # Store result
                filepath = os.path.join(self.save_path, s + '_' + counter_name + '_' + str(epoch) +
                                        '_output_' + str(gt_pos) + '.pred')  # results file
                if write_type == 'list':
                    list2file(filepath, decoded_predictions)
                elif write_type == 'vqa':
                    try:
                        y_split = getattr(self.ds, 'Y_' + s)
                        refs = y_split[gt_id]
                    except Exception:
                        refs = ['N/A' for _ in range(probs.shape[0])]
                    extra_data_plot = {'reference': refs,
                                       'probs': probs,
                                       'vocab': index2word_y}
                    list2vqa(filepath, decoded_predictions,
                             self.extra_vars[gt_pos][s]['question_ids'],
                             extra=extra_data_plot)
                elif write_type == 'listoflists':
                    listoflists2file(filepath, decoded_predictions)
                elif write_type == 'numpy':
                    numpy2file(filepath, decoded_predictions)
                elif write_type == '3DLabels':
                    raise NotImplementedError('Write 3DLabels function is not implemented')
                elif write_type == '3DSemanticLabel':
                    folder_path = os.path.join(self.save_path, s + '_' + counter_name + '_' + str(epoch))
                    numpy2imgs(folder_path, decoded_predictions,
                               eval('self.ds.X_' + s + '["' + self.input_id + '"]'),
                               self.ds)
                else:
                    raise NotImplementedError('The store type "' + write_type + '" is not implemented.')

            # Store current epoch/iteration in model log
            self.model_to_eval.log(s, counter_name, epoch)

            # Evaluate on each metric
            for metric in these_metrics:
                if self.verbose > 0:
                    logger.info('Evaluating on metric ' + metric)
                filepath = os.path.join(self.save_path, s + '.' + metric)  # results file
                if s == 'train':
                    logger.info("WARNING: evaluation results on 'train' split might be incorrect when "
                                "applying random image shuffling.")
                # Evaluate on the chosen metric
                metrics = evaluation.select[metric](pred_list=decoded_predictions,
                                                    verbose=self.verbose,
                                                    extra_vars=self.extra_vars[gt_pos],
                                                    split=s,
                                                    costs=prediction_costs)
                self.model_to_eval.log_tensorboard(metrics, epoch, split=s)

                # Print results to file and store in model log
                with open(filepath, 'a') as f:
                    header = counter_name + ','
                    line = str(epoch) + ','
                    for metric_ in sorted(metrics):
                        value = metrics[metric_]
                        # Multiple-output model
                        if self.gt_pos and self.gt_pos != 0:
                            metric_ += '_output_' + str(gt_pos)
                        all_metrics.append(metric_)
                        header += metric_ + ','
                        line += str(value) + ','
                        # Store in model log
                        self.model_to_eval.log(s, metric_, value)
                    if not self.written_header:
                        f.write(header + '\n')
                        self.written_header = True
                    f.write(line + '\n')
                if self.verbose > 0:
                    logger.info('Done evaluating on metric ' + metric)

    # Store losses
    if logs.get('loss') is not None:
        self.model_to_eval.log('train', 'train_loss', logs['loss'])
    if logs.get('valid_loss') is not None:
        self.model_to_eval.log('val', 'val_loss', logs['valid_loss'])

    # Plot results so far
    if self.do_plot:
        if self.metric_name:
            self.model_to_eval.plot(counter_name, set(all_metrics),
                                    self.set_name, upperbound=self.max_plot)

    # Save the model
    if self.save_each_evaluation:
        from keras_wrapper.cnn_model import saveModel
        saveModel(self.model_to_eval, epoch, store_iter=not self.eval_on_epochs)

    # Recover inputs and outputs mappings for resume training
    self.recoverInOutMappings()
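# The per-metric results file written above is a plain CSV whose first column
# is the counter (epoch or update). For example, after two evaluations a file
# such as 'val.coco' would look like this (metric names and values are
# illustrative only):
#
#   epoch,Bleu_1,Bleu_4,CIDEr,METEOR,ROUGE_L,
#   1,0.61,0.22,0.55,0.27,0.49,
#   2,0.63,0.24,0.60,0.28,0.50,
#
# Note the trailing comma on every row: header and line are built by
# appending ',' after each field.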
Ross_predictions = Ross_model.predictBeamSearchNet(dataset, params_prediction)['test']
vocab = dataset.vocabulary['target_text']['idx2words']
Ross_predictions = decode_predictions_beam_search(Ross_predictions,
                                                  vocab,
                                                  verbose=params['VERBOSE'])

## see how they compare to ground truth
from keras_wrapper.extra.read_write import list2file
from keras_wrapper.extra import evaluation

Ross_path = 'Ross_M7.pred'
list2file(Ross_path, Ross_predictions)

dataset.setOutput('data/Ross_test.reply',
                  'test',
                  type='text',
                  id='target_text',
                  pad_on_batch=True,
                  tokenization='tokenize_basic',
                  sample_weights=True,
                  max_text_len=30,
                  max_words=0)
print(dataset)
keep_n_captions(dataset, repeat=1, n=1, set_names=['test'])
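# A minimal sketch of scoring the stored predictions against the references
# just registered via setOutput/keep_n_captions. The 'coco' entry of
# evaluation.select and the extra_vars layout follow the pattern used in
# apply_Video_model below; treat the exact keys as assumptions.
extra_vars = {'language': 'en',
              'test': {'references': dataset.extra_variables['test']['target_text']}}
metrics = evaluation.select['coco'](pred_list=Ross_predictions,
                                    verbose=1,
                                    extra_vars=extra_vars,
                                    split='test')
print(metrics)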
def apply_Video_model(params):
    """
    Function for using a previously trained model for sampling.
    """
    ########### Load data
    dataset = build_dataset(params)
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]
    ###########

    ########### Load model
    video_model = loadModel(params['STORE_PATH'], params['RELOAD'])
    video_model.setOptimizer()
    ###########

    ########### Apply sampling
    extra_vars = dict()
    extra_vars['tokenize_f'] = eval('dataset.' + params['TOKENIZATION_METHOD'])
    extra_vars['language'] = params.get('TRG_LAN', 'en')

    for s in params["EVAL_ON_SETS"]:
        # Apply model predictions
        params_prediction = {'max_batch_size': params['BATCH_SIZE'],
                             'n_parallel_loaders': params['PARALLEL_LOADERS'],
                             'predict_on_sets': [s]}

        # Convert predictions into sentences
        vocab = dataset.vocabulary[params['OUTPUTS_IDS_DATASET'][0]]['idx2words']

        if params['BEAM_SEARCH']:
            params_prediction['beam_size'] = params['BEAM_SIZE']
            params_prediction['maxlen'] = params['MAX_OUTPUT_TEXT_LEN_TEST']
            params_prediction['optimized_search'] = params['OPTIMIZED_SEARCH']
            params_prediction['model_inputs'] = params['INPUTS_IDS_MODEL']
            params_prediction['model_outputs'] = params['OUTPUTS_IDS_MODEL']
            params_prediction['dataset_inputs'] = params['INPUTS_IDS_DATASET']
            params_prediction['dataset_outputs'] = params['OUTPUTS_IDS_DATASET']
            params_prediction['normalize_probs'] = params['NORMALIZE_SAMPLING']
            params_prediction['alpha_factor'] = params['ALPHA_FACTOR']
            predictions = video_model.predictBeamSearchNet(dataset, params_prediction)[s]
            predictions = video_model.decode_predictions_beam_search(predictions,
                                                                     vocab,
                                                                     verbose=params['VERBOSE'])
        else:
            predictions = video_model.predictNet(dataset, params_prediction)[s]
            predictions = video_model.decode_predictions(predictions,
                                                         1,  # always set temperature to 1
                                                         vocab,
                                                         params['SAMPLING'],
                                                         verbose=params['VERBOSE'])

        # Store result
        filepath = video_model.model_path + '/' + s + '_sampling.pred'  # results file
        if params['SAMPLING_SAVE_MODE'] == 'list':
            list2file(filepath, predictions)
        else:
            raise Exception('Only "list" is allowed in "SAMPLING_SAVE_MODE"')

        # Evaluate if any metric in params['METRICS']
        for metric in params['METRICS']:
            logging.info('Evaluating on metric ' + metric)
            filepath = video_model.model_path + '/' + s + '_sampling.' + metric  # results file
            # Evaluate on the chosen metric
            extra_vars[s] = dict()
            extra_vars[s]['references'] = dataset.extra_variables[s][params['OUTPUTS_IDS_DATASET'][0]]
            metrics = evaluation.select[metric](pred_list=predictions,
                                                verbose=1,
                                                extra_vars=extra_vars,
                                                split=s)
            # Print results to file
            with open(filepath, 'w') as f:
                header = ''
                line = ''
                for metric_ in sorted(metrics):
                    value = metrics[metric_]
                    header += metric_ + ','
                    line += str(value) + ','
                f.write(header + '\n')
                f.write(line + '\n')
            logging.info('Done evaluating on metric ' + metric)
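# A minimal driver sketch for apply_Video_model, following the config.py
# pattern used elsewhere in this codebase (load_parameters returning the
# params dict is an assumption here).
if __name__ == '__main__':
    params = load_parameters()
    apply_Video_model(params)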
def score_corpus(args, params):
    """
    Use one or several translation models for scoring source--target pairs.

    :param argparse.Namespace args: Arguments given to the method:
        * dataset: Dataset instance with data.
        * source: Text file with source sentences.
        * target: Text file with target sentences.
        * splits: Splits to sample. Should be already included in the dataset object.
        * dest: Output file to save scores.
        * weights: Weight given to each model in the ensemble. You should provide the same number of weights as models. By default, it applies the same weight to each model (1/N).
        * verbose: Be verbose or not.
        * config: Config .pkl for loading the model configuration. If not specified, hyperparameters are read from config.py.
        * models: Path to the models.
    :param dict params: parameters of the translation model.
    """
    from data_engine.prepare_data import update_dataset_from_file
    from keras_wrapper.dataset import loadDataset
    from keras_wrapper.cnn_model import loadModel
    from keras_wrapper.model_ensemble import BeamSearchEnsemble

    logging.info("Using an ensemble of %d models" % len(args.models))
    models = [loadModel(m, -1, full_path=True) for m in args.models]
    dataset = loadDataset(args.dataset)
    dataset = update_dataset_from_file(dataset,
                                       args.source,
                                       params,
                                       splits=args.splits,
                                       output_text_filename=args.target,
                                       compute_state_below=True)

    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]

    # Apply scoring
    extra_vars = dict()
    extra_vars['tokenize_f'] = eval('dataset.' + params['TOKENIZATION_METHOD'])

    model_weights = args.weights
    if model_weights is not None and model_weights != []:
        assert len(model_weights) == len(models), \
            'You should give a weight to each model. You gave %d models and %d weights.' % \
            (len(models), len(model_weights))
        model_weights = list(map(float, model_weights))
        if len(model_weights) > 1:
            logger.info('Giving the following weights to each model: %s' % str(model_weights))

    for s in args.splits:
        # Apply model predictions
        params_prediction = {'max_batch_size': params['BATCH_SIZE'],
                             'n_parallel_loaders': params['PARALLEL_LOADERS'],
                             'predict_on_sets': [s]}

        if params['BEAM_SEARCH']:
            params_prediction['beam_size'] = params['BEAM_SIZE']
            params_prediction['maxlen'] = params['MAX_OUTPUT_TEXT_LEN_TEST']
            params_prediction['optimized_search'] = params['OPTIMIZED_SEARCH']
            params_prediction['model_inputs'] = params['INPUTS_IDS_MODEL']
            params_prediction['model_outputs'] = params['OUTPUTS_IDS_MODEL']
            params_prediction['dataset_inputs'] = params['INPUTS_IDS_DATASET']
            params_prediction['dataset_outputs'] = params['OUTPUTS_IDS_DATASET']
            params_prediction['normalize_probs'] = params.get('NORMALIZE_SAMPLING', False)
            params_prediction['alpha_factor'] = params.get('ALPHA_FACTOR', 1.0)
            params_prediction['coverage_penalty'] = params.get('COVERAGE_PENALTY', False)
            params_prediction['length_penalty'] = params.get('LENGTH_PENALTY', False)
            params_prediction['length_norm_factor'] = params.get('LENGTH_NORM_FACTOR', 0.0)
            params_prediction['coverage_norm_factor'] = params.get('COVERAGE_NORM_FACTOR', 0.0)
            params_prediction['pos_unk'] = params.get('POS_UNK', False)
            params_prediction['state_below_maxlen'] = -1 if params.get('PAD_ON_BATCH', True) \
                else params.get('MAX_OUTPUT_TEXT_LEN', 50)
            params_prediction['output_max_length_depending_on_x'] = params.get('MAXLEN_GIVEN_X', True)
            params_prediction['output_max_length_depending_on_x_factor'] = params.get('MAXLEN_GIVEN_X_FACTOR', 3)
            params_prediction['output_min_length_depending_on_x'] = params.get('MINLEN_GIVEN_X', True)
            params_prediction['output_min_length_depending_on_x_factor'] = params.get('MINLEN_GIVEN_X_FACTOR', 2)
            params_prediction['attend_on_output'] = params.get('ATTEND_ON_OUTPUT',
                                                               'transformer' in params['MODEL_TYPE'].lower())
            beam_searcher = BeamSearchEnsemble(models,
                                               dataset,
                                               params_prediction,
                                               model_weights=model_weights,
                                               verbose=args.verbose)
            scores = beam_searcher.scoreNet()[s]

        # Store result
        if args.dest is not None:
            filepath = args.dest  # results file
            if params['SAMPLING_SAVE_MODE'] == 'list':
                list2file(filepath, scores)
            elif params['SAMPLING_SAVE_MODE'] == 'numpy':
                numpy2file(filepath, scores)
            else:
                raise Exception('The sampling mode ' + params['SAMPLING_SAVE_MODE'] +
                                ' is not currently supported.')
        else:
            print(scores)
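# A minimal invocation sketch (hypothetical paths). In the actual scripts,
# `args` comes from argparse; argparse.Namespace is used here to mimic it.
from argparse import Namespace

args = Namespace(dataset='datasets/Dataset_mydata.pkl',   # hypothetical
                 source='data/test.src',
                 target='data/test.trg',
                 splits=['test'],
                 dest='test.scores',
                 weights=[],
                 verbose=1,
                 config=None,
                 models=['trained_models/model1/epoch_10'])
score_corpus(args, params)  # params loaded beforehand, e.g. from config.py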
def apply_Feature_Extractor_model(params, dataset=None, extractor_model=None):
    """
    Function for using a previously trained model for sampling.
    """
    ########### Load data
    if dataset is None:
        dataset = build_dataset(params)

    ########### Load model
    # Only load or build a model when none was passed in.
    if extractor_model is None:
        if params['RELOAD'] > 0:
            extractor_model = loadModel(params['STORE_PATH'], params['RELOAD'])
        else:
            extractor_model = Feature_Extractor(params,
                                                type=params['MODEL_TYPE'],
                                                verbose=params['VERBOSE'],
                                                model_name=params['MODEL_NAME'],
                                                store_path=params['STORE_PATH'])

    # Define the inputs and outputs mapping from our Dataset instance to our model
    inputMapping = dict()
    for i, id_in in enumerate(params['INPUTS_IDS_DATASET']):
        if len(extractor_model.ids_inputs) > i:
            pos_source = dataset.ids_inputs.index(id_in)
            id_dest = extractor_model.ids_inputs[i]
            inputMapping[id_dest] = pos_source
    extractor_model.setInputsMapping(inputMapping)

    ########### Apply sampling
    extra_vars = dict()
    for s in params["EVAL_ON_SETS"]:
        # Apply model predictions
        params_prediction = {'batch_size': params['BATCH_SIZE'],
                             'n_parallel_loaders': params['PARALLEL_LOADERS'],
                             'predict_on_sets': [s],
                             'verbose': 0}
        logging.info("<<< Predicting outputs of " + s + " set >>>")
        if params['SAMPLING_SAVE_MODE'] == 'list':
            filepath = extractor_model.model_path + '/' + s + '_sampling.pred'  # results file
            list2file(filepath, [], permission='w')
        start_time = time.time()
        eta = -1
        mode = 'w'
        n_samples_split = getattr(dataset, 'len_' + s)
        for n_sample in range(0, n_samples_split, params.get('PREDICTION_STEP', 100)):
            params_prediction['init_sample'] = n_sample
            params_prediction['final_sample'] = min(n_sample + params.get('PREDICTION_STEP', 100),
                                                    n_samples_split)
            predictions = extractor_model.predictNet(dataset, params_prediction)[s]
            # Store result
            if params['SAMPLING_SAVE_MODE'] == 'list':
                filepath = extractor_model.model_path + '/' + s + '_sampling.pred'  # results file
                list2file(filepath, predictions, permission='a')
            elif params['SAMPLING_SAVE_MODE'] == 'npy':
                filepath = extractor_model.model_path + '/' + s + '_' + params.get('MODEL_TYPE', '') + '_features.npy'
                numpy2file(filepath, predictions, permission=mode)
            elif params['SAMPLING_SAVE_MODE'] == 'hdf5':
                filepath = extractor_model.model_path + '/' + s + '_' + params.get('MODEL_TYPE', '') + '_features.hdf5'
                numpy2hdf5(filepath, predictions, permission=mode)
            else:
                raise Exception('Only "list", "npy" or "hdf5" are allowed in "SAMPLING_SAVE_MODE"')
            mode = 'a'
            sys.stdout.write('\r')
            sys.stdout.write("\t Processed %d/%d - ETA: %ds " % (n_sample, n_samples_split, int(eta)))
            sys.stdout.flush()
            eta = (n_samples_split - n_sample) * (time.time() - start_time) / max(n_sample, 1)
def interactive_translation(src_seq,
                            src_line,
                            trg_line,
                            params_prediction,
                            args,
                            tokenize_f,
                            index2word_y,
                            word2index_y,
                            index2word_x,
                            word2index_x,
                            unk_id,
                            total_errors,
                            total_mouse_actions,
                            n_line=-1):
    # Note: this function relies on `interactive_beam_searcher`, `mapping` and
    # the current split `s` being defined at module level.
    errors_sentence = 0
    mouse_actions_sentence = 0
    hypothesis_number = 0
    # Get (tokenized) input
    tokenized_reference = tokenize_f(trg_line) if args.tokenize_references else trg_line

    # Get reference as desired by the user, i.e. detokenized if necessary
    reference = params_prediction['detokenize_f'](tokenized_reference) if \
        args.detokenize_bpe else tokenized_reference

    # Detokenize line for nicer logging :)
    if args.detokenize_bpe:
        src_line = params_prediction['detokenize_f'](src_line)

    logger.debug(u'\n\nProcessing sentence %d' % n_line)
    logger.debug(u'Source: %s' % src_line)
    logger.debug(u'Target: %s' % reference)

    # 1. Get a first hypothesis
    trans_indices, costs, alphas = interactive_beam_searcher.sample_beam_search_interactive(src_seq)

    # 1.1 Set unk replacement strategy
    if params_prediction['pos_unk']:
        alphas = [alphas]
        sources = [src_line]  # source sentence used for unk replacement
        heuristic = params_prediction['heuristic']
    else:
        alphas = None
        heuristic = None
        sources = None

    # 1.2 Decode hypothesis
    hypothesis = decode_predictions_beam_search([trans_indices],
                                                index2word_y,
                                                alphas=alphas,
                                                x_text=sources,
                                                heuristic=heuristic,
                                                mapping=mapping,
                                                pad_sequences=True,
                                                verbose=0)[0]
    # 1.3 Store result (optional)
    hypothesis = params_prediction['detokenize_f'](hypothesis) \
        if params_prediction.get('apply_detokenization', False) else hypothesis
    if args.original_dest is not None:
        filepath = args.original_dest  # results file
        if params_prediction['SAMPLING_SAVE_MODE'] == 'list':
            list2file(filepath, [hypothesis + '\n'], permission='a')
        else:
            raise Exception('Only "list" is allowed in "SAMPLING_SAVE_MODE"')
    logger.debug(u'Hypo_%d: %s' % (hypothesis_number, hypothesis))

    # 2.0 Interactive translation
    if hypothesis == reference:
        # 2.1 If the sentence is correct, we validate it
        pass
    else:
        # 2.2 Wrong hypothesis -> Interactively translate the sentence
        correct_hypothesis = False
        last_correct_pos = 0
        while not correct_hypothesis:
            # 2.2.1 Empty data structures for the next sentence
            fixed_words_user = OrderedDict()
            unk_words_dict = OrderedDict()
            if not args.prefix:
                raise NotImplementedError('Segment-based interaction at'
                                          ' character level is still unimplemented')
            else:
                isle_indices = []
                unks_in_isles = []

            # 2.2.2 Compute longest common character prefix (LCCP)
            next_correction_pos, validated_prefix = common_prefix(hypothesis, reference)
            if next_correction_pos == len(reference):
                correct_hypothesis = True
                break
            # 2.2.3 Get next correction by checking against the reference
            next_correction = reference[next_correction_pos]

            # 2.2.4 Tokenize the prefix properly (possibly applying BPE)
            tokenized_validated_prefix = tokenize_f(validated_prefix + next_correction)

            # 2.2.5 Validate words
            for pos, word in enumerate(tokenized_validated_prefix.split()):
                fixed_words_user[pos] = word2index_y.get(word, unk_id)
                if word2index_y.get(word) is None:
                    unk_words_dict[pos] = word

            # 2.2.6 Constrain search for the last word
            last_user_word_pos = list(fixed_words_user.keys())[-1]
            if next_correction != u' ':
                last_user_word = tokenized_validated_prefix.split()[-1]
                filtered_idx2word = dict((word2index_y[candidate_word], candidate_word)
                                         for candidate_word in word2index_y
                                         if candidate_word[:len(last_user_word)] == last_user_word)
                if filtered_idx2word != dict():
                    del fixed_words_user[last_user_word_pos]
                    if last_user_word_pos in unk_words_dict.keys():
                        del unk_words_dict[last_user_word_pos]
            else:
                filtered_idx2word = dict()

            logger.debug(u'"%s" to character %d.' % (next_correction, next_correction_pos))

            # 2.2.7 Generate a hypothesis compatible with the feedback provided by the user
            hypothesis = generate_constrained_hypothesis(interactive_beam_searcher,
                                                         src_seq,
                                                         fixed_words_user,
                                                         params_prediction,
                                                         args,
                                                         isle_indices,
                                                         filtered_idx2word,
                                                         index2word_y,
                                                         sources,
                                                         heuristic,
                                                         mapping,
                                                         unk_words_dict.keys(),
                                                         unk_words_dict.values(),
                                                         unks_in_isles)
            hypothesis_number += 1
            hypothesis = u' '.join(hypothesis)  # Hypothesis is unicode
            hypothesis = params_prediction['detokenize_f'](hypothesis) \
                if args.detokenize_bpe else hypothesis
            logger.debug(u'Target: %s' % reference)
            logger.debug(u"Hypo_%d: %s" % (hypothesis_number, hypothesis))
            # 2.2.8 Add a keystroke
            errors_sentence += 1
            # 2.2.9 Add a mouse action if we moved the pointer
            if next_correction_pos - last_correct_pos > 1:
                mouse_actions_sentence += 1
            last_correct_pos = next_correction_pos

        # 2.3 Final check: The reference is a subset of the hypothesis: Cut the hypothesis
        if len(reference) < len(hypothesis):
            hypothesis = hypothesis[:len(reference)]
            errors_sentence += 1
            logger.debug("Cutting hypothesis")

        # 2.4 Security assertion
        assert hypothesis == reference, "Error: The final hypothesis does not match with the reference! \n" \
                                        "\t Split: %s \n" \
                                        "\t Sentence: %d \n" \
                                        "\t Hypothesis: %s\n" \
                                        "\t Reference: %s" % (str(s), n_line, hypothesis, reference)

    # 3. Update user effort counters
    mouse_actions_sentence += 1  # This +1 is the validation action
    chars_sentence = len(hypothesis)
    total_errors += errors_sentence
    total_mouse_actions += mouse_actions_sentence

    # 3.1 Log some info
    logger.debug(u"Final hypothesis: %s" % hypothesis)
    logger.debug("%d errors. "
                 "Sentence WSR: %4f. "
                 "Sentence mouse strokes: %d "
                 "Sentence MAR: %4f. "
                 "Sentence MAR_c: %4f. "
                 "Sentence KSMR: %4f. " %
                 (errors_sentence,
                  float(errors_sentence) / len(hypothesis),
                  mouse_actions_sentence,
                  float(mouse_actions_sentence) / len(hypothesis),
                  float(mouse_actions_sentence) / chars_sentence,
                  float(errors_sentence + mouse_actions_sentence) / chars_sentence))
    # 5 Write correct sentences into a file
    return hypothesis, total_errors, total_mouse_actions
def interactive_simulation():
    args = parse_args()
    # Update parameters
    if args.config is not None:
        logger.info('Reading parameters from %s.' % args.config)
        params = update_parameters({}, pkl2dict(args.config))
    else:
        logger.info('Reading parameters from config.py.')
        params = load_parameters()
    if args.online:
        from config_online import load_parameters as load_parameters_online
        online_parameters = load_parameters_online(params)
        params = update_parameters(params, online_parameters)
    try:
        for arg in args.changes:
            try:
                k, v = arg.split('=')
            except ValueError:
                print('Overwritten arguments must have the form key=Value. \n Currently are: %s' % str(args.changes))
                exit(1)
            try:
                params[k] = ast.literal_eval(v)
            except ValueError:
                params[k] = v
    except ValueError:
        print('Error processing arguments: (', k, ",", v, ")")
        exit(2)

    check_params(params)
    if args.verbose:
        logging.info("params = " + str(params))
    dataset = loadDataset(args.dataset)
    # dataset = update_dataset_from_file(dataset, args.source, params, splits=args.splits, remove_outputs=True)
    # Dataset backwards compatibility
    bpe_separator = dataset.BPE_separator if hasattr(dataset, "BPE_separator") and \
        dataset.BPE_separator is not None else u'@@'
    # Set tokenization method
    params['TOKENIZATION_METHOD'] = 'tokenize_bpe' if args.tokenize_bpe else \
        params.get('TOKENIZATION_METHOD', 'tokenize_none')
    # Build BPE tokenizer if necessary
    if 'bpe' in params['TOKENIZATION_METHOD'].lower():
        logger.info('Building BPE')
        if not dataset.BPE_built:
            dataset.build_bpe(params.get('BPE_CODES_PATH', params['DATA_ROOT_PATH'] + '/training_codes.joint'),
                              separator=bpe_separator)
    # Build tokenization function
    tokenize_f = eval('dataset.' + params.get('TOKENIZATION_METHOD', 'tokenize_none'))

    if args.online:
        # Training params
        params_training = {
            'n_epochs': params['MAX_EPOCH'],
            'shuffle': False,
            'loss': params.get('LOSS', 'categorical_crossentropy'),
            'batch_size': params.get('BATCH_SIZE', 1),
            'homogeneous_batches': False,
            'optimizer': params.get('OPTIMIZER', 'SGD'),
            'lr': params.get('LR', 0.1),
            'lr_decay': params.get('LR_DECAY', None),
            'lr_gamma': params.get('LR_GAMMA', 1.),
            'epochs_for_save': -1,
            'verbose': args.verbose,
            'eval_on_sets': params['EVAL_ON_SETS_KERAS'],
            'n_parallel_loaders': params['PARALLEL_LOADERS'],
            'extra_callbacks': [],  # callbacks,
            'reload_epoch': 0,
            'epoch_offset': 0,
            'data_augmentation': params['DATA_AUGMENTATION'],
            'patience': params.get('PATIENCE', 0),
            'metric_check': params.get('STOP_METRIC', None),
            'eval_on_epochs': params.get('EVAL_EACH_EPOCHS', True),
            'each_n_epochs': params.get('EVAL_EACH', 1),
            'start_eval_on_epoch': params.get('START_EVAL_ON_EPOCH', 0),
            'additional_training_settings': {'k': params.get('K', 1),
                                             'tau': params.get('TAU', 1),
                                             'lambda': params.get('LAMBDA', 0.5),
                                             'c': params.get('C', 0.5),
                                             'd': params.get('D', 0.5)}
        }
    else:
        params_training = dict()

    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]
    logger.info("<<< Using an ensemble of %d models >>>" % len(args.models))
    if args.online:
        # Load trainable model(s)
        logging.info('Loading models from %s' % str(args.models))
        model_instances = [Captioning_Model(params,
                                            model_type=params['MODEL_TYPE'],
                                            verbose=params['VERBOSE'],
                                            model_name=params['MODEL_NAME'] + '_' + str(i),
                                            vocabularies=dataset.vocabulary,
                                            store_path=params['STORE_PATH'],
                                            clear_dirs=False,
                                            set_optimizer=False)
                           for i in range(len(args.models))]
        models = [updateModel(model, path, -1, full_path=True)
                  for (model, path) in zip(model_instances, args.models)]

        # Set additional inputs to models if using a custom loss function
        params['USE_CUSTOM_LOSS'] = True if 'PAS' in params['OPTIMIZER'] else False
        if params['N_BEST_OPTIMIZER']:
            logging.info('Using N-best optimizer')
        models = build_online_models(models, params)
        online_trainer = OnlineTrainer(models,
                                       dataset,
                                       None,
                                       None,
                                       params_training,
                                       verbose=args.verbose)
    else:
        # Otherwise, load regular model(s)
        models = [loadModel(m, -1, full_path=True) for m in args.models]

    # Load text files
    logger.info("<<< Storing corrected hypotheses into: %s >>>" % str(args.dest))
    ftrans = open(args.dest, 'w')
    ftrans.close()

    # Do we want to save the original sentences?
    if args.original_dest is not None:
        logger.info("<<< Storing original hypotheses into: %s >>>" % str(args.original_dest))
        ftrans_ori = open(args.original_dest, 'w')
        ftrans_ori.close()

    if args.references is not None:
        ftrg = codecs.open(args.references, 'r', encoding='utf-8')  # File with post-edited (or reference) sentences.
        all_references = ftrg.read().split('\n')
        if all_references[-1] == u'':
            all_references = all_references[:-1]

    # Get word2index and index2word dictionaries
    index2word_y = dataset.vocabulary[params['OUTPUTS_IDS_DATASET'][0]]['idx2words']
    word2index_y = dataset.vocabulary[params['OUTPUTS_IDS_DATASET'][0]]['words2idx']
    unk_id = dataset.extra_words['<unk>']

    # Initialize counters
    total_errors = 0
    total_words = 0
    total_chars = 0
    total_mouse_actions = 0
    try:
        for s in args.splits:
            # Apply model predictions
            params_prediction = {'max_batch_size': params['BATCH_SIZE'],
                                 'n_parallel_loaders': params['PARALLEL_LOADERS'],
                                 'predict_on_sets': [s],
                                 'beam_size': params['BEAM_SIZE'],
                                 'maxlen': params['MAX_OUTPUT_TEXT_LEN_TEST'],
                                 'optimized_search': params['OPTIMIZED_SEARCH'],
                                 'model_inputs': params['INPUTS_IDS_MODEL'],
                                 'model_outputs': params['OUTPUTS_IDS_MODEL'],
                                 'dataset_inputs': params['INPUTS_IDS_DATASET'],
                                 'dataset_outputs': params['OUTPUTS_IDS_DATASET'],
                                 'normalize_probs': params.get('NORMALIZE_SAMPLING', False),
                                 'alpha_factor': params.get('ALPHA_FACTOR', 1.0),
                                 'normalize': params.get('NORMALIZATION', False),
                                 'normalization_type': params.get('NORMALIZATION_TYPE', None),
                                 'data_augmentation': params.get('DATA_AUGMENTATION', False),
                                 'mean_substraction': params.get('MEAN_SUBTRACTION', False),
                                 'wo_da_patch_type': params.get('WO_DA_PATCH_TYPE', 'whole'),
                                 'da_patch_type': params.get('DA_PATCH_TYPE', 'resize_and_rndcrop'),
                                 'da_enhance_list': params.get('DA_ENHANCE_LIST', None),
                                 'heuristic': params.get('HEURISTIC', None),
                                 'search_pruning': params.get('SEARCH_PRUNING', False),
                                 'state_below_index': -1,
                                 'output_text_index': 0,
                                 'apply_tokenization': params.get('APPLY_TOKENIZATION', False),
                                 'tokenize_f': eval('dataset.' + params.get('TOKENIZATION_METHOD', 'tokenize_none')),
                                 'apply_detokenization': params.get('APPLY_DETOKENIZATION', True),
                                 'detokenize_f': eval('dataset.' + params.get('DETOKENIZATION_METHOD', 'detokenize_none')),
                                 'coverage_penalty': params.get('COVERAGE_PENALTY', False),
                                 'length_penalty': params.get('LENGTH_PENALTY', False),
                                 'length_norm_factor': params.get('LENGTH_NORM_FACTOR', 0.0),
                                 'coverage_norm_factor': params.get('COVERAGE_NORM_FACTOR', 0.0),
                                 'pos_unk': False,
                                 'state_below_maxlen': -1 if params.get('PAD_ON_BATCH', True)
                                 else params.get('MAX_OUTPUT_TEXT_LEN_TEST', 50),
                                 'output_max_length_depending_on_x': params.get('MAXLEN_GIVEN_X', False),
                                 'output_max_length_depending_on_x_factor': params.get('MAXLEN_GIVEN_X_FACTOR', 3),
                                 'output_min_length_depending_on_x': params.get('MINLEN_GIVEN_X', False),
                                 'output_min_length_depending_on_x_factor': params.get('MINLEN_GIVEN_X_FACTOR', 2),
                                 'attend_on_output': params.get('ATTEND_ON_OUTPUT',
                                                                'transformer' in params['MODEL_TYPE'].lower()),
                                 'n_best_optimizer': params.get('N_BEST_OPTIMIZER', False)}

            # Build interactive sampler
            interactive_beam_searcher = InteractiveBeamSearchSampler(models,
                                                                     dataset,
                                                                     params_prediction,
                                                                     excluded_words=None,
                                                                     verbose=args.verbose)
            start_time = time.time()

            if args.verbose:
                logging.info("Params prediction = " + str(params_prediction))
                if args.online:
                    logging.info("Params training = " + str(params_training))

            n_samples = getattr(dataset, 'len_' + s)
            if args.references is None:
                all_references = dataset.extra_variables[s][params['OUTPUTS_IDS_DATASET'][0]]

            # Start to translate the source file interactively
            for n_sample in range(n_samples):
                errors_sentence = 0
                mouse_actions_sentence = 0
                hypothesis_number = 0
                # Load data from dataset
                current_input = dataset.getX_FromIndices(s,
                                                         [n_sample],
                                                         normalization_type=params_prediction.get('normalization_type'),
                                                         normalization=params_prediction.get('normalize', False),
                                                         dataAugmentation=params_prediction.get('data_augmentation', False),
                                                         wo_da_patch_type=params_prediction.get('wo_da_patch_type', 'whole'),
                                                         da_patch_type=params_prediction.get('da_patch_type', 'resize_and_rndcrop'),
                                                         da_enhance_list=params_prediction.get('da_enhance_list', None))[0][0]

                # Load references
                references = all_references[n_sample]
                tokenized_references = list(map(tokenize_f, references)) \
                    if args.tokenize_references else references

                # Get reference as desired by the user, i.e. detokenized if necessary
                reference = list(map(params_prediction['detokenize_f'], tokenized_references)) if \
                    args.detokenize_bpe else tokenized_references

                # Detokenize line for nicer logging :)
                logger.debug(u'\n\nProcessing sample %d' % (n_sample + 1))
                logger.debug(u'Target: %s' % reference)

                # 1. Get a first hypothesis
                trans_indices, costs, alphas = interactive_beam_searcher.sample_beam_search_interactive(current_input)

                # 1.2 Decode hypothesis
                hypothesis = decode_predictions_beam_search([trans_indices],
                                                            index2word_y,
                                                            pad_sequences=True,
                                                            verbose=0)[0]
                # 1.3 Store result (optional)
                hypothesis = params_prediction['detokenize_f'](hypothesis) \
                    if params_prediction.get('apply_detokenization', False) else hypothesis
                if args.original_dest is not None:
                    if params['SAMPLING_SAVE_MODE'] == 'list':
                        list2file(args.original_dest, [hypothesis], permission='a')
                    else:
                        raise Exception('Only "list" is allowed in "SAMPLING_SAVE_MODE"')
                logger.debug(u'Hypo_%d: %s' % (hypothesis_number, hypothesis))

                # 2.0 Interactive translation
                if hypothesis in tokenized_references:
                    # 2.1 If the sentence is correct, we validate it
                    pass
                else:
                    # 2.2 Wrong hypothesis -> Interactively translate the sentence
                    correct_hypothesis = False
                    last_correct_pos = 0
                    while not correct_hypothesis:
                        # 2.2.1 Empty data structures for the next sentence
                        fixed_words_user = OrderedDict()
                        unk_words_dict = OrderedDict()
                        isle_indices = []
                        unks_in_isles = []
                        if args.prefix:
                            # 2.2.2 Compute longest common character prefix (LCCP)
                            reference_idx, next_correction_pos, validated_prefix = common_prefixes(hypothesis, tokenized_references)
                        else:
                            # 2.2.2 Compute common character segments  # TODO
                            next_correction_pos, validated_prefix, validated_segments = common_segments(hypothesis, reference)
                        reference = tokenized_references[reference_idx]
                        if next_correction_pos == len(reference):
                            correct_hypothesis = True
                            break
                        # 2.2.3 Get next correction by checking against the reference
                        next_correction = reference[next_correction_pos]
                        # 2.2.4 Tokenize the prefix properly (possibly applying BPE)
                        tokenized_validated_prefix = tokenize_f(validated_prefix + next_correction)
                        # 2.2.5 Validate words
                        for pos, word in enumerate(tokenized_validated_prefix.split()):
                            fixed_words_user[pos] = word2index_y.get(word, unk_id)
                            if word2index_y.get(word) is None:
                                unk_words_dict[pos] = word
                        # 2.2.6 Constrain search for the last word
                        last_user_word_pos = list(fixed_words_user.keys())[-1]
                        if next_correction != u' ':
                            last_user_word = tokenized_validated_prefix.split()[-1]
                            filtered_idx2word = dict((word2index_y[candidate_word], candidate_word)
                                                     for candidate_word in word2index_y
                                                     if candidate_word[:len(last_user_word)] == last_user_word)
                            if filtered_idx2word != dict():
                                del fixed_words_user[last_user_word_pos]
                                if last_user_word_pos in unk_words_dict.keys():
                                    del unk_words_dict[last_user_word_pos]
                        else:
                            filtered_idx2word = dict()
                        logger.debug(u'"%s" to character %d.' % (next_correction, next_correction_pos))

                        # 2.2.7 Generate a hypothesis compatible with the feedback provided by the user
                        hypothesis = generate_constrained_hypothesis(interactive_beam_searcher,
                                                                     current_input,
                                                                     fixed_words_user,
                                                                     params_prediction,
                                                                     args,
                                                                     isle_indices,
                                                                     filtered_idx2word,
                                                                     index2word_y,
                                                                     None,
                                                                     None,
                                                                     None,
                                                                     unk_words_dict.keys(),
                                                                     unk_words_dict.values(),
                                                                     unks_in_isles)
                        hypothesis_number += 1
                        hypothesis = u' '.join(hypothesis)  # Hypothesis is unicode
                        hypothesis = params_prediction['detokenize_f'](hypothesis) \
                            if args.detokenize_bpe else hypothesis
                        logger.debug(u'Target: %s' % reference)
                        logger.debug(u"Hypo_%d: %s" % (hypothesis_number, hypothesis))
                        # 2.2.8 Add a keystroke
                        errors_sentence += 1
                        # 2.2.9 Add a mouse action if we moved the pointer
                        if next_correction_pos - last_correct_pos > 1:
                            mouse_actions_sentence += 1
                        last_correct_pos = next_correction_pos

                    # 2.3 Final check: The reference is a subset of the hypothesis: Cut the hypothesis
                    if len(reference) < len(hypothesis):
                        hypothesis = hypothesis[:len(reference)]
                        errors_sentence += 1
                        logger.debug(u"Cutting hypothesis")

                # 2.4 Security assertion
                assert hypothesis in references, "Error: The final hypothesis does not match with the reference! \n" \
                                                 "\t Split: %s \n" \
                                                 "\t Sentence: %d \n" \
                                                 "\t Hypothesis: %s\n" \
                                                 "\t Reference: %s" % (s, n_sample + 1, hypothesis, reference)

                # 3. Update user effort counters
                mouse_actions_sentence += 1  # This +1 is the validation action
                chars_sentence = len(hypothesis)
                total_errors += errors_sentence
                total_words += len(hypothesis.split())
                total_chars += chars_sentence
                total_mouse_actions += mouse_actions_sentence

                # 3.1 Log some info
                logger.debug(u"Final hypothesis: %s" % hypothesis)
                logger.debug(u"%d errors. "
                             u"Sentence WSR: %4f. "
                             u"Sentence mouse strokes: %d "
                             u"Sentence MAR: %4f. "
                             u"Sentence MAR_c: %4f. "
                             u"Sentence KSMR: %4f. "
                             u"Accumulated (should only be considered for debugging purposes!) "
                             u"WSR: %4f. "
                             u"MAR: %4f. "
                             u"MAR_c: %4f. "
                             u"KSMR: %4f.\n\n\n\n" %
                             (errors_sentence,
                              float(errors_sentence) / len(hypothesis),
                              mouse_actions_sentence,
                              float(mouse_actions_sentence) / len(hypothesis),
                              float(mouse_actions_sentence) / chars_sentence,
                              float(errors_sentence + mouse_actions_sentence) / chars_sentence,
                              float(total_errors) / total_words,
                              float(total_mouse_actions) / total_words,
                              float(total_mouse_actions) / total_chars,
                              float(total_errors + total_mouse_actions) / total_chars))

                # 4. If we are performing OL after each correct sample:
                if args.online:
                    # 4.1 Compute model inputs
                    # 4.1.1 Source text -> Already computed (used for the INMT process)
                    # 4.1.2 State below
                    state_below = dataset.loadText([reference],
                                                   vocabularies=dataset.vocabulary[params['OUTPUTS_IDS_DATASET'][0]],
                                                   max_len=params['MAX_OUTPUT_TEXT_LEN_TEST'],
                                                   offset=1,
                                                   fill=dataset.fill_text[params['INPUTS_IDS_DATASET'][-1]],
                                                   pad_on_batch=dataset.pad_on_batch[params['INPUTS_IDS_DATASET'][-1]],
                                                   words_so_far=False,
                                                   loading_X=True)[0]
                    # 4.1.3 Ground truth sample -> Interactively translated sentence
                    trg_seq = dataset.loadTextOneHot([reference],
                                                     vocabularies=dataset.vocabulary[params['OUTPUTS_IDS_DATASET'][0]],
                                                     vocabulary_len=dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]],
                                                     max_len=params['MAX_OUTPUT_TEXT_LEN_TEST'],
                                                     offset=0,
                                                     fill=dataset.fill_text[params['OUTPUTS_IDS_DATASET'][0]],
                                                     pad_on_batch=dataset.pad_on_batch[params['OUTPUTS_IDS_DATASET'][0]],
                                                     words_so_far=False,
                                                     sample_weights=params['SAMPLE_WEIGHTS'],
                                                     loading_X=False)
                    # 4.2 Train online!
                    online_trainer.train_online([np.asarray([current_input]), state_below],
                                                trg_seq,
                                                trg_words=[reference])

                # 5 Write correct sentences into a file
                list2file(args.dest, [hypothesis], permission='a')

                if (n_sample + 1) % 50 == 0:
                    logger.info(u"%d sentences processed" % (n_sample + 1))
                    logger.info(u"Current speed is {} per sentence".format((time.time() - start_time) / (n_sample + 1)))
                    logger.info(u"Current WSR is: %f" % (float(total_errors) / total_words))
                    logger.info(u"Current MAR is: %f" % (float(total_mouse_actions) / total_words))
                    logger.info(u"Current MAR_c is: %f" % (float(total_mouse_actions) / total_chars))
                    logger.info(u"Current KSMR is: %f" % (float(total_errors + total_mouse_actions) / total_chars))

        # 6. Final!
        # 6.1 Log some information
        print(u"Total number of errors:", total_errors)
        print(u"Total number selections", total_mouse_actions)
        print(u"WSR: %f" % (float(total_errors) / total_words))
        print(u"MAR: %f" % (float(total_mouse_actions) / total_words))
        print(u"MAR_c: %f" % (float(total_mouse_actions) / total_chars))
        print(u"KSMR: %f" % (float(total_errors + total_mouse_actions) / total_chars))

    except KeyboardInterrupt:
        print(u'Interrupted!')
        print(u"Total number of corrections (up to now):", total_errors)
        print(u"WSR: %f" % (float(total_errors) / total_words))
        print(u"MAR: %f" % (float(total_mouse_actions) / total_words))
        print(u"MAR_c: %f" % (float(total_mouse_actions) / total_chars))
        print(u"KSMR: %f" % (float(total_errors + total_mouse_actions) / total_chars))
def sample_ensemble(args, params):
    """
    Use several translation models for obtaining predictions from a source text file.

    :param argparse.Namespace args: Arguments given to the method:
        * dataset: Dataset instance with data.
        * text: Text file with source sentences.
        * splits: Splits to sample. Should already be included in the dataset object.
        * dest: Output file to save scores.
        * weights: Weight given to each model in the ensemble. You should provide
          the same number of weights as models. By default, it applies the same
          weight to each model (1/N).
        * n_best: Write n-best list (n = beam size).
        * config: Config .pkl for loading the model configuration. If not specified,
          hyperparameters are read from config.py.
        * models: Path to the models.
        * verbose: Be verbose or not.
    :param params: Parameters of the translation model.
    """
    from data_engine.prepare_data import update_dataset_from_file
    from keras_wrapper.model_ensemble import BeamSearchEnsemble
    from keras_wrapper.cnn_model import loadModel
    from keras_wrapper.dataset import loadDataset
    from keras_wrapper.utils import decode_predictions_beam_search

    logger.info("Using an ensemble of %d models" % len(args.models))
    models = [loadModel(m, -1, full_path=True) for m in args.models]
    dataset = loadDataset(args.dataset)
    dataset = update_dataset_from_file(dataset, args.text, params,
                                       splits=args.splits,
                                       remove_outputs=True)
    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]

    # For converting predictions into sentences
    index2word_y = dataset.vocabulary[params['OUTPUTS_IDS_DATASET'][0]]['idx2words']

    if params.get('APPLY_DETOKENIZATION', False):
        detokenize_function = eval('dataset.' + params['DETOKENIZATION_METHOD'])

    params_prediction = dict()
    params_prediction['max_batch_size'] = params.get('BATCH_SIZE', 20)
    params_prediction['n_parallel_loaders'] = params.get('PARALLEL_LOADERS', 1)
    params_prediction['beam_size'] = params.get('BEAM_SIZE', 6)
    params_prediction['maxlen'] = params.get('MAX_OUTPUT_TEXT_LEN_TEST', 100)
    params_prediction['optimized_search'] = params['OPTIMIZED_SEARCH']
    params_prediction['model_inputs'] = params['INPUTS_IDS_MODEL']
    params_prediction['model_outputs'] = params['OUTPUTS_IDS_MODEL']
    params_prediction['dataset_inputs'] = params['INPUTS_IDS_DATASET']
    params_prediction['dataset_outputs'] = params['OUTPUTS_IDS_DATASET']
    params_prediction['search_pruning'] = params.get('SEARCH_PRUNING', False)
    params_prediction['normalize_probs'] = params.get('NORMALIZE_SAMPLING', False)
    params_prediction['alpha_factor'] = params.get('ALPHA_FACTOR', 1.0)
    params_prediction['coverage_penalty'] = params.get('COVERAGE_PENALTY', False)
    params_prediction['length_penalty'] = params.get('LENGTH_PENALTY', False)
    params_prediction['length_norm_factor'] = params.get('LENGTH_NORM_FACTOR', 0.0)
    params_prediction['coverage_norm_factor'] = params.get('COVERAGE_NORM_FACTOR', 0.0)
    params_prediction['pos_unk'] = params.get('POS_UNK', False)
    params_prediction['state_below_maxlen'] = -1 if params.get('PAD_ON_BATCH', True) \
        else params.get('MAX_OUTPUT_TEXT_LEN', 50)
    params_prediction['output_max_length_depending_on_x'] = params.get('MAXLEN_GIVEN_X', True)
    params_prediction['output_max_length_depending_on_x_factor'] = params.get('MAXLEN_GIVEN_X_FACTOR', 3)
    params_prediction['output_min_length_depending_on_x'] = params.get('MINLEN_GIVEN_X', True)
    params_prediction['output_min_length_depending_on_x_factor'] = params.get('MINLEN_GIVEN_X_FACTOR', 2)
    params_prediction['attend_on_output'] = params.get('ATTEND_ON_OUTPUT',
                                                       'transformer' in params['MODEL_TYPE'].lower())
    params_prediction['glossary'] = params.get('GLOSSARY', None)

    heuristic = params.get('HEURISTIC', 0)
    mapping = None if dataset.mapping == dict() else dataset.mapping
    model_weights = args.weights

    if args.glossary is not None:
        glossary = pkl2dict(args.glossary)
    elif params_prediction['glossary'] is not None:
        glossary = pkl2dict(params_prediction['glossary'])
    else:
        glossary = None

    if model_weights:
        assert len(model_weights) == len(models), \
            'You should give a weight to each model. You gave %d models and %d weights.' % \
            (len(models), len(model_weights))
        model_weights = list(map(float, model_weights))
        if len(model_weights) > 1:
            logger.info('Giving the following weights to each model: %s' % str(model_weights))

    for s in args.splits:
        # Apply model predictions
        params_prediction['predict_on_sets'] = [s]
        beam_searcher = BeamSearchEnsemble(models,
                                           dataset,
                                           params_prediction,
                                           model_weights=model_weights,
                                           n_best=args.n_best,
                                           verbose=args.verbose)
        predictions = beam_searcher.predictBeamSearchNet()[s]
        samples = predictions['samples']
        alphas = predictions['alphas'] if params_prediction['pos_unk'] else None

        if params_prediction['pos_unk']:
            with open(args.text, 'r') as f_text:
                sources = [x.strip() for x in f_text.read().split('\n')]
            sources = sources[:-1] if len(sources[-1]) == 0 else sources
        else:
            sources = None

        decoded_predictions = decode_predictions_beam_search(samples,
                                                             index2word_y,
                                                             glossary=glossary,
                                                             alphas=alphas,
                                                             x_text=sources,
                                                             heuristic=heuristic,
                                                             mapping=mapping,
                                                             verbose=args.verbose)
        # Apply detokenization function if needed
        if params.get('APPLY_DETOKENIZATION', False):
            decoded_predictions = list(map(detokenize_function, decoded_predictions))

        if args.n_best:
            n_best_predictions = []
            for i, (n_best_preds, n_best_scores, n_best_alphas) in enumerate(predictions['n_best']):
                n_best_sample_score = []
                for n_best_pred, n_best_score, n_best_alpha in zip(n_best_preds, n_best_scores, n_best_alphas):
                    pred = decode_predictions_beam_search([n_best_pred],
                                                          index2word_y,
                                                          glossary=glossary,
                                                          alphas=[n_best_alpha] if params_prediction['pos_unk'] else None,
                                                          x_text=[sources[i]] if params_prediction['pos_unk'] else None,
                                                          heuristic=heuristic,
                                                          mapping=mapping,
                                                          verbose=args.verbose)
                    # Apply detokenization function if needed
                    if params.get('APPLY_DETOKENIZATION', False):
                        pred = list(map(detokenize_function, pred))
                    n_best_sample_score.append([i, pred, n_best_score])
                n_best_predictions.append(n_best_sample_score)

        # Store result
        if args.dest is not None:
            filepath = args.dest  # results file
            if params.get('SAMPLING_SAVE_MODE', 'list') == 'list':
                list2file(filepath, decoded_predictions)
                if args.n_best:
                    nbest2file(filepath + '.nbest', n_best_predictions)
            else:
                raise Exception('Only "list" is allowed in "SAMPLING_SAVE_MODE"')
        else:
            list2stdout(decoded_predictions)
            if args.n_best:
                logger.info('Storing n-best sentences in ./' + s + '.nbest')
                nbest2file('./' + s + '.nbest', n_best_predictions)
    logger.info('Sampling finished')
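# --- Usage sketch (ours, not part of the original script). sample_ensemble()
# expects an argparse.Namespace with the fields described in its docstring;
# every path below is a placeholder, and load_parameters() is assumed to be
# the configuration helper from the project's config.py.
if __name__ == '__main__':
    import argparse
    args = argparse.Namespace(
        dataset='datasets/Dataset_example.pkl',   # placeholder dataset pickle
        text='examples/source.txt',               # source sentences to translate
        splits=['val'],
        dest='hyps/ensemble.hyp',                 # output file for hypotheses
        weights=[],                               # empty -> uniform 1/N weighting
        n_best=False,
        config=None,                              # None -> read hyperparameters from config.py
        models=['trained_models/model1', 'trained_models/model2'],
        glossary=None,
        verbose=0)
    params = load_parameters()  # assumed helper from config.py
    sample_ensemble(args, params)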
Control_predictions = Control_model.predictBeamSearchNet(dataset, params_prediction)['test']
vocab = dataset.vocabulary['target_text']['idx2words']
Control_predictions = decode_predictions_beam_search(Control_predictions,
                                                     vocab,
                                                     verbose=params['VERBOSE'])

# See how the predictions compare to the ground truth
# from keras_wrapper.extra.read_write import list2file
from keras_wrapper.extra import evaluation

Control_path = 'Control_M7.pred'
list2file(Control_path, Control_predictions)

dataset.setOutput('data/Ross_test.reply',
                  'test',
                  type='text',
                  id='target_text',
                  pad_on_batch=True,
                  tokenization='tokenize_basic',
                  sample_weights=True,
                  max_text_len=30,
                  max_words=0)
keep_n_captions(dataset, repeat=1, n=1, set_names=['test'])

metric = 'coco'
# Apply sampling
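# --- Sketch (ours): how the 'coco' metric selected above might be applied,
# following the evaluation.select pattern used elsewhere in this codebase.
# The assumption that keep_n_captions() left the references in
# dataset.extra_variables['test']['target_text'] mirrors that pattern.
extra_vars = {'test': {'references': dataset.extra_variables['test']['target_text']}}
metrics = evaluation.select[metric](pred_list=Control_predictions,
                                    verbose=1,
                                    extra_vars=extra_vars,
                                    split='test')
print(metrics)  # e.g. Bleu_1..Bleu_4, METEOR, ROUGE_L, CIDEr scores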
def apply_Clas_model(params):
    """
    Function for using a previously trained model for sampling.
    """
    ########### Load data
    dataset = build_dataset(params)
    params['INPUT_SCR_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]
    ###########

    ########### Load model
    text_class_model = loadModel(params['STORE_PATH'], params['RELOAD'])
    text_class_model.setOptimizer()
    ###########

    ########### Apply sampling
    extra_vars = dict()
    extra_vars['tokenize_f'] = eval('dataset.' + params['TOKENIZATION_METHOD'])

    for s in params["EVAL_ON_SETS"]:
        # Apply model predictions
        params_prediction = {'batch_size': params['BATCH_SIZE'],
                             'n_parallel_loaders': params['PARALLEL_LOADERS'],
                             'predict_on_sets': [s]}
        predictions = text_class_model.predictNet(dataset, params_prediction)[s]

        # Store result
        filepath = text_class_model.model_path + '/' + s + '.pred'  # results file
        if params['SAMPLING_SAVE_MODE'] == 'list':
            read_write.list2file(filepath, predictions)
        else:
            raise Exception('Only "list" is allowed in "SAMPLING_SAVE_MODE"')

        # Evaluate with each metric in params['METRICS']
        for metric in params['METRICS']:
            logging.info('Evaluating on metric ' + metric)
            filepath = text_class_model.model_path + '/' + s + '_sampling.' + metric  # results file

            # Evaluate on the chosen metric
            extra_vars[s] = dict()
            extra_vars[s]['references'] = dataset.extra_variables[s][params['OUTPUTS_IDS_DATASET'][0]]
            metrics = evaluation.select[metric](pred_list=predictions,
                                                verbose=1,
                                                extra_vars=extra_vars,
                                                split=s)

            # Print results to file (CSV-style: one header row, one value row)
            with open(filepath, 'w') as f:
                header = ''
                line = ''
                for metric_ in sorted(metrics):
                    value = metrics[metric_]
                    header += metric_ + ','
                    line += str(value) + ','
                f.write(header + '\n')
                f.write(line + '\n')
            logging.info('Done evaluating on metric ' + metric)
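# --- Usage sketch (ours): the minimal 'params' keys that apply_Clas_model()
# reads directly. All values are placeholders; a real configuration would come
# from the project's config.py, and build_dataset() will require further keys
# beyond the ones listed here.
params = {
    'STORE_PATH': 'trained_models/text_classifier',  # placeholder model directory
    'RELOAD': 10,                                    # epoch/update checkpoint to reload
    'BATCH_SIZE': 50,
    'PARALLEL_LOADERS': 1,
    'EVAL_ON_SETS': ['test'],
    'SAMPLING_SAVE_MODE': 'list',
    'METRICS': ['multiclass_metrics'],               # assumed metric name in evaluation.select
    'TOKENIZATION_METHOD': 'tokenize_basic',
    'INPUTS_IDS_DATASET': ['source_text'],
    'OUTPUTS_IDS_DATASET': ['target_class'],
}
# apply_Clas_model(params)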