# Standard-library and numeric imports used by NMTSampler. The project-specific
# helpers (InteractiveBeamSearchSampler, OnlineTrainer, parse_input,
# decode_predictions_beam_search) are assumed to be imported elsewhere in this module.
import copy
import logging
import time
from collections import OrderedDict

import numpy as np

logger = logging.getLogger(__name__)


class NMTSampler:
    def __init__(self, models, dataset, params, params_prediction, params_training,
                 model_tokenize_f, model_detokenize_f, general_tokenize_f,
                 general_detokenize_f, mapping=None, word2index_x=None,
                 word2index_y=None, index2word_y=None, excluded_words=None,
                 unk_id=1, eos_symbol='/', online=False, verbose=0):
        self.models = models
        self.dataset = dataset
        self.params = params
        self.params_prediction = params_prediction
        self.params_training = params_training
        self.model_tokenize_f = model_tokenize_f
        self.model_detokenize_f = model_detokenize_f
        self.general_tokenize_f = general_tokenize_f
        self.general_detokenize_f = general_detokenize_f
        self.mapping = mapping
        self.excluded_words = excluded_words
        self.verbose = verbose
        self.eos_symbol = eos_symbol
        # Fall back to the dataset vocabularies when explicit mappings are not given
        self.word2index_x = word2index_x if word2index_x is not None else \
            dataset.vocabulary[params_prediction['INPUTS_IDS_DATASET'][0]]['words2idx']
        self.index2word_y = index2word_y if index2word_y is not None else \
            dataset.vocabulary[params_prediction['OUTPUTS_IDS_DATASET'][0]]['idx2words']
        self.word2index_y = word2index_y if word2index_y is not None else \
            dataset.vocabulary[params_prediction['OUTPUTS_IDS_DATASET'][0]]['words2idx']
        self.unk_id = unk_id
        self.interactive_beam_searcher = InteractiveBeamSearchSampler(
            self.models,
            self.dataset,
            self.params_prediction,
            excluded_words=self.excluded_words,
            verbose=self.verbose)

        # Compile Theano sampling function by generating a fake sample
        # TODO: Find a better way of doing this
        logger.info('Compiling sampler...')
        self.generate_sample('i')
        logger.info('Done.')

        self.online = online
        if self.online:
            self.online_trainer = OnlineTrainer(self.models,
                                                self.dataset,
                                                None,  # Sampler
                                                None,  # Params prediction
                                                params_training,
                                                verbose=self.verbose)
            for i, nmt_model in enumerate(self.models):
                logger.info('Compiling model %d...' % i)
                nmt_model.model._make_train_function()
                logger.info('Done.')
        else:
            self.online_trainer = None

    def generate_sample(self, source_sentence, validated_prefix=None, max_N=5,
                        isle_indices=None, filtered_idx2word=None,
                        unk_indices=None, unk_words=None):
        print("In params prediction beam_size: ", self.params_prediction['beam_size'])
        logger.log(2, 'Beam size: %d' % (self.params_prediction['beam_size']))
        generate_sample_start_time = time.time()
        if unk_indices is None:
            unk_indices = []
        if unk_words is None:
            unk_words = []

        # Tokenize the source sentence (general tokenizer, then model tokenizer, e.g. BPE)
        tokenization_start_time = time.time()
        tokenized_input = self.general_tokenize_f(source_sentence, escape=False)
        tokenized_input = self.model_tokenize_f(tokenized_input)
        tokenization_end_time = time.time()
        logger.log(2, 'tokenization time: %.6f'
                   % (tokenization_end_time - tokenization_start_time))

        # Map the tokenized source to a sequence of word indices
        parse_input_start_time = time.time()
        src_seq, src_words = parse_input(tokenized_input, self.dataset, self.word2index_x)
        parse_input_end_time = time.time()
        logger.log(2, 'parse_input time: %.6f'
                   % (parse_input_end_time - parse_input_start_time))

        fixed_words_user = OrderedDict()
        unk_words_dict = OrderedDict()

        # If the user provided some feedback...
        if validated_prefix is not None:
            next_correction = validated_prefix[-1]
            # The user accepted the hypothesis: return the prefix without the trailing EOS symbol
            if next_correction == self.eos_symbol:
                return validated_prefix[:-1]

            # 2.2.4 Tokenize the prefix properly (possibly applying BPE)
            # TODO: Here we are tokenizing the target language with the source language tokenizer
            prefix_tokenization_start_time = time.time()
            tokenized_validated_prefix = self.general_tokenize_f(validated_prefix, escape=False)
            tokenized_validated_prefix = self.model_tokenize_f(tokenized_validated_prefix)
            prefix_tokenization_end_time = time.time()
            logger.log(2, 'prefix_tokenization time: %.6f'
                       % (prefix_tokenization_end_time - prefix_tokenization_start_time))

            # 2.2.5 Validate words
            word_validation_start_time = time.time()
            for pos, word in enumerate(tokenized_validated_prefix.split()):
                fixed_words_user[pos] = self.word2index_y.get(word, self.unk_id)
                if self.word2index_y.get(word) is None:
                    unk_words_dict[pos] = word
            word_validation_end_time = time.time()
            logger.log(2, 'word_validation time: %.6f'
                       % (word_validation_end_time - word_validation_start_time))

            # 2.2.6 Constrain search for the last word
            constrain_search_start_time = time.time()
            last_user_word_pos = list(fixed_words_user.keys())[-1]
            if next_correction != u' ':
                # The last prefix token is incomplete: restrict the next word to
                # vocabulary entries that extend it
                last_user_word = tokenized_validated_prefix.split()[-1]
                filtered_idx2word = dict(
                    (self.word2index_y[candidate_word], candidate_word)
                    for candidate_word in self.word2index_y
                    if candidate_word[:len(last_user_word)] == last_user_word)
                # if candidate_word.decode('utf-8')[:len(last_user_word)] == last_user_word)
                if filtered_idx2word != dict():
                    del fixed_words_user[last_user_word_pos]
                    if last_user_word_pos in list(unk_words_dict.keys()):
                        del unk_words_dict[last_user_word_pos]
            else:
                filtered_idx2word = dict()
            constrain_search_end_time = time.time()
            logger.log(2, 'constrain_search time: %.6f'
                       % (constrain_search_end_time - constrain_search_start_time))

        # Constrained (interactive) beam search
        sample_beam_search_start_time = time.time()
        trans_indices, costs, alphas = \
            self.interactive_beam_searcher.sample_beam_search_interactive(
                src_seq,
                fixed_words=copy.copy(fixed_words_user),
                max_N=max_N,
                isles=isle_indices,
                valid_next_words=filtered_idx2word,
                idx2word=self.index2word_y)
        sample_beam_search_end_time = time.time()
        logger.log(2, 'sample_beam_search time: %.6f'
                   % (sample_beam_search_end_time - sample_beam_search_start_time))

        # # Substitute possible unknown words in isles
        # unk_in_isles = []
        # for isle_idx, isle_sequence, isle_words in unks_in_isles:
        #     if unk_id in isle_sequence:
        #         unk_in_isles.append((subfinder(isle_sequence, list(trans_indices)), isle_words))

        if False and self.params_prediction['pos_unk']:
            alphas = [alphas]
            sources = [tokenized_input]
            heuristic = self.params_prediction['heuristic']
        else:
            alphas = None
            heuristic = None
            sources = None

        # 1.2 Decode hypothesis
        decoding_predictions_start_time = time.time()
        hypothesis = decode_predictions_beam_search([trans_indices],
                                                    self.index2word_y,
                                                    alphas=alphas,
                                                    x_text=sources,
                                                    heuristic=heuristic,
                                                    mapping=self.mapping,
                                                    pad_sequences=True,
                                                    verbose=0)[0]
        decoding_predictions_end_time = time.time()
        logger.log(2, 'decoding_predictions time: %.6f'
                   % (decoding_predictions_end_time - decoding_predictions_start_time))

        # for (words_idx, starting_pos), words in unk_in_isles:
        #     for pos_unk_word, pos_hypothesis in enumerate(range(starting_pos, starting_pos + len(words_idx))):
        #         hypothesis[pos_hypothesis] = words[pos_unk_word]

        # UNK words management
        unk_management_start_time = time.time()
        unk_indices = list(unk_words_dict)
        unk_words = list(unk_words_dict.values())
        if len(unk_indices) > 0:  # If we added some UNK word
            hypothesis = hypothesis.split()
            if len(hypothesis) < len(unk_indices):
                # The full hypothesis will be made up of UNK words
                for i, index in enumerate(range(0, len(hypothesis))):
                    hypothesis[index] = unk_words[i]  # i-th unknown word fills the i-th position
                for ii in range(i + 1, len(unk_words)):
                    hypothesis.append(unk_words[ii])
            else:
                # We put each unknown word in the corresponding gap
                for i, index in enumerate(unk_indices):
                    if index < len(hypothesis):
                        hypothesis[index] = unk_words[i]
                    else:
                        hypothesis.append(unk_words[i])
            hypothesis = u' '.join(hypothesis)
        unk_management_end_time = time.time()
        logger.log(2, 'unk_management time: %.6f'
                   % (unk_management_end_time - unk_management_start_time))

        # Detokenize the hypothesis (undo model tokenization, then general detokenization)
        hypothesis_detokenization_start_time = time.time()
        hypothesis = self.model_detokenize_f(hypothesis)
        hypothesis = self.general_detokenize_f(hypothesis, unescape=False)
        hypothesis_detokenization_end_time = time.time()
        logger.log(2, 'hypothesis_detokenization time: %.6f'
                   % (hypothesis_detokenization_end_time - hypothesis_detokenization_start_time))
        generate_sample_end_time = time.time()
        logger.log(2, 'generate_sample time: %.6f'
                   % (generate_sample_end_time - generate_sample_start_time))
        return hypothesis

    def learn_from_sample(self, source_sentence, target_sentence):
        # Tokenize input
        tokenized_input = self.general_tokenize_f(source_sentence, escape=False)
        tokenized_input = self.model_tokenize_f(tokenized_input)
        src_seq, src_words = parse_input(tokenized_input, self.dataset, self.word2index_x)

        # Tokenize output
        tokenized_reference = self.general_tokenize_f(target_sentence, escape=False)
        tokenized_reference = self.model_tokenize_f(tokenized_reference)

        # Build inputs/outputs of the system
        state_below = self.dataset.loadText(
            [tokenized_reference.encode('utf-8')],
            vocabularies=self.dataset.vocabulary[self.params['OUTPUTS_IDS_DATASET'][0]],
            max_len=self.params['MAX_OUTPUT_TEXT_LEN_TEST'],
            offset=1,
            fill=self.dataset.fill_text[self.params['INPUTS_IDS_DATASET'][-1]],
            pad_on_batch=self.dataset.pad_on_batch[self.params['INPUTS_IDS_DATASET'][-1]],
            words_so_far=False,
            loading_X=True)[0]

        # 4.1.3 Ground truth sample -> Interactively translated sentence
        # TODO: Load dense_text if necessary
        trg_seq = self.dataset.loadTextOneHot(
            [tokenized_reference.encode('utf-8')],
            vocabularies=self.dataset.vocabulary[self.params['OUTPUTS_IDS_DATASET'][0]],
            vocabulary_len=self.dataset.vocabulary_len[self.params['OUTPUTS_IDS_DATASET'][0]],
            max_len=self.params['MAX_OUTPUT_TEXT_LEN_TEST'],
            offset=0,
            fill=self.dataset.fill_text[self.params['OUTPUTS_IDS_DATASET'][0]],
            pad_on_batch=self.dataset.pad_on_batch[self.params['OUTPUTS_IDS_DATASET'][0]],
            words_so_far=False,
            sample_weights=self.params['SAMPLE_WEIGHTS'],
            loading_X=False)

        # 4.2 Train online!
        if self.online_trainer is not None:
            self.online_trainer.train_online(
                [np.asarray([src_seq]), state_below], trg_seq,
                trg_words=[target_sentence])
        else:
            logging.warning('Online learning is disabled.')
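
# ---------------------------------------------------------------------------
# Illustration (not part of the original class): a minimal, self-contained
# sketch of the vocabulary filtering that generate_sample() performs when the
# last prefix token typed by the user is still incomplete (step 2.2.6 above).
# The helper name and the toy vocabulary are hypothetical; the real code builds
# the same {index: word} dict inline from self.word2index_y.
def filter_vocab_by_prefix(word2index_y, last_user_word):
    """Return {target_index: word} for every vocabulary word extending last_user_word."""
    return dict((word2index_y[candidate_word], candidate_word)
                for candidate_word in word2index_y
                if candidate_word[:len(last_user_word)] == last_user_word)


# Toy usage: with the (hypothetical) vocabulary below and the partial word
# 'hou', only 'house' and 'household' remain valid continuations.
_toy_word2index_y = {'house': 7, 'household': 12, 'home': 3, 'dog': 25}
assert filter_vocab_by_prefix(_toy_word2index_y, 'hou') == {7: 'house', 12: 'household'}
# ---------------------------------------------------------------------------
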
class NMTSampler:
    def __init__(self, models, dataset, params_prediction, model_tokenize_f,
                 model_detokenize_f, general_tokenize_f, general_detokenize_f,
                 mapping=None, word2index_x=None, word2index_y=None,
                 index2word_y=None, excluded_words=None, unk_id=1, verbose=0):
        self.models = models
        self.dataset = dataset
        self.params_prediction = params_prediction
        self.model_tokenize_f = model_tokenize_f
        self.model_detokenize_f = model_detokenize_f
        self.general_tokenize_f = general_tokenize_f
        self.general_detokenize_f = general_detokenize_f
        self.mapping = mapping
        self.excluded_words = excluded_words
        self.verbose = verbose
        # Fall back to the dataset vocabularies when explicit mappings are not given
        self.word2index_x = word2index_x if word2index_x is not None else \
            dataset.vocabulary[params_prediction['INPUTS_IDS_DATASET'][0]]['words2idx']
        self.index2word_y = index2word_y if index2word_y is not None else \
            dataset.vocabulary[params_prediction['OUTPUTS_IDS_DATASET'][0]]['idx2words']
        self.word2index_y = word2index_y if word2index_y is not None else \
            dataset.vocabulary[params_prediction['OUTPUTS_IDS_DATASET'][0]]['words2idx']
        self.unk_id = unk_id
        self.interactive_beam_searcher = InteractiveBeamSearchSampler(
            self.models,
            self.dataset,
            self.params_prediction,
            excluded_words=self.excluded_words,
            verbose=self.verbose)

    def generate_sample(self, source_sentence, validated_prefix=None, max_N=5,
                        isle_indices=None, filtered_idx2word=None,
                        unk_indices=None, unk_words=None):
        if unk_indices is None:
            unk_indices = []
        if unk_words is None:
            unk_words = []

        # Tokenize the source sentence and map it to word indices
        tokenized_input = self.general_tokenize_f(source_sentence)
        tokenized_input = self.model_tokenize_f(tokenized_input)
        src_seq, src_words = parse_input(tokenized_input, self.dataset, self.word2index_x)

        fixed_words_user = OrderedDict()
        unk_words_dict = OrderedDict()

        # If the user provided some feedback...
        if validated_prefix is not None:
            next_correction = validated_prefix[-1]

            # 2.2.4 Tokenize the prefix properly (possibly applying BPE)
            # TODO: Here we are tokenizing the target language with the source language tokenizer
            tokenized_validated_prefix = self.general_tokenize_f(validated_prefix)
            tokenized_validated_prefix = self.model_tokenize_f(tokenized_validated_prefix)

            # 2.2.5 Validate words
            for pos, word in enumerate(tokenized_validated_prefix.split()):
                fixed_words_user[pos] = self.word2index_y.get(word, self.unk_id)
                if self.word2index_y.get(word) is None:
                    unk_words_dict[pos] = word

            # 2.2.6 Constrain search for the last word
            last_user_word_pos = list(fixed_words_user.keys())[-1]
            if next_correction != u' ':
                last_user_word = tokenized_validated_prefix.split()[-1]
                filtered_idx2word = dict(
                    (self.word2index_y[candidate_word], candidate_word)
                    for candidate_word in self.word2index_y
                    if candidate_word[:len(last_user_word)] == last_user_word)
                if filtered_idx2word != dict():
                    del fixed_words_user[last_user_word_pos]
                    if last_user_word_pos in unk_words_dict:
                        del unk_words_dict[last_user_word_pos]
            else:
                filtered_idx2word = dict()

        # Constrained (interactive) beam search
        trans_indices, costs, alphas = \
            self.interactive_beam_searcher.sample_beam_search_interactive(
                src_seq,
                fixed_words=copy.copy(fixed_words_user),
                max_N=max_N,
                isles=isle_indices,
                valid_next_words=filtered_idx2word,
                idx2word=self.index2word_y)

        # # Substitute possible unknown words in isles
        # unk_in_isles = []
        # for isle_idx, isle_sequence, isle_words in unks_in_isles:
        #     if unk_id in isle_sequence:
        #         unk_in_isles.append((subfinder(isle_sequence, list(trans_indices)), isle_words))

        if False and self.params_prediction['pos_unk']:
            alphas = [alphas]
            sources = [tokenized_input]
            heuristic = self.params_prediction['heuristic']
        else:
            alphas = None
            heuristic = None
            sources = None

        # 1.2 Decode hypothesis
        hypothesis = decode_predictions_beam_search([trans_indices],
                                                    self.index2word_y,
                                                    alphas=alphas,
                                                    x_text=sources,
                                                    heuristic=heuristic,
                                                    mapping=self.mapping,
                                                    pad_sequences=True,
                                                    verbose=0)[0]

        # for (words_idx, starting_pos), words in unk_in_isles:
        #     for pos_unk_word, pos_hypothesis in enumerate(range(starting_pos, starting_pos + len(words_idx))):
        #         hypothesis[pos_hypothesis] = words[pos_unk_word]

        # UNK words management
        unk_indices = list(unk_words_dict)
        unk_words = list(unk_words_dict.values())
        if len(unk_indices) > 0:  # If we added some UNK word
            hypothesis = hypothesis.split()
            if len(hypothesis) < len(unk_indices):
                # The full hypothesis will be made up of UNK words
                for i, index in enumerate(range(0, len(hypothesis))):
                    hypothesis[index] = unk_words[i]  # i-th unknown word fills the i-th position
                for ii in range(i + 1, len(unk_words)):
                    hypothesis.append(unk_words[ii])
            else:
                # We put each unknown word in the corresponding gap
                for i, index in enumerate(unk_indices):
                    if index < len(hypothesis):
                        hypothesis[index] = unk_words[i]
                    else:
                        hypothesis.append(unk_words[i])
            hypothesis = u' '.join(hypothesis)

        # Detokenize and return the final hypothesis
        hypothesis = self.model_detokenize_f(hypothesis)
        hypothesis = self.general_detokenize_f(hypothesis)
        return hypothesis
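
# ---------------------------------------------------------------------------
# Illustration (not part of the original class): a minimal, simplified sketch
# of the "UNK words management" step above. Prefix tokens that were not in the
# target vocabulary are re-inserted into the decoded hypothesis at the
# positions where the user typed them. The helper name and the toy data are
# hypothetical; the class performs this inline on `hypothesis` and also covers
# the corner case of a hypothesis shorter than the list of unknown words.
def reinsert_unk_words(hypothesis, unk_words_dict):
    """Place out-of-vocabulary user words back into the hypothesis string."""
    unk_indices = list(unk_words_dict)
    unk_words = list(unk_words_dict.values())
    if len(unk_indices) == 0:
        return hypothesis
    tokens = hypothesis.split()
    for i, index in enumerate(unk_indices):
        if index < len(tokens):
            tokens[index] = unk_words[i]   # fill the corresponding gap
        else:
            tokens.append(unk_words[i])    # word falls beyond the hypothesis
    return u' '.join(tokens)


# Toy usage: the user word at position 1 ('Germany') was unknown to the model,
# so it replaces the placeholder produced by the decoder at that position.
assert reinsert_unk_words(u'in UNK we trust', {1: u'Germany'}) == u'in Germany we trust'
# ---------------------------------------------------------------------------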