def __init__(self, models, dataset, params_prediction, model_tokenize_f, model_detokenize_f, general_tokenize_f,
             general_detokenize_f, mapping=None, word2index_x=None, word2index_y=None, index2word_y=None,
             excluded_words=None, unk_id=1, eos_symbol='/', verbose=0):
    self.models = models
    self.dataset = dataset
    self.params_prediction = params_prediction
    self.model_tokenize_f = model_tokenize_f
    self.model_detokenize_f = model_detokenize_f
    self.general_tokenize_f = general_tokenize_f
    self.general_detokenize_f = general_detokenize_f
    self.mapping = mapping
    self.excluded_words = excluded_words
    self.verbose = verbose
    self.eos_symbol = eos_symbol
    self.word2index_x = word2index_x if word2index_x is not None else \
        dataset.vocabulary[params_prediction['INPUTS_IDS_DATASET'][0]]['words2idx']
    self.index2word_y = index2word_y if index2word_y is not None else \
        dataset.vocabulary[params_prediction['OUTPUTS_IDS_DATASET'][0]]['idx2words']
    self.word2index_y = word2index_y if word2index_y is not None else \
        dataset.vocabulary[params_prediction['OUTPUTS_IDS_DATASET'][0]]['words2idx']
    self.unk_id = unk_id
    self.interactive_beam_searcher = InteractiveBeamSearchSampler(self.models,
                                                                  self.dataset,
                                                                  self.params_prediction,
                                                                  excluded_words=self.excluded_words,
                                                                  verbose=self.verbose)
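# Hypothetical call, only to illustrate the expected arguments (every name here is a
# placeholder provided by the hosting application, not something defined in this snippet):
#
#     sampler = NMTSampler(nmt_models, ds, params_prediction,
#                          model_tokenize_f, model_detokenize_f,
#                          general_tokenize_f, general_detokenize_f,
#                          unk_id=1, eos_symbol='/', verbose=0)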
Example #2
    def __init__(self, models, dataset, params, params_prediction, params_training, model_tokenize_f, model_detokenize_f, general_tokenize_f,
                 general_detokenize_f, mapping=None, word2index_x=None, word2index_y=None, index2word_y=None,
                 excluded_words=None, unk_id=1, eos_symbol='/', online=False, verbose=0):
        self.models = models
        self.dataset = dataset
        self.params = params
        self.params_prediction = params_prediction
        self.params_training = params_training
        self.model_tokenize_f = model_tokenize_f
        self.model_detokenize_f = model_detokenize_f
        self.general_tokenize_f = general_tokenize_f
        self.general_detokenize_f = general_detokenize_f
        self.mapping = mapping
        self.excluded_words = excluded_words
        self.verbose = verbose
        self.eos_symbol = eos_symbol
        self.word2index_x = word2index_x if word2index_x is not None else \
            dataset.vocabulary[params_prediction['INPUTS_IDS_DATASET'][0]]['words2idx']
        self.index2word_y = index2word_y if index2word_y is not None else \
            dataset.vocabulary[params_prediction['OUTPUTS_IDS_DATASET'][0]]['idx2words']
        self.word2index_y = word2index_y if word2index_y is not None else \
            dataset.vocabulary[params_prediction['OUTPUTS_IDS_DATASET'][0]]['words2idx']
        self.unk_id = unk_id
        self.interactive_beam_searcher = InteractiveBeamSearchSampler(self.models,
                                                                      self.dataset,
                                                                      self.params_prediction,
                                                                      excluded_words=self.excluded_words,
                                                                      verbose=self.verbose)

        # Compile Theano sampling function by generating a fake sample # TODO: Find a better way of doing this
        logger.info('Compiling sampler...')
        self.generate_sample('i')
        logger.info('Done.')

        self.online = online
        if self.online:
            self.online_trainer = OnlineTrainer(self.models, self.dataset, None,  # Sampler
                                                None,  # Params prediction
                                                params_training,
                                                verbose=self.verbose)
            for i, nmt_model in enumerate(self.models):
                logger.info('Compiling model %d...' % i)
                nmt_model.model._make_train_function()
            logger.info('Done.')

        else:
            self.online_trainer = None
Example #3
class NMTSampler:
    def __init__(self,
                 models,
                 dataset,
                 params,
                 params_prediction,
                 params_training,
                 model_tokenize_f,
                 model_detokenize_f,
                 general_tokenize_f,
                 general_detokenize_f,
                 mapping=None,
                 word2index_x=None,
                 word2index_y=None,
                 index2word_y=None,
                 excluded_words=None,
                 unk_id=1,
                 eos_symbol='/',
                 online=False,
                 verbose=0):
        self.models = models
        self.dataset = dataset
        self.params = params
        self.params_prediction = params_prediction
        self.params_training = params_training
        self.model_tokenize_f = model_tokenize_f
        self.model_detokenize_f = model_detokenize_f
        self.general_tokenize_f = general_tokenize_f
        self.general_detokenize_f = general_detokenize_f
        self.mapping = mapping
        self.excluded_words = excluded_words
        self.verbose = verbose
        self.eos_symbol = eos_symbol
        self.word2index_x = word2index_x if word2index_x is not None else \
            dataset.vocabulary[params_prediction['INPUTS_IDS_DATASET'][0]]['words2idx']
        self.index2word_y = index2word_y if index2word_y is not None else \
            dataset.vocabulary[params_prediction['OUTPUTS_IDS_DATASET'][0]]['idx2words']
        self.word2index_y = word2index_y if word2index_y is not None else \
            dataset.vocabulary[params_prediction['OUTPUTS_IDS_DATASET'][0]]['words2idx']
        self.unk_id = unk_id
        self.interactive_beam_searcher = InteractiveBeamSearchSampler(
            self.models,
            self.dataset,
            self.params_prediction,
            excluded_words=self.excluded_words,
            verbose=self.verbose)

        # Compile Theano sampling function by generating a fake sample # TODO: Find a better way of doing this
        logger.info('Compiling sampler...')
        self.generate_sample('i')
        logger.info('Done.')

        self.online = online
        if self.online:
            self.online_trainer = OnlineTrainer(
                self.models,
                self.dataset,
                None,  # Sampler
                None,  # Params prediction
                params_training,
                verbose=self.verbose)
            for i, nmt_model in enumerate(self.models):
                logger.info('Compiling model %d...' % i)
                nmt_model.model._make_train_function()
            logger.info('Done.')

        else:
            self.online_trainer = None

    def generate_sample(self,
                        source_sentence,
                        validated_prefix=None,
                        max_N=5,
                        isle_indices=None,
                        filtered_idx2word=None,
                        unk_indices=None,
                        unk_words=None):
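        """Generate a translation hypothesis for `source_sentence`, optionally constrained
        by a user-validated prefix (`validated_prefix`) and by isle/unknown-word feedback.
        Returns the detokenized hypothesis string."""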
        print("In params prediction beam_size: ",
              self.params_prediction['beam_size'])
        logger.log(2, 'Beam size: %d' % (self.params_prediction['beam_size']))
        generate_sample_start_time = time.time()
        if unk_indices is None:
            unk_indices = []
        if unk_words is None:
            unk_words = []

        tokenization_start_time = time.time()
        tokenized_input = self.general_tokenize_f(source_sentence,
                                                  escape=False)
        tokenized_input = self.model_tokenize_f(tokenized_input)
        tokenization_end_time = time.time()
        logger.log(
            2, 'tokenization time: %.6f' %
            (tokenization_end_time - tokenization_start_time))
        parse_input_start_time = time.time()
        src_seq, src_words = parse_input(tokenized_input, self.dataset,
                                         self.word2index_x)
        parse_input_end_time = time.time()
        logger.log(
            2, 'parse_input time: %.6f' %
            (parse_input_end_time - parse_input_start_time))

        fixed_words_user = OrderedDict()
        unk_words_dict = OrderedDict()
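        # fixed_words_user maps prefix position -> target vocabulary index for every word the
        # user has validated; unk_words_dict stores, for those positions, the raw words that
        # fall outside the target vocabulary so they can be restored after decoding.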
        # If the user provided some feedback...
        if validated_prefix is not None:
            next_correction = validated_prefix[-1]
            if next_correction == self.eos_symbol:
                return validated_prefix[:-1].decode('utf-8')

            # 2.2.4 Tokenize the prefix properly (possibly applying BPE)
            #  TODO: Here we are tokenizing the target language with the source language tokenizer
            prefix_tokenization_start_time = time.time()
            tokenized_validated_prefix = self.general_tokenize_f(
                validated_prefix, escape=False)
            tokenized_validated_prefix = self.model_tokenize_f(
                tokenized_validated_prefix)
            prefix_tokenization_end_time = time.time()
            logger.log(
                2, 'prefix_tokenization time: %.6f' %
                (prefix_tokenization_end_time -
                 prefix_tokenization_start_time))

            # 2.2.5 Validate words
            word_validation_start_time = time.time()
            for pos, word in enumerate(tokenized_validated_prefix.split()):
                fixed_words_user[pos] = self.word2index_y.get(
                    word, self.unk_id)
                if self.word2index_y.get(word) is None:
                    unk_words_dict[pos] = word
            word_validation_end_time = time.time()
            logger.log(
                2, 'word_validation time: %.6f' %
                (word_validation_end_time - word_validation_start_time))

            # 2.2.6 Constrain search for the last word
            constrain_search_start_time = time.time()
            last_user_word_pos = list(fixed_words_user.keys())[-1]
            if next_correction != u' ':
                last_user_word = tokenized_validated_prefix.split()[-1]
                filtered_idx2word = dict(
                    (self.word2index_y[candidate_word], candidate_word)
                    for candidate_word in self.word2index_y
                    if candidate_word[:len(last_user_word)] == last_user_word)

                # if candidate_word.decode('utf-8')[:len(last_user_word)] == last_user_word)
                if filtered_idx2word != dict():
                    del fixed_words_user[last_user_word_pos]
                    if last_user_word_pos in list(unk_words_dict.keys()):
                        del unk_words_dict[last_user_word_pos]
            else:
                filtered_idx2word = dict()
            constrain_search_end_time = time.time()
            logger.log(
                2, 'constrain_search_end_time time: %.6f' %
                (constrain_search_end_time - constrain_search_start_time))

        sample_beam_search_start_time = time.time()
        trans_indices, costs, alphas = \
            self.interactive_beam_searcher.sample_beam_search_interactive(src_seq,
                                                                          fixed_words=copy.copy(fixed_words_user),
                                                                          max_N=max_N,
                                                                          isles=isle_indices,
                                                                          valid_next_words=filtered_idx2word,
                                                                          idx2word=self.index2word_y)
        sample_beam_search_end_time = time.time()
        logger.log(
            2, 'sample_beam_search time: %.6f' %
            (sample_beam_search_end_time - sample_beam_search_start_time))

        # # Substitute possible unknown words in isles
        # unk_in_isles = []
        # for isle_idx, isle_sequence, isle_words in unks_in_isles:
        #     if unk_id in isle_sequence:
        #         unk_in_isles.append((subfinder(isle_sequence, list(trans_indices)), isle_words))

        if False and self.params_prediction['pos_unk']:
            alphas = [alphas]
            sources = [tokenized_input]
            heuristic = self.params_prediction['heuristic']
        else:
            alphas = None
            heuristic = None
            sources = None
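        # Note: the 'False and' guard above deliberately disables the pos_unk branch, so no
        # attention-based replacement of unknown words is attempted at decoding time here.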

        # 1.2 Decode hypothesis
        decoding_predictions_start_time = time.time()
        hypothesis = decode_predictions_beam_search([trans_indices],
                                                    self.index2word_y,
                                                    alphas=alphas,
                                                    x_text=sources,
                                                    heuristic=heuristic,
                                                    mapping=self.mapping,
                                                    pad_sequences=True,
                                                    verbose=0)[0]
        decoding_predictions_end_time = time.time()
        logger.log(
            2, 'decoding_predictions time: %.6f' %
            (decoding_predictions_end_time - decoding_predictions_start_time))

        # for (words_idx, starting_pos), words in unk_in_isles:
        #     for pos_unk_word, pos_hypothesis in enumerate(range(starting_pos, starting_pos + len(words_idx))):
        #         hypothesis[pos_hypothesis] = words[pos_unk_word]

        # UNK words management
        unk_management_start_time = time.time()
        unk_indices = list(unk_words_dict)
        unk_words = list(unk_words_dict.values())
        if len(unk_indices) > 0:  # If we added some UNK word
            hypothesis = hypothesis.split()
            if len(hypothesis) < len(unk_indices):  # The full hypothesis will be made up of UNK words
                for i, index in enumerate(range(0, len(hypothesis))):
                    hypothesis[index] = unk_words[unk_indices[i]]
                for ii in range(i + 1, len(unk_words)):
                    hypothesis.append(unk_words[ii])
            else:  # We put each unknown word in the corresponding gap
                for i, index in enumerate(unk_indices):
                    if index < len(hypothesis):
                        hypothesis[index] = unk_words[i]
                    else:
                        hypothesis.append(unk_words[i])
            hypothesis = u' '.join(hypothesis)
        unk_management_end_time = time.time()
        logger.log(
            2, 'unk_management time: %.6f' %
            (unk_management_end_time - unk_management_start_time))
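        # Example with invented values: unk_words_dict = {2: 'Zidane'} and the hypothesis
        # 'the player UNK scored' yield 'the player Zidane scored'.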

        hypothesis_detokenization_start_time = time.time()
        hypothesis = self.model_detokenize_f(hypothesis)
        hypothesis = self.general_detokenize_f(hypothesis, unescape=False)
        hypothesis_detokenization_end_time = time.time()
        logger.log(
            2, 'hypothesis_detokenization time: %.6f' %
            (hypothesis_detokenization_end_time -
             hypothesis_detokenization_start_time))
        generate_sample_end_time = time.time()
        logger.log(
            2, 'generate_sample time: %.6f' %
            (generate_sample_end_time - generate_sample_start_time))
        return hypothesis

    def learn_from_sample(self, source_sentence, target_sentence):
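        """Perform one online training step on the pair (`source_sentence`, `target_sentence`),
        provided the sampler was built with online training enabled."""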

        # Tokenize input
        tokenized_input = self.general_tokenize_f(source_sentence,
                                                  escape=False)
        tokenized_input = self.model_tokenize_f(tokenized_input)
        src_seq, src_words = parse_input(tokenized_input, self.dataset,
                                         self.word2index_x)

        # Tokenize output
        tokenized_reference = self.general_tokenize_f(target_sentence,
                                                      escape=False)
        tokenized_reference = self.model_tokenize_f(tokenized_reference)

        # Build inputs/outputs of the system
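        # (state_below is the reference shifted for teacher forcing: note offset=1 here
        #  versus offset=0 in the ground-truth loading below.)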
        state_below = self.dataset.loadText(
            [tokenized_reference.encode('utf-8')],
            vocabularies=self.dataset.vocabulary[
                self.params['OUTPUTS_IDS_DATASET'][0]],
            max_len=self.params['MAX_OUTPUT_TEXT_LEN_TEST'],
            offset=1,
            fill=self.dataset.fill_text[self.params['INPUTS_IDS_DATASET'][-1]],
            pad_on_batch=self.dataset.pad_on_batch[
                self.params['INPUTS_IDS_DATASET'][-1]],
            words_so_far=False,
            loading_X=True)[0]

        # 4.1.3 Ground truth sample -> Interactively translated sentence
        # TODO: Load dense_text if necessary
        trg_seq = self.dataset.loadTextOneHot(
            [tokenized_reference.encode('utf-8')],
            vocabularies=self.dataset.vocabulary[
                self.params['OUTPUTS_IDS_DATASET'][0]],
            vocabulary_len=self.dataset.vocabulary_len[
                self.params['OUTPUTS_IDS_DATASET'][0]],
            max_len=self.params['MAX_OUTPUT_TEXT_LEN_TEST'],
            offset=0,
            fill=self.dataset.fill_text[self.params['OUTPUTS_IDS_DATASET'][0]],
            pad_on_batch=self.dataset.pad_on_batch[
                self.params['OUTPUTS_IDS_DATASET'][0]],
            words_so_far=False,
            sample_weights=self.params['SAMPLE_WEIGHTS'],
            loading_X=False)
        # 4.2 Train online!
        if self.online_trainer is not None:
            self.online_trainer.train_online(
                [np.asarray([src_seq]), state_below],
                trg_seq,
                trg_words=[target_sentence])
        else:
            logger.warning('Online learning is disabled.')
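# Hypothetical interactive session (assuming `sampler` is an NMTSampler built with the
# constructor above and with online=True; `source_sentence`, `user_prefix` and
# `validated_translation` are placeholder strings supplied by the application):
#
#     hyp = sampler.generate_sample(source_sentence)
#     hyp = sampler.generate_sample(source_sentence, validated_prefix=user_prefix)
#     sampler.learn_from_sample(source_sentence, validated_translation)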
Example #4
class NMTSampler:
    def __init__(self,
                 models,
                 dataset,
                 params_prediction,
                 model_tokenize_f,
                 model_detokenize_f,
                 general_tokenize_f,
                 general_detokenize_f,
                 mapping=None,
                 word2index_x=None,
                 word2index_y=None,
                 index2word_y=None,
                 excluded_words=None,
                 unk_id=1,
                 verbose=0):
        self.models = models
        self.dataset = dataset
        self.params_prediction = params_prediction
        self.model_tokenize_f = model_tokenize_f
        self.model_detokenize_f = model_detokenize_f
        self.general_tokenize_f = general_tokenize_f
        self.general_detokenize_f = general_detokenize_f
        self.mapping = mapping
        self.excluded_words = excluded_words
        self.verbose = verbose
        self.word2index_x = word2index_x if word2index_x is not None else \
            dataset.vocabulary[params_prediction['INPUTS_IDS_DATASET'][0]]['words2idx']
        self.index2word_y = index2word_y if index2word_y is not None else \
            dataset.vocabulary[params_prediction['OUTPUTS_IDS_DATASET'][0]]['idx2words']
        self.word2index_y = word2index_y if word2index_y is not None else \
            dataset.vocabulary[params_prediction['OUTPUTS_IDS_DATASET'][0]]['words2idx']
        self.unk_id = unk_id
        self.interactive_beam_searcher = InteractiveBeamSearchSampler(
            self.models,
            self.dataset,
            self.params_prediction,
            excluded_words=self.excluded_words,
            verbose=self.verbose)

    def generate_sample(self,
                        source_sentence,
                        validated_prefix=None,
                        max_N=5,
                        isle_indices=None,
                        filtered_idx2word=None,
                        unk_indices=None,
                        unk_words=None):

        if unk_indices is None:
            unk_indices = []
        if unk_words is None:
            unk_words = []

        tokenized_input = self.general_tokenize_f(source_sentence)
        tokenized_input = self.model_tokenize_f(tokenized_input)
        src_seq, src_words = parse_input(tokenized_input, self.dataset,
                                         self.word2index_x)
        fixed_words_user = OrderedDict()
        unk_words_dict = OrderedDict()
        # If the user provided some feedback...
        if validated_prefix is not None:
            next_correction = validated_prefix[-1]

            # 2.2.4 Tokenize the prefix properly (possibly applying BPE)
            #  TODO: Here we are tokenizing the target language with the source language tokenizer
            tokenized_validated_prefix = self.general_tokenize_f(
                validated_prefix)
            tokenized_validated_prefix = self.model_tokenize_f(
                tokenized_validated_prefix)

            # 2.2.5 Validate words
            for pos, word in enumerate(tokenized_validated_prefix.split()):
                fixed_words_user[pos] = self.word2index_y.get(
                    word, self.unk_id)
                if self.word2index_y.get(word) is None:
                    unk_words_dict[pos] = word
            # 2.2.6 Constrain search for the last word
            last_user_word_pos = list(fixed_words_user.keys())[-1]
            if next_correction != u' ':
                last_user_word = tokenized_validated_prefix.split()[-1]
                filtered_idx2word = dict(
                    (self.word2index_y[candidate_word], candidate_word)
                    for candidate_word in self.word2index_y
                    if candidate_word[:len(last_user_word)] == last_user_word)
                if filtered_idx2word != dict():
                    del fixed_words_user[last_user_word_pos]
                    if last_user_word_pos in unk_words_dict:
                        del unk_words_dict[last_user_word_pos]
            else:
                filtered_idx2word = dict()

        trans_indices, costs, alphas = \
            self.interactive_beam_searcher.sample_beam_search_interactive(src_seq,
                                                                          fixed_words=copy.copy(fixed_words_user),
                                                                          max_N=max_N,
                                                                          isles=isle_indices,
                                                                          valid_next_words=filtered_idx2word,
                                                                          idx2word=self.index2word_y)
        # # Substitute possible unknown words in isles
        # unk_in_isles = []
        # for isle_idx, isle_sequence, isle_words in unks_in_isles:
        #     if unk_id in isle_sequence:
        #         unk_in_isles.append((subfinder(isle_sequence, list(trans_indices)), isle_words))

        if False and self.params_prediction['pos_unk']:
            alphas = [alphas]
            sources = [tokenized_input]
            heuristic = self.params_prediction['heuristic']
        else:
            alphas = None
            heuristic = None
            sources = None

        # 1.2 Decode hypothesis
        hypothesis = decode_predictions_beam_search([trans_indices],
                                                    self.index2word_y,
                                                    alphas=alphas,
                                                    x_text=sources,
                                                    heuristic=heuristic,
                                                    mapping=self.mapping,
                                                    pad_sequences=True,
                                                    verbose=0)[0]

        # for (words_idx, starting_pos), words in unk_in_isles:
        #     for pos_unk_word, pos_hypothesis in enumerate(range(starting_pos, starting_pos + len(words_idx))):
        #         hypothesis[pos_hypothesis] = words[pos_unk_word]

        # UNK words management
        unk_indices = list(unk_words_dict)
        unk_words = list(unk_words_dict.values())
        if len(unk_indices) > 0:  # If we added some UNK word
            hypothesis = hypothesis.split()
            if len(hypothesis) < len(unk_indices):  # The full hypothesis will be made up of UNK words
                for i, index in enumerate(range(0, len(hypothesis))):
                    hypothesis[index] = unk_words[unk_indices[i]]
                for ii in range(i + 1, len(unk_words)):
                    hypothesis.append(unk_words[ii])
            else:  # We put each unknown word in the corresponding gap
                for i, index in enumerate(unk_indices):
                    if index < len(hypothesis):
                        hypothesis[index] = unk_words[i]
                    else:
                        hypothesis.append(unk_words[i])
            hypothesis = u' '.join(hypothesis)

        hypothesis = self.model_detokenize_f(hypothesis)
        hypothesis = self.general_detokenize_f(hypothesis)
        return hypothesis
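
# Self-contained toy illustration of the last-word constraint used in generate_sample above
# (the vocabulary, indices and prefix are invented; this is not part of the original class):
if __name__ == '__main__':
    toy_word2index_y = {'the': 4, 'there': 5, 'theory': 6, 'cat': 7}
    validated_prefix = 'we reviewed the'   # user-validated prefix; the last word may be partial
    last_user_word = validated_prefix.split()[-1]

    # Same filtering as in the class: keep only vocabulary entries that extend the partially
    # typed last word, keyed by index so the beam search can restrict its next-word choices.
    filtered_idx2word = dict(
        (idx, word) for word, idx in toy_word2index_y.items()
        if word[:len(last_user_word)] == last_user_word)

    print(filtered_idx2word)  # {4: 'the', 5: 'there', 6: 'theory'}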