Esempi in Python per tokenize, esempi in Python per undreamt.data.tokenize

Esempio n. 1

0

Mostra file

 def greedy(self, sentences, max_ratio=2, train=False):
     """Translate ``sentences`` with greedy (best word per step) decoding.

     At every step the highest-scoring word of each sentence is fed back
     into the decoder. A sentence stops being extended once it produces
     ``data.EOS`` or once its output holds ``max_ratio`` times as many
     words as ``data.tokenize`` finds in its source.

     Args:
         sentences: Source sentences to translate.
         max_ratio: Output length cap, as a multiple of the source token
             count.
         train: Forwarded to ``self._train`` and ``self.encode``.

     Returns:
         The result of ``self.trg_dictionary.ids2sentences`` applied to
         the decoded word ids (``data.EOS`` is never appended to them).
     """
     self._train(train)
     batch_size = len(sentences)
     input_lengths = [
         len(data.tokenize(sentence)) for sentence in sentences
     ]
     hidden, context, context_lengths = self.encode(sentences, train)
     context_mask = self.mask(context_lengths)
     translations = [[] for _ in sentences]
     prev_words = batch_size * [data.SOS]
     pending = set(range(batch_size))
     output = self.device(self.decoder.initial_output(batch_size))
     while pending:
         # One decoding step for the whole batch: input is (1, batch).
         var = self.device(
             Variable(torch.LongTensor([prev_words]), requires_grad=False))
         logprobs, hidden, output = self.decoder(var,
                                                 batch_size * [1],
                                                 self.decoder_embeddings,
                                                 hidden, context,
                                                 context_mask, output,
                                                 self.generator)
         # Drop only the leading step axis. A bare squeeze() would also
         # collapse the batch axis when a single sentence is decoded, so
         # tolist() would return an int and prev_words[i] would fail.
         best_ids = logprobs.max(dim=2)[1].squeeze(0)
         prev_words = best_ids.data.cpu().numpy().tolist()
         # Iterate over a snapshot because finished sentences are removed.
         for i in tuple(pending):
             word = prev_words[i]
             if word == data.EOS:
                 pending.discard(i)
                 continue
             translations[i].append(word)
             if len(translations[i]) >= max_ratio * input_lengths[i]:
                 pending.discard(i)
     return self.trg_dictionary.ids2sentences(translations)

Esempio n. 2

0

Mostra file

File: translator.py Progetto: MaximeDaigle/Low-Resource-Machine-Translation

 def greedy(self, sentences, max_ratio=2, train=False):
     """Greedy decoding: keep the arg-max word of every step.

     A sentence is finished when the decoder emits ``data.EOS`` for it or
     when its output reaches ``max_ratio`` times the number of tokens
     ``data.tokenize`` returns for its source.

     Args:
         sentences: Source sentences to translate.
         max_ratio: Output length cap relative to the source token count.
         train: Forwarded to ``self._train`` and ``self.encode``.

     Returns:
         The result of ``self.trg_dictionary.ids2sentences`` for the
         decoded word ids.
     """
     self._train(train)
     batch_size = len(sentences)
     # Per-sentence cap on the number of generated words.
     max_lengths = [
         max_ratio * len(data.tokenize(source)) for source in sentences
     ]
     hidden, context, context_lengths = self.encode(sentences, train)
     context_mask = self.mask(context_lengths)
     output = self.decoder.initial_output(batch_size)

     decoded = [[] for _ in range(batch_size)]
     last_words = [data.SOS] * batch_size
     unfinished = set(range(batch_size))

     while unfinished:
         step_input = tf.Variable([last_words],
                                  dtype=tf.int64,
                                  trainable=False)
         logprobs, hidden, output = self.decoder(
             step_input, batch_size * [1], self.decoder_embeddings, hidden,
             context, context_mask, output, self.generator)
         # Arg-max over axis 2, then drop the leading length-1 axis.
         best_ids = tf.math.argmax(logprobs, axis=2)
         last_words = tf.squeeze(best_ids, 0).numpy().tolist()
         # Walk a snapshot: the set shrinks as sentences finish.
         for idx in list(unfinished):
             word = last_words[idx]
             if word == data.EOS:
                 unfinished.discard(idx)
                 continue
             decoded[idx].append(word)
             if len(decoded[idx]) >= max_lengths[idx]:
                 unfinished.discard(idx)

     return self.trg_dictionary.ids2sentences(decoded)

Esempio n. 3

0

Mostra file

File: translator.py Progetto: zzj0402/UnsupNTS

    def greedy(self, sentences, max_ratio=2, train=False,pass_att=False,no_noise=False,encodings=None,pass_context=False\
        ,detach_encoder=False,ncontrol=None):
        """Greedily decode ``sentences``, also collecting attention indices and contexts.

        Args:
            sentences: Source sentences. After the length computation this
                name is rebound to the fourth item of ``encodings`` or of
                ``self.encode``'s result.
            max_ratio: A sentence stops being decoded once its output has
                ``max_ratio`` times as many words as ``data.tokenize``
                finds in the sentence originally passed in.
            train: Forwarded to ``self._train`` and ``self.encode``.
            pass_att: Forwarded to ``self.trg_dictionary.ids2sentences``.
            no_noise: Forwarded to ``self.encode``.
            encodings: Optional precomputed ``(hidden, context,
                context_lengths, sentences)`` tuple; when given,
                ``self.encode`` is not called.
            pass_context: When true, the concatenated per-step attention
                contexts are returned as a second value.
            detach_encoder: Forwarded to the decoder.
            ncontrol: Forwarded to the decoder.

        Returns:
            The result of ``ids2sentences``; when ``pass_context`` is true,
            a tuple of that result and ``torch.cat`` of the attention
            contexts gathered at every step.
        """
        self._train(train)
        # Lengths are measured on the caller's sentences, before they are rebound below.
        input_lengths = [len(data.tokenize(sentence)) for sentence in sentences]
        if encodings is not None:
            (hidden,context,context_lengths,sentences) = encodings
        else:
            hidden, context, context_lengths, sentences = self.encode(sentences, train,no_noise=no_noise)
        context_mask = self.mask(context_lengths)
        translations = [[] for sentence in sentences]
        translations_att = [[] for sentence in sentences]
        prev_words = len(sentences)*[data.SOS]
        pending = set(range(len(sentences)))
        output = self.device(self.decoder.initial_output(len(sentences)))
        context_list = []
        # print("SENTENCES GIVEN TO SCORE: {}".format(sentences[0]))

        while len(pending) > 0:
            # print(pending)
            var = self.device(Variable(torch.LongTensor([prev_words]), requires_grad=False))
            # The decoder is always asked for attention scores and contexts here,
            # independently of this method's pass_att / pass_context arguments.
            logprobs, hidden, output, att_scores,att_contexts = self.decoder(var, len(sentences)*[1], self.decoder_embeddings, hidden, context, context_mask, output, self.generator\
                , pass_att=True, pass_context=True,detach_encoder=detach_encoder,ncontrol=ncontrol)
            # NOTE(review): var is built from [prev_words], so var.data.size()[0] is 1 and
            # this mask holds a single entry that only reflects whether sentence 0 is still
            # pending. A per-sentence mask (size()[1]) may have been intended - confirm
            # against the shape of att_contexts before changing.
            postmask = torch.ByteTensor([0 if i in pending else 1 for i in range(var.data.size()[0])]).unsqueeze(0).unsqueeze(2)
            # Zeroes att_contexts in place wherever the mask is set.
            att_contexts.masked_fill_(self.device(Variable(postmask,requires_grad=False)),0)
            context_list.append(att_contexts)
            # With a single sentence the bare squeeze() presumably collapses the result to
            # a scalar, so it is wrapped back into a one-element list.
            if logprobs.size()[1]==1:
                prev_words = [logprobs.max(dim=2)[1].squeeze().data.cpu().numpy().tolist()]
            else:
                prev_words = logprobs.max(dim=2)[1].squeeze().data.cpu().numpy().tolist()
            # print('att_scores {}'.format(att_scores.size()))
            # Indices of the two highest attention scores along dim 2.
            # NOTE(review): unlike prev_words there is no single-sentence guard around this
            # squeeze(); verify what prev_words_att[i] holds when only one sentence is decoded.
            prev_words_att = att_scores.topk(dim=2,k=2)[1].squeeze().data.cpu().numpy().tolist()
            # print("att_scores IN GREEDY FUNCTION  {} {}".format(att_scores,prev_words_att))
            # Iterate over a copy because finished sentences are removed from pending.
            for i in pending.copy():
                if prev_words[i] == data.EOS:
                    pending.discard(i)
                else:
                    translations[i].append(prev_words[i])
                    translations_att[i].append(prev_words_att[i])
                    # Length cap reached: stop decoding this sentence.
                    if len(translations[i]) >= max_ratio*input_lengths[i]:
                        pending.discard(i)
        if not pass_context:
            return self.trg_dictionary.ids2sentences(translations,translations_att=translations_att,sentences=sentences,pass_att=pass_att)
        else:
            # print(translations)
            # print("simpreds",max([len(x) for x in translations]))
            return self.trg_dictionary.ids2sentences(translations,translations_att=translations_att,sentences=sentences,pass_att=pass_att), torch.cat(context_list)

Esempio n. 4

0

Mostra file

    def beam_search(self, sentences, beam_size=12, max_ratio=2, train=False):
        """Translate ``sentences`` with beam search.

        The encoder state is tiled ``beam_size`` times along the batch axis;
        hypothesis ``rank`` of sentence ``s`` lives in slot
        ``s + rank * batch_size``. A sentence is finished when its best open
        hypothesis reaches ``max_ratio`` times the source token count, or
        when its best EOS-terminated translation scores higher than its best
        open hypothesis.

        Args:
            sentences: Source sentences to translate.
            beam_size: Number of hypotheses kept per sentence.
            max_ratio: Output length cap relative to the number of tokens
                ``data.tokenize`` finds in the source.
            train: Forwarded to ``self._train`` and ``self.encode``.

        Returns:
            The result of ``self.trg_dictionary.ids2sentences`` for the
            selected word-id sequences.
        """
        self._train(train)
        batch_size = len(sentences)
        input_lengths = [
            len(data.tokenize(sentence)) for sentence in sentences
        ]
        hidden, context, context_lengths = self.encode(sentences, train)
        translations = [[] for _ in sentences]
        pending = set(range(batch_size))

        hidden = hidden.repeat(1, beam_size, 1)
        context = context.repeat(1, beam_size, 1)
        context_lengths *= beam_size
        context_mask = self.mask(context_lengths)
        ones = beam_size * batch_size * [1]
        prev_words = beam_size * batch_size * [data.SOS]
        output = self.device(
            self.decoder.initial_output(beam_size * batch_size))

        translation_scores = batch_size * [-float('inf')]
        # (score, translation) per slot. Only the rank-0 slots start alive;
        # the others begin at -inf so the first step expands one hypothesis.
        hypotheses = (batch_size * [(0.0, [])] +
                      (beam_size - 1) * batch_size * [(-float('inf'), [])])

        while pending:
            # Each iteration should update: prev_words, hidden, output
            var = self.device(
                Variable(torch.LongTensor([prev_words]), requires_grad=False))
            logprobs, hidden, output = self.decoder(var, ones,
                                                    self.decoder_embeddings,
                                                    hidden, context,
                                                    context_mask, output,
                                                    self.generator)
            # Default next word for every slot (pending slots are overwritten
            # below). Squeeze only the leading axis: a bare squeeze() returns
            # a scalar when beam_size * batch_size == 1, and the item
            # assignment into prev_words further down would then fail.
            prev_words = logprobs.max(
                dim=2)[1].squeeze(0).data.cpu().numpy().tolist()

            word_scores, words = logprobs.topk(k=beam_size + 1,
                                               dim=2,
                                               sorted=False)
            word_scores = word_scores.squeeze(0).data.cpu().numpy().tolist(
            )  # (beam_size*batch_size) * (beam_size+1)
            words = words.squeeze(0).data.cpu().numpy().tolist()

            for sentence_index in tuple(pending):
                candidates = []  # (score, index, word)
                for rank in range(beam_size):
                    index = sentence_index + rank * batch_size
                    for i in range(beam_size + 1):
                        word = words[index][i]
                        score = hypotheses[index][0] + word_scores[index][i]
                        if word != data.EOS:
                            candidates.append((score, index, word))
                        elif score > translation_scores[sentence_index]:
                            # Best finished translation seen so far.
                            translations[sentence_index] = hypotheses[index][
                                1] + [word]
                            translation_scores[sentence_index] = score
                best = []  # score, word, translation, hidden, output
                for score, current_index, word in sorted(
                        candidates, reverse=True)[:beam_size]:
                    translation = hypotheses[current_index][1] + [word]
                    # Clone the states: indexing yields views into hidden and
                    # output, and the write-back loop below overwrites those
                    # same slots, which would corrupt entries read later.
                    best.append((score, word, translation,
                                 hidden[:, current_index, :].data.clone(),
                                 output[current_index].data.clone()))
                for rank, (score, word, translation, h, o) in enumerate(best):
                    next_index = sentence_index + rank * batch_size
                    hypotheses[next_index] = (score, translation)
                    prev_words[next_index] = word
                    hidden[:, next_index, :] = h
                    output[next_index, :] = o
                top_score, top_translation = hypotheses[sentence_index]
                if (len(top_translation) >=
                        max_ratio * input_lengths[sentence_index]
                        or translation_scores[sentence_index] > top_score):
                    pending.discard(sentence_index)
                    # No hypothesis ended with EOS: fall back to the best
                    # open hypothesis.
                    if not translations[sentence_index]:
                        translations[sentence_index] = top_translation
                        translation_scores[sentence_index] = top_score
        return self.trg_dictionary.ids2sentences(translations)

Esempio n. 5

0

Mostra file

File: translator.py Progetto: zzj0402/UnsupNTS

    def beam_search(self, sentences, beam_size=12, max_ratio=2, train=False,rnk=2,noiseratio=0.5,pass_att=False,ncontrol=0):
        """Translate ``sentences`` with beam search, tracking attention indices.

        The encoder state is tiled ``beam_size`` times along the batch axis;
        hypothesis ``rank`` of sentence ``s`` lives in slot
        ``s + rank * batch_size``. Every translation entry is a
        ``(word, word_att)`` pair until the two are split just before
        returning.

        Args:
            sentences: Source sentences; rebound to the fourth value
                returned by ``self.encode``.
            beam_size: Number of hypotheses kept per sentence.
            max_ratio: Output length cap relative to the number of tokens
                ``data.tokenize`` finds in the source passed in.
            train: Forwarded to ``self._train`` and ``self.encode``.
            rnk: Not used by this method; kept for interface compatibility.
            noiseratio: Forwarded to ``self.encode``.
            pass_att: Forwarded to ``self.trg_dictionary.ids2sentences``.
            ncontrol: Forwarded to the decoder.

        Returns:
            The result of ``self.trg_dictionary.ids2sentences`` called with
            the word ids, the attention indices and ``testing=True``.
        """
        self._train(train)
        batch_size = len(sentences)
        input_lengths = [len(data.tokenize(sentence)) for sentence in sentences]
        hidden, context, context_lengths, sentences = self.encode(
            sentences, train, noiseratio=noiseratio, testing=True)
        translations = [[] for _ in sentences]
        pending = set(range(batch_size))

        hidden = hidden.repeat(1, beam_size, 1)
        context = context.repeat(1, beam_size, 1)
        context_lengths *= beam_size
        context_mask = self.mask(context_lengths)
        ones = beam_size * batch_size * [1]
        prev_words = beam_size * batch_size * [data.SOS]
        output = self.device(self.decoder.initial_output(beam_size * batch_size))

        translation_scores = batch_size * [-float('inf')]
        # (score, translation) per slot. Only the rank-0 slots start alive.
        hypotheses = (batch_size * [(0.0, [])] +
                      (beam_size - 1) * batch_size * [(-float('inf'), [])])

        while pending:
            # Each iteration should update: prev_words, hidden, output
            var = self.device(Variable(torch.LongTensor([prev_words]), requires_grad=False))
            logprobs, hidden, output, att_scores = self.decoder(
                var, ones, self.decoder_embeddings, hidden, context,
                context_mask, output, self.generator,
                pass_att=True, ncontrol=ncontrol)
            # Squeeze only the leading axis: a bare squeeze() returns a
            # scalar when beam_size * batch_size == 1, and the item
            # assignment into prev_words further down would then fail.
            prev_words = logprobs.max(dim=2)[1].squeeze(0).data.cpu().numpy().tolist()
            # Indices of the two highest attention scores along dim 2.
            # NOTE(review): this bare squeeze() is left as-is because the
            # shape of att_scores is not visible here; check the
            # single-slot case (beam_size * batch_size == 1).
            prev_words_att = att_scores.topk(dim=2, k=2)[1].squeeze().data.cpu().numpy().tolist()
            word_scores, words = logprobs.topk(k=beam_size + 1, dim=2, sorted=False)
            word_scores = word_scores.squeeze(0).data.cpu().numpy().tolist()  # (beam_size*batch_size) * (beam_size+1)
            words = words.squeeze(0).data.cpu().numpy().tolist()

            for sentence_index in tuple(pending):
                # Collect every unfinished continuation of this sentence's
                # hypotheses; continuations ending in EOS compete for the
                # best finished translation instead.
                candidates = []  # (score, index, word, word_att)
                for rank in range(beam_size):
                    index = sentence_index + rank * batch_size
                    word_att = prev_words_att[index]
                    for i in range(beam_size + 1):
                        word = words[index][i]
                        score = hypotheses[index][0] + word_scores[index][i]
                        if word != data.EOS:
                            candidates.append((score, index, word, word_att))
                        elif score > translation_scores[sentence_index]:
                            translations[sentence_index] = hypotheses[index][1] + [(word, word_att)]
                            translation_scores[sentence_index] = score
                # Keep the beam_size highest-scoring continuations.
                best = []  # score, word, word_att, translation, hidden, output
                for score, current_index, word, word_att in sorted(candidates, reverse=True)[:beam_size]:
                    translation = hypotheses[current_index][1] + [(word, word_att)]
                    # Clone the states: indexing yields views into hidden and
                    # output, and the write-back loop below overwrites those
                    # same slots, which would corrupt entries read later.
                    best.append((score, word, word_att, translation,
                                 hidden[:, current_index, :].data.clone(),
                                 output[current_index].data.clone()))
                # Write the survivors back into this sentence's slots.
                for rank, (score, word, word_att, translation, h, o) in enumerate(best):
                    next_index = sentence_index + rank * batch_size
                    hypotheses[next_index] = (score, translation)
                    prev_words[next_index] = word
                    hidden[:, next_index, :] = h
                    output[next_index, :] = o
                top_score, top_translation = hypotheses[sentence_index]
                if (len(top_translation) >= max_ratio * input_lengths[sentence_index]
                        or translation_scores[sentence_index] > top_score):
                    pending.discard(sentence_index)
                    # No hypothesis ended with EOS: fall back to the best
                    # open hypothesis.
                    if not translations[sentence_index]:
                        translations[sentence_index] = top_translation
                        translation_scores[sentence_index] = top_score
        # Split the (word, word_att) pairs into two parallel nested lists.
        translations_att = [[att for _, att in pairs] for pairs in translations]
        translations = [[word for word, _ in pairs] for pairs in translations]
        return self.trg_dictionary.ids2sentences(translations,translations_att=translations_att,sentences=sentences,pass_att=pass_att,testing=True)

Esempio n. 6

0

Mostra file

File: translator.py Progetto: MaximeDaigle/Low-Resource-Machine-Translation

    def beam_search(self, sentences, beam_size=12, max_ratio=2, train=False):
        """Translate ``sentences`` with beam search (TensorFlow version).

        The encoder state is tiled ``beam_size`` times along axis 1;
        hypothesis ``rank`` of sentence ``s`` lives in slot
        ``s + rank * batch_size``. Decoder states are edited as NumPy arrays
        between steps and converted back to tensors before the next decoder
        call.

        Args:
            sentences: Source sentences to translate.
            beam_size: Number of hypotheses kept per sentence.
            max_ratio: Output length cap relative to the number of tokens
                ``data.tokenize`` finds in the source.
            train: Forwarded to ``self._train`` and ``self.encode``.

        Returns:
            The result of ``self.trg_dictionary.ids2sentences`` for the
            selected word-id sequences.
        """
        self._train(train)
        batch_size = len(sentences)
        input_lengths = [
            len(data.tokenize(sentence)) for sentence in sentences
        ]
        hidden, context, context_lengths = self.encode(sentences, train)
        translations = [[] for _ in sentences]
        pending = set(range(batch_size))

        # Tile (s0, s1, ..., s0, s1, ...) rather than repeating elements
        # (s0, s0, ..., s1, s1, ...): the slot arithmetic below and the list
        # tiling of context_lengths both expect copy `rank` of sentence `s`
        # at index s + rank * batch_size. repeat_elements only matches that
        # layout when batch_size == 1.
        # Assumes hidden and context are rank-3 with the batch on axis 1 -
        # TODO confirm for context (hidden is indexed as [:, i, :] below).
        hidden = tf.tile(hidden, [1, beam_size, 1])
        context = tf.tile(context, [1, beam_size, 1])
        context_lengths *= beam_size
        context_mask = self.mask(context_lengths)
        ones = beam_size * batch_size * [1]
        prev_words = beam_size * batch_size * [data.SOS]
        output = self.decoder.initial_output(beam_size * batch_size)

        translation_scores = batch_size * [-float('inf')]
        # (score, translation) per slot. Only the rank-0 slots start alive.
        hypotheses = (batch_size * [(0.0, [])] +
                      (beam_size - 1) * batch_size * [(-float('inf'), [])])

        hidden_npy = None
        output_npy = None
        while pending:
            # Each iteration should update: prev_words, hidden, output
            if hidden_npy is not None:
                # Pick up the beam reordering done on the arrays last step.
                hidden = tf.convert_to_tensor(hidden_npy, dtype=tf.float32)
                output = tf.convert_to_tensor(output_npy, dtype=tf.float32)
            var = tf.Variable([prev_words], dtype=tf.int64, trainable=False)
            logprobs, hidden, output = self.decoder(var, ones,
                                                    self.decoder_embeddings,
                                                    hidden, context,
                                                    context_mask, output,
                                                    self.generator)
            # Default next word for every slot; pending slots are
            # overwritten below.
            prev_words = tf.squeeze(tf.math.argmax(logprobs, axis=2),
                                    0).numpy().tolist()

            word_scores, words = tf.raw_ops.TopKV2(input=logprobs,
                                                   k=beam_size + 1,
                                                   sorted=False)
            word_scores = tf.squeeze(
                word_scores,
                [0]).numpy().tolist()  # (beam_size*batch_size) * (beam_size+1)
            words = tf.squeeze(words, [0]).numpy().tolist()
            hidden_npy = hidden.numpy()
            output_npy = output.numpy()

            for sentence_index in tuple(pending):
                candidates = []  # (score, index, word)
                for rank in range(beam_size):
                    index = sentence_index + rank * batch_size
                    for i in range(beam_size + 1):
                        word = words[index][i]
                        score = hypotheses[index][0] + word_scores[index][i]
                        if word != data.EOS:
                            candidates.append((score, index, word))
                        elif score > translation_scores[sentence_index]:
                            # Best finished translation seen so far.
                            translations[sentence_index] = hypotheses[index][
                                1] + [word]
                            translation_scores[sentence_index] = score
                best = []  # score, word, translation, hidden, output
                for score, current_index, word in sorted(
                        candidates, reverse=True)[:beam_size]:
                    translation = hypotheses[current_index][1] + [word]
                    # Copy the states: basic NumPy indexing yields views, and
                    # the write-back loop below overwrites those same slots,
                    # which would corrupt entries read later.
                    best.append((score, word, translation,
                                 hidden_npy[:, current_index, :].copy(),
                                 output_npy[current_index].copy()))
                for rank, (score, word, translation, h, o) in enumerate(best):
                    next_index = sentence_index + rank * batch_size
                    hypotheses[next_index] = (score, translation)
                    prev_words[next_index] = word
                    hidden_npy[:, next_index, :] = h
                    output_npy[next_index, :] = o
                top_score, top_translation = hypotheses[sentence_index]
                if (len(top_translation) >=
                        max_ratio * input_lengths[sentence_index]
                        or translation_scores[sentence_index] > top_score):
                    pending.discard(sentence_index)
                    # No hypothesis ended with EOS: fall back to the best
                    # open hypothesis.
                    if not translations[sentence_index]:
                        translations[sentence_index] = top_translation
                        translation_scores[sentence_index] = top_score
        return self.trg_dictionary.ids2sentences(translations)