Code example #1
0
File: lexicalize.py  Project: tonydeep/tgen
 def get_surface_form(self, sentence, pos, possible_forms):
     """Choose a surface form for position `pos` in `sentence` using the RNNLM.

     Scores each candidate in `possible_forms` by the RNNLM's logit for that
     word (falling back to the <UNK> id for out-of-vocabulary forms). If
     self._sample is set, samples from the softmax distribution over these
     scores; otherwise returns the highest-scoring candidate.
     """
     log_debug("Pos: %d, forms: %s" %
               (pos, unicode(", ".join(possible_forms))))
     # get unnormalized scores for the whole vocabulary
     if pos >= self.max_sent_len:  # don't use whole sentence if it's too long
         # NB: truncate the sentence *before* shifting `pos` -- slicing with
         # the already-shifted position would compute sentence[0:] (a no-op)
         sentence = sentence[pos - self.max_sent_len + 1:]
         pos -= pos - self.max_sent_len + 1  # i.e., pos = self.max_sent_len - 1
     inputs = np.array([self._sent_to_ids(sentence)[:-1]], dtype=np.int32)
     logits = self.session.run([self._logits], {self._inputs: inputs})
     # pick out scores for the possible forms (OOV forms map to <UNK>)
     scores = [
         logits[0][pos][self.vocab.get(form.lower(),
                                       self.vocab.get('<UNK>'))]
         for form in possible_forms
     ]
     probs = softmax(scores)
     # fixed: the comprehension variable is `f`, so look up `f`, not the
     # leaked `form` from the previous comprehension (Py2 scope leak)
     log_debug("Vocab: %s" % unicode(", ".join([
         unicode(self.vocab.get(f.lower(), self.vocab.get('<UNK>')))
         for f in possible_forms
     ])))
     log_debug("Scores: %s, Probs: %s" % (unicode(", ".join(
         ["%.3f" % s
          for s in scores])), unicode(", ".join(["%.3f" % p
                                                 for p in probs]))))
     # sample from the prob. dist.
     if self._sample:
         return np.random.choice(possible_forms, p=probs)
     # get just the most probable option
     max_idx, _ = max(enumerate(probs), key=operator.itemgetter(1))
     return possible_forms[max_idx]
Code example #2
0
 def get_surface_form(self, sentence, pos, possible_forms):
     """Pick a surface form by unigram frequency (add-0.1 smoothing).

     If self._sample is set, samples a form with probability proportional
     to its smoothed frequency; otherwise returns the most frequent one.
     """
     # smoothed frequency score per candidate; 0.1 floor for unseen forms
     scores = []
     for candidate in possible_forms:
         scores.append(self._word_freq.get(candidate.lower(), 0) + 0.1)
     if self._sample:
         return np.random.choice(possible_forms, p=softmax(scores))
     # argmax over scores; the first maximum wins on ties
     best_idx = scores.index(max(scores))
     return possible_forms[best_idx]
Code example #3
0
 def _valid_perplexity(self):
     """Compute perplexity of the RNNLM on validation data.

     Returns 2 ** (-1/N * sum_i log2 p(x_i)), accumulating the base-2
     log-probability of every target token over all validation batches.
     """
     perp = 0  # running sum of log2-probabilities of target tokens
     n_toks = 0  # total token count (includes padding positions, if any)
     for inputs, targets in self._valid_batches():
         logits = self.session.run([self._logits], {self._inputs: inputs})[0]
         probs = softmax(logits)  # logits combine all sentences behind each other -- dimension
                                  # is (self.max_sent_len * self.batch_size, self.vocab_size)
         for tok_no in range(len(probs)):
             # map the flat token index back to (sentence, position) in `targets`
             # (assumes targets is indexed [sentence, position] and sentences
             # are concatenated whole, one after another -- TODO confirm layout)
             perp += np.log2(probs[tok_no, targets[old_div(tok_no, self.max_sent_len),
                                                   tok_no % self.max_sent_len]])
         n_toks += np.prod(inputs.shape)
     # perp = exp( -1/N * sum_i=1^N log p(x_i) )
     return np.exp2(- perp / float(n_toks))
Code example #4
0
 def get_surface_form(self, sentence, pos, possible_forms):
     """Choose a surface form using the KenLM n-gram language model.

     Builds the LM state from the sentence prefix before `pos`, scores each
     candidate in that context, then either samples (if self._sample) or
     returns the best-scoring candidate.
     """
     state, dummy_state = kenlm.State(), kenlm.State()
     self._lm.BeginSentenceWrite(state)
     # advance the LM state over the sentence prefix up to the target position
     for token in sentence[:pos]:
         self._lm.BaseScore(state, token.encode('utf-8'), state)
     # score every candidate form in the current context
     # (spaces become '^' to keep multi-word forms a single LM token)
     scores = [self._lm.BaseScore(state,
                                  form.lower().replace(' ', '^').encode('utf-8'),
                                  dummy_state)
               for form in possible_forms]
     if self._sample:
         return np.random.choice(possible_forms, p=softmax(scores))
     # first maximum wins on ties, matching a strict-inequality search
     return possible_forms[scores.index(max(scores))]
Code example #5
0
File: seq2seq.py  Project: tonydeep/tgen
    def _beam_search_step(self, dec_inputs, dec_states):
        """Run one step of beam search decoding with the given decoder inputs and
        (previous steps') outputs and states.

        dec_inputs -- decoder inputs for all steps decoded so far, including
            the current one (one element longer than dec_states)
        dec_states -- decoder states produced by all previous steps
        Returns a tuple (output probability distribution, new decoder state).
        """

        step = len(dec_states)  # find the decoder position

        # fill in all previous path data
        # NOTE(review): self._beam_search_feed_dict persists across calls, so
        # stale entries for placeholders beyond `step` may remain from longer
        # earlier paths -- presumably unused by this run; verify.
        for i in xrange(step):
            self._beam_search_feed_dict[self.dec_inputs[i]] = dec_inputs[i]
            self._beam_search_feed_dict[self.states[i]] = dec_states[i]

        # the decoder inputs are always one step longer than the states,
        # so feed the current step's input as well
        self._beam_search_feed_dict[self.dec_inputs[step]] = dec_inputs[step]

        # run one step of the decoder
        output, state = self.session.run(
            [self.outputs[step], self.states[step]],
            feed_dict=self._beam_search_feed_dict)

        # softmax (normalize decoder outputs to obtain prob. distribution), assuming batches size 1
        out_probs = softmax(output[0])
        return out_probs, state