Beispiel #1
0
    def test_score_sequence(self):
        """Check ``score_sequence`` under three ways of treating ``<unk>``.

        Every case scores word IDs built from ``numpy.arange(6)`` with class
        membership probabilities of one. The expected total is the sum of
        ``log(word_id / 5)`` over the positions that should contribute; the
        first position is never part of the expected sum.

        NOTE(review): this chunk holds other methods with the same name; if
        they live in one class only the last definition is kept - confirm.
        """
        non_unk_positions = [1, 2, 4, 5]

        # No <unk> options: the network's own prediction is used everywhere,
        # so positions 1..5 all contribute.
        scorer = TextScorer(self.dummy_network)
        word_ids = numpy.arange(6)
        class_ids = numpy.arange(6)
        membership_probs = numpy.ones(6, dtype='float32')
        logprob = scorer.score_sequence(word_ids, class_ids, membership_probs)
        expected = numpy.log(word_ids[1:].astype('float32') / 5).sum()
        self.assertAlmostEqual(logprob, expected, places=5)

        # ignore_unk=True: the <unk> placed at position 3 is expected to be
        # left out of the total.
        scorer = TextScorer(self.dummy_network, ignore_unk=True)
        word_ids = numpy.arange(6)
        word_ids[3] = self.vocabulary.word_to_id['<unk>']
        class_ids = numpy.arange(6)
        membership_probs = numpy.ones(6, dtype='float32')
        logprob = scorer.score_sequence(word_ids, class_ids, membership_probs)
        kept = word_ids[non_unk_positions].astype('float32')
        expected = numpy.log(kept / 5).sum()
        self.assertAlmostEqual(logprob, expected, places=5)

        # unk_penalty=-5: the <unk> at position 3 is expected to contribute
        # the constant penalty instead of a predicted log probability.
        penalty = -5
        scorer = TextScorer(self.dummy_network,
                            ignore_unk=False,
                            unk_penalty=penalty)
        word_ids = numpy.arange(6)
        word_ids[3] = self.vocabulary.word_to_id['<unk>']
        class_ids = numpy.arange(6)
        membership_probs = numpy.ones(6, dtype='float32')
        logprob = scorer.score_sequence(word_ids, class_ids, membership_probs)
        kept = word_ids[non_unk_positions].astype('float32')
        expected = numpy.log(kept / 5).sum() + penalty
        self.assertAlmostEqual(logprob, expected, places=5)
    def test_score_sequence(self):
        """Verify sequence scoring with default, ignored and penalized <unk>.

        The inputs are always six word IDs and six class IDs (``0..5``) with
        membership probabilities of one. The reference value is the sum of
        ``log(word_id / 5)`` over the positions expected to be scored.

        NOTE(review): other methods in this chunk use the same name; within a
        single class a later definition would replace this one - confirm.
        """
        def unit_memberships():
            # Membership probability of one for each of the six positions.
            return numpy.ones(6, dtype='float32')

        def reference_logprob(ids):
            # Summed log(id / 5), computed in float32 like the original check.
            probs = ids.astype('float32')
            probs = probs / 5
            return numpy.log(probs).sum()

        # Case 1: the network's <unk> prediction is used as is.
        scorer = TextScorer(self.dummy_network)
        words, classes = numpy.arange(6), numpy.arange(6)
        total = scorer.score_sequence(words, classes, unit_memberships())
        self.assertAlmostEqual(total, reference_logprob(words[1:]), places=5)

        # Case 2: <unk> (at position 3) is dropped from the result.
        scorer = TextScorer(self.dummy_network, ignore_unk=True)
        words, classes = numpy.arange(6), numpy.arange(6)
        words[3] = self.vocabulary.word_to_id['<unk>']
        total = scorer.score_sequence(words, classes, unit_memberships())
        self.assertAlmostEqual(total,
                               reference_logprob(words[[1, 2, 4, 5]]),
                               places=5)

        # Case 3: <unk> (at position 3) is scored with a constant logprob.
        scorer = TextScorer(self.dummy_network,
                            ignore_unk=False,
                            unk_penalty=-5)
        words, classes = numpy.arange(6), numpy.arange(6)
        words[3] = self.vocabulary.word_to_id['<unk>']
        total = scorer.score_sequence(words, classes, unit_memberships())
        self.assertAlmostEqual(total,
                               reference_logprob(words[[1, 2, 4, 5]]) - 5,
                               places=5)
    def test_score_sequence(self):
        """Check ``score_sequence`` with and without a shortlist.

        Each case scores word IDs ``0..14`` using the class IDs returned by
        ``self.vocabulary.get_class_memberships`` and membership probabilities
        of one. The expected total is the sum of the logs of per-word
        reference probabilities, which start out as ``word_id / 100`` for the
        words after the first one.

        NOTE(review): earlier methods in this chunk share this name; if they
        are in the same class, this definition replaces them - confirm.
        """
        def build_inputs():
            # Fresh word IDs, their class IDs and unit membership probabilities.
            ids = numpy.arange(15)
            classes, _ = self.vocabulary.get_class_memberships(ids)
            memberships = numpy.ones_like(ids).astype('float32')
            return ids, classes, memberships

        # Reference probability of word ID 12, which the shortlist case below
        # treats as <unk>.
        unk_prob = 12.0 / 100.0

        # Without a shortlist the last two words are expected to receive the
        # same probability as word ID 12.
        scorer = TextScorer(self.dummy_network, use_shortlist=False)
        word_ids, class_ids, membership_probs = build_inputs()
        logprob = scorer.score_sequence(word_ids, class_ids, membership_probs)
        expected = word_ids[1:].astype('float32') / 100.0
        expected[12:14] = unk_prob
        self.assertAlmostEqual(logprob, numpy.log(expected).sum(), places=4)

        # With a shortlist the <unk> probability is expected to be split
        # between the out-of-shortlist words (0.3 and 0.7 here, by word
        # frequency according to the original test), while <unk> itself
        # contributes nothing (probability one).
        scorer = TextScorer(self.dummy_network, use_shortlist=True)
        word_ids, class_ids, membership_probs = build_inputs()
        logprob = scorer.score_sequence(word_ids, class_ids, membership_probs)
        expected = word_ids[1:].astype('float32') / 100.0
        expected[11:14] = [1.0, unk_prob * 0.3, unk_prob * 0.7]
        self.assertAlmostEqual(logprob, numpy.log(expected).sum(), places=5)

        # With exclude_unk=True only word IDs 1..11 are expected to count;
        # OOV and OOS words are left out of the total.
        scorer = TextScorer(self.dummy_network,
                            use_shortlist=False,
                            exclude_unk=True)
        word_ids, class_ids, membership_probs = build_inputs()
        logprob = scorer.score_sequence(word_ids, class_ids, membership_probs)
        expected = word_ids[1:12].astype('float32') / 100.0
        self.assertAlmostEqual(logprob, numpy.log(expected).sum(), places=5)