def test_extract_window(self):
    """Check the window returned by ``extract_window`` for two tokens.

    The same lower-cased, tokenized sentence is used for both cases:
    the first token (expecting a leading ``_START_`` marker) and the
    second token (expecting its immediate neighbours on both sides).
    """
    # (token to look up, expected window, assertion message)
    cases = (
        ('this',
         ['_START_', 'this', 'is'],
         'A window starting with the first token should be correct'),
        ('is',
         ['this', 'is', 'a'],
         'A window starting with the second token should be correct'),
    )

    for token, expected, message in cases:
        # Re-tokenize for every case so each lookup gets a fresh token list.
        tokens = word_tokenize('this is a test sentence.'.lower())
        observed = ngram_window_extractor.extract_window(tokens, token)
        self.assertListEqual(observed, expected, message)
Esempio n. 2
0
    def get_features(self, context_obj):
        """Build the five-element feature list for one token in context.

        Reads ``context_obj['index']``, ``context_obj['target']`` and
        ``context_obj['token']``.

        Returns a list of, in order:
          * ``self.check_lm`` on the left n-gram (``self.order - 1`` items
            of left context followed by the token), with ``side='left'``;
          * ``self.check_lm`` on the right n-gram (the token followed by
            ``self.order - 1`` items of right context), with ``side='right'``;
          * ``self.get_backoff`` on the left, middle and right trigrams.
        """
        idx = context_obj['index']
        target = context_obj['target']
        token = context_obj['token']
        span = self.order - 1

        # Context of ``span`` items on each side, with the token itself
        # placed at the inner edge of each n-gram.
        preceding = left_context(target, token, context_size=span, idx=idx)
        left_ngram = preceding + [token]
        following = right_context(target, token, context_size=span, idx=idx)
        right_ngram = [token] + following

        left_ngram_order = self.check_lm(left_ngram, side='left')
        right_ngram_order = self.check_lm(right_ngram, side='right')

        # Three trigrams around the token: ending at it, the window that
        # extract_window returns for it, and starting at it.
        left_trigram = left_context(target, token, context_size=2, idx=idx) + [token]
        middle_trigram = extract_window(target, token, idx=idx)
        right_trigram = [token] + right_context(target, token, context_size=2, idx=idx)

        backoff_left = self.get_backoff(left_trigram)
        backoff_middle = self.get_backoff(middle_trigram)
        backoff_right = self.get_backoff(right_trigram)

        return [
            left_ngram_order,
            right_ngram_order,
            backoff_left,
            backoff_middle,
            backoff_right,
        ]
Esempio n. 3
0
    def get_features(self, context_obj):
        """Compute n-gram order and backoff features for a single token.

        ``context_obj`` must provide the keys ``'index'``, ``'target'``
        and ``'token'``.

        The result is a flat list
        ``[left_order, right_order, backoff_left, backoff_middle,
        backoff_right]`` where the two orders come from ``self.check_lm``
        and the three backoff values from ``self.get_backoff``.
        """
        idx = context_obj['index']
        sentence, word = context_obj['target'], context_obj['token']

        def ending_at_word(size):
            # ``size`` items of left context followed by the token itself.
            return left_context(sentence, word, context_size=size,
                                idx=idx) + [word]

        def starting_at_word(size):
            # The token itself followed by ``size`` items of right context.
            return [word] + right_context(sentence, word, context_size=size,
                                          idx=idx)

        # Both n-grams are built before either is passed to check_lm.
        ngram_before = ending_at_word(self.order - 1)
        ngram_after = starting_at_word(self.order - 1)
        order_before = self.check_lm(ngram_before, side='left')
        order_after = self.check_lm(ngram_after, side='right')

        trigrams = (
            ending_at_word(2),
            extract_window(sentence, word, idx=idx),
            starting_at_word(2),
        )
        # TODO: instead of _START_ there should be <s>

        # Backoff is computed in left, middle, right order.
        backoffs = [self.get_backoff(trigram) for trigram in trigrams]

        return [order_before, order_after] + backoffs