Ejemplo n.º 1
0
    def get_features(self, context_obj):
        """Score the n-grams at both edges of the target span with the LM.

        The first element of ``context_obj['token']`` is extended with its
        left context and the last element with its right context, both taken
        from ``context_obj['target']``.

        Returns:
            list: ``[left_ngram_order, right_ngram_order, backoff_left,
            backoff_right]`` -- the ``check_lm`` results for the n-grams
            built with ``self.order - 1`` context tokens, followed by the
            ``get_backoff`` results for the trigrams.
        """
        span = context_obj['index']
        first_idx, last_idx = span[0], span[1]
        target = context_obj['target']
        phrase = context_obj['token']
        head = phrase[0]
        tail = phrase[-1]
        lm_context = self.order - 1

        # NOTE(review): the right context is anchored at index[1] itself,
        # not index[1] - 1 -- confirm that index[1] is the position of the
        # last phrase token.
        ngram_l = left_context(
            target, head, context_size=lm_context, idx=first_idx) + [head]
        ngram_r = [tail] + right_context(
            target, tail, context_size=lm_context, idx=last_idx)
        order_l = self.check_lm(ngram_l, side='left')
        order_r = self.check_lm(ngram_r, side='right')

        trigram_l = left_context(
            target, head, context_size=2, idx=first_idx) + [head]
        trigram_r = [tail] + right_context(
            target, tail, context_size=2, idx=last_idx)

        return [
            order_l,
            order_r,
            self.get_backoff(trigram_l),
            self.get_backoff(trigram_r),
        ]
    def get_features(self, context_obj):
        """Return POS tags around the source span and before the target span.

        Returns:
            list: the first item of the size-1 left context of the source
            span start, of the size-1 right context of the source span end
            (``source_index[1] - 1``), and of the size-1 left context of the
            target span start -- all looked up in the POS sequences.

        Raises:
            NoDataError: if ``'target_pos'`` or ``'source_pos'`` is missing
                from ``context_obj``.
        """
        for required in ('target_pos', 'source_pos'):
            if required not in context_obj:
                raise NoDataError(required, context_obj,
                                  'POSContextFeatureExtractor')

        src_tags = context_obj['source_pos']
        src_first = context_obj['source_index'][0]
        before_src = left_context(
            src_tags, src_tags[src_first], context_size=1, idx=src_first)

        src_last = context_obj['source_index'][1] - 1
        after_src = right_context(
            src_tags, src_tags[src_last], context_size=1, idx=src_last)

        tgt_tags = context_obj['target_pos']
        tgt_first = context_obj['index'][0]
        before_tgt = left_context(
            tgt_tags, tgt_tags[tgt_first], context_size=1, idx=tgt_first)

        return [before_src[0], after_src[0], before_tgt[0]]
Ejemplo n.º 3
0
    def get_features(self, context_obj):
        """Return the items bordering the source and target spans.

        Source contexts are only extracted when ``'source_token'`` is present
        and non-empty and ``'source_index'`` has more than one element;
        otherwise both source features are the empty string.

        Returns:
            list: ``[left_src, right_src, left_tg, right_tg]`` -- the first
            item of each size-1 context.
        """
        source_usable = ('source_token' in context_obj
                         and len(context_obj['source_token']) > 0
                         and len(context_obj['source_index']) > 1)
        if source_usable:
            try:
                left_src = left_context(context_obj['source'],
                                        context_obj['source_token'][0],
                                        context_size=1,
                                        idx=context_obj['source_index'][0])
            except IndexError:
                # Dump the offending source data, then stop the process.
                print(context_obj['source'])
                print(context_obj['source_token'])
                print(context_obj['source_index'])
                sys.exit()
            right_src = right_context(context_obj['source'],
                                      context_obj['source_token'][-1],
                                      context_size=1,
                                      idx=context_obj['source_index'][1] - 1)
        else:
            # One-element placeholders: the [0] lookups in the return
            # statement would raise IndexError on a bare empty string.
            left_src = [""]
            right_src = [""]
        left_tg = left_context(context_obj['target'],
                               context_obj['token'][0],
                               context_size=1,
                               idx=context_obj['index'][0])
        right_tg = right_context(context_obj['target'],
                                 context_obj['token'][-1],
                                 context_size=1,
                                 idx=context_obj['index'][1] - 1)

        return [left_src[0], right_src[0], left_tg[0], right_tg[0]]
Ejemplo n.º 4
0
    def get_features(self, context_obj):
        """Score the n-gram ending at the current token with the LM.

        Returns:
            list: ``[left_ngram_order, backoff_left]`` -- the ``check_lm``
            result for the token preceded by ``self.order - 1`` context
            tokens, and the ``get_backoff`` result for the token preceded by
            two context tokens.
        """
        position = context_obj['index']
        target = context_obj['target']
        word = context_obj['token']

        history = left_context(
            target, word, context_size=self.order - 1, idx=position)
        ngram_order = self.check_lm(history + [word], side='left')

        short_history = left_context(
            target, word, context_size=2, idx=position)
        backoff = self.get_backoff(short_history + [word])

        return [ngram_order, backoff]
Ejemplo n.º 5
0
    def get_features(self, context_obj):
        """Return the items bordering the source and target spans.

        Source contexts are extracted only when ``'source_token'`` is present
        in ``context_obj``; otherwise both source features are the empty
        string.

        Returns:
            list: ``[left_src, right_src, left_tg, right_tg]`` -- the first
            item of each size-1 context.
        """
        if 'source_token' in context_obj:
            left_src = left_context(context_obj['source'],
                                    context_obj['source_token'][0],
                                    context_size=1,
                                    idx=context_obj['source_index'][0])
            right_src = right_context(context_obj['source'],
                                      context_obj['source_token'][-1],
                                      context_size=1,
                                      idx=context_obj['source_index'][1] - 1)
        else:
            # One-element placeholders: the [0] lookups in the return
            # statement would raise IndexError on a bare empty string.
            left_src = [""]
            right_src = [""]
        left_tg = left_context(context_obj['target'],
                               context_obj['token'][0],
                               context_size=1,
                               idx=context_obj['index'][0])
        right_tg = right_context(context_obj['target'],
                                 context_obj['token'][-1],
                                 context_size=1,
                                 idx=context_obj['index'][1] - 1)

        return [left_src[0], right_src[0], left_tg[0], right_tg[0]]
    def get_features(self, context_obj):
        """Return POS tags around the source span and before the target span.

        Returns:
            list: the first item of the size-1 left context of the source
            span start, of the size-1 right context of the source span end
            (``source_index[1] - 1``), and of the size-1 left context of the
            target span start.

        Raises:
            NoDataError: if ``'target_pos'`` or ``'source_pos'`` is missing
                from ``context_obj``.
        """
        for needed in ('target_pos', 'source_pos'):
            if needed not in context_obj:
                raise NoDataError(needed, context_obj, 'POSContextFeatureExtractor')

        source_tags = context_obj['source_pos']
        span_start = context_obj['source_index'][0]
        src_before = left_context(
            source_tags, source_tags[span_start], context_size=1, idx=span_start)

        span_end = context_obj['source_index'][1] - 1
        src_after = right_context(
            source_tags, source_tags[span_end], context_size=1, idx=span_end)

        target_tags = context_obj['target_pos']
        tg_start = context_obj['index'][0]
        tg_before = left_context(
            target_tags, target_tags[tg_start], context_size=1, idx=tg_start)

        return [src_before[0], src_after[0], tg_before[0]]
    def get_features(self, context_obj):
        """Return the source border items and the item left of the target span.

        Source contexts are extracted only when ``'source_token'`` is present
        in ``context_obj``; otherwise both source features are the empty
        string.

        Returns:
            list: ``[left_src, right_src, left_tg]`` -- the first item of
            each size-1 context.
        """
        if 'source_token' in context_obj:
            left_src = left_context(context_obj['source'],
                                    context_obj['source_token'][0],
                                    context_size=1,
                                    idx=context_obj['source_index'][0])
            right_src = right_context(context_obj['source'],
                                      context_obj['source_token'][-1],
                                      context_size=1,
                                      idx=context_obj['source_index'][1] - 1)
        else:
            # One-element placeholders: the [0] lookups in the return
            # statement would raise IndexError on a bare empty string.
            left_src = [""]
            right_src = [""]
        left_tg = left_context(context_obj['target'],
                               context_obj['token'][0],
                               context_size=1,
                               idx=context_obj['index'][0])

        return [left_src[0], right_src[0], left_tg[0]]
    def get_features(self, context_obj):
        """Score the n-gram ending at the first token of the span with the LM.

        Only the left edge is used, so only ``context_obj['index'][0]`` is
        read from the span.

        Returns:
            list: ``[str(left_ngram_order), str(backoff_left)]`` -- the
            stringified ``check_lm`` result for the first token preceded by
            ``self.order - 1`` context tokens, and the stringified
            ``get_backoff`` result for the first token preceded by two
            context tokens.
        """
        idx_left = context_obj['index'][0]
        target = context_obj['target']
        first_token = context_obj['token'][0]

        left_ngram = left_context(target,
                                  first_token,
                                  context_size=self.order - 1,
                                  idx=idx_left) + [first_token]
        left_ngram_order = self.check_lm(left_ngram, side='left')

        left_trigram = left_context(target,
                                    first_token,
                                    context_size=2,
                                    idx=idx_left) + [first_token]
        backoff_left = self.get_backoff(left_trigram)

        return [str(left_ngram_order), str(backoff_left)]
Ejemplo n.º 9
0
    def get_features(self, context_obj):
        """Score the n-grams at both edges of the target span with the LM.

        Returns:
            list: ``[left_ngram_order, right_ngram_order, backoff_left,
            backoff_right]`` -- ``check_lm`` results for the n-grams built
            with ``self.order - 1`` context tokens, then ``get_backoff``
            results for the trigrams.
        """
        bounds = context_obj['index']
        lo, hi = bounds[0], bounds[1]
        sentence = context_obj['target']
        span_tokens = context_obj['token']
        opening = span_tokens[0]
        closing = span_tokens[-1]
        width = self.order - 1

        # NOTE(review): the right context is anchored at index[1] itself,
        # not index[1] - 1 -- confirm that index[1] is the position of the
        # last phrase token.
        long_left = left_context(sentence, opening, context_size=width, idx=lo) + [opening]
        long_right = [closing] + right_context(sentence, closing, context_size=width, idx=hi)
        order_left = self.check_lm(long_left, side='left')
        order_right = self.check_lm(long_right, side='right')

        tri_left = left_context(sentence, opening, context_size=2, idx=lo) + [opening]
        tri_right = [closing] + right_context(sentence, closing, context_size=2, idx=hi)

        return [
            order_left,
            order_right,
            self.get_backoff(tri_left),
            self.get_backoff(tri_right),
        ]
Ejemplo n.º 10
0
    def get_features(self, context_obj):
        """Score the source n-grams around the words aligned to the token.

        The aligned source positions are sorted and the contiguous slice from
        the smallest to the largest position is treated as one unit.

        Returns:
            list: ``[0, 0]`` when nothing is aligned to the token, otherwise
            ``[left_ngram_order, right_ngram_order]`` from ``check_lm``.

        Raises:
            NoDataError: if ``'source'`` or ``'alignments'`` is missing from
                ``context_obj``.
        """
        for required in ('source', 'alignments'):
            if required not in context_obj:
                raise NoDataError(required, context_obj, 'SourceLMFeatureExtractor')

        aligned = sorted(context_obj['alignments'][context_obj['index']])
        if not aligned:
            # unaligned
            return [0, 0]

        first, last = aligned[0], aligned[-1]
        source = context_obj['source']
        span_tokens = source[first:last + 1]
        # The slice already contributes (last - first) extra tokens, so the
        # requested context shrinks by that amount.
        # NOTE(review): this goes negative when last - first exceeds
        # self.order - 1 -- confirm the context helpers tolerate that.
        pad = self.order - 1 - (last - first)

        ngram_l = left_context(
            source, span_tokens[0], context_size=pad, idx=first) + span_tokens
        ngram_r = span_tokens + right_context(
            source, span_tokens[-1], context_size=pad, idx=last)

        return [
            self.check_lm(ngram_l, side='left'),
            self.check_lm(ngram_r, side='right'),
        ]
Ejemplo n.º 11
0
    def get_features(self, context_obj):
        """Build combined token/context/alignment/POS string features.

        Returns:
            list: four ``'|'``-joined strings -- token with its left context,
            token with its right context, token with its aligned source
            token, and target POS with source POS. Unaligned tokens use
            ``'__unaligned__'`` for both source values; an empty
            ``'target_pos'`` list yields ``''`` as the target POS.
        """
        word = context_obj['token']

        before = left_context(context_obj['target'],
                              word,
                              context_size=1,
                              idx=context_obj['index'])
        left = ' '.join(before)
        after = right_context(context_obj['target'],
                              word,
                              context_size=1,
                              idx=context_obj['index'])
        right = ' '.join(after)

        if context_obj['target_pos'] != []:
            tg_pos = context_obj['target_pos'][context_obj['index']]
        else:
            tg_pos = ''

        aligned = context_obj['alignments'][context_obj['index']]
        if aligned is not None:
            src_token = context_obj['source'][aligned]
            src_pos = context_obj['source_pos'][aligned]
        else:
            src_token = src_pos = '__unaligned__'

        return [
            '|'.join((word, left)),
            '|'.join((word, right)),
            '|'.join((word, src_token)),
            '|'.join((tg_pos, src_pos)),
        ]
Ejemplo n.º 12
0
    def get_features(self, context_obj):
        """Score the n-gram ending at the current token with the LM.

        Returns:
            list: ``[left_ngram_order, backoff_left]`` -- the ``check_lm``
            result for the token preceded by ``self.order - 1`` context
            tokens, and the ``get_backoff`` result for the token preceded by
            two context tokens.
        """
        where = context_obj['index']
        sentence = context_obj['target']
        current = context_obj['token']

        long_history = left_context(
            sentence, current, context_size=self.order - 1, idx=where)
        order_found = self.check_lm(long_history + [current], side='left')

        trigram_history = left_context(
            sentence, current, context_size=2, idx=where)
        backoff_found = self.get_backoff(trigram_history + [current])

        return [order_found, backoff_found]
Ejemplo n.º 13
0
    def get_features(self, context_obj):
        """Return the aligned source word and its left/right source contexts.

        Returns:
            list: ``[aligned_to, left, right]`` where ``left`` and ``right``
            are the ``'|'``-joined contexts of size ``self.context_size``.
            For an unaligned token (alignment entry is ``None``) every
            position is filled with ``'__unaligned__'``.

        Raises:
            NoDataError: if ``'source'`` or ``'target'`` is missing or
                ``None``, or if ``'alignments'`` is missing.
        """
        if 'source' not in context_obj or context_obj['source'] is None:
            raise NoDataError('source', context_obj, 'AlignmentFeatureExtractor')
        # 'source' has already been validated, so only 'target' is checked.
        if 'target' not in context_obj or context_obj['target'] is None:
            raise NoDataError('target', context_obj, 'AlignmentFeatureExtractor')
        if 'alignments' not in context_obj:
            raise NoDataError('alignments', context_obj, 'AlignmentFeatureExtractor')

        # source word(s)
        try:
            align_idx = context_obj['alignments'][context_obj['index']]
        except IndexError:
            print("{} items in the alignment, needed {}-th".format(len(context_obj['alignments']), context_obj['index']))
            print(context_obj['alignments'], context_obj['target'], context_obj['source'])
            # NOTE(review): sys.exit() without an argument ends the process
            # with status 0 even though this is a data error.
            sys.exit()

        # if word is unaligned - no source and no source contexts
        if align_idx is None:
            placeholder = '|'.join(['__unaligned__'] * self.context_size)
            return ['__unaligned__', placeholder, placeholder]

        # TODO: find contexts for all words aligned to the token (now only 1st word)
        source = context_obj['source']
        aligned_to = source[align_idx]
        left = '|'.join(left_context(source, aligned_to, context_size=self.context_size, idx=align_idx))
        right = '|'.join(right_context(source, aligned_to, context_size=self.context_size, idx=align_idx))

        return [aligned_to, left, right]
Ejemplo n.º 14
0
    def get_features(self, context_obj):
        """Score the n-grams around the current token with the LM.

        Returns:
            list: ``[left_ngram_order, right_ngram_order, backoff_left,
            backoff_middle, backoff_right]`` -- ``check_lm`` results for the
            n-grams built with ``self.order - 1`` context tokens on each
            side, then ``get_backoff`` results for the left trigram, the
            window from ``extract_window`` and the right trigram.
        """
        position = context_obj['index']
        sentence = context_obj['target']
        word = context_obj['token']
        width = self.order - 1

        ngram_l = left_context(sentence, word, context_size=width, idx=position) + [word]
        ngram_r = [word] + right_context(sentence, word, context_size=width, idx=position)
        order_l = self.check_lm(ngram_l, side='left')
        order_r = self.check_lm(ngram_r, side='right')

        trigram_l = left_context(sentence, word, context_size=2, idx=position) + [word]
        trigram_m = extract_window(sentence, word, idx=position)
        trigram_r = [word] + right_context(sentence, word, context_size=2, idx=position)

        backoffs = [self.get_backoff(gram) for gram in (trigram_l, trigram_m, trigram_r)]

        return [order_l, order_r] + backoffs
Ejemplo n.º 15
0
    def get_features(self, context_obj):
        """Return the items bordering the source and target spans.

        Source contexts are only extracted when ``'source_token'`` is present
        and non-empty and ``'source_index'`` has more than one element;
        otherwise both source features are the empty string.

        Returns:
            list: ``[left_src, right_src, left_tg, right_tg]`` -- the first
            item of each size-1 context.
        """
        has_source = ('source_token' in context_obj
                      and len(context_obj['source_token']) > 0
                      and len(context_obj['source_index']) > 1)
        if has_source:
            try:
                left_src = left_context(context_obj['source'], context_obj['source_token'][0], context_size=1, idx=context_obj['source_index'][0])
            except IndexError:
                # Dump the offending source data, then stop the process.
                print(context_obj['source'])
                print(context_obj['source_token'])
                print(context_obj['source_index'])
                sys.exit()
            right_src = right_context(context_obj['source'], context_obj['source_token'][-1], context_size=1, idx=context_obj['source_index'][1]-1)
        else:
            # One-element placeholders: the [0] lookups in the return
            # statement would raise IndexError on a bare empty string.
            left_src = [""]
            right_src = [""]
        left_tg = left_context(context_obj['target'], context_obj['token'][0], context_size=1, idx=context_obj['index'][0])
        right_tg = right_context(context_obj['target'], context_obj['token'][-1], context_size=1, idx=context_obj['index'][1]-1)

        return [left_src[0], right_src[0], left_tg[0], right_tg[0]]
 def get_features(self, context_obj):
     """Return the token and its space-joined left context.

     The left context holds ``self.context_size`` items requested from
     ``left_context`` on ``context_obj['target']``.
     """
     word = context_obj['token']
     preceding = left_context(context_obj['target'],
                              word,
                              context_size=self.context_size,
                              idx=context_obj['index'])
     return [word, ' '.join(preceding)]
    def get_features(self, context_obj):
        """Return one combined token / left-context / aligned-source feature.

        Returns:
            list: a single string ``token|left|aligned_to`` where ``left`` is
            the space-joined size-1 left context and ``aligned_to`` is the
            aligned source token, or ``'__unaligned__'`` when the alignment
            entry is ``None``.
        """
        word = context_obj['token']
        preceding = left_context(
            context_obj['target'], word, context_size=1, idx=context_obj['index'])
        left = ' '.join(preceding)

        source_pos = context_obj['alignments'][context_obj['index']]
        if source_pos is not None:
            aligned_to = context_obj['source'][source_pos]
        else:
            aligned_to = '__unaligned__'

        return ['|'.join((word, left, aligned_to))]
Ejemplo n.º 18
0
    def get_features(self, context_obj):
        """Score the n-gram ending at the first token of the span with the LM.

        Only the left edge is used, so only ``context_obj['index'][0]`` is
        read from the span.

        Returns:
            list: ``[str(left_ngram_order), str(backoff_left)]`` -- the
            stringified ``check_lm`` result for the first token preceded by
            ``self.order - 1`` context tokens, and the stringified
            ``get_backoff`` result for the first token preceded by two
            context tokens.
        """
        idx_left = context_obj['index'][0]
        target = context_obj['target']
        first_token = context_obj['token'][0]

        left_ngram = left_context(target,
                                  first_token,
                                  context_size=self.order - 1,
                                  idx=idx_left) + [first_token]
        left_ngram_order = self.check_lm(left_ngram, side='left')

        left_trigram = left_context(target,
                                    first_token,
                                    context_size=2,
                                    idx=idx_left) + [first_token]
        backoff_left = self.get_backoff(left_trigram)

        return [str(left_ngram_order), str(backoff_left)]
Ejemplo n.º 19
0
    def get_features(self, context_obj):
        """Build combined token/left-context/alignment/POS string features.

        Returns:
            list: three ``'|'``-joined strings -- token with its left
            context, token with its aligned source token, and target POS
            with source POS. Unaligned tokens use ``'__unaligned__'`` for
            both source values; an empty ``'target_pos'`` list yields ``''``
            as the target POS.
        """
        word = context_obj['token']
        preceding = left_context(
            context_obj['target'], word, context_size=1, idx=context_obj['index'])
        left = ' '.join(preceding)

        if context_obj['target_pos'] != []:
            tg_pos = context_obj['target_pos'][context_obj['index']]
        else:
            tg_pos = ''

        aligned = context_obj['alignments'][context_obj['index']]
        if aligned is not None:
            src_token = context_obj['source'][aligned]
            src_pos = context_obj['source_pos'][aligned]
        else:
            src_token = src_pos = '__unaligned__'

        return [
            '|'.join((word, left)),
            '|'.join((word, src_token)),
            '|'.join((tg_pos, src_pos)),
        ]
Ejemplo n.º 20
0
    def get_features(self, context_obj):
        """Score the n-grams around the current token with the LM.

        Returns:
            list: ``[left_ngram_order, right_ngram_order, backoff_left,
            backoff_middle, backoff_right]`` -- ``check_lm`` results for the
            n-grams built with ``self.order - 1`` context tokens on each
            side, then ``get_backoff`` results for the left trigram, the
            window from ``extract_window`` and the right trigram.
        """
        where = context_obj['index']
        sentence = context_obj['target']
        current = context_obj['token']
        lm_context = self.order - 1

        long_left = left_context(
            sentence, current, context_size=lm_context, idx=where) + [current]
        long_right = [current] + right_context(
            sentence, current, context_size=lm_context, idx=where)
        order_left = self.check_lm(long_left, side='left')
        order_right = self.check_lm(long_right, side='right')

        tri_left = left_context(
            sentence, current, context_size=2, idx=where) + [current]
        tri_middle = extract_window(sentence, current, idx=where)
        tri_right = [current] + right_context(
            sentence, current, context_size=2, idx=where)
        # TODO: instead of _START_ there should be <s>

        features = [order_left, order_right]
        for trigram in (tri_left, tri_middle, tri_right):
            features.append(self.get_backoff(trigram))
        return features
Ejemplo n.º 21
0
    def get_features(self, context_obj):
        """Return one combined token / left-context / aligned-source feature.

        Returns:
            list: a single string ``token|left|aligned_to`` where ``left`` is
            the space-joined size-1 left context and ``aligned_to`` is the
            aligned source token, or ``'__unaligned__'`` when the alignment
            entry is ``None``.
        """
        current = context_obj['token']
        left = ' '.join(
            left_context(context_obj['target'], current, context_size=1,
                         idx=context_obj['index']))

        link = context_obj['alignments'][context_obj['index']]
        aligned_to = ('__unaligned__' if link is None
                      else context_obj['source'][link])

        return ['|'.join((current, left, aligned_to))]
    def get_features(self, context_obj):
        """Return the aligned source word and its left/right source contexts.

        Returns:
            list: ``[aligned_to, left, right]`` where ``left`` and ``right``
            are the ``'|'``-joined contexts of size ``self.context_size``.
            For an unaligned token (alignment entry is ``None``) every
            position is filled with ``'__unaligned__'``.

        Raises:
            NoDataError: if ``'source'`` or ``'target'`` is missing or
                ``None``, or if ``'alignments'`` is missing.
        """
        if 'source' not in context_obj or context_obj['source'] is None:
            raise NoDataError('source', context_obj,
                              'AlignmentFeatureExtractor')
        # 'source' has already been validated, so only 'target' is checked.
        if 'target' not in context_obj or context_obj['target'] is None:
            raise NoDataError('target', context_obj,
                              'AlignmentFeatureExtractor')
        if 'alignments' not in context_obj:
            raise NoDataError('alignments', context_obj,
                              'AlignmentFeatureExtractor')

        # source word(s)
        try:
            align_idx = context_obj['alignments'][context_obj['index']]
        except IndexError:
            print("{} items in the alignment, needed {}-th".format(
                len(context_obj['alignments']), context_obj['index']))
            print(context_obj['alignments'], context_obj['target'],
                  context_obj['source'])
            # NOTE(review): sys.exit() without an argument ends the process
            # with status 0 even though this is a data error.
            sys.exit()

        # if word is unaligned - no source and no source contexts
        if align_idx is None:
            placeholder = '|'.join(['__unaligned__'] * self.context_size)
            return ['__unaligned__', placeholder, placeholder]

        # TODO: find contexts for all words aligned to the token (now only 1st word)
        source = context_obj['source']
        aligned_to = source[align_idx]
        left = '|'.join(
            left_context(source,
                         aligned_to,
                         context_size=self.context_size,
                         idx=align_idx))
        right = '|'.join(
            right_context(source,
                          aligned_to,
                          context_size=self.context_size,
                          idx=align_idx))

        return [aligned_to, left, right]
Ejemplo n.º 23
0
    def get_features(self, context_obj):
        """Score the source trigrams around the word aligned to the token.

        Returns:
            list: ``[0, 0]`` when the alignment entry is ``None``, otherwise
            ``[left_ngram_order, right_ngram_order]`` from ``check_lm`` on
            the aligned source token with two context items on each side.

        Raises:
            NoDataError: if ``'source'`` or ``'alignments'`` is missing from
                ``context_obj``.
        """
        for required in ('source', 'alignments'):
            if required not in context_obj:
                raise NoDataError(required, context_obj, 'SourceLMFeatureExtractor')

        source_pos = context_obj['alignments'][context_obj['index']]
        if source_pos is None:
            # unaligned
            return [0, 0]

        source = context_obj['source']
        pivot = source[source_pos]

        ngram_l = left_context(source, pivot, context_size=2, idx=source_pos) + [pivot]
        ngram_r = [pivot] + right_context(source, pivot, context_size=2, idx=source_pos)

        return [
            self.check_lm(ngram_l, side='left'),
            self.check_lm(ngram_r, side='right'),
        ]
    def get_features(self, context_obj):
        """Score the source trigrams around the word aligned to the token.

        Returns:
            list: ``[0, 0]`` when the alignment entry is ``None``, otherwise
            ``[left_ngram_order, right_ngram_order]`` from ``check_lm`` on
            the aligned source token with two context items on each side.

        Raises:
            NoDataError: if ``'source'`` or ``'alignments'`` is missing from
                ``context_obj``.
        """
        for needed in ('source', 'alignments'):
            if needed not in context_obj:
                raise NoDataError(needed, context_obj,
                                  'SourceLMFeatureExtractor')

        link = context_obj['alignments'][context_obj['index']]
        if link is None:
            # unaligned
            return [0, 0]

        sentence = context_obj['source']
        anchor = sentence[link]

        history = left_context(sentence, anchor, context_size=2, idx=link)
        future = right_context(sentence, anchor, context_size=2, idx=link)
        order_left = self.check_lm(history + [anchor], side='left')
        order_right = self.check_lm([anchor] + future, side='right')

        return [order_left, order_right]
Ejemplo n.º 25
0
    def get_features(self, context_obj):
        """Return the aligned source words and the contexts around them.

        When ``'alignments'`` is missing it is computed with
        ``align_sentence`` using ``self.model`` and stored back into
        ``context_obj``.

        Returns:
            list: ``[aligned_to, left, right]`` -- the ``'|'``-joined aligned
            source words (in sorted position order), the ``'|'``-joined left
            context of the first aligned word and the ``'|'``-joined right
            context of the last aligned word. When nothing is aligned, every
            position is filled with ``'__unaligned__'``.

        Raises:
            NoDataError: if ``'source'`` or ``'target'`` is missing or
                ``None``, or if ``'alignments'`` is missing and
                ``self.model`` is the empty string.
        """
        if 'source' not in context_obj or context_obj['source'] is None:
            raise NoDataError('source', context_obj, 'AlignmentFeatureExtractor')
        # 'source' cannot be None past this point, so only 'target' is checked.
        if 'target' not in context_obj or context_obj['target'] is None:
            raise NoDataError('target', context_obj, 'AlignmentFeatureExtractor')

        if 'alignments' not in context_obj:
            if self.model == '':
                raise NoDataError('alignments', context_obj, 'AlignmentFeatureExtractor')
            context_obj['alignments'] = align_sentence(context_obj['source'], context_obj['target'], self.model)

        # source word(s)
        aligned = sorted(context_obj['alignments'][context_obj['index']])
        if not aligned:
            # unaligned word: no source and no source contexts
            filler = '|'.join(['__unaligned__'] * self.context_size)
            return ['__unaligned__', filler, filler]

        source = context_obj['source']
        first, last = aligned[0], aligned[-1]
        left = '|'.join(
            left_context(source, source[first], context_size=self.context_size, idx=first))
        right = '|'.join(
            right_context(source, source[last], context_size=self.context_size, idx=last))
        aligned_to = '|'.join([source[pos] for pos in aligned])

        return [aligned_to, left, right]
 def test_left_context(self):
     """Check the left context of 'is' when three items are requested."""
     tokens = word_tokenize('this is a test sentence.'.lower())
     observed = ngram_window_extractor.left_context(tokens, 'is', context_size=3)
     expected = ['_START_', '_START_', 'this']
     self.assertListEqual(observed, expected, 'left_context should prepend _START_ tokens')
 def get_features(self, context_obj):
     """Return the token and its space-joined left context.

     The left context holds ``self.context_size`` items requested from
     ``left_context`` on ``context_obj['target']``.
     """
     current = context_obj['token']
     history = left_context(
         context_obj['target'],
         current,
         context_size=self.context_size,
         idx=context_obj['index'])
     return [current, ' '.join(history)]