def get_features(self, context_obj):
        """Return POS-context features around the source and target spans.

        Three lookups are made, each with ``context_size=1``:

        * ``left_context`` on ``source_pos`` at ``source_index[0]``;
        * ``right_context`` on ``source_pos`` at ``source_index[1] - 1``;
        * ``left_context`` on ``target_pos`` at ``index[0]``.

        No right-hand target context is extracted.

        :param context_obj: mapping that must hold 'target_pos' and
            'source_pos'; 'source_index' and 'index' are read as well.
        :return: list with the first item of each of the three contexts,
            in the order listed above.
        :raises NoDataError: if 'target_pos' or 'source_pos' is missing.
        """
        for required in ('target_pos', 'source_pos'):
            if required not in context_obj:
                raise NoDataError(required, context_obj,
                                  'POSContextFeatureExtractor')

        source_tags = context_obj['source_pos']

        # Left neighbour of the first position of the source span.
        src_first = context_obj['source_index'][0]
        left_src = left_context(source_tags, source_tags[src_first],
                                context_size=1, idx=src_first)

        # Right neighbour of the last position of the source span
        # (source_index[1] is used as an exclusive bound here).
        src_last = context_obj['source_index'][1] - 1
        right_src = right_context(source_tags, source_tags[src_last],
                                  context_size=1, idx=src_last)

        # Left neighbour of the first position of the target span.
        target_tags = context_obj['target_pos']
        tg_first = context_obj['index'][0]
        left_tg = left_context(target_tags, target_tags[tg_first],
                               context_size=1, idx=tg_first)

        return [left_src[0], right_src[0], left_tg[0]]
    def get_features(self, context_obj):
        """Return the POS tag of the target word and of its aligned source word.

        If 'target_pos' / 'source_pos' is absent it is produced by
        ``self._call_tagger`` from 'target' / 'source' and stored back into
        ``context_obj``.

        :param context_obj: mapping describing the current word; 'index' is
            the position looked up in 'target_pos' and in 'alignments'.
        :return: ``[target_tag, source_tag]`` where

            * ``target_tag`` is ``''`` when 'target_pos' is an empty list;
            * ``source_tag`` is ``'__unaligned__'`` when the alignment entry
              is None, and ``[]`` when 'source_pos' is an empty list or
              there is no 'alignments' entry.
        :raises NoDataError: if the tags are missing and cannot be computed
            because the corresponding sentence is absent or None.
        """
        if 'target_pos' not in context_obj:
            if 'target' not in context_obj or context_obj['target'] is None:
                raise NoDataError('target_pos', context_obj, 'POSFeatureExtractor')
            context_obj['target_pos'] = self._call_tagger(context_obj['target'])
        if 'source_pos' not in context_obj:
            if 'source' not in context_obj or context_obj['source'] is None:
                raise NoDataError('source_pos', context_obj, 'POSFeatureExtractor')
            context_obj['source_pos'] = self._call_tagger(context_obj['source'], lang='src')

        # Target-side tag.
        target_tags = context_obj['target_pos']
        if target_tags != []:
            target_tag = target_tags[context_obj['index']]
        else:
            target_tag = ''

        # Source-side tag; stays an empty list when it cannot be looked up.
        source_tag = []
        source_tags = context_obj['source_pos']
        if source_tags != [] and 'alignments' in context_obj:
            aligned = context_obj['alignments'][context_obj['index']]
            if aligned is None:
                source_tag = '__unaligned__'
            else:
                source_tag = source_tags[aligned]

        return [target_tag, source_tag]
Beispiel #3
0
    def get_features(self, context_obj):
        """Compute alignment statistics for a target phrase.

        The phrase covers target positions ``index[0]`` (inclusive) to
        ``index[1]`` (exclusive). For each position the number of aligned
        source words is taken from 'alignments_all'. When that entry is
        absent it is derived from 'alignments' and cached in
        ``context_obj``: an alignment of None (unaligned word) becomes an
        empty list, any other value a one-element list.

        :param context_obj: mapping with 'source', 'target', 'token',
            'index' and either 'alignments_all' or 'alignments'.
        :return: list of three strings: the fraction of unaligned words,
            the fraction of words with more than one alignment (both
            relative to ``len(token)``), and the average number of
            alignments per word.
        :raises NoDataError: if 'source' or 'target' is missing or None.
        """
        if 'source' not in context_obj or context_obj['source'] is None:
            raise NoDataError('source', context_obj,
                              'AlignmentFeatureExtractor')
        if 'target' not in context_obj or context_obj['target'] is None:
            raise NoDataError('target', context_obj,
                              'AlignmentFeatureExtractor')

        if 'alignments_all' not in context_obj:
            # An unaligned word (None) must yield an EMPTY alignment list;
            # wrapping it as [None] would count it as aligned to one word.
            context_obj['alignments_all'] = [
                [] if src_idx is None else [src_idx]
                for src_idx in context_obj['alignments']
            ]

        start, end = context_obj['index'][0], context_obj['index'][1]

        # Sanity check: the phrase tokens must match the target span.
        # It does not depend on the loop variable, so it is done once.
        span_tokens = [context_obj['target'][j] for j in range(start, end)]
        assert all(
            w == ww for (w, ww) in zip(context_obj['token'], span_tokens)
        ), "Assertion failed"

        n_alignments = [
            len(context_obj['alignments_all'][i]) for i in range(start, end)
        ]
        n_unaligned = sum(1 for count in n_alignments if count == 0)
        n_multiple = sum(1 for count in n_alignments if count > 1)

        tg_len = len(context_obj['token'])
        return [
            str(n_unaligned / tg_len),
            str(n_multiple / tg_len),
            str(np.average(n_alignments))
        ]
    def get_features(self, context_obj):
        """Return the aligned source word and its left/right source contexts.

        The alignment of the current target word is read from
        ``context_obj['alignments'][context_obj['index']]``.

        :param context_obj: mapping with 'source', 'target', 'alignments'
            and 'index'.
        :return: ``[aligned_word, left, right]`` where ``left`` and
            ``right`` are the '|'-joined results of ``left_context`` /
            ``right_context`` with ``self.context_size``. For an unaligned
            word (alignment is None) every slot is filled with
            '__unaligned__' placeholders instead.
        :raises NoDataError: if 'source' or 'target' is missing or None, or
            if 'alignments' is missing.
        :raises IndexError: if 'index' is outside 'alignments'; the message
            carries the alignment, target and source for diagnosis.
        """
        if 'source' not in context_obj or context_obj['source'] is None:
            raise NoDataError('source', context_obj,
                              'AlignmentFeatureExtractor')
        if 'target' not in context_obj or context_obj['target'] is None:
            raise NoDataError('target', context_obj,
                              'AlignmentFeatureExtractor')
        if 'alignments' not in context_obj:
            raise NoDataError('alignments', context_obj,
                              'AlignmentFeatureExtractor')

        try:
            align_idx = context_obj['alignments'][context_obj['index']]
        except IndexError:
            # Report the problem to the caller instead of printing and
            # terminating the whole process with a success exit status.
            raise IndexError(
                "{} items in the alignment, needed {}-th "
                "(alignments: {}, target: {}, source: {})".format(
                    len(context_obj['alignments']), context_obj['index'],
                    context_obj['alignments'], context_obj['target'],
                    context_obj['source']))

        # If the word is unaligned there is no source word and no contexts.
        if align_idx is None:
            placeholder = '|'.join(['__unaligned__'] * self.context_size)
            return ['__unaligned__', placeholder, placeholder]

        # TODO: find contexts for all words aligned to the token
        # (now only 1st word)
        source = context_obj['source']
        aligned_to = source[align_idx]
        left = '|'.join(
            left_context(source, aligned_to,
                         context_size=self.context_size, idx=align_idx))
        right = '|'.join(
            right_context(source, aligned_to,
                          context_size=self.context_size, idx=align_idx))
        return [aligned_to, left, right]
    def get_features(self, context_obj):
        """Tell whether the current token occurs in its pseudo-reference.

        The pseudo-reference is looked up in ``self.pseudo_references``
        under ``context_obj['sentence_id']``.

        :param context_obj: mapping with 'sentence_id' and 'token'.
        :return: ``[1]`` if the token is contained in the pseudo-reference,
            ``[0]`` otherwise.
        :raises NoDataError: if 'sentence_id' is missing.
        """
        if 'sentence_id' not in context_obj:
            raise NoDataError('sentence_id', context_obj,
                              'PseudoReferenceFeatureExtractor')

        word = context_obj['token']
        reference = self.pseudo_references[context_obj['sentence_id']]
        return [int(word in reference)]
Beispiel #6
0
    def get_features(self, context_obj):
        """Return the tag that precedes the current position.

        :param context_obj: mapping with 'sequence_tags' and 'index'.
        :return: one-element list holding ``sequence_tags[index - 1]``, or
            ``'_START_'`` when 'index' is 0.
        :raises NoDataError: if 'sequence_tags' is missing.
        """
        if 'sequence_tags' not in context_obj:
            raise NoDataError('sequence_tags', context_obj, 'PreviousTagFeatureExtractor')

        position = context_obj['index']
        # The first position has no predecessor: emit the start marker.
        if position == 0:
            previous_tag = '_START_'
        else:
            previous_tag = context_obj['sequence_tags'][position - 1]
        return [previous_tag]
    def get_features(self, context_obj):
        """Return language-model features for the aligned source word.

        Two n-grams are built around the source word aligned to the current
        target word: its left context (``context_size=2``) followed by the
        word, and the word followed by its right context. Each is passed to
        ``self.check_lm`` with ``side='left'`` / ``side='right'``.

        :param context_obj: mapping with 'source', 'alignments' and 'index'.
        :return: ``[left_result, right_result]`` from ``self.check_lm``, or
            ``[0, 0]`` when the word is unaligned (alignment is None).
        :raises NoDataError: if 'source' or 'alignments' is missing.
        """
        for required in ('source', 'alignments'):
            if required not in context_obj:
                raise NoDataError(required, context_obj,
                                  'SourceLMFeatureExtractor')

        source_position = context_obj['alignments'][context_obj['index']]
        if source_position is None:
            # Unaligned word: nothing to look up on the source side.
            return [0, 0]

        source = context_obj['source']
        word = source[source_position]

        preceding = left_context(
            source, word, context_size=2, idx=source_position) + [word]
        following = [word] + right_context(
            source, word, context_size=2, idx=source_position)

        left_result = self.check_lm(preceding, side='left')
        right_result = self.check_lm(following, side='right')
        return [left_result, right_result]
    def get_features(self, context_obj):
        """Tell whether the current token occurs in a reference translation.

        The translation is taken from ``context_obj['pseudo-reference']``
        when present; otherwise the space-joined 'source' is translated
        with a new ``Goslate`` instance into ``self.lang`` and the result
        is split with ``word_tokenize``.

        :param context_obj: mapping with 'source' and 'token', optionally
            'pseudo-reference'.
        :return: ``[1]`` if the token is contained in the translation,
            ``[0]`` otherwise.
        :raises NoDataError: if 'source' is missing.
        """
        if 'source' not in context_obj:
            raise NoDataError('source', context_obj,
                              'GoogleTranslateFeatureExtractor')

        if 'pseudo-reference' not in context_obj:
            translator = Goslate()
            source_sentence = ' '.join(context_obj['source'])
            translation = word_tokenize(
                translator.translate(source_sentence, self.lang))
        else:
            translation = context_obj['pseudo-reference']

        return [1] if context_obj['token'] in translation else [0]