Ejemplo n.º 1
0
    def test_extract_all_ngrams(self):
        """Check uni-grams, bi-grams and 1-skip bi-grams over ``'#abcd#'``.

        The extractor is configured for n-gram sizes 1 to 2 with
        ``skip_size=1`` and empty ``gap``/``bow``/``eow`` strings, so the
        expected features carry no gap or boundary symbols other than the
        ``#`` characters that are already part of the input.
        """
        ext = NGramExtractor(min_ngram_size=1,
                             max_ngram_size=2,
                             skip_size=1,
                             gap='',
                             bow='',
                             eow='')

        expected = {
            # Contiguous bi-grams.
            tuple('#a'),
            tuple('ab'),
            tuple('bc'),
            tuple('cd'),
            tuple('d#'),
            # Bi-grams that skip exactly one character (gap symbol is empty).
            tuple('#b'),
            tuple('ac'),
            tuple('bd'),
            tuple('c#'),
            # Uni-grams; '#' occurs twice in the input but only once in a set.
            tuple('#'),
            tuple('a'),
            tuple('b'),
            tuple('c'),
            tuple('d'),
        }

        # assertEqual replaces the assertEquals alias removed in Python 3.12.
        self.assertEqual(ext.extractFeaturesFromDatapoint('#abcd#'), expected)
Ejemplo n.º 2
0
    def test_extract_1_skip_bi_grams_with_gap_and_ngrampadding_car(self):
        """Check padded bi-grams and 1-skip bi-grams extracted from ``'car'``.

        With ``pad_ngrams=True`` the expected contiguous bi-grams have the
        ``'$'`` boundary symbol fused onto their outer element (``'$c'``,
        ``'r$'``), while the skip-gram carries the ``'|'`` gap symbol as its
        own element and no boundary symbol.
        """
        ext = NGramExtractor(min_ngram_size=2,
                             max_ngram_size=2,
                             skip_size=1,
                             gap='|',
                             bow='$',
                             eow='$',
                             pad_ngrams=True)

        expected = {('$c', 'a'), ('a', 'r$'), ('c', '|', 'r')}

        # assertEqual replaces the assertEquals alias removed in Python 3.12.
        self.assertEqual(ext.extractFeaturesFromDatapoint('car'), expected)
Ejemplo n.º 3
0
    def test_extract_tri_grams(self):
        """Check plain tri-gram extraction from ``self.data_point``.

        No skipping (``skip_size=0``) and empty ``bow``/``eow`` strings are
        used, so only contiguous three-element tuples are expected.

        NOTE(review): ``self.data_point`` is defined outside this method;
        judging by the expected value it is presumably the token sequence
        "insurgents killed in ongoing fighting" -- confirm against the
        fixture set-up.
        """
        ext = NGramExtractor(min_ngram_size=3,
                             max_ngram_size=3,
                             skip_size=0,
                             bow='',
                             eow='')

        expected = {
            ('insurgents', 'killed', 'in'),
            ('killed', 'in', 'ongoing'),
            ('in', 'ongoing', 'fighting'),
        }

        # assertEqual replaces the assertEquals alias removed in Python 3.12.
        self.assertEqual(ext.extractFeaturesFromDatapoint(self.data_point),
                         expected)
Ejemplo n.º 4
0
    def test_extract_1_skip_bi_grams_with_gap_and_ngrampadding_love(self):
        """Check padded bi-grams and 1-skip bi-grams extracted from ``'love'``.

        With ``pad_ngrams=True`` the expected first and last contiguous
        bi-grams have the ``'$'`` boundary symbol fused onto their outer
        element (``'$l'``, ``'e$'``); the skip-grams carry the ``'|'`` gap
        symbol as a separate middle element.
        """
        ext = NGramExtractor(min_ngram_size=2,
                             max_ngram_size=2,
                             skip_size=1,
                             gap='|',
                             bow='$',
                             eow='$',
                             pad_ngrams=True)

        expected = {
            # Contiguous bi-grams, padded at the word boundaries.
            ('$l', 'o'),
            ('o', 'v'),
            ('v', 'e$'),
            # Bi-grams skipping one character, marked with the gap symbol.
            ('l', '|', 'v'),
            ('o', '|', 'e'),
        }

        # assertEqual replaces the assertEquals alias removed in Python 3.12.
        self.assertEqual(ext.extractFeaturesFromDatapoint('love'), expected)
Ejemplo n.º 5
0
    def test_padding_sequence(self):
        """Check bi-grams when the sequence itself is padded.

        With ``pad_ngrams=False`` the expected bi-grams contain ``'<BOS>'``
        and ``'<EOS>'`` as stand-alone elements before the first and after
        the last token, rather than fused onto a token.

        NOTE(review): ``self.data_point`` is defined outside this method;
        judging by the expected value it is presumably the token sequence
        "insurgents killed in ongoing fighting" -- confirm against the
        fixture set-up.
        """
        ext = NGramExtractor(min_ngram_size=2,
                             max_ngram_size=2,
                             skip_size=0,
                             bow='<BOS>',
                             eow='<EOS>',
                             pad_ngrams=False)

        expected = {
            ('<BOS>', 'insurgents'),
            ('insurgents', 'killed'),
            ('killed', 'in'),
            ('in', 'ongoing'),
            ('ongoing', 'fighting'),
            ('fighting', '<EOS>'),
        }

        # assertEqual replaces the assertEquals alias removed in Python 3.12.
        self.assertEqual(ext.extractFeaturesFromDatapoint(self.data_point),
                         expected)
Ejemplo n.º 6
0
    def create(dictionary: set = None,
               sim_thresh=0.01,
               feature_extractor: FeatureExtractorMixin = None,
               add_similarity=False):
        """Build a ``ProxinetteGenerator``, defaulting the feature extractor.

        Args:
            dictionary: Forwarded unchanged to ``ProxinetteGenerator``.
            sim_thresh: Forwarded unchanged to ``ProxinetteGenerator``.
            feature_extractor: Extractor to use. When ``None``, an
                ``NGramExtractor`` covering every n-gram of size 3 or more
                (no skips, ``'$'`` as begin/end symbol) is created.
            add_similarity: Forwarded unchanged to ``ProxinetteGenerator``.

        Returns:
            The newly constructed ``ProxinetteGenerator``.
        """
        extractor = feature_extractor
        if extractor is None:
            # Default configuration: unbounded n-gram length, starting at 3.
            default_settings = {
                'min_ngram_size': 3,
                'max_ngram_size': float('inf'),
                'skip_size': 0,
                'gap': '',
                'bow': '$',
                'eow': '$',
            }
            extractor = NGramExtractor(**default_settings)

        generator = ProxinetteGenerator(extractor, dictionary, sim_thresh,
                                        add_similarity)
        return generator
Ejemplo n.º 7
0
    def create(cls,
               dictionary: set = None,
               sim_thresh=0.2,
               feature_extractor: FeatureExtractorMixin = None,
               add_similarity=False):
        """Build an instance of ``cls``, defaulting the feature extractor.

        Args:
            dictionary: Forwarded unchanged to the ``cls`` constructor.
            sim_thresh: Forwarded unchanged to the ``cls`` constructor.
            feature_extractor: Extractor to use. When ``None``, an
                ``NGramExtractor`` producing bi-grams with a skip size of 1
                (``'|'`` as gap symbol, ``'$'`` as begin/end symbol,
                ``pad_ngrams=False``) is created.
            add_similarity: Forwarded unchanged to the ``cls`` constructor.

        Returns:
            The object returned by calling ``cls`` with the four arguments.
        """
        extractor = feature_extractor
        if extractor is None:
            # Default configuration: bi-grams, allowing one skipped element.
            default_settings = {
                'min_ngram_size': 2,
                'max_ngram_size': 2,
                'skip_size': 1,
                'gap': '|',
                'bow': '$',
                'eow': '$',
                'pad_ngrams': False,
            }
            extractor = NGramExtractor(**default_settings)

        instance = cls(extractor, dictionary, sim_thresh, add_similarity)
        return instance
Ejemplo n.º 8
0
    def test_feature_extractor_cache_with_ngram(self):
        """Check bi-gram extraction after ``setFeatureCache()`` was called.

        Two different datapoints made of the same characters are extracted
        one after the other; each must yield its own bi-gram set.

        NOTE(review): presumably this guards against the cache returning the
        first datapoint's features for the second one -- confirm against the
        ``setFeatureCache`` implementation.
        """
        ext = NGramExtractor(min_ngram_size=2,
                             max_ngram_size=2,
                             bow='',
                             eow='')
        ext.setFeatureCache()

        # assertEqual replaces the assertEquals alias removed in Python 3.12.
        self.assertEqual(
            ext.extractFeaturesFromDatapoint('bcdea'),
            {('b', 'c'), ('c', 'd'), ('d', 'e'), ('e', 'a')})
        self.assertEqual(
            ext.extractFeaturesFromDatapoint('abcde'),
            {('a', 'b'), ('b', 'c'), ('c', 'd'), ('d', 'e')})
Ejemplo n.º 9
0
    def test_extract_all_ngrams_with_padding(self):
        """Check every n-gram of size >= 3 from ``'comparable'`` with padding.

        The extractor uses ``'#'`` as begin and end symbol, no skipping and
        no upper bound on the n-gram size (``float('inf')``). The expected
        set is every contiguous run of three or more characters of
        ``'#comparable#'``, listed from longest to shortest.
        """
        ext = NGramExtractor(min_ngram_size=3,
                             max_ngram_size=float('inf'),
                             skip_size=0,
                             gap='',
                             bow='#',
                             eow='#')

        expected = {
            # Length 12 (the whole padded word).
            tuple('#comparable#'),
            # Length 11.
            tuple('#comparable'),
            tuple('comparable#'),
            # Length 10.
            tuple('#comparabl'),
            tuple('comparable'),
            tuple('omparable#'),
            # Length 9.
            tuple('#comparab'),
            tuple('comparabl'),
            tuple('omparable'),
            tuple('mparable#'),
            # Length 8.
            tuple('#compara'),
            tuple('comparab'),
            tuple('omparabl'),
            tuple('mparable'),
            tuple('parable#'),
            # Length 7.
            tuple('#compar'),
            tuple('compara'),
            tuple('omparab'),
            tuple('mparabl'),
            tuple('parable'),
            tuple('arable#'),
            # Length 6.
            tuple('#compa'),
            tuple('compar'),
            tuple('ompara'),
            tuple('mparab'),
            tuple('parabl'),
            tuple('arable'),
            tuple('rable#'),
            # Length 5.
            tuple('#comp'),
            tuple('compa'),
            tuple('ompar'),
            tuple('mpara'),
            tuple('parab'),
            tuple('arabl'),
            tuple('rable'),
            tuple('able#'),
            # Length 4.
            tuple('#com'),
            tuple('comp'),
            tuple('ompa'),
            tuple('mpar'),
            tuple('para'),
            tuple('arab'),
            tuple('rabl'),
            tuple('able'),
            tuple('ble#'),
            # Length 3.
            tuple('#co'),
            tuple('com'),
            tuple('omp'),
            tuple('mpa'),
            tuple('par'),
            tuple('ara'),
            tuple('rab'),
            tuple('abl'),
            tuple('ble'),
            tuple('le#'),
        }

        # assertEqual replaces the assertEquals alias removed in Python 3.12.
        self.assertEqual(ext.extractFeaturesFromDatapoint('comparable'),
                         expected)