Ejemplo n.º 1
0
    def test_extract_mismatch_ngrams(self):
        """Check the features extracted when ``only_mismatch_ngrams=True``.

        The features returned for ``self.data_point`` are compared as a set,
        so ordering and duplicates in the extractor output are ignored.
        """
        ext = SurfaceExtractor(only_mismatch_ngrams=True)
        expected = {
            ('ft', ),
            ('$$', 'ft'),
            ('ft', 'ee'),
            ('$$', 'ft', 'ee'),
            ('ft', 'ee', 'ss'),
        }
        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.
        self.assertEqual(
            set(ext.extractFeaturesFromDatapoint(self.data_point)), expected)
Ejemplo n.º 2
0
    def test_feature_extractor_cache_with_key(self):
        """Exercise the feature cache when it is set with the 'ngrams' key.

        Three situations are checked for ``self.data_point``:

        1. a pre-populated cache entry is returned as-is,
        2. with an empty keyed cache the features are extracted normally,
        3. a repeated extraction yields the same features again.
        """
        ext = SurfaceExtractor()
        expected = {
            ('ft', ),
            ('$$', 'ft'),
            ('ft', 'ee'),
            ('$$', 'ft', 'ee'),
            ('ft', 'ee', 'ss'),
        }

        ## test cache hit
        # The cached entry for this data point is a dict holding the value
        # under the same 'ngrams' key that is passed to setFeatureCache.
        ext.setFeatureCache(
            {ext._getDataKey(self.data_point): {
                 'ngrams': 'a'
             }}, 'ngrams')

        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.
        self.assertEqual(ext.extractFeaturesFromDatapoint(self.data_point),
                         'a')

        ## test empty cache
        ext.setFeatureCache(key='ngrams')
        self.assertEqual(
            set(ext.extractFeaturesFromDatapoint(self.data_point)), expected)

        ## now with cache hit
        # NOTE(review): presumably the previous call populated the cache, so
        # this one is served from it -- confirm against SurfaceExtractor.
        self.assertEqual(
            set(ext.extractFeaturesFromDatapoint(self.data_point)), expected)
Ejemplo n.º 3
0
    def __init__(self, classifier=None, feature_extractors=None):
        """Store the classifier and the feature extractors to use.

        Args:
            classifier: Object stored as ``self.classifier``. When ``None``,
                a fresh ``SVC()`` is created.
            feature_extractors: Value stored as ``self.feature_extractors``.
                When ``None``, defaults to a single-entry list
                ``[('surface', SurfaceExtractor())]``.
        """
        # Defaults are built lazily here (not in the signature) so that each
        # instance gets its own objects.
        self.classifier = SVC() if classifier is None else classifier
        self.feature_extractors = (
            [('surface', SurfaceExtractor())]
            if feature_extractors is None
            else feature_extractors
        )
Ejemplo n.º 4
0
    def test_feature_extractor_cache(self):
        """Exercise the feature cache when no cache key is used.

        First a pre-populated cache entry must be returned as-is; then, after
        resetting the cache, extraction must produce the expected features and
        leave them stored in ``ext.feature_cache`` under the data point's key.
        """
        ext = SurfaceExtractor()
        data_key = ext._getDataKey(self.data_point)
        expected = {
            ('ft', ),
            ('$$', 'ft'),
            ('ft', 'ee'),
            ('$$', 'ft', 'ee'),
            ('ft', 'ee', 'ss'),
        }

        ## test cache hit
        ext.setFeatureCache({data_key: 'a'})

        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.
        self.assertEqual(ext.extractFeaturesFromDatapoint(self.data_point),
                         'a')

        ## test empty cache
        ext.setFeatureCache()
        self.assertEqual(
            set(ext.extractFeaturesFromDatapoint(self.data_point)), expected)

        # The cache must now hold exactly one entry for this data point;
        # values are converted to sets so ordering does not matter.
        self.assertEqual(
            {key: set(value)
             for key, value in ext.feature_cache.items()},
            {ext._getDataKey(self.data_point): expected})
Ejemplo n.º 5
0
    def test_feature_extractor_pickle(self):
        """Check that ``__getstate__`` does not expose the feature cache.

        A cache is installed first so that there is something to leave out;
        the state returned by ``__getstate__`` must not contain the
        ``'feature_cache'`` entry.
        """
        ext = SurfaceExtractor()

        ext.setFeatureCache(
            {ext._getDataKey(self.data_point): {
                 'ngrams': 'a'
             }}, 'ngrams')

        # assertNotIn reports the offending container on failure, unlike
        # assertTrue(... not in ...), which only says "False is not true".
        self.assertNotIn('feature_cache', ext.__getstate__())
Ejemplo n.º 6
0
    def test_extract_all_bigrams(self):
        """Check extraction with ``min_ngram_size=2`` and ``max_ngram_size=2``.

        Only the two-element tuples are expected for ``self.data_point``;
        the result is compared as a set, so ordering is ignored.
        """
        ext = SurfaceExtractor(min_ngram_size=2, max_ngram_size=2)
        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.
        self.assertEqual(
            set(ext.extractFeaturesFromDatapoint(self.data_point)),
            {('$$', 'ft'), ('ft', 'ee')})
Ejemplo n.º 7
0
    def test_extract_all_unigrams(self):
        """Check extraction with ``max_ngram_size=1``.

        Only the single-element tuple ``('ft', )`` is expected for
        ``self.data_point``; the result is compared as a set.
        """
        ext = SurfaceExtractor(max_ngram_size=1)
        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.
        self.assertEqual(
            set(ext.extractFeaturesFromDatapoint(self.data_point)),
            {('ft', )})