Example #1
0
 def test_get_transformer_no_type(self):
     # The config carries parameters but no 'type' key; the test expects
     # get_transformer to answer with None in that case.
     vectorizer_params = dict(
         ngram_range_min=1,
         ngram_range_max=3,
         min_df=10,
         max_df=20,
     )
     config = {'params': vectorizer_params}
     result = get_transformer(config)
     self.assertIsNone(result)
Example #2
0
 def test_get_transformer_unknown(self):
     # 'MyCustomVectorizer' is the unrecognised type name under test; the
     # test expects get_transformer to answer with None for it.
     vectorizer_params = dict(
         ngram_range_min=1,
         ngram_range_max=3,
         min_df=10,
         max_df=20,
     )
     config = {'type': 'MyCustomVectorizer', 'params': vectorizer_params}
     result = get_transformer(config)
     self.assertIsNone(result)
Example #3
0
 def test_get_transformer_word2vec(self):
     # A 'Word2Vec' config should yield a Word2VecVectorizer exposing the
     # configured parameters as attributes.
     config = {
         'type': 'Word2Vec',
         'params': dict(train_algorithm='skip-gram', min_count=3),
     }
     vectorizer = get_transformer(config)
     self.assertIsInstance(vectorizer, Word2VecVectorizer)

     # Checked in a fixed order, independent of the config dict above.
     expected_attributes = (
         ('train_algorithm', 'skip-gram'),
         ('min_count', 3),
     )
     for attribute, expected in expected_attributes:
         self.assertEqual(getattr(vectorizer, attribute), expected)
Example #4
0
 def test_get_transformer_tfidf(self):
     # A 'Tfidf' config should yield a TfidfVectorizer whose ngram_range
     # is the (min, max) pair taken from the two ngram_range_* params.
     config = {
         'type': 'Tfidf',
         'params': dict(ngram_range_min=1, ngram_range_max=3, min_df=10),
     }
     vectorizer = get_transformer(config)
     self.assertIsInstance(vectorizer, TfidfVectorizer)
     self.assertEqual(vectorizer.ngram_range, (1, 3))
     self.assertEqual(vectorizer.min_df, 10)
Example #5
0
    def __init__(self, config, is_file=True):
        """
        Build the transformer definition from a JSON configuration.

        The JSON document is expected to provide ``transformer-name``,
        ``type`` and ``field-name``; ``transformer``, ``params`` and
        ``input-format`` are optional (``input-format`` defaults to
        ``'plain'``).

        :param config: path to a JSON file when ``is_file`` is true,
            otherwise a string holding the JSON document itself.
        :param is_file: whether ``config`` is a file path (the default)
            or raw JSON text.
        :raises TransformerSchemaException: if the configuration cannot
            be read or parsed; if ``transformer-name``, ``type`` or
            ``field-name`` is missing; if ``transformer-name`` is blank;
            if ``type`` has no entry in ``FEATURE_TYPE_FACTORIES``; or if
            the factory fails to create the feature type.
        """
        try:
            if is_file:
                with open(config, 'r') as fp:
                    data = json.load(fp)
            else:
                data = json.loads(config)
        except (ValueError, IOError) as e:
            raise TransformerSchemaException(message='%s %s ' % (config, e),
                                             chain=e)

        if 'transformer-name' not in data:
            raise TransformerSchemaException(
                message="transformer-name is missing")

        # A name made only of whitespace is treated the same as no name.
        self.name = data['transformer-name'].strip(' \t\n\r')
        if not self.name:
            raise TransformerSchemaException(
                message="transformer-name is missing")

        # Report a missing 'type' as a schema error, like the other
        # schema problems in this constructor, not as a bare KeyError.
        if 'type' not in data:
            raise TransformerSchemaException(message="type is missing")
        type_name = data['type']
        self.type = type_name

        # Get transformer
        transformer_config = data.get('transformer', None)
        transformer_type = None
        if transformer_config is not None:
            transformer_type = transformer_config.get('type')
        transformer = get_transformer(transformer_config)

        factory = FEATURE_TYPE_FACTORIES.get(type_name, None)

        if factory is None:
            raise TransformerSchemaException('Unknown type: %s' %
                                             (type_name))

        try:
            feature_type = factory.get_instance(
                data.get('params', None), data.get('input-format', 'plain'))
        except Exception as e:
            # Any factory failure is reported as a schema problem; the
            # original exception is handed over as the second argument.
            raise TransformerSchemaException(
                'Feature type error: %s' % (type_name), e)

        # Checked at the point of use so that every error raised earlier
        # keeps its original precedence.
        if 'field-name' not in data:
            raise TransformerSchemaException(
                message="field-name is missing")

        self.feature = {
            'name': data['field-name'],
            'type': feature_type,
            'transformer-type': transformer_type,
            'transformer': transformer
        }
        self.voc_size = None
Example #6
0
 def test_get_transformer_count(self):
     # A 'Count' config should yield a CountVectorizer whose ngram_range
     # is the (min, max) pair taken from the two ngram_range_* params.
     vectorizer_params = dict(
         ngram_range_min=1,
         ngram_range_max=3,
         min_df=10,
         max_df=20,
     )
     config = {'type': 'Count', 'params': vectorizer_params}
     vectorizer = get_transformer(config)
     self.assertIsInstance(vectorizer, CountVectorizer)
     self.assertEqual(vectorizer.ngram_range, (1, 3))
     self.assertEqual(vectorizer.min_df, 10)
     self.assertEqual(vectorizer.max_df, 20)
Example #7
0
 def test_get_transformer_doc2vec(self):
     # A 'Doc2Vec' config should yield a Doc2VecVectorizer exposing the
     # configured parameters as attributes.
     config = {
         'type': 'Doc2Vec',
         'params': dict(
             train_algorithm='pv-dm',
             vector_size=400,
             min_count=5,
             iterations=6,
         ),
     }
     vectorizer = get_transformer(config)
     self.assertIsInstance(vectorizer, Doc2VecVectorizer)

     # Checked in a fixed order, independent of the config dict above.
     expected_attributes = (
         ('train_algorithm', 'pv-dm'),
         ('min_count', 5),
         ('vector_size', 400),
         ('iterations', 6),
     )
     for attribute, expected in expected_attributes:
         self.assertEqual(getattr(vectorizer, attribute), expected)
Example #8
0
 def test_ntile(self):
     # Build an Ntile transformer with four tiles, fit it on a sample and
     # check the tile assigned to each value, both for the fitted sample
     # and for a longer sequence containing values not seen during fit.
     config = {'type': 'Ntile', 'params': {'number_tile': 4}}
     tiler = get_transformer(config)
     self.assertIsInstance(tiler, Ntile)
     self.assertEqual(tiler.number_tile, 4)

     fitted_sample = [
         1, 2, 3, 7, 78, 8, 35235, 353, 3555, 3535, 3657, 6868, 865
     ]
     extended_sample = fitted_sample + [4, 66, 342323]
     expected_tiles = [1, 1, 1, 1, 2, 2, 4, 2, 3, 3, 4, 4, 3]
     expected_extended_tiles = expected_tiles + [1, 2, 4]

     def tile_labels(values):
         # Flatten the transform result into a plain list of tiles.
         transformed = tiler.transform(values)
         return transformed.transpose().todense().tolist()[0]

     tiler.fit(fitted_sample)
     self.assertEqual(tile_labels(fitted_sample), expected_tiles)
     self.assertEqual(tile_labels(extended_sample),
                      expected_extended_tiles)
Example #9
0
 def test_get_transformer_dict(self):
     # A 'Dictionary' config should yield a DictVectorizer that keeps the
     # configured separator.
     config = {
         'type': 'Dictionary',
         'params': {'separator': ','},
     }
     vectorizer = get_transformer(config)
     self.assertIsInstance(vectorizer, DictVectorizer)
     self.assertEqual(vectorizer.separator, ',')