Exemple #1
0
    def __init__(self,
                 evaluation_data_perc: int,
                 data_source: str,
                 random_seed: Optional[int] = None):
        """ Split data already set on the ml model, using either the training
        or evaluation data as source. The chosen source is split and loaded
        back into the training and evaluation data of the model.

        :param evaluation_data_perc: Percentage of the source data that is cut
                                     off and set as the ml model's evaluation
                                     data.
        :param data_source:          Data used to split.
        :param random_seed:          If set, this seed is used to shuffle the
                                     data before splitting; run multiple models
                                     with different seeds to achieve k-fold
                                     evaluation.
        """
        super().__init__(data_source, random_seed)
        # Shuffling is enabled exactly when a seed was supplied.
        shuffle = random_seed is not None
        self.randomize = DataTypeSpecification('splitter_randomize', shuffle,
                                               bool)
        if shuffle:
            self.seed = DataTypeSpecification('splitter_seed', random_seed,
                                              int)
        else:
            self.seed = NullSpecification('splitter_seed')

        self.evaluation_data_perc = RangeSpecification(
            name='evaluation_data_perc',
            value=evaluation_data_perc,
            min_value=0,
            max_value=100)
        # Training percentage is the complement of the evaluation percentage.
        self.training_data_percentage = RangeSpecification(
            name='training_data_perc',
            value=100 - evaluation_data_perc,
            min_value=0,
            max_value=100)
Exemple #2
0
    def __init__(self,
                 data_source: str,
                 column_name: str,
                 training_categories: List[str] = None,
                 eval_categories: List[str] = None,
                 verbosity: int = 0):
        """ Splits data already set on the ml model, using either the training data or evaluation data as source.
        The respective data is split using categories from a given column.

        :param data_source:          Data used to split; TRAINING_DATA or EVALUATION_DATA.
        :param column_name:          Name of the column whose category values drive the split.
        :param training_categories:  Category values assigned to the training data, if given.
        :param eval_categories:      Category values assigned to the evaluation data, if given.
        :param verbosity:            Verbosity level; 0 by default.
        """
        super().__init__(data_source, None)
        self.verbosity = verbosity
        self.data_source = TypeSpecification(
            name='data_source',
            value=data_source,
            valid_types=[self.TRAINING_DATA, self.EVALUATION_DATA])

        self.column_name = DataTypeSpecification(name='column_name',
                                                 value=column_name,
                                                 data_type=str)
        # Optional category lists follow the NullSpecification-until-set
        # convention used throughout these configuration classes.
        self.training_categories = NullSpecification('training_categories')
        if training_categories is not None:
            self.training_categories = DataTypeSpecification(
                'training_categories', training_categories, list)

        self.eval_categories = NullSpecification('eval_categories')
        if eval_categories is not None:
            self.eval_categories = DataTypeSpecification(
                'eval_categories', eval_categories, list)
Exemple #3
0
    def test_describe_dict_ignore_callable(self):
        """A callable dict key should be dropped from the description."""
        def greeter():
            return 'hello!'

        spec = DataTypeSpecification('test_dir', {0: 'test', greeter: 1}, dict)
        self.data_type_specification = spec
        self.assertEqual({0: 'test'}, spec.describe())
Exemple #4
0
    def test_describe_list_ignore_callable(self):
        """A callable list element should be dropped from the description."""
        def greeter():
            return 'hello!'

        spec = DataTypeSpecification('test_dir', ['test_one', greeter], list)
        self.data_type_specification = spec
        self.assertEqual(['test_one'], spec.describe())
Exemple #5
0
 def __init__(self, data_source: str = None, random_seed: int = None):
     """ Base splitter configuration.

     :param data_source:  TRAINING_DATA or EVALUATION_DATA; the attribute
                          stays a NullSpecification when omitted.
     :param random_seed:  when given, enables shuffling with this seed.
     """
     super().__init__()
     if data_source is None:
         self.data_source = NullSpecification('data_source')
     else:
         self.data_source = TypeSpecification(
             name='data_source',
             value=data_source,
             valid_types=[self.TRAINING_DATA, self.EVALUATION_DATA])

     # Shuffling is enabled exactly when a seed was supplied.
     shuffle = random_seed is not None
     self.randomize = DataTypeSpecification('splitter_randomize', shuffle,
                                            bool)
     if shuffle:
         self.seed = DataTypeSpecification('splitter_seed', random_seed, int)
     else:
         self.seed = NullSpecification('splitter_seed')
Exemple #6
0
    def __init__(self, min_syntactic_distance: float, verbosity: int = 0):
        """ Configure the synonym loader.

        Args:
            min_syntactic_distance: minimum difference between two words for
                them to be considered synonyms by the loader.
            verbosity: verbosity level; 0 by default.
        """
        self.verbosity = verbosity
        self.min_syntactic_distance = DataTypeSpecification(
            'min_syntactic_distance', min_syntactic_distance, float)
Exemple #7
0
    def __init__(self,
                 optimizer_type: str,
                 learning_rate: float,
                 gradient_clipping: Optional[float] = None,
                 kwargs: dict = None):
        """ Configure the optimizer.

        :param optimizer_type:    one of OptimizerStrategy.ALL_STRATEGIES.
        :param learning_rate:     learning rate for the optimizer.
        :param gradient_clipping: optional gradient clipping value.
        :param kwargs:            optional extra optimizer keyword arguments.
        """
        super().__init__()
        self.optimizer_type = TypeSpecification(
            'optimizer_type', optimizer_type, OptimizerStrategy.ALL_STRATEGIES)
        self.learning_rate = DataTypeSpecification(
            'optimizer_learning_rate', learning_rate, float)

        # Optional settings stay NullSpecification unless a value was given.
        if gradient_clipping is None:
            self.gradient_clipping = NullSpecification('gradient_clipping')
        else:
            self.gradient_clipping = DataTypeSpecification(
                'gradient_clipping', gradient_clipping, float)

        if kwargs is None:
            self.kwargs = NullSpecification('kwargs')
        else:
            self.kwargs = DataTypeSpecification('kwargs', kwargs, dict)
Exemple #8
0
    def __init__(self, feature_columns: dict, feature_config: dict = None):
        """ Configure the model's feature columns.

        :param feature_columns: mapping of column name to feature column
                                strategy.
        :param feature_config:  optional per-column configuration; bucketized
                                columns require an entry with a 'buckets' key.
        """
        super().__init__()
        # Follow the NullSpecification-until-set convention used by the other
        # configuration classes instead of wrapping a possible None in a
        # DataTypeSpecification expecting a dict.
        self.feature_config = NullSpecification('feature_config')
        if feature_config is not None:
            self.feature_config = DataTypeSpecification('feature_config',
                                                        feature_config, dict)
        self.feature_columns = FeatureColumnsSpecification(
            'feature_columns', [], FeatureColumnStrategy.ALL_COLUMNS)

        for name, column_type in feature_columns.items():
            if column_type is FeatureColumnStrategy.BUCKETIZED_COLUMN:
                # Guard against feature_config being None so a missing config
                # raises AssertionError rather than TypeError.
                assert feature_config is not None and name in feature_config, \
                    'Missing configuration for bucketized column: {}.'.format(
                        name)
                assert 'buckets' in self.feature_config()[name], \
                    'Missing buckets configuration for bucketized column: {}.'.format(name)
            self.add_feature_column(name=name, column_type=column_type)
Exemple #9
0
    def test_valid(self):
        """validate() should accept values matching the declared data type."""
        cases = (('test_one', str), (['test_one'], list),
                 ({'test_dir': 'one'}, dict))
        for value, data_type in cases:
            self.data_type_specification = DataTypeSpecification(
                'test_dir', value, data_type)
            self.data_type_specification.validate()
Exemple #10
0
    def __init__(self,
                 estimator_type: str,
                 config_kwargs: dict = None,
                 kwargs: dict = None):
        """ build Estimator

        :param estimator_type: valid estimator strategy
        :param config_kwargs: pass the kwargs for a tf.estimator.RunConfig here, otherwise it will not be printed
                correctly in description.
        :param kwargs: estimator kwargs, do not pass objects if config is printed. //todo: find way to accurately print objects.
        """
        super().__init__()
        self.set_estimator(estimator_type)
        # Always define the attribute so it exists even without RunConfig
        # kwargs, mirroring the 'kwargs' handling below.
        self.config_kwargs = NullSpecification('config_kwargs')
        if config_kwargs is not None:
            self.config_kwargs = DataTypeSpecification('config_kwargs',
                                                       config_kwargs, dict)
            # set_config may fold config values into the estimator kwargs.
            kwargs = self.set_config(config_kwargs, kwargs)

        self.kwargs = NullSpecification('kwargs')
        if kwargs is not None:
            self.kwargs = PrefixedDictSpecification('kwargs', 'est', kwargs)
Exemple #11
0
    def __init__(self,
                 target_column: str,
                 data_columns: list,
                 data_source: str = None,
                 eval_data_source: str = None,
                 prediction_data_source: str = None,
                 weight_column: str = None,
                 limit: int = None):
        """ Data configuration for the model.

        Optional arguments start as NullSpecification and are promoted to a
        typed specification only when a value is supplied.
        """
        super().__init__()
        self.data_columns = DataTypeSpecification('columns', data_columns,
                                                  list)
        self.target_column = DataTypeSpecification('target_column',
                                                   target_column, str)

        # Defaults for every optional field.
        self.weight_column = NullSpecification('weight_column')
        self.data_source = NullSpecification('data_source')
        self.eval_data_source = NullSpecification('eval_data_source')
        self.prediction_data_source = NullSpecification(
            'prediction_data_source')
        self.limit = NullSpecification('limit')

        if data_source is not None:
            self.data_source = DataTypeSpecification('data_source',
                                                     data_source, str)
        if eval_data_source is not None:
            self.eval_data_source = DataTypeSpecification(
                'eval_data_source', eval_data_source, str)
        if prediction_data_source is not None:
            self.prediction_data_source = DataTypeSpecification(
                'prediction_data_source', prediction_data_source, str)
        if weight_column is not None:
            self.weight_column = DataTypeSpecification('weight_column',
                                                       weight_column, str)
        if limit is not None:
            self.limit = DataTypeSpecification('limit', limit, int)

        self.test_data = None
        self.validation_data = None
Exemple #12
0
 def test_describe_dict_ignore_object(self):
     """A non-serializable object value should be dropped from describe()."""
     spec = DataTypeSpecification('test_dir', {0: 'test', 1: mock.Mock()},
                                  dict)
     self.data_type_specification = spec
     self.assertEqual({0: 'test'}, spec.describe())
Exemple #13
0
 def test_describe_dict(self):
     """describe() should return a plain dict unchanged."""
     payload = {0: 'test', 1: 'test_one'}
     spec = DataTypeSpecification('test_dir', payload, dict)
     self.data_type_specification = spec
     self.assertEqual({0: 'test', 1: 'test_one'}, spec.describe())
Exemple #14
0
 def test_describe_list_ignore_object(self):
     """A non-serializable object element should be dropped from describe()."""
     spec = DataTypeSpecification('test_dir', ['test_one', mock.Mock()], list)
     self.data_type_specification = spec
     self.assertEqual(['test_one'], spec.describe())
Exemple #15
0
    def test_invalid(self):
        """validate() should raise when the value does not match the type."""
        spec = DataTypeSpecification('test_dir', 'test_invalid', int)
        self.data_type_specification = spec
        with self.assertRaises(AssertionError):
            spec.validate()
Exemple #16
0
class TestDataTypeSpecification(unittest.TestCase):
    """Unit tests for DataTypeSpecification validation and description."""

    def _make(self, value, data_type):
        # Build a specification under the fixed name 'test_dir' and remember
        # it on the test instance, as the original tests did.
        self.data_type_specification = DataTypeSpecification(
            'test_dir', value, data_type)
        return self.data_type_specification

    def test_valid(self):
        """Values matching the declared type pass validation."""
        for value, data_type in (('test_one', str), (['test_one'], list),
                                 ({'test_dir': 'one'}, dict)):
            self._make(value, data_type).validate()

    def test_invalid(self):
        """A value of the wrong type raises AssertionError on validate()."""
        spec = self._make('test_invalid', int)
        with self.assertRaises(AssertionError):
            spec.validate()

    def test_describe_list(self):
        """Plain list values are described unchanged."""
        self.assertEqual(['test_one'],
                         self._make(['test_one'], list).describe())

    def test_describe_list_ignore_object(self):
        """Non-serializable objects in lists are dropped from describe()."""
        self.assertEqual(
            ['test_one'],
            self._make(['test_one', mock.Mock()], list).describe())

    def test_describe_list_ignore_callable(self):
        """Callables in lists are dropped from describe()."""
        def callMe():
            return 'hello!'

        self.assertEqual(
            ['test_one'],
            self._make(['test_one', callMe], list).describe())

    def test_describe_dict(self):
        """Plain dict values are described unchanged."""
        self.assertEqual(
            {0: 'test', 1: 'test_one'},
            self._make({0: 'test', 1: 'test_one'}, dict).describe())

    def test_describe_dict_ignore_object(self):
        """Non-serializable dict values are dropped from describe()."""
        self.assertEqual(
            {0: 'test'},
            self._make({0: 'test', 1: mock.Mock()}, dict).describe())

    def test_describe_dict_ignore_callable(self):
        """Callable dict keys are dropped from describe()."""
        def callMe():
            return 'hello!'

        self.assertEqual(
            {0: 'test'},
            self._make({0: 'test', callMe: 1}, dict).describe())