Code Example #1
    def test_parse_non_nlp_features_string_unrecognized(self):
        """
        Use invalid parameter values to test `parse_non_nlp_features_string`.
        """

        # Use one of the time-related labels as the prediction label
        # and exclude all time-related labels from the input string
        label_group = 'TIME_LABELS'
        prediction_label = list(self.label_groups[label_group])[0]
        fake_and_real_features = \
            self.labels.difference(self.label_groups[label_group])

        # Add fake features to the set of input features and shuffle it
        fake_and_real_features.update(
            {'hours', 'achievements', 'friends', 'groups'})
        fake_and_real_features = list(fake_and_real_features)
        np.random.shuffle(fake_and_real_features)

        # Iterate through prefixes of the shuffled feature list, skipping
        # any prefix that contains no unrecognized feature
        for i in range(1, len(fake_and_real_features) + 1):
            if self.labels.issuperset(fake_and_real_features[:i]): continue
            with self.assertRaises(ValueError):
                parse_non_nlp_features_string(
                    ','.join(fake_and_real_features[:i]), prediction_label)
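
The behavior exercised above condenses to a short sketch. It reuses one of the fake feature names added in the test ('friends') and 'total_game_hours', a prediction label used elsewhere in this suite; nothing beyond the signature shown in the test is assumed:

    # Hedged sketch: any unrecognized feature name should raise ValueError.
    try:
        parse_non_nlp_features_string('friends', 'total_game_hours')
    except ValueError:
        pass  # the fake feature name 'friends' was rejected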
Code Example #2
    def test_parse_non_nlp_features_string_valid(self):
        """
        Use valid parameter values to test `parse_non_nlp_features_string`.
        """

        # Test some valid combinations (not all) of non-NLP features
        for label_group in self.label_groups:

            valid_prediction_labels = self.label_groups[label_group]

            # Pick one random label to use as the prediction label from
            # each group of labels
            group_labels = list(valid_prediction_labels)
            np.random.shuffle(group_labels)
            prediction_label = group_labels[0]

            if label_group != 'OTHER':
                valid_labels = list(
                    self.labels.difference(valid_prediction_labels))
            else:
                valid_labels = [label for label in self.labels
                                if label != prediction_label]

            for i in range(1, len(valid_labels) + 1):
                assert_equal(
                    sorted(
                        parse_non_nlp_features_string(
                            ','.join(valid_labels[:i]), prediction_label)),
                    sorted(valid_labels[:i]))
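
The round trip verified by the loop above can be sketched compactly, leaning only on behavior shown elsewhere in this suite (the 'all' shorthand expands to a set of valid, non-conflicting labels):

    # Hedged sketch: feeding a subset of known-good labels back through the
    # parser returns exactly that subset, as a set.
    prediction_label = 'total_game_hours'
    all_valid = sorted(parse_non_nlp_features_string('all', prediction_label))
    subset = all_valid[:3]
    assert (parse_non_nlp_features_string(','.join(subset), prediction_label)
            == set(subset))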
Code Example #3
    def test_parse_non_nlp_features_string_none(self):
        """
        Test `parse_non_nlp_features_string` when a value of "none" is
        used instead of a comma-separated list of labels (return a set
        consisting of no labels).
        """

        # Use one label from the label group as the prediction label
        label_group_name = list(self.label_groups)[0]
        group_labels = self.label_groups[label_group_name]
        prediction_label = list(group_labels)[0]
        expected_labels = set()

        assert_equal(parse_non_nlp_features_string('none', prediction_label),
                     expected_labels)
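
In other words, 'none' short-circuits parsing entirely; a one-line sketch using 'total_game_hours', a prediction label used elsewhere in this suite:

    # Hedged sketch: 'none' always yields the empty set.
    assert parse_non_nlp_features_string('none', 'total_game_hours') == set()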
Code Example #4
    def test_parse_non_nlp_features_string_group_conflict(self):
        """
        Use parameter values that represent a conflict to test whether
        or not `parse_non_nlp_features_string` will catch it.
        """

        for label_group in self.label_groups:

            # Skip 'OTHER' label group
            if label_group == 'OTHER': continue

            # Use one label from the label group as the prediction label
            group_labels = list(self.label_groups[label_group])
            prediction_label = group_labels[0]

            # Get a small set of labels from other groups
            other_group_labels = list(
                set(chain(
                    *self.label_groups.values())).difference(group_labels))
            np.random.shuffle(other_group_labels)
            other_group_labels = other_group_labels[:5]

            # Iterate through each group label that represents a
            # conflict (including the prediction label itself)
            for label in group_labels:
                labels = [label_ for label_ in group_labels if label_ != label]
                for i in range(len(labels)):
                    labels_ = labels[:i] + [label]
                    with self.assertRaises(ValueError):
                        parse_non_nlp_features_string(','.join(labels_),
                                                      prediction_label)
                    labels_ = labels_ + other_group_labels
                    np.random.shuffle(labels_)
                    with self.assertRaises(ValueError):
                        parse_non_nlp_features_string(','.join(labels_),
                                                      prediction_label)
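
The simplest conflict, covered above when i == 0 and label is the prediction label itself, is passing the prediction label back in as a feature; a minimal sketch:

    # Hedged sketch: the prediction label conflicts with itself as a feature.
    prediction_label = 'total_game_hours'
    try:
        parse_non_nlp_features_string(prediction_label, prediction_label)
    except ValueError:
        pass  # the self-conflict was caught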
Code Example #5
    def test_parse_non_nlp_features_string_all(self):
        """
        Test `parse_non_nlp_features_string` when a value of "all" is
        used instead of a comma-separated list of labels (which should
        automatically remove any labels that conflict with the
        prediction label).
        """

        # Use one label from the label group as the prediction label
        label_group_name = 'TIME_LABELS'
        group_labels = self.label_groups[label_group_name]
        prediction_label = list(group_labels)[0]
        expected_labels = self.labels.difference(group_labels)

        assert_equal(
            sorted(parse_non_nlp_features_string('all', prediction_label)),
            sorted(expected_labels))
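
The complementary guarantee is that the 'all' expansion never includes the prediction label or any other label from its conflict group; in sketch form:

    # Hedged sketch: 'all' excludes the prediction label's entire group,
    # so the prediction label itself is never in the result.
    parsed = parse_non_nlp_features_string('all', 'total_game_hours')
    assert 'total_game_hours' not in parsed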
Code Example #6
    def test_CVConfig_valid(self):
        """
        Test valid parameter values for the `CVConfig` class.
        """

        learners = ['perc', 'pagr']
        non_nlp_features = \
            parse_non_nlp_features_string('all', 'total_game_hours')
        param_grids = [DEFAULT_PARAM_GRIDS[LEARNER_DICT[learner]]
                       for learner in learners]
        valid_kwargs = dict(db=self.db,
                            games=set(['Dota_2']),
                            learners=learners,
                            param_grids=param_grids,
                            training_rounds=10,
                            training_samples_per_round=100,
                            grid_search_samples_per_fold=50,
                            non_nlp_features=non_nlp_features,
                            prediction_label=self.prediction_label,
                            output_path=self.output_path,
                            objective='pearson_r',
                            data_sampling='stratified',
                            grid_search_folds=5,
                            hashed_features=100000,
                            nlp_features=True,
                            bin_ranges=[(0.0, 225.1), (225.2, 2026.2),
                                        (2026.3, 16435.0)],
                            lognormal=False,
                            power_transform=None,
                            majority_baseline=True,
                            rescale=True,
                            feature_selection_percentile=0.8,
                            n_jobs=4)
        default_params = set(['objective', 'data_sampling', 'grid_search_folds',
                              'hashed_features', 'nlp_features', 'bin_ranges',
                              'lognormal', 'power_transform', 'majority_baseline',
                              'rescale', 'feature_selection_percentile', 'n_jobs'])

        # Combinations of parameters: first only the non-default parameters,
        # then the non-default parameters plus each default parameter in turn
        valid_kwargs_list = \
            [{p: v for p, v in valid_kwargs.items()
              if p not in default_params}] \
            + [{p: v for p, v in valid_kwargs.items()
                if p not in default_params.difference([default_param])}
               for default_param in sorted(default_params)]
        for kwargs in valid_kwargs_list:

            # Make the configuration object
            cfg = CVConfig(**kwargs).validated

            # `db`
            assert_equal(cfg['db'], kwargs['db'])

            # `games`
            assert_equal(cfg['games'], kwargs['games'])

            # `learners`
            assert_equal(cfg['learners'], kwargs['learners'])
            assert (isinstance(cfg['learners'], list)
                    and all(learner in LEARNER_DICT_KEYS
                            for learner in cfg['learners']))
            assert_equal(len(cfg['learners']), len(cfg['param_grids']))

            # `param_grids`
            assert (isinstance(cfg['param_grids'], list)
                    and all(isinstance(pgrids_list, list) for pgrids_list
                            in cfg['param_grids'])
                    and all(all(isinstance(pgrid, dict) for pgrid in pgrids_list)
                            for pgrids_list in cfg['param_grids'])
                    and all(all(all(isinstance(param, str)
                                    for param in pgrid)
                                for pgrid in pgrids_list)
                            for pgrids_list in cfg['param_grids'])
                    and len(cfg['param_grids']) > 0)

            # `training_rounds`, `training_samples_per_round`,
            # `grid_search_samples_per_fold`, and `grid_search_folds`
            assert cfg['training_rounds'] > 1
            assert cfg['training_samples_per_round'] > 0
            assert cfg['grid_search_samples_per_fold'] > 1
            if 'grid_search_folds' in kwargs:
                assert 'grid_search_folds' in cfg
                assert cfg['grid_search_folds'] > 1
                assert_equal(cfg['grid_search_folds'], kwargs['grid_search_folds'])
            else:
                assert_equal(cfg['grid_search_folds'], 5)

            # `nlp_features`, `non_nlp_features`, and `prediction_label`
            assert (isinstance(cfg['non_nlp_features'], set)
                    and cfg['non_nlp_features'].issubset(LABELS))
            assert (isinstance(cfg['prediction_label'], str)
                    and cfg['prediction_label'] in LABELS
                    and cfg['prediction_label'] not in cfg['non_nlp_features'])
            if 'nlp_features' in kwargs:
                assert 'nlp_features' in cfg
                assert isinstance(cfg['nlp_features'], bool)
                assert_equal(cfg['nlp_features'], kwargs['nlp_features'])
            else:
                assert_equal(cfg['nlp_features'], True)

            # `objective`
            if 'objective' in kwargs:
                assert 'objective' in cfg
                assert cfg['objective'] in OBJ_FUNC_ABBRS_DICT
                assert_equal(cfg['objective'], kwargs['objective'])
            else:
                assert_equal(cfg['objective'], None)

            # `data_sampling`
            if 'data_sampling' in kwargs:
                assert 'data_sampling' in cfg
                assert cfg['data_sampling'] in ExperimentalData.sampling_options
                assert_equal(cfg['data_sampling'], kwargs['data_sampling'])
            else:
                assert_equal(cfg['data_sampling'], 'even')

            # `hashed_features`
            if 'hashed_features' in kwargs:
                assert 'hashed_features' in cfg
                if cfg['hashed_features'] is not None:
                    assert cfg['hashed_features'] > -1
                assert_equal(cfg['hashed_features'], kwargs['hashed_features'])
            else:
                assert_equal(cfg['hashed_features'], None)

            # `bin_ranges`
            if 'bin_ranges' in kwargs:
                assert 'bin_ranges' in cfg
                assert (isinstance(cfg['bin_ranges'], list)
                        and all((isinstance(bin_, tuple)
                                 and all(isinstance(val, float) for val in bin_))
                                for bin_ in cfg['bin_ranges']))
                assert_equal(cfg['bin_ranges'], kwargs['bin_ranges'])
                validate_bin_ranges(cfg['bin_ranges'])
            else:
                assert_equal(cfg['bin_ranges'], None)

            # `lognormal`
            if 'lognormal' in kwargs:
                assert 'lognormal' in cfg
                assert isinstance(cfg['lognormal'], bool)
                assert_equal(cfg['lognormal'], kwargs['lognormal'])
            else:
                assert_equal(cfg['lognormal'], False)

            # `power_transform`
            if 'power_transform' in kwargs:
                assert 'power_transform' in cfg
                assert (cfg['power_transform'] is None
                        or isinstance(cfg['power_transform'], bool))
                assert_equal(cfg['power_transform'], kwargs['power_transform'])
            else:
                assert_equal(cfg['power_transform'], None)

            # `majority_baseline`
            if 'majority_baseline' in kwargs:
                assert 'majority_baseline' in cfg
                assert isinstance(cfg['majority_baseline'], bool)
                assert_equal(cfg['majority_baseline'], kwargs['majority_baseline'])
            else:
                assert_equal(cfg['majority_baseline'], True)

            # `rescale`
            if 'rescale' in kwargs:
                assert 'rescale' in cfg
                assert isinstance(cfg['rescale'], bool)
                assert_equal(cfg['rescale'], kwargs['rescale'])
            else:
                assert_equal(cfg['rescale'], True)

            # `feature_selection_percentile`
            if 'feature_selection_percentile' in kwargs:
                assert 'feature_selection_percentile' in cfg
                assert isinstance(cfg['feature_selection_percentile'], float)
                assert_equal(cfg['feature_selection_percentile'],
                             kwargs['feature_selection_percentile'])
            else:
                assert_equal(cfg['feature_selection_percentile'], 1.0)

            # `n_jobs`
            if 'n_jobs' in kwargs:
                assert 'n_jobs' in cfg
                assert isinstance(cfg['n_jobs'], int)
                assert_equal(cfg['n_jobs'], kwargs['n_jobs'])
            else:
                assert_equal(cfg['n_jobs'], 1)
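
For reference, a minimal construction sketch under the defaults asserted above. Here `db` and `output_path` stand in for the fixtures this suite prepares in its setup (a database handle and an existing directory); both are hypothetical placeholders:

    # Hedged sketch: omit every default parameter and let validation fill
    # the defaults in.
    learners = ['perc']
    cfg = CVConfig(db=db,  # hypothetical database fixture
                   games={'Dota_2'},
                   learners=learners,
                   param_grids=[DEFAULT_PARAM_GRIDS[LEARNER_DICT[learner]]
                                for learner in learners],
                   training_rounds=10,
                   training_samples_per_round=100,
                   grid_search_samples_per_fold=50,
                   non_nlp_features=parse_non_nlp_features_string(
                       'all', 'total_game_hours'),
                   prediction_label='total_game_hours',
                   output_path=output_path  # hypothetical existing directory
                   ).validated
    assert cfg['grid_search_folds'] == 5  # default filled in
    assert cfg['n_jobs'] == 1             # default filled in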
Code Example #7
    def test_CVConfig_invalid(self):
        """
        Test invalid parameter values for the `CVConfig` class.
        """

        learners = ['perc', 'pagr']
        non_nlp_features = \
            parse_non_nlp_features_string('all', 'total_game_hours')
        param_grids = [DEFAULT_PARAM_GRIDS[LEARNER_DICT[learner]]
                       for learner in learners]
        valid_kwargs = dict(db=self.db,
                            games=set(['Dota_2']),
                            learners=learners,
                            param_grids=param_grids,
                            training_rounds=10,
                            training_samples_per_round=100,
                            grid_search_samples_per_fold=50,
                            non_nlp_features=non_nlp_features,
                            prediction_label=self.prediction_label,
                            output_path=self.output_path,
                            objective='pearson_r',
                            data_sampling='even',
                            grid_search_folds=5,
                            hashed_features=100000,
                            nlp_features=True,
                            bin_ranges=[(0.0, 225.1), (225.2, 2026.2),
                                        (2026.3, 16435.0)],
                            lognormal=False,
                            power_transform=None,
                            majority_baseline=True,
                            rescale=True,
                            feature_selection_percentile=1.0,
                            n_jobs=1)
        
        # Combinations of parameters that should cause a `SchemaError`
        invalid_kwargs_list = [
            # Invalid `db` value
            dict(db='db',
                 **{p: v for p, v in valid_kwargs.items() if p != 'db'}),
            # Invalid games in `games` parameter value
            dict(games={'Dota'},
                 **{p: v for p, v in valid_kwargs.items() if p != 'games'}),
            # Invalid `learners` parameter value (unrecognized learner
            # abbreviations)
            dict(learners=['perceptron', 'passiveagressive'],
                 **{p: v for p, v in valid_kwargs.items() if p != 'learners'}),
            # Invalid `learners` parameter value (empty)
            dict(learners=[],
                 **{p: v for p, v in valid_kwargs.items() if p != 'learners'}),
            # Invalid parameter grids in `param_grids` parameter value
            dict(param_grids=[[dict(a=1, b=2), dict(c='g', d=True)]],
                 **{p: v for p, v in valid_kwargs.items() if p != 'param_grids'}),
            # `learners`/`param_grids` unequal in length
            dict(learners=['perc', 'pagr'],
                 param_grids=[DEFAULT_PARAM_GRIDS[LEARNER_DICT[learner]]
                              for learner in ['perc', 'pagr', 'mbkm']],
                 bin_ranges=None,
                 **{p: v for p, v in valid_kwargs.items()
                    if p not in ['learners', 'param_grids', 'bin_ranges']}),
            # Invalid `training_rounds` parameter value (must be int)
            dict(training_rounds=2.0,
                 **{p: v for p, v in valid_kwargs.items() if p != 'training_rounds'}),
            # Invalid `training_rounds` parameter value (must be greater
            # than 1)
            dict(training_rounds=1,
                 **{p: v for p, v in valid_kwargs.items() if p != 'training_rounds'}),
            # Invalid `training_samples_per_round` parameter value (must
            # be int)
            dict(training_samples_per_round=1.0,
                 **{p: v for p, v in valid_kwargs.items()
                    if p != 'training_samples_per_round'}),
            # Invalid `training_samples_per_round` parameter value (must
            # be greater than 0)
            dict(training_samples_per_round=0,
                 **{p: v for p, v in valid_kwargs.items()
                    if p != 'training_samples_per_round'}),
            # Invalid `grid_search_samples_per_fold` parameter value
            # (must be int)
            dict(grid_search_samples_per_fold=50.0,
                 **{p: v for p, v in valid_kwargs.items()
                    if p != 'grid_search_samples_per_fold'}),
            # Invalid `grid_search_samples_per_fold` parameter value
            # (must be greater than 1)
            dict(grid_search_samples_per_fold=1,
                 **{p: v for p, v in valid_kwargs.items()
                    if p != 'grid_search_samples_per_fold'}),
            # Invalid `non_nlp_features` parameter value (must be set of
            # valid features)
            dict(non_nlp_features={'total_game_hours_last_three_weeks'},
                 **{p: v for p, v in valid_kwargs.items()
                    if p != 'non_nlp_features'}),
            # Invalid `prediction_label` parameter value (must be in set
            # of valid features)
            dict(prediction_label='total_game_hours_last_three_weeks',
                 **{p: v for p, v in valid_kwargs.items()
                    if p != 'prediction_label'}),
            # Invalid `objective` parameter value (must be in set of
            # valid objective function names)
            dict(objective='pearson',
                 **{p: v for p, v in valid_kwargs.items()
                    if p != 'objective'}),
            # Invalid `output_path` parameter value (must be string)
            dict(output_path=None,
                 **{p: v for p, v in valid_kwargs.items()
                    if p != 'output_path'}),
            # Invalid `output_path` parameter value (must exist)
            dict(output_path=join(self.output_path, 'does_not_exist'),
                 **{p: v for p, v in valid_kwargs.items()
                    if p != 'output_path'}),
            # Invalid `data_sampling` parameter value (must be in set of
            # valid sampling methods)
            dict(data_sampling='equal',
                 **{p: v for p, v in valid_kwargs.items()
                    if p != 'data_sampling'}),
            # Invalid `grid_search_folds` parameter value (must be int)
            dict(grid_search_folds=0.0,
                 **{p: v for p, v in valid_kwargs.items()
                    if p != 'grid_search_folds'}),
            # Invalid `grid_search_folds` parameter value (must be
            # greater than 1)
            dict(grid_search_folds=1,
                 **{p: v for p, v in valid_kwargs.items()
                    if p != 'grid_search_folds'}),
            # Invalid `hashed_features` parameter value (must be
            # non-negative or None)
            dict(hashed_features=-1,
                 **{p: v for p, v in valid_kwargs.items()
                    if p != 'hashed_features'}),
            # Invalid `hashed_features` parameter value (must be an int,
            # not a boolean)
            dict(hashed_features=False,
                 **{p: v for p, v in valid_kwargs.items()
                    if p != 'hashed_features'}),
            # Invalid `nlp_features` parameter value (must be boolean or
            # None)
            dict(nlp_features=1,
                 **{p: v for p, v in valid_kwargs.items() if p != 'nlp_features'}),
            # Invalid `bin_ranges` parameter value (must be list of
            # tuples -- or None)
            dict(bin_ranges=[[0.2, 100.3], [100.5, 200.6]],
                 **{p: v for p, v in valid_kwargs.items() if p != 'bin_ranges'}),
            # Invalid `bin_ranges` parameter value (must be list of
            # tuples containing floats -- or None)
            dict(bin_ranges=[(0, 99), (100, 200)],
                 **{p: v for p, v in valid_kwargs.items() if p != 'bin_ranges'}),
            # Invalid `bin_ranges` parameter value (must be valid list
            # of bin ranges)
            dict(bin_ranges=[(0.9, 99.7), (99.9, 0.2)],
                 **{p: v for p, v in valid_kwargs.items() if p != 'bin_ranges'}),
            # Invalid `lognormal` parameter value (must be boolean or
            # None)
            dict(lognormal=0,
                 **{p: v for p, v in valid_kwargs.items() if p != 'lognormal'}),
            # Invalid `power_transform` parameter value (must be float
            # or None)
            dict(power_transform=False,
                 **{p: v for p, v in valid_kwargs.items()
                    if p != 'power_transform'}),
            # Invalid `power_transform` parameter value (must be float
            # or None)
            dict(power_transform=3,
                 **{p: v for p, v in valid_kwargs.items()
                    if p != 'power_transform'}),
            # Invalid `power_transform` parameter value (must be float
            # that is not equal to 0.0)
            dict(power_transform=0.0,
                 **{p: v for p, v in valid_kwargs.items()
                    if p != 'power_transform'}),
            # The `power_transform` and `lognormal` parameter values
            # were set as 2.0 and True, respectively, i.e., both were
            # set
            dict(power_transform=2.0,
                 lognormal=True,
                 **{p: v for p, v in valid_kwargs.items()
                    if p not in ['power_transform', 'lognormal']}),
            # Invalid `majority_baseline` parameter value (must be
            # boolean or None)
            dict(majority_baseline=0,
                 **{p: v for p, v in valid_kwargs.items()
                    if p != 'majority_baseline'}),
            # Invalid `rescale` parameter value (must be boolean or
            # None)
            dict(rescale=0,
                 **{p: v for p, v in valid_kwargs.items() if p != 'rescale'}),
            # `learners` and `param_grids` of unequal size
            dict(learners=[learners[0]],
                 **{p: v for p, v in valid_kwargs.items() if p != 'learners'}),
            # `feature_selection_percentile` is not greater than 0.0
            dict(feature_selection_percentile=0.0,
                 **{p: v for p, v in valid_kwargs.items()
                    if p != 'feature_selection_percentile'}),
            # `feature_selection_percentile` is greater than 1.0
            dict(feature_selection_percentile=1.1,
                 **{p: v for p, v in valid_kwargs.items()
                    if p != 'feature_selection_percentile'}),
            # `n_jobs` is not of type int
            dict(n_jobs=5.0,
                 **{p: v for p, v in valid_kwargs.items() if p != 'n_jobs'}),
            # `n_jobs` is less than 1
            dict(n_jobs=0,
                 **{p: v for p, v in valid_kwargs.items() if p != 'n_jobs'})
            ]
        for kwargs in invalid_kwargs_list:
            assert_raises(SchemaError, CVConfig, **kwargs)
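
A tidier way to build each broken variant is to clone the valid kwargs and override a single key, which reads more directly than repeating a dict comprehension per entry; a hedged sketch using the `n_jobs` case from above:

    # Hedged sketch: a single invalid value is enough to fail validation.
    bad_kwargs = dict(valid_kwargs, n_jobs=0)  # `n_jobs` must be at least 1
    assert_raises(SchemaError, CVConfig, **bad_kwargs)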