def test_parse_non_nlp_features_string_unrecognized(self):
    """
    Use invalid parameter values to test
    `parse_non_nlp_features_string`.

    Any comma-separated feature string containing at least one
    unrecognized feature name should raise a `ValueError`.
    """

    # Use one of the time-related labels as the prediction label
    # and exclude all time-related labels from the input string
    label_group = 'TIME_LABELS'
    prediction_label = list(self.label_groups[label_group])[0]
    fake_and_real_features = \
        self.labels.difference(self.label_groups[label_group])

    # Add fake features to the set of input features and shuffle it
    fake_and_real_features.update(
        {'hours', 'achievements', 'friends', 'groups'})
    fake_and_real_features = list(fake_and_real_features)
    np.random.shuffle(fake_and_real_features)

    # Iterate through feature-list prefixes and discard any prefix
    # that doesn't contain at least one unrecognized feature.
    # NOTE: the range runs to len(...) + 1 so that the full feature
    # list (which is guaranteed to include the fake features) is also
    # exercised; the original `range(len(...))` never tested it, and
    # the empty i=0 prefix was always skipped by the superset check.
    for i in range(1, len(fake_and_real_features) + 1):
        if self.labels.issuperset(fake_and_real_features[:i]):
            continue
        with self.assertRaises(ValueError):
            parse_non_nlp_features_string(
                ','.join(fake_and_real_features[:i]), prediction_label)
def test_parse_non_nlp_features_string_valid(self):
    """
    Use valid parameter values to test
    `parse_non_nlp_features_string`.

    For every label group, pick a random prediction label and verify
    that each growing prefix of the non-conflicting labels parses
    back to exactly that set of labels.
    """

    # Test some valid combinations (not all) of non-NLP features
    for group_name, group in self.label_groups.items():
        # Pick one random label from this group to serve as the
        # prediction label
        candidates = list(group)
        np.random.shuffle(candidates)
        prediction_label = candidates[0]

        # Labels that may legally accompany this prediction label:
        # everything outside the group, or -- for the 'OTHER' group --
        # everything except the prediction label itself
        if group_name == 'OTHER':
            allowed = [label for label in self.labels
                       if label != prediction_label]
        else:
            allowed = list(self.labels.difference(group))

        for end in range(1, len(allowed) + 1):
            subset = allowed[:end]
            parsed = parse_non_nlp_features_string(','.join(subset),
                                                   prediction_label)
            assert_equal(sorted(parsed), sorted(subset))
def test_parse_non_nlp_features_string_none(self):
    """
    Test `parse_non_nlp_features_string` when a value of "none" is
    used instead of a comma-separated list of labels (return a set
    consisting of no labels).
    """

    # Use one label from the first label group as the prediction label
    label_group_name = list(self.label_groups)[0]
    group_labels = self.label_groups[label_group_name]
    prediction_label = list(group_labels)[0]

    # "none" should always yield the empty set, regardless of the
    # prediction label (the original assigned an unused
    # `expected_labels` local, which has been removed)
    assert_equal(parse_non_nlp_features_string('none', prediction_label),
                 set())
def test_parse_non_nlp_features_string_group_conflict(self):
    """
    Use parameter values that represent a conflict to test whether or
    not `parse_non_nlp_features_string` will catch it.

    A feature string containing any label from the prediction label's
    own group (including the prediction label itself) must raise a
    `ValueError`, even when mixed with labels from other groups.
    """

    for group_name in self.label_groups:
        # The 'OTHER' group has no intra-group conflicts, so skip it
        if group_name == 'OTHER':
            continue

        # Use one label from the label group as the prediction label
        group_labels = list(self.label_groups[group_name])
        prediction_label = group_labels[0]

        # Get a small sample of labels drawn from the other groups
        all_group_labels = set(chain(*self.label_groups.values()))
        outside_labels = list(all_group_labels.difference(group_labels))
        np.random.shuffle(outside_labels)
        outside_labels = outside_labels[:5]

        # Every label in the group (including the prediction label
        # itself) conflicts with the prediction label
        for conflicting in group_labels:
            remainder = [other for other in group_labels
                         if other != conflicting]
            for i in range(len(remainder)):
                conflict_set = remainder[:i] + [conflicting]
                with self.assertRaises(ValueError):
                    parse_non_nlp_features_string(','.join(conflict_set),
                                                  prediction_label)

                # Mixing in labels from other groups should not mask
                # the conflict
                conflict_set = conflict_set + outside_labels
                np.random.shuffle(conflict_set)
                with self.assertRaises(ValueError):
                    parse_non_nlp_features_string(','.join(conflict_set),
                                                  prediction_label)
def test_parse_non_nlp_features_string_all(self):
    """
    Test `parse_non_nlp_features_string` when a value of "all" is
    used instead of a comma-separated list of labels (which should
    automatically remove any labels that conflict with the prediction
    label).
    """

    # Predict one of the time-related labels; "all" should then yield
    # every label except the entire time-related group
    time_labels = self.label_groups['TIME_LABELS']
    prediction_label = list(time_labels)[0]
    expected = self.labels.difference(time_labels)

    parsed = parse_non_nlp_features_string('all', prediction_label)
    assert_equal(sorted(parsed), sorted(expected))
def test_CVConfig_valid(self):
    """
    Test valid parameter values for the `CVConfig` class.

    Builds one kwargs dict per optional ("default") parameter -- the
    required parameters alone, then the required parameters plus each
    optional parameter in turn -- and checks that the validated
    configuration reflects the supplied values (or the documented
    defaults when a parameter is omitted).
    """

    learners = ['perc', 'pagr']
    non_nlp_features = parse_non_nlp_features_string('all',
                                                     'total_game_hours')
    param_grids = [DEFAULT_PARAM_GRIDS[LEARNER_DICT[learner]]
                   for learner in learners]
    valid_kwargs = dict(db=self.db,
                        games=set(['Dota_2']),
                        learners=learners,
                        param_grids=param_grids,
                        training_rounds=10,
                        training_samples_per_round=100,
                        grid_search_samples_per_fold=50,
                        non_nlp_features=non_nlp_features,
                        prediction_label=self.prediction_label,
                        output_path=self.output_path,
                        objective='pearson_r',
                        data_sampling='stratified',
                        grid_search_folds=5,
                        hashed_features=100000,
                        nlp_features=True,
                        bin_ranges=[(0.0, 225.1), (225.2, 2026.2),
                                    (2026.3, 16435.0)],
                        lognormal=False,
                        power_transform=None,
                        majority_baseline=True,
                        rescale=True,
                        feature_selection_percentile=0.8,
                        n_jobs=4)
    default_params = set(['objective', 'data_sampling',
                          'grid_search_folds', 'hashed_features',
                          'nlp_features', 'bin_ranges', 'lognormal',
                          'power_transform', 'majority_baseline',
                          'rescale', 'feature_selection_percentile',
                          'n_jobs'])

    # Combinations of parameters: the non-default parameters alone,
    # then the non-default parameters plus each default parameter in
    # turn (this loop replaces the original thirteen hand-written,
    # near-identical dict comprehensions)
    valid_kwargs_list = [{p: v for p, v in valid_kwargs.items()
                          if p not in default_params}]
    for default_param in sorted(default_params):
        excluded = default_params.difference([default_param])
        valid_kwargs_list.append({p: v for p, v in valid_kwargs.items()
                                  if p not in excluded})

    for kwargs in valid_kwargs_list:

        # Make the configuration object
        cfg = CVConfig(**kwargs).validated

        # `db`
        assert_equal(cfg['db'], kwargs['db'])

        # `games`
        assert_equal(cfg['games'], kwargs['games'])

        # `learners`: a list of recognized learner abbreviations,
        # parallel to `param_grids`
        assert_equal(cfg['learners'], kwargs['learners'])
        assert (isinstance(cfg['learners'], list)
                and all(learner in LEARNER_DICT_KEYS
                        for learner in learners))
        assert_equal(len(cfg['learners']), len(cfg['param_grids']))

        # `param_grids`: a non-empty list of lists of dicts whose
        # keys are all strings
        assert (isinstance(cfg['param_grids'], list)
                and all(isinstance(pgrids_list, list)
                        for pgrids_list in cfg['param_grids'])
                and all(all(isinstance(pgrid, dict)
                            for pgrid in pgrids_list)
                        for pgrids_list in cfg['param_grids'])
                and all(all(all(isinstance(param, str)
                                for param in pgrid)
                            for pgrid in pgrids_list)
                        for pgrids_list in cfg['param_grids'])
                and len(cfg['param_grids']) > 0)

        # `training_rounds`, `training_samples_per_round`,
        # `grid_search_samples_per_fold`, and `grid_search_folds`
        assert cfg['training_rounds'] > 1
        assert cfg['training_samples_per_round'] > 0
        assert cfg['grid_search_samples_per_fold'] > 1
        if 'grid_search_folds' in kwargs:
            assert 'grid_search_folds' in cfg
            assert cfg['grid_search_folds'] > 1
            assert_equal(cfg['grid_search_folds'],
                         kwargs['grid_search_folds'])
        else:
            assert_equal(cfg['grid_search_folds'], 5)

        # `nlp_features`, `non_nlp_features`, and `prediction_label`
        assert (isinstance(cfg['non_nlp_features'], set)
                and cfg['non_nlp_features'].issubset(LABELS))
        assert (isinstance(cfg['prediction_label'], str)
                and cfg['prediction_label'] in LABELS
                and cfg['prediction_label'] not in cfg['non_nlp_features'])
        if 'nlp_features' in kwargs:
            assert 'nlp_features' in cfg
            assert isinstance(cfg['nlp_features'], bool)
            assert_equal(cfg['nlp_features'], kwargs['nlp_features'])
        else:
            assert_equal(cfg['nlp_features'], True)

        # `objective`
        if 'objective' in kwargs:
            assert 'objective' in cfg
            assert cfg['objective'] in OBJ_FUNC_ABBRS_DICT
            assert_equal(cfg['objective'], kwargs['objective'])
        else:
            assert_equal(cfg['objective'], None)

        # `data_sampling`
        if 'data_sampling' in kwargs:
            assert 'data_sampling' in cfg
            assert cfg['data_sampling'] in ExperimentalData.sampling_options
            assert_equal(cfg['data_sampling'], kwargs['data_sampling'])
        else:
            assert_equal(cfg['data_sampling'], 'even')

        # `hashed_features`
        if 'hashed_features' in kwargs:
            assert 'hashed_features' in cfg
            if cfg['hashed_features'] is not None:
                assert cfg['hashed_features'] > -1
            assert_equal(cfg['hashed_features'], kwargs['hashed_features'])
        else:
            assert_equal(cfg['hashed_features'], None)

        # `bin_ranges`: a list of (float, float) tuples that also
        # passes the project's bin-range validation
        if 'bin_ranges' in kwargs:
            assert 'bin_ranges' in cfg
            assert (isinstance(cfg['bin_ranges'], list)
                    and all((isinstance(bin_, tuple)
                             and all(isinstance(val, float)
                                     for val in bin_))
                            for bin_ in cfg['bin_ranges']))
            assert_equal(cfg['bin_ranges'], kwargs['bin_ranges'])
            validate_bin_ranges(cfg['bin_ranges'])
        else:
            assert_equal(cfg['bin_ranges'], None)

        # `lognormal`
        if 'lognormal' in kwargs:
            assert 'lognormal' in cfg
            assert isinstance(cfg['lognormal'], bool)
            assert_equal(cfg['lognormal'], kwargs['lognormal'])
        else:
            assert_equal(cfg['lognormal'], False)

        # `power_transform`
        if 'power_transform' in kwargs:
            assert 'power_transform' in cfg
            assert (cfg['power_transform'] is None
                    or isinstance(cfg['power_transform'], bool))
            assert_equal(cfg['power_transform'],
                         kwargs['power_transform'])
        else:
            assert_equal(cfg['power_transform'], None)

        # `majority_baseline`
        if 'majority_baseline' in kwargs:
            assert 'majority_baseline' in cfg
            assert isinstance(cfg['majority_baseline'], bool)
            assert_equal(cfg['majority_baseline'],
                         kwargs['majority_baseline'])
        else:
            assert_equal(cfg['majority_baseline'], True)

        # `rescale`
        if 'rescale' in kwargs:
            assert 'rescale' in cfg
            assert isinstance(cfg['rescale'], bool)
            assert_equal(cfg['rescale'], kwargs['rescale'])
        else:
            assert_equal(cfg['rescale'], True)

        # `feature_selection_percentile`
        if 'feature_selection_percentile' in kwargs:
            assert 'feature_selection_percentile' in cfg
            assert isinstance(cfg['feature_selection_percentile'], float)
            assert_equal(cfg['feature_selection_percentile'],
                         kwargs['feature_selection_percentile'])
        else:
            assert_equal(cfg['feature_selection_percentile'], 1.0)

        # `n_jobs`
        if 'n_jobs' in kwargs:
            assert 'n_jobs' in cfg
            assert isinstance(cfg['n_jobs'], int)
            assert_equal(cfg['n_jobs'], kwargs['n_jobs'])
        else:
            assert_equal(cfg['n_jobs'], 1)
def test_CVConfig_invalid(self):
    """
    Test invalid parameter values for the `CVConfig` class.

    Each case starts from a fully valid kwargs dict and replaces one
    (or a few) parameters with an invalid value; every case must make
    `CVConfig` raise a `SchemaError`.
    """

    learners = ['perc', 'pagr']
    non_nlp_features = parse_non_nlp_features_string('all',
                                                     'total_game_hours')
    param_grids = [DEFAULT_PARAM_GRIDS[LEARNER_DICT[learner]]
                   for learner in learners]
    valid_kwargs = dict(db=self.db,
                        games=set(['Dota_2']),
                        learners=learners,
                        param_grids=param_grids,
                        training_rounds=10,
                        training_samples_per_round=100,
                        grid_search_samples_per_fold=50,
                        non_nlp_features=non_nlp_features,
                        prediction_label=self.prediction_label,
                        output_path=self.output_path,
                        objective='pearson_r',
                        data_sampling='even',
                        grid_search_folds=5,
                        hashed_features=100000,
                        nlp_features=True,
                        bin_ranges=[(0.0, 225.1), (225.2, 2026.2),
                                    (2026.3, 16435.0)],
                        lognormal=False,
                        power_transform=None,
                        majority_baseline=True,
                        rescale=True,
                        feature_selection_percentile=1.0,
                        n_jobs=1)

    def invalidate(**overrides):
        """Return a copy of `valid_kwargs` with `overrides` applied."""
        kwargs = {p: v for p, v in valid_kwargs.items()
                  if p not in overrides}
        kwargs.update(overrides)
        return kwargs

    # Combinations of parameters that should cause a `SchemaError`
    # (the helper above replaces the original's repeated
    # `**{p: v for p, v in valid_kwargs.items() if p != ...}` idiom)
    invalid_kwargs_list = [
        # Invalid `db` value
        invalidate(db='db'),
        # Invalid games in `games` parameter value
        invalidate(games={'Dota'}),
        # Invalid `learners` parameter value (unrecognized learner
        # abbreviations)
        invalidate(learners=['perceptron', 'passiveagressive']),
        # Invalid `learners` parameter value (empty)
        invalidate(learners=[]),
        # Invalid parameter grids in `param_grids` parameter value
        invalidate(param_grids=[[dict(a=1, b=2), dict(c='g', d=True)]]),
        # `learners`/`param_grids` unequal in length
        invalidate(learners=['perc', 'pagr'],
                   param_grids=[DEFAULT_PARAM_GRIDS[LEARNER_DICT[learner]]
                                for learner in ['perc', 'pagr', 'mbkm']],
                   bin_ranges=None),
        # Invalid `training_rounds` parameter value (must be int)
        invalidate(training_rounds=2.0),
        # Invalid `training_rounds` parameter value (must be greater
        # than 1)
        invalidate(training_rounds=1),
        # Invalid `training_samples_per_round` parameter value (must
        # be int)
        invalidate(training_samples_per_round=1.0),
        # Invalid `training_samples_per_round` parameter value (must
        # be greater than 0)
        invalidate(training_samples_per_round=0.0),
        # Invalid `grid_search_samples_per_fold` parameter value
        # (must be int)
        invalidate(grid_search_samples_per_fold=50.0),
        # Invalid `grid_search_samples_per_fold` parameter value
        # (must be greater than 1)
        invalidate(grid_search_samples_per_fold=0.0),
        # Invalid `non_nlp_features` parameter value (must be set of
        # valid features)
        invalidate(non_nlp_features={'total_game_hours_last_three_weeks'}),
        # Invalid `prediction_label` parameter value (must be in set
        # of valid features)
        invalidate(prediction_label='total_game_hours_last_three_weeks'),
        # Invalid `objective` parameter value (must be in set of
        # valid objective function names)
        invalidate(objective='pearson'),
        # Invalid `output_path` parameter value (must be string)
        invalidate(output_path=None),
        # Invalid `output_path` parameter value (must exist)
        invalidate(output_path=join(self.output_path, 'does_not_exist')),
        # Invalid `data_sampling` parameter value (must be in set of
        # valid sampling methods)
        invalidate(data_sampling='equal'),
        # Invalid `grid_search_folds` parameter value (must be int)
        invalidate(grid_search_folds=0.0),
        # Invalid `grid_search_folds` parameter value (must be
        # greater than 1)
        invalidate(grid_search_folds=1),
        # Invalid `hashed_features` parameter value (must be
        # non-negative or None)
        invalidate(hashed_features=-1),
        # Invalid `hashed_features` parameter value (must be
        # non-negative)
        invalidate(hashed_features=False),
        # Invalid `nlp_features` parameter value (must be boolean or
        # None)
        invalidate(nlp_features=1),
        # Invalid `bin_ranges` parameter value (must be list of
        # tuples -- or None)
        invalidate(bin_ranges=[[0.2, 100.3], [100.5, 200.6]]),
        # Invalid `bin_ranges` parameter value (must be list of
        # tuples containing floats -- or None)
        invalidate(bin_ranges=[(0, 99), (100, 200)]),
        # Invalid `bin_ranges` parameter value (must be valid list
        # of bin ranges)
        invalidate(bin_ranges=[(0.9, 99.7), (99.9, 0.2)]),
        # Invalid `lognormal` parameter value (must be boolean or
        # None)
        invalidate(lognormal=0),
        # Invalid `power_transform` parameter value (must be float
        # or None)
        invalidate(power_transform=False),
        # Invalid `power_transform` parameter value (must be float
        # or None)
        invalidate(power_transform=3),
        # Invalid `power_transform` parameter value (must be float
        # that is not equal to 0.0)
        invalidate(power_transform=0.0),
        # The `power_transform` and `lognormal` parameter values
        # were set as 2.0 and True, respectively, i.e., both were set
        invalidate(power_transform=2.0, lognormal=True),
        # Invalid `majority_baseline` parameter value (must be
        # boolean or None)
        invalidate(majority_baseline=0),
        # Invalid `rescale` parameter value (must be boolean or
        # None)
        invalidate(rescale=0),
        # `learners` and `param_grids` of unequal size
        invalidate(learners=[learners[0]]),
        # `feature_selection_percentile` is not greater than 0.0
        invalidate(feature_selection_percentile=0.0),
        # `feature_selection_percentile` is greater than 1.0
        invalidate(feature_selection_percentile=1.1),
        # `n_jobs` is not of type int
        invalidate(n_jobs=5.0),
        # `n_jobs` is less than 1
        invalidate(n_jobs=0)
    ]
    for kwargs in invalid_kwargs_list:
        assert_raises(SchemaError, CVConfig, **kwargs)