def test_make_csv(self):
    """Check that ``Experiment.make_csv`` matches the reference CSV.

    Builds an experiment from a random-forest grid, an SVC grid, a
    ``SubsetSweepNumRows`` grid and a ``StratifiedKFold`` grid, writes
    its CSV, and compares the file's full text with ``make_csv.csv``
    located under ``REFERENCE_PKL_PATH``.
    """
    M, y = uft.generate_test_matrix(1000, 5, 2, random_state=0)

    forest_grid = {
        'clf': RandomForestClassifier,
        'n_estimators': [10, 100],
        'max_depth': [5, 25],
        'random_state': [0],
    }
    svc_grid = {
        'clf': SVC,
        'kernel': ['linear', 'rbf'],
        'probability': [True],
        'random_state': [0],
    }
    row_sweep = {
        'subset': per.SubsetSweepNumRows,
        'num_rows': [[100, 200]],
        'random_state': [0],
    }
    fold_grid = {'cv': StratifiedKFold, 'n_folds': [2, 3]}

    exp = per.Experiment(
        M, y,
        clfs=[forest_grid, svc_grid],
        subsets=[row_sweep],
        cvs=[fold_grid])

    result_path = exp.make_csv()
    ctrl_path = os.path.join(REFERENCE_PKL_PATH, 'make_csv.csv')
    # Both files are compared as whole strings, so any difference in
    # content or ordering fails the test.
    with open(result_path) as result, open(ctrl_path) as ctrl:
        self.assertEqual(result.read(), ctrl.read())
def test_subsetting(self):
    """Run an experiment over four subset grids and compare to reference.

    The experiment is given only ``subsets``. After ``run()``, each trial
    is mapped (by its ``str``) to the frozenset of its runs' ``str``
    values, and that mapping is checked against the stored
    ``'test_subsetting'`` reference.
    """
    M, y = uft.generate_test_matrix(1000, 5, 2, random_state=0)

    even_distribution = {
        'subset': per.SubsetRandomRowsEvenDistribution,
        'subset_size': [20],
        'random_state': [0],
    }
    actual_distribution = {
        'subset': per.SubsetRandomRowsActualDistribution,
        'subset_size': [20],
        'random_state': [0],
    }
    sweep_num_rows = {
        'subset': per.SubsetSweepNumRows,
        'num_rows': [[10, 20, 30]],
        'random_state': [0],
    }
    sweep_stratification = {
        'subset': per.SubsetSweepVaryStratification,
        'proportions_positive': [[.5, .75, .9]],
        'subset_size': [10],
        'random_state': [0],
    }
    subsets = [
        even_distribution,
        actual_distribution,
        sweep_num_rows,
        sweep_stratification,
    ]

    exp = per.Experiment(M, y, subsets=subsets)
    exp.run()

    # A frozenset makes the comparison independent of run order.
    result = {}
    for trial in exp.trials:
        result[str(trial)] = frozenset(str(run) for run in trial.runs)
    self.__compare_to_ref_pkl(result, 'test_subsetting')
def test_report_complex(self):
    """Append a per-classifier report for a multi-grid experiment.

    Builds an experiment from a random-forest grid, an SVC grid, one
    ``SubsetRandomRowsActualDistribution`` grid and ``StratifiedKFold``,
    asks for a report along ``per.CLF``, and adds it to ``self.report``
    under the heading ``'test_report_complex'``. Nothing is asserted;
    the test only produces report content.
    """
    # Seeded like the other generate_test_matrix calls in this file so
    # the generated report is reproducible from run to run.
    M, y = uft.generate_test_matrix(100, 5, 2, random_state=0)
    clfs = [{
        'clf': RandomForestClassifier,
        'n_estimators': [10, 100],
        'max_depth': [1, 10],
        'random_state': [0]
    }, {
        'clf': SVC,
        'kernel': ['linear', 'rbf'],
        'probability': [True],
        'random_state': [0]
    }]
    subsets = [{
        'subset': per.SubsetRandomRowsActualDistribution,
        'subset_size': [20, 40, 60, 80, 100],
        'random_state': [0]
    }]
    cvs = [{'cv': StratifiedKFold}]
    exp = per.Experiment(M, y, clfs, subsets, cvs)
    # make_report returns a pair here; only the second element (the
    # report object) is used.
    _, rep = exp.make_report(dimension=per.CLF, return_report_object=True,
                             verbose=False)
    self.report.add_heading('test_report_complex', 1)
    self.report.add_subreport(rep)
def test_operate(self):
    """Compare average scores for the standard classifiers to reference.

    For each (label, classifier grid) pair, builds an experiment with
    ``StratifiedKFold``, maps each key of ``average_score()`` (as a
    ``str``) to its value, and checks the mapping against the reference
    stored under ``'test_operate_<label>'``.
    """
    M, y = uft.generate_test_matrix(100, 5, 2, random_state=0)
    cvs = [{'cv': StratifiedKFold}]
    for label, clfs in zip(('std',), (op.DBG_std_clfs,)):
        exp = per.Experiment(M, y, clfs=clfs, cvs=cvs)
        # items() instead of the Python-2-only iteritems(): identical
        # pairs on Python 2, and it also exists on Python 3 dicts.
        # Assumes average_score() returns a dict-like mapping.
        result = {
            str(key): val
            for key, val in exp.average_score().items()
        }
        self.__compare_to_ref_pkl(
            result,
            'test_operate_{}'.format(label))
def test_std_clfs(self):
    """Run the standard classifier grid and compare trial names.

    For each (label, classifier grid) pair, runs an experiment with
    ``StratifiedKFold`` and checks the set of ``str(trial)`` values
    against a stored reference.
    """
    M, y = uft.generate_test_matrix(100, 5, 2, random_state=0)
    cvs = [{'cv': StratifiedKFold}]
    labelled_grids = (('std', per.DBG_std_clfs),)
    for label, clfs in labelled_grids:
        exp = per.Experiment(M, y, clfs=clfs, cvs=cvs)
        exp.run()
        result = set(str(trial) for trial in exp.trials)
        # NOTE(review): the reference key is 'test_operate_<label>', not
        # one named after this test -- confirm that this is the intended
        # reference pickle.
        self.__compare_to_ref_pkl(result, 'test_operate_{}'.format(label))
def test_operate(self):
    """Check average scores of the standard classifier grid.

    Each (label, grid) pair yields one experiment using
    ``StratifiedKFold``. Its ``average_score()`` mapping is re-keyed by
    ``str(key)`` and compared with the reference named
    ``'test_operate_<label>'``.
    """
    M, y = uft.generate_test_matrix(100, 5, 2, random_state=0)
    cvs = [{'cv': StratifiedKFold}]
    for label, clfs in zip(('std', ), (op.DBG_std_clfs, )):
        exp = per.Experiment(M, y, clfs=clfs, cvs=cvs)
        scores = exp.average_score()
        # iteritems() exists only on Python 2 dicts; items() yields the
        # same pairs there and keeps working on Python 3. Assumes the
        # return value is dict-like.
        result = {str(key): val for key, val in scores.items()}
        self.__compare_to_ref_pkl(result, 'test_operate_{}'.format(label))
def test_report_simple(self):
    """Append a report for a single random-forest grid to ``self.report``.

    Nothing is asserted; the test only adds a heading and the
    experiment's report object to ``self.report``.
    """
    M, y = uft.generate_test_matrix(100, 5, 2, random_state=0)
    forest_grid = {
        'clf': RandomForestClassifier,
        'n_estimators': [10, 100, 1000],
        'random_state': [0],
    }
    fold_grid = {'cv': StratifiedKFold}
    exp = per.Experiment(M, y, clfs=[forest_grid], cvs=[fold_grid])
    # Only the second element of the returned pair is needed.
    _unused, sub_report = exp.make_report(
        return_report_object=True,
        verbose=False)
    self.report.add_heading('test_report_simple', 1)
    self.report.add_subreport(sub_report)
def test_report_simple(self):
    """Build a report over three forest sizes and add it to the report.

    Uses one ``RandomForestClassifier`` grid (``n_estimators`` of 10,
    100 and 1000) with ``StratifiedKFold``. Makes no assertions.
    """
    M, y = uft.generate_test_matrix(100, 5, 2, random_state=0)

    clfs = [dict([
        ('clf', RandomForestClassifier),
        ('n_estimators', [10, 100, 1000]),
        ('random_state', [0]),
    ])]
    cvs = [dict([('cv', StratifiedKFold)])]

    exp = per.Experiment(M, y, clfs=clfs, cvs=cvs)
    report_kwargs = {'return_report_object': True, 'verbose': False}
    # The first element of the returned pair is discarded.
    _, rep = exp.make_report(**report_kwargs)

    self.report.add_heading('test_report_simple', 1)
    self.report.add_subreport(rep)
def test_make_csv(self):
    """Check that ``Experiment.make_csv`` matches the reference CSV.

    Builds an experiment from a random-forest grid, an SVC grid, a
    ``SubsetSweepNumRows`` grid and a ``StratifiedKFold`` grid, then
    writes its CSV and compares the text with ``make_csv.csv`` under
    ``REFERENCE_PKL_PATH``.
    """
    M, y = uft.generate_test_matrix(1000, 5, 2, random_state=0)
    clfs = [{'clf': RandomForestClassifier,
             'n_estimators': [10, 100],
             'max_depth': [5, 25],
             'random_state': [0]},
            {'clf': SVC,
             'kernel': ['linear', 'rbf'],
             'probability': [True],
             'random_state': [0]}]
    subsets = [{'subset': per.SubsetSweepNumRows,
                'num_rows': [[100, 200]],
                'random_state': [0]}]
    cvs = [{'cv': StratifiedKFold, 'n_folds': [2, 3]}]
    exp = per.Experiment(M, y, clfs=clfs, subsets=subsets, cvs=cvs)
    result_path = exp.make_csv()
    # Previously the returned path was never inspected, so the test
    # could only fail by raising. This is the same reference-file
    # comparison the other test_make_csv definitions perform.
    ctrl_path = os.path.join(REFERENCE_PKL_PATH, 'make_csv.csv')
    with open(result_path) as result:
        with open(ctrl_path) as ctrl:
            self.assertEqual(result.read(), ctrl.read())
def test_report_complex(self):
    """Add a classifier-dimension report for a multi-grid experiment.

    Combines a random-forest grid, an SVC grid, a
    ``SubsetRandomRowsActualDistribution`` grid and ``StratifiedKFold``
    in one experiment, requests a report with ``dimension=per.CLF`` and
    attaches it to ``self.report``. No assertions are made.
    """
    # random_state=0 matches the other generate_test_matrix calls in
    # this file; without it the report content could differ per run.
    M, y = uft.generate_test_matrix(100, 5, 2, random_state=0)
    clfs = [{'clf': RandomForestClassifier,
             'n_estimators': [10, 100],
             'max_depth': [1, 10],
             'random_state': [0]},
            {'clf': SVC,
             'kernel': ['linear', 'rbf'],
             'probability': [True],
             'random_state': [0]}]
    subsets = [{'subset': per.SubsetRandomRowsActualDistribution,
                'subset_size': [20, 40, 60, 80, 100],
                'random_state': [0]}]
    cvs = [{'cv': StratifiedKFold}]
    exp = per.Experiment(M, y, clfs, subsets, cvs)
    # Only the report object (second element) is kept.
    _, rep = exp.make_report(
        dimension=per.CLF,
        return_report_object=True,
        verbose=False)
    self.report.add_heading('test_report_complex', 1)
    self.report.add_subreport(rep)
def test_subsetting(self):
    """Compare the runs produced by each subset grid with the reference.

    An experiment configured with four subset grids is run; the result
    maps ``str(trial)`` to a frozenset of ``str(run)`` for that trial's
    runs and is checked against the ``'test_subsetting'`` reference.
    """
    M, y = uft.generate_test_matrix(1000, 5, 2, random_state=0)

    # The two random-row grids share the same parameters.
    subsets = [
        {'subset': subset_cls, 'subset_size': [20], 'random_state': [0]}
        for subset_cls in (per.SubsetRandomRowsEvenDistribution,
                           per.SubsetRandomRowsActualDistribution)
    ]
    subsets.append({'subset': per.SubsetSweepNumRows,
                    'num_rows': [[10, 20, 30]],
                    'random_state': [0]})
    subsets.append({'subset': per.SubsetSweepVaryStratification,
                    'proportions_positive': [[.5, .75, .9]],
                    'subset_size': [10],
                    'random_state': [0]})

    exp = per.Experiment(M, y, subsets=subsets)
    exp.run()

    result = dict(
        (str(trial), frozenset(str(run) for run in trial.runs))
        for trial in exp.trials)
    self.__compare_to_ref_pkl(result, 'test_subsetting')
def test_get_top_features(self):
    """Check ``comm.get_top_features`` against hard-coded scores.

    Fits a seeded ``RandomForestClassifier`` on a train split of the
    generated matrix and compares the returned features with a fixed
    table of ten (feature name, score) pairs.
    """
    M, labels = uft.generate_test_matrix(1000, 15, random_state=0)
    M = utils.cast_np_sa_to_nd(M)
    # NOTE(review): train_test_split is not given a random_state here,
    # yet the expected scores below are fixed -- presumably the split is
    # deterministic through RNG state set earlier; confirm.
    M_train, _, labels_train, _ = train_test_split(M, labels)
    clf = RandomForestClassifier(random_state=0)
    clf.fit(M_train, labels_train)

    expected_rows = [
        ('f5', 0.0773838526068),
        ('f13', 0.0769596713039),
        ('f8', 0.0751584839431),
        ('f6', 0.0730815879102),
        ('f11', 0.0684456133071),
        ('f9', 0.0666747414603),
        ('f10', 0.0659621889608),
        ('f7', 0.0657988099065),
        ('f2', 0.0634000069218),
        ('f0', 0.0632912268319),
    ]
    res = comm.get_top_features(clf, M, verbose=False)
    ctrl = utils.convert_to_sa(expected_rows,
                               col_names=('feat_name', 'score'))
    self.assertTrue(uft.array_equal(ctrl, res))
def test_get_top_features(self):
    """Check ``dsp.get_top_features`` against the fitted importances.

    Fits a seeded ``RandomForestClassifier`` and builds the expected
    table from ``clf.feature_importances_``: names ``f0``..``f14``
    paired with their importances, restricted to the ten largest in
    descending order. The function is then called twice, once with the
    matrix ``M`` and once with explicit ``col_names``, and both results
    must equal the expected table.
    """
    M, labels = uft.generate_test_matrix(1000, 15, random_state=0)
    M = utils.cast_np_sa_to_nd(M)
    # The test halves of the split are not used.
    M_train, _, labels_train, _ = train_test_split(M, labels)
    clf = RandomForestClassifier(random_state=0)
    clf.fit(M_train, labels_train)

    ctrl_feat_importances = clf.feature_importances_
    # range/list(zip(...)) instead of xrange/zip: the same values on
    # Python 2, and still a concrete list of pairs on Python 3.
    ctrl_col_names = ['f{}'.format(i) for i in range(15)]
    # argsort is ascending, so reverse it and keep the first ten.
    ctrl_feat_ranks = np.argsort(ctrl_feat_importances)[::-1][:10]
    ctrl = utils.convert_to_sa(
        list(zip(ctrl_col_names, ctrl_feat_importances)),
        col_names=('feat_name', 'score'))[ctrl_feat_ranks]

    res = dsp.get_top_features(clf, M, verbose=False)
    self.assertTrue(uft.array_equal(ctrl, res))

    # Same check with explicit column names, reusing the list built
    # above rather than constructing it a second time.
    res = dsp.get_top_features(clf, col_names=ctrl_col_names,
                               verbose=False)
    self.assertTrue(uft.array_equal(ctrl, res))
def test_make_csv(self):
    """Verify the CSV written by ``make_csv`` against the stored copy.

    The experiment spans a random-forest grid, an SVC grid, a
    ``SubsetSweepNumRows`` grid and a ``StratifiedKFold`` grid. The text
    of the produced file must equal the text of ``make_csv.csv`` found
    under ``REFERENCE_PKL_PATH``.
    """
    M, y = uft.generate_test_matrix(1000, 5, 2, random_state=0)

    clfs = [
        {
            'clf': RandomForestClassifier,
            'n_estimators': [10, 100],
            'max_depth': [5, 25],
            'random_state': [0],
        },
        {
            'clf': SVC,
            'kernel': ['linear', 'rbf'],
            'probability': [True],
            'random_state': [0],
        },
    ]
    subsets = [
        {
            'subset': per.SubsetSweepNumRows,
            'num_rows': [[100, 200]],
            'random_state': [0],
        },
    ]
    cvs = [{'cv': StratifiedKFold, 'n_folds': [2, 3]}]

    experiment_kwargs = {'clfs': clfs, 'subsets': subsets, 'cvs': cvs}
    exp = per.Experiment(M, y, **experiment_kwargs)

    result_path = exp.make_csv()
    ctrl_path = os.path.join(REFERENCE_PKL_PATH, 'make_csv.csv')
    with open(result_path) as result:
        with open(ctrl_path) as ctrl:
            result_text = result.read()
            ctrl_text = ctrl.read()
    self.assertEqual(result_text, ctrl_text)
def test_get_top_features(self):
    """Compare ``comm.get_top_features`` output with fixed values.

    A ``RandomForestClassifier`` with ``random_state=0`` is fitted on
    the training part of a split of the generated data. The result for
    the full matrix must equal a hard-coded table of ten
    ``('feat_name', 'score')`` rows.
    """
    M, labels = uft.generate_test_matrix(1000, 15, random_state=0)
    M = utils.cast_np_sa_to_nd(M)
    # NOTE(review): no random_state is passed to train_test_split even
    # though the expected scores are constants -- presumably earlier
    # seeding makes the split repeatable; verify.
    split = train_test_split(M, labels)
    M_train, M_test, labels_train, labels_test = split
    clf = RandomForestClassifier(random_state=0)
    clf.fit(M_train, labels_train)
    res = comm.get_top_features(clf, M, verbose=False)

    feat_names = ('f5', 'f13', 'f8', 'f6', 'f11',
                  'f9', 'f10', 'f7', 'f2', 'f0')
    feat_scores = (0.0773838526068, 0.0769596713039, 0.0751584839431,
                   0.0730815879102, 0.0684456133071, 0.0666747414603,
                   0.0659621889608, 0.0657988099065, 0.0634000069218,
                   0.0632912268319)
    # Pair names with scores explicitly so this is a list of tuples on
    # every Python version.
    expected_rows = [(name, score)
                     for name, score in zip(feat_names, feat_scores)]
    ctrl = utils.convert_to_sa(
        expected_rows,
        col_names=('feat_name', 'score'))
    self.assertTrue(uft.array_equal(ctrl, res))
def test_generate_matrix(self):
    """Smoke-test ``generate_test_matrix`` with a list of column types.

    Calls the generator and prints both returned values. There are no
    assertions, so the test fails only if the call raises.
    """
    # The fourth argument is a list of Python types -- presumably the
    # types of the leading columns; confirm against the helper.
    M, y = utils_for_tests.generate_test_matrix(100, 5, 3,
                                                [float, str, int])
    # Parenthesised single-argument form: prints the same on Python 2
    # as the print statement did, and is valid syntax on Python 3.
    print(M)
    print(y)