def test_compare_rbf(self):
    """Compare a linear SVM on RBF KernelPCA features with an RBF kernel SVM.

    Both models must yield the same number of predictions, an (almost) equal
    concordance index and identical remaining concordance statistics.
    """
    features, outcome, _, _ = load_arff_file(WHAS500_FILE, ['fstat', 'lenfol'], '1')

    # Explicit feature map: transform the data with an RBF kernel PCA first.
    kernel_pca = KernelPCA(kernel="rbf")
    projected = kernel_pca.fit_transform(features)

    # Both estimators share the same optimizer settings.
    svm_options = dict(optimizer='rbtree', tol=1e-8, max_iter=1000, random_state=0)

    linear_svm = FastSurvivalSVM(**svm_options)
    linear_svm.fit(projected, outcome)

    kernel_svm = FastKernelSurvivalSVM(kernel="rbf", **svm_options)
    kernel_svm.fit(features, outcome)

    linear_pred = linear_svm.predict(kernel_pca.transform(features))
    kernel_pred = kernel_svm.predict(features)
    self.assertEqual(len(linear_pred), len(kernel_pred))

    event, time = outcome['fstat'], outcome['lenfol']
    linear_stats = concordance_index_censored(event, time, linear_pred)
    kernel_stats = concordance_index_censored(event, time, kernel_pred)

    self.assertAlmostEqual(linear_stats[0], kernel_stats[0])
    self.assertTupleEqual(linear_stats[1:], kernel_stats[1:])
def main(args): LOG.info("Using IPython profile %s", args.profile) rc = parallel.Client(profile=args.profile) with rc[:].sync_imports(): from sklearn.metrics import mean_squared_error import numpy _x, _y, _x_test, _y_test = load_arff_file(args.input, [args.event, args.time], args.outcome, args.test, to_numeric=False) _data = _x.copy() _x = categorical_to_numeric(_x) if _x_test is not None: _x_test = categorical_to_numeric(_x_test) _estimator = get_estimator(args.method, args.seed, _data) _param_grid = get_param_grid(args.params) print_settings(_estimator, _x, _y) _grid_search = run_grid_search(_estimator, _param_grid, _x, _y, _x_test, _y_test, args.seed, args.profile) if args.test is None: _output = "results-rmse-%s-%s.csv" % (basename( args.input).rstrip(".arff"), args.method) else: _output = "results-rmse-%s+%s-%s.csv" % (basename( args.input).rstrip(".arff"), basename( args.test).rstrip(".arff"), args.method) write_results(_grid_search.best_params_, _output) rc[:].clear()
def test_compare_clinical_kernel(self):
    """Compare a linear SVM on clinical-kernel PCA features with the kernel SVM.

    The clinical kernel is fitted on the raw data; both models are then
    trained on the standardized, categorical-encoded data and must agree on
    the concordance statistics.
    """
    raw_features, outcome, _, _ = load_arff_file(WHAS500_FILE, ['fstat', 'lenfol'], '1',
                                                 standardize_numeric=False, to_numeric=False)

    clinical = ClinicalKernelTransform()
    clinical.fit(raw_features)

    features = encode_categorical(standardize(raw_features))

    # Explicit feature map using the fitted clinical kernel.
    kernel_pca = KernelPCA(kernel=clinical.pairwise_kernel)
    projected = kernel_pca.fit_transform(features)

    svm_options = dict(optimizer='rbtree', tol=1e-8, max_iter=1000, random_state=0)

    linear_svm = FastSurvivalSVM(**svm_options)
    linear_svm.fit(projected, outcome)

    kernel_svm = FastKernelSurvivalSVM(kernel=clinical.pairwise_kernel, **svm_options)
    kernel_svm.fit(features, outcome)

    linear_pred = linear_svm.predict(kernel_pca.transform(features))
    kernel_pred = kernel_svm.predict(features)
    self.assertEqual(len(linear_pred), len(kernel_pred))

    event, time = outcome['fstat'], outcome['lenfol']
    linear_stats = concordance_index_censored(event, time, linear_pred)
    kernel_stats = concordance_index_censored(event, time, kernel_pred)

    self.assertAlmostEqual(linear_stats[0], kernel_stats[0])
    self.assertTupleEqual(linear_stats[1:], kernel_stats[1:])
def main(args):
    """Fit the classification ensemble and, if test data is given, write predictions.

    The CSV written for the test data is indexed by ``RPT`` and holds the
    predicted probability of class ``1`` (``RISK``) and the most probable
    class label (``DISCONT``).
    """
    label_columns = [args.event, 'ENDTRS_C', 'ENTRT_PC']
    _x, _y, _x_test, _y_test = load_arff_file(
        args.input, label_columns, survival=False,
        path_testing=args.test, to_numeric=False)

    ensemble = fit_and_dump(_x, _y, args)
    ensemble.load_from_db()

    print("Ensemble size: %d" % len(ensemble))

    # Nothing to predict without test data.
    if _x_test is None:
        return

    _x_test = categorical_to_numeric(_x_test)

    proba = ensemble.predict(_x_test.values)
    # Map the column with the highest probability back to its class label.
    best_column = numpy.argmax(proba, axis=1)
    pred_labels = ensemble.classes_.take(best_column, axis=0)
    positive_proba = proba[:, ensemble.classes_ == 1].ravel()

    result = pandas.DataFrame({
        "RISK": positive_proba,
        "RPT": _x_test.index.to_series(),
        "DISCONT": pred_labels
    })
    result.set_index("RPT", inplace=True)

    input_name = basename(args.input)
    _results_file = 'results-%s-%s.csv' % (input_name, "ensemble_selection_classification")
    result.to_csv(_results_file)
def test_load_with_categorical_index_2(self):
    """Load ARFF_CATEGORICAL_INDEX_2 without test data and verify index, label and features."""
    fp = StringIO(ARFF_CATEGORICAL_INDEX_2)

    x_train, y_train, x_test, y_test = load_arff_file(fp, ["label"], pos_label="yes", survival=False,
                                                      standardize_numeric=False, to_numeric=False)

    # assertIsNone checks identity; assertEqual(obj, None) would evaluate
    # ``obj == None``, which raises instead of failing cleanly if a
    # DataFrame were returned.
    self.assertIsNone(x_test)
    self.assertIsNone(y_test)

    self.assertTupleEqual(x_train.shape, (5, 2))
    self.assertTupleEqual(y_train.shape, (5, 1))

    index = pandas.Index(['ASampleOne', 'ASampleTwo', 'ASampleThree', 'ASampleFour', 'ASampleFive'],
                         name='index', dtype=object)
    tm.assert_index_equal(x_train.index, index, exact=True)

    label = pandas.Series(pandas.Categorical(["no", "no", "yes", "yes", "no"],
                                             categories=["yes", "no"], ordered=False),
                          name="label", index=index)
    tm.assert_series_equal(y_train["label"], label, check_exact=True)

    value = pandas.Series([1.51, 1.38, -20, 245.3, 3.14], name="value", index=index)
    tm.assert_series_equal(x_train["value"], value, check_exact=True)

    size = pandas.Series(pandas.Categorical(["small", "small", "large", "small", "large"],
                                            categories=["small", "medium", "large"], ordered=False),
                         name="size", index=index)
    tm.assert_series_equal(x_train["size"], size, check_exact=True)
def setUp(self):
    """Load the WHAS500 data without ties; numeric columns are not standardized."""
    # The naive survival SVM resolves ties in survival time differently,
    # so the comparison is done on data without ties.
    features, outcome, _, _ = load_arff_file(
        WHAS500_NOTIES_FILE, ['fstat', 'lenfol'], '1',
        standardize_numeric=False)
    self.x = features
    self.y = outcome
def main(args): LOG.info("Using IPython profile %s", args.profile) rc = parallel.Client(profile=args.profile) _x, _y, _x_test, _y_test = load_arff_file( args.input, [args.event, 'ENDTRS_C', 'ENTRT_PC'], survival=False, path_testing=args.test, to_numeric=False) _data = _x.copy() _x = categorical_to_numeric(_x) if _x_test is not None: _x_test = categorical_to_numeric(_x_test) _y = _y[args.event].cat.codes.values _estimator = get_estimator(args.method, args.seed, _data) _param_grid = get_param_grid(args.params) print_settings(_estimator, _x, _y) _grid_search = run_grid_search(_estimator, _param_grid, args.metric, _x, _y, _x_test, _y_test, args.seed, args.profile) if args.test is None: _output = "results-%s-%s-%s.csv" % (args.metric, basename( args.input).rstrip(".arff"), args.method) else: _output = "results-%s-%s+%s-%s.csv" % (args.metric, basename( args.input).rstrip(".arff"), basename( args.test).rstrip(".arff"), args.method) write_results(_grid_search.best_params_, _output) rc[:].clear()
def test_load_train_and_test_with_different_columns(self):
    """Loading train and test files with differing columns emits exactly one warning.

    The two files are written with 19 and 11 as the third argument of
    ``_make_and_write_data`` (presumably the number of features).
    """
    train_file = tempfile.NamedTemporaryFile("w", suffix=".arff", delete=False)
    test_file = tempfile.NamedTemporaryFile("w", suffix=".arff", delete=False)
    expected_message = "Restricting columns to intersection between training and testing data"

    try:
        _make_and_write_data(train_file, 100, 19, False, True, 0)
        _make_and_write_data(test_file, 20, 11, False, True, 0)

        with warnings.catch_warnings(record=True) as caught:
            # Record every warning, even ones already triggered elsewhere.
            warnings.simplefilter("always")
            load_arff_file(train_file.name, ["event", "time"], 1,
                           path_testing=test_file.name, survival=True,
                           standardize_numeric=False, to_numeric=False)

            self.assertEqual(1, len(caught))
            self.assertEqual(expected_message, str(caught[0].message))
    finally:
        # The files were created with delete=False, so remove them manually.
        os.unlink(train_file.name)
        os.unlink(test_file.name)
def test_load_train_and_test_with_categorical_index(self):
    """Load train and test ARFF data together and verify index, label and features of both."""
    train_source = StringIO(ARFF_CATEGORICAL_INDEX_1)
    test_source = StringIO(ARFF_CATEGORICAL_INDEX_2)

    x_train, y_train, x_test, y_test = load_arff_file(
        train_source, ["label"], pos_label="yes", path_testing=test_source,
        survival=False, standardize_numeric=False, to_numeric=False)

    expected_shapes = [
        (x_train, (4, 2)),
        (x_test, (5, 2)),
        (y_train, (4, 1)),
        (y_test, (5, 1)),
    ]
    for frame, shape in expected_shapes:
        self.assertTupleEqual(frame.shape, shape)

    size_levels = ["small", "medium", "large"]

    # Check train data
    train_index = pandas.Index(['SampleOne', 'SampleTwo', 'SampleThree', 'SampleFour'],
                               name='index', dtype=object)
    tm.assert_index_equal(x_train.index, train_index, exact=True)

    train_label_values = pandas.Categorical(["yes", "no", "yes", "yes"],
                                            categories=["no", "yes"], ordered=False)
    train_label = pandas.Series(train_label_values, name="label", index=train_index)
    tm.assert_series_equal(y_train["label"], train_label, check_exact=True)

    train_value = pandas.Series([15.1, 13.8, -0.2, 2.453], name="value", index=train_index)
    tm.assert_series_equal(x_train["value"], train_value, check_exact=True)

    train_size_values = pandas.Categorical(["medium", "large", "small", "large"],
                                           categories=size_levels, ordered=False)
    train_size = pandas.Series(train_size_values, name="size", index=train_index)
    tm.assert_series_equal(x_train["size"], train_size, check_exact=True)

    # Check test data
    test_index = pandas.Index(['ASampleOne', 'ASampleTwo', 'ASampleThree', 'ASampleFour', 'ASampleFive'],
                              name='index', dtype=object)
    tm.assert_index_equal(x_test.index, test_index, exact=True)

    test_label_values = pandas.Categorical(["no", "no", "yes", "yes", "no"],
                                           categories=["yes", "no"], ordered=False)
    test_label = pandas.Series(test_label_values, name="label", index=test_index)
    tm.assert_series_equal(y_test["label"], test_label, check_exact=True)

    test_value = pandas.Series([1.51, 1.38, -20, 245.3, 3.14], name="value", index=test_index)
    tm.assert_series_equal(x_test["value"], test_value, check_exact=True)

    test_size_values = pandas.Categorical(["small", "small", "large", "small", "large"],
                                          categories=size_levels, ordered=False)
    test_size = pandas.Series(test_size_values, name="size", index=test_index)
    tm.assert_series_equal(x_test["size"], test_size, check_exact=True)
def main(args):
    """Fit the regression ensemble and, if test data is given, write predictions.

    Predictions for the test data are passed through ``numpy.exp`` and
    written as ``TIMETOEVENT`` to a CSV file indexed by ``RPT``.
    """
    label_columns = [args.event, args.time]
    _x, _y, _x_test, _y_test = load_arff_file(args.input, label_columns, args.outcome, args.test,
                                              to_numeric=False)

    ensemble = fit_and_dump(_x, _y, args)

    print("Ensemble size: %d" % len(ensemble))

    # Nothing to predict without test data.
    if _x_test is None:
        return

    _x_test = categorical_to_numeric(_x_test)

    raw_prediction = ensemble.predict(_x_test.values)
    # NOTE(review): exponentiating suggests the model predicts on a log
    # scale -- confirm against fit_and_dump.
    p = numpy.exp(raw_prediction)

    result = pandas.DataFrame({"TIMETOEVENT": p,
                               "RPT": _x_test.index.to_series()})
    result.set_index("RPT", inplace=True)

    input_name = basename(args.input)
    _results_file = 'results-%s-%s.csv' % (input_name, "ensemble_selection_regression")
    result.to_csv(_results_file)
def test_fit_and_predict_clinical_kernel(self):
    """Fit the kernel survival SVM with the clinical kernel and check its score.

    Verifies that the estimator is not flagged as pairwise, that there is one
    coefficient per training sample and that the score stays within 1e-3 of
    the reference value.
    """
    x_full, y, _, _ = load_arff_file(WHAS500_FILE, ['fstat', 'lenfol'], '1',
                                     standardize_numeric=False, to_numeric=False)

    # The kernel is fitted on the raw data, the model on the standardized
    # and categorical-encoded data.
    trans = ClinicalKernelTransform()
    trans.fit(x_full)

    x = encode_categorical(standardize(x_full))

    ssvm = FastKernelSurvivalSVM(optimizer="rbtree", kernel=trans.pairwise_kernel,
                                 max_iter=100, random_state=0)
    ssvm.fit(x.values, y)

    self.assertFalse(ssvm._pairwise)
    # assertEquals is a deprecated alias that was removed in Python 3.12.
    self.assertEqual(x.shape[0], ssvm.coef_.shape[0])

    c = ssvm.score(x.values, y)
    self.assertLessEqual(abs(0.83699051218246412 - c), 1e-3)
def test_load_with_index(self):
    """Round-trip a generated dataset through a temporary ARFF file without test data."""
    tmp = tempfile.NamedTemporaryFile("w", suffix=".arff", delete=False)
    try:
        dataset = _make_and_write_data(tmp, 100, 10, True, True, 0)

        x_train, y_train, x_test, y_test = load_arff_file(tmp.name, ["event", "time"], 1,
                                                          survival=True,
                                                          standardize_numeric=False, to_numeric=False)

        # assertIsNone checks identity; assertEqual(obj, None) would
        # evaluate ``obj == None``, which raises instead of failing cleanly
        # if a DataFrame were returned.
        self.assertIsNone(x_test)
        self.assertIsNone(y_test)

        # Features are everything except the label columns.
        cols = ["event", "time"]
        x_true = dataset.drop(cols, axis=1)

        self.assert_x_equal(x_true, x_train)
        self.assert_y_equal(dataset, y_train)
    finally:
        # The file was created with delete=False, so remove it manually.
        os.unlink(tmp.name)
def main(args): LOG.info("Using IPython profile %s", args.profile) rc = parallel.Client(profile=args.profile) with rc[:].sync_imports(): from survival.metrics import concordance_index_censored import numpy if args.metric == "timeroc": with rc[:].sync_imports(): from rpy2.robjects.packages import importr from rpy2.robjects import numpy2ri from scipy.integrate import simps _x, _y, _x_test, _y_test = load_arff_file(args.input, [args.event, args.time], args.outcome, args.test, to_numeric=False) _data = _x.copy() _x = categorical_to_numeric(_x) if _x_test is not None: _x_test = categorical_to_numeric(_x_test) _estimator = get_estimator(args.method, args.seed, _data) _param_grid = get_param_grid(args.params) print_settings(_estimator, _x, _y) _grid_search = run_grid_search(_estimator, _param_grid, args.metric, _x, _y, _x_test, _y_test, args.seed, args.profile) if args.test is None: _output = "results-%s-%s-%s.csv" % (args.metric, basename( args.input).rstrip(".arff"), args.method) else: _output = "results-%s-%s+%s-%s.csv" % (args.metric, basename( args.input).rstrip(".arff"), basename( args.test).rstrip(".arff"), args.method) write_results(_grid_search.best_params_, _output) rc[:].clear()
def test_load_train_and_test_no_labels(self):
    """Load a labelled training file with a test file for which no labels are returned."""
    tmp_train = tempfile.NamedTemporaryFile("w", suffix=".arff", delete=False)
    tmp_test = tempfile.NamedTemporaryFile("w", suffix=".arff", delete=False)
    try:
        train_dataset = _make_and_write_data(tmp_train, 100, 10, True, True, 0)
        test_dataset = _make_and_write_data(tmp_test, 20, 10, True, False, 0)

        x_train, y_train, x_test, y_test = load_arff_file(tmp_train.name, ["event", "time"], 1,
                                                          path_testing=tmp_test.name,
                                                          survival=True,
                                                          standardize_numeric=False, to_numeric=False)

        # Training features are everything except the label columns.
        cols = ["event", "time"]
        x_true = train_dataset.drop(cols, axis=1)

        self.assert_x_equal(x_true, x_train)
        self.assert_y_equal(train_dataset, y_train)

        # The whole test dataset is expected back as features.
        self.assert_x_equal(test_dataset, x_test)
        # assertIsNone checks identity; assertEqual(obj, None) would
        # evaluate ``obj == None``, which raises instead of failing cleanly
        # if labels were returned.
        self.assertIsNone(y_test)
    finally:
        # The files were created with delete=False, so remove them manually.
        os.unlink(tmp_train.name)
        os.unlink(tmp_test.name)
def setUp(self):
    """Load the GBSG2 data unstandardized and store it with categorical features encoded."""
    raw_features, outcome, _, _ = load_arff_file(
        GBSG2_FILE, ["cens", "time"], "1",
        standardize_numeric=False, to_numeric=False)

    self.x = encode_categorical(raw_features)
    self.y = outcome
def setUp(self):
    """Load the WHAS500 data with numeric columns standardized."""
    features, outcome, _, _ = load_arff_file(
        WHAS500_FILE, ['fstat', 'lenfol'], '1',
        standardize_numeric=True)
    self.x = features
    self.y = outcome