Esempio n. 1
0
    def test_compare_rbf(self):
        """Compare a linear SVM on RBF kernel-PCA features with the RBF kernel SVM.

        ``FastSurvivalSVM`` is fitted on the ``KernelPCA(kernel="rbf")``
        embedding of the data loaded from ``WHAS500_FILE``, while
        ``FastKernelSurvivalSVM`` with ``kernel="rbf"`` is fitted on the data
        itself.  Both prediction vectors must have the same length, the first
        entry of the two ``concordance_index_censored`` results must be almost
        equal, and the remaining entries must be identical.
        """
        features, target, _, _ = load_arff_file(WHAS500_FILE, ['fstat', 'lenfol'], '1')

        kernel_pca = KernelPCA(kernel="rbf")
        embedded = kernel_pca.fit_transform(features)

        # Both models are configured with the same optimizer settings.
        svm_options = dict(optimizer='rbtree', tol=1e-8, max_iter=1000, random_state=0)

        linear_svm = FastSurvivalSVM(**svm_options)
        linear_svm.fit(embedded, target)

        kernel_svm = FastKernelSurvivalSVM(kernel="rbf", **svm_options)
        kernel_svm.fit(features, target)

        linear_pred = linear_svm.predict(kernel_pca.transform(features))
        kernel_pred = kernel_svm.predict(features)

        self.assertEqual(len(linear_pred), len(kernel_pred))

        fstat, lenfol = target['fstat'], target['lenfol']
        linear_result = concordance_index_censored(fstat, lenfol, linear_pred)
        kernel_result = concordance_index_censored(fstat, lenfol, kernel_pred)

        self.assertAlmostEqual(linear_result[0], kernel_result[0])
        self.assertTupleEqual(linear_result[1:], kernel_result[1:])
def main(args):
    """Run a parameter grid search and hand the best parameters to ``write_results``.

    ``args`` must provide the attributes ``profile``, ``input``, ``event``,
    ``time``, ``outcome``, ``test``, ``method``, ``seed`` and ``params``.
    The output file name is derived from the base names of ``args.input``
    and, when given, ``args.test`` with a trailing ``.arff`` removed.
    """
    LOG.info("Using IPython profile %s", args.profile)
    rc = parallel.Client(profile=args.profile)

    # NOTE(review): these names are not used locally; presumably the imports
    # are performed inside sync_imports() so the engines import them as well.
    with rc[:].sync_imports():
        from sklearn.metrics import mean_squared_error
        import numpy

    _x, _y, _x_test, _y_test = load_arff_file(args.input,
                                              [args.event, args.time],
                                              args.outcome,
                                              args.test,
                                              to_numeric=False)
    # Keep the data as loaded for get_estimator before converting to numeric.
    _data = _x.copy()
    _x = categorical_to_numeric(_x)
    if _x_test is not None:
        _x_test = categorical_to_numeric(_x_test)

    _estimator = get_estimator(args.method, args.seed, _data)
    _param_grid = get_param_grid(args.params)
    print_settings(_estimator, _x, _y)

    _grid_search = run_grid_search(_estimator, _param_grid, _x, _y, _x_test,
                                   _y_test, args.seed, args.profile)

    def _stem(path):
        # str.rstrip(".arff") removes any trailing run of the characters
        # '.', 'a', 'r' and 'f' (e.g. "staff.arff" -> "st"), so remove the
        # suffix explicitly instead.
        name = basename(path)
        if name.endswith(".arff"):
            name = name[:-len(".arff")]
        return name

    if args.test is None:
        _output = "results-rmse-%s-%s.csv" % (_stem(args.input), args.method)
    else:
        _output = "results-rmse-%s+%s-%s.csv" % (
            _stem(args.input), _stem(args.test), args.method)
    write_results(_grid_search.best_params_, _output)

    rc[:].clear()
    def test_compare_clinical_kernel(self):
        """Compare a linear SVM on kernel-PCA features with the kernel SVM.

        The kernel is ``ClinicalKernelTransform.pairwise_kernel``, with the
        transform fitted on the data as loaded from ``WHAS500_FILE``.  Both
        models are trained on ``encode_categorical(standardize(...))`` of that
        data: ``FastSurvivalSVM`` on its ``KernelPCA`` embedding and
        ``FastKernelSurvivalSVM`` on the encoded data directly.  The
        ``concordance_index_censored`` results must agree: almost equal first
        entry, identical remaining entries.
        """
        raw_data, target, _, _ = load_arff_file(WHAS500_FILE, ['fstat', 'lenfol'], '1',
                                                standardize_numeric=False, to_numeric=False)

        clinical = ClinicalKernelTransform()
        clinical.fit(raw_data)

        standardized = standardize(raw_data)
        features = encode_categorical(standardized)

        kernel_pca = KernelPCA(kernel=clinical.pairwise_kernel)
        embedded = kernel_pca.fit_transform(features)

        # Both models are configured with the same optimizer settings.
        svm_options = dict(optimizer='rbtree', tol=1e-8, max_iter=1000, random_state=0)

        linear_svm = FastSurvivalSVM(**svm_options)
        linear_svm.fit(embedded, target)

        kernel_svm = FastKernelSurvivalSVM(kernel=clinical.pairwise_kernel, **svm_options)
        kernel_svm.fit(features, target)

        linear_pred = linear_svm.predict(kernel_pca.transform(features))
        kernel_pred = kernel_svm.predict(features)

        self.assertEqual(len(linear_pred), len(kernel_pred))

        fstat, lenfol = target['fstat'], target['lenfol']
        linear_result = concordance_index_censored(fstat, lenfol, linear_pred)
        kernel_result = concordance_index_censored(fstat, lenfol, kernel_pred)

        self.assertAlmostEqual(linear_result[0], kernel_result[0])
        self.assertTupleEqual(linear_result[1:], kernel_result[1:])
Esempio n. 4
0
def main(args):
    """Fit an ensemble model and write its predictions for the test data to CSV.

    ``args`` must provide ``input``, ``event`` and ``test``; it is also passed
    on to ``fit_and_dump``.  The result file contains the columns ``RISK``
    and ``DISCONT`` indexed by ``RPT`` (the index of the test data).

    Raises ``ValueError`` if no test data was loaded, because predictions can
    only be written for test samples.
    """
    _x, _y, _x_test, _y_test = load_arff_file(
        args.input, [args.event, 'ENDTRS_C', 'ENTRT_PC'],
        survival=False,
        path_testing=args.test,
        to_numeric=False)

    model = fit_and_dump(_x, _y, args)
    model.load_from_db()
    print("Ensemble size: %d" % len(model))

    # Everything below dereferences the test data.  Without this guard a
    # missing test set surfaced as an AttributeError on None.
    if _x_test is None:
        raise ValueError("No test data was loaded; cannot write predictions")
    _x_test = categorical_to_numeric(_x_test)

    proba = model.predict(_x_test.values)
    # Pick, for every row, the class whose column holds the largest value.
    pred_labels = model.classes_.take(numpy.argmax(proba, axis=1), axis=0)

    result = pandas.DataFrame({
        # Column of ``proba`` that belongs to the class equal to 1.
        "RISK": proba[:, model.classes_ == 1].ravel(),
        "RPT": _x_test.index.to_series(),
        "DISCONT": pred_labels
    })
    result.set_index("RPT", inplace=True)

    _results_file = 'results-%s-%s.csv' % (basename(
        args.input), "ensemble_selection_classification")
    result.to_csv(_results_file)
Esempio n. 5
0
    def test_load_with_categorical_index_2(self):
        """Load ``ARFF_CATEGORICAL_INDEX_2`` without test data and check every column.

        Verifies that no test split is returned, and that the index, the
        ``label`` column of ``y_train`` and the ``value`` and ``size`` columns
        of ``x_train`` match the expected values exactly.
        """
        fp = StringIO(ARFF_CATEGORICAL_INDEX_2)
        x_train, y_train, x_test, y_test = load_arff_file(fp, ["label"], pos_label="yes", survival=False,
                                                          standardize_numeric=False, to_numeric=False)

        # Identity checks: comparing a DataFrame to None with ``==`` would
        # raise an ambiguous-truth-value error instead of a clean failure.
        self.assertIsNone(x_test)
        self.assertIsNone(y_test)

        self.assertTupleEqual(x_train.shape, (5, 2))
        self.assertTupleEqual(y_train.shape, (5, 1))

        index = pandas.Index(['ASampleOne', 'ASampleTwo', 'ASampleThree', 'ASampleFour', 'ASampleFive'],
                             name='index', dtype=object)
        tm.assert_index_equal(x_train.index, index, exact=True)

        label = pandas.Series(pandas.Categorical(["no", "no", "yes", "yes", "no"], categories=["yes", "no"], ordered=False),
                              name="label", index=index)
        tm.assert_series_equal(y_train["label"], label, check_exact=True)

        value = pandas.Series([1.51, 1.38, -20, 245.3, 3.14], name="value", index=index)
        tm.assert_series_equal(x_train["value"], value, check_exact=True)

        size = pandas.Series(pandas.Categorical(["small", "small", "large", "small", "large"],
                                                categories=["small", "medium", "large"], ordered=False),
                             name="size", index=index)
        tm.assert_series_equal(x_train["size"], size, check_exact=True)
Esempio n. 6
0
 def setUp(self):
     """Load the data from ``WHAS500_NOTIES_FILE`` into ``self.x`` and ``self.y``."""
     # The naive survival SVM resolves ties in survival time differently,
     # so a data set without ties is used here.
     loaded = load_arff_file(WHAS500_NOTIES_FILE, ['fstat', 'lenfol'], '1',
                             standardize_numeric=False)
     features, labels, _, _ = loaded
     self.x = features
     self.y = labels
Esempio n. 7
0
def main(args):
    """Run a parameter grid search and hand the best parameters to ``write_results``.

    ``args`` must provide the attributes ``profile``, ``input``, ``event``,
    ``test``, ``method``, ``seed``, ``params`` and ``metric``.  The target
    passed to the grid search is the category codes of the ``args.event``
    column.  The output file name is derived from ``args.metric`` and the
    base names of ``args.input`` and, when given, ``args.test`` with a
    trailing ``.arff`` removed.
    """
    LOG.info("Using IPython profile %s", args.profile)
    rc = parallel.Client(profile=args.profile)

    _x, _y, _x_test, _y_test = load_arff_file(
        args.input, [args.event, 'ENDTRS_C', 'ENTRT_PC'],
        survival=False,
        path_testing=args.test,
        to_numeric=False)
    # Keep the data as loaded for get_estimator before converting to numeric.
    _data = _x.copy()
    _x = categorical_to_numeric(_x)
    if _x_test is not None:
        _x_test = categorical_to_numeric(_x_test)
    # Replace the categorical event column by its integer category codes.
    _y = _y[args.event].cat.codes.values

    _estimator = get_estimator(args.method, args.seed, _data)
    _param_grid = get_param_grid(args.params)
    print_settings(_estimator, _x, _y)

    _grid_search = run_grid_search(_estimator, _param_grid, args.metric, _x,
                                   _y, _x_test, _y_test, args.seed,
                                   args.profile)

    def _stem(path):
        # str.rstrip(".arff") removes any trailing run of the characters
        # '.', 'a', 'r' and 'f' (e.g. "staff.arff" -> "st"), so remove the
        # suffix explicitly instead.
        name = basename(path)
        if name.endswith(".arff"):
            name = name[:-len(".arff")]
        return name

    if args.test is None:
        _output = "results-%s-%s-%s.csv" % (
            args.metric, _stem(args.input), args.method)
    else:
        _output = "results-%s-%s+%s-%s.csv" % (
            args.metric, _stem(args.input), _stem(args.test), args.method)
    write_results(_grid_search.best_params_, _output)

    rc[:].clear()
Esempio n. 8
0
    def test_load_train_and_test_with_different_columns(self):
        """Loading train and test files written with different sizes emits one warning.

        The training file is written with ``_make_and_write_data(..., 100, 19, ...)``
        and the testing file with ``_make_and_write_data(..., 20, 11, ...)``.
        Exactly one warning with the column-restriction message must be
        recorded while loading them together.
        """
        train_file = tempfile.NamedTemporaryFile("w", suffix=".arff", delete=False)
        test_file = tempfile.NamedTemporaryFile("w", suffix=".arff", delete=False)
        expected_message = "Restricting columns to intersection between training and testing data"
        try:
            _make_and_write_data(train_file, 100, 19, False, True, 0)
            _make_and_write_data(test_file, 20, 11, False, True, 0)

            with warnings.catch_warnings(record=True) as caught:
                # Record every warning, even ones that were already shown.
                warnings.simplefilter("always")

                load_arff_file(train_file.name, ["event", "time"], 1,
                               path_testing=test_file.name,
                               survival=True,
                               standardize_numeric=False, to_numeric=False)

                self.assertEqual(1, len(caught))
                self.assertEqual(expected_message, str(caught[0].message))

        finally:
            # The files were created with delete=False, so remove them by hand.
            for handle in (train_file, test_file):
                os.unlink(handle.name)
Esempio n. 9
0
    def test_load_train_and_test_with_categorical_index(self):
        """Load train and test ARFF data from file objects and check every column.

        ``ARFF_CATEGORICAL_INDEX_1`` is used as training data and
        ``ARFF_CATEGORICAL_INDEX_2`` as testing data.  For both splits the
        shape, the index, the ``label`` column of the targets and the
        ``value`` and ``size`` columns of the features are compared exactly
        against the expected values.
        """
        train_fp = StringIO(ARFF_CATEGORICAL_INDEX_1)
        test_fp = StringIO(ARFF_CATEGORICAL_INDEX_2)
        loaded = load_arff_file(train_fp, ["label"], pos_label="yes",
                                path_testing=test_fp, survival=False,
                                standardize_numeric=False, to_numeric=False)
        x_train, y_train, x_test, y_test = loaded

        expected_shapes = [
            (x_train, (4, 2)),
            (x_test, (5, 2)),
            (y_train, (4, 1)),
            (y_test, (5, 1)),
        ]
        for frame, shape in expected_shapes:
            self.assertTupleEqual(frame.shape, shape)

        size_levels = ["small", "medium", "large"]

        def check_column(actual, values, name, index):
            # Build the expected column and compare it exactly with the loaded one.
            expected = pandas.Series(values, name=name, index=index)
            tm.assert_series_equal(actual, expected, check_exact=True)

        # Training data
        train_index = pandas.Index(['SampleOne', 'SampleTwo', 'SampleThree', 'SampleFour'],
                                   name='index', dtype=object)
        tm.assert_index_equal(x_train.index, train_index, exact=True)

        check_column(y_train["label"],
                     pandas.Categorical(["yes", "no", "yes", "yes"], categories=["no", "yes"], ordered=False),
                     "label", train_index)
        check_column(x_train["value"], [15.1, 13.8, -0.2, 2.453], "value", train_index)
        check_column(x_train["size"],
                     pandas.Categorical(["medium", "large", "small", "large"],
                                        categories=size_levels, ordered=False),
                     "size", train_index)

        # Testing data
        test_index = pandas.Index(['ASampleOne', 'ASampleTwo', 'ASampleThree', 'ASampleFour', 'ASampleFive'],
                                  name='index', dtype=object)
        tm.assert_index_equal(x_test.index, test_index, exact=True)

        check_column(y_test["label"],
                     pandas.Categorical(["no", "no", "yes", "yes", "no"], categories=["yes", "no"], ordered=False),
                     "label", test_index)
        check_column(x_test["value"], [1.51, 1.38, -20, 245.3, 3.14], "value", test_index)
        check_column(x_test["size"],
                     pandas.Categorical(["small", "small", "large", "small", "large"],
                                        categories=size_levels, ordered=False),
                     "size", test_index)
Esempio n. 10
0
def main(args):
    """Fit an ensemble model and write its predictions for the test data to CSV.

    ``args`` must provide ``input``, ``event``, ``time``, ``outcome`` and
    ``test``; it is also passed on to ``fit_and_dump``.  The result file
    contains the column ``TIMETOEVENT`` indexed by ``RPT`` (the index of the
    test data).

    Raises ``ValueError`` if no test data was loaded, because predictions can
    only be written for test samples.
    """
    _x, _y, _x_test, _y_test = load_arff_file(args.input, [args.event, args.time], args.outcome,
                                              args.test, to_numeric=False)

    model = fit_and_dump(_x, _y, args)
    print("Ensemble size: %d" % len(model))

    # Everything below dereferences the test data.  Without this guard a
    # missing test set surfaced as an AttributeError on None.
    if _x_test is None:
        raise ValueError("No test data was loaded; cannot write predictions")
    _x_test = categorical_to_numeric(_x_test)

    # NOTE(review): predictions are exponentiated before being written;
    # presumably the model predicts on a log scale -- confirm.
    p = numpy.exp(model.predict(_x_test.values))

    result = pandas.DataFrame({"TIMETOEVENT": p, "RPT": _x_test.index.to_series()})
    result.set_index("RPT", inplace=True)

    _results_file = 'results-%s-%s.csv' % (basename(args.input), "ensemble_selection_regression")
    result.to_csv(_results_file)
    def test_fit_and_predict_clinical_kernel(self):
        """Fit ``FastKernelSurvivalSVM`` with the clinical kernel and check its score.

        The kernel is ``ClinicalKernelTransform.pairwise_kernel``, with the
        transform fitted on the data as loaded from ``WHAS500_FILE``; the
        model is trained on ``encode_categorical(standardize(...))`` of that
        data.  The test checks ``_pairwise``, the length of ``coef_`` and that
        the score on the training data is within 1e-3 of the reference value.
        """
        x_full, y, _, _ = load_arff_file(WHAS500_FILE, ['fstat', 'lenfol'], '1',
                                         standardize_numeric=False, to_numeric=False)

        trans = ClinicalKernelTransform()
        trans.fit(x_full)

        x = encode_categorical(standardize(x_full))

        ssvm = FastKernelSurvivalSVM(optimizer="rbtree", kernel=trans.pairwise_kernel, max_iter=100, random_state=0)
        ssvm.fit(x.values, y)

        self.assertFalse(ssvm._pairwise)
        # assertEquals is a deprecated alias that was removed in Python 3.12.
        self.assertEqual(x.shape[0], ssvm.coef_.shape[0])

        c = ssvm.score(x.values, y)
        self.assertLessEqual(abs(0.83699051218246412 - c), 1e-3)
Esempio n. 12
0
    def test_load_with_index(self):
        """Write a data set to a temporary ARFF file and load it back.

        No test split may be returned, and the loaded training features and
        labels must match the data set returned by ``_make_and_write_data``
        (features are that data set without the ``event`` and ``time``
        columns).
        """
        tmp = tempfile.NamedTemporaryFile("w", suffix=".arff", delete=False)
        try:
            dataset = _make_and_write_data(tmp, 100, 10, True, True, 0)

            x_train, y_train, x_test, y_test = load_arff_file(tmp.name, ["event", "time"], 1, survival=True,
                                                              standardize_numeric=False, to_numeric=False)

            # Identity checks: comparing a DataFrame to None with ``==`` would
            # raise an ambiguous-truth-value error instead of a clean failure.
            self.assertIsNone(x_test)
            self.assertIsNone(y_test)

            cols = ["event", "time"]
            x_true = dataset.drop(cols, axis=1)

            self.assert_x_equal(x_true, x_train)
            self.assert_y_equal(dataset, y_train)
        finally:
            # The file was created with delete=False, so remove it by hand.
            os.unlink(tmp.name)
def main(args):
    """Run a parameter grid search and hand the best parameters to ``write_results``.

    ``args`` must provide the attributes ``profile``, ``metric``, ``input``,
    ``event``, ``time``, ``outcome``, ``test``, ``method``, ``seed`` and
    ``params``.  The output file name is derived from ``args.metric`` and the
    base names of ``args.input`` and, when given, ``args.test`` with a
    trailing ``.arff`` removed.
    """
    LOG.info("Using IPython profile %s", args.profile)
    rc = parallel.Client(profile=args.profile)

    # NOTE(review): these names are not used locally; presumably the imports
    # are performed inside sync_imports() so the engines import them as well.
    with rc[:].sync_imports():
        from survival.metrics import concordance_index_censored
        import numpy

    # Additional imports that are only performed for the "timeroc" metric.
    if args.metric == "timeroc":
        with rc[:].sync_imports():
            from rpy2.robjects.packages import importr
            from rpy2.robjects import numpy2ri
            from scipy.integrate import simps

    _x, _y, _x_test, _y_test = load_arff_file(args.input,
                                              [args.event, args.time],
                                              args.outcome,
                                              args.test,
                                              to_numeric=False)
    # Keep the data as loaded for get_estimator before converting to numeric.
    _data = _x.copy()
    _x = categorical_to_numeric(_x)
    if _x_test is not None:
        _x_test = categorical_to_numeric(_x_test)

    _estimator = get_estimator(args.method, args.seed, _data)
    _param_grid = get_param_grid(args.params)
    print_settings(_estimator, _x, _y)

    _grid_search = run_grid_search(_estimator, _param_grid, args.metric, _x,
                                   _y, _x_test, _y_test, args.seed,
                                   args.profile)

    def _stem(path):
        # str.rstrip(".arff") removes any trailing run of the characters
        # '.', 'a', 'r' and 'f' (e.g. "staff.arff" -> "st"), so remove the
        # suffix explicitly instead.
        name = basename(path)
        if name.endswith(".arff"):
            name = name[:-len(".arff")]
        return name

    if args.test is None:
        _output = "results-%s-%s-%s.csv" % (
            args.metric, _stem(args.input), args.method)
    else:
        _output = "results-%s-%s+%s-%s.csv" % (
            args.metric, _stem(args.input), _stem(args.test), args.method)
    write_results(_grid_search.best_params_, _output)

    rc[:].clear()
Esempio n. 14
0
    def test_fit_and_predict_clinical_kernel(self):
        """Fit ``FastKernelSurvivalSVM`` with the clinical kernel and check its score.

        The kernel is ``ClinicalKernelTransform.pairwise_kernel``, with the
        transform fitted on the data as loaded from ``WHAS500_FILE``; the
        model is trained on ``encode_categorical(standardize(...))`` of that
        data.  The test checks ``_pairwise``, the length of ``coef_`` and that
        the score on the training data is within 1e-3 of the reference value.
        """
        x_full, y, _, _ = load_arff_file(WHAS500_FILE, ['fstat', 'lenfol'],
                                         '1',
                                         standardize_numeric=False,
                                         to_numeric=False)

        trans = ClinicalKernelTransform()
        trans.fit(x_full)

        x = encode_categorical(standardize(x_full))

        ssvm = FastKernelSurvivalSVM(optimizer="rbtree",
                                     kernel=trans.pairwise_kernel,
                                     max_iter=100,
                                     random_state=0)
        ssvm.fit(x.values, y)

        self.assertFalse(ssvm._pairwise)
        # assertEquals is a deprecated alias that was removed in Python 3.12.
        self.assertEqual(x.shape[0], ssvm.coef_.shape[0])

        c = ssvm.score(x.values, y)
        self.assertLessEqual(abs(0.83699051218246412 - c), 1e-3)
Esempio n. 15
0
    def test_load_train_and_test_no_labels(self):
        """Load a training file together with a testing file written without labels.

        The training split must match the written training data set (features
        are that data set without the ``event`` and ``time`` columns), the
        test features must match the written test data set, and no test
        labels may be returned.
        """
        tmp_train = tempfile.NamedTemporaryFile("w", suffix=".arff", delete=False)
        tmp_test = tempfile.NamedTemporaryFile("w", suffix=".arff", delete=False)
        try:
            train_dataset = _make_and_write_data(tmp_train, 100, 10, True, True, 0)
            test_dataset = _make_and_write_data(tmp_test, 20, 10, True, False, 0)

            x_train, y_train, x_test, y_test = load_arff_file(tmp_train.name, ["event", "time"], 1,
                                                              path_testing=tmp_test.name,
                                                              survival=True,
                                                              standardize_numeric=False, to_numeric=False)

            cols = ["event", "time"]

            x_true = train_dataset.drop(cols, axis=1)
            self.assert_x_equal(x_true, x_train)
            self.assert_y_equal(train_dataset, y_train)

            self.assert_x_equal(test_dataset, x_test)
            # Identity check: comparing array-like data to None with ``==``
            # would not produce a clean assertion failure.
            self.assertIsNone(y_test)
        finally:
            # The files were created with delete=False, so remove them by hand.
            os.unlink(tmp_train.name)
            os.unlink(tmp_test.name)
    def test_compare_rbf(self):
        """Check that the RBF kernel SVM and a linear SVM on RBF kernel-PCA features agree.

        The linear ``FastSurvivalSVM`` is trained on the output of
        ``KernelPCA(kernel="rbf")``, the ``FastKernelSurvivalSVM`` on the data
        loaded from ``WHAS500_FILE`` itself.  Their predictions must have equal
        length and the ``concordance_index_censored`` results must match:
        first entry almost equal, remaining entries identical.
        """
        data, labels, _, _ = load_arff_file(WHAS500_FILE, ['fstat', 'lenfol'], '1')

        pca = KernelPCA(kernel="rbf")
        projected = pca.fit_transform(data)

        # Optimizer settings used by both estimators.
        common = {'optimizer': 'rbtree', 'tol': 1e-8, 'max_iter': 1000, 'random_state': 0}

        linear_model = FastSurvivalSVM(**common)
        linear_model.fit(projected, labels)

        kernel_model = FastKernelSurvivalSVM(kernel="rbf", **common)
        kernel_model.fit(data, labels)

        predictions = (
            linear_model.predict(pca.transform(data)),
            kernel_model.predict(data),
        )
        self.assertEqual(len(predictions[0]), len(predictions[1]))

        result_linear = concordance_index_censored(labels['fstat'], labels['lenfol'], predictions[0])
        result_kernel = concordance_index_censored(labels['fstat'], labels['lenfol'], predictions[1])

        self.assertAlmostEqual(result_linear[0], result_kernel[0])
        self.assertTupleEqual(result_linear[1:], result_kernel[1:])
Esempio n. 17
0
    def test_compare_clinical_kernel(self):
        """Check that the clinical-kernel SVM and a linear SVM on kernel-PCA features agree.

        ``ClinicalKernelTransform`` is fitted on the data as loaded from
        ``WHAS500_FILE`` and its ``pairwise_kernel`` is used both for
        ``KernelPCA`` and for ``FastKernelSurvivalSVM``.  The models are
        trained on ``encode_categorical(standardize(...))`` of the loaded
        data.  Their predictions must have equal length and the
        ``concordance_index_censored`` results must match: first entry almost
        equal, remaining entries identical.
        """
        loaded, labels, _, _ = load_arff_file(WHAS500_FILE, ['fstat', 'lenfol'], '1',
                                              standardize_numeric=False, to_numeric=False)

        transform = ClinicalKernelTransform()
        transform.fit(loaded)

        data = encode_categorical(standardize(loaded))

        pca = KernelPCA(kernel=transform.pairwise_kernel)
        projected = pca.fit_transform(data)

        # Optimizer settings used by both estimators.
        common = {'optimizer': 'rbtree', 'tol': 1e-8, 'max_iter': 1000, 'random_state': 0}

        linear_model = FastSurvivalSVM(**common)
        linear_model.fit(projected, labels)

        kernel_model = FastKernelSurvivalSVM(kernel=transform.pairwise_kernel, **common)
        kernel_model.fit(data, labels)

        predictions = (
            linear_model.predict(pca.transform(data)),
            kernel_model.predict(data),
        )
        self.assertEqual(len(predictions[0]), len(predictions[1]))

        result_linear = concordance_index_censored(labels['fstat'], labels['lenfol'], predictions[0])
        result_kernel = concordance_index_censored(labels['fstat'], labels['lenfol'], predictions[1])

        self.assertAlmostEqual(result_linear[0], result_kernel[0])
        self.assertTupleEqual(result_linear[1:], result_kernel[1:])
 def setUp(self):
     """Store the features and labels loaded from ``WHAS500_NOTIES_FILE`` on the test case."""
     # The naive survival SVM resolves ties in survival time differently,
     # which is why the data set without ties is loaded.
     features, labels, _, _ = load_arff_file(
         WHAS500_NOTIES_FILE,
         ['fstat', 'lenfol'],
         '1',
         standardize_numeric=False)
     self.x = features
     self.y = labels
Esempio n. 19
0
 def setUp(self):
     """Load ``GBSG2_FILE``; store the encoded features in ``self.x`` and the labels in ``self.y``."""
     loaded = load_arff_file(GBSG2_FILE, ["cens", "time"], "1",
                             standardize_numeric=False, to_numeric=False)
     raw_features, labels, _, _ = loaded
     # The features are loaded with to_numeric=False and encoded here.
     self.x = encode_categorical(raw_features)
     self.y = labels
 def setUp(self):
     """Load ``WHAS500_FILE`` with ``standardize_numeric=True`` into ``self.x`` and ``self.y``."""
     loaded = load_arff_file(WHAS500_FILE, ['fstat', 'lenfol'], '1',
                             standardize_numeric=True)
     features, labels, _, _ = loaded
     self.x = features
     self.y = labels
Esempio n. 21
0
 def setUp(self):
     """Store the features and labels loaded from ``WHAS500_FILE`` on the test case.

     The file is loaded with ``standardize_numeric=True``.
     """
     features, labels, _, _ = load_arff_file(
         WHAS500_FILE, ['fstat', 'lenfol'], '1', standardize_numeric=True)
     self.x = features
     self.y = labels