def test_data_frame(self):
    """Check ``categorical_to_numeric`` on a frame with mixed column types.

    The two string columns are expected to come back as integer codes
    (large/medium/small/tiny -> 0/1/2/3 and no/yes -> 0/1), while the
    floating point column must be passed through unchanged.
    """
    group_sizes = [10, 5, 13, 3]
    answer_sizes = [8, 23]

    # Input: two string-valued columns plus reproducible random numbers.
    size_labels = numpy.repeat(["large", "small", "tiny", "medium"], group_sizes)
    answer_labels = numpy.repeat(["yes", "no"], answer_sizes)
    noise = numpy.random.RandomState(0).randn(len(size_labels))
    input_df = pandas.DataFrame({"a_category": size_labels,
                                 "a_binary": answer_labels,
                                 "a_number": noise.copy()})

    # Expected: same layout, with every label replaced by its integer code.
    size_codes = numpy.repeat([0, 2, 3, 1], group_sizes)
    answer_codes = numpy.repeat([1, 0], answer_sizes)
    expected = pandas.DataFrame({"a_category": size_codes,
                                 "a_binary": answer_codes,
                                 "a_number": noise.copy()})

    actual = column.categorical_to_numeric(input_df)

    tm.assert_frame_equal(actual, expected, check_exact=True)
def main(args):
    """Fit the classification ensemble and write test-set predictions to CSV.

    Training data is read from ``args.input`` and, optionally, test data from
    ``args.test``.  When no test data is available the function stops after
    fitting and reporting the ensemble size.

    The CSV is indexed by ``RPT`` (the test frame's index) and holds two
    columns: ``RISK``, the predicted probability of class ``1``, and
    ``DISCONT``, the most probable class label.
    """
    label_columns = [args.event, 'ENDTRS_C', 'ENTRT_PC']
    _x, _y, _x_test, _y_test = load_arff_file(
        args.input, label_columns, survival=False,
        path_testing=args.test, to_numeric=False)

    model = fit_and_dump(_x, _y, args)
    model.load_from_db()
    print("Ensemble size: %d" % len(model))

    # Nothing to predict on without a test set.
    if _x_test is None:
        return

    _x_test = categorical_to_numeric(_x_test)
    proba = model.predict(_x_test.values)

    # Column with the highest probability decides the predicted label.
    winning_column = numpy.argmax(proba, axis=1)
    pred_labels = model.classes_.take(winning_column, axis=0)
    risk = proba[:, model.classes_ == 1].ravel()

    result = pandas.DataFrame({
        "RISK": risk,
        "RPT": _x_test.index.to_series(),
        "DISCONT": pred_labels,
    })
    result = result.set_index("RPT")

    input_name = basename(args.input)
    _results_file = 'results-%s-%s.csv' % (
        input_name, "ensemble_selection_classification")
    result.to_csv(_results_file)
def test_bool_series(self):
    """Check that a boolean series is expected back as 0/1 integers.

    Name and index of the series must survive the conversion.
    """
    labels = ["Alpha", "Beta", "Gamma", "Delta", "Eta", "Mu", "Zeta"]

    flags = [True, True, False, False, True, False, True]
    input_series = pandas.Series(flags, name="human", index=labels)

    codes = [1, 1, 0, 0, 1, 0, 1]
    expected = pandas.Series(codes, name="human", index=labels)

    actual = column.categorical_to_numeric(input_series)

    tm.assert_series_equal(actual, expected, check_exact=True)
def test_series(self):
    """Check that a string series is expected back as integer codes.

    The letters a/b/c should map to 0/1/2, with the series name and index
    left untouched.
    """
    labels = ["Alpha", "Beta", "Gamma", "Delta", "Eta", "Mu"]

    letters = ["a", "a", "b", "b", "b", "c"]
    input_series = pandas.Series(letters, name="Thr33", index=labels)

    codes = [0, 0, 1, 1, 1, 2]
    expected = pandas.Series(codes, name="Thr33", index=labels)

    actual = column.categorical_to_numeric(input_series)

    tm.assert_series_equal(actual, expected, check_exact=True)
def main(args): LOG.info("Using IPython profile %s", args.profile) rc = parallel.Client(profile=args.profile) with rc[:].sync_imports(): from survival.metrics import concordance_index_censored import numpy if args.metric == "timeroc": with rc[:].sync_imports(): from rpy2.robjects.packages import importr from rpy2.robjects import numpy2ri from scipy.integrate import simps _x, _y, _x_test, _y_test = load_arff_file(args.input, [args.event, args.time], args.outcome, args.test, to_numeric=False) _data = _x.copy() _x = categorical_to_numeric(_x) if _x_test is not None: _x_test = categorical_to_numeric(_x_test) _estimator = get_estimator(args.method, args.seed, _data) _param_grid = get_param_grid(args.params) print_settings(_estimator, _x, _y) _grid_search = run_grid_search(_estimator, _param_grid, args.metric, _x, _y, _x_test, _y_test, args.seed, args.profile) if args.test is None: _output = "results-%s-%s-%s.csv" % (args.metric, basename( args.input).rstrip(".arff"), args.method) else: _output = "results-%s-%s+%s-%s.csv" % (args.metric, basename( args.input).rstrip(".arff"), basename( args.test).rstrip(".arff"), args.method) write_results(_grid_search.best_params_, _output) rc[:].clear()
def load_dataset(name, base_dir):
    """Load a registered ARFF dataset as a feature matrix and survival labels.

    Parameters
    ----------
    name : key into ``DATASETS``
        Selects the metadata entry (``filename``, ``label``, ``outcome``).
    base_dir : str
        Directory that contains the dataset file.

    Returns
    -------
    x : array
        Values of the standardized, numerically encoded feature columns.
    y : structured array
        One record per sample with fields ``event`` (bool, true where the
        event column equals the configured outcome) and ``time`` (float,
        natural logarithm of the time column).
    """
    info = DATASETS[name]
    frame = loadarff(join(base_dir, info['filename']))

    # Everything that is not a label column is a feature.
    features = frame.drop(info['label'], axis=1)
    features = column.standardize(features)
    features = column.categorical_to_numeric(features)

    # First label column holds the time, second the event indicator.
    times = frame.loc[:, info['label'][0]]
    events = frame.loc[:, info['label'][1]] == info['outcome']

    record_type = [('event', bool), ('time', float)]
    y = numpy.empty(features.shape[0], dtype=record_type)
    y['event'] = events.values
    y['time'] = numpy.log(times.values)

    # Both outcomes (event / no event) must be present in the data.
    assert len(events.value_counts()) == 2

    return features.values, y
def main(args):
    """Fit the regression ensemble and write test-set predictions to CSV.

    Training data is read from ``args.input`` and, optionally, test data from
    ``args.test``.  Without test data the function only fits the model and
    prints the ensemble size.

    The CSV is indexed by ``RPT`` (the test frame's index) and has a single
    ``TIMETOEVENT`` column holding the exponential of the model prediction.
    """
    label_columns = [args.event, args.time]
    _x, _y, _x_test, _y_test = load_arff_file(
        args.input, label_columns, args.outcome, args.test, to_numeric=False)

    model = fit_and_dump(_x, _y, args)
    print("Ensemble size: %d" % len(model))

    # Nothing to predict on without a test set.
    if _x_test is None:
        return

    _x_test = categorical_to_numeric(_x_test)

    # Predictions are exponentiated before being written out.
    raw_prediction = model.predict(_x_test.values)
    p = numpy.exp(raw_prediction)

    result = pandas.DataFrame({
        "TIMETOEVENT": p,
        "RPT": _x_test.index.to_series(),
    })
    result = result.set_index("RPT")

    input_name = basename(args.input)
    _results_file = 'results-%s-%s.csv' % (
        input_name, "ensemble_selection_regression")
    result.to_csv(_results_file)
def fit_and_dump(_x, _y, args):
    """Purge cached scores from MongoDB and fit the survival ensemble.

    Parameters
    ----------
    _x : pandas.DataFrame
        Training features; categorical columns are encoded numerically
        before fitting.
    _y : array-like
        Training labels, passed to ``model.fit`` unchanged.
    args : namespace
        Parsed command line options; ``seed`` and ``models_dir`` are read.

    Returns
    -------
    The return value of ``model.fit``.
    """
    # The estimator factory receives the original, un-encoded frame.
    data = _x.copy()
    _x = categorical_to_numeric(_x)

    model = create_estimator(data, args.seed)
    print("Number of base estimators: %d" % len(model.base_estimators))

    print("Purging MongoDB cv_scores database")
    # One client serves both purges and is closed afterwards, instead of
    # opening two clients that are never released.
    client = MongoClient(mongodb_host)
    try:
        db = client.ensemble_selection_survival
        # NOTE(review): ``Collection.remove`` is deprecated in PyMongo >= 3.0
        # in favour of ``delete_many``; kept because the installed version
        # is not visible from here.
        db.cv_scores.remove({})

        print("Purging MongoDB corr_scores database")
        db.corr_scores.remove({})
    finally:
        client.close()

    print("Fitting %r" % model)
    _create_directories(args.models_dir, model.base_estimators)

    return model.fit(_x.values, _y)
def fit_and_dump(_x, _y, args):
    """Purge cached scores from MongoDB and fit the classification ensemble.

    Parameters
    ----------
    _x : pandas.DataFrame
        Training features; categorical columns are encoded numerically
        before fitting.
    _y : pandas.DataFrame
        Training labels; the categorical column named by ``args.event`` is
        reduced to its integer category codes.
    args : namespace
        Parsed command line options; ``event``, ``seed``, ``metric`` and
        ``models_dir`` are read.

    Returns
    -------
    The return value of ``model.fit``.
    """
    # The estimator factory receives the original, un-encoded frame.
    data = _x.copy()
    _x = categorical_to_numeric(_x)
    _y = _y[args.event].cat.codes.values

    model = create_estimator(data, _y, args.seed)

    # Average precision only on explicit request; ROC AUC otherwise.
    if args.metric == 'avgprec':
        scoring_func = get_scorer("average_precision")
    else:
        scoring_func = get_scorer("roc_auc")
    model.set_params(scorer=scoring_func)

    print("Number of base estimators: %d" % len(model.base_estimators))

    print("Purging MongoDB cv_scores database")
    # Close the client once the purge is done rather than leaking it.
    client = MongoClient(mongodb_host)
    try:
        db = client.ensemble_selection_classification
        # NOTE(review): ``Collection.remove`` is deprecated in PyMongo >= 3.0
        # in favour of ``delete_many``; kept because the installed version
        # is not visible from here.
        db.cv_scores.remove({})
    finally:
        client.close()

    print("Fitting %r" % model)
    _create_directories(args.models_dir, model.base_estimators)

    return model.fit(_x.values, _y)