def test_data_frame(self):
        """Check ``categorical_to_numeric`` on a frame mixing string and float columns.

        The two string columns are expected to come back as integer codes,
        while the float column must be returned unchanged.
        """
        group_sizes = [10, 5, 13, 3]
        answer_sizes = [8, 23]

        a = numpy.repeat(["large", "small", "tiny", "medium"], group_sizes)
        b = numpy.repeat(["yes", "no"], answer_sizes)

        # Fixed seed keeps the numeric column reproducible between runs.
        noise = numpy.random.RandomState(0).randn(len(a))

        input_df = pandas.DataFrame({
            "a_category": a,
            "a_binary": b,
            "a_number": noise.copy(),
        })

        # Codes the conversion is expected to assign to each string value.
        expected = pandas.DataFrame({
            "a_category": numpy.repeat([0, 2, 3, 1], group_sizes),
            "a_binary": numpy.repeat([1, 0], answer_sizes),
            "a_number": noise.copy(),
        })

        actual = column.categorical_to_numeric(input_df)

        tm.assert_frame_equal(actual, expected, check_exact=True)
Exemple #2
0
def main(args):
    """Fit the classification ensemble and write test-set predictions to CSV.

    Loads the data named by ``args.input`` (and optionally ``args.test``),
    fits the ensemble via ``fit_and_dump``, and, when test data is available,
    writes a CSV indexed by "RPT" with the columns "RISK" and "DISCONT".

    Parameters
    ----------
    args : namespace
        Must provide ``input``, ``event`` and ``test``, plus whatever
        ``fit_and_dump`` reads from it.
    """
    _x, _y, _x_test, _y_test = load_arff_file(
        args.input, [args.event, 'ENDTRS_C', 'ENTRT_PC'],
        survival=False,
        path_testing=args.test,
        to_numeric=False)

    model = fit_and_dump(_x, _y, args)
    model.load_from_db()
    print("Ensemble size: %d" % len(model))

    if _x_test is None:
        # Nothing to predict on. The original code fell through to
        # ``_x_test.values`` here and died with an AttributeError.
        print("No test data available, skipping prediction")
        return

    _x_test = categorical_to_numeric(_x_test)
    proba = model.predict(_x_test.values)
    # Map the most probable column of each row back to its class label.
    pred_labels = model.classes_.take(numpy.argmax(proba, axis=1), axis=0)

    result = pandas.DataFrame({
        "RISK": proba[:, model.classes_ == 1].ravel(),
        "RPT": _x_test.index.to_series(),
        "DISCONT": pred_labels
    })
    result.set_index("RPT", inplace=True)

    _results_file = 'results-%s-%s.csv' % (basename(
        args.input), "ensemble_selection_classification")
    result.to_csv(_results_file)
    def test_bool_series(self):
        """A boolean series must be returned as 0/1 integers, keeping name and index."""
        labels = ["Alpha", "Beta", "Gamma", "Delta", "Eta", "Mu", "Zeta"]
        flags = [True, True, False, False, True, False, True]

        input_series = pandas.Series(flags, name="human", index=labels)
        expected = pandas.Series([1, 1, 0, 0, 1, 0, 1], name="human", index=list(labels))

        actual = column.categorical_to_numeric(input_series)

        tm.assert_series_equal(actual, expected, check_exact=True)
    def test_series(self):
        """A string series must be returned as integer codes, keeping name and index."""
        labels = ["Alpha", "Beta", "Gamma", "Delta", "Eta", "Mu"]

        input_series = pandas.Series(["a", "a", "b", "b", "b", "c"],
                                     name="Thr33", index=labels)
        expected = pandas.Series([0, 0, 1, 1, 1, 2],
                                 name="Thr33", index=list(labels))

        actual = column.categorical_to_numeric(input_series)

        tm.assert_series_equal(actual, expected, check_exact=True)
def main(args):
    """Run a grid search for ``args.method`` and write the best parameters to CSV.

    Connects to the IPython parallel profile ``args.profile``, loads the data
    named by ``args.input`` (and optionally ``args.test``), runs the grid
    search and writes ``best_params_`` to a file whose name is built from the
    metric, the data set name(s) and the method.

    Parameters
    ----------
    args : namespace
        Must provide ``profile``, ``metric``, ``input``, ``event``, ``time``,
        ``outcome``, ``test``, ``method``, ``seed`` and ``params``.
    """
    def _dataset_name(path):
        # Drop only a literal ".arff" suffix. ``str.rstrip(".arff")`` strips
        # any trailing run of the characters '.', 'a', 'r', 'f', which turns
        # e.g. "staff.arff" into "st".
        suffix = ".arff"
        name = basename(path)
        if name.endswith(suffix):
            return name[:-len(suffix)]
        return name

    LOG.info("Using IPython profile %s", args.profile)
    rc = parallel.Client(profile=args.profile)

    # Imports inside ``sync_imports`` are issued for all engines of the view.
    with rc[:].sync_imports():
        from survival.metrics import concordance_index_censored
        import numpy

    if args.metric == "timeroc":
        with rc[:].sync_imports():
            from rpy2.robjects.packages import importr
            from rpy2.robjects import numpy2ri
            from scipy.integrate import simps

    _x, _y, _x_test, _y_test = load_arff_file(args.input,
                                              [args.event, args.time],
                                              args.outcome,
                                              args.test,
                                              to_numeric=False)
    # The estimator is built from the data before numeric conversion.
    _data = _x.copy()
    _x = categorical_to_numeric(_x)
    if _x_test is not None:
        _x_test = categorical_to_numeric(_x_test)

    _estimator = get_estimator(args.method, args.seed, _data)
    _param_grid = get_param_grid(args.params)
    print_settings(_estimator, _x, _y)

    _grid_search = run_grid_search(_estimator, _param_grid, args.metric, _x,
                                   _y, _x_test, _y_test, args.seed,
                                   args.profile)

    if args.test is None:
        _output = "results-%s-%s-%s.csv" % (
            args.metric, _dataset_name(args.input), args.method)
    else:
        _output = "results-%s-%s+%s-%s.csv" % (
            args.metric, _dataset_name(args.input),
            _dataset_name(args.test), args.method)
    write_results(_grid_search.best_params_, _output)

    rc[:].clear()
def load_dataset(name, base_dir):
    """Load the data set registered as ``name`` in ``DATASETS``.

    Parameters
    ----------
    name : key into ``DATASETS``
    base_dir : directory that contains the data set's file

    Returns
    -------
    x : array of the standardized features after numeric conversion
    y : structured array with a boolean ``event`` field and a float ``time``
        field holding the natural logarithm of the time column
    """
    info = DATASETS[name]

    frame = loadarff(join(base_dir, info['filename']))
    label_cols = info['label']
    features = column.categorical_to_numeric(
        column.standardize(frame.drop(label_cols, axis=1)))

    # First label column holds the time, the second the event indicator.
    time_col = frame.loc[:, label_cols[0]]
    event_col = frame.loc[:, label_cols[1]] == info['outcome']

    y = numpy.empty(shape=features.shape[0],
                    dtype=[('event', bool), ('time', float)])
    y['event'] = event_col.values
    y['time'] = numpy.log(time_col.values)

    # Both outcomes (event / no event) have to occur in the data.
    assert len(event_col.value_counts()) == 2

    return features.values, y
Exemple #7
0
def load_dataset(name, base_dir):
    """Load the data set registered as ``name`` in ``DATASETS``.

    Parameters
    ----------
    name : key into ``DATASETS``
    base_dir : directory that contains the data set's file

    Returns
    -------
    x : array of the standardized features after numeric conversion
    y : structured array with a boolean ``event`` field and a float ``time``
        field holding the natural logarithm of the time column
    """
    info = DATASETS[name]

    frame = loadarff(join(base_dir, info['filename']))
    label_cols = info['label']
    features = column.categorical_to_numeric(
        column.standardize(frame.drop(label_cols, axis=1)))

    # First label column holds the time, the second the event indicator.
    time_col = frame.loc[:, label_cols[0]]
    event_col = frame.loc[:, label_cols[1]] == info['outcome']

    y = numpy.empty(shape=features.shape[0],
                    dtype=[('event', bool), ('time', float)])
    y['event'] = event_col.values
    y['time'] = numpy.log(time_col.values)

    # Both outcomes (event / no event) have to occur in the data.
    assert len(event_col.value_counts()) == 2

    return features.values, y
Exemple #8
0
def main(args):
    """Fit the regression ensemble and write test-set predictions to CSV.

    Loads the data named by ``args.input`` (and optionally ``args.test``),
    fits the ensemble via ``fit_and_dump``, and, when test data is available,
    writes a CSV indexed by "RPT" with the column "TIMETOEVENT".

    Parameters
    ----------
    args : namespace
        Must provide ``input``, ``event``, ``time``, ``outcome`` and ``test``,
        plus whatever ``fit_and_dump`` reads from it.
    """
    _x, _y, _x_test, _y_test = load_arff_file(args.input, [args.event, args.time], args.outcome,
                                              args.test, to_numeric=False)

    model = fit_and_dump(_x, _y, args)
    print("Ensemble size: %d" % len(model))

    if _x_test is None:
        # Nothing to predict on. The original code fell through to
        # ``_x_test.values`` here and died with an AttributeError.
        print("No test data available, skipping prediction")
        return

    _x_test = categorical_to_numeric(_x_test)

    # Predictions are exponentiated before being written out.
    p = numpy.exp(model.predict(_x_test.values))

    result = pandas.DataFrame({"TIMETOEVENT": p, "RPT": _x_test.index.to_series()})
    result.set_index("RPT", inplace=True)

    _results_file = 'results-%s-%s.csv' % (basename(args.input), "ensemble_selection_regression")
    result.to_csv(_results_file)
Exemple #9
0
def fit_and_dump(_x, _y, args):
    """Purge the cached scores in MongoDB and fit the survival ensemble.

    Parameters
    ----------
    _x : feature table; a copy taken before numeric conversion is handed to
        ``create_estimator``
    _y : target passed unchanged to ``model.fit``
    args : namespace providing ``seed`` and ``models_dir``

    Returns
    -------
    Whatever ``model.fit`` returns.
    """
    data = _x.copy()
    _x = categorical_to_numeric(_x)

    model = create_estimator(data, args.seed)
    print("Number of base estimators: %d" % len(model.base_estimators))

    print("Purging MongoDB cv_scores database")
    # One client and database handle serves both purges; the original opened
    # a second, identical connection for the corr_scores collection.
    client = MongoClient(mongodb_host)
    db = client.ensemble_selection_survival
    db.cv_scores.remove({})

    print("Purging MongoDB corr_scores database")
    db.corr_scores.remove({})

    print("Fitting %r" % model)
    _create_directories(args.models_dir, model.base_estimators)
    return model.fit(_x.values, _y)
Exemple #10
0
def fit_and_dump(_x, _y, args):
    """Purge the cached CV scores in MongoDB and fit the classification ensemble.

    Parameters
    ----------
    _x : feature table; a copy taken before numeric conversion is handed to
        ``create_estimator``
    _y : table whose ``args.event`` column is categorical; its category codes
        become the class labels
    args : namespace providing ``event``, ``seed``, ``metric`` and ``models_dir``

    Returns
    -------
    Whatever ``model.fit`` returns.
    """
    raw_features = _x.copy()
    _x = categorical_to_numeric(_x)
    _y = _y[args.event].cat.codes.values

    model = create_estimator(raw_features, _y, args.seed)
    # "avgprec" selects average precision; any other metric uses ROC AUC.
    scorer_name = "average_precision" if args.metric == 'avgprec' else "roc_auc"
    model.set_params(scorer=get_scorer(scorer_name))

    print("Number of base estimators: %d" % len(model.base_estimators))

    print("Purging MongoDB cv_scores database")
    scores_db = MongoClient(mongodb_host).ensemble_selection_classification
    scores_db.cv_scores.remove({})

    print("Fitting %r" % model)
    _create_directories(args.models_dir, model.base_estimators)
    return model.fit(_x.values, _y)