def test_series_numeric():
        """Encoding a float-valued Series is expected to raise a TypeError."""
        values = [0.5, 0.1, 10, 25, 3.8, 11, 2256, -1, -0.2, 3.14]
        numeric_series = pandas.Series(values, name="a_series")
        expected_message = "series must be of categorical dtype, but was float"

        with pytest.raises(TypeError, match=expected_message):
            column.encode_categorical(numeric_series)
    def test_case_numeric():
        """Check that object-dtype columns holding numbers become indicator columns."""
        binary = numpy.array([0, 1, 1, 0, 1, 0, 0, 1, 0, 1], dtype=object)
        ternary = numpy.array([1, 2, 1, 3, 2, 1, 3, 2, 3, 1], dtype=object)
        quaternary = numpy.array(
            [1. / 128, 1. / 32, 1., 1. / 8, 1. / 32, 1., 1. / 128, 1. / 8, 1., 1. / 32],
            dtype=object)

        df = pandas.DataFrame({
            "a_binary_int": binary.copy(),
            "a_three_int": ternary.copy(),
            "a_four_float": quaternary.copy(),
        })

        actual_df = column.encode_categorical(df)

        # The expected frame has no column for the smallest value of each input.
        expected = {"a_binary_int=1": binary.astype(float)}
        for level in (2, 3):
            expected["a_three_int={}".format(level)] = (ternary == level).astype(float)
        for level in (1. / 32, 1. / 8, 1.):
            expected["a_four_float={}".format(level)] = (quaternary == level).astype(float)
        expected_df = pandas.DataFrame(expected)

        assert actual_df.shape == expected_df.shape
        tm.assert_frame_equal(actual_df, expected_df, check_exact=True)
    def test_case1():
        """Encode a frame with a four-level, a two-level and a numeric column."""
        sizes = numpy.concatenate([
            numpy.repeat(["large"], 10),
            numpy.repeat(["small"], 5),
            numpy.repeat(["tiny"], 13),
            numpy.repeat(["medium"], 3),
        ])
        answers = numpy.concatenate([
            numpy.repeat(["yes"], 8),
            numpy.repeat(["no"], 23),
        ])
        noise = numpy.random.RandomState(0).randn(len(sizes))

        inputs = OrderedDict()
        inputs["a_category"] = sizes
        inputs["a_binary"] = answers
        inputs["a_number"] = noise.copy()
        df = pandas.DataFrame.from_dict(inputs)

        actual_df = column.encode_categorical(df)

        def indicator(start, stop):
            # 31-element float vector that is 1 on [start, stop) and 0 elsewhere
            flags = numpy.zeros(31, dtype=float)
            flags[start:stop] = 1
            return flags

        expected = OrderedDict()
        expected["a_category=medium"] = indicator(28, 31)
        expected["a_category=small"] = indicator(10, 15)
        expected["a_category=tiny"] = indicator(15, 28)
        expected["a_binary=yes"] = numpy.concatenate([numpy.ones(8), numpy.zeros(23)])
        expected["a_number"] = noise.copy()
        expected_df = pandas.DataFrame.from_dict(expected)

        assert actual_df.shape == expected_df.shape
        tm.assert_frame_equal(actual_df, expected_df, check_exact=True)
def whas500_without_ties():
    """Return encoded features and outcome from the tie-free WHAS500 file.

    The naive survival SVM resolves ties in survival time differently,
    so the variant of the data without ties is loaded here.
    """
    arff_data = loadarff(WHAS500_NOTIES_FILE)
    features, outcome = get_x_y(arff_data, ['fstat', 'lenfol'], '1')
    return encode_categorical(features), outcome
Example #5
0
    def test_duplicate_index(self):
        """Encode a frame whose index contains repeated labels.

        The expected frame reuses the same index and holds one indicator
        column for each of "medium", "small" and "tiny", followed by the
        untouched numeric column.
        """
        a = numpy.concatenate((
            numpy.repeat(["large"], 10),
            numpy.repeat(["small"], 6),
            numpy.repeat(["tiny"], 13),
            numpy.repeat(["medium"], 3)))
        rnd = numpy.random.RandomState(0)
        c = rnd.randn(len(a))

        # ceil of 0, 0.5, 1, 1.5, ... gives 0, 1, 1, 2, 2, ...: labels repeat
        index = numpy.ceil(numpy.arange(0, len(a) // 2, 0.5))
        df = pandas.DataFrame.from_dict(OrderedDict([
            ("a_category", pandas.Series(a, index=index)),
            ("a_number", pandas.Series(c, index=index, copy=True))
        ]))

        actual_df = column.encode_categorical(df)

        # builtin float instead of numpy.float_, which NumPy 2.0 removed
        expected_df = pandas.DataFrame(numpy.zeros((32, 3), dtype=float),
                                       index=index,
                                       columns=["a_category=medium", "a_category=small", "a_category=tiny"])
        # tiny
        expected_df.iloc[16:29, 2] = 1
        # small
        expected_df.iloc[10:16, 1] = 1
        # medium
        expected_df.iloc[-3:, 0] = 1

        expected_df["a_number"] = c

        self.assertTupleEqual(actual_df.shape, expected_df.shape)
        tm.assert_frame_equal(actual_df, expected_df, check_exact=True)
Example #6
0
    def test_with_missing(self):
        """Missing entries in a binary column must stay missing after encoding.

        The null pattern is compared first; exact values are then compared
        on the rows that remain after dropping missing entries.
        """
        b = numpy.concatenate((
            numpy.repeat(["yes"], 5),
            numpy.repeat([None], 10),
            numpy.repeat(["no"], 16)))

        rnd = numpy.random.RandomState(0)
        c = rnd.randn(len(b))

        # String literals cannot be keyword names, so the ordered mapping is
        # built from explicit (key, value) pairs.
        df = pandas.DataFrame(OrderedDict([("a_binary", b),
                                           ("a_number", c.copy())]))

        actual_df = column.encode_categorical(df)

        eb = numpy.concatenate((
            numpy.repeat([1.], 5),
            numpy.repeat([numpy.nan], 10),
            numpy.repeat([0.], 16)))

        d = OrderedDict()
        d['a_binary=yes'] = eb
        d['a_number'] = c.copy()
        expected_df = pandas.DataFrame(d)

        self.assertTupleEqual(actual_df.shape, expected_df.shape)
        tm.assert_frame_equal(actual_df.isnull(), expected_df.isnull())
        tm.assert_frame_equal(actual_df.dropna(), expected_df.dropna(), check_exact=True)
Example #7
0
    def test_compare_clinical_kernel(self):
        """Compare a linear SVM on kernel-PCA features with the kernel SVM.

        Both models use the pairwise kernel of a ``ClinicalKernelTransform``
        fitted on the raw WHAS500 features; their concordance results on the
        training data are expected to agree.
        """
        x_full, y = load_whas500()

        trans = ClinicalKernelTransform()
        trans.fit(x_full)

        x = encode_categorical(standardize(x_full))

        kpca = KernelPCA(kernel=trans.pairwise_kernel)
        xt = kpca.fit_transform(x)

        # Settings shared by both estimators.
        svm_params = dict(optimizer='rbtree', tol=1e-8, max_iter=1000, random_state=0)

        linear_svm = FastSurvivalSVM(**svm_params)
        linear_svm.fit(xt, y)

        kernel_svm = FastKernelSurvivalSVM(kernel=trans.pairwise_kernel, **svm_params)
        kernel_svm.fit(x, y)

        pred_linear = linear_svm.predict(kpca.transform(x))
        pred_kernel = kernel_svm.predict(x)

        self.assertEqual(len(pred_linear), len(pred_kernel))

        event, time = y['fstat'], y['lenfol']
        result_linear = concordance_index_censored(event, time, pred_linear)
        result_kernel = concordance_index_censored(event, time, pred_kernel)

        # First entry is compared approximately, the remaining ones exactly.
        self.assertAlmostEqual(result_linear[0], result_kernel[0])
        self.assertTupleEqual(result_linear[1:], result_kernel[1:])
    def test_with_missing():
        """Missing entries in a binary column must stay missing after encoding."""
        answers = numpy.concatenate([
            numpy.repeat(["yes"], 5),
            numpy.repeat([None], 10),
            numpy.repeat(["no"], 16),
        ])
        noise = numpy.random.RandomState(0).randn(len(answers))

        inputs = OrderedDict()
        inputs["a_binary"] = answers
        inputs["a_number"] = noise.copy()
        df = pandas.DataFrame(inputs)

        actual_df = column.encode_categorical(df)

        expected_df = pandas.DataFrame(OrderedDict([
            ("a_binary=yes", numpy.concatenate([
                numpy.ones(5),
                numpy.full(10, numpy.nan),
                numpy.zeros(16),
            ])),
            ("a_number", noise.copy()),
        ]))

        assert actual_df.shape == expected_df.shape
        # Null pattern first, then exact values on the fully observed rows.
        tm.assert_frame_equal(actual_df.isnull(), expected_df.isnull())
        tm.assert_frame_equal(
            actual_df.dropna(), expected_df.dropna(), check_exact=True)
    def test_retain_all_missing():
        """With ``allow_drop=False`` an all-missing column is expected to be kept."""
        answers = numpy.concatenate([
            numpy.repeat(["yes"], 5),
            numpy.repeat([None], 10),
            numpy.repeat(["no"], 16),
        ])
        all_missing = numpy.repeat([None], len(answers))

        df = pandas.DataFrame({"a_binary": answers, "bogus": all_missing})

        actual_df = column.encode_categorical(df, allow_drop=False)

        encoded_answers = numpy.concatenate([
            numpy.ones(5),
            numpy.full(10, numpy.nan),
            numpy.zeros(16),
        ])
        expected_df = pandas.DataFrame({
            "a_binary=yes": encoded_answers,
            "bogus": all_missing.copy(),
        })

        assert actual_df.shape == expected_df.shape
        # Null pattern first, then exact values on the fully observed rows.
        tm.assert_frame_equal(actual_df.isnull(), expected_df.isnull())
        tm.assert_frame_equal(
            actual_df.dropna(), expected_df.dropna(), check_exact=True)
def data_processing(data_df):
    """Split the records by ``LOC`` and prepare the features for modelling.

    Rows whose ``LOC`` is '2', '3' or '6' are split 75/25 into a training
    and a validation set (``random_state=369``); rows whose ``LOC`` is '8'
    form a separate test set.  Numerical columns are standardized with
    statistics estimated on the training set only, and the listed
    categorical columns are passed to ``encode_categorical``.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Input records.  Must contain ``LOC``, the identifier/outcome columns
        dropped below, and the categorical columns listed in this function.

    Returns
    -------
    tuple
        ``(X_df_train, X_df_val, y_df_train, y_df_val, X_df_test_kao,
        y_df_test_kao)``; the ``y`` frames hold the 'Mortality' and
        'SurvivalWeeks' columns.
    """
    # Identifiers, dates and outcome-related columns are not used as features.
    excluded_columns = ['LOC', 'UID', 'Hospital_ID', 'SurvivalWeeks', 'admission_date',
                        'discharge_date', 'death_date', 'Mortality', 'CVDeath', 'SurvivalDays',
                        'CAD', 'ICU']
    data_df_x = data_df.drop(excluded_columns, axis=1)
    data_df_y = data_df[['Mortality', 'SurvivalWeeks']]

    # Compute each row mask once and reuse it for features and outcome.
    in_development = data_df.LOC.isin(['2', '3', '6'])
    X_df_train, X_df_val, y_df_train, y_df_val = train_test_split(
        data_df_x[in_development], data_df_y[in_development],
        test_size=0.25, random_state=369)

    in_test = data_df.LOC == '8'
    X_df_test_kao = data_df_x[in_test]
    y_df_test_kao = data_df_y[in_test]

    categorical_columns = ['Sex', 'AF', 'DM', 'HTN', 'Hyperlipidemia', 'CHF', 'Smoking',
                           'Cancer.before.adm', 'Foley', 'NG', 'Dyslipidemia']
    numerical_columns = np.setdiff1d(data_df_x.columns, categorical_columns).tolist()

    # One scaler instance per column, so that the statistics fitted on the
    # training split remain available when the other splits are transformed.
    standardize = [([col], preprocessing.StandardScaler()) for col in numerical_columns]
    leave = [(col, None) for col in categorical_columns]
    x_mapper = DataFrameMapper(standardize + leave)
    output_columns = numerical_columns + categorical_columns

    # Fit on the training split only; validation and test data are scaled
    # with the training statistics to avoid leaking information from them.
    X_df_train = pd.DataFrame(data=x_mapper.fit_transform(X_df_train),
                              columns=output_columns,
                              index=X_df_train.index)

    X_df_val = pd.DataFrame(data=x_mapper.transform(X_df_val),
                            columns=output_columns,
                            index=X_df_val.index)

    X_df_test_kao = pd.DataFrame(data=x_mapper.transform(X_df_test_kao),
                                 columns=output_columns,
                                 index=X_df_test_kao.index)

    X_df_train = encode_categorical(X_df_train, columns=categorical_columns)
    X_df_val = encode_categorical(X_df_val, columns=categorical_columns)
    X_df_test_kao = encode_categorical(X_df_test_kao, columns=categorical_columns)

    return X_df_train, X_df_val, y_df_train, y_df_val, X_df_test_kao, y_df_test_kao
    def test_breast_example(self):
        """Fit Coxnet with ``l1_ratio=1.0`` on the breast cancer data.

        The fitted ``alphas_``, ``deviance_ratio_`` and ``coef_`` are compared
        against hard-coded reference values and a stored coefficients file.
        """
        x, y = load_breast_cancer()
        x = column.encode_categorical(x)

        coxnet = CoxnetSurvivalAnalysis(l1_ratio=1.0)
        coxnet.fit(x.values, y)

        # Reference values for the alpha path, in decreasing order.
        expected_alphas = numpy.array([
            0.207764947265866, 0.189307681974955, 0.172490109262135, 0.157166563357949, 0.143204319038428,
            0.130482442022696, 0.118890741498079, 0.108328815700004, 0.0987051822799425, 0.0899364859290742,
            0.0819467763944772, 0.0746668506343715, 0.0680336534144775, 0.0619897311537413, 0.0564827342889011,
            0.051464963847614, 0.046892958302776, 0.0427271171295661, 0.0389313578046448, 0.0354728032765984,
            0.0323214972006479, 0.0294501444711215, 0.0268338748043064, 0.0244500273239498, 0.0222779542835891,
            0.0202988422256499, 0.0184955490282766, 0.0168524554284737, 0.0153553297355215, 0.0139912045628799,
            0.0127482645108893, 0.0116157438274312, 0.0105838331601337, 0.00964359459245389, 0.00878688422772072,
            0.00800628165059773, 0.0072950256549955, 0.0066469556817389, 0.00605645845875073, 0.00551841938157428,
            0.00502817821311635, 0.00458148871890295, 0.00417448188822764, 0.00380363242263169, 0.00346572820145532,
            0.00315784245998521, 0.00287730843921864, 0.00262169628767281, 0.00238879201517371, 0.00217657831633235,
            0.00198321709761059, 0.00180703355663423, 0.00164650167585602, 0.00150023100492174, 0.0013669546172544,
            0.00124551813654232, 0.00113486973808373, 0.00103405103838443, 0.000942188794098442, 0.000858487338411865,
            0.000782221689357606, 0.00071273127036839, 0.000649414188678556, 0.000591722022016858, 0.00053915506843511,
            0.00049125801812897, 0.000447616009762226, 0.000407851037136367, 0.000371618675081733, 0.000338605096211458,
            0.000308524352698783, 0.00028111589953377, 0.000256142337807075, 0.000233387358474159, 0.000212653868789829,
            0.000193762285185162, 0.000176548977800548, 0.000160864853202119, 0.000146574063005757,
            0.000133552827223371, 0.000121688362139862, 0.000110877903434536, 0.000101027816085719,
            9.20527833489927e-05, 8.38750677843702e-05, 7.64238379317803e-05, 6.96345548028444e-05, 6.34484128750348e-05
        ])

        assert_array_almost_equal(expected_alphas, coxnet.alphas_)

        # Reference deviance ratios, compared element-wise with the fitted ones.
        expected_deviance_ratio = numpy.array([
            0, 0.00741462796207568, 0.0135178719105177, 0.0183232499901932, 0.0221250029051101, 0.0251530137843965,
            0.0275599035016693, 0.0298664819929119, 0.033763232356598, 0.0374249162331977, 0.0409637006907067,
            0.0454486054162627, 0.0551615080395675, 0.0651612844343542, 0.0736024993960834, 0.0808820441173129,
            0.0894426534710234, 0.0992239010000626, 0.108910229105339, 0.121376204780063, 0.134004998770465,
            0.145079557491685, 0.156667501995989, 0.167543840680748, 0.178622131991811, 0.189365153169168,
            0.199027839424271, 0.20909726215036, 0.218610320633419, 0.228024278642459, 0.238171883969976,
            0.248070501745195, 0.258480494697342, 0.268971907277929, 0.280744803445048, 0.291329662029924,
            0.300942928439923, 0.309972153913063, 0.318315812887558, 0.325822700491885, 0.332992506325249,
            0.339665277042211, 0.345876707002969, 0.351605625998246, 0.357206102668659, 0.362484660673399,
            0.367624391654207, 0.372275248793233, 0.37674043994605, 0.380887801196039, 0.384795899779142,
            0.388569806807258, 0.392075171498793, 0.395375481018565, 0.398377579969751, 0.400997300805061,
            0.403375467852471, 0.405431976972633, 0.407443593366561, 0.409668341757423, 0.411628734365416,
            0.413367576771339, 0.414896999887021, 0.416268233594787, 0.417475290203319, 0.418554781508749,
            0.419526121036389, 0.420522904669104, 0.421455233639571, 0.422296101083462, 0.423049677446171,
            0.423716974236606, 0.424302533927477, 0.424825925226932, 0.425286695396174, 0.425693415010937,
            0.426052733081791, 0.426369464812111, 0.426652822940747, 0.42686317150694, 0.427072533094355,
            0.427264216646862, 0.427427314063872, 0.427573225735422, 0.427700379783919, 0.427814235325525,
            0.427912925916531, 0.427998148400703
        ])

        assert_array_almost_equal(expected_deviance_ratio, coxnet.deviance_ratio_)

        coef = pandas.DataFrame(coxnet.coef_, index=x.columns, dtype=float)
        expected_coef = pandas.read_csv(BREAST_CANCER_COEFFICIENTS_FILE, index_col=0)
        # Replace the column labels read from the file with positions 0..k-1.
        expected_coef.columns = numpy.arange(expected_coef.shape[1])

        assert_columns_almost_equal(coef, expected_coef, 5)
Example #12
0
    def test_series_categorical():
        """A categorical Series yields indicator columns for "medium" and "large"."""
        codes = [1, 1, 0, 2, 0, 1, 2, 1, 2, 0, 0, 1, 2, 2]
        categories = ["small", "medium", "large"]
        input_series = pandas.Series(
            pandas.Categorical.from_codes(codes, categories, ordered=False),
            name="a_series")

        # Each indicator is 1.0 exactly where the code selects that category.
        code_array = numpy.array(codes)
        expected = OrderedDict()
        expected["a_series=medium"] = (code_array == 1).astype(float)
        expected["a_series=large"] = (code_array == 2).astype(float)
        expected_df = pandas.DataFrame.from_dict(expected)

        actual_df = column.encode_categorical(input_series)

        tm.assert_frame_equal(actual_df, expected_df, check_exact=True)
def test_pipeline_predict(func):
    """Compare a forest on pre-encoded data with a OneHotEncoder pipeline.

    ``func`` is the name of the prediction method called on both models.
    """
    X_str, y = load_breast_cancer()
    X_num = encode_categorical(X_str)

    # The first ten rows are held out for prediction; the rest is for fitting.
    train, held_out = slice(10, None), slice(None, 10)
    forest_params = dict(n_estimators=10, random_state=1)

    est = RandomSurvivalForest(**forest_params)
    est.fit(X_num[train], y[train])

    pipe = make_pipeline(OneHotEncoder(), RandomSurvivalForest(**forest_params))
    pipe.fit(X_str[train], y[train])

    predict_direct = getattr(est, func)
    predict_pipeline = getattr(pipe, func)

    assert_array_almost_equal(predict_direct(X_num[held_out]),
                              predict_pipeline(X_str[held_out]))
Example #14
0
    def test_fit_and_predict_clinical_kernel(self):
        """Fit the kernel survival SVM with the clinical kernel on WHAS500.

        Checks that the model is not flagged as pairwise, that there is one
        coefficient per training sample, and that the training score is
        within 1e-3 of the stored reference value.
        """
        x_full, y = load_whas500()

        trans = ClinicalKernelTransform()
        trans.fit(x_full)

        x = encode_categorical(standardize(x_full))

        ssvm = FastKernelSurvivalSVM(optimizer="rbtree",
                                     kernel=trans.pairwise_kernel,
                                     max_iter=100,
                                     random_state=0)
        ssvm.fit(x.values, y)

        self.assertFalse(ssvm._pairwise)
        # assertEqual replaces the deprecated alias assertEquals, which was
        # removed in Python 3.12.
        self.assertEqual(x.shape[0], ssvm.coef_.shape[0])

        c = ssvm.score(x.values, y)
        self.assertLessEqual(abs(0.83699051218246412 - c), 1e-3)
Example #15
0
    def test_case1(self):
        """Encode a frame with a four-level, a two-level and a numeric column."""
        sizes = numpy.hstack([
            numpy.repeat(["large"], 10),
            numpy.repeat(["small"], 5),
            numpy.repeat(["tiny"], 13),
            numpy.repeat(["medium"], 3),
        ])
        answers = numpy.hstack([
            numpy.repeat(["yes"], 8),
            numpy.repeat(["no"], 23),
        ])
        noise = numpy.random.RandomState(0).randn(len(sizes))

        df = pandas.DataFrame({
            "a_category": sizes,
            "a_binary": answers,
            "a_number": noise.copy(),
        })

        actual_df = column.encode_categorical(df)

        def indicator(start, stop):
            # 31-element float vector that is 1 on [start, stop) and 0 elsewhere
            flags = numpy.zeros(31, dtype=float)
            flags[start:stop] = 1
            return flags

        expected_df = pandas.DataFrame({
            "a_number": noise.copy(),
            "a_binary=yes": numpy.hstack([numpy.ones(8), numpy.zeros(23)]),
            "a_category=medium": indicator(28, 31),
            "a_category=small": indicator(10, 15),
            "a_category=tiny": indicator(15, 28),
        })

        self.assertTupleEqual(actual_df.shape, expected_df.shape)
        tm.assert_frame_equal(actual_df, expected_df, check_exact=True)
def test_pipeline_predict(func):
    """Compare Coxnet on pre-encoded data with a OneHotEncoder pipeline.

    ``func`` is the name of the prediction method called on both models;
    the ``x`` and ``y`` attributes of the paired results are compared.
    """
    X_str, y = load_breast_cancer()
    X_num = column.encode_categorical(X_str)

    # The first ten rows are held out for prediction; the rest is for fitting.
    train, held_out = slice(10, None), slice(None, 10)
    coxnet_params = dict(alpha_min_ratio=0.0001,
                         l1_ratio=1.0,
                         fit_baseline_model=True)

    est = CoxnetSurvivalAnalysis(**coxnet_params)
    est.fit(X_num[train], y[train])

    pipe = make_pipeline(OneHotEncoder(), CoxnetSurvivalAnalysis(**coxnet_params))
    pipe.fit(X_str[train], y[train])

    direct_pred = getattr(est, func)(X_num[held_out])
    pipe_pred = getattr(pipe, func)(X_str[held_out])

    for direct_fn, pipe_fn in zip(direct_pred, pipe_pred):
        assert_array_almost_equal(direct_fn.x, pipe_fn.x)
        assert_array_almost_equal(direct_fn.y, pipe_fn.y)
Example #17
0
    def test_drop_all_missing(self):
        """With default arguments an all-missing column is expected to be dropped."""
        answers = numpy.hstack([
            numpy.repeat(["yes"], 5),
            numpy.repeat([None], 10),
            numpy.repeat(["no"], 16),
        ])
        all_missing = numpy.repeat([None], len(answers))

        df = pandas.DataFrame({"a_binary": answers, "bogus": all_missing})

        actual_df = column.encode_categorical(df)

        encoded_answers = numpy.hstack([
            numpy.ones(5),
            numpy.full(10, numpy.nan),
            numpy.zeros(16),
        ])
        expected_df = pandas.DataFrame({"a_binary=yes": encoded_answers})

        self.assertTupleEqual(actual_df.shape, expected_df.shape)
        # Null pattern first, then exact values on the fully observed rows.
        tm.assert_frame_equal(actual_df.isnull(), expected_df.isnull())
        tm.assert_frame_equal(
            actual_df.dropna(), expected_df.dropna(), check_exact=True)
from sksurv.column import encode_categorical
from sksurv.metrics import concordance_index_censored
from sksurv.svm import FastSurvivalSVM

# Load the covariates and the outcome of the veterans lung cancer data.
data_x, y = load_veterans_lung_cancer()

print("\n")
print("La matriz con las covariables")
print(data_x.head())

print("\n")
print("Las primeras 5 observaciones de y")
# Slice five records so the output matches the count announced above
# (the previous slice 0:4 printed only four).
print(y[:5])

# Convert the covariate matrix
x = encode_categorical(data_x)

print("\n")
print("Para ver el inicio de la matriz x")
print(x.head())

# Number of censored observations: all records minus those whose
# "Status" flag is set.
n_censored = y.shape[0] - y["Status"].sum()

print("\n")
print("%.1f%% of records are censored" % (n_censored / y.shape[0] * 100))

# Plotting
plt.figure(figsize=(9, 6))
val, bins, patches = plt.hist(
    (y["Survival_in_days"][y["Status"]], y["Survival_in_days"][~y["Status"]]),
def whas500_with_ties():
    """Return WHAS500 features, encoded and passed through ``normalize``, plus the outcome.

    Unlike the tie-free variant, this uses the data as returned by
    ``load_whas500``.
    """
    features, outcome = load_whas500()
    encoded = encode_categorical(features)
    return normalize(encoded), outcome
Example #20
0
 def setUp(self):
     """Load GBSG2 and keep the encoded features and the outcome on the test case."""
     features, outcome = load_gbsg2()
     self.x = encode_categorical(features)
     self.y = outcome
Example #21
0
 def setUp(self):
     """Load WHAS500; keep the standardized, encoded features and the outcome."""
     raw_features, self.y = load_whas500()
     scaled_features = standardize(raw_features)
     self.x = encode_categorical(scaled_features)
 def setUp(self):
     """Load the tie-free WHAS500 file and keep encoded features and outcome.

     The naive survival SVM resolves ties in survival time differently,
     so the variant of the data without ties is used.
     """
     arff_data = loadarff(WHAS500_NOTIES_FILE)
     features, self.y = get_x_y(arff_data, ['fstat', 'lenfol'], '1')
     self.x = encode_categorical(features)
Example #23
0
def gbsg2():
    """Return the encoded GBSG2 features as a plain array together with the outcome."""
    features, outcome = load_gbsg2()
    encoded = encode_categorical(features)
    return encoded.values, outcome
Example #24
0
def breast_cancer():
    """Return the encoded breast cancer features together with the outcome."""
    raw_features, outcome = load_breast_cancer()
    return encode_categorical(raw_features), outcome