def test_series_numeric():
    input_series = pandas.Series(
        [0.5, 0.1, 10, 25, 3.8, 11, 2256, -1, -0.2, 3.14],
        name="a_series")

    with pytest.raises(
            TypeError,
            match="series must be of categorical dtype, but was float"):
        column.encode_categorical(input_series)
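# The test above only checks that a plain float series is rejected. As a hedged,
# standalone sketch (not part of the original test suite), the example below shows
# one way to satisfy the dtype requirement: cast the series to pandas' categorical
# dtype before calling encode_categorical. The function name and the printed
# column-name pattern are illustrative assumptions, not verified output.
def example_encode_numeric_as_categorical():
    import pandas
    from sksurv import column

    values = pandas.Series([0.5, 0.1, 10, 25, 3.8, 11, 2256, -1, -0.2, 3.14],
                           name="a_series")

    # casting to "category" avoids the TypeError raised in test_series_numeric;
    # every distinct value becomes its own category
    encoded = column.encode_categorical(values.astype("category"))

    # dummy columns are named "a_series=<value>", with one reference category
    # dropped (cf. test_case_numeric)
    print(encoded.columns.tolist())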
def test_case_numeric():
    a = numpy.array([0, 1, 1, 0, 1, 0, 0, 1, 0, 1], dtype=object)
    b = numpy.array([1, 2, 1, 3, 2, 1, 3, 2, 3, 1], dtype=object)
    c = numpy.array([1. / 128, 1. / 32, 1., 1. / 8, 1. / 32, 1.,
                     1. / 128, 1. / 8, 1., 1. / 32], dtype=object)

    df = pandas.DataFrame({
        "a_binary_int": a.copy(),
        "a_three_int": b.copy(),
        "a_four_float": c.copy()
    })

    actual_df = column.encode_categorical(df)

    expected_df = pandas.DataFrame({
        "a_binary_int=1": a.astype(float),
        "a_three_int=2": (b == 2).astype(float),
        "a_three_int=3": (b == 3).astype(float),
        "a_four_float={}".format(1. / 32): (c == 1. / 32).astype(float),
        "a_four_float={}".format(1. / 8): (c == 1. / 8).astype(float),
        "a_four_float={}".format(1.): (c == 1.).astype(float),
    })

    assert actual_df.shape == expected_df.shape
    tm.assert_frame_equal(actual_df, expected_df, check_exact=True)
def test_case1():
    a = numpy.r_[numpy.repeat(["large"], 10),
                 numpy.repeat(["small"], 5),
                 numpy.repeat(["tiny"], 13),
                 numpy.repeat(["medium"], 3)]
    b = numpy.r_[numpy.repeat(["yes"], 8), numpy.repeat(["no"], 23)]
    rnd = numpy.random.RandomState(0)
    c = rnd.randn(len(a))

    df = pandas.DataFrame.from_dict(
        OrderedDict([("a_category", a),
                     ("a_binary", b),
                     ("a_number", c.copy())]))

    actual_df = column.encode_categorical(df)

    eb = numpy.r_[numpy.repeat([1.], 8), numpy.repeat([0.], 23)]

    a_tiny = numpy.zeros(31, dtype=float)
    a_tiny[15:28] = 1

    a_small = numpy.zeros(31, dtype=float)
    a_small[10:15] = 1

    a_medium = numpy.zeros(31, dtype=float)
    a_medium[-3:] = 1

    expected_df = pandas.DataFrame.from_dict(
        OrderedDict([("a_category=medium", a_medium),
                     ("a_category=small", a_small),
                     ("a_category=tiny", a_tiny),
                     ("a_binary=yes", eb),
                     ("a_number", c.copy())]))

    assert actual_df.shape == expected_df.shape
    tm.assert_frame_equal(actual_df, expected_df, check_exact=True)
def whas500_without_ties():
    # the naive survival SVM resolves ties in survival time differently,
    # therefore use data without ties
    data = loadarff(WHAS500_NOTIES_FILE)
    x, y = get_x_y(data, ['fstat', 'lenfol'], '1')
    x = encode_categorical(x)
    return x, y
def test_duplicate_index(self):
    a = numpy.concatenate((
        numpy.repeat(["large"], 10), numpy.repeat(["small"], 6),
        numpy.repeat(["tiny"], 13), numpy.repeat(["medium"], 3)))
    rnd = numpy.random.RandomState(0)
    c = rnd.randn(len(a))

    index = numpy.ceil(numpy.arange(0, len(a) // 2, 0.5))
    df = pandas.DataFrame.from_dict(OrderedDict([
        ("a_category", pandas.Series(a, index=index)),
        ("a_number", pandas.Series(c, index=index, copy=True))
    ]))

    actual_df = column.encode_categorical(df)

    expected_df = pandas.DataFrame(numpy.zeros((32, 3), dtype=numpy.float_),
                                   index=index,
                                   columns=["a_category=medium",
                                            "a_category=small",
                                            "a_category=tiny"])
    # tiny
    expected_df.iloc[16:29, 2] = 1
    # small
    expected_df.iloc[10:16, 1] = 1
    # medium
    expected_df.iloc[-3:, 0] = 1

    expected_df["a_number"] = c

    self.assertTupleEqual(actual_df.shape, expected_df.shape)
    tm.assert_frame_equal(actual_df, expected_df, check_exact=True)
def test_with_missing(self):
    b = numpy.concatenate((
        numpy.repeat(["yes"], 5), numpy.repeat([None], 10),
        numpy.repeat(["no"], 16)))
    rnd = numpy.random.RandomState(0)
    c = rnd.randn(len(b))

    # OrderedDict does not accept string-literal keywords; build it from pairs
    df = pandas.DataFrame(
        OrderedDict([("a_binary", b), ("a_number", c.copy())]))

    actual_df = column.encode_categorical(df)

    eb = numpy.concatenate((
        numpy.repeat([1.], 5), numpy.repeat([numpy.nan], 10),
        numpy.repeat([0.], 16)))

    d = OrderedDict()
    d['a_binary=yes'] = eb
    d['a_number'] = c.copy()
    expected_df = pandas.DataFrame(d)

    self.assertTupleEqual(actual_df.shape, expected_df.shape)
    tm.assert_frame_equal(actual_df.isnull(), expected_df.isnull())
    tm.assert_frame_equal(actual_df.dropna(), expected_df.dropna(),
                          check_exact=True)
def test_compare_clinical_kernel(self):
    x_full, y = load_whas500()

    trans = ClinicalKernelTransform()
    trans.fit(x_full)

    x = encode_categorical(standardize(x_full))

    kpca = KernelPCA(kernel=trans.pairwise_kernel)
    xt = kpca.fit_transform(x)

    nrsvm = FastSurvivalSVM(optimizer='rbtree', tol=1e-8, max_iter=1000,
                            random_state=0)
    nrsvm.fit(xt, y)

    rsvm = FastKernelSurvivalSVM(optimizer='rbtree',
                                 kernel=trans.pairwise_kernel,
                                 tol=1e-8, max_iter=1000, random_state=0)
    rsvm.fit(x, y)

    pred_nrsvm = nrsvm.predict(kpca.transform(x))
    pred_rsvm = rsvm.predict(x)

    self.assertEqual(len(pred_nrsvm), len(pred_rsvm))

    c1 = concordance_index_censored(y['fstat'], y['lenfol'], pred_nrsvm)
    c2 = concordance_index_censored(y['fstat'], y['lenfol'], pred_rsvm)

    self.assertAlmostEqual(c1[0], c2[0])
    self.assertTupleEqual(c1[1:], c2[1:])
def test_with_missing():
    b = numpy.r_[numpy.repeat(["yes"], 5),
                 numpy.repeat([None], 10),
                 numpy.repeat(["no"], 16)]
    rnd = numpy.random.RandomState(0)
    c = rnd.randn(len(b))

    df = pandas.DataFrame(
        OrderedDict([("a_binary", b), ("a_number", c.copy())]))

    actual_df = column.encode_categorical(df)

    eb = numpy.r_[numpy.repeat([1.], 5),
                  numpy.repeat([numpy.nan], 10),
                  numpy.repeat([0.], 16)]

    d = OrderedDict()
    d['a_binary=yes'] = eb
    d['a_number'] = c.copy()
    expected_df = pandas.DataFrame(d)

    assert actual_df.shape == expected_df.shape
    tm.assert_frame_equal(actual_df.isnull(), expected_df.isnull())
    tm.assert_frame_equal(actual_df.dropna(), expected_df.dropna(),
                          check_exact=True)
def test_retain_all_missing():
    b = numpy.r_[numpy.repeat(["yes"], 5),
                 numpy.repeat([None], 10),
                 numpy.repeat(["no"], 16)]
    all_missing = numpy.repeat([None], len(b))

    df = pandas.DataFrame({"a_binary": b, "bogus": all_missing})

    actual_df = column.encode_categorical(df, allow_drop=False)

    eb = numpy.r_[numpy.repeat([1.], 5),
                  numpy.repeat([numpy.nan], 10),
                  numpy.repeat([0.], 16)]

    expected_df = pandas.DataFrame({
        "a_binary=yes": eb,
        "bogus": all_missing.copy()
    })

    assert actual_df.shape == expected_df.shape
    tm.assert_frame_equal(actual_df.isnull(), expected_df.isnull())
    tm.assert_frame_equal(actual_df.dropna(), expected_df.dropna(),
                          check_exact=True)
def data_processing(data_df):
    # drop identifiers, dates and outcome-related columns from the feature matrix
    data_df_x = data_df.drop(['LOC', 'UID', 'Hospital_ID', 'SurvivalWeeks',
                              'admission_date', 'discharge_date', 'death_date',
                              'Mortality', 'CVDeath', 'SurvivalDays', 'CAD'],
                             axis=1)
    data_df_y = data_df[['Mortality', 'SurvivalWeeks']]
    data_df_x = data_df_x.drop(['ICU'], axis=1)

    # LOC 2, 3 and 6 form the training/validation pool; LOC 8 is held out as test set
    X_temp = data_df_x[(data_df.LOC == '3') | (data_df.LOC == '2') | (data_df.LOC == '6')]
    y_temp = data_df_y[(data_df.LOC == '3') | (data_df.LOC == '2') | (data_df.LOC == '6')]

    X_df_train, X_df_val, y_df_train, y_df_val = train_test_split(
        X_temp, y_temp, test_size=0.25, random_state=369)

    X_df_test_kao = data_df_x[data_df.LOC == '8']
    y_df_test_kao = data_df_y[data_df.LOC == '8']

    categorical_columns = ['Sex', 'AF', 'DM', 'HTN', 'Hyperlipidemia', 'CHF',
                           'Smoking', 'Cancer.before.adm', 'Foley', 'NG',
                           'Dyslipidemia']
    numerical_columns = np.setdiff1d(data_df_x.columns, categorical_columns).tolist()
    categorical_ix = [data_df_x.columns.get_loc(col) for col in categorical_columns]
    numerical_ix = np.setdiff1d(list(range(0, len(data_df_x.columns))), categorical_ix).tolist()

    # standardize numerical columns, leave categorical columns untouched
    scaler = preprocessing.StandardScaler()
    standardize = [([col], scaler) for col in numerical_columns]
    leave = [(col, None) for col in categorical_columns]
    x_mapper = DataFrameMapper(standardize + leave)

    # fit the scaler on the training set only and reuse it for validation and
    # test data, so no information leaks from those sets into preprocessing
    X_df_train = pd.DataFrame(data=x_mapper.fit_transform(X_df_train),
                              columns=numerical_columns + categorical_columns,
                              index=X_df_train.index)
    X_df_val = pd.DataFrame(data=x_mapper.transform(X_df_val),
                            columns=numerical_columns + categorical_columns,
                            index=X_df_val.index)
    X_df_test_kao = pd.DataFrame(data=x_mapper.transform(X_df_test_kao),
                                 columns=numerical_columns + categorical_columns,
                                 index=X_df_test_kao.index)

    X_df_train = encode_categorical(X_df_train, columns=categorical_columns)
    X_df_val = encode_categorical(X_df_val, columns=categorical_columns)
    X_df_test_kao = encode_categorical(X_df_test_kao, columns=categorical_columns)

    return X_df_train, X_df_val, y_df_train, y_df_val, X_df_test_kao, y_df_test_kao
def test_breast_example(self):
    x, y = load_breast_cancer()
    x = column.encode_categorical(x)

    coxnet = CoxnetSurvivalAnalysis(l1_ratio=1.0)
    coxnet.fit(x.values, y)

    expected_alphas = numpy.array([
        0.207764947265866, 0.189307681974955, 0.172490109262135,
        0.157166563357949, 0.143204319038428, 0.130482442022696,
        0.118890741498079, 0.108328815700004, 0.0987051822799425,
        0.0899364859290742, 0.0819467763944772, 0.0746668506343715,
        0.0680336534144775, 0.0619897311537413, 0.0564827342889011,
        0.051464963847614, 0.046892958302776, 0.0427271171295661,
        0.0389313578046448, 0.0354728032765984, 0.0323214972006479,
        0.0294501444711215, 0.0268338748043064, 0.0244500273239498,
        0.0222779542835891, 0.0202988422256499, 0.0184955490282766,
        0.0168524554284737, 0.0153553297355215, 0.0139912045628799,
        0.0127482645108893, 0.0116157438274312, 0.0105838331601337,
        0.00964359459245389, 0.00878688422772072, 0.00800628165059773,
        0.0072950256549955, 0.0066469556817389, 0.00605645845875073,
        0.00551841938157428, 0.00502817821311635, 0.00458148871890295,
        0.00417448188822764, 0.00380363242263169, 0.00346572820145532,
        0.00315784245998521, 0.00287730843921864, 0.00262169628767281,
        0.00238879201517371, 0.00217657831633235, 0.00198321709761059,
        0.00180703355663423, 0.00164650167585602, 0.00150023100492174,
        0.0013669546172544, 0.00124551813654232, 0.00113486973808373,
        0.00103405103838443, 0.000942188794098442, 0.000858487338411865,
        0.000782221689357606, 0.00071273127036839, 0.000649414188678556,
        0.000591722022016858, 0.00053915506843511, 0.00049125801812897,
        0.000447616009762226, 0.000407851037136367, 0.000371618675081733,
        0.000338605096211458, 0.000308524352698783, 0.00028111589953377,
        0.000256142337807075, 0.000233387358474159, 0.000212653868789829,
        0.000193762285185162, 0.000176548977800548, 0.000160864853202119,
        0.000146574063005757, 0.000133552827223371, 0.000121688362139862,
        0.000110877903434536, 0.000101027816085719, 9.20527833489927e-05,
        8.38750677843702e-05, 7.64238379317803e-05, 6.96345548028444e-05,
        6.34484128750348e-05
    ])

    assert_array_almost_equal(expected_alphas, coxnet.alphas_)

    expected_deviance_ratio = numpy.array([
        0, 0.00741462796207568, 0.0135178719105177, 0.0183232499901932,
        0.0221250029051101, 0.0251530137843965, 0.0275599035016693,
        0.0298664819929119, 0.033763232356598, 0.0374249162331977,
        0.0409637006907067, 0.0454486054162627, 0.0551615080395675,
        0.0651612844343542, 0.0736024993960834, 0.0808820441173129,
        0.0894426534710234, 0.0992239010000626, 0.108910229105339,
        0.121376204780063, 0.134004998770465, 0.145079557491685,
        0.156667501995989, 0.167543840680748, 0.178622131991811,
        0.189365153169168, 0.199027839424271, 0.20909726215036,
        0.218610320633419, 0.228024278642459, 0.238171883969976,
        0.248070501745195, 0.258480494697342, 0.268971907277929,
        0.280744803445048, 0.291329662029924, 0.300942928439923,
        0.309972153913063, 0.318315812887558, 0.325822700491885,
        0.332992506325249, 0.339665277042211, 0.345876707002969,
        0.351605625998246, 0.357206102668659, 0.362484660673399,
        0.367624391654207, 0.372275248793233, 0.37674043994605,
        0.380887801196039, 0.384795899779142, 0.388569806807258,
        0.392075171498793, 0.395375481018565, 0.398377579969751,
        0.400997300805061, 0.403375467852471, 0.405431976972633,
        0.407443593366561, 0.409668341757423, 0.411628734365416,
        0.413367576771339, 0.414896999887021, 0.416268233594787,
        0.417475290203319, 0.418554781508749, 0.419526121036389,
        0.420522904669104, 0.421455233639571, 0.422296101083462,
        0.423049677446171, 0.423716974236606, 0.424302533927477,
        0.424825925226932, 0.425286695396174, 0.425693415010937,
        0.426052733081791, 0.426369464812111, 0.426652822940747,
        0.42686317150694, 0.427072533094355, 0.427264216646862,
        0.427427314063872, 0.427573225735422, 0.427700379783919,
        0.427814235325525, 0.427912925916531, 0.427998148400703
    ])

    assert_array_almost_equal(expected_deviance_ratio, coxnet.deviance_ratio_)

    coef = pandas.DataFrame(coxnet.coef_, index=x.columns, dtype=float)
    expected_coef = pandas.read_csv(BREAST_CANCER_COEFFICIENTS_FILE, index_col=0)
    expected_coef.columns = numpy.arange(expected_coef.shape[1])

    assert_columns_almost_equal(coef, expected_coef, 5)
def test_series_categorical():
    input_series = pandas.Series(
        pandas.Categorical.from_codes([1, 1, 0, 2, 0, 1, 2, 1, 2, 0, 0, 1, 2, 2],
                                      ["small", "medium", "large"],
                                      ordered=False),
        name="a_series")

    expected_df = pandas.DataFrame.from_dict(OrderedDict([
        ("a_series=medium",
         numpy.array([1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0], dtype=float)),
        ("a_series=large",
         numpy.array([0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1], dtype=float)),
    ]))

    actual_df = column.encode_categorical(input_series)

    tm.assert_frame_equal(actual_df, expected_df, check_exact=True)
def test_pipeline_predict(func):
    X_str, y = load_breast_cancer()
    X_num = encode_categorical(X_str)

    est = RandomSurvivalForest(n_estimators=10, random_state=1)
    est.fit(X_num[10:], y[10:])

    pipe = make_pipeline(OneHotEncoder(),
                         RandomSurvivalForest(n_estimators=10, random_state=1))
    pipe.fit(X_str[10:], y[10:])

    tree_pred = getattr(est, func)(X_num[:10])
    pipe_pred = getattr(pipe, func)(X_str[:10])

    assert_array_almost_equal(tree_pred, pipe_pred)
def test_fit_and_predict_clinical_kernel(self):
    x_full, y = load_whas500()

    trans = ClinicalKernelTransform()
    trans.fit(x_full)

    x = encode_categorical(standardize(x_full))

    ssvm = FastKernelSurvivalSVM(optimizer="rbtree",
                                 kernel=trans.pairwise_kernel,
                                 max_iter=100, random_state=0)
    ssvm.fit(x.values, y)

    self.assertFalse(ssvm._pairwise)
    self.assertEqual(x.shape[0], ssvm.coef_.shape[0])

    c = ssvm.score(x.values, y)
    self.assertLessEqual(abs(0.83699051218246412 - c), 1e-3)
def test_case1(self):
    a = numpy.concatenate((
        numpy.repeat(["large"], 10), numpy.repeat(["small"], 5),
        numpy.repeat(["tiny"], 13), numpy.repeat(["medium"], 3)))
    b = numpy.concatenate((
        numpy.repeat(["yes"], 8), numpy.repeat(["no"], 23)))
    rnd = numpy.random.RandomState(0)
    c = rnd.randn(len(a))

    df = pandas.DataFrame({"a_category": a,
                           "a_binary": b,
                           "a_number": c.copy()})

    actual_df = column.encode_categorical(df)

    eb = numpy.concatenate((
        numpy.repeat([1.], 8), numpy.repeat([0.], 23)))

    a_tiny = numpy.zeros(31, dtype=float)
    a_tiny[15:28] = 1

    a_small = numpy.zeros(31, dtype=float)
    a_small[10:15] = 1

    a_medium = numpy.zeros(31, dtype=float)
    a_medium[-3:] = 1

    expected_df = pandas.DataFrame({"a_number": c.copy(),
                                    "a_binary=yes": eb,
                                    "a_category=medium": a_medium,
                                    "a_category=small": a_small,
                                    "a_category=tiny": a_tiny})

    self.assertTupleEqual(actual_df.shape, expected_df.shape)
    tm.assert_frame_equal(actual_df, expected_df, check_exact=True)
def test_pipeline_predict(func):
    X_str, y = load_breast_cancer()
    X_num = column.encode_categorical(X_str)

    est = CoxnetSurvivalAnalysis(alpha_min_ratio=0.0001, l1_ratio=1.0,
                                 fit_baseline_model=True)
    est.fit(X_num[10:], y[10:])

    pipe = make_pipeline(
        OneHotEncoder(),
        CoxnetSurvivalAnalysis(alpha_min_ratio=0.0001, l1_ratio=1.0,
                               fit_baseline_model=True))
    pipe.fit(X_str[10:], y[10:])

    tree_pred = getattr(est, func)(X_num[:10])
    pipe_pred = getattr(pipe, func)(X_str[:10])

    for s1, s2 in zip(tree_pred, pipe_pred):
        assert_array_almost_equal(s1.x, s2.x)
        assert_array_almost_equal(s1.y, s2.y)
def test_drop_all_missing(self):
    b = numpy.concatenate((
        numpy.repeat(["yes"], 5), numpy.repeat([None], 10),
        numpy.repeat(["no"], 16)))
    all_missing = numpy.repeat([None], len(b))

    df = pandas.DataFrame({"a_binary": b, "bogus": all_missing})

    actual_df = column.encode_categorical(df)

    eb = numpy.concatenate((
        numpy.repeat([1.], 5), numpy.repeat([numpy.nan], 10),
        numpy.repeat([0.], 16)))

    expected_df = pandas.DataFrame({"a_binary=yes": eb})

    self.assertTupleEqual(actual_df.shape, expected_df.shape)
    tm.assert_frame_equal(actual_df.isnull(), expected_df.isnull())
    tm.assert_frame_equal(actual_df.dropna(), expected_df.dropna(),
                          check_exact=True)
import matplotlib.pyplot as plt

from sksurv.column import encode_categorical
from sksurv.datasets import load_veterans_lung_cancer
from sksurv.metrics import concordance_index_censored
from sksurv.svm import FastSurvivalSVM

data_x, y = load_veterans_lung_cancer()

print("\n")
print("The matrix of covariates")
print(data_x.head())

print("\n")
print("The first observations of y")
print(y[0:4])

# convert the covariate matrix to numeric dummy coding
x = encode_categorical(data_x)

print("\n")
print("The first rows of the matrix x")
print(x.head())

# number of censored observations
n_censored = y.shape[0] - y["Status"].sum()

print("\n")
print("%.1f%% of records are censored" % (n_censored / y.shape[0] * 100))

# plot the distribution of survival times
plt.figure(figsize=(9, 6))
val, bins, patches = plt.hist(
    (y["Survival_in_days"][y["Status"]],
     y["Survival_in_days"][~y["Status"]]),
def whas500_with_ties():
    # WHAS500 data with tied survival times, in contrast to whas500_without_ties
    x, y = load_whas500()
    x = normalize(encode_categorical(x))
    return x, y
def setUp(self):
    x, y = load_gbsg2()
    self.x = encode_categorical(x)
    self.y = y
def setUp(self):
    x, self.y = load_whas500()
    self.x = encode_categorical(standardize(x))
def setUp(self):
    # the naive survival SVM resolves ties in survival time differently,
    # therefore use data without ties
    data = loadarff(WHAS500_NOTIES_FILE)
    x, self.y = get_x_y(data, ['fstat', 'lenfol'], '1')
    self.x = encode_categorical(x)
def gbsg2():
    x, y = load_gbsg2()
    x = encode_categorical(x)
    return x.values, y
def breast_cancer():
    X_str, y = load_breast_cancer()
    X_num = encode_categorical(X_str)
    return X_num, y
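# Taken together, the fixtures above all route raw data through encode_categorical
# before a model sees it. As a hedged summary sketch (a toy DataFrame of my own,
# not taken from any test above), the example below illustrates the naming
# convention the tests assert: a categorical column with k levels is replaced by
# k-1 indicator columns named "<column>=<level>", with the first level serving as
# the dropped reference, while numeric columns pass through unchanged.
def example_encode_categorical_toy_frame():
    import pandas
    from sksurv.column import encode_categorical

    df = pandas.DataFrame({
        "size": pandas.Categorical(["small", "large", "medium", "small"]),
        "age": [61.0, 52.0, 70.5, 48.0],
    })

    encoded = encode_categorical(df)

    # per the tests above, "size" should expand to "size=medium" and "size=small"
    # (the first level, "large", is the dropped reference) and "age" is kept as-is
    print(encoded.columns.tolist())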