def fit_and_prepare(x_train, y_train, test_df):

    # 3.1. Prepare Y-----
    y_train.specific_death = y_train.specific_death.astype(bool)

    # Transform it into a structured array
    y_train = y_train.to_records(index=False)

    # 3.2. Prepare X-----
    # obtain the x variables that are categorical
    categorical_feature_mask = x_train.dtypes == object

    # Filter categorical columns using mask and turn it into a list
    categorical_cols = x_train.columns[categorical_feature_mask].tolist()

    # Ensure categorical columns are category type
    for col in categorical_cols:
        x_train[col] = x_train[col].astype('category')
        test_df[col] = test_df[col].astype('category')

    # 3.3. Fit model-----
    # initiate
    encoder = OneHotEncoder()
    estimator = CoxPHSurvivalAnalysis()

    # fit model
    estimator.fit(encoder.fit_transform(x_train), y_train)

    # transform the test variables to match the train
    x_test = encoder.transform(test_df)

    return (estimator, x_test, x_train, y_train)
    def test_fit_unpenalized():
        X, y = load_breast_cancer()
        included = X["grade"] != "unkown"
        X = X.loc[included, :]
        y = y[included.values]

        X["grade"] = pandas.Series(pandas.Categorical(
            X["grade"].astype(object),
            categories=["intermediate", "poorly differentiated",
                        "well differentiated"]),
            index=X.index, name="grade")

        enc = OneHotEncoder()
        X = enc.fit_transform(X)

        cols_unpen = ['age', 'size', 'grade=poorly differentiated',
                      'grade=well differentiated', 'er=positive']
        X = pandas.concat((
            X.loc[:, cols_unpen],
            X.drop(cols_unpen, axis=1)),
            axis=1)

        alphas = numpy.ones(X.shape[1])
        alphas[:len(cols_unpen)] = 0.0

        cph = CoxPHSurvivalAnalysis(alpha=alphas)
        cph.fit(X, y)

        coef = numpy.array([
            -0.0228825990482334, 0.635554486750423, -0.242079636336473,
            -1.30197563647684, -2.27790151300312,
            0.291950212930807, 0.210861165049552, -0.612456645638769, -0.453414844486013, -0.1239424190253,
            0.196855946938761, 1.08724198521351, -0.313645443818603, -0.660016141198812, 1.07104977404073,
            0.559632480471393, -0.47740746012516, -1.26199769642326, -1.40486191330444, -0.418517018253652,
            0.284936091689505, -0.215531076378674, -0.200889269720281, 0.341231176941461, 0.0307350667648337,
            -0.212527052910377, -0.3019678509188, 0.54491723178866, -0.286914381308269, 0.370374100647823,
            -0.496258248067704, 0.624528657777646, 0.287884026214139, 0.022095151910937, 0.910293732936019,
            -0.13076488639207, 0.0857209529827562, -0.0922302696963889, 0.498136631416287, 0.937133644376614,
            0.395090607856869, -1.04727952099579, -0.54974694800345, 0.442372971174454, -0.745558450753062,
            -0.0920496108021893, 0.75549238586293, 0.562496351046743, 0.259183349320614, 0.405816113039412,
            -0.0969485695700491, -0.507388915258978, -0.474246597197329, -0.209335517183595, 0.187390427612498,
            -0.0522568530719332, 0.0806559868641646, -0.0397654339013217, -0.269582356665396, 0.791793553908743,
            0.344208857844796, -0.180165785909583, -0.7927695046551, 0.0311635012097026, -0.579429950080662,
            -0.264770995160963, 0.869512689697827, 0.765479119494175, -0.173588059680979, -0.199781736503338,
            -0.58712767650975, -0.457389854855, 0.3891865514653, 0.707309743580534, -0.121997864690072,
            0.0447174402649954, 0.0319336975869795, 0.0117988435665652, -0.593691059339064, -0.838107176656365,
            -0.247955128152877
        ])

        assert_array_almost_equal(cph.coef_, coef)