Code Example #1
    def test_get_x_y_survival_no_pos_label():
        x, event, time = _make_survival_data(100, 10, 0)

        columns = ["V{}".format(i) for i in range(10)] + ["event", "time"]
        dataset = pandas.DataFrame(numpy.column_stack((x, event, time)), columns=columns)

        with pytest.raises(ValueError,
                           match="pos_label needs to be specified if survival=True"):
            sdata.get_x_y(dataset, ["event", "time"], survival=True)
Code Example #2
    def test_get_x_y_survival_too_many_labels():
        x, event, time = _make_survival_data(100, 10, 0)

        columns = ["V{}".format(i) for i in range(10)] + ["event", "time"]
        dataset = pandas.DataFrame(numpy.column_stack((x, event, time)), columns=columns)

        attr_labels = ["event", "time", "random"]
        with pytest.raises(ValueError,
                           match="expected sequence of length two for attr_labels, but got 3"):
            sdata.get_x_y(dataset, attr_labels, pos_label=1, survival=True)
Code Example #3
def whas500_without_ties():
    # the naive survival SVM resolves ties in survival time differently,
    # therefore use data without ties
    data = loadarff(WHAS500_NOTIES_FILE)
    x, y = get_x_y(data, ['fstat', 'lenfol'], '1')
    x = encode_categorical(x)
    return x, y
Code Example #4
    def _fit_example(self, **kwargs):
        x, y = get_x_y(pandas.read_csv(EXAMPLE_FILE), ["status", "time"],
                       pos_label=1)
        coxnet = CoxnetSurvivalAnalysis(**kwargs)
        coxnet.fit(x.values, y)

        return x, y, coxnet
Code Example #5
    def test_example_2_standardize(self):
        from sklearn.preprocessing import StandardScaler
        from sklearn.pipeline import Pipeline

        x, y = get_x_y(pandas.read_csv(EXAMPLE_FILE), ["status", "time"],
                       pos_label=1)
        expected_alphas = numpy.array(
            [0.263066005037211, 0.239695946189997, 0.218402018960187, 0.198999785536952, 0.18132119305624,
             0.165213118007272, 0.15053603994994, 0.137162833055499, 0.124977665003457, 0.113874993697428,
             0.103758653109983, 0.0945410203385227, 0.0861422566576188, 0.0784896159941638, 0.0715168148356887,
             0.0651634581142872, 0.0593745160934302, 0.0540998477267133, 0.0492937663601004, 0.0449146440159821,
             0.0409245508315483, 0.037288926528462, 0.0339762810682614, 0.0309579219007116, 0.0282077054426604,
             0.0257018106348284, 0.0234185326151886, 0.0213380947218357, 0.0194424771970012, 0.0177152611085322,
             0.0161414861369557, 0.0147075209963485, 0.0134009453666594, 0.0122104423148384, 0.0111257002729774,
             0.0101373237244409, 0.00923675182439653, 0.00841618424987191, 0.00766851363708906, 0.00698726402087929,
             0.00636653474297097, 0.00580094934331044, 0.00528560899173708, 0.00481605005665997, 0.00438820544321646,
             0.0039983693660421, 0.00364316525153066, 0.00331951649156886, 0.0030246197954287])

        scaler = StandardScaler()
        coxnet = CoxnetSurvivalAnalysis(l1_ratio=0.9)
        pipe = Pipeline([("standardize", scaler),
                         ("coxnet", coxnet)])
        pipe.fit(x.values, y)

        assert_array_almost_equal(expected_alphas, coxnet.alphas_)

        expected_coef = pandas.read_csv(EXAMPLE_COEF_FILE.format("2-std"))
        # rescale coefficients back to the original feature scale,
        # since they were estimated on standardized inputs
        coef = pandas.DataFrame(coxnet.coef_ / scaler.scale_[:, numpy.newaxis],
                                columns=expected_coef.columns, dtype=float)
        assert_columns_almost_equal(coef, expected_coef, 5)
Code Example #6
    def test_get_x_y_classification_no_label():
        x = _make_features(100, 10, 0)

        columns = ["V{}".format(i) for i in range(10)]
        dataset = pandas.DataFrame(x, columns=columns)

        x_test, y_test = sdata.get_x_y(dataset, None, survival=False)

        assert y_test is None
        assert_array_equal(x_test, x)
Code Example #7
    def test_get_x_y_survival_no_label():
        x = _make_features(100, 10, 0)

        columns = ["V{}".format(i) for i in range(10)]
        dataset = pandas.DataFrame(x, columns=columns)

        attr_labels = [None, None]

        x_test, y_test = sdata.get_x_y(dataset, attr_labels, pos_label=1, survival=True)

        assert y_test is None
        assert_array_equal(x, x_test)
Code Example #8
    def test_get_x_y_classification():
        x, label = _make_classification_data(100, 10, 6, 0)

        columns = ["V{}".format(i) for i in range(10)] + ["class_label"]
        dataset = pandas.DataFrame(numpy.column_stack((x, label)), columns=columns)

        attr_labels = ["class_label"]

        x_test, y_test = sdata.get_x_y(dataset, attr_labels, survival=False)

        assert y_test.ndim == 2
        assert_array_equal(y_test.values.ravel(), label)
        assert_array_equal(x_test, x)
Code Example #9
    def test_get_x_y_survival():
        x, event, time = _make_survival_data(100, 10, 0)
        columns = ["V{}".format(i) for i in range(10)] + ["event", "time"]
        dataset = pandas.DataFrame(numpy.column_stack((x, event, time)), columns=columns)

        attr_labels = ["event", "time"]

        x_test, y_test = sdata.get_x_y(dataset, attr_labels, pos_label=1, survival=True)

        assert y_test.dtype.names == ("event", "time")

        assert_array_equal(y_test["event"].astype(numpy.uint32),
                           event.astype(numpy.uint32))
        assert_array_almost_equal(y_test["time"], time)

        assert_array_equal(x, x_test)
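
Worth noting alongside this test: the structured array whose dtype is checked above can also be built directly. A minimal sketch (an addition, not from the test suite) using scikit-survival's Surv helper:

import numpy
from sksurv.util import Surv

# build the same kind of structured array that get_x_y returns,
# directly from an event indicator and a time column
event = numpy.array([True, False, True])
time = numpy.array([5.0, 12.0, 9.0])
y = Surv.from_arrays(event=event, time=time)
print(y.dtype.names)  # ('event', 'time')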
Code Example #10
    def setUp(self):
        # the naive survival SVM resolves ties in survival time differently,
        # therefore use data without ties
        data = loadarff(WHAS500_NOTIES_FILE)
        x, self.y = get_x_y(data, ['fstat', 'lenfol'], '1')
        self.x = encode_categorical(x)
Code Example #11
# Imports required by this script (not shown in the original snippet)
import json
import logging
import os
import pickle
import tarfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
import xgboost
from sklearn.metrics import accuracy_score, classification_report
from sksurv.datasets import get_x_y
from sksurv.metrics import brier_score, concordance_index_ipcw
from smdebug.trials import create_trial

logger = logging.getLogger(__name__)


def main(args):
    """
    Runs evaluation for the data set
        1. Loads model from tar.gz
        2. Reads in test features
        3. Runs an accuracy report
        4. Generates feature importance with SHAP

    Args:
        model-name (str): Name of the trained model, default xgboost
        test-features (str): preprocessed test features for
         evaluation, default test_features.csv
        train-features (str): preproceed train features for SHAP,
        default train_features.csv
        test-features (str): preproceed test features for SHAP,
        default test_features.csv
        report-name (str): Name of the evaluation output
        , default evaluation.json
        shap-name (str): Name of the SHAP feature importance
        output file, default shap.csv
        threshold (float): Threshold to cut probablities at
        , default 0.5
        tau (int): time range for the c-index will be from 0 to tau
        , default 100
    """

    model_path = os.path.join("/opt/ml/processing/model", "model.tar.gz")

    logger.info(f"Extracting model from path: {model_path}")

    with tarfile.open(model_path) as tar:
        tar.extractall(path=".")
    logger.info("Loading model")
    with open(args.model_name, "rb") as f:
        model = pickle.load(f)

    logger.info("Loading train and test data")

    test_features_data = os.path.join("/opt/ml/processing/test",
                                      args.test_features)
    train_features_data = os.path.join("/opt/ml/processing/train",
                                       args.train_features)

    X_test = pd.read_csv(test_features_data, header=0)
    X_train = pd.read_csv(train_features_data, header=0)

    y_test = X_test.iloc[:, 0]
    y_train = X_train.iloc[:, 0]

    # Reverse transform: the sign of the label encodes the event
    # indicator, its magnitude encodes the duration
    y_test_df = pd.DataFrame(
        np.vstack((np.where(y_test > 0, 1, 0), np.abs(y_test))).T,
        columns=["event", "duration"],
    )

    y_train_df = pd.DataFrame(
        np.vstack((np.where(y_train > 0, 1, 0), np.abs(y_train))).T,
        columns=["event", "duration"],
    )

    # drop the label column from both feature frames
    X_test.drop(X_test.columns[0], axis=1, inplace=True)
    X_train.drop(X_train.columns[0], axis=1, inplace=True)

    logger.info("Running inference")

    # the label column was already dropped from X_test above
    predictions = model.predict(xgboost.DMatrix(X_test.values),
                                output_margin=False)

    logger.info("Creating evaluation report")

    # NOTE: the model is not really a classifier, so this report is only
    # an approximate, technical evaluation
    # TODO: normalize predictions to a 0-1 scale
    report_dict = classification_report(y_test_df["event"],
                                        predictions > args.threshold,
                                        output_dict=True)
    report_dict["accuracy"] = accuracy_score(y_test_df["event"],
                                             predictions > args.threshold)

    _, y_train_tuple = get_x_y(y_train_df, ["event", "duration"],
                               pos_label=True)
    _, y_test_tuple = get_x_y(y_test_df, ["event", "duration"], pos_label=True)

    concordance_index = concordance_index_ipcw(
        y_train_tuple,
        y_test_tuple,
        predictions,
        tau=args.tau,  # default within 100 days
    )

    report_dict["concordance_index"] = {
        "cindex": float(concordance_index[0]),
        "concordant": int(concordance_index[1]),
        "discordant": int(concordance_index[2]),
        "tied_risk": int(concordance_index[3]),
        "tied_time": int(concordance_index[4]),
    }

    times, score = brier_score(y_train_tuple, y_test_tuple, predictions,
                               y_test_df["duration"].max() - 1)

    report_dict["brier_score"] = {
        "times": times.astype(np.int32).tolist(),
        "score": score.astype(np.float32).tolist(),
    }

    logger.info(f"Classification report:\n{report_dict}")

    evaluation_output_path = os.path.join("/opt/ml/processing/evaluation",
                                          args.report_name)
    logger.info(f"Saving classification report to {evaluation_output_path}")

    logger.debug(report_dict)

    with open(evaluation_output_path, "w") as f:
        f.write(json.dumps(report_dict))

    # SHAP
    latest_job_debugger_artifacts_path = "/opt/ml/processing/debug/debug-output"
    trial = create_trial(latest_job_debugger_artifacts_path)

    shap_values = trial.tensor("full_shap/f0").value(trial.last_complete_step)

    pd.DataFrame(shap_values).to_csv(
        os.path.join("/opt/ml/processing/evaluation", args.shap_name))

    shap_no_base = shap_values[1:, :-1]
    feature_names = X_train.columns
    os.makedirs("/opt/ml/processing/plot/", exist_ok=True)
    logger.info(f"shap_values: {shap_values.shape}, "
                f"shap_no_base: {shap_no_base.shape}, X_train: {X_train.shape}")
    shap.summary_plot(shap_no_base,
                      features=X_train,
                      feature_names=feature_names,
                      show=False)
    plt.savefig("/opt/ml/processing/plot/feature_importance.png",
                bbox_inches="tight")
Code Example #12
import numpy as np
import pandas as pd

# `im`, `most_variant_genes`, `inhibitors`, `lab_ids` and
# makeFullFeatureVector() are defined earlier in the notebook.
feature_rows = []
for lab_id in lab_ids:
    feature_vector = makeFullFeatureVector(im, most_variant_genes, inhibitors,
                                           lab_id)
    feature_rows.append(pd.Series(data=feature_vector, name=lab_id))
# DataFrame.append was removed in pandas 2.0; build the frame in one step
feature_matrix = pd.DataFrame(feature_rows)
# In[ ]:
feature_means = feature_matrix.mean()
feature_stds = feature_matrix.std()
normed_features = (feature_matrix - feature_means) / feature_stds
normed_features = normed_features.fillna(0.0)
# In[ ]:
from sksurv.datasets import get_x_y
full_dataset = pd.read_csv('training/response.csv').set_index('lab_id').join(
    normed_features)
X, Y = get_x_y(full_dataset, ['vitalStatus', 'overallSurvival'],
               pos_label='Dead')
# In[ ]:
from sksurv.linear_model import CoxnetSurvivalAnalysis
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

# This package allows general elastic net tuning, but by setting
# l1_ratio=1, we restrict to LASSO.
regr = CoxnetSurvivalAnalysis(l1_ratio=1, alpha_min_ratio=0.05, max_iter=300000)

alphas = np.logspace(-1.3, 0, num=100)
cv = KFold(n_splits=5, shuffle=True, random_state=328)
gcv = GridSearchCV(regr, {"alphas": [[v] for v in alphas]}, cv=cv).fit(X, Y)
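
A short follow-up sketch, not in the original notebook: the selected penalty and its cross-validated score can be read back through standard GridSearchCV attributes.

# best_params_ holds the winning {"alphas": [...]} entry; best_score_ is the
# mean cross-validated concordance index (the estimator's default score)
print(gcv.best_params_)
print(gcv.best_score_)
best_model = gcv.best_estimator_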
Code Example #13
# Imports assumed by this snippet; `df` holds the policy data
# loaded earlier in the script.
from sksurv.datasets import get_x_y
from sksurv.linear_model import CoxnetSurvivalAnalysis
from sksurv.preprocessing import OneHotEncoder

# keep only records observed for more than 12 duration units
df = df[df['Duration'] > 12]

df2 = df.loc[:, [
    'DISTRIBUTION CHANNEL', 'GENDER', 'SMOKER STATUS', 'AGE AT DOC',
    'PremiumPattern', 'BENEFITS TYPE', 'BROKER COMM', 'DEBITORDERPERIOD',
    'PREM % EARNINGS BAND'
]]

T = df['Duration']

E = df['LapseIndicator'] == 1

df2['E'] = E
df2['T'] = T

X, y = get_x_y(df2, ['E', 'T'], pos_label=True)

for c in X.columns.values:
    if c != 'AGE AT DOC':
        X[c] = X[c].astype('category')

data_x_numeric = OneHotEncoder().fit_transform(X)
#%%

estimator = CoxnetSurvivalAnalysis(verbose=True)
estimator.fit(data_x_numeric, y)
#%%

print(estimator.score(data_x_numeric, y))
print()
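
Since CoxnetSurvivalAnalysis fits a whole regularization path, a brief sketch of inspecting it may be useful here; `alphas_`, `coef_`, and the `alpha` argument of `predict` are part of the scikit-survival API, the rest is illustrative:

# one coefficient vector is stored per penalty strength on the path
print(estimator.alphas_.shape)   # number of alphas on the path
print(estimator.coef_.shape)     # (n_features, n_alphas)

# risk scores at a specific penalty strength on the fitted path
risk_scores = estimator.predict(data_x_numeric, alpha=estimator.alphas_[0])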
Code Example #14
    # same feature selection as in Code Example #13
    df2 = df.loc[:, [
        'DISTRIBUTION CHANNEL', 'GENDER', 'SMOKER STATUS', 'AGE AT DOC',
        'PremiumPattern', 'BENEFITS TYPE', 'BROKER COMM', 'DEBITORDERPERIOD',
        'PREM % EARNINGS BAND'
    ]]

    # NOTE: this is the old prince API, where the DataFrame is passed to the
    # constructor; recent prince versions use MCA(n_components=8).fit(df2)
    mca = prince.MCA(df2, n_components=8)

    df2 = mca.row_component_contributions

    T = df['Duration']

    E = df['LapseIndicator'] == 1

    df2['E'] = E
    df2['T'] = T

    data_x_numeric, y = get_x_y(df2, ['E', 'T'], pos_label=True)
    #%%

    estimator = CoxPHSurvivalAnalysis(verbose=True, n_iter=10000)
    estimator.fit(data_x_numeric, y)
    #%%

    print()
    print(pd.Series(estimator.coef_, index=data_x_numeric.columns))
    print()

    print(estimator.score(data_x_numeric, y))
    print()

    scores = fit_and_score_features(data_x_numeric.values, y)
    print(pd.Series(scores, index=data_x_numeric.columns))
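
`fit_and_score_features` is not defined in this snippet; a minimal sketch of such a helper, modeled on the univariate-Cox feature ranking from the scikit-survival user guide (the author's actual helper may differ):

import numpy as np
from sksurv.linear_model import CoxPHSurvivalAnalysis

def fit_and_score_features(X, y):
    # fit one univariate Cox model per feature and score it with
    # Harrell's concordance index (the estimator's default score)
    n_features = X.shape[1]
    scores = np.empty(n_features)
    model = CoxPHSurvivalAnalysis()
    for j in range(n_features):
        Xj = X[:, j:j + 1]
        model.fit(Xj, y)
        scores[j] = model.score(Xj, y)
    return scores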