def test_get_x_y_survival_no_pos_label():
    """get_x_y raises ValueError when survival=True and pos_label is omitted."""
    features, event, time = _make_survival_data(100, 10, 0)
    feature_names = ["V{}".format(i) for i in range(10)]
    dataset = pandas.DataFrame(
        numpy.column_stack((features, event, time)),
        columns=feature_names + ["event", "time"],
    )

    expected_message = "pos_label needs to be specified if survival=True"
    with pytest.raises(ValueError, match=expected_message):
        sdata.get_x_y(dataset, ["event", "time"], survival=True)
def test_get_x_y_survival_too_many_labels():
    """get_x_y raises ValueError when three attr_labels are given with survival=True."""
    features, event, time = _make_survival_data(100, 10, 0)
    feature_names = ["V{}".format(i) for i in range(10)]
    dataset = pandas.DataFrame(
        numpy.column_stack((features, event, time)),
        columns=feature_names + ["event", "time"],
    )

    # One label more than the two that survival data accepts.
    too_many_labels = ["event", "time", "random"]
    expected_message = "expected sequence of length two for attr_labels, but got 3"
    with pytest.raises(ValueError, match=expected_message):
        sdata.get_x_y(dataset, too_many_labels, pos_label=1, survival=True)
def whas500_without_ties():
    """Load WHAS500_NOTIES_FILE and return (encoded features, outcome).

    The naive survival SVM resolves ties in survival time differently,
    therefore data without ties is used.
    """
    raw = loadarff(WHAS500_NOTIES_FILE)
    features, outcome = get_x_y(raw, ['fstat', 'lenfol'], '1')
    return encode_categorical(features), outcome
def _fit_example(self, **kwargs):
    """Fit a CoxnetSurvivalAnalysis on the example data.

    Keyword arguments are forwarded to the CoxnetSurvivalAnalysis constructor.
    Returns the feature frame, the outcome and the fitted estimator.
    """
    example = pandas.read_csv(EXAMPLE_FILE)
    x, y = get_x_y(example, ["status", "time"], pos_label=1)

    model = CoxnetSurvivalAnalysis(**kwargs)
    model.fit(x.values, y)
    return x, y, model
def test_example_2_standardize(self):
    # Fits Coxnet (l1_ratio=0.9) behind a StandardScaler and compares the
    # alpha path and the rescaled coefficients with stored reference values.
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    example = pandas.read_csv(EXAMPLE_FILE)
    x, y = get_x_y(example, ["status", "time"], pos_label=1)

    expected_alphas = numpy.array([
        0.263066005037211, 0.239695946189997, 0.218402018960187,
        0.198999785536952, 0.18132119305624, 0.165213118007272,
        0.15053603994994, 0.137162833055499, 0.124977665003457,
        0.113874993697428, 0.103758653109983, 0.0945410203385227,
        0.0861422566576188, 0.0784896159941638, 0.0715168148356887,
        0.0651634581142872, 0.0593745160934302, 0.0540998477267133,
        0.0492937663601004, 0.0449146440159821, 0.0409245508315483,
        0.037288926528462, 0.0339762810682614, 0.0309579219007116,
        0.0282077054426604, 0.0257018106348284, 0.0234185326151886,
        0.0213380947218357, 0.0194424771970012, 0.0177152611085322,
        0.0161414861369557, 0.0147075209963485, 0.0134009453666594,
        0.0122104423148384, 0.0111257002729774, 0.0101373237244409,
        0.00923675182439653, 0.00841618424987191, 0.00766851363708906,
        0.00698726402087929, 0.00636653474297097, 0.00580094934331044,
        0.00528560899173708, 0.00481605005665997, 0.00438820544321646,
        0.0039983693660421, 0.00364316525153066, 0.00331951649156886,
        0.0030246197954287,
    ])

    scaler = StandardScaler()
    coxnet = CoxnetSurvivalAnalysis(l1_ratio=0.9)
    pipeline = Pipeline([("standardize", scaler), ("coxnet", coxnet)])
    pipeline.fit(x.values, y)

    assert_array_almost_equal(expected_alphas, coxnet.alphas_)

    expected_coef = pandas.read_csv(EXAMPLE_COEF_FILE.format("2-std"))
    # Divide each coefficient row by the scale the scaler fitted for that
    # feature before comparing with the reference coefficients.
    per_feature_scale = scaler.scale_[:, numpy.newaxis]
    rescaled_coef = pandas.DataFrame(
        coxnet.coef_ / per_feature_scale,
        columns=expected_coef.columns,
        dtype=float,
    )
    assert_columns_almost_equal(rescaled_coef, expected_coef, 5)
def test_get_x_y_classification_no_label():
    """With attr_labels=None and survival=False, y is None and x is unchanged."""
    features = _make_features(100, 10, 0)
    frame = pandas.DataFrame(
        features,
        columns=["V{}".format(i) for i in range(10)],
    )

    x_out, y_out = sdata.get_x_y(frame, None, survival=False)

    assert y_out is None
    assert_array_equal(x_out, features)
def test_get_x_y_survival_no_label():
    """With attr_labels=[None, None] and survival=True, y is None and x is unchanged."""
    features = _make_features(100, 10, 0)
    frame = pandas.DataFrame(
        features,
        columns=["V{}".format(i) for i in range(10)],
    )

    x_out, y_out = sdata.get_x_y(frame, [None, None], pos_label=1, survival=True)

    assert y_out is None
    assert_array_equal(features, x_out)
def test_get_x_y_classification():
    """get_x_y splits a single class-label column off as a two-dimensional y."""
    features, label = _make_classification_data(100, 10, 6, 0)
    feature_names = ["V{}".format(i) for i in range(10)]
    frame = pandas.DataFrame(
        numpy.column_stack((features, label)),
        columns=feature_names + ["class_label"],
    )

    x_out, y_out = sdata.get_x_y(frame, ["class_label"], survival=False)

    assert y_out.ndim == 2
    assert_array_equal(y_out.values.ravel(), label)
    assert_array_equal(x_out, features)
def test_get_x_y_survival():
    """get_x_y returns y with fields ("event", "time") and leaves x unchanged."""
    features, event, time = _make_survival_data(100, 10, 0)
    feature_names = ["V{}".format(i) for i in range(10)]
    frame = pandas.DataFrame(
        numpy.column_stack((features, event, time)),
        columns=feature_names + ["event", "time"],
    )

    x_out, y_out = sdata.get_x_y(frame, ["event", "time"], pos_label=1, survival=True)

    assert y_out.dtype.names == ("event", "time")
    # Compare the event indicator after casting both sides to the same integer type.
    observed_event = y_out["event"].astype(numpy.uint32)
    assert_array_equal(observed_event, event.astype(numpy.uint32))
    assert_array_almost_equal(y_out["time"], time)
    assert_array_equal(features, x_out)
def setUp(self):
    """Load WHAS500_NOTIES_FILE into self.x (encoded features) and self.y.

    The naive survival SVM resolves ties in survival time differently,
    therefore data without ties is used.
    """
    raw = loadarff(WHAS500_NOTIES_FILE)
    features, self.y = get_x_y(raw, ['fstat', 'lenfol'], '1')
    self.x = encode_categorical(features)
def _event_duration_frame(signed_label):
    """Turn a signed label column into a frame with ``event`` and ``duration``.

    A label greater than zero becomes event 1, anything else event 0; the
    duration is the absolute value of the label.
    """
    return pd.DataFrame(
        np.vstack((np.where(signed_label > 0, 1, 0), np.abs(signed_label))).T,
        columns=["event", "duration"],
    )


def main(args):
    """Run the evaluation for the data set.

    1. Loads the model from model.tar.gz
    2. Reads in the train and test features
    3. Writes an evaluation report (classification metrics, IPCW concordance
       index and Brier score) as JSON
    4. Saves the SHAP values as CSV and a SHAP summary plot as PNG

    Args:
        args: Parsed command-line arguments. The attributes read here are:
            model_name (str): Name of the trained model file, default xgboost
            test_features (str): Preprocessed test features for evaluation,
                default test_features.csv
            train_features (str): Preprocessed train features for SHAP,
                default train_features.csv
            report_name (str): Name of the evaluation output,
                default evaluation.json
            shap_name (str): Name of the SHAP feature importance output file,
                default shap.csv
            threshold (float): Threshold to cut probabilities at, default 0.5
            tau (int): Time range for the c-index will be from 0 to tau,
                default 100
    """
    model_path = os.path.join("/opt/ml/processing/model", "model.tar.gz")
    logger.info(f"Extracting model from path: {model_path}")
    # NOTE(review): extractall() writes whatever member paths the archive
    # contains; only use this with archives from a trusted source.
    with tarfile.open(model_path) as tar:
        tar.extractall(path=".")

    logger.info("Loading model")
    # NOTE(review): pickle.load can execute arbitrary code; the model file
    # must come from a trusted training job.
    with open(args.model_name, "rb") as f:
        model = pickle.load(f)

    logger.info("Loading train and test data")
    test_features_data = os.path.join("/opt/ml/processing/test", args.test_features)
    train_features_data = os.path.join("/opt/ml/processing/train", args.train_features)

    X_test = pd.read_csv(test_features_data, header=0)
    X_train = pd.read_csv(train_features_data, header=0)

    # The first column of each file holds the label.
    y_test = X_test.iloc[:, 0]
    y_train = X_train.iloc[:, 0]

    # Reverse transform to event and duration columns
    y_test_df = _event_duration_frame(y_test)
    y_train_df = _event_duration_frame(y_train)

    # Drop the label column from each frame using that frame's OWN first
    # column. Looking the name up on X_test after it was already modified
    # would remove a feature from X_train and leave its label in place.
    X_test.drop(X_test.columns[0], axis=1, inplace=True)
    X_train.drop(X_train.columns[0], axis=1, inplace=True)

    logger.info("Running inference")
    # NOTE(review): the [:, 1:] slice skips one more column after the label
    # was dropped above - confirm this matches the columns used in training.
    predictions = model.predict(xgboost.DMatrix(X_test.values[:, 1:]), output_margin=False)

    logger.info("Creating evaluation report")

    # NOTE: technical evaluation is really not as a classifier
    # TO DO: Normalize to 0 to 1 scale
    predicted_event = predictions > args.threshold
    report_dict = classification_report(y_test_df["event"], predicted_event, output_dict=True)
    report_dict["accuracy"] = accuracy_score(y_test_df["event"], predicted_event)

    _, y_train_tuple = get_x_y(y_train_df, ["event", "duration"], pos_label=True)
    _, y_test_tuple = get_x_y(y_test_df, ["event", "duration"], pos_label=True)

    concordance_index = concordance_index_ipcw(
        y_train_tuple,
        y_test_tuple,
        predictions,
        tau=args.tau,  # default within 100 days
    )

    report_dict["concordance_index"] = {
        "cindex": float(concordance_index[0]),
        "concordant": int(concordance_index[1]),
        "discordant": int(concordance_index[2]),
        "tied_risk": int(concordance_index[3]),
        "tied_time": int(concordance_index[4]),
    }

    times, score = brier_score(y_train_tuple, y_test_tuple, predictions, y_test_df["duration"].max() - 1)

    report_dict["brier_score"] = {
        "times": times.astype(np.int32).tolist(),
        "score": score.astype(np.float32).tolist(),
    }

    logger.info(f"Classification report:\n{report_dict}")

    evaluation_output_path = os.path.join("/opt/ml/processing/evaluation", args.report_name)
    logger.info(f"Saving classification report to {evaluation_output_path}")

    logger.debug(report_dict)

    with open(evaluation_output_path, "w") as f:
        f.write(json.dumps(report_dict))

    # SHAP
    latest_job_debugger_artifacts_path = "/opt/ml/processing/debug/debug-output"
    trial = create_trial(latest_job_debugger_artifacts_path)

    shap_values = trial.tensor("full_shap/f0").value(trial.last_complete_step)

    pd.DataFrame(shap_values).to_csv(
        os.path.join("/opt/ml/processing/evaluation", args.shap_name))

    # NOTE(review): drops the first row and the last column of the SHAP
    # tensor; presumably the last column is the base value - confirm.
    shap_no_base = shap_values[1:, :-1]
    feature_names = X_train.columns
    os.makedirs("/opt/ml/processing/plot/", exist_ok=True)
    # The first argument of logger.info is the format string; the shapes are
    # passed as arguments so they are actually rendered.
    logger.info(
        "SHAP shapes: values=%s, no_base=%s, X_train=%s",
        shap_values.shape,
        shap_no_base.shape,
        X_train.shape,
    )
    shap.summary_plot(shap_no_base, features=X_train, feature_names=feature_names, show=False)
    plt.savefig("/opt/ml/processing/plot/feature_importance.png", bbox_inches="tight")
# Build one named feature row per lab and assemble the matrix in one step.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration.
feature_rows = [
    pd.Series(
        data=makeFullFeatureVector(im, most_variant_genes, inhibitors, lab_id),
        name=lab_id,
    )
    for lab_id in lab_ids
]
feature_matrix = pd.DataFrame(feature_rows)


# In[ ]:


# Standardize every feature column; columns whose normalization yields NaN
# (e.g. zero standard deviation) are set to 0.0.
feature_means = feature_matrix.mean()
feature_stds = feature_matrix.std()
normed_features = (feature_matrix - feature_means) / feature_stds
normed_features = normed_features.fillna(0.0)


# In[ ]:


from sksurv.datasets import get_x_y

# Join the response table with the normalized features on lab_id and split
# it into features X and survival outcome Y ('Dead' marks an event).
full_dataset = pd.read_csv('training/response.csv').set_index('lab_id').join(
    normed_features)
X, Y = get_x_y(full_dataset, ['vitalStatus', 'overallSurvival'], pos_label='Dead')


# In[ ]:


from sksurv.linear_model import CoxnetSurvivalAnalysis
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

# This package allows general elastic net tuning, but by setting
# l1_ratio=1, we restrict to LASSO.
# max_iter is an iteration count, so it is given as an integer rather than
# the float 3e5.
regr = CoxnetSurvivalAnalysis(l1_ratio=1, alpha_min_ratio=0.05, max_iter=300000)

# NOTE(review): n_folds is not used below; the cross-validation actually
# runs with n_splits=5 - confirm which value is intended.
n_folds = 10
alphas = np.logspace(-1.3, 0, num=100)
cv = KFold(n_splits=5, shuffle=True, random_state=328)
# Each grid candidate is a single-element alpha list.
gcv = GridSearchCV(regr, {"alphas": [[v] for v in alphas]}, cv=cv).fit(X, Y)


#In[ ]:
# Keep only rows with a duration above 12.
df = df[df['Duration'] > 12]

# Explicit copy: new columns are added to df2 below, which must not write
# into (or warn about) a selection of df.
df2 = df.loc[:, [
    'DISTRIBUTION CHANNEL', 'GENDER', 'SMOKER STATUS', 'AGE AT DOC',
    'PremiumPattern', 'BENEFITS TYPE', 'BROKER COMM', 'DEBITORDERPERIOD',
    'PREM % EARNINGS BAND'
]].copy()

T = df['Duration']
# Vectorized comparison instead of a per-element lambda: True where the
# lapse indicator equals 1.
E = df['LapseIndicator'] == 1

df2['E'] = E
df2['T'] = T

X, y = get_x_y(df2, ['E', 'T'], pos_label=True)

# Every column except 'AGE AT DOC' is treated as categorical before
# one-hot encoding.
categorical_columns = [c for c in X.columns.values if c != 'AGE AT DOC']
X = X.astype({c: 'category' for c in categorical_columns})

data_x_numeric = OneHotEncoder().fit_transform(X)
#%%

estimator = CoxnetSurvivalAnalysis(verbose=True)
estimator.fit(data_x_numeric, y)

#%%
print(estimator.score(data_x_numeric, y))
print()
'BENEFITS TYPE', 'BROKER COMM', 'DEBITORDERPERIOD', 'PREM % EARNINGS BAND' ]] mca = prince.MCA(df2, n_components=8) df2 = mca.row_component_contributions T = df['Duration'] E = df['LapseIndicator'].apply(lambda x: True if x == 1 else False) df2['E'] = E df2['T'] = T data_x_numeric, y = get_x_y(df2, ['E', 'T'], pos_label=True) #%% estimator = CoxPHSurvivalAnalysis(verbose=True, n_iter=10000) estimator.fit(data_x_numeric, y) #%% print() print(pd.Series(estimator.coef_, index=data_x_numeric.columns)) print() print(estimator.score(data_x_numeric, y)) print() scores = fit_and_score_features(data_x_numeric.values, y) print(