def subsample_performance(
        X,
        Y,
        max_ic50,
        model_fn=None,
        fractions=np.arange(0.01, 1, 0.03),
        niters=10,
        fraction_test=0.2,
        nb_epoch=50,
        batch_size=32):
    """Measure AUC and F1 on a held-out split as the training set shrinks.

    For each of ``niters`` random train/test splits, a model is fit on
    progressively larger prefixes of a shuffled training set (one prefix per
    entry in ``fractions``) and scored on the same held-out test set.

    Binary labels are derived by mapping a value ``y`` to
    ``max_ic50 ** (1 - y)`` and thresholding at ``<= 500``. The raw
    predictions are used as scores for the AUC; the thresholded predictions
    are used for F1.

    Parameters
    ----------
    X, Y
        Features and targets. Both are indexed with a boolean mask and with
        an integer index array, so they must support NumPy-style indexing.
    max_ic50
        Base of the exponential used to map targets/predictions before the
        ``<= 500`` threshold is applied.
    model_fn
        Zero-argument callable returning a model that provides
        ``get_weights``, ``set_weights``, ``fit`` and ``predict``. When
        ``None``, a ``LinearRegression()`` is used instead.
    fractions
        Iterable of fractions of the training split to fit on.
    niters
        Number of independent random train/test splits.
    fraction_test
        Each sample goes to the training split when an independent uniform
        draw exceeds this value, so the test share is only approximately
        ``fraction_test``.
    nb_epoch, batch_size
        Forwarded to ``model.fit`` only when ``model_fn`` is given.

    Returns
    -------
    tuple of (list, list, list)
        Parallel lists of training-subset sizes, AUC scores and F1 scores,
        with one entry per (iteration, fraction) pair in evaluation order.

    Notes
    -----
    One progress line is printed per evaluation.
    """
    use_default_model = model_fn is None
    n_samples = len(Y)
    subset_sizes = []
    auc_scores = []
    f1_scores = []

    for _ in range(niters):
        # The model is built before any random draw below, matching the
        # order in which the global NumPy RNG may be consumed.
        if use_default_model:
            model = LinearRegression()
        else:
            model = model_fn()
            # Snapshot so every subset size starts from the same weights.
            starting_weights = model.get_weights()

        in_train = np.random.rand(n_samples) > fraction_test
        in_test = ~in_train
        X_train, X_test = X[in_train], X[in_test]
        Y_train, Y_test = Y[in_train], Y[in_test]

        n_train = len(Y_train)
        shuffled = np.arange(n_train)
        np.random.shuffle(shuffled)

        for fraction in fractions:
            # NOTE: int() truncates, so a small training split combined with
            # a small fraction yields an empty subset (n_fraction == 0).
            n_fraction = int(n_train * fraction)
            chosen = shuffled[:n_fraction]
            X_subset, Y_subset = X_train[chosen], Y_train[chosen]

            if use_default_model:
                model.fit(X_subset, Y_subset)
            else:
                # Reset so training on this subset is independent of the
                # previous, smaller subset.
                model.set_weights(starting_weights)
                model.fit(
                    X_subset,
                    Y_subset,
                    verbose=0,
                    nb_epoch=nb_epoch,
                    batch_size=batch_size)

            pred = model.predict(X_test)

            true_label = max_ic50 ** (1 - Y_test) <= 500
            auc = sklearn.metrics.roc_auc_score(true_label, pred)

            pred_label = max_ic50 ** (1 - pred) <= 500
            f1 = sklearn.metrics.f1_score(true_label, pred_label)

            print("Fraction=%0.2f, n=%d, AUC=%0.4f, F1=%0.4f" % (fraction, n_fraction, auc, f1))

            subset_sizes.append(n_fraction)
            auc_scores.append(auc)
            f1_scores.append(f1)

    return subset_sizes, auc_scores, f1_scores