def evaluate_model_config_train_vs_test(
        config,
        training_allele_datasets,
        testing_allele_datasets,
        min_samples_per_allele=5):
    """
    Pre-train one model on all training alleles pooled together, then
    fine-tune it per allele and score it on that allele's test data.

    Parameters
    ----------
    config
        Model configuration. This function reads `embedding_size`,
        `max_ic50`, `n_pretrain_epochs`, `n_epochs` and `minibatch_size`,
        and passes the whole object to `make_model`.

    training_allele_datasets
        Mapping from allele name to a dataset exposing `X` and `Y`.

    testing_allele_datasets
        Mapping from allele name to a dataset exposing `Y`; encoded via
        `encode_allele_datasets` for prediction and scoring.

    min_samples_per_allele
        Forwarded to `filter_alleles` when choosing which training alleles
        to evaluate.

    Returns the result of `ScoreCollection.dataframe()`, populated with one
    AUC / accuracy / F1 entry for every allele that was evaluated. Alleles
    missing from the test data, or whose test affinities fall entirely on one
    side of the 500nM threshold, are skipped.
    """
    # an embedding size of zero selects the binary encoding
    binary_encoding = config.embedding_size == 0

    print("=== Training Alleles ===")
    for (allele_name, dataset) in sorted(training_allele_datasets.items()):
        print("%s: count = %d" % (allele_name, len(dataset.Y)))

    print("=== Testing Alleles ===")
    for (allele_name, dataset) in sorted(testing_allele_datasets.items()):
        print(" %s: count = %d" % (allele_name, len(dataset.Y)))

    X_train_dict, Y_train_dict, _ = encode_allele_datasets(
        allele_datasets=training_allele_datasets,
        max_ic50=config.max_ic50,
        binary_encoding=binary_encoding)

    X_test_dict, Y_test_dict, ic50_test_dict = encode_allele_datasets(
        allele_datasets=testing_allele_datasets,
        max_ic50=config.max_ic50,
        binary_encoding=binary_encoding)

    # Stack X and Y through one shared key list: np.vstack needs a real
    # sequence (not a dict view), and sharing the keys guarantees that the
    # rows of X and Y stay aligned allele by allele.
    training_allele_names = list(X_train_dict)
    X_train_combined = np.vstack(
        [X_train_dict[name] for name in training_allele_names])
    Y_train_combined = np.concatenate(
        [Y_train_dict[name] for name in training_allele_names])

    model = make_model(config)
    model.fit(
        X_train_combined,
        Y_train_combined,
        nb_epoch=config.n_pretrain_epochs,
        batch_size=config.minibatch_size,
        verbose=1)

    scores = ScoreCollection()

    # Snapshot taken *after* pre-training, so every allele below is
    # fine-tuned starting from the same pre-trained weights.
    initial_weights = [w.copy() for w in model.get_weights()]

    for allele_name, training_dataset in filter_alleles(
            training_allele_datasets,
            min_samples_per_allele=min_samples_per_allele):
        if allele_name not in X_test_dict:
            print("Skipping %s, missing from test datasets" % allele_name)
            continue

        X_test_allele = X_test_dict[allele_name]
        ic50_test_allele = ic50_test_dict[allele_name]

        # regression target: 1 - log(ic50) / log(max_ic50), floored at 0
        true_log_ic50 = np.log(ic50_test_allele) / np.log(config.max_ic50)
        true_log_ic50 = np.maximum(0, 1.0 - true_log_ic50)

        # classification metrics need both classes present in the test set
        true_label = ic50_test_allele <= 500
        if true_label.all():
            print("Skipping %s since all affinities are <= 500nM" % allele_name)
            continue
        elif not true_label.any():
            print("Skipping %s since all affinities are > 500nM" % allele_name)
            continue

        model.set_weights(initial_weights)
        # NOTE(review): fine-tuning uses the dataset's own X/Y whereas
        # pre-training used the output of encode_allele_datasets; confirm
        # these agree when binary_encoding is enabled.
        model.fit(
            training_dataset.X,
            training_dataset.Y,
            nb_epoch=config.n_epochs,
            batch_size=config.minibatch_size,
            verbose=0)
        pred = model.predict(X_test_allele).flatten()
        accuracy, auc, f1_score = score_predictions(
            predicted_log_ic50=pred,
            true_log_ic50=true_log_ic50,
            max_ic50=config.max_ic50)
        print("-- %s accuracy=%0.4f AUC = %0.4f F1 = %0.4f" % (
            allele_name, accuracy, auc, f1_score))
        scores.add(
            allele_name,
            auc=[auc],
            accuracy=[accuracy],
            f1=[f1_score])
    return scores.dataframe()
def leave_out_allele_cross_validation(
        model,
        allele_datasets,
        max_ic50,
        binary_encoding=False,
        n_pretrain_epochs=0,
        n_training_epochs=100,
        min_samples_per_allele=5,
        cv_folds=5,
        minibatch_size=128):
    """
    Run k-fold cross-validation for every allele in the dataset, optionally
    pre-training the model on all *other* alleles first.

    Parameters
    ----------
    model
        Model exposing `get_weights`, `set_weights` and `fit`. Its weights at
        call time are restored before each allele is processed.

    allele_datasets
        Collection of per-allele datasets, passed to
        `encode_allele_datasets` and `filter_alleles`.

    max_ic50, binary_encoding
        Forwarded to `encode_allele_datasets`; `max_ic50` is also forwarded
        to the per-allele cross-validation.

    n_pretrain_epochs
        When > 0, the model is first fit for this many epochs on the data of
        every allele other than the one being evaluated.

    n_training_epochs, cv_folds, minibatch_size
        Forwarded to `kfold_cross_validation_for_single_allele`
        (`minibatch_size` is also used for pre-training).

    min_samples_per_allele
        Forwarded to `filter_alleles`.

    Returns the result of `ScoreCollection.dataframe()`. As originally
    documented, this is a DataFrame with the following columns:
        allele_name
        dataset_size
        auc_mean
        auc_median
        auc_std
        auc_min
        auc_max
        accuracy_mean
        accuracy_median
        accuracy_std
        accuracy_min
        accuracy_max
        f1_mean
        f1_median
        f1_std
        f1_min
        f1_max
    Alleles for which cross-validation yields no AUC values are omitted.
    """
    scores = ScoreCollection()
    X_dict, Y_log_ic50_dict, ic50_dict = encode_allele_datasets(
        allele_datasets=allele_datasets,
        max_ic50=max_ic50,
        binary_encoding=binary_encoding)
    initial_weights = [w.copy() for w in model.get_weights()]
    for allele_name, _ in filter_alleles(
            allele_datasets,
            min_samples_per_allele=min_samples_per_allele):
        # start every allele from the same weights the caller handed us
        model.set_weights(initial_weights)
        X_allele = X_dict[allele_name]
        Y_allele = Y_log_ic50_dict[allele_name]
        ic50_allele = ic50_dict[allele_name]
        if n_pretrain_epochs > 0:
            # One shared key list keeps the X and Y rows aligned.
            other_alleles = [
                other_allele
                for other_allele in X_dict
                if normalize_allele_name(other_allele) != allele_name
            ]
            if other_alleles:
                X_other_alleles = np.vstack(
                    [X_dict[other_allele] for other_allele in other_alleles])
                Y_other_alleles = np.concatenate(
                    [Y_log_ic50_dict[other_allele]
                     for other_allele in other_alleles])
                print("Pre-training X shape: %s" % (X_other_alleles.shape,))
                print("Pre-training Y shape: %s" % (Y_other_alleles.shape,))
                model.fit(
                    X_other_alleles,
                    Y_other_alleles,
                    nb_epoch=n_pretrain_epochs,
                    batch_size=minibatch_size,
                    verbose=0)
            else:
                # np.vstack / np.concatenate raise ValueError on an empty
                # list, so with nothing to pre-train on we just move on.
                print(
                    "Skipping pre-training for %s, no other alleles available"
                    % allele_name)
        print("Cross-validation for %s (%d):" % (allele_name, len(Y_allele)))
        aucs, accuracies, f1_scores = kfold_cross_validation_for_single_allele(
            allele_name=allele_name,
            model=model,
            X=X_allele,
            Y=Y_allele,
            ic50=ic50_allele,
            n_training_epochs=n_training_epochs,
            cv_folds=cv_folds,
            max_ic50=max_ic50,
            minibatch_size=minibatch_size)
        if len(aucs) == 0:
            print("Skipping allele %s" % allele_name)
            continue
        scores.add(allele_name, auc=aucs, accuracy=accuracies, f1=f1_scores)
    return scores.dataframe()