def test_dataset_intersection():
    """Intersecting two datasets keeps only the shared peptide.

    The expected affinity (10.0) is the one stored in the dataset that
    `intersection` is called on, not the 30.0 stored in the argument.
    """
    allele = "H-2-Kb"
    left = Dataset.from_nested_dictionary({
        allele: {
            "SIINFEKL": 10.0,
            "FEKLSIIN": 20000.0,
            "SIFEKLIN": 50000.0,
        },
    })
    right = Dataset.from_nested_dictionary({allele: {"SIINFEKL": 30.0}})

    actual = left.intersection(right)

    expected = Dataset.from_nested_dictionary({allele: {"SIINFEKL": 10.0}})
    eq_(actual, expected)
def test_dataset_difference():
    """Subtracting a dataset removes the entry present in both datasets."""
    allele = "H-2-Kb"
    minuend = Dataset.from_nested_dictionary({
        allele: {
            "SIINFEKL": 10.0,
            "FEKLSIIN": 20000.0,
            "SIFEKLIN": 50000.0,
        },
    })
    subtrahend = Dataset.from_nested_dictionary({allele: {"SIINFEKL": 10.0}})

    actual = minuend.difference(subtrahend)

    # Only the two peptides absent from the subtracted dataset should remain.
    expected = Dataset.from_nested_dictionary({
        allele: {
            "FEKLSIIN": 20000.0,
            "SIFEKLIN": 50000.0,
        },
    })
    eq_(actual, expected)
def test_class1_binding_predictor_A0205_training_accuracy():
    """Fit a predictor on the 9-mer HLA-A0205 data and check its training fit.

    The predictor is trained for 1000 epochs and then asked to predict the
    same peptides it was trained on. The log-transformed predictions must
    match the log-transformed training affinities within rtol=0.2, atol=0.2.
    """
    dataset = Dataset.from_csv(get_path(
        "data_combined_iedb_kim2014", "combined_human_class1_dataset.csv"))
    dataset_a0205_all_lengths = dataset.get_allele("HLA-A0205")

    # Keep only the 9-mers. `DataFrame.ix` was removed in pandas 1.0; a
    # boolean row mask selects the same rows through `.loc`.
    all_lengths_df = dataset_a0205_all_lengths._df
    is_9mer = all_lengths_df.peptide.str.len() == 9
    dataset_a0205 = Dataset(all_lengths_df.loc[is_9mer])

    predictor = Class1BindingPredictor(
        name="A0205",
        embedding_output_dim=32,
        activation="tanh",
        layer_sizes=[64],
        optimizer="adam",
        dropout_probability=0.0)
    predictor.fit_dataset(dataset_a0205, n_training_epochs=1000)

    peptides = dataset_a0205.peptides
    ic50_pred = predictor.predict(peptides)
    ic50_true = dataset_a0205.affinities
    eq_(len(ic50_pred), len(ic50_true))
    # Compare on a log scale: the tolerances apply to log(IC50) values.
    testing.assert_allclose(
        np.log(ic50_pred),
        np.log(ic50_true),
        rtol=0.2,
        atol=0.2)
def class1_binding_predictor_A0205_training_accuracy():
    """Fit a default-hyperparameter predictor on HLA-A0205 and compare its
    predictions on the training peptides with the training affinities.

    NOTE(review): the name lacks a ``test_`` prefix, so prefix-based test
    collection presumably skips this function -- confirm that is intended.
    """
    a0205 = Dataset.from_csv(CLASS1_DATA_CSV_PATH).get_allele("HLA-A0205")

    model = Class1BindingPredictor.from_hyperparameters(name="A0205")
    model.fit_dataset(a0205)

    predicted = model.predict(a0205.peptides)
    observed = a0205.affinities
    eq_(len(predicted), len(observed))
    assert np.allclose(predicted, observed)
def test_dataset_random_split():
    """`random_split(n=2)` on three entries yields parts of size 2 and 1."""
    affinities = {
        "SIINFEKL": 10.0,
        "FEKLSIIN": 20000.0,
        "SIFEKLIN": 50000.0,
    }
    three_entries = Dataset.from_nested_dictionary({"H-2-Kb": affinities})

    first_part, second_part = three_entries.random_split(n=2)

    assert len(first_part) == 2
    assert len(second_part) == 1
def test_class1_binding_predictor_A0205_training_accuracy():
    """Fit a predictor on the 9-mer HLA-A0205 data and check its training fit.

    The predictor is trained for 1000 epochs and then asked to predict the
    same peptides it was trained on. The log-transformed predictions must
    match the log-transformed training affinities within rtol=0.2, atol=0.2.
    """
    dataset = Dataset.from_csv(
        get_path("data_combined_iedb_kim2014",
                 "combined_human_class1_dataset.csv"))
    dataset_a0205_all_lengths = dataset.get_allele("HLA-A0205")

    # Keep only the 9-mers. `DataFrame.ix` was removed in pandas 1.0; a
    # boolean row mask selects the same rows through `.loc`.
    all_lengths_df = dataset_a0205_all_lengths._df
    is_9mer = all_lengths_df.peptide.str.len() == 9
    dataset_a0205 = Dataset(all_lengths_df.loc[is_9mer])

    predictor = Class1BindingPredictor(
        name="A0205",
        embedding_output_dim=32,
        activation="tanh",
        layer_sizes=[64],
        optimizer="adam",
        dropout_probability=0.0)
    predictor.fit_dataset(dataset_a0205, n_training_epochs=1000)

    peptides = dataset_a0205.peptides
    ic50_pred = predictor.predict(peptides)
    ic50_true = dataset_a0205.affinities
    eq_(len(ic50_pred), len(ic50_true))
    # Compare on a log scale: the tolerances apply to log(IC50) values.
    testing.assert_allclose(
        np.log(ic50_pred),
        np.log(ic50_true),
        rtol=0.2,
        atol=0.2)
def test_create_imputed_datasets_two_alleles():
    """After imputation both alleles should cover all three peptides.

    Each allele starts with two of the three peptides; the imputed dataset
    must keep both alleles and list every peptide under each of them.
    """
    poly_a, poly_c, poly_s = "A" * 9, "C" * 9, "S" * 9
    observed = Dataset.from_nested_dictionary({
        "HLA-A*02:01": {
            poly_a: 20.0,
            poly_c: 40000.0,
        },
        "HLA-A*02:05": {
            poly_s: 500.0,
            poly_a: 25.0,
        },
    })

    imputed = observed.impute_missing_values(MICE(n_imputations=25))

    eq_(imputed.unique_alleles(), {"HLA-A*02:01", "HLA-A*02:05"})
    all_peptides = {poly_a, poly_c, poly_s}
    for _, per_allele in imputed.groupby_allele():
        eq_(set(per_allele.peptides), all_peptides)
def test_create_imputed_datasets_two_alleles():
    """After imputation both alleles should cover all three peptides.

    Each allele starts with two of the three peptides; the imputed dataset
    must keep both alleles and list every peptide under each of them.
    """
    poly_a, poly_c, poly_s = "A" * 9, "C" * 9, "S" * 9
    observed = Dataset.from_nested_dictionary({
        "HLA-A*02:01": {
            poly_a: 20.0,
            poly_c: 40000.0,
        },
        "HLA-A*02:05": {
            poly_s: 500.0,
            poly_a: 25.0,
        },
    })

    imputed = observed.impute_missing_values(MICE(n_imputations=25))

    eq_(imputed.unique_alleles(), {"HLA-A*02:01", "HLA-A*02:05"})
    all_peptides = {poly_a, poly_c, poly_s}
    for _, per_allele in imputed.groupby_allele():
        eq_(set(per_allele.peptides), all_peptides)
def test_create_allele_data_from_single_allele_dict():
    """Build a Dataset from one allele's peptide->IC50 dict and check it.

    Checks the returned type, the number of entries, and that both
    `peptides` and `unique_peptides()` contain exactly the input peptides.
    """
    peptide_to_ic50_dict = {
        ("A" * 10): 1.2,
        ("C" * 9): 1000,
    }
    dataset = Dataset.from_single_allele_dictionary(
        allele_name="A0201",
        peptide_to_affinity_dict=peptide_to_ic50_dict)
    assert isinstance(dataset, Dataset)
    eq_(len(peptide_to_ic50_dict), len(dataset))

    expected_peptides = sorted({"A" * 10, "C" * 9})
    # Compare whole sorted lists instead of zipping pairs: zip() stops at
    # the shorter sequence, so a missing peptide would go unnoticed.
    eq_(expected_peptides, sorted(dataset.peptides))
    eq_(expected_peptides, sorted(dataset.unique_peptides()))
def test_dataset_cross_validation():
    """Two-fold cross-validation over one test allele.

    Every fold must hold out exactly one HLA-A*02:01 entry for testing,
    keep both alleles in the training split, and exactly two folds must be
    produced.
    """
    test_allele = "HLA-A*02:01"
    dataset = Dataset.from_nested_dictionary({
        "H-2-Kb": {
            "SIINFEKL": 10.0,
            "FEKLSIIN": 20000.0,
            "SIFEKLIN": 50000.0,
        },
        test_allele: {
            "ASASAS": 1.0,
            "CCC": 0.0,
        },
    })

    folds = dataset.cross_validation_iterator(
        test_allele="HLA-A*02:01", n_folds=2)
    n_folds_seen = 0
    for n_folds_seen, (train_part, test_part) in enumerate(folds, start=1):
        assert train_part.unique_alleles() == {"H-2-Kb", "HLA-A*02:01"}
        assert test_part.unique_alleles() == {"HLA-A*02:01"}
        assert len(test_part) == 1
    assert n_folds_seen == 2
orientation="columns", verbose=verbose, print_interval=knn_print_interval) return result_dict if __name__ == "__main__": args = parser.parse_args() print(args) imputation_methods = create_imputation_methods( verbose=args.verbose, clip_imputed_values=not (args.normalize_rows or args.normalize_rows), ) print("Imputation methods: %s" % imputation_methods) dataset = Dataset.from_csv(args.binding_data_csv) X, peptide_list, allele_list = dataset.to_dense_pMHC_affinity_matrix( min_observations_per_allele=args.n_folds, min_observations_per_peptide=args.min_observations_per_peptide) observed_mask = np.isfinite(X) print("Loaded binding data, shape: %s, n_observed=%d/%d (%0.2f%%)" % ( X.shape, observed_mask.sum(), X.size, 100.0 * observed_mask.sum() / X.size)) if args.save_incomplete_affinity_matrix: print("Saving incomplete data to %s" % args.save_incomplete_affinity_matrix) df = pd.DataFrame(X, columns=allele_list, index=peptide_list) df.to_csv(args.save_incomplete_affinity_matrix, index_label="peptide") scores = ScoreSet()
def load_csv(filename):
    """Return the Dataset parsed from the CSV at `data_path(filename)`."""
    csv_path = data_path(filename)
    return Dataset.from_csv(csv_path)
def test_performance_improves_for_A0205_with_pretraining():
    """Check that pretraining on imputed data improves HLA-A0205 predictions.

    One predictor is fit on the observed A0205 data only; a second one is
    fit on the same data but also receives MICE-imputed entries as
    pretraining input. Both are scored on the observed A0205 data, and the
    pretrained one must have a lower weighted MSE and a higher weighted
    accuracy at the IC50 <= 500nM threshold.

    NOTE(review): the predictor without imputation is trained for 10 epochs
    while the one with imputation is trained for 200, so the comparison is
    not epoch-matched -- confirm this is intended.
    """
    def weighted_mean(values, weights):
        # Average of `values` where each element counts by its weight.
        return (values * weights).sum() / weights.sum()

    full_dataset = Dataset.from_csv(CLASS1_DATA_CSV_PATH)
    print("Full dataset: %d pMHC entries" % len(full_dataset))

    # Restrict to the four alleles listed here.
    limited_alleles = ["HLA-A0205", "HLA-A0201", "HLA-A0101", "HLA-B0702"]
    dataset = full_dataset.get_alleles(limited_alleles)
    print("After filtering to %s, # entries: %d" % (
        limited_alleles, len(dataset)))

    a0205_observed = dataset.get_allele("HLA-A0205")
    print("Dataset with only A0205, # entries: %d" % len(a0205_observed))

    # --- baseline: observed data only ---
    plain_predictor = Class1BindingPredictor.from_hyperparameters(
        name="A0205-no-impute")
    X_index, ic50_true, sample_weights, _ = \
        a0205_observed.kmer_index_encoding()
    assert sample_weights.min() >= 0, sample_weights.min()
    assert sample_weights.max() <= 1, sample_weights.max()
    assert ic50_true.min() >= 0, ic50_true.min()

    plain_predictor.fit_kmer_encoded_arrays(
        X=X_index,
        ic50=ic50_true,
        sample_weights=sample_weights,
        n_training_epochs=10)
    ic50_pred_plain = \
        plain_predictor.predict_ic50_for_kmer_encoded_array(X_index)

    observed_is_binder = ic50_true <= 500
    mse_without_imputation = weighted_mean(
        (ic50_true - ic50_pred_plain) ** 2, sample_weights)
    accuracy_without_imputation = weighted_mean(
        observed_is_binder == (ic50_pred_plain <= 500), sample_weights)

    # --- same model, pretrained on imputed data ---
    imputed_dataset = dataset.impute_missing_values(MICE(n_imputations=25))
    print("After imputation, dataset for %s has %d entries" % (
        limited_alleles, len(imputed_dataset)))
    a0205_imputed = imputed_dataset.get_allele("HLA-A0205")
    print("Limited to just A0205, # entries: %d" % (len(a0205_imputed)))

    X_index_imputed, ic50_imputed, sample_weights_imputed, _ = \
        a0205_imputed.kmer_index_encoding()
    assert sample_weights_imputed.min() >= 0, sample_weights_imputed.min()
    assert sample_weights_imputed.max() <= 1, sample_weights_imputed.max()
    assert ic50_imputed.min() >= 0, ic50_imputed.min()

    pretrained_predictor = Class1BindingPredictor.from_hyperparameters(
        name="A0205-impute")
    pretrained_predictor.fit_kmer_encoded_arrays(
        X=X_index,
        ic50=ic50_true,
        sample_weights=sample_weights,
        X_pretrain=X_index_imputed,
        ic50_pretrain=ic50_imputed,
        sample_weights_pretrain=sample_weights_imputed,
        n_training_epochs=200)
    ic50_pred_pretrained = \
        pretrained_predictor.predict_ic50_for_kmer_encoded_array(X_index)

    mse_with_imputation = weighted_mean(
        (ic50_true - ic50_pred_pretrained) ** 2, sample_weights)
    accuracy_with_imputation = weighted_mean(
        observed_is_binder == (ic50_pred_pretrained <= 500), sample_weights)

    # --- compare the two predictors ---
    print("RMS w/out imputation: %f" % (np.sqrt(mse_without_imputation),))
    print("RMS w/ imputation: %f" % (np.sqrt(mse_with_imputation),))
    assert mse_with_imputation < mse_without_imputation, \
        "Expected MSE with imputation (%f) to be less than (%f) without imputation" % (
            mse_with_imputation, mse_without_imputation)

    print("IC50 <= 500nM accuracy w/out imputation: %f" % (
        accuracy_without_imputation,))
    print("IC50 <= 500nM accuracy w/ imputation: %f" % (
        accuracy_with_imputation,))
    assert accuracy_with_imputation > accuracy_without_imputation
base_filename = \ ("%s-vs-nsamples-hidden-%s-activation-%s" "-impute-%s-epochs-%d-embedding-%d-pretrain-%s") % ( args.allele, args.hidden_layer_size, args.activation, args.imputation_method, args.training_epochs, args.embedding_size, args.pretraining_weight_decay) csv_filename = base_filename + ".csv" if args.load_existing_data: results_df = pd.read_csv(csv_filename) else: dataset = Dataset.from_csv(args.training_csv) imputer = imputer_from_args(args) def make_model(): return predictor_from_args(allele_name=args.allele, args=args) if args.pretraining_weight_decay == "exponential": def pretrain_weight_decay_fn(t): return np.exp(-t) elif args.pretraining_weight_decay == "quadratic": def pretrain_weight_decay_fn(t): return 1.0 / (t + 1) ** 2.0 elif args.pretraining_weight_decay == "linear": def pretrain_weight_decay_fn(t):
default=[], nargs="+", type=normalize_allele_name) # add options for neural network hyperparameters parser = add_arguments_to_parser(parser) if __name__ == "__main__": args = parser.parse_args() print(args) if not exists(args.output_dir): makedirs(args.output_dir) dataset = Dataset.from_csv( filename=args.binding_data_csv, sep=",", peptide_column_name="peptide") # if user didn't specify alleles then train models for all available alleles alleles = args.alleles if not alleles: alleles = list(sorted(dataset.unique_alleles())) else: dataset = dataset.get_alleles(alleles) imputer = imputer_from_args(args) if imputer is None: imputed_dataset = Dataset.create_empty() else:
def test_create_imputed_datasets_empty():
    """Imputing an empty dataset yields a dataset equal to the empty one."""
    nothing = Dataset.create_empty()
    imputer = MICE(n_imputations=25)
    imputed = nothing.impute_missing_values(imputer)
    eq_(imputed, nothing)
def load_csv(filename):
    """Load a Dataset from `filename` inside the ``data`` directory that
    sits next to this module's resolved file path."""
    module_dir = dirname(realpath(__file__))
    return Dataset.from_csv(join(module_dir, "data", filename))
def test_create_imputed_datasets_empty():
    """Imputing an empty dataset yields a dataset equal to the empty one."""
    nothing = Dataset.create_empty()
    imputer = MICE(n_imputations=25)
    imputed = nothing.impute_missing_values(imputer)
    eq_(imputed, nothing)
def test_performance_improves_for_A0205_with_pretraining():
    """Check that pretraining on imputed data improves HLA-A0205 predictions.

    Two predictors are each trained for 10 epochs on the observed A0205
    data; the second one also receives MICE-imputed entries as pretraining
    input. Both are scored on the observed A0205 data, and the pretrained
    one must have a lower weighted MSE and a higher weighted accuracy at
    the IC50 <= 500nM threshold.
    """
    def weighted_mean(values, weights):
        # Average of `values` where each element counts by its weight.
        return (values * weights).sum() / weights.sum()

    csv_path = get_path(
        "data_combined_iedb_kim2014", "combined_human_class1_dataset.csv")
    full_dataset = Dataset.from_csv(csv_path)
    print("Full dataset: %d pMHC entries" % len(full_dataset))

    # Restrict to the four alleles listed here.
    limited_alleles = ["HLA-A0205", "HLA-A0201", "HLA-A0101", "HLA-B0702"]
    dataset = full_dataset.get_alleles(limited_alleles)
    print("After filtering to %s, # entries: %d" % (
        limited_alleles, len(dataset)))

    a0205_observed = dataset.get_allele("HLA-A0205")
    print("Dataset with only A0205, # entries: %d" % (len(a0205_observed)))

    # --- baseline: observed data only ---
    plain_predictor = Class1BindingPredictor(name="A0205-no-impute")
    X_index, ic50_true, sample_weights, _ = \
        a0205_observed.kmer_index_encoding()
    assert sample_weights.min() >= 0, sample_weights.min()
    assert sample_weights.max() <= 1, sample_weights.max()
    assert ic50_true.min() >= 0, ic50_true.min()

    plain_predictor.fit_kmer_encoded_arrays(
        X=X_index,
        ic50=ic50_true,
        sample_weights=sample_weights,
        n_training_epochs=10)
    ic50_pred_plain = \
        plain_predictor.predict_ic50_for_kmer_encoded_array(X_index)

    observed_is_binder = ic50_true <= 500
    mse_without_imputation = weighted_mean(
        (ic50_true - ic50_pred_plain) ** 2, sample_weights)
    accuracy_without_imputation = weighted_mean(
        observed_is_binder == (ic50_pred_plain <= 500), sample_weights)

    # --- same model, pretrained on imputed data ---
    imputed_dataset = dataset.impute_missing_values(MICE(n_imputations=25))
    print("After imputation, dataset for %s has %d entries" % (
        limited_alleles, len(imputed_dataset)))
    a0205_imputed = imputed_dataset.get_allele("HLA-A0205")
    print("Limited to just A0205, # entries: %d" % (len(a0205_imputed)))

    X_index_imputed, ic50_imputed, sample_weights_imputed, _ = \
        a0205_imputed.kmer_index_encoding()
    assert sample_weights_imputed.min() >= 0, sample_weights_imputed.min()
    assert sample_weights_imputed.max() <= 1, sample_weights_imputed.max()
    assert ic50_imputed.min() >= 0, ic50_imputed.min()

    pretrained_predictor = Class1BindingPredictor(name="A0205-impute")
    pretrained_predictor.fit_kmer_encoded_arrays(
        X=X_index,
        ic50=ic50_true,
        sample_weights=sample_weights,
        X_pretrain=X_index_imputed,
        ic50_pretrain=ic50_imputed,
        sample_weights_pretrain=sample_weights_imputed,
        n_training_epochs=10)
    ic50_pred_pretrained = \
        pretrained_predictor.predict_ic50_for_kmer_encoded_array(X_index)

    mse_with_imputation = weighted_mean(
        (ic50_true - ic50_pred_pretrained) ** 2, sample_weights)
    accuracy_with_imputation = weighted_mean(
        observed_is_binder == (ic50_pred_pretrained <= 500), sample_weights)

    # --- compare the two predictors ---
    print("RMS w/out imputation: %f" % (np.sqrt(mse_without_imputation), ))
    print("RMS w/ imputation: %f" % (np.sqrt(mse_with_imputation), ))
    assert mse_with_imputation < mse_without_imputation, \
        "Expected MSE with imputation (%f) to be < w/o imputation (%f)" % (
            mse_with_imputation, mse_without_imputation)

    print("IC50 <= 500nM accuracy w/out imputation: %f" % (
        accuracy_without_imputation, ))
    print("IC50 <= 500nM accuracy w/ imputation: %f" % (
        accuracy_with_imputation, ))
    assert accuracy_with_imputation > accuracy_without_imputation
def load_csv(filename):
    """Return the Dataset parsed from the CSV at `data_path(filename)`."""
    csv_path = data_path(filename)
    return Dataset.from_csv(csv_path)