def test_class1_binding_predictor_A0205_training_accuracy():
    """Check that a predictor can fit the HLA-A0205 9-mer training data.

    Loads the combined class I CSV, keeps only the HLA-A0205 rows whose
    peptide has length 9, trains a predictor for 1000 epochs and asserts
    that the log of the predicted IC50 values is close (rtol=0.2, atol=0.2)
    to the log of the measured values for those same training peptides.
    """
    dataset = Dataset.from_csv(get_path(
        "data_combined_iedb_kim2014", "combined_human_class1_dataset.csv"))
    dataset_a0205_all_lengths = dataset.get_allele("HLA-A0205")

    # Select rows with a boolean mask through .loc; DataFrame.ix was
    # deprecated in pandas 0.20 and removed in pandas 1.0.
    df_all_lengths = dataset_a0205_all_lengths._df
    is_9mer = df_all_lengths.peptide.str.len() == 9
    dataset_a0205 = Dataset(df_all_lengths.loc[is_9mer])

    predictor = Class1BindingPredictor(
        name="A0205",
        embedding_output_dim=32,
        activation="tanh",
        layer_sizes=[64],
        optimizer="adam",
        dropout_probability=0.0)
    predictor.fit_dataset(dataset_a0205, n_training_epochs=1000)
    peptides = dataset_a0205.peptides
    ic50_pred = predictor.predict(peptides)
    ic50_true = dataset_a0205.affinities
    eq_(len(ic50_pred), len(ic50_true))
    # Compare on a log scale so the tolerance is relative to the magnitude
    # of each IC50 value rather than absolute.
    testing.assert_allclose(
        np.log(ic50_pred),
        np.log(ic50_true),
        rtol=0.2,
        atol=0.2)
def class1_binding_predictor_A0205_training_accuracy():
    """Fit a predictor to the HLA-A0205 data and compare it to the truth.

    A predictor built with ``Class1BindingPredictor.from_hyperparameters``
    is trained on every HLA-A0205 entry of the class I dataset; its
    predictions for the training peptides must have the same length as the
    measured affinities and match them under ``np.allclose`` (called with
    its default tolerances).
    """
    a0205_data = Dataset.from_csv(CLASS1_DATA_CSV_PATH).get_allele("HLA-A0205")

    model = Class1BindingPredictor.from_hyperparameters(name="A0205")
    model.fit_dataset(a0205_data)

    predicted_ic50 = model.predict(a0205_data.peptides)
    measured_ic50 = a0205_data.affinities
    eq_(len(predicted_ic50), len(measured_ic50))
    assert np.allclose(predicted_ic50, measured_ic50)
Example #3
0
def test_class1_binding_predictor_A0205_training_accuracy():
    """Check that a predictor can fit the HLA-A0205 9-mer training data.

    Loads the combined class I CSV, keeps only the HLA-A0205 rows whose
    peptide has length 9, trains a predictor for 1000 epochs and asserts
    that the log of the predicted IC50 values is close (rtol=0.2, atol=0.2)
    to the log of the measured values for those same training peptides.
    """
    dataset = Dataset.from_csv(
        get_path("data_combined_iedb_kim2014",
                 "combined_human_class1_dataset.csv"))
    dataset_a0205_all_lengths = dataset.get_allele("HLA-A0205")

    # Select rows with a boolean mask through .loc; DataFrame.ix was
    # deprecated in pandas 0.20 and removed in pandas 1.0.
    df_all_lengths = dataset_a0205_all_lengths._df
    is_9mer = df_all_lengths.peptide.str.len() == 9
    dataset_a0205 = Dataset(df_all_lengths.loc[is_9mer])

    predictor = Class1BindingPredictor(name="A0205",
                                       embedding_output_dim=32,
                                       activation="tanh",
                                       layer_sizes=[64],
                                       optimizer="adam",
                                       dropout_probability=0.0)
    predictor.fit_dataset(dataset_a0205, n_training_epochs=1000)
    peptides = dataset_a0205.peptides
    ic50_pred = predictor.predict(peptides)
    ic50_true = dataset_a0205.affinities
    eq_(len(ic50_pred), len(ic50_true))
    # Compare on a log scale so the tolerance is relative to the magnitude
    # of each IC50 value rather than absolute.
    testing.assert_allclose(np.log(ic50_pred),
                            np.log(ic50_true),
                            rtol=0.2,
                            atol=0.2)
def load_csv(filename):
    """Load ``filename`` from the ``data`` directory next to this module.

    The directory of this file is resolved with ``realpath``/``dirname``,
    ``data/<filename>`` is appended to it, and the resulting path is passed
    to ``Dataset.from_csv``, whose result is returned.
    """
    csv_path = join(dirname(realpath(__file__)), "data", filename)
    return Dataset.from_csv(csv_path)
            orientation="columns",
            verbose=verbose,
            print_interval=knn_print_interval)
    return result_dict


# Script entry point: parse the command line, build the imputation methods,
# load the binding data as a dense peptide-by-allele matrix, report how much
# of it is observed, and optionally save that incomplete matrix to CSV.
if __name__ == "__main__":
    args = parser.parse_args()
    print(args)
    imputation_methods = create_imputation_methods(
        verbose=args.verbose,
        # NOTE(review): both operands of this `or` are args.normalize_rows,
        # so the expression reduces to `not args.normalize_rows`. Presumably
        # one operand was meant to be a different normalization flag;
        # confirm against the parser definition.
        clip_imputed_values=not (args.normalize_rows or args.normalize_rows),
    )
    print("Imputation methods: %s" % imputation_methods)

    dataset = Dataset.from_csv(args.binding_data_csv)
    # X is used below with peptide_list as the row index and allele_list as
    # the column labels.
    X, peptide_list, allele_list = dataset.to_dense_pMHC_affinity_matrix(
        min_observations_per_allele=args.n_folds,
        min_observations_per_peptide=args.min_observations_per_peptide)
    # Entries of X that are finite are counted as observed; presumably the
    # missing entries are NaN (TODO confirm).
    observed_mask = np.isfinite(X)
    print("Loaded binding data, shape: %s, n_observed=%d/%d (%0.2f%%)" % (
        X.shape,
        observed_mask.sum(),
        X.size,
        100.0 * observed_mask.sum() / X.size))
    # Only write the matrix out when an output path was supplied.
    if args.save_incomplete_affinity_matrix:
        print("Saving incomplete data to %s" % args.save_incomplete_affinity_matrix)
        df = pd.DataFrame(X, columns=allele_list, index=peptide_list)
        df.to_csv(args.save_incomplete_affinity_matrix, index_label="peptide")

    scores = ScoreSet()
Example #6
0
def load_csv(filename):
    """Return ``Dataset.from_csv`` applied to the path ``data_path(filename)``."""
    csv_path = data_path(filename)
    return Dataset.from_csv(csv_path)
Example #7
0
def test_performance_improves_for_A0205_with_pretraining():
    """Check that pretraining on imputed data improves HLA-A0205 predictions.

    Two predictors are trained on the same HLA-A0205 measurements for 10
    epochs each. The second one is additionally given pretraining data
    produced by MICE imputation over a dataset restricted to four alleles.
    The test asserts that the pretrained predictor has a lower
    sample-weighted MSE and a higher sample-weighted accuracy at the
    IC50 <= 500 threshold, both evaluated on the HLA-A0205 training data.
    """
    def weighted_scores(ic50_actual, ic50_predicted, weights):
        """Return (weighted MSE, weighted accuracy of the <= 500 label)."""
        total_weight = weights.sum()
        squared_error = (ic50_actual - ic50_predicted)**2
        mse = (squared_error * weights).sum() / total_weight
        labels_agree = (ic50_actual <= 500) == (ic50_predicted <= 500)
        accuracy = (labels_agree * weights).sum() / total_weight
        return mse, accuracy

    dataset = Dataset.from_csv(
        get_path("data_combined_iedb_kim2014",
                 "combined_human_class1_dataset.csv"))
    print("Full dataset: %d pMHC entries" % len(dataset))

    limited_alleles = ["HLA-A0205", "HLA-A0201", "HLA-A0101", "HLA-B0702"]

    # restrict to just these four alleles
    dataset = dataset.get_alleles(limited_alleles)
    print("After filtering to %s, # entries: %d" %
          (limited_alleles, len(dataset)))

    a0205_data_without_imputation = dataset.get_allele("HLA-A0205")

    print("Dataset with only A0205, # entries: %d" %
          (len(a0205_data_without_imputation)))

    predictor_without_imputation = Class1BindingPredictor(
        name="A0205-no-impute")

    X_index, ic50_true, sample_weights, _ = (
        a0205_data_without_imputation.kmer_index_encoding())

    assert sample_weights.min() >= 0, sample_weights.min()
    assert sample_weights.max() <= 1, sample_weights.max()
    assert ic50_true.min() >= 0, ic50_true.min()

    # Baseline: train on the measured data only.
    predictor_without_imputation.fit_kmer_encoded_arrays(
        X=X_index,
        ic50=ic50_true,
        sample_weights=sample_weights,
        n_training_epochs=10)

    ic50_pred_without_imputation = (
        predictor_without_imputation.predict_ic50_for_kmer_encoded_array(
            X_index))
    mse_without_imputation, accuracy_without_imputation = weighted_scores(
        ic50_true, ic50_pred_without_imputation, sample_weights)

    imputed_dataset = dataset.impute_missing_values(MICE(n_imputations=25))
    print("After imputation, dataset for %s has %d entries" %
          (limited_alleles, len(imputed_dataset)))
    a0205_data_with_imputation = imputed_dataset.get_allele("HLA-A0205")
    print("Limited to just A0205, # entries: %d" %
          (len(a0205_data_with_imputation)))

    X_index_imputed, ic50_imputed, sample_weights_imputed, _ = (
        a0205_data_with_imputation.kmer_index_encoding())
    assert sample_weights_imputed.min() >= 0, sample_weights_imputed.min()
    assert sample_weights_imputed.max() <= 1, sample_weights_imputed.max()
    assert ic50_imputed.min() >= 0, ic50_imputed.min()

    predictor_with_imputation = Class1BindingPredictor(name="A0205-impute")

    # Same training data and epoch count as the baseline, plus the imputed
    # values as pretraining data.
    predictor_with_imputation.fit_kmer_encoded_arrays(
        X=X_index,
        ic50=ic50_true,
        sample_weights=sample_weights,
        X_pretrain=X_index_imputed,
        ic50_pretrain=ic50_imputed,
        sample_weights_pretrain=sample_weights_imputed,
        n_training_epochs=10)

    ic50_pred_with_imputation = (
        predictor_with_imputation.predict_ic50_for_kmer_encoded_array(X_index))
    mse_with_imputation, accuracy_with_imputation = weighted_scores(
        ic50_true, ic50_pred_with_imputation, sample_weights)

    print("RMS w/out imputation: %f" % (np.sqrt(mse_without_imputation), ))
    print("RMS w/ imputation: %f" % (np.sqrt(mse_with_imputation), ))

    assert mse_with_imputation < mse_without_imputation, \
        "Expected MSE with imputation (%f) to be < w/o imputation (%f)" % (
            mse_with_imputation, mse_without_imputation)

    print("IC50 <= 500nM accuracy w/out imputation: %f" %
          (accuracy_without_imputation, ))
    print("IC50 <= 500nM accuracy w/ imputation: %f" %
          (accuracy_with_imputation, ))
    assert accuracy_with_imputation > accuracy_without_imputation
    default=[],
    nargs="+",
    type=normalize_allele_name)

# add options for neural network hyperparameters
parser = add_arguments_to_parser(parser)

# Script entry point: parse arguments, make sure the output directory exists,
# load the binding data and decide which alleles to work on.
if __name__ == "__main__":
    args = parser.parse_args()
    print(args)

    # Create the output directory on first use.
    if not exists(args.output_dir):
        makedirs(args.output_dir)

    dataset = Dataset.from_csv(
        filename=args.binding_data_csv,
        sep=",",
        peptide_column_name="peptide")

    # if user didn't specify alleles then train models for all available alleles
    alleles = args.alleles

    if not alleles:
        # No alleles requested: use every allele in the dataset, sorted.
        alleles = list(sorted(dataset.unique_alleles()))
    else:
        # Alleles requested: restrict the dataset to just those.
        dataset = dataset.get_alleles(alleles)

    imputer = imputer_from_args(args)

    if imputer is None:
        imputed_dataset = Dataset.create_empty()
    else:
    base_filename = \
        ("%s-vs-nsamples-hidden-%s-activation-%s"
         "-impute-%s-epochs-%d-embedding-%d-pretrain-%s") % (
            args.allele,
            args.hidden_layer_size,
            args.activation,
            args.imputation_method,
            args.training_epochs,
            args.embedding_size,
            args.pretraining_weight_decay)
    csv_filename = base_filename + ".csv"

    if args.load_existing_data:
        results_df = pd.read_csv(csv_filename)
    else:
        dataset = Dataset.from_csv(args.training_csv)
        imputer = imputer_from_args(args)

        def make_model():
            return predictor_from_args(allele_name=args.allele, args=args)

        if args.pretraining_weight_decay == "exponential":
            def pretrain_weight_decay_fn(t):
                return np.exp(-t)

        elif args.pretraining_weight_decay == "quadratic":
            def pretrain_weight_decay_fn(t):
                return 1.0 / (t + 1) ** 2.0

        elif args.pretraining_weight_decay == "linear":
            def pretrain_weight_decay_fn(t):
Example #10
0
def test_performance_improves_for_A0205_with_pretraining():
    """Check that pretraining on imputed data improves HLA-A0205 predictions.

    A baseline predictor is trained on the HLA-A0205 measurements for 10
    epochs. A second predictor is trained on the same measurements for 200
    epochs and is additionally given pretraining data produced by MICE
    imputation over a dataset restricted to four alleles. The test asserts
    that the second predictor has a lower sample-weighted MSE and a higher
    sample-weighted accuracy at the IC50 <= 500 threshold, both evaluated
    on the HLA-A0205 training data.
    """
    def weighted_scores(ic50_actual, ic50_predicted, weights):
        """Return (weighted MSE, weighted accuracy of the <= 500 label)."""
        total_weight = weights.sum()
        squared_error = (ic50_actual - ic50_predicted) ** 2
        mse = (squared_error * weights).sum() / total_weight
        labels_agree = (ic50_actual <= 500) == (ic50_predicted <= 500)
        accuracy = (labels_agree * weights).sum() / total_weight
        return mse, accuracy

    dataset = Dataset.from_csv(CLASS1_DATA_CSV_PATH)
    print("Full dataset: %d pMHC entries" % len(dataset))

    limited_alleles = ["HLA-A0205", "HLA-A0201", "HLA-A0101", "HLA-B0702"]

    # restrict to just these four alleles
    dataset = dataset.get_alleles(limited_alleles)
    print("After filtering to %s, # entries: %d" % (limited_alleles, len(dataset)))

    a0205_data_without_imputation = dataset.get_allele("HLA-A0205")

    print("Dataset with only A0205, # entries: %d" % len(a0205_data_without_imputation))

    predictor_without_imputation = \
        Class1BindingPredictor.from_hyperparameters(name="A0205-no-impute")

    X_index, ic50_true, sample_weights, _ = \
        a0205_data_without_imputation.kmer_index_encoding()

    assert sample_weights.min() >= 0, sample_weights.min()
    assert sample_weights.max() <= 1, sample_weights.max()
    assert ic50_true.min() >= 0, ic50_true.min()

    # Baseline: train on the measured data only.
    predictor_without_imputation.fit_kmer_encoded_arrays(
        X=X_index,
        ic50=ic50_true,
        sample_weights=sample_weights,
        n_training_epochs=10)

    ic50_pred_without_imputation = \
        predictor_without_imputation.predict_ic50_for_kmer_encoded_array(X_index)
    mse_without_imputation, accuracy_without_imputation = weighted_scores(
        ic50_true, ic50_pred_without_imputation, sample_weights)

    imputed_dataset = dataset.impute_missing_values(MICE(n_imputations=25))
    print("After imputation, dataset for %s has %d entries" % (
        limited_alleles, len(imputed_dataset)))
    a0205_data_with_imputation = imputed_dataset.get_allele("HLA-A0205")
    print("Limited to just A0205, # entries: %d" % (len(a0205_data_with_imputation)))

    X_index_imputed, ic50_imputed, sample_weights_imputed, _ = \
        a0205_data_with_imputation.kmer_index_encoding()
    assert sample_weights_imputed.min() >= 0, sample_weights_imputed.min()
    assert sample_weights_imputed.max() <= 1, sample_weights_imputed.max()
    assert ic50_imputed.min() >= 0, ic50_imputed.min()

    predictor_with_imputation = \
        Class1BindingPredictor.from_hyperparameters(name="A0205-impute")

    # NOTE(review): this run trains for 200 epochs while the baseline above
    # trains for 10, so the comparison is not like-for-like; confirm that
    # the differing epoch counts are intended.
    predictor_with_imputation.fit_kmer_encoded_arrays(
        X=X_index,
        ic50=ic50_true,
        sample_weights=sample_weights,
        X_pretrain=X_index_imputed,
        ic50_pretrain=ic50_imputed,
        sample_weights_pretrain=sample_weights_imputed,
        n_training_epochs=200)

    ic50_pred_with_imputation = \
        predictor_with_imputation.predict_ic50_for_kmer_encoded_array(X_index)
    mse_with_imputation, accuracy_with_imputation = weighted_scores(
        ic50_true, ic50_pred_with_imputation, sample_weights)

    print("RMS w/out imputation: %f" % (np.sqrt(mse_without_imputation),))
    print("RMS w/ imputation: %f" % (np.sqrt(mse_with_imputation),))

    assert mse_with_imputation < mse_without_imputation, \
        "Expected MSE with imputation (%f) to be less than (%f) without imputation" % (
            mse_with_imputation, mse_without_imputation)

    print("IC50 <= 500nM accuracy w/out imputation: %f" % (
        accuracy_without_imputation,))
    print("IC50 <= 500nM accuracy w/ imputation: %f" % (
        accuracy_with_imputation,))
    assert accuracy_with_imputation > accuracy_without_imputation
def load_csv(filename):
    """Resolve ``filename`` with ``data_path`` and load it via ``Dataset.from_csv``."""
    resolved_path = data_path(filename)
    return Dataset.from_csv(resolved_path)