def make_dataset(num=10000):
    """
    Build a shuffled synthetic dataset of flanks and peptides with hit labels.

    About half of the rows combine ``random_peptides(..., 10)`` N-flanks with
    ``random_peptides(..., 11)`` peptides; the remainder combine
    ``random_peptides(..., 1)`` N-flanks with ``random_peptides(..., 8)``
    peptides (the second argument is presumably the sequence length). Every
    row also gets a C-flank and a randomly drawn ``sample_id`` from
    "sample_1" .. "sample_5".

    A row is labeled a hit when the motif does not occur anywhere inside the
    peptide but does match at the start of the string formed by the last
    N-flank residue followed by the peptide. The C-flank is passed to the
    labeling function but does not influence the label.

    Parameters
    ----------
    num : int
        Total number of rows to generate. Odd values are supported.

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame
        A random 90% sample of the rows, and a copy of the remaining rows.
    """
    # Use integer halves that always add up to num. "num / 2" is a float in
    # Python 3, and sizing both halves identically makes the columns unequal
    # in length whenever num is odd.
    first_half = num // 2
    second_half = num - first_half

    df = pandas.DataFrame({
        "n_flank":
        random_peptides(first_half, 10) + random_peptides(second_half, 1),
        "c_flank":
        random_peptides(num, 10),
        "peptide":
        random_peptides(first_half, 11) + random_peptides(second_half, 8),
    }).sample(frac=1.0)
    sample_names = pandas.Series(["sample_%d" % (i + 1) for i in range(5)])
    df["sample_id"] = sample_names.sample(n=len(df), replace=True).values

    # Compiled once; it is applied twice per row below.
    n_pattern = re.compile("[AILQSVWEN].[MNPQYKV]")

    def is_hit(n_flank, c_flank, peptide):
        if n_pattern.search(peptide):
            return False  # peptide is cleaved
        return bool(n_pattern.match(n_flank[-1:] + peptide))

    df["hit"] = [
        is_hit(n_flank, c_flank, peptide)
        for (n_flank, c_flank, peptide)
        in zip(df.n_flank, df.c_flank, df.peptide)
    ]

    train_df = df.sample(frac=0.9)
    test_df = df.loc[~df.index.isin(train_df.index)].copy()

    print("Generated dataset", len(df), "hits: ", df.hit.sum(), "frac:",
          df.hit.mean())

    return (train_df, test_df)
Beispiel #2
0
def test_random_negative_peptides_by_allele():
    """
    Check the "by_allele_equalize_nonbinders" random negative planning method.

    Plans random negatives for a small hand-written dataset plus a large
    randomly generated HLA-B*44:02 dataset, then asserts that, for each
    allele and peptide length, planned random negatives plus measured
    non-binders (affinity > 500) add up to the expected totals.
    """
    planner = RandomNegativePeptides(
        random_negative_method="by_allele_equalize_nonbinders",
        random_negative_binder_threshold=500,
        random_negative_rate=1.0,
        random_negative_constant=2)
    data_rows = [
        ("HLA-A*02:01", "SIINFEKL", 400, "="),
        ("HLA-A*02:01", "SIINFEKLL", 300, "="),
        ("HLA-A*02:01", "SIINFEKLL", 300, "="),
        ("HLA-A*02:01", "SIINFEKLQ", 1000, "="),
        ("HLA-A*02:01", "SIINFEKLZZ", 12000, ">"),
        ("HLA-C*01:02", "SIINFEKLQ", 100, "="),  # only binders
        ("HLA-C*07:02", "SIINFEKLL", 1000, "="),  # only non-binders
    ]
    data_rows.extend(
        ("HLA-B*44:02", peptide, 100, "=")
        for peptide in random_peptides(1000, length=9))
    data_rows.extend(
        ("HLA-B*44:02", peptide, 1000, "=")
        for peptide in random_peptides(1000, length=9))
    data_rows.extend(
        ("HLA-B*44:02", peptide, 100, "=")
        for peptide in random_peptides(5, length=10))

    data = pandas.DataFrame(
        data_rows,
        columns=["allele", "peptide", "affinity", "inequality"])
    data["length"] = data.peptide.str.len()

    planner.plan(
        peptides=data.peptide.values,
        affinities=data.affinity.values,
        alleles=data.allele.values,
        inequalities=data.inequality.values)
    result_df = pandas.DataFrame({
        "allele": planner.get_alleles(),
        "peptide": planner.get_peptides(),
    })
    result_df["length"] = result_df.peptide.str.len()

    def count_by_allele_and_length(frame):
        # Rows are alleles, columns are peptide lengths, values are counts.
        return frame.groupby(["allele", "length"]).peptide.count().unstack()

    random_negatives = count_by_allele_and_length(result_df)
    real_data = count_by_allele_and_length(data).fillna(0)
    real_nonbinders = count_by_allele_and_length(
        data.loc[data.affinity > 500]).fillna(0)

    # Lengths that only occur among the random negatives need an explicit
    # zero column so that the sum below does not introduce NaN.
    for length in random_negatives.columns:
        if length not in real_nonbinders.columns:
            real_nonbinders[length] = 0
    total_nonbinders = (
        random_negatives.reindex(real_data.index).fillna(0) +
        real_nonbinders.reindex(real_data.index).fillna(0))

    assert (total_nonbinders.loc["HLA-A*02:01"] == 2.0).all(), total_nonbinders
    assert (total_nonbinders.loc["HLA-B*44:02"] == 1126).all(), total_nonbinders

    assert not total_nonbinders.isnull().any().any()
Beispiel #3
0
def test_random_negative_peptides_by_allele():
    """
    Check the "by_allele" random negative planning method.

    Plans random negatives for a handful of HLA-A*02:01 measurements and
    2005 randomly generated HLA-B*44:02 measurements, then asserts the
    number of random negatives planned per allele and peptide length.
    """
    planner = RandomNegativePeptides(
        random_negative_method="by_allele",
        random_negative_binder_threshold=500,
        random_negative_rate=1.0,
        random_negative_constant=2)

    data_rows = [
        ("HLA-A*02:01", "SIINFEKL", 400, "="),
        ("HLA-A*02:01", "SIINFEKLL", 300, "="),
        ("HLA-A*02:01", "SIINFEKLL", 300, "="),
        ("HLA-A*02:01", "SIINFEKLQ", 1000, "="),
        ("HLA-A*02:01", "SIINFEKLZZ", 12000, ">"),
    ]
    data_rows.extend(
        ("HLA-B*44:02", peptide, 100, "=")
        for peptide in random_peptides(1000, length=9))
    data_rows.extend(
        ("HLA-B*44:02", peptide, 1000, "=")
        for peptide in random_peptides(1000, length=9))
    data_rows.extend(
        ("HLA-B*44:02", peptide, 100, "=")
        for peptide in random_peptides(5, length=10))

    data = pandas.DataFrame(
        data_rows,
        columns=["allele", "peptide", "affinity", "inequality"])

    planner.plan(
        peptides=data.peptide.values,
        affinities=data.affinity.values,
        alleles=data.allele.values,
        inequalities=data.inequality.values)
    result_df = pandas.DataFrame({
        "allele": planner.get_alleles(),
        "peptide": planner.get_peptides(),
    })

    # Rows are alleles, columns are peptide lengths, values are the number
    # of random negatives planned. Only this table is asserted on.
    result_df["length"] = result_df.peptide.str.len()
    random_negatives = result_df.groupby(
        ["allele", "length"]).peptide.count().unstack()

    expected_b4402 = math.ceil(1007 / 8)
    assert (random_negatives.loc["HLA-A*02:01"] == 1.0).all()
    assert (random_negatives.loc["HLA-B*44:02"] == expected_b4402).all(), (
        random_negatives.loc["HLA-B*44:02"], expected_b4402)
Beispiel #4
0
def test_speed_allele_specific(profile=False, num=DEFAULT_NUM_PREDICTIONS):
    """
    Benchmark prediction speed of the module-level allele-specific predictor.

    Times a single warm-up prediction, a batch of ``num`` random peptides,
    BLOSUM62 encoding of 10000 random 13-mers, and predictions on those
    encoded peptides (once and repeated 100 times). Timings are printed.

    Parameters
    ----------
    profile : bool
        When true, each timed section also runs under its own
        ``cProfile.Profile``.
    num : int
        Number of random peptides in the main batch prediction.

    Returns
    -------
    dict of str -> pstats.Stats
        Profiling statistics per section; empty when ``profile`` is false.
    """
    starts = collections.OrderedDict()
    timings = collections.OrderedDict()
    profilers = collections.OrderedDict()

    predictor = ALLELE_SPECIFIC_PREDICTOR

    def start(name):
        # perf_counter is monotonic and high resolution, so system clock
        # adjustments cannot skew the measured durations.
        starts[name] = time.perf_counter()
        if profile:
            profilers[name] = cProfile.Profile()
            profilers[name].enable()

    def end(name):
        timings[name] = time.perf_counter() - starts[name]
        if profile:
            profilers[name].disable()

    start("first")
    predictor.predict(["SIINFEKL"], allele="HLA-A*02:01")
    end("first")

    peptides = random_peptides(num)
    batch_label = "pred_%d" % num
    start(batch_label)
    predictor.predict(peptides, allele="HLA-A*02:01")
    end(batch_label)

    NUM2 = 10000
    peptides = EncodableSequences.create(random_peptides(NUM2, length=13))
    encode_label = "encode_blosum_%d" % NUM2
    start(encode_label)
    # Result is discarded; presumably the encoding is cached on the
    # EncodableSequences object for the predictions below (TODO confirm).
    peptides.variable_length_to_fixed_length_vector_encoding("BLOSUM62")
    end(encode_label)

    encoded_label = "pred_already_encoded_%d" % NUM2
    start(encoded_label)
    predictor.predict(peptides, allele="HLA-A*02:01")
    end(encoded_label)

    NUM_REPEATS = 100
    repeated_label = "pred_already_encoded_%d_%d_times" % (NUM2, NUM_REPEATS)
    start(repeated_label)
    for _ in range(NUM_REPEATS):
        predictor.predict(peptides, allele="HLA-A*02:01")
    end(repeated_label)

    print("SPEED BENCHMARK")
    print("Results:\n%s" % str(pandas.Series(timings)))

    return {
        name: pstats.Stats(profiler)
        for (name, profiler) in profilers.items()
    }
Beispiel #5
0
def test_speed_pan_allele(profile=False, num=DEFAULT_NUM_PREDICTIONS):
    """
    Benchmark prediction speed of the module-level pan-allele predictor.

    Times a single warm-up prediction and a batch prediction over ``num``
    random peptides, then prints the timings.

    Parameters
    ----------
    profile : bool
        When true, each timed section also runs under its own
        ``cProfile.Profile``.
    num : int
        Number of random peptides in the batch prediction.

    Returns
    -------
    dict of str -> pstats.Stats
        Profiling statistics per section; empty when ``profile`` is false.
    """
    starts = collections.OrderedDict()
    timings = collections.OrderedDict()
    profilers = collections.OrderedDict()

    predictor = PAN_ALLELE_PREDICTOR

    def start(name):
        # perf_counter is monotonic and high resolution, so system clock
        # adjustments cannot skew the measured durations.
        starts[name] = time.perf_counter()
        if profile:
            profilers[name] = cProfile.Profile()
            profilers[name].enable()

    def end(name):
        timings[name] = time.perf_counter() - starts[name]
        if profile:
            profilers[name].disable()

    start("first")
    predictor.predict(["SIINFEKL"], allele="HLA-A*02:01")
    end("first")

    peptides = random_peptides(num)
    batch_label = "pred_%d" % num
    start(batch_label)
    predictor.predict(peptides, allele="HLA-A*02:01")
    end(batch_label)

    print("SPEED BENCHMARK")
    print("Results:\n%s" % str(pandas.Series(timings)))

    return {
        name: pstats.Stats(profiler)
        for (name, profiler) in profilers.items()
    }
def test_merge():
    """
    Check that merging the pan-allele models preserves predictions.

    Predicts on shuffled random 9-mers and 10-mers with randomly assigned
    alleles, once with the module-level pan-allele predictor and once with a
    predictor holding only the merged network, and requires the two sets of
    predictions to agree within an absolute tolerance of 0.1.
    """
    # Merging is only meaningful when there is more than one model.
    assert len(PAN_ALLELE_PREDICTOR.class1_pan_allele_models) > 1

    nine_and_ten_mers = (
        random_peptides(100, length=9) + random_peptides(100, length=10))
    peptides = pandas.Series(nine_and_ten_mers).sample(frac=1.0)

    allele_choices = pandas.Series(
        ["HLA-A*03:01", "HLA-B*57:01", "HLA-C*02:01"])
    alleles = allele_choices.sample(n=len(peptides), replace=True)

    ensemble_predictions = PAN_ALLELE_PREDICTOR.predict(
        peptides=peptides, alleles=alleles)

    merged_network = Class1NeuralNetwork.merge(
        PAN_ALLELE_PREDICTOR.class1_pan_allele_models)
    merged_predictor = Class1AffinityPredictor(
        allele_to_sequence=PAN_ALLELE_PREDICTOR.allele_to_sequence,
        class1_pan_allele_models=[merged_network],
    )
    merged_predictions = merged_predictor.predict(
        peptides=peptides, alleles=alleles)

    numpy.testing.assert_allclose(
        ensemble_predictions, merged_predictions, atol=0.1)
def test_basic():
    """
    Check that processing predictions are reproducible.

    Trains a small network, wraps it in a ``Class1ProcessingPredictor`` and
    verifies that scores for a fixed random dataset are identical when
    recomputed, after a save/load round trip, and after a pickle round trip.
    """
    network = train_basic_network(num=10000,
                                  do_assertions=False,
                                  max_epochs=10)
    predictor = Class1ProcessingPredictor(models=[network])

    num = 10000
    df = pandas.DataFrame({
        "n_flank": random_peptides(num, 10),
        "c_flank": random_peptides(num, 10),
        "peptide": random_peptides(num, 9),
    })
    df["score"] = predictor.predict(df.peptide, df.n_flank, df.c_flank)

    def scores_from(model):
        # Score the same dataset with the given predictor.
        return model.predict_to_dataframe(
            peptides=df.peptide.values,
            n_flanks=df.n_flank.values,
            c_flanks=df.c_flank.values).score.values

    # Test predictions are deterministic
    assert_array_equal(df.score.values, scores_from(predictor))

    # Test saving and loading. The temporary directory is removed when the
    # block exits, so the reloaded predictor is only used inside it in case
    # loading reads files from the directory lazily.
    with tempfile.TemporaryDirectory(suffix="_models") as models_dir:
        print(models_dir)
        predictor.save(models_dir)
        predictor2 = Class1ProcessingPredictor.load(models_dir)
        assert_array_equal(df.score.values, scores_from(predictor2))

    # Test pickling
    predictor3 = pickle.loads(
        pickle.dumps(predictor, protocol=pickle.HIGHEST_PROTOCOL))
    assert_array_equal(df.score.values, scores_from(predictor3))
def test_correlation(alleles=None,
                     num_peptides_per_length=1000,
                     lengths=(8, 9, 10),
                     debug=False):
    """
    Check that the predictors in ``PREDICTORS`` give correlated predictions.

    For each allele, every predictor in ``PREDICTORS`` scores the same set of
    random peptides, and the Pearson correlation between the log10
    predictions stored under "allele-specific" and "pan-allele" is recorded.
    The test requires the mean correlation over alleles to exceed 0.65.

    Parameters
    ----------
    alleles : iterable of str, optional
        Alleles to evaluate. Defaults to the alleles supported by every
        predictor in ``PREDICTORS``.
    num_peptides_per_length : int
        Number of random peptides generated for each length.
    lengths : iterable of int
        Peptide lengths to generate.
    debug : bool
        When true, drop into ``ipdb`` for any allele whose correlation is
        below 0.6.

    Returns
    -------
    pandas.DataFrame
        One row per allele with columns "allele" and "correlation".
    """
    peptides = []
    for length in lengths:
        peptides.extend(random_peptides(num_peptides_per_length, length))

    # De-duplicate, and share one EncodableSequences across all predictions
    # so that encodings can be cached.
    peptides = EncodableSequences.create(list(set(peptides)))

    if alleles is None:
        alleles = set.intersection(*[
            set(predictor.supported_alleles)
            for predictor in PREDICTORS.values()
        ])
    alleles = sorted(set(alleles))
    df = pandas.DataFrame(index=peptides.sequences)

    correlations = []
    for allele in alleles:
        for (name, predictor) in PREDICTORS.items():
            df[name] = predictor.predict(peptides, allele=allele)
        correlation = numpy.corrcoef(numpy.log10(df["allele-specific"]),
                                     numpy.log10(df["pan-allele"]))[0, 1]
        correlations.append((allele, correlation))
        print(len(correlations), len(alleles), *correlations[-1])

        if correlation < 0.6:
            print("Warning: low correlation", allele)
            # Temporary column used only to order the printout by the
            # tightest prediction from any predictor.
            df["tightest"] = df.min(axis=1)
            print(df.sort_values("tightest").iloc[:, :-1])
            if debug:
                import ipdb
                ipdb.set_trace()
            del df["tightest"]

    results_df = pandas.DataFrame(correlations,
                                  columns=["allele", "correlation"])
    print(results_df)

    print("Mean correlation", results_df.correlation.mean())
    assert_greater(results_df.correlation.mean(), 0.65)

    return results_df
def run():
    """
    Write predictions for random peptides across all supported alleles.

    Parses command line arguments with the module-level ``parser``, loads a
    ``Class1AffinityPredictor`` from ``args.models``, and appends chunks of
    predictions to the CSV file ``args.out`` (one row per peptide, one column
    per allele) until ``args.num_peptides`` peptides have been written.
    """
    args = parser.parse_args(sys.argv[1:])
    print(args)

    predictor = mhcflurry.Class1AffinityPredictor.load(args.models)

    alleles = pandas.Series(predictor.supported_alleles)

    # Clear the file
    pandas.DataFrame(columns=alleles).to_csv(args.out, index=True)

    # Generate peptides over the predictor's own supported lengths instead
    # of a hard-coded range. The bounds are assumed to be inclusive, which
    # matches the previously hard-coded lengths 8..15 -- TODO confirm.
    (min_length, max_length) = predictor.supported_peptide_lengths
    lengths = range(min_length, max_length + 1)

    # Dividing by the number of lengths (rather than max - min) guarantees
    # at least chunksize peptides per chunk and cannot divide by zero when
    # only a single length is supported.
    peptides_per_length = int(math.ceil(args.chunksize / len(lengths)))
    total_chunks = math.ceil(args.num_peptides / args.chunksize)

    peptides_written = 0
    chunk_index = 0
    while peptides_written < args.num_peptides:
        print("Chunk %d / %d" % (chunk_index + 1, total_chunks))
        start = time.time()
        peptides = []
        for length in lengths:
            peptides.extend(
                random_peptides(peptides_per_length, length=length))

        # The final chunk may be smaller than chunksize.
        peptides = pandas.Series(peptides).sample(
            n=min(args.chunksize, args.num_peptides - peptides_written)).values
        encodable_peptides = mhcflurry.encodable_sequences.EncodableSequences.create(
            peptides)
        df = pandas.DataFrame(index=peptides)
        for allele in alleles:
            df[allele] = predictor.predict(encodable_peptides, allele=allele)
        df.to_csv(
            args.out, index=True, mode='a', header=False, float_format='%.1f')
        print("Wrote: %s  [%0.2f sec]" % (args.out, time.time() - start))
        chunk_index += 1
        peptides_written += len(peptides)

    print("Done.")
Beispiel #10
0
def train_basic_network(num,
                        do_assertions=True,
                        is_hit=None,
                        **hyperparameters):
    """
    Train a ``Class1ProcessingNeuralNetwork`` on a synthetic dataset.

    Generates ``num`` random (n_flank, c_flank, peptide) rows, labels them
    with ``is_hit``, fits a network on a random 90% of the rows and reports
    ROC AUC on the training rows and on the held-out rows.

    Parameters
    ----------
    num : int
        Total number of rows to generate. Odd values are supported.
    do_assertions : bool
        When true, require train AUC > 0.9 and test AUC > 0.85.
    is_hit : callable, optional
        Called as ``is_hit(n_flank, c_flank, peptide)`` and returns the label
        for a row. By default a row is a hit when a three-residue motif does
        not occur inside the peptide but does match at the start of the
        string formed by the last N-flank residue followed by the peptide.
    **hyperparameters
        Overrides for the default network hyperparameters defined below.

    Returns
    -------
    Class1ProcessingNeuralNetwork
        The fitted network.
    """
    use_hyperparameters = {
        "max_epochs": 100,
        "peptide_max_length": 12,
        "n_flank_length": 8,
        "c_flank_length": 8,
        "convolutional_kernel_size": 3,
        "flanking_averages": True,
        "min_delta": 0.01,
    }
    use_hyperparameters.update(hyperparameters)

    # Use integer halves that always add up to num. "num / 2" is a float in
    # Python 3, and sizing both halves identically makes the columns unequal
    # in length whenever num is odd.
    first_half = num // 2
    second_half = num - first_half

    df = pandas.DataFrame({
        "n_flank":
        random_peptides(first_half, 10) + random_peptides(second_half, 1),
        "c_flank":
        random_peptides(num, 10),
        "peptide":
        random_peptides(first_half, 11) + random_peptides(second_half, 8),
    }).sample(frac=1.0)

    if is_hit is None:
        n_cleavage_pattern = re.compile("[AILQSV][SINFEKLH][MNPQYK]")

        def is_hit(n_flank, c_flank, peptide):
            if n_cleavage_pattern.search(peptide):
                return False  # peptide is cleaved
            return bool(n_cleavage_pattern.match(n_flank[-1:] + peptide))

    df["hit"] = [
        is_hit(n_flank, c_flank, peptide)
        for (n_flank, c_flank, peptide)
        in zip(df.n_flank, df.c_flank, df.peptide)
    ]

    train_df = df.sample(frac=0.9)
    # Copy the held-out rows: a "predictions" column is added below, and
    # assigning into a slice of df would trigger pandas' chained-assignment
    # warning.
    test_df = df.loc[~df.index.isin(train_df.index)].copy()

    print("Generated dataset", len(df), "hits: ", df.hit.sum(), "frac:",
          df.hit.mean())

    network = Class1ProcessingNeuralNetwork(**use_hyperparameters)
    network.fit(sequences=FlankingEncoding(peptides=train_df.peptide.values,
                                           n_flanks=train_df.n_flank.values,
                                           c_flanks=train_df.c_flank.values),
                targets=train_df.hit.values,
                verbose=0)

    network.network().summary()

    for split_df in [train_df, test_df]:
        split_df["predictions"] = network.predict(split_df.peptide.values,
                                                  split_df.n_flank.values,
                                                  split_df.c_flank.values)

    train_auc = roc_auc_score(train_df.hit.values, train_df.predictions.values)
    test_auc = roc_auc_score(test_df.hit.values, test_df.predictions.values)

    print("Train auc", train_auc)
    print("Test auc", test_auc)

    if do_assertions:
        assert_greater(train_auc, 0.9)
        assert_greater(test_auc, 0.85)

    return network
Beispiel #11
0
from mhcflurry.amino_acid import COMMON_AMINO_ACIDS
from mhcflurry.common import random_peptides

######################
# Helper functions


def hit_criterion(experiment_name, peptide):
    """
    Return whether ``peptide`` counts as a hit.

    A peptide is a hit exactly when it contains an 'A', a rule chosen to be
    easy for a model to learn. ``experiment_name`` is accepted but not used.
    """
    contains_alanine = 'A' in peptide
    return contains_alanine


######################
# Small test dataset

# Two independently generated sets of 1000 random peptides each.
PEPTIDES = random_peptides(1000, 9)
OTHER_PEPTIDES = random_peptides(1000, 9)

# "transcript-1" through "transcript-9".
TRANSCRIPTS = ["transcript-%d" % i for i in range(1, 10)]

# Alleles associated with each experiment.
EXPERIMENT_TO_ALLELES = {
    'exp1': ['HLA-A*01:01'],
    'exp2': ['HLA-A*02:01', 'HLA-B*51:01'],
}

# Expression group associated with each experiment.
EXPERIMENT_TO_EXPRESSION_GROUP = {
    'exp1': 'group1',
    'exp2': 'group2',
}

# Sorted unique expression group names. The misspelled name is kept for
# backward compatibility; EXPRESSION_GROUPS is the correctly spelled alias.
EXPERESSION_GROUPS = sorted(set(EXPERIMENT_TO_EXPRESSION_GROUP.values()))
EXPRESSION_GROUPS = EXPERESSION_GROUPS
Beispiel #12
0
def test_multi_output():
    """
    Check that a three-output network learns three separate targets.

    Output 1 depends on a 'K' at index 4 of the peptide, output 2 on a 'Q' at
    index 3, and output 3 on either condition. After fitting on the stacked
    measurements, each prediction column must correlate above 0.99 with its
    matching target column.
    """
    hyperparameters = {
        "loss": "custom:mse_with_inequalities_and_multiple_outputs",
        "activation": "tanh",
        "layer_sizes": [16],
        "max_epochs": 50,
        "minibatch_size": 250,
        "random_negative_rate": 0.0,
        "random_negative_constant": 0.0,
        "early_stopping": False,
        "validation_split": 0.0,
        "locally_connected_layers": [],
        "dense_layer_l1_regularization": 0.0,
        "dropout_probability": 0.0,
        "optimizer": "adam",
        "num_outputs": 3,
    }

    def to_measurement(flags):
        # True -> 49001, False -> 1.
        return flags.astype(int) * 49000 + 1

    df = pandas.DataFrame()
    df["peptide"] = random_peptides(10000, length=9)
    df["output1"] = to_measurement(df.peptide.map(lambda s: s[4] == 'K'))
    df["output2"] = to_measurement(df.peptide.map(lambda s: s[3] == 'Q'))
    df["output3"] = to_measurement(
        df.peptide.map(lambda s: s[4] == 'K' or s[3] == 'Q'))

    print("output1 mean", df.output1.mean())
    print("output2 mean", df.output2.mean())

    # Long format: one row per (peptide, output) measurement.
    stacked = df.set_index("peptide").stack().reset_index()
    stacked.columns = ['peptide', 'output_name', 'value']
    name_to_index = {
        "output1": 0,
        "output2": 1,
        "output3": 2,
    }
    stacked["output_index"] = stacked.output_name.map(name_to_index)
    assert not stacked.output_index.isnull().any(), stacked

    predictor = Class1NeuralNetwork(**hyperparameters)
    predictor.fit(
        stacked.peptide.values,
        stacked.value.values,
        output_indices=stacked.output_index.values,
        verbose=1)

    # output_index=None requests all outputs; one column is read per output.
    result = predictor.predict(df.peptide.values, output_index=None)
    print(df.shape, result.shape)
    print(result)

    for position in range(3):
        df["prediction%d" % (position + 1)] = result[:, position]

    df_by_peptide = df.set_index("peptide")

    correlation = pandas.DataFrame(
        numpy.corrcoef(df_by_peptide.T),
        columns=df_by_peptide.columns,
        index=df_by_peptide.columns)
    print(correlation)

    sub_correlation = correlation.loc[
        ["output1", "output2", "output3"],
        ["prediction1", "prediction2", "prediction3"],
    ]
    # Each output must match its own prediction (the diagonal).
    for position in range(3):
        assert sub_correlation.iloc[position, position] > 0.99, correlation
Beispiel #13
0
def test_inequalities():
    """
    Check how inequalities resolve conflicting measurements of one peptide.

    The same 100 peptides are measured twice (value 100 and value 1) and 100
    other peptides are non-binders (value > 1000). Two networks are fit to
    memorize the data, differing only in whether the value-100 measurement is
    an equality ("=") or an upper bound ("<").
    """
    # Memorize the dataset.
    hyperparameters = {
        "peptide_amino_acid_encoding": "one-hot",
        "activation": "tanh",
        "layer_sizes": [64],
        "max_epochs": 200,
        "minibatch_size": 32,
        "random_negative_rate": 0.0,
        "random_negative_constant": 0,
        "early_stopping": False,
        "validation_split": 0.0,
        "locally_connected_layers": [{
            "filters": 8,
            "activation": "tanh",
            "kernel_size": 3
        }],
        "dense_layer_l1_regularization": 0.0,
        "dropout_probability": 0.0,
        "loss": "custom:mse_with_inequalities_and_multiple_outputs",
    }

    def measurements(peptides, value, inequality1, inequality2):
        # One frame of measurements sharing a value and inequality labels.
        frame = pandas.DataFrame()
        frame["peptide"] = peptides
        frame["value"] = value
        frame["inequality1"] = inequality1
        frame["inequality2"] = inequality2
        return frame

    # Weak binders
    weak = measurements(random_peptides(100, length=9), 100, "=", "<")

    # Strong binders - same peptides as above but more measurement values
    strong = measurements(weak.peptide.values, 1, "=", "=")

    # Non-binders
    nonbinders = measurements(random_peptides(100, length=10), 1000, ">", ">")

    df = pandas.concat([weak, strong, nonbinders], ignore_index=True)

    # Fit one fresh network per inequality column.
    column_pairs = (
        ("inequality1", "prediction1"),
        ("inequality2", "prediction2"),
    )
    for (inequality_column, prediction_column) in column_pairs:
        predictor = Class1NeuralNetwork(**hyperparameters)
        predictor.fit(df.peptide.values,
                      df.value.values,
                      inequalities=df[inequality_column].values,
                      verbose=0)
        df[prediction_column] = predictor.predict(df.peptide.values)

    # Binders should be stronger
    is_binder = df.value < 1000
    is_nonbinder = df.value >= 1000
    for pred in ["prediction1", "prediction2"]:
        assert_less(df.loc[is_binder, pred].mean(), 500)
        assert_greater(df.loc[is_nonbinder, pred].mean(), 500)

    # For the binders, the (=) on the weak-binding measurement (100) in
    # inequality1 should make the prediction weaker, whereas for inequality2
    # this measurement is a "<" so it should allow the strong-binder measurement
    # to dominate.
    strong_rows = df.loc[df.value == 1]
    numpy.testing.assert_allclose(strong_rows.prediction2.values,
                                  1.0,
                                  atol=0.5)
    numpy.testing.assert_array_less(5.0, strong_rows.prediction1.values)
    print(df.groupby("value")[["prediction1", "prediction2"]].mean())
Beispiel #14
0
def test_inequalities():
    """
    Check that (<) and (>) inequalities shift predictions as expected.

    Binders (peptides starting with "A") and non-binders (starting with "C")
    are fit three times: with all measurements as equalities, with binders as
    upper bounds (<), and with non-binders as lower bounds (>). The mean
    predictions of the three fits are then compared.
    """
    # Memorize the dataset.
    hyperparameters = {
        "loss": "custom:mse_with_inequalities",
        "peptide_amino_acid_encoding": "one-hot",
        "activation": "tanh",
        "layer_sizes": [16],
        "max_epochs": 50,
        "minibatch_size": 32,
        "random_negative_rate": 0.0,
        "early_stopping": False,
        "validation_split": 0.0,
        "locally_connected_layers": [
            {
                "filters": 8,
                "activation": "tanh",
                "kernel_size": 3
            }
        ],
        "dense_layer_l1_regularization": 0.0,
        "dropout_probability": 0.0,
    }

    df = pandas.DataFrame()
    df["peptide"] = random_peptides(1000, length=9)

    # First half are binders
    df["binder"] = df.index < len(df) / 2
    df["value"] = df.binder.map({True: 100, False: 5000})
    df.loc[:10, "value"] = 1.0  # some strong binders
    df["inequality1"] = "="
    df["inequality2"] = df.binder.map({True: "<", False: "="})
    df["inequality3"] = df.binder.map({True: "=", False: ">"})

    # "A" at start of peptide indicates strong binder
    df["peptide"] = [
        ("A" if is_binder else "C") + sequence[1:]
        for (is_binder, sequence) in zip(df.binder, df.peptide)
    ]

    # Prediction1 uses no inequalities (i.e. all are (=))
    # Prediction2 has a (<) inequality on binders and an (=) on non-binders
    # Prediction3 has a (=) inequality on binders and an (>) on non-binders
    column_pairs = (
        ("inequality1", "prediction1"),
        ("inequality2", "prediction2"),
        ("inequality3", "prediction3"),
    )
    for (inequality_column, prediction_column) in column_pairs:
        predictor = Class1NeuralNetwork(**hyperparameters)
        predictor.fit(
            df.peptide.values,
            df.value.values,
            inequalities=df[inequality_column].values,
            verbose=0)
        df[prediction_column] = predictor.predict(df.peptide.values)

    df_binders = df.loc[df.binder]
    df_nonbinders = df.loc[~df.binder]

    print("***** Binders: *****")
    print(df_binders.head(5))

    print("***** Non-binders: *****")
    print(df_nonbinders.head(5))

    binder_mean1 = df_binders.prediction1.mean()
    binder_mean2 = df_binders.prediction2.mean()
    binder_mean3 = df_binders.prediction3.mean()
    nonbinder_mean1 = df_nonbinders.prediction1.mean()
    nonbinder_mean2 = df_nonbinders.prediction2.mean()
    nonbinder_mean3 = df_nonbinders.prediction3.mean()

    # Binders should always be given tighter predicted affinity than non-binders
    assert_less(binder_mean1, nonbinder_mean1)
    assert_less(binder_mean2, nonbinder_mean2)
    assert_less(binder_mean3, nonbinder_mean3)

    # prediction2 binders should be tighter on average than prediction1
    # binders, since prediction2 has a (<) inequality for binders.
    # Non-binders should be about the same between prediction2 and prediction1
    assert_less(binder_mean2, binder_mean1)
    assert_almost_equal(nonbinder_mean2, nonbinder_mean1, delta=3000)

    # prediction3 non-binders should be weaker on average than prediction2 (or 1)
    # non-binders, since prediction3 has a (>) inequality for these peptides.
    # Binders should be about the same.
    assert_greater(nonbinder_mean3, nonbinder_mean2)
    assert_greater(nonbinder_mean3, nonbinder_mean1)
    assert_almost_equal(binder_mean3, binder_mean1, delta=3000)