# Example 1
    def predict_peptides(self, peptides):
        """
        Predict MHC affinity for the given peptides.

        Runs the underlying MHCflurry predictor once per allele in
        ``self.alleles`` and collects one BindingPrediction per
        (allele, peptide) pair.

        Returns a BindingPredictionCollection.
        """
        # Imported here rather than at module level so CLI applications
        # that never touch MHCflurry don't pay the import cost.
        from mhcflurry.encodable_sequences import EncodableSequences

        encoded = EncodableSequences.create(peptides)
        results = []
        for allele in self.alleles:
            df = self.predictor.predict_to_dataframe(encoded, allele=allele)
            for _, record in df.iterrows():
                # Older MHCflurry models may not report percentile ranks.
                if 'prediction_percentile' in record:
                    percentile = record.prediction_percentile
                else:
                    percentile = nan
                results.append(
                    BindingPrediction(
                        allele=allele,
                        peptide=record.peptide,
                        affinity=record.prediction,
                        percentile_rank=percentile,
                        prediction_method_name="mhcflurry",
                    ))
        return BindingPredictionCollection(results)
# Example 2
def test_speed_allele_specific(profile=False, num=DEFAULT_NUM_PREDICTIONS):
    """
    Benchmark allele-specific prediction speed.

    Times a cold first call, a batch of `num` random peptides, BLOSUM62
    encoding of 10k length-13 peptides, and repeated prediction on
    pre-encoded peptides. When `profile` is true, a cProfile.Profile is
    attached to each timed section.

    Returns a dict mapping section name to pstats.Stats (empty unless
    profiling was enabled).
    """
    global ALLELE_SPECIFIC_PREDICTOR
    starts = collections.OrderedDict()
    timings = collections.OrderedDict()
    profilers = collections.OrderedDict()

    predictor = ALLELE_SPECIFIC_PREDICTOR

    def start(section):
        starts[section] = time.time()
        if profile:
            prof = cProfile.Profile()
            profilers[section] = prof
            prof.enable()

    def end(section):
        timings[section] = time.time() - starts[section]
        if profile:
            profilers[section].disable()

    allele = "HLA-A*02:01"

    # Cold start: first prediction pays model/graph setup costs.
    start("first")
    predictor.predict(["SIINFEKL"], allele=allele)
    end("first")

    peptides = random_peptides(num)
    label = "pred_%d" % num
    start(label)
    predictor.predict(peptides, allele=allele)
    end(label)

    NUM2 = 10000
    peptides = EncodableSequences.create(random_peptides(NUM2, length=13))

    # Isolate the cost of encoding from the cost of prediction.
    label = "encode_blosum_%d" % NUM2
    start(label)
    peptides.variable_length_to_fixed_length_vector_encoding("BLOSUM62")
    end(label)

    label = "pred_already_encoded_%d" % NUM2
    start(label)
    predictor.predict(peptides, allele=allele)
    end(label)

    NUM_REPEATS = 100
    label = "pred_already_encoded_%d_%d_times" % (NUM2, NUM_REPEATS)
    start(label)
    for _ in range(NUM_REPEATS):
        predictor.predict(peptides, allele=allele)
    end(label)

    print("SPEED BENCHMARK")
    print("Results:\n%s" % str(pandas.Series(timings)))

    return {
        name: pstats.Stats(prof) for (name, prof) in profilers.items()
    }
# Example 3
def do_predictions_mhcflurry(work_item_dicts, constant_data=None):
    """
    Run MHCflurry affinity predictions for a list of work items.

    Each dict of work items should have keys: work_item_num, peptides, alleles

    Parameters
    ----------
    work_item_dicts : list of dict
        Work items, each with 'work_item_num', 'peptides', 'alleles' keys.
    constant_data : dict, optional
        Must contain 'args' and 'cols'. Defaults to the module-level
        GLOBAL_DATA when not given (the usual case on cluster workers).

    Returns
    -------
    list of (work_item_num, result) pairs, where result maps
    "<allele> affinity" to an array of predictions.
    """
    # This may run on the cluster in a way that misses all top level imports,
    # so we have to re-import everything here.
    import time
    from mhcflurry.encodable_sequences import EncodableSequences
    from mhcflurry import Class1AffinityPredictor

    if constant_data is None:
        constant_data = GLOBAL_DATA

    args = constant_data['args']

    assert args.predictor == "mhcflurry"
    assert constant_data['cols'] == ["affinity"]

    predictor = Class1AffinityPredictor.load(args.mhcflurry_models_dir)

    results = []
    for (i, d) in enumerate(work_item_dicts):
        work_item_num = d['work_item_num']
        peptides = d['peptides']
        alleles = d['alleles']

        print("Processing work item", i + 1, "of", len(work_item_dicts))
        result = {}
        results.append((work_item_num, result))
        start = time.time()
        # Encode once per work item; the encoding is reused for every allele.
        peptides = EncodableSequences.create(peptides)
        # Use a distinct loop variable: the original shadowed the outer
        # work-item index `i`, which was confusing and fragile.
        for (allele_num, allele) in enumerate(alleles):
            print("Processing allele %d / %d: %0.2f sec elapsed" %
                  (allele_num + 1, len(alleles), time.time() - start))
            for col in ["affinity"]:
                result["%s %s" % (allele, col)] = predictor.predict(
                    peptides=peptides,
                    allele=allele,
                    throw=False,  # don't raise on unsupported alleles
                    model_kwargs={
                        'batch_size': args.mhcflurry_batch_size,
                    }).astype(args.result_dtype)
        print("Done predicting in", time.time() - start, "sec")
    return results
def test_correlation(alleles=None,
                     num_peptides_per_length=1000,
                     lengths=(8, 9, 10),
                     debug=False):
    """
    Check that the allele-specific and pan-allele predictors agree.

    Generates random peptides of the given lengths, predicts with every
    predictor in PREDICTORS for each allele, and asserts that the mean
    log10-affinity correlation across alleles exceeds 0.65.

    Parameters
    ----------
    alleles : iterable of string, optional
        Alleles to test; defaults to those supported by all predictors.
    num_peptides_per_length : int
    lengths : sequence of int
        Peptide lengths to generate. (A tuple default avoids the
        mutable-default-argument pitfall of the original list.)
    debug : bool
        Drop into ipdb when a low-correlation allele is found.

    Returns
    -------
    pandas.DataFrame with columns "allele" and "correlation".
    """
    peptides = []
    for length in lengths:
        peptides.extend(random_peptides(num_peptides_per_length, length))

    # Cache encodings
    peptides = EncodableSequences.create(list(set(peptides)))

    if alleles is None:
        # Only test alleles every predictor supports.
        alleles = set.intersection(*[
            set(predictor.supported_alleles)
            for predictor in PREDICTORS.values()
        ])
    alleles = sorted(set(alleles))
    df = pandas.DataFrame(index=peptides.sequences)

    results_df = []
    for allele in alleles:
        for (name, predictor) in PREDICTORS.items():
            df[name] = predictor.predict(peptides, allele=allele)
        # Correlate in log space since affinities span orders of magnitude.
        correlation = numpy.corrcoef(numpy.log10(df["allele-specific"]),
                                     numpy.log10(df["pan-allele"]))[0, 1]
        results_df.append((allele, correlation))
        print(len(results_df), len(alleles), *results_df[-1])

        if correlation < 0.6:
            print("Warning: low correlation", allele)
            df["tightest"] = df.min(1)
            print(df.sort_values("tightest").iloc[:, :-1])
            if debug:
                import ipdb
                ipdb.set_trace()
            del df["tightest"]

    results_df = pandas.DataFrame(results_df,
                                  columns=["allele", "correlation"])
    print(results_df)

    print("Mean correlation", results_df.correlation.mean())
    assert_greater(results_df.correlation.mean(), 0.65)

    return results_df