Example #1
def run_pipeline(patient_id, score_epitopes):
    """Run the pipeline for this patient, and save the output to the DB as a
    Run."""
    hla_types = HLAType.query.with_entities(HLAType.allele,
        HLAType.mhc_class).filter_by(patient_id=patient_id).all()

    peptide_length = 31
    alleles = [normalize_hla_allele_name(allele)
               for allele, mhc_class in hla_types]

    vcf_df = get_vcf_df(patient_id)
    transcripts_df, vcf_df, variant_report = expand_transcripts(
        vcf_df,
        patient_id,
        min_peptide_length=peptide_length,
        max_peptide_length=peptide_length)

    scored_epitopes = score_epitopes(transcripts_df, alleles)
    imm = ImmunogenicityPredictor(alleles=alleles)
    scored_epitopes = imm.predict(scored_epitopes)

    # TODO(tavi) Make this expansion more robust. It breaks the IEDB predictor,
    # for example.
    short_transcripts_df = transcripts_df[['chr', 'pos', 'ref',
        'alt', 'TranscriptId']]
    scored_epitopes = merge(scored_epitopes, short_transcripts_df,
        on='TranscriptId', how='left')
    peptides = group_epitopes_dataframe(
        scored_epitopes, use_transcript_name=True)

    run = Run(patient_id=patient_id, output=dumps(peptides))
    db.session.add(run)
    # Commit so the Run is actually written; add() alone only stages it
    # (assumes no caller-level commit).
    db.session.commit()
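
A minimal sketch of how run_pipeline might be called. The Flask `app` object
and the predictor wiring are assumptions (not from the source); the predictor
call mirrors the PanBindingPredictor usage in Example #2.

# Hypothetical caller -- `app` and score_with_pan_predictor are illustrative.
def score_with_pan_predictor(transcripts_df, alleles):
    # assumed contract: return a DataFrame of epitopes scored for these alleles
    return PanBindingPredictor(alleles).predict(
        transcripts_df, mutation_window_size=9)

with app.app_context():  # run_pipeline uses db.session, so it needs an app context
    run_pipeline(patient_id="PT-001", score_epitopes=score_with_pan_predictor)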
Example #2
def generate_mutation_counts(
        mutation_files,
        hla_types,
        genes_expressed,
        max_peptide_length=31,
        skip_identifiers=frozenset(),
        output_file=None):
    """
    Returns dictionary that maps each patient ID to a tuple with six fields:
        - total number of mutated epitopes across all transcripts
        - number of mutated genes
        - number of mutated genes with MHC binding mutated epitope
        - number of mutated epitopes which are predicted to bind to an MHC
          allele
        - number of mutated genes with at least one immunogenic mutated
          epitope
        - number of mutated epitopes which are predicted to be immunogenic
          (MHC binder + non-self)
    """
    mutation_counts = OrderedDict()
    n = len(mutation_files)
    for i, (patient_id, vcf_df) in enumerate(mutation_files.items()):
        if patient_id in skip_identifiers:
            logging.info("Skipping patient ID %s", patient_id)
            continue
        hla_allele_names = hla_types[patient_id]
        logging.info(
            "Processing %s (#%d/%d) with HLA alleles %s",
            patient_id, i + 1, n, hla_allele_names)

        if not args.quiet:
            print(vcf_df)

        try:
            transcripts_df, raw_genomic_mutation_df, variant_report = (
                expand_transcripts(
                    vcf_df,
                    patient_id,
                    max_peptide_length=max_peptide_length))
        except KeyboardInterrupt:
            raise
        except Exception:
            logging.warning("Failed to apply mutations for %s", patient_id)
            raise

        # print each genetic mutation applied to each possible transcript
        # and either why it failed or what protein mutation resulted
        if not args.quiet:
            print_mutation_report(
                patient_id,
                variant_report,
                raw_genomic_mutation_df,
                transcripts_df)
            logging.info(
                "Calling MHC binding predictor for %s (#%d/%d)",
                patient_id, i + 1, n)

        def make_mhc_predictor():
            if args.netmhc_cons:
                return ConsensusBindingPredictor(hla_allele_names)
            else:
                return PanBindingPredictor(hla_allele_names)

        # If we want to read scored_epitopes from a CSV file, do that.
        if args.debug_scored_epitopes_csv:
            csv_file = args.debug_scored_epitopes_csv
            if isfile(csv_file):
                scored_epitopes = pd.read_csv(csv_file)
            else:
                mhc = make_mhc_predictor()
                scored_epitopes = mhc.predict(
                    transcripts_df, mutation_window_size=9)
                scored_epitopes.to_csv(csv_file)
        else:
            mhc = make_mhc_predictor()
            scored_epitopes = mhc.predict(
                transcripts_df, mutation_window_size=9)

        if not args.quiet:
            print(scored_epitopes)

        imm = ImmunogenicityPredictor(
            alleles=hla_allele_names,
            binding_threshold=args.binding_threshold)
        scored_epitopes = imm.predict(scored_epitopes)
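        # Round-trip through CSV before grouping; presumably this resets the
        # index and normalizes column dtypes (the original gives no reason).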
        scored_epitopes.to_csv("scored_epitopes.csv")
        scored_epitopes = pd.read_csv("scored_epitopes.csv")

        grouped = scored_epitopes.groupby(["Gene", "GeneMutationInfo"])
        n_coding_mutations = len(grouped)
        n_epitopes = 0
        n_ligand_mutations = 0
        n_ligands = 0
        n_immunogenic_mutations = 0
        n_immunogenic_epitopes = 0
        for (gene, mut), group in grouped:
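            # an epitope overlaps the mutation iff it starts before the
            # mutation's end and ends after the mutation's start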
            start_mask = group.EpitopeStart < group.MutationEnd
            stop_mask = group.EpitopeEnd > group.MutationStart
            mutated_subset = group[start_mask & stop_mask]
            # we might have duplicate epitopes from multiple transcripts, so
            # drop them
            n_curr_epitopes = len(mutated_subset.groupby(['Epitope']))
            n_epitopes += n_curr_epitopes
            below_threshold_mask = \
                mutated_subset.MHC_IC50 <= args.binding_threshold
            ligands = mutated_subset[below_threshold_mask]
            n_curr_ligands = len(ligands.groupby(['Epitope']))
            n_ligands += n_curr_ligands
            n_ligand_mutations += n_curr_ligands > 0
            thymic_deletion_mask = \
                np.array(ligands.ThymicDeletion).astype(bool)
            immunogenic_epitopes = ligands[~thymic_deletion_mask]
            curr_immunogenic_epitopes = immunogenic_epitopes.groupby(
                ['Epitope']).first()
            n_immunogenic_epitopes += len(curr_immunogenic_epitopes)
            n_immunogenic_mutations += len(curr_immunogenic_epitopes) > 0
            logging.info("%s %s: epitopes %d, ligands %d, imm %d",
                         gene,
                         mut,
                         n_curr_epitopes,
                         n_curr_ligands,
                         len(curr_immunogenic_epitopes))
        result_tuple = (
            n_coding_mutations,
            n_epitopes,
            n_ligand_mutations,
            n_ligands,
            n_immunogenic_mutations,
            n_immunogenic_epitopes,
        )
        if output_file:
            data_string = ",".join(str(d) for d in result_tuple)
            output_file.write("%s,%s\n" % (patient_id, data_string))
            output_file.flush()
        mutation_counts[patient_id] = result_tuple
    return mutation_counts
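
A minimal driver sketch for generate_mutation_counts, under stated
assumptions: load_vcf_dataframe, the paths, and the patient IDs are
hypothetical, and a module-level `args` namespace must already exist
because the function reads it implicitly.

# Hypothetical driver -- load_vcf_dataframe and all paths/IDs are assumptions.
from collections import OrderedDict

patient_ids = ["PT-001", "PT-002"]
mutation_files = OrderedDict(
    (pid, load_vcf_dataframe("vcfs/%s.vcf" % pid)) for pid in patient_ids)
hla_types = {
    "PT-001": ["HLA-A*02:01", "HLA-B*07:02"],
    "PT-002": ["HLA-A*01:01"],
}

with open("mutation_counts.csv", "w") as output_file:
    counts = generate_mutation_counts(
        mutation_files,
        hla_types,
        genes_expressed={},  # accepted but never used in the function body
        output_file=output_file)  # writes one "patient_id,<six counts>" row each
print(counts["PT-001"])  # the six-field tuple described in the docstring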