Ejemplo n.º 1
0
def variants_to_protein_sequences_dataframe(
        expressed_vcf="data/b16.f10/b16.expressed.vcf",
        not_expressed_vcf="data/b16.f10/b16.not-expressed.vcf",
        tumor_rna_bam="data/b16.f10/b16.combined.sorted.bam",
        min_mapping_quality=0,
        max_protein_sequences_per_variant=1,
        variant_sequence_assembly=False):
    """
    Build a DataFrame of variant protein sequences from two VCFs and a
    tumor RNA BAM.

    The variants of both VCFs are merged into a single VariantCollection,
    the reads of the BAM overlapping those variants are collected, and the
    protein sequences derived from those reads are turned into a DataFrame.

    Parameters
    ----------
    expressed_vcf : path given to load_vcf for the "expressed" variants
    not_expressed_vcf : path given to load_vcf for the "not expressed" variants
    tumor_rna_bam : path given to load_bam
    min_mapping_quality : forwarded to reads_overlapping_variants
    max_protein_sequences_per_variant : forwarded to
        reads_generator_to_protein_sequences_generator
    variant_sequence_assembly : forwarded to
        reads_generator_to_protein_sequences_generator

    Returns a tuple of (DataFrame, variants loaded from expressed_vcf,
    combined VariantCollection of both VCFs).
    """
    expressed_variants = load_vcf(expressed_vcf)
    unexpressed_variants = load_vcf(not_expressed_vcf)

    # expressed variants come first, followed by the not-expressed ones
    merged_variant_list = list(expressed_variants) + list(unexpressed_variants)
    combined_variants = VariantCollection(merged_variant_list)

    overlapping_reads = reads_overlapping_variants(
        variants=combined_variants,
        samfile=load_bam(tumor_rna_bam),
        min_mapping_quality=min_mapping_quality)

    protein_sequences = reads_generator_to_protein_sequences_generator(
        overlapping_reads,
        max_protein_sequences_per_variant=max_protein_sequences_per_variant,
        variant_sequence_assembly=variant_sequence_assembly)

    return (
        protein_sequences_generator_to_dataframe(protein_sequences),
        expressed_variants,
        combined_variants,
    )
Ejemplo n.º 2
0
    def isovar_protein_sequence_dict(self):
        """
        Return a dictionary mapping Variant objects to a single isovar protein sequence
        that will be used to try to construct VaccinePeptides.

        The dictionary is computed on the first call and stored on
        self._isovar_protein_sequence_dict; later calls return that cached object.
        Variants for which no protein sequences were produced are logged and left
        out of the dictionary.
        """
        if self._isovar_protein_sequence_dict is not None:
            return self._isovar_protein_sequence_dict

        # total number of amino acids is the vaccine peptide length plus the
        # number of off-center windows around the mutation
        protein_fragment_sequence_length = (
            self.vaccine_peptide_length + 2 * self.padding_around_mutation)

        # These sequences are only the ones that overlap the variant and support the mutation.
        # Right now, this generator yields:
        # - (variant, mutant protein sequences) if there's enough alt RNA support
        # - (variant, None) if the variant is silent or there are ref reads overlapping the
        #   variant locus but inadequate alt RNA support.
        # - does not return the variant if there's no RNA support for ref or alt - we may miss
        #   some coding variants this way unless we check for them explicitly
        #
        # Future intended behavior: returns all passing variants, with a protein sequences
        # generator that is non empty if there are enough alt RNA reads supporting the variant
        protein_sequences_generator = reads_generator_to_protein_sequences_generator(
            self.reads_generator,
            transcript_id_whitelist=None,
            protein_sequence_length=protein_fragment_sequence_length,
            min_alt_rna_reads=self.min_alt_rna_reads,
            min_variant_sequence_coverage=self.min_variant_sequence_coverage,
            variant_sequence_assembly=self.variant_sequence_assembly,
            max_protein_sequences_per_variant=1)

        # Fill a local dictionary and only publish it to the cache once the
        # generator has been fully consumed; otherwise an exception raised
        # part-way through would leave a partial result cached forever.
        variant_to_protein_sequence = {}
        for variant, isovar_protein_sequences in protein_sequences_generator:
            # `not` covers both the empty case and the None case described
            # in the note above (len(None) would raise TypeError).
            if not isovar_protein_sequences:
                # variant RNA support is below threshold
                logger.info("No protein sequences for %s", variant)
                continue

            # max_protein_sequences_per_variant=1 was requested above, so
            # presumably there is at most one entry; keep the first.
            variant_to_protein_sequence[variant] = isovar_protein_sequences[0]

        self._isovar_protein_sequence_dict = variant_to_protein_sequence
        return self._isovar_protein_sequence_dict
Ejemplo n.º 3
0
def test_variants_to_protein_sequences_dataframe_filtered_all_reads_by_mapping_quality():
    # Every read in the B16 BAM has MAPQ=255, so requiring a minimum mapping
    # quality of 256 should drop all of them and leave an empty DataFrame.
    overlapping_reads = reads_overlapping_variants(
        variants=load_vcf("data/b16.f10/b16.vcf"),
        samfile=load_bam("data/b16.f10/b16.combined.sorted.bam"),
        min_mapping_quality=256)
    df = protein_sequences_generator_to_dataframe(
        reads_generator_to_protein_sequences_generator(
            overlapping_reads,
            max_protein_sequences_per_variant=1))
    print(df)
    num_rows = len(df)
    eq_(num_rows, 0, "Expected 0 entries, got %d: %s" % (num_rows, df))
Ejemplo n.º 4
0
    def isovar_protein_sequence_dict(self):
        """
        Compute (once) and return a dictionary from Variant objects to the single isovar
        protein sequence that will be used to try to construct VaccinePeptides.

        The result is cached on self._isovar_protein_sequence_dict, so repeated calls
        return the same dictionary. Variants that come back without any protein
        sequence are logged and omitted.
        """
        if self._isovar_protein_sequence_dict is None:
            # total number of amino acids is the vaccine peptide length plus the
            # number of off-center windows around the mutation
            protein_fragment_sequence_length = (
                self.vaccine_peptide_length + 2 * self.padding_around_mutation)

            # These sequences are only the ones that overlap the variant and support the
            # mutation. Right now, this generator yields:
            # - (variant, mutant protein sequences) if there's enough alt RNA support
            # - (variant, None) if the variant is silent or there are ref reads overlapping
            #   the variant locus but inadequate alt RNA support.
            # - does not return the variant if there's no RNA support for ref or alt - we may
            #   miss some coding variants this way unless we check for them explicitly
            #
            # Future intended behavior: returns all passing variants, with a protein sequences
            # generator that is non empty if there are enough alt RNA reads supporting the
            # variant
            protein_sequences_generator = reads_generator_to_protein_sequences_generator(
                self.reads_generator,
                transcript_id_whitelist=None,
                protein_sequence_length=protein_fragment_sequence_length,
                min_alt_rna_reads=self.min_alt_rna_reads,
                min_variant_sequence_coverage=self.min_variant_sequence_coverage,
                variant_sequence_assembly=self.variant_sequence_assembly,
                max_protein_sequences_per_variant=1)

            # Accumulate into a local dict: assigning the cache attribute before
            # the loop would leave a partial dictionary cached if the generator
            # raised half-way, and later calls would silently return it.
            sequence_by_variant = {}
            for variant, isovar_protein_sequences in protein_sequences_generator:
                # Falsy covers an empty result as well as the None mentioned
                # in the note above, which len() could not handle.
                if not isovar_protein_sequences:
                    # variant RNA support is below threshold
                    logger.info("No protein sequences for %s", variant)
                    continue

                # Only one sequence per variant was requested
                # (max_protein_sequences_per_variant=1), so take the first.
                sequence_by_variant[variant] = isovar_protein_sequences[0]

            self._isovar_protein_sequence_dict = sequence_by_variant

        return self._isovar_protein_sequence_dict
Ejemplo n.º 5
0
def generate_vaccine_peptides(
        reads_generator,
        mhc_predictor,
        vaccine_peptide_length,
        padding_around_mutation,
        max_vaccine_peptides_per_variant,
        min_alt_rna_reads,
        min_variant_sequence_coverage,
        variant_sequence_assembly,
        min_epitope_score=0):
    """
    Map every variant yielded by the protein sequence generator to the
    result of vaccine_peptides_for_variant for that variant.

    Protein sequences are requested with a length of
    vaccine_peptide_length + 2 * padding_around_mutation and at most one
    protein sequence per variant.

    Returns a dictionary keyed by variant.
    """
    # the requested protein fragment is the vaccine peptide plus padding
    # on both sides of the mutation
    fragment_length = vaccine_peptide_length + 2 * padding_around_mutation

    variant_sequence_pairs = reads_generator_to_protein_sequences_generator(
        reads_generator,
        transcript_id_whitelist=None,
        protein_sequence_length=fragment_length,
        min_alt_rna_reads=min_alt_rna_reads,
        min_variant_sequence_coverage=min_variant_sequence_coverage,
        variant_sequence_assembly=variant_sequence_assembly,
        max_protein_sequences_per_variant=1)

    # arguments that are the same for every variant
    shared_kwargs = dict(
        mhc_predictor=mhc_predictor,
        vaccine_peptide_length=vaccine_peptide_length,
        padding_around_mutation=padding_around_mutation,
        max_vaccine_peptides_per_variant=max_vaccine_peptides_per_variant,
        min_epitope_score=min_epitope_score)

    return {
        variant: vaccine_peptides_for_variant(
            variant=variant,
            isovar_protein_sequences=protein_sequences,
            **shared_kwargs)
        for variant, protein_sequences in variant_sequence_pairs
    }
Ejemplo n.º 6
0
def generate_vaccine_peptides(
        reads_generator,
        mhc_predictor,
        vaccine_peptide_length,
        padding_around_mutation,
        max_vaccine_peptides_per_variant,
        min_alt_rna_reads,
        min_variant_sequence_coverage,
        variant_sequence_assembly,
        num_mutant_epitopes_to_keep=10000,
        min_epitope_score=0):
    """
    Generate vaccine peptides for every variant that has at least one
    protein sequence, and tally a few variant counts along the way.

    Returns a tuple of two values:
    - dictionary mapping each variant to the value returned by
      vaccine_peptides_for_variant (variants without protein sequences
      are not included)
    - defaultdict(int) of counts for report display, with the keys
      'num_coding_effect_variants', 'num_variants_with_rna_support' and
      'num_variants_with_vaccine_peptides' (a key only exists once it has
      been incremented)
    """
    # the requested protein fragment is the vaccine peptide plus padding
    # on both sides of the mutation
    fragment_length = vaccine_peptide_length + 2 * padding_around_mutation

    variant_sequence_pairs = reads_generator_to_protein_sequences_generator(
        reads_generator,
        transcript_id_whitelist=None,
        protein_sequence_length=fragment_length,
        min_alt_rna_reads=min_alt_rna_reads,
        min_variant_sequence_coverage=min_variant_sequence_coverage,
        variant_sequence_assembly=variant_sequence_assembly,
        max_protein_sequences_per_variant=1)

    peptides_by_variant = {}
    counts = defaultdict(int)
    for variant, sequences in variant_sequence_pairs:
        # counted for every yielded variant, before the RNA support check
        if len(variant.effects().drop_silent_and_noncoding()) > 0:
            counts['num_coding_effect_variants'] += 1

        sequences = list(sequences)
        if not sequences:
            # this means the variant RNA support is below threshold
            logger.info("No protein sequences for %s", variant)
            continue

        counts['num_variants_with_rna_support'] += 1
        # only the first protein sequence is used for the variant
        peptides = vaccine_peptides_for_variant(
            variant=variant,
            isovar_protein_sequence=sequences[0],
            mhc_predictor=mhc_predictor,
            vaccine_peptide_length=vaccine_peptide_length,
            padding_around_mutation=padding_around_mutation,
            max_vaccine_peptides_per_variant=max_vaccine_peptides_per_variant,
            num_mutant_epitopes_to_keep=num_mutant_epitopes_to_keep,
            min_epitope_score=min_epitope_score)

        # does at least one of this variant's vaccine peptides contain
        # mutant epitopes? (stops at the first one that does)
        if any(peptide.contains_mutant_epitopes() for peptide in peptides):
            counts['num_variants_with_vaccine_peptides'] += 1
        peptides_by_variant[variant] = peptides

    for count_name, count_value in counts.items():
        logger.info('%s: %d', count_name, count_value)

    return peptides_by_variant, counts