def test_mutant_amino_acids_in_mm10_chr9_82927102_refGT_altTG_pT441H():
    # The corresponding test in the Isovar repository is awkward: its VCF
    # lists only the G>T variant and omits the adjacent T>G nucleotide change.
    # To sidestep variant phasing, the VCF used by vaxrank was changed to
    # contain a single GT>TG variant instead.
    cli_args = make_variant_sequences_arg_parser().parse_args([
        "--vcf", data_path("b16.f10/b16.f10.Phip.vcf"),
        "--bam", data_path("b16.f10/b16.combined.sorted.bam"),
    ])
    # The second element of the returned pair is not needed by this test.
    ranked, _ = ranked_vaccine_peptides(
        reads_generator=allele_reads_generator_from_args(cli_args),
        mhc_predictor=RandomBindingPredictor(["H-2-Kb", "H-2-Db"]),
        vaccine_peptide_length=15,
        padding_around_mutation=5,
        min_alt_rna_reads=1,
        min_variant_sequence_coverage=1,
        variant_sequence_assembly=True,
        max_vaccine_peptides_per_variant=1)

    # Only the first vaccine peptide of each variant is checked.
    for variant, peptides in ranked:
        check_mutant_amino_acids(
            variant, peptides[0].mutant_protein_fragment)
def test_keep_top_k_epitopes():
    # Verifies that the first vaccine peptide of every variant carries exactly
    # `top_k` mutant epitope predictions, and that its mutant_epitope_score
    # equals the sum of their logistic epitope scores.
    top_k = 3
    cli_args = make_variant_sequences_arg_parser().parse_args([
        "--vcf", data_path("b16.f10/b16.f10.Phip.vcf"),
        "--bam", data_path("b16.f10/b16.combined.sorted.bam"),
    ])

    ranked, _ = ranked_vaccine_peptides(
        reads_generator=allele_reads_generator_from_args(cli_args),
        mhc_predictor=RandomBindingPredictor(["H-2-Kb", "H-2-Db"]),
        vaccine_peptide_length=15,
        padding_around_mutation=5,
        min_alt_rna_reads=1,
        min_variant_sequence_coverage=1,
        variant_sequence_assembly=True,
        max_vaccine_peptides_per_variant=1,
        num_mutant_epitopes_to_keep=top_k)

    for _variant, peptides in ranked:
        best_peptide = peptides[0]
        predictions = best_peptide.mutant_epitope_predictions
        eq_(top_k, len(predictions))
        # Recompute the expected score by hand to confirm the top-k argument
        # passed to ranked_vaccine_peptides() propagated as expected.
        expected_score = sum(
            prediction.logistic_epitope_score() for prediction in predictions)
        assert_almost_equal(expected_score, best_peptide.mutant_epitope_score)
def test_mutant_amino_acids_in_mm10_chrX_8125624_refC_altA_pS460I():
    # The RNAseq data contains two co-occurring variants, but because they do
    # not fall in the same codon the Varcode annotation is treated as correct.
    # TODO: deal with phasing of variants explicitly so that both
    # variant positions are considered mutated
    cli_args = make_variant_sequences_arg_parser().parse_args([
        "--vcf", data_path("b16.f10/b16.f10.Wdr13.vcf"),
        "--bam", data_path("b16.f10/b16.combined.sorted.bam"),
    ])
    # The second element of the returned pair is not needed by this test.
    ranked, _ = ranked_vaccine_peptides(
        reads_generator=allele_reads_generator_from_args(cli_args),
        mhc_predictor=RandomBindingPredictor(["H-2-Kb", "H-2-Db"]),
        vaccine_peptide_length=15,
        padding_around_mutation=5,
        max_vaccine_peptides_per_variant=1,
        min_alt_rna_reads=1,
        min_variant_sequence_coverage=1,
        variant_sequence_assembly=True)

    for variant, peptides in ranked:
        # Exactly one peptide per variant was requested above.
        peptide_count = len(peptides)
        eq_(
            1,
            peptide_count,
            "Expected 1 vaccine peptide for variant '%s' but got %d" % (
                variant, peptide_count))
        check_mutant_amino_acids(variant, peptides[0].mutant_protein_fragment)
Beispiel #4
0
def test_mutant_amino_acids_in_mm10_chr9_82927102_refGT_altTG_pT441H():
    # The corresponding test in the Isovar repository is awkward: its VCF
    # lists only the G>T variant and omits the adjacent T>G nucleotide change.
    # To sidestep variant phasing, the VCF used by vaxrank was changed to
    # contain a single GT>TG variant instead.
    cli_args = make_variant_sequences_arg_parser().parse_args([
        "--vcf",
        data_path("b16.f10/b16.f10.Phip.vcf"),
        "--bam",
        data_path("b16.f10/b16.combined.sorted.bam"),
    ])
    # NOTE(review): the result is iterated directly as (variant, peptides)
    # pairs; some callers of ranked_vaccine_peptides unpack a 2-tuple from it
    # instead -- confirm which return shape applies to this version.
    ranked = ranked_vaccine_peptides(
        reads_generator=allele_reads_generator_from_args(cli_args),
        mhc_predictor=RandomBindingPredictor(["H-2-Kb", "H-2-Db"]),
        vaccine_peptide_length=15,
        padding_around_mutation=5,
        min_alt_rna_reads=1,
        min_variant_sequence_coverage=1,
        variant_sequence_assembly=True,
        max_vaccine_peptides_per_variant=1)

    # Only the first vaccine peptide of each variant is checked.
    for variant, peptides in ranked:
        check_mutant_amino_acids(
            variant, peptides[0].mutant_protein_fragment)
Beispiel #5
0
def test_mutant_amino_acids_in_mm10_chrX_8125624_refC_altA_pS460I():
    # The RNAseq data contains two co-occurring variants, but because they do
    # not fall in the same codon the Varcode annotation is treated as correct.
    # TODO: deal with phasing of variants explicitly so that both
    # variant positions are considered mutated
    cli_args = make_variant_sequences_arg_parser().parse_args([
        "--vcf",
        data_path("b16.f10/b16.f10.Wdr13.vcf"),
        "--bam",
        data_path("b16.f10/b16.combined.sorted.bam"),
    ])
    # NOTE(review): the result is iterated directly as (variant, peptides)
    # pairs; some callers of ranked_vaccine_peptides unpack a 2-tuple from it
    # instead -- confirm which return shape applies to this version.
    ranked = ranked_vaccine_peptides(
        reads_generator=allele_reads_generator_from_args(cli_args),
        mhc_predictor=RandomBindingPredictor(["H-2-Kb", "H-2-Db"]),
        vaccine_peptide_length=15,
        padding_around_mutation=5,
        max_vaccine_peptides_per_variant=1,
        min_alt_rna_reads=1,
        min_variant_sequence_coverage=1,
        variant_sequence_assembly=True)

    for variant, peptides in ranked:
        # Exactly one peptide per variant was requested above.
        peptide_count = len(peptides)
        eq_(
            1,
            peptide_count,
            "Expected 1 vaccine peptide for variant '%s' but got %d" % (
                variant, peptide_count))
        check_mutant_amino_acids(variant, peptides[0].mutant_protein_fragment)
Beispiel #6
0
def test_keep_top_k_epitopes():
    # Verifies that the first vaccine peptide of every variant carries exactly
    # `top_k` mutant epitope predictions, and that its mutant_epitope_score
    # equals the sum of their logistic epitope scores.
    top_k = 3
    cli_args = make_variant_sequences_arg_parser().parse_args([
        "--vcf",
        data_path("b16.f10/b16.f10.Phip.vcf"),
        "--bam",
        data_path("b16.f10/b16.combined.sorted.bam"),
    ])

    ranked, _ = ranked_vaccine_peptides(
        reads_generator=allele_reads_generator_from_args(cli_args),
        mhc_predictor=RandomBindingPredictor(["H-2-Kb", "H-2-Db"]),
        vaccine_peptide_length=15,
        padding_around_mutation=5,
        min_alt_rna_reads=1,
        min_variant_sequence_coverage=1,
        variant_sequence_assembly=True,
        max_vaccine_peptides_per_variant=1,
        num_mutant_epitopes_to_keep=top_k)

    for _variant, peptides in ranked:
        best_peptide = peptides[0]
        predictions = best_peptide.mutant_epitope_predictions
        eq_(top_k, len(predictions))
        # Recompute the expected score by hand to confirm the top-k argument
        # passed to ranked_vaccine_peptides() propagated as expected.
        expected_score = sum(
            prediction.logistic_epitope_score() for prediction in predictions)
        assert_almost_equal(expected_score, best_peptide.mutant_epitope_score)
Prints number of reads supporting ref, alt, and other alleles at variant loci.
"""

from __future__ import division, absolute_import
import logging
import logging.config
import pkg_resources

from isovar.cli.rna_reads import (make_rna_reads_arg_parser,
                                  allele_reads_generator_from_args)
from isovar.allele_counts import allele_counts_dataframe

# Configure logging from the logging.conf resource located via the
# 'isovar.cli' package.
logging.config.fileConfig(
    pkg_resources.resource_filename('isovar.cli', 'logging.conf'))
logger = logging.getLogger(__name__)

# Start from the RNA-reads argument parser and add the output destination.
parser = make_rna_reads_arg_parser()
parser.add_argument(
    "--output",
    default="isovar-allele-counts-result.csv",
    # The file written below holds the allele counts dataframe, not read
    # sequences, so the help text describes it as such.
    help="Name of CSV file which contains allele read counts")

if __name__ == "__main__":
    args = parser.parse_args()
    logger.info(args)
    # Build the reads generator from the parsed args, turn it into an
    # allele-counts dataframe, then log it and write it to the --output path.
    variants_and_allele_reads_generator = allele_reads_generator_from_args(
        args)
    allele_counts_df = allele_counts_dataframe(
        variants_and_allele_reads_generator)
    logger.info(allele_counts_df)
    allele_counts_df.to_csv(args.output)
Beispiel #8
0
def ranked_variant_list_with_metadata(args):
    """
    Computes all the data needed for report generation.

    If `args` has an `input_json_file` attribute, the data is loaded from that
    JSON file (saved by a previous run) instead of being recomputed; the saved
    args are overridden with the args of this run and the variant list is
    truncated to `args.max_mutations_in_report` entries.

    Otherwise the ranked vaccine peptides are computed from the user args and,
    if `args.output_json_file` is set, the resulting data is also written to
    that file as JSON.

    Parameters
    ----------
    args : Namespace
      Parsed user args from this run

    Returns a dictionary containing 3 items:
    - 'variants': ranked variant/vaccine peptide list
    - 'args': a dictionary of command-line arguments used to generate it
    - 'patient_info': patient info object
    """
    if hasattr(args, 'input_json_file'):
        # read the whole file first so it is closed before post-processing
        with open(args.input_json_file) as f:
            data = serializable.from_json(f.read())
        # the JSON data from the previous run will have the older args saved,
        # which may need to be overridden with args from this run (which
        # should all be output related)
        data['args'].update(vars(args))

        # truncate the variant list based on max_mutations_in_report; a limit
        # of None means "no limit", matching the slicing behavior used for
        # freshly computed results below (comparing an int with None would
        # raise TypeError on Python 3)
        max_mutations = args.max_mutations_in_report
        if max_mutations is not None and len(data['variants']) > max_mutations:
            data['variants'] = data['variants'][:max_mutations]
        return data

    # get various things from user args
    mhc_alleles = mhc_alleles_from_args(args)
    logger.info("MHC alleles: %s", mhc_alleles)
    variants = variant_collection_from_args(args)
    logger.info("Variants: %s", variants)
    # generator that for each variant gathers all RNA reads, both those
    # supporting the variant and reference alleles
    reads_generator = allele_reads_generator_from_args(args)
    mhc_predictor = mhc_binding_predictor_from_args(args)

    ranked_list, variants_count_dict = ranked_vaccine_peptides(
        reads_generator=reads_generator,
        mhc_predictor=mhc_predictor,
        vaccine_peptide_length=args.vaccine_peptide_length,
        padding_around_mutation=args.padding_around_mutation,
        max_vaccine_peptides_per_variant=(
            args.max_vaccine_peptides_per_mutation),
        min_alt_rna_reads=args.min_alt_rna_reads,
        min_variant_sequence_coverage=args.min_variant_sequence_coverage,
        min_epitope_score=args.min_epitope_score,
        num_mutant_epitopes_to_keep=args.num_epitopes_per_peptide,
        variant_sequence_assembly=args.variant_sequence_assembly)

    # slicing with a None limit keeps the whole list
    ranked_list_for_report = ranked_list[:args.max_mutations_in_report]

    patient_info = PatientInfo(
        patient_id=args.output_patient_id,
        vcf_paths=variants.sources,
        bam_path=args.bam,
        mhc_alleles=mhc_alleles,
        num_somatic_variants=len(variants),
        num_coding_effect_variants=variants_count_dict[
            'num_coding_effect_variants'],
        num_variants_with_rna_support=variants_count_dict[
            'num_variants_with_rna_support'],
        num_variants_with_vaccine_peptides=variants_count_dict[
            'num_variants_with_vaccine_peptides'])

    # return variants, patient info, and command-line args
    data = {
        'variants': ranked_list_for_report,
        'patient_info': patient_info,
        'args': vars(args),
    }
    logger.info('About to save args: %s', data['args'])

    # save JSON data if necessary. as of time of writing, vaxrank takes ~25 min to run,
    # most of which is core logic. the formatting is super fast, and it can
    # be useful to save the data to be able to iterate just on the formatting
    if args.output_json_file:
        with open(args.output_json_file, 'w') as f:
            f.write(serializable.to_json(data))
        # log only once the file has been closed
        logger.info('Wrote JSON report data to %s', args.output_json_file)

    return data
Beispiel #9
0
def ranked_variant_list_with_metadata(args):
    """
    Computes all the data needed for report generation.

    If `args` has an `input_json_file` attribute, the data is loaded from that
    JSON file (saved by a previous run) instead of being recomputed; the saved
    args are overridden with the args of this run and the variant list is
    truncated to `args.max_mutations_in_report` entries.

    Otherwise the ranked vaccine peptides are computed from the user args and,
    if `args.output_json_file` is set, the resulting data is also written to
    that file as JSON.

    Parameters
    ----------
    args : Namespace
      Parsed user args from this run

    Returns a dictionary containing 3 items:
    - 'variants': ranked variant/vaccine peptide list
    - 'args': a dictionary of command-line arguments used to generate it
    - 'patient_info': patient info object
    """
    if hasattr(args, 'input_json_file'):
        # read the whole file first so it is closed before post-processing
        with open(args.input_json_file) as f:
            data = serializable.from_json(f.read())
        # the JSON data from the previous run will have the older args saved,
        # which may need to be overridden with args from this run (which
        # should all be output related)
        data['args'].update(vars(args))

        # truncate the variant list based on max_mutations_in_report; a limit
        # of None means "no limit", matching the slicing behavior used for
        # freshly computed results below (comparing an int with None would
        # raise TypeError on Python 3)
        max_mutations = args.max_mutations_in_report
        if max_mutations is not None and len(data['variants']) > max_mutations:
            data['variants'] = data['variants'][:max_mutations]
        return data

    # get various things from user args
    mhc_alleles = mhc_alleles_from_args(args)
    logger.info("MHC alleles: %s", mhc_alleles)
    variants = variant_collection_from_args(args)
    logger.info("Variants: %s", variants)
    # generator that for each variant gathers all RNA reads, both those
    # supporting the variant and reference alleles
    reads_generator = allele_reads_generator_from_args(args)
    mhc_predictor = mhc_binding_predictor_from_args(args)

    ranked_list, variants_count_dict = ranked_vaccine_peptides(
        reads_generator=reads_generator,
        mhc_predictor=mhc_predictor,
        vaccine_peptide_length=args.vaccine_peptide_length,
        padding_around_mutation=args.padding_around_mutation,
        max_vaccine_peptides_per_variant=args.max_vaccine_peptides_per_mutation,
        min_alt_rna_reads=args.min_alt_rna_reads,
        min_variant_sequence_coverage=args.min_variant_sequence_coverage,
        min_epitope_score=args.min_epitope_score,
        num_mutant_epitopes_to_keep=args.num_epitopes_per_peptide,
        variant_sequence_assembly=args.variant_sequence_assembly)

    # slicing with a None limit keeps the whole list
    ranked_list_for_report = ranked_list[:args.max_mutations_in_report]

    patient_info = PatientInfo(
        patient_id=args.output_patient_id,
        vcf_paths=variants.sources,
        bam_path=args.bam,
        mhc_alleles=mhc_alleles,
        num_somatic_variants=len(variants),
        num_coding_effect_variants=variants_count_dict['num_coding_effect_variants'],
        num_variants_with_rna_support=variants_count_dict['num_variants_with_rna_support'],
        num_variants_with_vaccine_peptides=variants_count_dict['num_variants_with_vaccine_peptides']
    )

    # return variants, patient info, and command-line args
    data = {
        'variants': ranked_list_for_report,
        'patient_info': patient_info,
        'args': vars(args),
    }
    logger.info('About to save args: %s', data['args'])

    # save JSON data if necessary. as of time of writing, vaxrank takes ~25 min to run,
    # most of which is core logic. the formatting is super fast, and it can
    # be useful to save the data to be able to iterate just on the formatting
    if args.output_json_file:
        with open(args.output_json_file, 'w') as f:
            f.write(serializable.to_json(data))
        # log only once the file has been closed
        logger.info('Wrote JSON report data to %s', args.output_json_file)

    return data
Beispiel #10
0
"""

from __future__ import division, absolute_import
import logging
import logging.config
import pkg_resources

from isovar.cli.rna_reads import (
    make_rna_reads_arg_parser,
    allele_reads_generator_from_args
)
from isovar.allele_counts import allele_counts_dataframe


# Configure logging from the logging.conf resource located via the
# 'isovar.cli' package.
logging.config.fileConfig(pkg_resources.resource_filename('isovar.cli', 'logging.conf'))
logger = logging.getLogger(__name__)

# Start from the RNA-reads argument parser and add the output destination.
parser = make_rna_reads_arg_parser()
parser.add_argument(
    "--output",
    default="isovar-allele-counts-result.csv",
    # The file written below holds the allele counts dataframe, not read
    # sequences, so the help text describes it as such.
    help="Name of CSV file which contains allele read counts")

if __name__ == "__main__":
    args = parser.parse_args()
    logger.info(args)
    # Build the reads generator from the parsed args, turn it into an
    # allele-counts dataframe, then log it and write it to the --output path.
    variants_and_allele_reads_generator = allele_reads_generator_from_args(args)
    allele_counts_df = allele_counts_dataframe(variants_and_allele_reads_generator)
    logger.info(allele_counts_df)
    allele_counts_df.to_csv(args.output)