def test_mutant_amino_acids_in_mm10_chr9_82927102_refGT_altTG_pT441H():
    """
    Run the Vaxrank core logic on the Phip VCF and check the mutant amino
    acids of the first vaccine peptide reported for each variant.
    """
    # In the Isovar repository this test is weird because the VCF only
    # mentions the G>T variant but doesn't include the subsequent nucleotide
    # change T>G. To avoid having to think about phasing of variants, the
    # VCF in vaxrank was changed to contain a GT>TG variant.
    parsed_args = make_variant_sequences_arg_parser().parse_args([
        "--vcf", data_path("b16.f10/b16.f10.Phip.vcf"),
        "--bam", data_path("b16.f10/b16.combined.sorted.bam"),
    ])
    allele_reads = allele_reads_generator_from_args(parsed_args)
    variant_collection = variant_collection_from_args(parsed_args)
    core_logic_settings = dict(
        reads_generator=allele_reads,
        mhc_predictor=random_binding_predictor,
        variants=variant_collection,
        vaccine_peptide_length=15,
        padding_around_mutation=5,
        min_alt_rna_reads=1,
        min_variant_sequence_coverage=1,
        variant_sequence_assembly=True,
        max_vaccine_peptides_per_variant=1)
    pipeline = VaxrankCoreLogic(**core_logic_settings)

    for variant, peptides in pipeline.ranked_vaccine_peptides():
        # only the first peptide of each variant is inspected
        check_mutant_amino_acids(
            variant, peptides[0].mutant_protein_fragment)
def test_keep_top_k_epitopes():
    """
    Check that num_mutant_epitopes_to_keep limits the number of epitope
    predictions kept on each first vaccine peptide, and that the peptide's
    stored mutant_epitope_score matches the sum over those predictions.
    """
    keep_k_epitopes = 3
    parsed_args = make_variant_sequences_arg_parser().parse_args([
        "--vcf", data_path("b16.f10/b16.f10.Phip.vcf"),
        "--bam", data_path("b16.f10/b16.combined.sorted.bam"),
    ])
    allele_reads = allele_reads_generator_from_args(parsed_args)
    variant_collection = variant_collection_from_args(parsed_args)
    pipeline = VaxrankCoreLogic(
        reads_generator=allele_reads,
        mhc_predictor=random_binding_predictor,
        variants=variant_collection,
        vaccine_peptide_length=15,
        padding_around_mutation=5,
        min_alt_rna_reads=1,
        min_variant_sequence_coverage=1,
        variant_sequence_assembly=True,
        max_vaccine_peptides_per_variant=1,
        num_mutant_epitopes_to_keep=keep_k_epitopes)

    for _variant, peptides in pipeline.ranked_vaccine_peptides():
        first_peptide = peptides[0]
        eq_(keep_k_epitopes, len(first_peptide.mutant_epitope_predictions))
        # recompute the expected score from the kept predictions to make
        # sure the top-k argument propagated through ranked_vaccine_peptides()
        recomputed_score = sum(
            prediction.logistic_epitope_score()
            for prediction in first_peptide.mutant_epitope_predictions)
        assert_almost_equal(
            recomputed_score, first_peptide.mutant_epitope_score)
def test_mutant_amino_acids_in_mm10_chrX_8125624_refC_altA_pS460I():
    """
    Run the Vaxrank core logic on the Wdr13 VCF, require exactly one vaccine
    peptide per variant, and check that peptide's mutant amino acids.
    """
    # There are two co-occurring variants in the RNAseq data, but since
    # they don't happen in the same codon the Varcode annotation is
    # considered to be correct.
    # TODO: deal with phasing of variants explicitly so that both
    # variant positions are considered mutated
    parsed_args = make_variant_sequences_arg_parser().parse_args([
        "--vcf", data_path("b16.f10/b16.f10.Wdr13.vcf"),
        "--bam", data_path("b16.f10/b16.combined.sorted.bam"),
    ])
    allele_reads = allele_reads_generator_from_args(parsed_args)
    variant_collection = variant_collection_from_args(parsed_args)
    pipeline = VaxrankCoreLogic(
        variants=variant_collection,
        reads_generator=allele_reads,
        mhc_predictor=random_binding_predictor,
        vaccine_peptide_length=15,
        padding_around_mutation=5,
        max_vaccine_peptides_per_variant=1,
        min_alt_rna_reads=1,
        min_variant_sequence_coverage=1,
        variant_sequence_assembly=True)

    for variant, peptides in pipeline.ranked_vaccine_peptides():
        peptide_count = len(peptides)
        failure_message = (
            "Expected 1 vaccine peptide for variant '%s' but got %d" % (
                variant, peptide_count))
        eq_(1, peptide_count, failure_message)
        check_mutant_amino_acids(
            variant, peptides[0].mutant_protein_fragment)
def test_mutant_amino_acids_in_mm10_chr9_82927102_refGT_altTG_pT441H():
    """
    Build the Vaxrank core logic from the Phip VCF and the combined BAM,
    then check the mutant amino acids of each variant's first vaccine
    peptide.
    """
    # In the Isovar repository this test is weird because the VCF only
    # mentions the G>T variant but doesn't include the subsequent nucleotide
    # change T>G. To avoid having to think about phasing of variants, the
    # VCF in vaxrank was changed to contain a GT>TG variant.
    parser = make_variant_sequences_arg_parser()
    vcf_path = data_path("b16.f10/b16.f10.Phip.vcf")
    bam_path = data_path("b16.f10/b16.combined.sorted.bam")
    parsed_args = parser.parse_args(["--vcf", vcf_path, "--bam", bam_path])

    allele_reads = allele_reads_generator_from_args(parsed_args)
    variant_collection = variant_collection_from_args(parsed_args)
    pipeline = VaxrankCoreLogic(
        reads_generator=allele_reads,
        mhc_predictor=random_binding_predictor,
        variants=variant_collection,
        vaccine_peptide_length=15,
        padding_around_mutation=5,
        min_alt_rna_reads=1,
        min_variant_sequence_coverage=1,
        variant_sequence_assembly=True,
        max_vaccine_peptides_per_variant=1,
    )

    for variant, peptides in pipeline.ranked_vaccine_peptides():
        first_peptide = peptides[0]
        check_mutant_amino_acids(
            variant, first_peptide.mutant_protein_fragment)
Example #5
0
def predict_epitopes_from_args(args):
    """
    Build a TopiaryPredictor from parsed commandline arguments and run it
    over the variants those arguments describe.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed commandline arguments for Topiary

    Returns the result of TopiaryPredictor.predict_from_variants for the
    variants and the gene/transcript expression dictionaries loaded from
    `args`.
    """
    binding_model = mhc_binding_predictor_from_args(args)
    variant_collection = variant_collection_from_args(args)
    gene_expression = rna_gene_expression_dict_from_args(args)
    transcript_expression = rna_transcript_expression_dict_from_args(args)

    predictor_options = dict(
        mhc_model=binding_model,
        padding_around_mutation=args.padding_around_mutation,
        ic50_cutoff=args.ic50_cutoff,
        percentile_cutoff=args.percentile_cutoff,
        min_transcript_expression=args.rna_min_transcript_expression,
        min_gene_expression=args.rna_min_gene_expression,
        only_novel_epitopes=args.only_novel_epitopes,
        # the CLI flag is phrased as "skip errors", the predictor option
        # as "raise on error", hence the negation
        raise_on_error=not args.skip_variant_errors)
    topiary_predictor = TopiaryPredictor(**predictor_options)
    return topiary_predictor.predict_from_variants(
        variants=variant_collection,
        transcript_expression_dict=transcript_expression,
        gene_expression_dict=gene_expression)
def test_mutant_amino_acids_in_mm10_chrX_8125624_refC_altA_pS460I():
    """
    Build the Vaxrank core logic from the Wdr13 VCF and the combined BAM,
    assert a single vaccine peptide per variant, and check its mutant amino
    acids.
    """
    # There are two co-occurring variants in the RNAseq data, but since
    # they don't happen in the same codon the Varcode annotation is
    # considered to be correct.
    # TODO: deal with phasing of variants explicitly so that both
    # variant positions are considered mutated
    parser = make_variant_sequences_arg_parser()
    vcf_path = data_path("b16.f10/b16.f10.Wdr13.vcf")
    bam_path = data_path("b16.f10/b16.combined.sorted.bam")
    parsed_args = parser.parse_args(["--vcf", vcf_path, "--bam", bam_path])

    allele_reads = allele_reads_generator_from_args(parsed_args)
    variant_collection = variant_collection_from_args(parsed_args)
    pipeline = VaxrankCoreLogic(
        variants=variant_collection,
        reads_generator=allele_reads,
        mhc_predictor=random_binding_predictor,
        vaccine_peptide_length=15,
        padding_around_mutation=5,
        max_vaccine_peptides_per_variant=1,
        min_alt_rna_reads=1,
        min_variant_sequence_coverage=1,
        variant_sequence_assembly=True,
    )

    for variant, peptides in pipeline.ranked_vaccine_peptides():
        peptide_count = len(peptides)
        eq_(
            1,
            peptide_count,
            "Expected 1 vaccine peptide for variant '%s' but got %d" % (
                variant, peptide_count))
        only_peptide = peptides[0]
        check_mutant_amino_acids(
            variant, only_peptide.mutant_protein_fragment)
def test_keep_top_k_epitopes():
    """
    Verify that asking VaxrankCoreLogic to keep k mutant epitopes yields
    exactly k epitope predictions on each first vaccine peptide, and that
    the peptide's mutant_epitope_score is the sum of their logistic scores.
    """
    keep_k_epitopes = 3
    parser = make_variant_sequences_arg_parser()
    vcf_path = data_path("b16.f10/b16.f10.Phip.vcf")
    bam_path = data_path("b16.f10/b16.combined.sorted.bam")
    parsed_args = parser.parse_args(["--vcf", vcf_path, "--bam", bam_path])

    allele_reads = allele_reads_generator_from_args(parsed_args)
    variant_collection = variant_collection_from_args(parsed_args)
    pipeline = VaxrankCoreLogic(
        reads_generator=allele_reads,
        mhc_predictor=random_binding_predictor,
        variants=variant_collection,
        vaccine_peptide_length=15,
        padding_around_mutation=5,
        min_alt_rna_reads=1,
        min_variant_sequence_coverage=1,
        variant_sequence_assembly=True,
        max_vaccine_peptides_per_variant=1,
        num_mutant_epitopes_to_keep=keep_k_epitopes,
    )

    for _variant, peptides in pipeline.ranked_vaccine_peptides():
        first_peptide = peptides[0]
        eq_(keep_k_epitopes, len(first_peptide.mutant_epitope_predictions))
        # recompute the expected score from the kept predictions to make
        # sure the top-k argument propagated through ranked_vaccine_peptides()
        per_epitope_scores = (
            prediction.logistic_epitope_score()
            for prediction in first_peptide.mutant_epitope_predictions)
        assert_almost_equal(
            sum(per_epitope_scores), first_peptide.mutant_epitope_score)
Example #8
0
def variant_reads_generator_from_args(args):
    """
    Load the variants and the SAM/BAM file named by the parsed arguments and
    return whatever reads_supporting_variants produces for them.

    The read filters (duplicates, secondary alignments, mapping quality)
    are taken from `args`.
    """
    variant_collection = variant_collection_from_args(args)
    alignments = samfile_from_args(args)
    read_filters = dict(
        use_duplicate_reads=args.use_duplicate_reads,
        # the CLI flag asks to *drop* secondary alignments, while the
        # callee is told whether to *use* them
        use_secondary_alignments=not args.drop_secondary_alignments,
        min_mapping_quality=args.min_mapping_quality)
    return reads_supporting_variants(
        variants=variant_collection,
        samfile=alignments,
        **read_filters)
Example #9
0
def variant_reads_generator_from_args(args):
    """
    Return the result of reads_supporting_variants for the variants and
    alignment file described by the parsed commandline arguments.
    """
    call_kwargs = {
        "variants": variant_collection_from_args(args),
        "samfile": samfile_from_args(args),
        "use_duplicate_reads": args.use_duplicate_reads,
        # "drop" on the commandline is the inverse of "use" in the callee
        "use_secondary_alignments": not args.drop_secondary_alignments,
        "min_mapping_quality": args.min_mapping_quality,
    }
    return reads_supporting_variants(**call_kwargs)
Example #10
0
def read_evidence_generator_from_args(args):
    """
    Build the variant collection, alignment file and read collector from
    parsed arguments and return the collector's read evidence generator
    (described upstream as yielding (Variant, ReadEvidence) pairs).
    """
    variant_collection = variant_collection_from_args(args)
    alignment_file = alignment_file_from_args(args)
    collector = read_collector_from_args(args)
    evidence_generator = collector.read_evidence_generator(
        variants=variant_collection,
        alignment_file=alignment_file)
    return evidence_generator
Example #11
0
def run_isovar_from_parsed_args(args):
    """
    Derive every run_isovar input from the parsed arguments and return the
    result of run_isovar.
    """
    # Entries are listed in the order the helpers should be invoked; the
    # call below passes them by keyword, so their order is irrelevant there.
    isovar_inputs = {
        "variants": variant_collection_from_args(args),
        "read_collector": read_collector_from_args(args),
        "alignment_file": alignment_file_from_args(args),
        "protein_sequence_creator": protein_sequence_creator_from_args(args),
        "filter_thresholds": filter_threshold_dict_from_args(args),
    }
    return run_isovar(**isovar_inputs)
Example #12
0
def main(args_list=None):
    """
    Print a text summary of the variants given on the commandline and, when
    args.output_text is set, also write that summary to the named file.

    Parameters
    ----------
    args_list : list of str, optional
        Commandline arguments; defaults to sys.argv[1:].
    """
    if args_list is None:
        args_list = sys.argv[1:]
    args = parser.parse_args(args_list)

    variants = variant_collection_from_args(args)
    # keep only the top priority coding, non-silent effect of each variant
    top_coding_effects = (
        variants.effects()
        .drop_silent_and_noncoding()
        .top_priority_effect_per_variant())

    count_line = "%30s: %5d\n"
    pieces = [
        count_line % ("total variants", len(variants)),
        count_line % ("# SNVs", sum(v.is_snv for v in variants)),
        count_line % ("# indels", sum(v.is_indel for v in variants)),
        count_line % (
            "coding non-synonymous variants", len(top_coding_effects)),
        "===\n\n",
    ]

    # one report section per gene set of interest
    gene_sections = (
        ("\nCoding variants in known cancer genes:\n",
         cancer_driver_gene_id_set),
        ("\nCoding variants in MHC-I presentation genes:\n",
         class1_mhc_gene_id_set),
        ("\nCoding variants in interferon response genes:\n",
         interferon_response_gene_id_set),
    )
    for section_header, gene_ids in gene_sections:
        pieces.append(section_header)
        pieces.extend(
            "-- %s %s (%s)\n" % (
                effect.gene_name,
                effect.short_description,
                variant.short_description)
            for variant, effect in top_coding_effects.items()
            if effect.gene_id in gene_ids)

    text = "".join(pieces)
    print(text)

    if args.output_text:
        with open(args.output_text, "w") as f:
            f.write(text)
Example #13
0
def run(args_list=None):
    """
    Generate protein sequences (optionally cut into peptides) for the chosen
    reference genome and variants, add decoys, and write everything as FASTA
    records to args.output.

    Parameters
    ----------
    args_list : list of str, optional
        Commandline arguments; defaults to argv[1:].
    """
    if args_list is None:
        args_list = argv[1:]
    args = parser.parse_args(args_list)
    print("MS-MHC version %s" % __version__)

    # fall back to "grch37" when no genome was given
    reference_genome = genome_for_reference_name(args.genome or "grch37")
    print("Using reference genome %s" % reference_genome)

    has_variant_input = (
        args.vcf or args.maf or args.variant or args.json_variants)
    variants = variant_collection_from_args(args) if has_variant_input else []

    generated_sequences = generate_protein_sequences(
        genome=reference_genome,
        variants=variants,
        upstream_reading_frames=args.upstream_reading_frames,
        downstream_reading_frames=args.downstream_reading_frames,
        skip_exons=args.skip_exons,
        min_peptide_length=args.min_peptide_length,
        restrict_sources_to_gene_name=args.gene_name)

    if not args.extract_peptides:
        # make sure we don't have repeated protein sequences: group the
        # generated objects by their amino acid string
        sequence_dict = defaultdict(list)
        for generated in generated_sequences:
            sequence_dict[generated.amino_acids].append(generated)
    else:
        print("Extracting %dmer-%dmer peptides from generated sequences" % (
            args.min_peptide_length, args.max_peptide_length))
        sequence_dict = extract_peptides(
            generated_sequences,
            min_length=args.min_peptide_length,
            max_length=args.max_peptide_length)

    hits = collapse_peptide_sources(sequence_dict)
    decoy_count = len(hits) * args.num_decoys_per_hit
    decoys = generate_decoys(
        hits,
        n_decoys=decoy_count,
        random_seed=args.random_seed)

    fasta_records = hits + decoys
    print("Writing %d FASTA records (%d hits, %d decoys)" % (
        len(fasta_records), len(hits), len(decoys)))

    with open(args.output, "w") as fasta_file:
        for record in progressbar(fasta_records):
            record.write_to_fasta_file(fasta_file)
    print("Done.")
Example #14
0
def ranked_variant_list_with_metadata(args):
    """
    Computes all the data needed for report generation.

    When `args` carries an `input_json_file` attribute, the data is loaded
    from that file instead of being recomputed; otherwise the ranking is
    run from the user args.

    Parameters
    ----------
    args : Namespace
      Parsed user args from this run

    Returns a dictionary containing 3 items:
    - 'variants': ranked variant/vaccine peptide list
    - 'args': a dictionary of command-line arguments used to generate it
    - 'patient_info': patient info object
    """
    if hasattr(args, 'input_json_file'):
        with open(args.input_json_file) as json_file:
            data = serializable.from_json(json_file.read())
            # the JSON data from the previous run has the older args saved;
            # override them with the args from this run (which should all
            # be output related)
            data['args'].update(vars(args))

            # truncate the cached variant list if it is longer than
            # max_mutations_in_report
            cached_variants = data['variants']
            if len(cached_variants) > args.max_mutations_in_report:
                data['variants'] = cached_variants[
                    :args.max_mutations_in_report]
            return data

    # get various things from user args
    mhc_alleles = mhc_alleles_from_args(args)
    logger.info("MHC alleles: %s", mhc_alleles)
    variants = variant_collection_from_args(args)
    logger.info("Variants: %s", variants)
    # generator that for each variant gathers all RNA reads, both those
    # supporting the variant and reference alleles
    allele_reads = allele_reads_generator_from_args(args)
    binding_predictor = mhc_binding_predictor_from_args(args)

    ranking_options = dict(
        reads_generator=allele_reads,
        mhc_predictor=binding_predictor,
        vaccine_peptide_length=args.vaccine_peptide_length,
        padding_around_mutation=args.padding_around_mutation,
        max_vaccine_peptides_per_variant=(
            args.max_vaccine_peptides_per_mutation),
        min_alt_rna_reads=args.min_alt_rna_reads,
        min_variant_sequence_coverage=args.min_variant_sequence_coverage,
        min_epitope_score=args.min_epitope_score,
        num_mutant_epitopes_to_keep=args.num_epitopes_per_peptide,
        variant_sequence_assembly=args.variant_sequence_assembly)
    ranked_list, counts = ranked_vaccine_peptides(**ranking_options)

    report_variants = ranked_list[:args.max_mutations_in_report]

    patient_info = PatientInfo(
        patient_id=args.output_patient_id,
        vcf_paths=variants.sources,
        bam_path=args.bam,
        mhc_alleles=mhc_alleles,
        num_somatic_variants=len(variants),
        num_coding_effect_variants=counts['num_coding_effect_variants'],
        num_variants_with_rna_support=counts[
            'num_variants_with_rna_support'],
        num_variants_with_vaccine_peptides=counts[
            'num_variants_with_vaccine_peptides'])

    # return variants, patient info, and command-line args
    data = dict([
        ('variants', report_variants),
        ('patient_info', patient_info),
        ('args', vars(args)),
    ])
    logger.info('About to save args: %s', data['args'])

    # save JSON data if necessary. as of time of writing, vaxrank takes
    # ~25 min to run, most of which is core logic. the formatting is super
    # fast, and it can be useful to save the data to be able to iterate
    # just on the formatting
    json_output_path = args.output_json_file
    if json_output_path:
        with open(json_output_path, 'w') as json_file:
            json_file.write(serializable.to_json(data))
            logger.info(
                'Wrote JSON report data to %s', args.output_json_file)

    return data
Example #15
0
def ranked_vaccine_peptides_with_metadata_from_parsed_args(args):
    """
    Computes all the data needed for report generation.

    When `args` carries an `input_json_file` attribute, the data is loaded
    from that file instead of re-running Vaxrank. Otherwise Vaxrank is run
    and, if `args.output_passing_variants_csv` is set, the variant
    properties are also written to that CSV file.

    Parameters
    ----------
    args : Namespace
      Parsed user args from this run

    Returns a dictionary containing 3 items:
    - 'variants': ranked variant/vaccine peptide list
    - 'args': a dictionary of command-line arguments used to generate it
    - 'patient_info': patient info object
    """
    if hasattr(args, 'input_json_file'):
        with open(args.input_json_file) as json_file:
            data = serializable.from_json(json_file.read())
            # the JSON data from the previous run has the older args saved;
            # override them with the args from this run (which should all
            # be output related)
            data['args'].update(vars(args))

            # truncate the cached variant list if it is longer than
            # max_mutations_in_report
            cached_variants = data['variants']
            if len(cached_variants) > args.max_mutations_in_report:
                data['variants'] = cached_variants[
                    :args.max_mutations_in_report]
            return data

    # get various things from user args
    mhc_alleles = mhc_alleles_from_args(args)
    logger.info("MHC alleles: %s", mhc_alleles)

    variants = variant_collection_from_args(args)
    logger.info("Variants: %s", variants)

    vaxrank_results = run_vaxrank_from_parsed_args(args)

    counts = vaxrank_results.variant_counts()
    # sanity check: the results must cover every variant that was loaded
    assert len(variants) == counts['num_total_variants'], \
        "Len(variants) is %d but variants_count_dict came back with %d" % (
            len(variants), counts['num_total_variants'])

    if args.output_passing_variants_csv:
        variant_rows = vaxrank_results.variant_properties(
            gene_pathway_check=GenePathwayCheck())
        pd.DataFrame(variant_rows).to_csv(
            args.output_passing_variants_csv, index=False)

    all_ranked = vaxrank_results.ranked_vaccine_peptides
    ranked_for_report = all_ranked[:args.max_mutations_in_report]

    patient_info = PatientInfo(
        patient_id=args.output_patient_id,
        vcf_paths=variants.sources,
        bam_path=args.bam,
        mhc_alleles=mhc_alleles,
        num_somatic_variants=counts['num_total_variants'],
        num_coding_effect_variants=counts['num_coding_effect_variants'],
        num_variants_with_rna_support=counts[
            'num_variants_with_rna_support'],
        num_variants_with_vaccine_peptides=counts[
            'num_variants_with_vaccine_peptides'])

    # return variants, patient info, and command-line args
    data = dict([
        # TODO:
        #  change this field to 'ranked_variants_with_vaccine_peptides'
        #  but figure out how to do it in a backwards compatible way
        ('variants', ranked_for_report),
        ('patient_info', patient_info),
        ('args', vars(args)),
    ])
    logger.info('About to save args: %s', data['args'])

    # save JSON data if necessary. as of time of writing, vaxrank takes
    # ~25 min to run, most of which is core logic. the formatting is super
    # fast, and it can be useful to save the data to be able to iterate
    # just on the formatting
    json_output_path = args.output_json_file
    if json_output_path:
        with open(json_output_path, 'w') as json_file:
            json_file.write(serializable.to_json(data))
            logger.info(
                'Wrote JSON report data to %s', args.output_json_file)

    return data
Example #16
0
def ranked_variant_list_with_metadata(args):
    """
    Computes all the data needed for report generation.

    If `args` has an `input_json_file` attribute the data comes from that
    file (with its saved args overridden by the current ones); otherwise
    the vaccine peptide ranking is computed from the user args.

    Parameters
    ----------
    args : Namespace
      Parsed user args from this run

    Returns a dictionary containing 3 items:
    - 'variants': ranked variant/vaccine peptide list
    - 'args': a dictionary of command-line arguments used to generate it
    - 'patient_info': patient info object
    """
    if hasattr(args, 'input_json_file'):
        with open(args.input_json_file) as cached_file:
            data = serializable.from_json(cached_file.read())
            # the JSON data from the previous run has the older args saved;
            # override them with the args from this run (which should all
            # be output related)
            data['args'].update(vars(args))

            # truncate the cached variant list if it is longer than
            # max_mutations_in_report
            if len(data['variants']) > args.max_mutations_in_report:
                truncated = data['variants'][:args.max_mutations_in_report]
                data['variants'] = truncated
            return data

    # get various things from user args
    mhc_alleles = mhc_alleles_from_args(args)
    logger.info("MHC alleles: %s", mhc_alleles)
    variants = variant_collection_from_args(args)
    logger.info("Variants: %s", variants)
    # generator that for each variant gathers all RNA reads, both those
    # supporting the variant and reference alleles
    allele_reads = allele_reads_generator_from_args(args)
    binding_predictor = mhc_binding_predictor_from_args(args)

    ranking_result = ranked_vaccine_peptides(
        reads_generator=allele_reads,
        mhc_predictor=binding_predictor,
        vaccine_peptide_length=args.vaccine_peptide_length,
        padding_around_mutation=args.padding_around_mutation,
        max_vaccine_peptides_per_variant=(
            args.max_vaccine_peptides_per_mutation),
        min_alt_rna_reads=args.min_alt_rna_reads,
        min_variant_sequence_coverage=args.min_variant_sequence_coverage,
        min_epitope_score=args.min_epitope_score,
        num_mutant_epitopes_to_keep=args.num_epitopes_per_peptide,
        variant_sequence_assembly=args.variant_sequence_assembly)
    ranked_list, variant_counts = ranking_result

    variants_for_report = ranked_list[:args.max_mutations_in_report]

    patient_info_fields = dict(
        patient_id=args.output_patient_id,
        vcf_paths=variants.sources,
        bam_path=args.bam,
        mhc_alleles=mhc_alleles,
        num_somatic_variants=len(variants),
        num_coding_effect_variants=variant_counts[
            'num_coding_effect_variants'],
        num_variants_with_rna_support=variant_counts[
            'num_variants_with_rna_support'],
        num_variants_with_vaccine_peptides=variant_counts[
            'num_variants_with_vaccine_peptides'])
    patient_info = PatientInfo(**patient_info_fields)

    # return variants, patient info, and command-line args
    data = dict([
        ('variants', variants_for_report),
        ('patient_info', patient_info),
        ('args', vars(args)),
    ])
    logger.info('About to save args: %s', data['args'])

    # save JSON data if necessary. as of time of writing, vaxrank takes
    # ~25 min to run, most of which is core logic. the formatting is super
    # fast, and it can be useful to save the data to be able to iterate
    # just on the formatting
    if args.output_json_file:
        with open(args.output_json_file, 'w') as report_file:
            serialized = serializable.to_json(data)
            report_file.write(serialized)
            logger.info(
                'Wrote JSON report data to %s', args.output_json_file)

    return data