def test_cnv_simple(
    genomic_sequence_2013,
    gene_models_2013,
    variant_type,
    location,
    effect_type,
    effect_genes,
):
    """Check the simplified effect type and gene set of an annotated variant.

    The variant is described by ``location`` and ``variant_type``; the
    expected outcome is given by ``effect_type`` and ``effect_genes``.
    These four arguments are presumably supplied by a pytest
    parametrization that is not visible in this block -- confirm.
    """
    annotated = VariantAnnotator.annotate_variant(
        gene_models_2013,
        genomic_sequence_2013,
        loc=location,
        variant_type=variant_type,
    )
    # At least one effect must be produced before simplifying.
    assert annotated

    simplified_type, simplified_genes, _ = VariantAnnotator.effect_simplify(
        annotated
    )

    assert simplified_type == effect_type
    # Gene order is irrelevant, so compare as sets.
    assert set(simplified_genes) == set(effect_genes)
def test_chr2_32853362_ins_var(genomic_sequence_2013, gene_models_2013):
    """Annotate ``complex(CTGG->ATAG)`` at 6:157527729 and check both effects.

    Note that the test name mentions a chr2 insertion, while the variant
    actually annotated here is a complex substitution on chromosome 6.
    Two ARID1B transcripts are expected, both with a "nonsense" effect.
    """
    effects = VariantAnnotator.annotate_variant(
        gene_models_2013,
        genomic_sequence_2013,
        loc="6:157527729",
        var="complex(CTGG->ATAG)",
    )
    assert len(effects) == 2

    # Order by transcript id so the assertions below are deterministic.
    first, second = sorted(effects, key=lambda eff: eff.transcript_id)

    assert first.gene == "ARID1B"
    assert first.transcript_id == "NM_017519_1"
    assert first.strand == "+"
    assert first.effect == "nonsense"
    # prot_pos / prot_length are not asserted for this transcript.
    assert first.aa_change == "His,Trp->Gln,End"

    assert second.gene == "ARID1B"
    assert second.transcript_id == "NM_020732_1"
    assert second.strand == "+"
    assert second.effect == "nonsense"
    # NOTE(review): the two assertions below use the ``assert expr, msg``
    # form, so they only check that the attribute is truthy; 1 and 843 are
    # failure messages, not expected values. Confirm the real expected
    # values before turning these into ``==`` comparisons.
    assert second.prot_pos, 1
    assert second.prot_length, 843
    assert second.aa_change == "His,Trp->Gln,End"
def test_chr1_802610_867930_CNV_var(genome_2013, gene_models_2013):
    """Annotate a ``CNV+`` over 1:802610-867930 and check the three effects.

    Every affected transcript is expected to carry the "unknown" effect
    with no amino-acid change.
    """
    effects = VariantAnnotation.annotate_variant(
        gene_models_2013,
        genome_2013,
        loc="1:802610-867930",
        var="CNV+",
    )
    assert len(effects) == 3

    # Expected (gene, transcript_id, strand), listed in transcript-id order.
    expected = (
        ("SAMD11", "NM_152486_1", "+"),
        ("LOC100130417", "NR_026874_1", "-"),
        ("FAM41C", "NR_027055_1", "-"),
    )
    by_transcript = sorted(effects, key=lambda eff: eff.transcript_id)

    for effect, (gene, transcript_id, strand) in zip(by_transcript, expected):
        assert effect.gene == gene
        assert effect.transcript_id == transcript_id
        assert effect.strand == strand
        assert effect.effect == "unknown"
        # prot_pos / prot_length are not asserted for these transcripts.
        assert effect.aa_change is None
def __init__(self, config, genomes_db, **kwargs):
    """Set up the effect annotator and the output column mapping.

    ``config`` and ``genomes_db`` are forwarded to the base initializer.
    ``**kwargs`` is accepted but not used in this method body.

    ``self.genomic_sequence``, ``self.gene_models`` and ``self.config`` are
    read here without being assigned in this method -- presumably they are
    provided by the base class; confirm.
    """
    super(EffectAnnotatorBase, self).__init__(config, genomes_db)

    self.effect_annotator = VariantAnnotator(
        self.genomic_sequence,
        self.gene_models,
        promoter_len=self.config.options.prom_len,
    )

    # Map every schema column name to the value configured for it,
    # keeping the order in which COLUMNS_SCHEMA lists the columns.
    self.columns = OrderedDict()
    for schema_entry in self.COLUMNS_SCHEMA:
        name, _ = schema_entry
        self.columns[name] = getattr(self.config.columns, name)
def test_chr1_120387132_del_var(genome_2013, gene_models_2013):
    """Annotate ``del(71)`` at 1:120387132; exactly one effect is expected.

    The single effect must be a "noStart" on NBPF7 (NM_001047980_1).
    """
    # Tuple unpacking fails unless exactly one effect is returned.
    (effect,) = VariantAnnotation.annotate_variant(
        gene_models_2013,
        genome_2013,
        loc="1:120387132",
        var="del(71)",
    )

    assert effect.gene == "NBPF7"
    assert effect.transcript_id == "NM_001047980_1"
    assert effect.strand == "-"
    assert effect.effect == "noStart"
    assert effect.prot_pos == 1
    assert effect.prot_length == 421
    assert effect.aa_change is None
def test_chr2_237172988_ins_var(genome_2013, gene_models_2013):
    """Annotate ``ins(TTGTTACG)`` at 2:237172988; one effect is expected.

    The single effect must be a "noStart" on ASB18 (NM_212556_1).
    """
    # Tuple unpacking fails unless exactly one effect is returned.
    (effect,) = VariantAnnotation.annotate_variant(
        gene_models_2013,
        genome_2013,
        loc="2:237172988",
        var="ins(TTGTTACG)",
    )

    assert effect.gene == "ASB18"
    assert effect.transcript_id == "NM_212556_1"
    assert effect.strand == "-"
    assert effect.effect == "noStart"
    assert effect.prot_pos == 1
    assert effect.prot_length == 466
    assert effect.aa_change is None
def test_chr5_75902128_sub_var(genomic_sequence_2013, gene_models_2013):
    """Annotate ``sub(C->T)`` at 5:75902128; one effect is expected.

    The single effect must be a "nonsense" on IQGAP2 (NM_006633_1) with
    the amino-acid change ``Arg->End``.
    """
    # Tuple unpacking fails unless exactly one effect is returned.
    (effect,) = VariantAnnotator.annotate_variant(
        gene_models_2013,
        genomic_sequence_2013,
        loc="5:75902128",
        var="sub(C->T)",
    )

    assert effect.gene == "IQGAP2"
    assert effect.transcript_id == "NM_006633_1"
    assert effect.strand == "+"
    assert effect.effect == "nonsense"
    # prot_pos / prot_length are not asserted for this transcript.
    assert effect.aa_change == "Arg->End"
def test_synonymous_complex_var(genomic_sequence_2013, gene_models_2013):
    """Annotate ``complex(GG->AA)`` at 1:897349; one effect is expected.

    Despite the test name, the effect asserted here is "missense"
    (``Lys,Ala->Lys,Thr``) on KLHL17 (NM_198317_1).
    """
    # Tuple unpacking fails unless exactly one effect is returned.
    (effect,) = VariantAnnotator.annotate_variant(
        gene_models_2013,
        genomic_sequence_2013,
        loc="1:897349",
        var="complex(GG->AA)",
    )

    assert effect.gene == "KLHL17"
    assert effect.transcript_id == "NM_198317_1"
    assert effect.strand == "+"
    assert effect.effect == "missense"
    assert effect.prot_pos == 211
    assert effect.prot_length == 642
    assert effect.aa_change == "Lys,Ala->Lys,Thr"
def test_just_next_to_splice_site_var(genomic_sequence_2013, gene_models_2013):
    """Annotate ``del(4)`` at 5:86705101 and check both CCNH effects.

    Both transcripts are expected to report an "intron" effect with no
    amino-acid change.
    """
    effects = VariantAnnotator.annotate_variant(
        gene_models_2013,
        genomic_sequence_2013,
        loc="5:86705101",
        var="del(4)",
    )
    assert len(effects) == 2

    # Expected transcript ids, listed in sorted order.
    expected_transcripts = ("NM_001199189_1", "NM_001239_1")
    by_transcript = sorted(effects, key=lambda eff: eff.transcript_id)

    for effect, transcript_id in zip(by_transcript, expected_transcripts):
        assert effect.gene == "CCNH"
        assert effect.transcript_id == transcript_id
        assert effect.strand == "-"
        assert effect.effect == "intron"
        # prot_pos / prot_length are not asserted for these transcripts.
        assert effect.aa_change is None
def cli_vcf(argv=sys.argv[1:]):
    """Annotate every record of a VCF file with variant effects.

    The input VCF header is extended with two meta lines and three INFO
    fields (``ET``, ``EG``, ``ED``). Each record is then written to the
    output with those fields filled in. For multi-allelic records the
    per-allele values are joined with ``,`` in the order of the ALT
    alleles. Progress and a final summary are printed to stderr.

    Args:
        argv: command-line arguments without the program name. Genome
            options are added by ``cli_genome_options``; the positional
            arguments are the input VCF file name and an optional output
            file name (stdout when omitted).

    Raises:
        AssertionError: if the genome resources cannot be loaded or the
            input file does not exist.
    """
    parser = argparse.ArgumentParser(
        description="VCF variants effect annotator",
        conflict_handler="resolve",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    cli_genome_options(parser)
    parser.add_argument(
        "input_filename", help="input VCF variants file name")
    parser.add_argument(
        "output_filename", nargs="?",
        help="output file name (default: stdout)")

    args = parser.parse_args(argv)

    genomic_sequence, gene_models = parse_cli_genome_options(args)
    assert genomic_sequence is not None
    assert gene_models is not None
    annotator = VariantAnnotation(
        genomic_sequence, gene_models, promoter_len=args.promoter_len)

    assert os.path.exists(args.input_filename), args.input_filename

    infile = pysam.VariantFile(args.input_filename)
    outfile = None
    processed = 0
    try:
        if args.output_filename is None:
            outfile = sys.stdout
        else:
            outfile = open(args.output_filename, "w")

        start = time.time()

        # Transfer VCF header
        header = infile.header
        header.add_meta(
            "variant_effect_annotation", "GPF variant effects annotation")
        header.add_meta(
            "variant_effect_annotation_command",
            '"{}"'.format(" ".join(sys.argv)))
        header.info.add("ET", ".", "String", "effected type")
        header.info.add("EG", ".", "String", "effected gene")
        header.info.add("ED", ".", "String", "effect details")
        print(str(header), file=outfile, end="")

        # Counting from 1 keeps the summary correct (0) for an empty input.
        for processed, variant in enumerate(infile, start=1):
            # A record without ALT alleles may report ``alts`` as None;
            # such a record is written through without annotation.
            alts = variant.alts or ()
            if alts:
                effect_types = []
                effect_genes = []
                effect_details = []
                for alt in alts:
                    effects = annotator.do_annotate_variant(
                        chrom=variant.chrom,
                        position=variant.pos,
                        ref=variant.ref,
                        alt=alt,
                    )
                    et, eg, ed = annotator.effect_description(effects)
                    effect_types.append(et)
                    effect_genes.append(eg)
                    # ";" separates INFO entries in VCF, so it must not
                    # appear inside a value.
                    effect_details.append(ed.replace(";", "|"))

                # Write the joined per-allele values for all three fields,
                # not only the values of the last ALT allele.
                variant.info["ET"] = ",".join(effect_types)
                variant.info["EG"] = ",".join(effect_genes)
                variant.info["ED"] = ",".join(effect_details)

            print(str(variant), file=outfile, end="")

            if processed % 1000 == 0:
                elapsed = time.time() - start
                print(
                    f"processed {processed} variants in {elapsed:0.2f} sec",
                    file=sys.stderr,
                )
    finally:
        # Release the files even when annotation fails part-way through.
        infile.close()
        if outfile is not None and outfile is not sys.stdout:
            outfile.close()

    elapsed = time.time() - start
    print(80 * "=", file=sys.stderr)
    print(
        f"DONE: {processed} variants in {elapsed:0.2f} sec",
        file=sys.stderr,
    )
    print(80 * "=", file=sys.stderr)
def cli(argv=sys.argv[1:]):
    """Annotate a tab-separated variants file with variant effects.

    Every non-comment input line is split on tabs, the variant columns
    selected by ``parse_cli_variants_options`` are passed to the
    annotator, and the line is written out with the effect description
    appended as extra columns. Unless ``--no-header`` style input is
    requested (``args.no_header``), the first line is read as a header,
    column names are resolved to indices through it, and it is written
    out extended with ``effectType``, ``effectGene`` and
    ``effectDetails``. Progress and a final summary go to stderr.

    Args:
        argv: command-line arguments without the program name. The
            positional arguments are an optional input file name (``-``
            or omitted means stdin) and an optional output file name
            (stdout when omitted).

    Raises:
        AssertionError: if the genome resources cannot be loaded, the
            input file does not exist, or a configured column name is
            missing from the header.
    """
    parser = argparse.ArgumentParser(
        description="variants effect annotator",
        conflict_handler="resolve",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    cli_genome_options(parser)
    cli_variants_options(parser)
    parser.add_argument(
        "input_filename", nargs="?", help="input variants file name")
    parser.add_argument(
        "output_filename", nargs="?",
        help="output file name (default: stdout)")

    args = parser.parse_args(argv)

    genomic_sequence, gene_models = parse_cli_genome_options(args)
    assert genomic_sequence is not None
    assert gene_models is not None
    annotator = VariantAnnotation(
        genomic_sequence, gene_models, promoter_len=args.promoter_len)

    variant_columns = parse_cli_variants_options(args)

    if args.input_filename == "-" or args.input_filename is None:
        infile = sys.stdin
    else:
        assert os.path.exists(args.input_filename), args.input_filename
        infile = open(args.input_filename, "r")

    outfile = None
    processed = 0
    try:
        if args.output_filename is None:
            outfile = sys.stdout
        else:
            outfile = open(args.output_filename, "w")

        start = time.time()

        if args.no_header:
            # Without a header the column options are positional indices.
            for key, value in variant_columns.items():
                variant_columns[key] = int(value)
        else:
            line = infile.readline().strip()
            header = [c.strip() for c in line.split("\t")]
            # Resolve configured column names to their header positions.
            for key, value in variant_columns.items():
                assert value in header
                variant_columns[key] = header.index(value)
            header.extend(["effectType", "effectGene", "effectDetails"])
            print("\t".join(header), file=outfile)

        # The counter tracks input lines (comment lines included), as
        # before; counting from 1 makes an empty input report 0.
        for processed, line in enumerate(infile, start=1):
            if line.startswith("#"):
                continue

            columns = [c.strip() for c in line.split("\t")]
            variant = {
                key: columns[index]
                for key, index in variant_columns.items()
            }

            effects = annotator.do_annotate_variant(**variant)
            desc = annotator.effect_description(effects)

            columns.extend(desc)
            print("\t".join(columns), file=outfile)

            if processed % 1000 == 0:
                elapsed = time.time() - start
                print(
                    f"processed {processed} lines in {elapsed:0.2f} sec",
                    file=sys.stderr,
                )
    finally:
        # Close only the files opened here, even on failure; the standard
        # streams are left open for the caller.
        if infile is not sys.stdin:
            infile.close()
        if outfile is not None and outfile is not sys.stdout:
            outfile.close()

    elapsed = time.time() - start
    print(80 * "=", file=sys.stderr)
    print(
        f"DONE: {processed} variants in {elapsed:0.2f} sec",
        file=sys.stderr,
    )
    print(80 * "=", file=sys.stderr)