Exemple #1
0
def test_cnv_simple(genomic_sequence_2013, gene_models_2013, variant_type,
                    location, effect_type, effect_genes):
    """Check the simplified effect of a variant given by location and type.

    The variant is annotated from ``location`` and ``variant_type``; the
    simplified effect type must equal ``effect_type`` and the simplified
    gene collection must contain exactly the genes in ``effect_genes``
    (order and duplicates are ignored).
    """
    annotated = VariantAnnotator.annotate_variant(
        gene_models_2013,
        genomic_sequence_2013,
        loc=location,
        variant_type=variant_type,
    )
    # An empty annotation result is a failure in its own right.
    assert annotated

    simplified = VariantAnnotator.effect_simplify(annotated)
    actual_type, actual_genes, _ = simplified

    assert actual_type == effect_type
    assert set(actual_genes) == set(effect_genes)
Exemple #2
0
def test_chr2_32853362_ins_var(genomic_sequence_2013, gene_models_2013):
    """Annotate the complex variant CTGG->ATAG at 6:157527729.

    Expects exactly two effects, one per ARID1B transcript, both reported
    as "nonsense" on the "+" strand with the same amino-acid change.

    NOTE(review): the function name refers to an insertion at
    chr2:32853362, but the variant under test is a complex substitution on
    chromosome 6 -- confirm which one is intended.
    """
    effects = VariantAnnotator.annotate_variant(
        gene_models_2013,
        genomic_sequence_2013,
        loc="6:157527729",
        var="complex(CTGG->ATAG)",
    )

    assert len(effects) == 2
    # Sort by transcript id so the indexed checks below are deterministic.
    effects_sorted = sorted(effects, key=lambda k: k.transcript_id)

    assert effects_sorted[0].gene == "ARID1B"
    assert effects_sorted[0].transcript_id == "NM_017519_1"
    assert effects_sorted[0].strand == "+"
    assert effects_sorted[0].effect == "nonsense"
    # assert effects_sorted[0].prot_pos is None
    # assert effects_sorted[0].prot_length is None
    assert effects_sorted[0].aa_change == "His,Trp->Gln,End"

    assert effects_sorted[1].gene == "ARID1B"
    assert effects_sorted[1].transcript_id == "NM_020732_1"
    assert effects_sorted[1].strand == "+"
    assert effects_sorted[1].effect == "nonsense"
    # NOTE(review): the next two statements use the `assert expr, message`
    # form, so they only check that prot_pos / prot_length are truthy; the
    # literals 1 and 843 are failure messages and are never compared.
    # Presumably `== 1` / `== 843` was intended -- confirm the real expected
    # values before tightening these checks.
    assert effects_sorted[1].prot_pos, 1
    assert effects_sorted[1].prot_length, 843
    assert effects_sorted[1].aa_change == "His,Trp->Gln,End"
def test_chr1_802610_867930_CNV_var(genome_2013, gene_models_2013):
    """Annotate a CNV+ spanning 1:802610-867930.

    Expects three effects (one per transcript), each with effect "unknown"
    and no amino-acid change. Protein position/length are not checked.
    """
    annotated = VariantAnnotation.annotate_variant(
        gene_models_2013,
        genome_2013,
        loc="1:802610-867930",
        var="CNV+",
    )
    assert len(annotated) == 3

    # (gene, transcript_id, strand), listed in transcript-id sort order.
    expected = [
        ("SAMD11", "NM_152486_1", "+"),
        ("LOC100130417", "NR_026874_1", "-"),
        ("FAM41C", "NR_027055_1", "-"),
    ]

    by_transcript = sorted(annotated, key=lambda item: item.transcript_id)
    for found, (gene, transcript_id, strand) in zip(by_transcript, expected):
        assert found.gene == gene
        assert found.transcript_id == transcript_id
        assert found.strand == strand
        assert found.effect == "unknown"
        assert found.aa_change is None
Exemple #4
0
    def __init__(self, config, genomes_db, **kwargs):
        """Create the effect annotator and the output-column mapping.

        ``config`` and ``genomes_db`` are forwarded to the base initializer.
        A ``VariantAnnotator`` is then built from ``self.genomic_sequence``
        and ``self.gene_models`` (presumably provided by the base class --
        confirm) with the promoter length taken from
        ``config.options.prom_len``. Finally every column name listed in
        ``COLUMNS_SCHEMA`` is mapped, in schema order, to the attribute of
        the same name on ``config.columns``.

        Extra keyword arguments are accepted but not used here.
        """
        super(EffectAnnotatorBase, self).__init__(config, genomes_db)

        self.effect_annotator = VariantAnnotator(
            self.genomic_sequence,
            self.gene_models,
            promoter_len=self.config.options.prom_len,
        )

        # The schema's column type is irrelevant here; only names are used.
        self.columns = OrderedDict(
            (name, getattr(self.config.columns, name))
            for name, _unused_type in self.COLUMNS_SCHEMA
        )
def test_chr1_120387132_del_var(genome_2013, gene_models_2013):
    """A 71-base deletion at 1:120387132 yields one noStart effect on NBPF7."""
    # Tuple unpacking fails unless exactly one effect is returned.
    (only_effect,) = VariantAnnotation.annotate_variant(
        gene_models_2013,
        genome_2013,
        loc="1:120387132",
        var="del(71)",
    )

    observed = (
        only_effect.gene,
        only_effect.transcript_id,
        only_effect.strand,
        only_effect.effect,
        only_effect.prot_pos,
        only_effect.prot_length,
    )
    assert observed == ("NBPF7", "NM_001047980_1", "-", "noStart", 1, 421)
    assert only_effect.aa_change is None
def test_chr2_237172988_ins_var(genome_2013, gene_models_2013):
    """Inserting TTGTTACG at 2:237172988 yields one noStart effect on ASB18."""
    results = VariantAnnotation.annotate_variant(
        gene_models_2013,
        genome_2013,
        loc="2:237172988",
        var="ins(TTGTTACG)",
    )
    # Tuple unpacking fails unless exactly one effect is returned.
    (single,) = results

    expected_fields = {
        "gene": "ASB18",
        "transcript_id": "NM_212556_1",
        "strand": "-",
        "effect": "noStart",
        "prot_pos": 1,
        "prot_length": 466,
    }
    for field, wanted in expected_fields.items():
        assert getattr(single, field) == wanted
    assert single.aa_change is None
Exemple #7
0
def test_chr5_75902128_sub_var(genomic_sequence_2013, gene_models_2013):
    """Substitution C->T at 5:75902128 yields one nonsense effect on IQGAP2.

    Protein position and length are not checked.
    """
    annotation = VariantAnnotator.annotate_variant(
        gene_models_2013,
        genomic_sequence_2013,
        loc="5:75902128",
        var="sub(C->T)",
    )
    # Tuple unpacking fails unless exactly one effect is returned.
    (hit,) = annotation

    summary = (hit.gene, hit.transcript_id, hit.strand, hit.effect)
    assert summary == ("IQGAP2", "NM_006633_1", "+", "nonsense")
    assert hit.aa_change == "Arg->End"
Exemple #8
0
def test_synonymous_complex_var(genomic_sequence_2013, gene_models_2013):
    """Complex variant GG->AA at 1:897349 yields one missense effect on KLHL17.

    The change spans two codons: the first stays Lys, the second changes
    from Ala to Thr, so the overall effect asserted here is "missense".
    """
    (result,) = VariantAnnotator.annotate_variant(
        gene_models_2013,
        genomic_sequence_2013,
        loc="1:897349",
        var="complex(GG->AA)",
    )

    expectations = [
        ("gene", "KLHL17"),
        ("transcript_id", "NM_198317_1"),
        ("strand", "+"),
        ("effect", "missense"),
        ("prot_pos", 211),
        ("prot_length", 642),
        ("aa_change", "Lys,Ala->Lys,Thr"),
    ]
    for attribute, wanted in expectations:
        assert getattr(result, attribute) == wanted
Exemple #9
0
def test_just_next_to_splice_site_var(genomic_sequence_2013, gene_models_2013):
    """A 4-base deletion at 5:86705101 is intronic in both CCNH transcripts.

    Protein position and length are not checked.
    """
    annotated = VariantAnnotator.annotate_variant(
        gene_models_2013,
        genomic_sequence_2013,
        loc="5:86705101",
        var="del(4)",
    )
    assert len(annotated) == 2

    # Transcript ids in the order produced by sorting on transcript_id.
    expected_transcripts = ["NM_001199189_1", "NM_001239_1"]
    ordered = sorted(annotated, key=lambda item: item.transcript_id)

    for found, transcript_id in zip(ordered, expected_transcripts):
        assert found.gene == "CCNH"
        assert found.transcript_id == transcript_id
        assert found.strand == "-"
        assert found.effect == "intron"
        assert found.aa_change is None
Exemple #10
0
def cli_vcf(argv=sys.argv[1:]):
    """Annotate the variants of a VCF file with their effects.

    Reads the input VCF with pysam, adds the ``ET`` (effect type), ``EG``
    (effect gene) and ``ED`` (effect details) INFO fields to every record
    and writes the resulting VCF to the output file, or to stdout when no
    output file name is given. Progress and a final summary are written to
    stderr.

    For multi-allelic records each INFO field holds one entry per ALT
    allele, joined with ",". Any ";" inside the effect details is replaced
    with "|" before it is stored.

    Args:
        argv: command-line arguments (without the program name).

    Raises:
        AssertionError: if the genome resources cannot be loaded or the
            input file does not exist.
    """
    parser = argparse.ArgumentParser(
        description="VCF variants effect annotator",
        conflict_handler="resolve",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    cli_genome_options(parser)
    parser.add_argument("input_filename", help="input VCF variants file name")
    parser.add_argument("output_filename",
                        nargs="?",
                        help="output file name (default: stdout)")

    args = parser.parse_args(argv)
    genomic_sequence, gene_models = parse_cli_genome_options(args)
    assert genomic_sequence is not None
    assert gene_models is not None
    annotator = VariantAnnotation(genomic_sequence,
                                  gene_models,
                                  promoter_len=args.promoter_len)

    assert os.path.exists(args.input_filename), args.input_filename
    infile = pysam.VariantFile(args.input_filename)

    outfile = None
    counter = 0  # number of records processed; stays 0 for an empty input
    start = time.time()
    try:
        if args.output_filename is None:
            outfile = sys.stdout
        else:
            outfile = open(args.output_filename, "w")

        # Transfer VCF header
        header = infile.header
        header.add_meta("variant_effect_annotation",
                        "GPF variant effects annotation")
        header.add_meta("variant_effect_annotation_command",
                        '"{}"'.format(" ".join(sys.argv)))

        header.info.add("ET", ".", "String", "effected type")
        header.info.add("EG", ".", "String", "effected gene")
        header.info.add("ED", ".", "String", "effect details")

        print(str(header), file=outfile, end="")

        for counter, variant in enumerate(infile, start=1):
            effect_types = []
            effect_genes = []
            effect_details = []

            for alt in variant.alts:
                effects = annotator.do_annotate_variant(
                    chrom=variant.chrom,
                    position=variant.pos,
                    ref=variant.ref,
                    alt=alt,
                )
                et, eg, ed = annotator.effect_description(effects)
                effect_types.append(et)
                effect_genes.append(eg)
                effect_details.append(ed.replace(";", "|"))

            # Store the per-allele values for all three fields; previously
            # EG and ED kept only the values of the last ALT allele.
            variant.info["ET"] = ",".join(effect_types)
            variant.info["EG"] = ",".join(effect_genes)
            variant.info["ED"] = ",".join(effect_details)

            print(str(variant), file=outfile, end="")
            if counter % 1000 == 0:
                elapsed = time.time() - start
                print(
                    f"processed {counter} variants in {elapsed:0.2f} sec",
                    file=sys.stderr,
                )
    finally:
        # Release the files even when annotation fails part-way through.
        infile.close()
        if outfile is not None and outfile is not sys.stdout:
            outfile.close()

    elapsed = time.time() - start
    print(80 * "=", file=sys.stderr)
    print(
        f"DONE: {counter} variants in {elapsed:0.2f} sec",
        file=sys.stderr,
    )
    print(80 * "=", file=sys.stderr)
Exemple #11
0
def cli(argv=sys.argv[1:]):
    """Annotate a tab-separated variants file with variant effects.

    Reads variants from the input file (or stdin when the file name is
    missing or "-"), appends the effect description values returned by the
    annotator to every row and writes the rows to the output file (or
    stdout). Progress and a final summary are written to stderr.

    Unless ``--no-header`` style input is selected (``args.no_header``),
    the first line is treated as a header: the configured variant column
    names are resolved to column indexes and the header is echoed with the
    ``effectType``, ``effectGene`` and ``effectDetails`` columns appended.
    With no header, the configured variant columns must already be integer
    indexes.

    Lines starting with "#" and blank lines are skipped and not echoed.

    Args:
        argv: command-line arguments (without the program name).

    Raises:
        AssertionError: if the genome resources cannot be loaded, the input
            file does not exist, or a configured column is missing from
            the header.
    """
    parser = argparse.ArgumentParser(
        description="variants effect annotator",
        conflict_handler="resolve",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    cli_genome_options(parser)
    cli_variants_options(parser)

    parser.add_argument("input_filename",
                        nargs="?",
                        help="input variants file name")
    parser.add_argument("output_filename",
                        nargs="?",
                        help="output file name (default: stdout)")

    args = parser.parse_args(argv)
    genomic_sequence, gene_models = parse_cli_genome_options(args)
    assert genomic_sequence is not None
    assert gene_models is not None
    annotator = VariantAnnotation(genomic_sequence,
                                  gene_models,
                                  promoter_len=args.promoter_len)

    variant_columns = parse_cli_variants_options(args)

    if args.input_filename == "-" or args.input_filename is None:
        infile = sys.stdin
    else:
        assert os.path.exists(args.input_filename), args.input_filename
        infile = open(args.input_filename, "r")

    outfile = None
    counter = 0  # number of data lines read; stays 0 for an empty input
    start = time.time()
    try:
        if args.output_filename is None:
            outfile = sys.stdout
        else:
            outfile = open(args.output_filename, "w")

        if args.no_header:
            # Without a header the column options are positional indexes.
            for key, value in variant_columns.items():
                variant_columns[key] = int(value)
        else:
            line = infile.readline().strip()
            header = [c.strip() for c in line.split("\t")]
            for key, value in variant_columns.items():
                assert value in header, value
                variant_columns[key] = header.index(value)
            header.extend(["effectType", "effectGene", "effectDetails"])
            print("\t".join(header), file=outfile)

        for counter, line in enumerate(infile, start=1):
            # Skip comments, and blank lines (e.g. a trailing newline),
            # which would otherwise fail on the column lookup below.
            if line.startswith("#") or not line.strip():
                continue
            columns = [c.strip() for c in line.split("\t")]
            variant = {
                key: columns[value]
                for key, value in variant_columns.items()
            }
            effects = annotator.do_annotate_variant(**variant)
            desc = annotator.effect_description(effects)
            columns.extend(desc)
            print("\t".join(columns), file=outfile)

            if counter % 1000 == 0:
                elapsed = time.time() - start
                print(
                    f"processed {counter} lines in {elapsed:0.2f} sec",
                    file=sys.stderr,
                )
    finally:
        # Close only the files this function opened; leave the standard
        # streams usable for the caller.
        if infile is not sys.stdin:
            infile.close()
        if outfile is not None and outfile is not sys.stdout:
            outfile.close()

    elapsed = time.time() - start
    print(80 * "=", file=sys.stderr)
    print(
        f"DONE: {counter} variants in {elapsed:0.2f} sec",
        file=sys.stderr,
    )
    print(80 * "=", file=sys.stderr)