コード例 #1
0
def getCleanningRules(variant_caller):
    """
    Return, for each INFO tag to fix, the corrected header declaration and the function used to clean the tag values in records.

    :param variant_caller: The variant caller used to produce the VCF to fix.
    :type variant_caller: str
    :return: By INFO tag: the header declaration ("declaration") and the callable cleaning a raw value ("process").
    :rtype: dict
    """
    def splitDepths(val):
        # Raw value is a colon-separated string; each field is converted to int.
        return [int(elt) for elt in val.split(":")]

    # (tag, description) of the tags sharing the same declaration and cleaning.
    strand_depth_tags = [
        ("REFBIAS", "Reference depth by strand"),
        ("VARBIAS", "Variant depth by strand")
    ]
    info_by_caller = {
        "vardict": {
            tag: {
                "declaration": HeaderInfoAttr(
                    tag, description, type="Integer", number="2"
                ),
                "process": splitDepths
            }
            for tag, description in strand_depth_tags
        }
    }
    # An unknown caller raises KeyError, as no rule is declared for it.
    return info_by_caller[variant_caller]
コード例 #2
0
def normAndMove(genome_path, in_variant_file, out_variant_file,
                trace_unstandard):
    """
    Write in a new file the normalized version of each variant. The normalization consists of three steps:
      1- The variants with multiple alternative alleles are split in one record by alternative allele.
      2- In each allele the empty allele marker is replaced by a dot and alternative and reference allele are reduced to the minimal string (example: ATG/A becomes TG/. ; AAGC/ATAC becomes AG/TA.).
      3- The allele is replaced by the most upstream allele that can have the same alternative sequence (example: a deletion in homopolymer is moved to first nucleotide of this homopolymer).

    :param genome_path: Path to the genome file (format: fasta).
    :type genome_path: str
    :param in_variant_file: Path to the variants file (format: VCF).
    :type in_variant_file: str
    :param out_variant_file: Path to the normalized variants file (format: VCF).
    :type out_variant_file: str
    :param trace_unstandard: True if you want to keep the trace of the variant before standardization in INFO.
    :type trace_unstandard: bool
    """
    seq_by_chrom = getSeqByChr(genome_path)
    # The output is opened first and the input second, so the input is closed first on exit.
    with VCFIO(out_variant_file, "w") as writer, VCFIO(in_variant_file) as reader:
        # Header
        writer.copyHeader(reader)
        if trace_unstandard:
            unstd_description = "The variant id (chromosome:position=reference/alternative) before standardization."
            writer.info["UNSTD"] = HeaderInfoAttr(
                "UNSTD",
                type="String",
                number="1",
                description=unstd_description
            )
        writer.writeHeader()
        # Records
        for record in reader:
            chrom_seq = seq_by_chrom[record.chrom]
            # Only the index of each alternative allele is needed to split the record.
            for alt_idx, _ in enumerate(record.alt):
                allele = getAlleleRecord(reader, record, alt_idx)
                if trace_unstandard:
                    # The trace is stored before the allele is moved upstream.
                    unstd_id = "{}:{}={}/{}".format(
                        allele.chrom,
                        allele.pos,
                        allele.ref,
                        "/".join(allele.alt)
                    )
                    allele.info["UNSTD"] = unstd_id
                upstream_allele = allele.getMostUpstream(chrom_seq)
                writer.write(upstream_allele)
コード例 #3
0
def normOnly(in_variant_file, out_variant_file, trace_unstandard):
    """
    Write in a new file the normalized version of each variant. The normalization consists of two steps:
      1- The variants with multiple alternative alleles are split in one record by alternative allele.
      2- In each allele the empty allele marker is replaced by a dot and alternative and reference allele are reduced to the minimal string (example: ATG/A becomes TG/. ; AAGC/ATAC becomes AG/TA.).

    :param in_variant_file: Path to the variants file (format: VCF).
    :type in_variant_file: str
    :param out_variant_file: Path to the normalized variants file (format: VCF).
    :type out_variant_file: str
    :param trace_unstandard: True if you want to keep the trace of the variant before standardization in INFO.
    :type trace_unstandard: bool
    """
    # The output is opened first and the input second, so the input is closed first on exit.
    with VCFIO(out_variant_file, "w") as writer, VCFIO(in_variant_file) as reader:
        # Header
        writer.copyHeader(reader)
        if trace_unstandard:
            unstd_description = "The variant id (chromosome:position=reference/alternative) before standardization."
            writer.info["UNSTD"] = HeaderInfoAttr(
                "UNSTD",
                type="String",
                number="1",
                description=unstd_description
            )
        writer.writeHeader()
        # Records
        for record in reader:
            # Only the index of each alternative allele is needed to split the record.
            for alt_idx, _ in enumerate(record.alt):
                allele = getAlleleRecord(reader, record, alt_idx)
                if trace_unstandard:
                    # The trace is stored before the allele is normalized in place.
                    unstd_id = "{}:{}={}/{}".format(
                        allele.chrom,
                        allele.pos,
                        allele.ref,
                        "/".join(allele.alt)
                    )
                    allele.info["UNSTD"] = unstd_id
                allele.normalizeSingleAllele()
                writer.write(allele)
コード例 #4
0
    def setUp(self):
        """
        Create the temporary reference (fasta and faidx), the VCF of tested variants and the command line used to execute filterVCFHomopolym.py.
        """
        path_prefix = os.path.join(tempfile.gettempdir(), str(uuid.uuid1()))

        # Temporary files
        self.tmp_sequences = path_prefix + ".fasta"
        self.tmp_faidx = path_prefix + ".fasta.fai"
        self.tmp_variants = path_prefix + ".vcf"
        self.tmp_output = path_prefix + "_out.vcf"

        # Exec command
        self.cmd = [
            "filterVCFHomopolym.py",
            "--mode", "remove",
            "--homopolym-length", "4",
            "--input-variants", self.tmp_variants,
            "--input-reference", self.tmp_sequences,
            "--output-variants", self.tmp_output
        ]

        # Create fasta
        sequences = [
            # Ruler of the 1-based positions in the sequences below (one tick every two nucleotides).
            #                               12  16  20  24  28  32  36  40  44  48  52  56  60  64  68  72  76  80  84  88  92  96  100
            #                     2 4 6 8 10| 14| 18| 22| 26| 30| 34| 38| 42| 46| 50| 54| 58| 62| 66| 70| 74| 78| 82| 86| 90| 94| 98| 102
            #                     | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
            ("artificial_chr1", "CGAATATGATCCAGCAATAAAAAGTTCCTACAGGAAAAAAGTAGAAAGAGAAACCTGTCTCTTGGATATTCTCGACACAGCAGGTCAAG"),
            ("artificial_chr2", "CGAATATGATCCAGCAATAAAAAGCTCCTACAGGCAAAAGTAGGCAAAGAGAAACCTGTCTCTTGGATATTCTCGACACAGCAGGTCAA"),
            ("artificial_chr3", "CGAATATGATCCAGCAATGAAAATTCCTACAGGTAAAACGTAGAAAGAGAAACCTGTCTCTTGGATATTCTCGACACAGCAGGTCAAG"),
            ("artificial_chr4", "CGAATATGATCCAGCAATAAAAAGTTCCTACAGGAAAAAAGTAGAAAGAGAAACCTGTCAAAAGGATATTCTCGACAAAACAGCAGAAAGTCAAG"),
            ("artificial_chr5", "CGAATATGATCCAGTAATAAAAAGTTCCTACAGGAAAAAAGTAGAAAGAGAAACCTGTCTCTTGGATATTCTCGACACAGCAGGTCAAG"),
            ("artificial_chr6", "CGAATATGATCCAGCAATAAAAAGTTCCTACAGGAAAAAAGTAGAAAGCACAACCTGTCTCTTGGAAAATCTCGACACAGCAGGTAAAACAATGCAGTAAAT"),
        ]
        with FastaIO(self.tmp_sequences, "w") as FH_seq:
            for seq_name, seq_str in sequences:
                FH_seq.write(Sequence(seq_name, seq_str))

        # Notes kept from the fixture: regions listed before and after each variant.
        # Variant  before_start  before_end  before_seq  after_start  after_end  after_seq
        # alt_00   10            13          TCCA        15           18         CAAT
        # alt_01   20            23          AAAA        25           28         TTCC
        # alt_02   30            33          ACAG        35           38         AAAA
        # alt_03   40            43          AGTA        45           48         AAAG
        # alt_04   10            13          TCCA        16           19         AATA
        # alt_05   20            23          AAAA        26           29         TCCT
        # alt_06   30            33          ACAG        36           39         AAAA
        # alt_07   40            43          GTAG        46           49         AAAG
        # alt_08   11            14          CCAG        15           18         CAAT
        # alt_09   20            23          AAAA        24           27         TTCC
        # alt_10   31            34          AGGT        35           38         AAAA
        # alt_11   40            43          GTAG        44           47         AAAG
        # alt_12   11            14          CCAG        15           18         CAAT
        # alt_13   20            23          AAAA        24           27         GTTC
        # alt_14   31            34          CAGG        35           38         AAAA
        # alt_15   41            44          GTAG        45           48         AAAG
        # alt_16   50            53          GAAA        57           60         GTCA
        # alt_17   60            63          AAAA        67           70         TATT
        # alt_18   70            73          TCTC        77           80         AAAA
        # alt_19   80            83          ACAG        87           90         AAAG
        # alt_20   11            14          CCAG        16           19         AATA
        # alt_21   20            23          AAAA        25           28         TTCC
        # alt_22   31            34          CAGG        36           39         AAAA
        # alt_23   40            43          AGTA        45           48         AAAG
        # alt_24   11            14          CCAG        17           20         ATAA
        # alt_25   19            22          AAAA        26           29         TCCT
        # alt_26   29            32          TACA        35           38         AAAA
        # alt_27   38            41          AAAG        45           48         AAAG
        # alt_28   50            53          ACAA        61           64         CTTG
        # alt_29   66            69          AAAA        76           79         CACA
        # alt_30   76            79          CACA        86           89         AAAA
        # alt_31   88            91          AACA        99           102        AAAT

        # Create faidx
        # One tab-separated row per sequence, rows joined by newlines without a trailing one.
        faidx_rows = [
            ("artificial_chr1", 89, 17, 89, 90),
            ("artificial_chr2", 89, 124, 89, 90),
            ("artificial_chr3", 88, 231, 88, 89),
            ("artificial_chr4", 95, 337, 95, 96),
            ("artificial_chr5", 89, 450, 89, 90),
            ("artificial_chr6", 102, 557, 102, 103),
        ]
        with open(self.tmp_faidx, "w") as FH_fai:
            FH_fai.write(
                "\n".join(
                    "\t".join(str(field) for field in row)
                    for row in faidx_rows
                )
            )

        # Create VCF
        # Each row: (chrom, pos, id, ref, alt, is_filtered); is_filtered is stored in INFO of the record.
        variants_spec = [
            # Substit single nt
            ("artificial_chr1", 14, "alt_00", "G", "T", 0),  # Without adjacent homopolymers
            ("artificial_chr1", 24, "alt_01", "G", "T", 1),  # Adjacent homopolymers upstream
            ("artificial_chr1", 34, "alt_02", "G", "T", 1),  # Adjacent homopolymers downstream
            ("artificial_chr1", 44, "alt_03", "G", "T", 0),  # Adjacent too short homopolymers
            # Substit multi nt
            ("artificial_chr2", 14, "alt_04", "GC", "TA", 0),  # Without adjacent homopolymers
            ("artificial_chr2", 24, "alt_05", "GC", "TA", 1),  # Adjacent homopolymers upstream
            ("artificial_chr2", 34, "alt_06", "GC", "TA", 1),  # Adjacent homopolymers downstream
            ("artificial_chr2", 44, "alt_07", "GC", "TA", 0),  # Adjacent too short homopolymers
            # Ins single nt
            ("artificial_chr3", 14, "alt_08", "G", "GT", 0),  # Without adjacent homopolymers
            ("artificial_chr3", 23, "alt_09", "A", "AT", 1),  # Adjacent homopolymers upstream
            ("artificial_chr3", 34, "alt_10", "T", "TA", 1),  # Adjacent homopolymers downstream
            ("artificial_chr3", 43, "alt_11", "G", "GT", 0),  # Adjacent too short homopolymers
            # Ins multi nt
            ("artificial_chr4", 14, "alt_12", "G", "GTA", 0),  # Without adjacent homopolymers
            ("artificial_chr4", 23, "alt_13", "A", "ATA", 1),  # Adjacent homopolymers upstream
            ("artificial_chr4", 34, "alt_14", "G", "GTA", 1),  # Adjacent homopolymers downstream
            ("artificial_chr4", 44, "alt_15", "G", "GTC", 0),  # Adjacent too short homopolymer
            ("artificial_chr4", 54, "alt_16", "CCT", "ATCCAGA", 0),  # Without adjacent homopolymers
            ("artificial_chr4", 64, "alt_17", "GGA", "CTCCAGT", 1),  # Adjacent homopolymers upstream
            ("artificial_chr4", 74, "alt_18", "GAC", "ATCCAGT", 1),  # Adjacent homopolymers downstream
            ("artificial_chr4", 84, "alt_19", "CAG", "ATCCAGT", 0),  # Adjacent too short homopolymer
            # Del single nt
            ("artificial_chr5", 14, "alt_20", "GT", "G", 0),  # Without adjacent homopolymers
            ("artificial_chr5", 23, "alt_21", "AG", "A", 1),  # Adjacent homopolymers upstream
            ("artificial_chr5", 34, "alt_22", "GA", "G", 1),  # Adjacent homopolymers downstream
            ("artificial_chr5", 43, "alt_23", "AG", "A", 0),  # Adjacent too short homopolymers
            # Del multi nt
            ("artificial_chr6", 14, "alt_24", "GCA", "G", 0),  # Without adjacent homopolymers
            ("artificial_chr6", 23, "alt_25", "AGT", "C", 1),  # Adjacent homopolymers upstream
            ("artificial_chr6", 32, "alt_26", "AGG", "A", 1),  # Adjacent homopolymers downstream
            ("artificial_chr6", 42, "alt_27", "TAG", "C", 0),  # Adjacent too short homopolymer
            ("artificial_chr6", 54, "alt_28", "CCTGTCT", "GAA", 0),  # Without adjacent homopolymers
            ("artificial_chr6", 70, "alt_29", "TCTCGA", "CCC", 1),  # Adjacent homopolymers upstream
            ("artificial_chr6", 80, "alt_30", "GCAGGT", "CCC", 1),  # Adjacent homopolymers downstream
            ("artificial_chr6", 92, "alt_31", "ATGCAGT", "CCC", 0),  # Adjacent too short homopolymer
        ]
        with VCFIO(self.tmp_variants, "w") as FH_var:
            FH_var.info = {
                "is_filtered": HeaderInfoAttr(
                    "is_filtered",
                    "1 if the variant is adjacent to an homopolymer.",
                    type="Integer",
                    number="1"
                )
            }
            FH_var.writeHeader()
            # Every record gets its own alt list and INFO dict.
            self.variants = [
                VCFRecord(
                    chrom, pos, var_id, ref, [alt], None, None,
                    {"is_filtered": is_filtered}
                )
                for chrom, pos, var_id, ref, alt, is_filtered in variants_spec
            ]
            for curr_var in self.variants:
                FH_var.write(curr_var)
コード例 #5
0
    def setUp(self):
        """
        Create the temporary list of selected RNA and the annotated VCF of tested variants. Each variant stores its expected filters in INFO (by annotation in ANN and for the whole record in expected_filter).
        """
        path_prefix = os.path.join(tempfile.gettempdir(), str(uuid.uuid1()))

        # Temporary files
        self.tmp_selected_rna = path_prefix + "_rna.tsv"
        self.tmp_variants = path_prefix + ".vcf"
        self.tmp_output = path_prefix + "_out.vcf"

        # Create RNA ref
        with open(self.tmp_selected_rna, "w") as FH_rna:
            FH_rna.writelines([
                "#Gene\tTranscript\n",
                "Gene_1\tENST_selected1\n",
                "Gene_1\tENST_selected2\n"
            ])

        # Values shared by several annotations
        missense = "missense_variant"
        synonymous = "synonymous_variant"
        selected = "ENST_selected1"
        other = "other"
        eur_001 = "0.001&0.001"
        eur_01 = "0.01&0.01"
        eur_05 = "0.05&0.05"

        def annot(allele, consequence, feature, eur_af, gnomad_af, expected):
            # Build a new annotation with the keys in the order of ANN_titles.
            return {
                "Allele": allele,
                "Consequence": consequence,
                "Feature": feature,
                "EUR_AF": eur_af,
                "gnomAD_AF": gnomad_af,
                "expected_filter": expected
            }

        def colloc():
            # Annotation for the allele C, which is not an alternative allele of the records using it.
            return annot("C", missense, selected, eur_001, 0.001, "ANN.COLLOC")

        def colloc_popaf():
            # Same as colloc() but with EUR_AF 0.05&0.05 and the matching expected filter.
            return annot("C", missense, selected, eur_05, 0.001, "ANN.COLLOC&ANN.popAF")

        def record(pos, var_id, ref, alt, annotations, expected):
            # The key ANN is only set when annotations are provided, and always before expected_filter.
            info = {}
            if annotations is not None:
                info["ANN"] = annotations
            info["expected_filter"] = expected
            return VCFRecord(
                "artificial_chr1", pos, var_id, ref, [alt], None, None, info
            )

        # Create VCF
        with AnnotVCFIO(self.tmp_variants, "w") as FH_var:
            FH_var.ANN_titles = [
                "Allele",
                "Consequence",
                "Feature",
                "EUR_AF",
                "gnomAD_AF",
                "expected_filter"
            ]
            # NOTE(review): the format in the ANN description omits EUR_AF whereas ANN_titles contains it - confirm it is intended.
            ann_declaration = HeaderInfoAttr(
                "ANN",
                "Consequence annotations from Ensembl VEP. Format: Allele|Consequence|Feature|gnomAD_AF|expected_filter.",
                type="String",
                number="."
            )
            expected_declaration = HeaderInfoAttr(
                "expected_filter",
                "The expected filters.",
                type="String",
                number="."
            )
            FH_var.info = {
                "ANN": ann_declaration,
                "expected_filter": expected_declaration
            }
            FH_var.writeHeader()
            self.variants = [
                record(14, "alt_00", "G", "T", [
                    annot("T", missense, selected, eur_001, 0.001, "PASS")
                ], ["PASS"]),
                record(14, "alt_01", "G", "T", None, ["CSQ"]),  # Without annotation
                record(14, "alt_02", "G", "T", [
                    annot("T", synonymous, selected, eur_001, 0.001, "ANN.CSQ")
                ], ["CSQ"]),
                record(14, "alt_03", "G", "T", [
                    annot("T", missense, selected, eur_001, 0.01, "ANN.popAF")
                ], ["popAF"]),
                record(14, "alt_04", "G", "T", [
                    annot("T", missense, other, eur_001, 0.001, "ANN.RNA")
                ], ["CSQ"]),
                record(14, "alt_05", "G", "T", [
                    annot("G", missense, selected, eur_001, 0.001, "ANN.COLLOC")
                ], ["CSQ"]),
                record(14, "alt_06", "G", "T", [
                    annot("T", missense, selected, eur_001, 0.001, "PASS"),
                    colloc()
                ], ["PASS"]),
                record(14, "alt_07", "G", "T", [
                    annot("T", missense, selected, eur_001, 0.01, "ANN.popAF"),
                    colloc()
                ], ["popAF"]),
                record(14, "alt_08", "G", "T", [
                    annot("T", synonymous, selected, eur_001, 0.001, "ANN.CSQ"),
                    colloc()
                ], ["CSQ"]),
                record(14, "alt_09", "G", "T", [
                    annot("T", missense, other, eur_001, 0.001, "ANN.RNA"),
                    colloc()
                ], ["CSQ"]),
                record(14, "alt_10", "G", "T", [
                    annot("T", synonymous, other, eur_001, 0.01, "ANN.CSQ&ANN.RNA&ANN.popAF"),
                    colloc()
                ], ["CSQ", "popAF"]),
                record(14, "alt_11", "G", "T", [
                    annot("T", synonymous, selected, eur_001, 0.01, "ANN.CSQ&ANN.popAF"),
                    colloc()
                ], ["CSQ", "popAF"]),
                record(14, "alt_12", "G", "T", [
                    annot("T", synonymous, selected, eur_001, 0.01, "ANN.CSQ&ANN.popAF"),
                    annot("T", missense, other, eur_001, 0.01, "ANN.RNA&ANN.popAF"),
                    colloc()
                ], ["CSQ", "popAF"]),
                record(14, "alt_13", "G", "T", [
                    annot("T", synonymous, selected, eur_01, 0.001, "ANN.CSQ&ANN.popAF"),
                    colloc_popaf()
                ], ["CSQ", "popAF"]),
                record(14, "alt_14", "G", "GT", [
                    annot("GT", missense, selected, eur_01, 0.001, "ANN.popAF"),
                    colloc_popaf(),
                    annot("T", missense, selected, eur_05, 0.001, "ANN.COLLOC&ANN.popAF")
                ], ["popAF"]),
                # NOTE(review): the id "alt_15" is used by the two next records - confirm it is intended.
                record(15, "alt_15", "-", "T", [
                    annot("GT", missense, selected, eur_01, 0.001, "ANN.COLLOC&ANN.popAF"),
                    annot("T", missense, selected, eur_01, 0.001, "ANN.popAF"),
                    colloc_popaf()
                ], ["popAF"]),
                record(14, "alt_15", "G", "-", [
                    annot("-", missense, selected, eur_01, 0.001, "ANN.popAF"),
                    annot("G", missense, selected, eur_01, 0.001, "ANN.COLLOC&ANN.popAF"),
                    colloc_popaf()
                ], ["popAF"]),
                record(14, "alt_16", "GG", "G", [
                    annot("-", missense, selected, eur_01, 0.001, "ANN.COLLOC&ANN.popAF"),
                    annot("G", missense, selected, eur_01, 0.001, "ANN.popAF"),
                    colloc_popaf()
                ], ["popAF"])
            ]
            for curr_var in self.variants:
                FH_var.write(curr_var)
コード例 #6
0
    def setUp(self):
        tmp_folder = tempfile.gettempdir()
        unique_id = str(uuid.uuid1())

        # Temporary files
        self.tmp_var_filters = os.path.join(tmp_folder,
                                            unique_id + "_varFilters.json")
        self.tmp_annot_filters = os.path.join(tmp_folder,
                                              unique_id + "_annFilters.json")
        self.tmp_variants = os.path.join(tmp_folder, unique_id + ".vcf")
        self.tmp_output = os.path.join(tmp_folder, unique_id + "_out.vcf")

        # Command
        self.cmd = [
            "filterAnnotVCF.py", "--input-variants", self.tmp_variants,
            "--output-variants", self.tmp_output
        ]

        # Create filters
        with open(self.tmp_var_filters, "w") as FH_filter:
            FH_filter.write("""{
    "class": "FiltersCombiner",
    "operator": "or",
    "filters": [
        {
            "class": "Filter",
            "getter": "filter",
            "action": "select",
            "aggregator": "ratio:1",
            "operator": "!=",
            "values": "CSQ"
        }, {
            "class": "Filter",
            "getter": "chrom",
            "action": "select",
            "aggregator": "nb:1",
            "operator": "==",
            "values": "artificial_chr2"
        }
    ]
}""")
        with open(self.tmp_annot_filters, "w") as FH_filter:
            FH_filter.write("""{
    "class": "Filter",
    "getter": "FILTER",
    "action": "select",
    "aggregator": "ratio:1",
    "operator": "==",
    "values": "PASS"
}""")

        # Create VCF
        with AnnotVCFIO(self.tmp_variants, "w") as FH_var:
            FH_var.ANN_titles = ["Allele", "id", "is_filtered", "FILTER"]
            FH_var.info = {
                "ANN":
                HeaderInfoAttr(
                    "ANN",
                    "Consequence annotations from Ensembl VEP. Format: Allele|id|is_filtered|FILTER.",
                    type="String",
                    number="."),
                "is_filtered":
                HeaderInfoAttr("is_filtered",
                               "The expected result.",
                               type="Integer",
                               number="1")
            }
            FH_var.writeHeader()
            self.variants = [
                VCFRecord("artificial_chr1", 10, "alt_00", "G", ["T"], None,
                          ["PASS"], {"is_filtered": 0}),
                VCFRecord("artificial_chr1", 10, "alt_01", "G", ["T"], None,
                          ["CSQ"], {"is_filtered": 1}),
                VCFRecord(
                    "artificial_chr2",
                    10,
                    "alt_02",
                    "G",
                    ["T"],
                    None,
                    ["CSQ"],
                    {
                        "is_filtered": 0,  # Proctected
                    }),
                VCFRecord(
                    "artificial_chr1", 10, "alt_03", "G", ["T"], None,
                    ["PASS"], {
                        "ANN": [{
                            "Allele": "T",
                            "id": "ann_00",
                            "FILTER": "PASS",
                            "is_filtered": 0
                        }],
                        "is_filtered":
                        0
                    }),
                VCFRecord(
                    "artificial_chr1", 10, "alt_04", "G", ["T"], None,
                    ["PASS"], {
                        "ANN": [{
                            "Allele": "C",
                            "id": "ann_01",
                            "FILTER": "ANN.COLLOC",
                            "is_filtered": 1
                        }],
                        "is_filtered":
                        0
                    }),
                VCFRecord(
                    "artificial_chr1", 10, "alt_05", "G", ["T"], None, ["CSQ"],
                    {
                        "ANN": [{
                            "Allele": "C",
                            "id": "ann_02",
                            "FILTER": "ANN.COLLOC",
                            "is_filtered": 1
                        }],
                        "is_filtered":
                        1
                    }),
                VCFRecord(
                    "artificial_chr1", 10, "alt_06", "G", ["T"], None, ["CSQ"],
                    {
                        "ANN": [{
                            "Allele": "T",
                            "id": "ann_03",
                            "FILTER": "PASS",
                            "is_filtered": 0
                        }],
                        "is_filtered":
                        1
                    }),
                VCFRecord(
                    "artificial_chr1", 10, "alt_07", "G", ["T"], None,
                    ["PASS"], {
                        "ANN": [
                            {
                                "Allele": "T",
                                "id": "ann_04",
                                "FILTER": "PASS",
                                "is_filtered": 0
                            },
                            {
                                "Allele": "C",
                                "id": "ann_05",
                                "FILTER": "ANN.COLLOC",
                                "is_filtered": 1
                            },
                        ],
                        "is_filtered":
                        0
                    }),
                VCFRecord(
                    "artificial_chr1", 10, "alt_08", "G", ["T"], None,
                    ["PASS"], {
                        "ANN": [
                            {
                                "Allele": "T",
                                "id": "ann_06",
                                "FILTER": "ANN.popAF",
                                "is_filtered": 1
                            },
                            {
                                "Allele": "C",
                                "id": "ann_07",
                                "FILTER": "ANN.COLLOC&ANN.popAF",
                                "is_filtered": 1
                            },
                        ],
                        "is_filtered":
                        0
                    }),
                VCFRecord(
                    "artificial_chr2",
                    10,
                    "alt_09",
                    "G",
                    ["T"],
                    None,
                    ["CSQ"],
                    {
                        "ANN": [
                            {
                                "Allele": "T",
                                "id": "ann_08",
                                "FILTER": "ANN.popAF",
                                "is_filtered": 1
                            },
                            {
                                "Allele": "C",
                                "id": "ann_09",
                                "FILTER": "ANN.COLLOC&ANN.popAF",
                                "is_filtered": 1
                            },
                        ],
                        "is_filtered":
                        0  # Protected
                    }),
                VCFRecord(
                    "artificial_chr2",
                    10,
                    "alt_10",
                    "G",
                    ["T"],
                    None,
                    ["CSQ"],
                    {
                        "ANN": [
                            {
                                "Allele": "T",
                                "id": "ann_10",
                                "FILTER": "PASS",
                                "is_filtered": 0
                            },
                            {
                                "Allele": "C",
                                "id": "ann_11",
                                "FILTER": "ANN.COLLOC&ANN.popAF",
                                "is_filtered": 1
                            },
                        ],
                        "is_filtered":
                        0  # Protected
                    })
            ]
            for idx, curr_var in enumerate(self.variants):
                FH_var.write(curr_var)
コード例 #7
0
 def setUp(self):
     """
     Prepare the fake VCF handle, the temporary reference sequence (fasta and faidx) and the variants shared by the tests.
     """
     # VCF
     info_header = {
         "AF": HeaderInfoAttr("AF", "Alternative alleles frequencies", "Float", "A")
     }
     format_header = {
         "AD": HeaderFormatAttr("AD", "Alternative alleles depths", "Integer", "A"),
         "DP": HeaderFormatAttr("DP", "total depth", "Integer", "1")
     }
     self.vcfio = FakeVCFIO(info_header, format_header)

     # Ref seq
     ref_prefix = os.path.join(tempfile.gettempdir(), str(uuid.uuid1()))
     self.tmp_fasta_path = ref_prefix + ".fa"
     self.tmp_faidx_path = ref_prefix + ".fa.fai"
     self.ref_seq = "ACGCAAATCTCGGCATGCCGATT"
     #               | | | | | |  |  |  |  |
     #               1 3 5 7 9 11 14 17 20 23
     with open(self.tmp_fasta_path, "w") as fasta_handle:
         fasta_handle.write(">chr1\n" + self.ref_seq)
     with open(self.tmp_faidx_path, "w") as faidx_handle:
         # Index columns written: name, sequence length, 6, 60, 61
         faidx_handle.write(
             "\t".join(["chr1", str(len(self.ref_seq)), "6", "60", "61"])
         )

     def samples_data(ad_spl_a, dp_spl_a, ad_spl_b, dp_spl_b):
         # AD and DP by sample; a new structure is built at each call
         return {
             "splA": {"AD": [ad_spl_a], "DP": dp_spl_a},
             "splB": {"AD": [ad_spl_b], "DP": dp_spl_b},
         }

     # Variants (pos, ref and alt are left empty here)
     self.variant_1 = VCFRecord(
         "chr1",  # chrom
         None,  # pos
         "artificial_1",  # id
         None,  # ref
         None,  # alt
         10,  # qual
         ["lowQual", "lowDP"],  # filter
         {"AF": [0.05]},  # info
         ["DP", "AD"],  # format
         samples_data(10, 100, 40, 4900)
     )
     self.variant_2 = VCFRecord(
         "chr1",  # chrom
         None,  # pos
         None,  # id
         None,  # ref
         None,  # alt
         30,  # qual
         ["PASS"],  # filter
         {"AF": [0.06]},  # info
         ["DP", "AD"],  # format
         samples_data(5, 50, 31, 550)
     )
     merged_info = {
         "AF": [0.06],
         "MCO_QUAL": [10, 30],
         "MCO_VAR": ["chr1:5=A/T", "chr1:20=G/C"]
     }
     self.expected_merge = VCFRecord(
         "chr1",  # chrom
         None,  # pos
         None,  # id
         None,  # ref
         None,  # alt
         20,  # qual
         ["lowQual", "lowDP"],  # filter
         merged_info,  # info
         ["DP", "AD"],  # format
         samples_data(5, 50, 31, 550)
     )
コード例 #8
0
def getNewHeaderAttr(args):
    """
    Return renamed and new VCFHeader elements for the merged VCF.

    The header of each file in args.inputs_variants is read. Tags specific to a caller are renamed "s<index of the input>_<tag>" and receive the corresponding name from args.calling_sources as source. Filters listed in args.shared_filters and the INFO tags MATEID, RNA_FIRST, SVTYPE and IMPRECISE keep their name.

    :param args: The script's parameters (used here: inputs_variants, calling_sources and shared_filters).
    :type args: NameSpace
    :return: VCFHeader elements (filter, info, format, samples).
    :rtype: dict
    :raises Exception: When the samples are not the same in all the inputs.
    """
    # INFO tags kept under their original name whatever the source
    kept_info_tags = {"MATEID", "RNA_FIRST", "SVTYPE", "IMPRECISE"}

    # Elements created by the merge
    merged_filter = {}
    merged_info = {
        "CIPOS": HeaderInfoAttr(
            "CIPOS",
            type="Integer",
            number="2",
            description="Confidence interval around POS"
        ),
        "IDSRC": HeaderInfoAttr(
            "IDSRC",
            type="String",
            number=".",
            description="ID of breakend by source"
        ),
        "REFSRC": HeaderInfoAttr(
            "REFSRC",
            type="String",
            number="1",
            description="Selected support data (SR, PR) come from this source"
        )
    }
    code_by_source = {
        name: "s" + str(idx)
        for idx, name in enumerate(args.calling_sources)
    }
    merged_info["SRC"] = HeaderInfoAttr(
        "SRC",
        type="String",
        number=".",
        description="Fusions callers where the breakend is identified. Possible values: {}".format(code_by_source)
    )
    merged_format = {
        "SR": HeaderFormatAttr(
            "SR",
            type="Integer",
            number="1",
            description="Count of reads mapping on the fusion junction"
        ),
        "PR": HeaderFormatAttr(
            "PR",
            type="Integer",
            number="1",
            description="Count of pairs of reads supporting the fusion"
        ),
        "SRSRC": HeaderFormatAttr(
            "SRSRC",
            type="Integer",
            number=".",
            description="Count of reads mapping on the fusion junction by source"
        ),
        "PRSRC": HeaderFormatAttr(
            "PRSRC",
            type="Integer",
            number=".",
            description="Count of pairs of reads supporting the fusion by source"
        )
    }

    # Elements coming from the inputs
    merged_samples = None
    for src_idx, src_path in enumerate(args.inputs_variants):
        with VCFIO(src_path) as reader:
            # Samples: the first input is the reference for the comparison
            if merged_samples is None:
                merged_samples = reader.samples
            elif reader.samples != merged_samples:
                raise Exception(
                    "The samples in VCF are not the same: {} in {} and {} in {}."
                    .format(
                        merged_samples,
                        args.inputs_variants[0],
                        reader.samples,
                        src_path
                    )
                )
            # FILTER
            for tag, attr in reader.filter.items():
                if tag in args.shared_filters:  # Filter not based on caller: name is kept
                    merged_filter[tag] = attr
                    continue
                renamed_tag = "s{}_{}".format(src_idx, tag)
                attr.id = renamed_tag
                attr.source = args.calling_sources[src_idx]
                merged_filter[renamed_tag] = attr
            # INFO
            for tag, attr in reader.info.items():
                if tag not in kept_info_tags:
                    renamed_tag = "s{}_{}".format(src_idx, tag)
                    attr.id = renamed_tag
                    attr.source = args.calling_sources[src_idx]
                    merged_info[renamed_tag] = attr
                    continue
                # Keep the declaration with the longest description: manage merge between
                # callers with 0 variants (and 0 annotations) and callers with variants
                if tag not in merged_info or len(merged_info[tag].description) < len(attr.description):
                    merged_info[tag] = attr
            # The variant quality of each source is stored in a dedicated INFO tag
            quality_tag = "s{}_VCQUAL".format(src_idx)
            merged_info[quality_tag] = HeaderInfoAttr(
                quality_tag,
                type="Float",
                number="1",
                description="The variant quality",
                source=args.calling_sources[src_idx]
            )
            # FORMAT: every tag of the input is renamed
            for tag, attr in reader.format.items():
                renamed_tag = "s{}_{}".format(src_idx, tag)
                attr.id = renamed_tag
                attr.source = args.calling_sources[src_idx]
                merged_format[renamed_tag] = attr

    header_elements = {
        "filter": merged_filter,
        "info": merged_info,
        "format": merged_format,
        "samples": merged_samples
    }
    return header_elements
コード例 #9
0
    def setUp(self):
        """
        Create the inputs of filterVCFTargets.py: the reference (fasta and faidx), the targets (BED) and the variants (VCF).

        Each variant stores in the INFO tag "target" the ID of the target expected to overlap it (None otherwise).
        """
        tmp_prefix = os.path.join(tempfile.gettempdir(), str(uuid.uuid1()))

        # Temporary files
        self.tmp_sequences = tmp_prefix + ".fasta"
        self.tmp_faidx = tmp_prefix + ".fasta.fai"
        self.tmp_regions = tmp_prefix + ".bed"
        self.tmp_variants = tmp_prefix + ".vcf"
        self.tmp_output = tmp_prefix + "_out.vcf"

        # Exec command
        self.cmd = [
            "filterVCFTargets.py",
            "--mode", "remove",
            "--input-variants", self.tmp_variants,
            "--input-targets", self.tmp_regions,
            "--input-reference", self.tmp_sequences,
            "--output-variants", self.tmp_output
        ]

        # Create fasta (the rulers below are aligned on the sequences)
        with FastaIO(self.tmp_sequences, "w") as seq_writer:
            # Repeats:                                       ****....            ...***
            # Region:                                 |----|        |------------|         |------|
            seq_writer.write(Sequence("artificial_chr1", "CTCAGTCATGTATGTATGTGCTCACAAAGTAGTAGATCATGGCAC"))
            #                                         123456789| | | | | | | | | | | | | | | | | |
            #                                                  10| 14| 18| 22| 26| 30| 34| 38| 42|
            #                                                    12  16  20  24  28  32  36  40  44
            seq_writer.write(Sequence("artificial_chr2", "CGATNNNCGAT"))
            #                                         123456789|
            #                                                  10

        # Create faidx
        with open(self.tmp_faidx, "w") as faidx_writer:
            faidx_writer.write(
                "artificial_chr1\t45\t17\t45\t46\n"
                "artificial_chr2\t11\t80\t11\t12"
            )

        # Create targets
        targets = [
            (1, 6, "target_1"),
            (15, 28, "target_2"),
            (38, 45, "target_3")
        ]
        with BEDIO(self.tmp_regions, "w", write_nb_col=4) as bed_writer:
            for start, end, name in targets:
                bed_writer.write(BEDRecord("artificial_chr1", start, end, name))

        def variant(pos, var_id, ref, alt, target):
            # All the variants are on artificial_chr1, with one alternative allele and without quality and filter
            return VCFRecord("artificial_chr1", pos, var_id, ref, [alt], None, None, {"target": target})

        # Create VCF
        with VCFIO(self.tmp_variants, "w") as var_writer:
            var_writer.info = {
                "target": HeaderInfoAttr(
                    "target",
                    "The ID of the overlapped target.",
                    type="String",
                    number="1"
                )
            }
            var_writer.writeHeader()
            self.variants = [
                # Substit single nt
                variant(14, "alt_00", "G", "T", None),  # Before target ; first nt before target
                variant(15, "alt_01", "G", "T", "target_2"),  # On target ; first nt of target
                variant(21, "alt_02", "C", "G", "target_2"),  # On target
                variant(28, "alt_03", "A", "G", "target_2"),  # On target ; last nt
                variant(29, "alt_04", "G", "C", None),  # After target ; first nt after target
                # Substit multi nt
                variant(7, "alt_05", "CATGTATG", "GTACCCGC", None),  # Before target ; first nt before target
                variant(11, "alt_06", "TATGTATG", "GTACCCGC", "target_2"),  # Overlap target start
                variant(13, "alt_07", "TGTATGTGCTCACAAAGTA", "CCCGCCCCTACATTGCAGT", "target_2"),  # Include target
                variant(15, "alt_08", "TATGTGCTCACAAA", "CGCCCCTACATTGC", "target_2"),  # Exact target
                variant(21, "alt_09", "CTCACAA", "GTACCCG", "target_2"),  # Included by target
                variant(24, "alt_10", "ACAAAGTA", "GTACCCG", "target_2"),  # Overlap target end
                variant(29, "alt_11", "GTAGTAGAT", "GTACCCGA", None),  # After target ; first nt after target
                # Ins single nt
                variant(14, "alt_12", "G", "GA", None),  # Before target ; first nt before target
                variant(15, "alt_12.2", "-", "A", None),  # Before target ; first nt before target
                variant(15, "alt_13", "A", "TG", "target_2"),  # On target ; first nt of target
                variant(21, "alt_14", "C", "CG", "target_2"),  # On target
                variant(27, "alt_15", "A", "AT", "target_2"),  # On target ; last nt
                variant(28, "alt_15.2", "-", "T", "target_2"),  # On target ; last nt
                variant(28, "alt_16", "A", "AT", None),  # After target ; first nt after target
                # Movable ins single nt
                variant(14, "alt_17", "G", "GT", "target_2"),  # Movable to first nt of target
                variant(28, "alt_18", "A", "AA", "target_2"),  # Movable to last nt of target
                # Del single nt
                variant(14, "alt_19", "G", "", None),  # Before target ; first nt before target
                variant(15, "alt_20", "T", "", "target_2"),  # On target ; first nt of target
                variant(21, "alt_21", "C", "", "target_2"),  # On target
                variant(28, "alt_22", "A", "", "target_2"),  # On target ; last nt
                variant(29, "alt_23", "G", "", None),  # After target ; first nt after target
                # Del multi nt
                variant(11, "alt_24", "TATG", "T", None),  # Before target ; first nt before target
                variant(13, "alt_25", "TGTA", "T", "target_2"),  # On target ; first nt of target
                variant(20, "alt_26", "GCTC", "G", "target_2"),  # On target
                variant(27, "alt_27", "AAGT", "A", "target_2"),  # On target ; last nt
                variant(28, "alt_28", "AGT", "A", None),  # After target ; first nt after target
                # Movable del multi nt
                variant(7, "alt_29", "CATGT", "C", "target_2"),  # On repeat and movable to first nt of target
                variant(12, "alt_30", "ATG", "A", "target_2"),  # Movable to first nt of target
                variant(28, "alt_31", "AGTA", "A", "target_2"),  # Movable to last nt of target
                variant(30, "alt_32", "TAGT", "T", "target_2"),  # On repeat and movable to last nt of target
            ]
            for curr_record in self.variants:
                var_writer.write(curr_record)
コード例 #10
0
    def setUp(self):
        tmp_folder = tempfile.gettempdir()
        unique_id = str(uuid.uuid1())

        # Temporary files
        self.tmp_variants = os.path.join(tmp_folder, unique_id + ".vcf")
        self.tmp_output = os.path.join(tmp_folder, unique_id + "_out.vcf")

        # Exec command
        self.cmd = [
            "filterVCFBySOR.py", "--input-variants", self.tmp_variants,
            "--output-variants", self.tmp_output
        ]

        # Create VCF
        with VCFIO(self.tmp_variants, "w") as FH_var:
            FH_var.info = {
                "expected":
                HeaderInfoAttr("expected",
                               "Expected filter tag.",
                               type="String",
                               number="1"),
                "SAR":
                HeaderInfoAttr(
                    "SAR",
                    "Number of reads supporting the alternative allele in reverse strand.",
                    type="Integer",
                    number="1"),
                "SAF":
                HeaderInfoAttr(
                    "SAF",
                    "Number of reads supporting the alternative allele in forward strand.",
                    type="Integer",
                    number="1"),
                "SRR":
                HeaderInfoAttr(
                    "SRR",
                    "Number of reads supporting the reference allele in reverse strand.",
                    type="Integer",
                    number="1"),
                "SRF":
                HeaderInfoAttr(
                    "SRF",
                    "Number of reads supporting the reference allele in forward strand.",
                    type="Integer",
                    number="1"),
            }
            FH_var.writeHeader()
            self.variants = [
                # 0.5 alt, 0.5 ref, low DP, alt no bias, ref no bias
                VCFRecord("artificial_chr1", 10, "sub_01", "G", ["T"], None,
                          None, {
                              "SAR": 5,
                              "SAF": 5,
                              "SRR": 5,
                              "SRF": 5,
                              "expected": "PASS"
                          }),
                # 0.05 alt, 0.95 ref, good DP, alt no bias, ref no bias
                VCFRecord("artificial_chr1", 20, "sub_02", "G", ["T"], None,
                          None, {
                              "SAR": 5,
                              "SAF": 5,
                              "SRR": 95,
                              "SRF": 95,
                              "expected": "PASS"
                          }),
                # 0.05 alt, 0.95 ref, good DP, alt no bias, ref strand bias
                VCFRecord("artificial_chr1", 30, "sub_03", "G", ["T"], None,
                          None, {
                              "SAR": 5,
                              "SAF": 5,
                              "SRR": 150,
                              "SRF": 30,
                              "expected": "PASS"
                          }),
                # 0.05 alt, 0.95 ref, good DP, alt strand bias, ref no bias
                VCFRecord(
                    "artificial_chr1", 40, "sub_04", "G", ["T"], None, None, {
                        "SAR": 9,
                        "SAF": 1,
                        "SRR": 95,
                        "SRF": 95,
                        "expected": "strandRatioBias"
                    }),
                # 0.05 alt, 0.95 ref, good DP, alt strand bias, ref strand bias => no bias
                VCFRecord("artificial_chr1", 50, "sub_05", "G", ["T"], None,
                          None, {
                              "SAR": 9,
                              "SAF": 1,
                              "SRR": 150,
                              "SRF": 30,
                              "expected": "PASS"
                          }),
                # 0.5 alt, 0.5 ref, low DP, alt strand bias, ref no bias
                VCFRecord(
                    "artificial_chr1", 60, "sub_06", "G", ["T"], None, None, {
                        "SAR": 9,
                        "SAF": 1,
                        "SRR": 5,
                        "SRF": 5,
                        "expected": "strandRatioBias"
                    }),
                # 0.29 alt, 0.71 ref, good DP, alt no bias, ref no bias
                VCFRecord(
                    "artificial_chr1", 70, "sub_07", "G", ["T"], None, None, {
                        "SAR": 400,
                        "SAF": 600,
                        "SRR": 1400,
                        "SRF": 1000,
                        "expected": "PASS"
                    }),
                # 0.71 alt, 0.29 ref, good DP, alt no bias, ref no bias
                VCFRecord(
                    "artificial_chr1", 80, "sub_08", "G", ["T"], None, None, {
                        "SAR": 1400,
                        "SAF": 1000,
                        "SRR": 400,
                        "SRF": 600,
                        "expected": "PASS"
                    }),
                # 1.0 alt, 0.0 ref, good DP, alt no bias, ref 0 DP
                VCFRecord(
                    "artificial_chr1", 90, "sub_09", "G", ["T"], None, None, {
                        "SAR": 1400,
                        "SAF": 1000,
                        "SRR": 0,
                        "SRF": 0,
                        "expected": "PASS"
                    }),
                # 1.0 alt, 0.0 ref, good DP, alt no bias, ref 2 DP
                VCFRecord(
                    "artificial_chr1", 100, "sub_10", "G", ["T"], None, None, {
                        "SAR": 1400,
                        "SAF": 1000,
                        "SRR": 0,
                        "SRF": 2,
                        "expected": "PASS"
                    }),
                # 1.0 alt, 0.0 ref, limit DP, alt no bias, ref 0 DP
                VCFRecord("artificial_chr1", 110, "sub_11", "G", ["T"], None,
                          None, {
                              "SAR": 90,
                              "SAF": 30,
                              "SRR": 0,
                              "SRF": 0,
                              "expected": "PASS"
                          }),
                # 1.0 alt, 0.0 ref, limit DP, alt no bias, ref 2 DP
                VCFRecord("artificial_chr1", 120, "sub_12", "G", ["T"], None,
                          None, {
                              "SAR": 90,
                              "SAF": 30,
                              "SRR": 0,
                              "SRF": 2,
                              "expected": "PASS"
                          }),
                # 1.0 alt, 0.0 ref, limit DP, alt strand bias, ref 0 DP
                VCFRecord(
                    "artificial_chr1", 130, "sub_13", "G", ["T"], None, None, {
                        "SAR": 90,
                        "SAF": 10,
                        "SRR": 0,
                        "SRF": 0,
                        "expected": "strandRatioBias"
                    }),
                # 1.0 alt, 0.0 ref, limit DP, alt strand bias, ref 2 DP
                VCFRecord(
                    "artificial_chr1", 140, "sub_14", "G", ["T"], None, None, {
                        "SAR": 90,
                        "SAF": 10,
                        "SRR": 0,
                        "SRF": 2,
                        "expected": "strandRatioBias"
                    }),
                # 1.0 alt, 0.0 ref, limit DP, alt strand bias, ref 1 DP
                VCFRecord(
                    "artificial_chr1",
                    150,
                    "sub_15",
                    "G",
                    ["T"],
                    None,
                    None,
                    {
                        "SAR": 90,
                        "SAF": 10,
                        "SRR": 1,
                        "SRF": 0,
                        "expected": "PASS"  # It can be discuss: 2.89
                    }),
                # 0.04 alt, 0.96 ref, good DP, alt strand bias, ref no bias
                VCFRecord(
                    "artificial_chr1", 160, "sub_16", "G", ["T"], None, None, {
                        "SAR": 15,
                        "SAF": 2,
                        "SRR": 200,
                        "SRF": 200,
                        "expected": "strandRatioBias"
                    }),
                # 0.04 alt, 0.96 ref, good DP, alt strand bias, ref no bias
                VCFRecord(
                    "artificial_chr1",
                    170,
                    "sub_17",
                    "G",
                    ["T"],
                    None,
                    None,
                    {
                        "SAR": 13,  # 12 => PASS
                        "SAF": 2,
                        "SRR": 200,
                        "SRF": 200,
                        "expected": "strandRatioBias"
                    }),
                # 0.04 alt, 0.96 ref, good DP, alt strand bias, ref strand bias => no bias
                VCFRecord("artificial_chr1", 180, "sub_18", "G", ["T"], None,
                          None, {
                              "SAR": 13,
                              "SAF": 2,
                              "SRR": 350,
                              "SRF": 50,
                              "expected": "PASS"
                          }),
                # 0.04 alt, 0.96 ref, good DP, alt strand bias, ref strand bias rev => bias
                VCFRecord(
                    "artificial_chr1", 190, "sub_19", "G", ["T"], None, None, {
                        "SAR": 13,
                        "SAF": 2,
                        "SRR": 50,
                        "SRF": 350,
                        "expected": "strandRatioBias"
                    }),
                # 0.5 alt, 0.5 ref, low DP, alt strand bias, ref no bias
                VCFRecord(
                    "artificial_chr1", 200, "sub_20", "G", ["T"], None, None, {
                        "SAR": 14,
                        "SAF": 2,
                        "SRR": 8,
                        "SRF": 8,
                        "expected": "strandRatioBias"
                    }),
            ]
            for idx, curr_var in enumerate(self.variants):
                FH_var.write(curr_var)
コード例 #11
0
    def setUp(self):
        """
        Prepare the temporary files and the command line shared by the tests.

        Unique paths are built in the system temporary directory, the standardizeVCF.py command is stored in self.cmd, then the reference (FASTA and its index) and the VCF to standardize are written. Each record written in the VCF is kept in self.variants and carries in its INFO the standardized variant(s) ("expected") and the standardized annotations ("expectedANN") it should produce.
        """
        tmp_folder = tempfile.gettempdir()
        unique_id = str(uuid.uuid1())  # Avoids collisions between temporary files

        # Temporary files
        self.tmp_sequences = os.path.join(tmp_folder, unique_id + ".fasta")
        self.tmp_faidx = os.path.join(tmp_folder, unique_id + ".fasta.fai")
        self.tmp_variants = os.path.join(tmp_folder, unique_id + ".vcf")
        self.tmp_output = os.path.join(tmp_folder, unique_id + "_out.vcf")

        # Exec command
        self.cmd = [
            "standardizeVCF.py",
            "--trace-unstandard",
            "--input-reference", self.tmp_sequences,
            "--input-variants", self.tmp_variants,
            "--output-variants", self.tmp_output
        ]

        # Create fasta (the rulers under each sequence give the 1-based positions used by the variants below)
        with FastaIO(self.tmp_sequences, "w") as FH_seq:
            # Repeats:                                       ****....            ...***
            # Region:                                 |----|        |------------|         |------|
            FH_seq.write(Sequence("artificial_chr1", "CTCAGTCATGTATGTATGTGCTCACAAAGTAGTAGATCATGGCAC"))
            #                                         123456789| | | | | | | | | | | | | | | | | |
            #                                                  10| 14| 18| 22| 26| 30| 34| 38| 42|
            #                                                    12  16  20  24  28  32  36  40  44
            FH_seq.write(Sequence("artificial_chr2", "CGATNNNCGAT"))
            #                                         123456789|
            #                                                  10

        # Create faidx
        # Tab-separated index of the FASTA written above (columns presumably follow the
        # .fai convention: name, length, offset, bases by line, bytes by line - TODO confirm)
        with open(self.tmp_faidx, "w") as FH_fai:
            FH_fai.write("""artificial_chr1	45	17	45	46
artificial_chr2	11	80	11	12""")

        # Create VCF
        with VCFIO(self.tmp_variants, "w") as FH_var:
            # "expected" and "expectedANN" are only used to carry the expected results with each record
            FH_var.info = {
                "expected": HeaderInfoAttr("expected", "Standardized version of {chrom}:{pos}={ref}/{alt}.", type="String", number="."),
                "ANN": HeaderInfoAttr("ANN", "Annotation of variants Format: Allele|Annotation_id|Alt_allele_idx", type="String", number="."),
                "expectedANN": HeaderInfoAttr("expectedANN", "Standardized version of annotations Format: Allele|Annotation_id|Alt_allele_idx", type="String", number=".")
            }
            FH_var.writeHeader()
            # In this fixture the annotations with an empty Alt_allele_idx (allele not in the
            # alternatives of the record) are never listed in "expectedANN"
            self.variants = [
                # Single nucleotide substitutions
                VCFRecord("artificial_chr1", 14, "sub_01", "G", ["T"], None, None, {
                    "expected": ["artificial_chr1:14=G/T"],
                    "ANN": ["T|ann_1|0", "T|ann_2|0", "A|ann_3|"],
                    "expectedANN": ["T|ann_1|0", "T|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 19, "sub_02", "T", ["A", "C"], None, None, {
                    "expected": ["artificial_chr1:19=T/A", "artificial_chr1:19=T/C"],
                    "ANN": ["A|ann_1|0", "A|ann_2|0", "T|ann_3|"],
                    "expectedANN": ["A|ann_1|0", "A|ann_2|0"]
                }),
                # Multi-nucleotide substitutions
                VCFRecord("artificial_chr1", 7, "sub_03", "CATGTATG", ["GTACCCGC"], None, None, {
                    "expected": ["artificial_chr1:7=CATGTATG/GTACCCGC"],
                    "ANN": ["GTACCCGC|ann_1|0", "GTACCCGC|ann_2|0", "GTGT|ann_3|"],
                    "expectedANN": ["GTACCCGC|ann_1|0", "GTACCCGC|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 11, "sub_04", "TATGTATG", ["GTACCCGC", "GTACCCAA"], None, None, {
                    "expected": ["artificial_chr1:11=TATGTATG/GTACCCGC", "artificial_chr1:11=TATGTATG/GTACCCAA"],
                    "ANN": ["GTACCCGC|ann_1|0", "GTACCCGC|ann_2|0", "GTACCCAA|ann_3|1"],
                    "expectedANN": ["GTACCCGC|ann_1|0", "GTACCCGC|ann_2|0", "GTACCCAA|ann_3|1"]
                }),
                # Single nucleotide insertions (with "-" as reference the expected record is anchored on the previous nucleotide)
                VCFRecord("artificial_chr1", 14, "ins_01", "G", ["GA"], None, None, {
                    "expected": ["artificial_chr1:14=G/GA"],
                    "ANN": ["GA|ann_1|0", "GA|ann_2|0", "GT|ann_3|"],
                    "expectedANN": ["GA|ann_1|0", "GA|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 20, "ins_02", "-", ["A"], None, None, {
                    "expected": ["artificial_chr1:19=T/TA"],
                    "ANN": ["A|ann_1|0", "A|ann_2|0", "T|ann_3|"],
                    "expectedANN": ["TA|ann_1|0", "TA|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 14, "ins_03", "G", ["GA", "GC"], None, None, {
                    "expected": ["artificial_chr1:14=G/GA", "artificial_chr1:14=G/GC"],
                    "ANN": ["GA|ann_1|0", "GA|ann_2|0", "GC|ann_3|1", "GT|ann_4|"],
                    "expectedANN": ["GA|ann_1|0", "GA|ann_2|0", "GC|ann_3|1"]
                }),
                VCFRecord("artificial_chr1", 20, "ins_04", "-", ["A", "C"], None, None, {
                    "expected": ["artificial_chr1:19=T/TA", "artificial_chr1:19=T/TC"],
                    "ANN": ["A|ann_1|0", "A|ann_2|0", "C|ann_3|1", "T|ann_4|"],
                    "expectedANN": ["TA|ann_1|0", "TA|ann_2|0", "TC|ann_3|1"]
                }),
                # Multi-nucleotide insertions
                VCFRecord("artificial_chr1", 14, "ins_05", "G", ["GATGC"], None, None, {
                    "expected": ["artificial_chr1:14=G/GATGC"],
                    "ANN": ["GATGC|ann_1|0", "GATGC|ann_2|0", "GAAAC|ann_3|"],
                    "expectedANN": ["GATGC|ann_1|0", "GATGC|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 20, "ins_06", "-", ["AAATC"], None, None, {
                    "expected": ["artificial_chr1:19=T/TAAATC"],
                    "ANN": ["AAATC|ann_1|0", "AAATC|ann_2|0", "GAAAC|ann_3|"],
                    "expectedANN": ["TAAATC|ann_1|0", "TAAATC|ann_2|0"]
                }),
                # Movable multi-nucleotide insertions (the expected position is upstream of the input position)
                VCFRecord("artificial_chr1", 14, "ins_07", "G", ["GTG"], None, None, {
                    "expected": ["artificial_chr1:12=A/ATG"],
                    "ANN": ["GTG|ann_1|0", "GTG|ann_2|0", "GAAAC|ann_3|"],
                    "expectedANN": ["ATG|ann_1|0", "ATG|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 27, "ins_08", "A", ["AAAA"], None, None, {
                    "expected": ["artificial_chr1:25=C/CAAA"],
                    "ANN": ["AAAA|ann_1|0", "AAAA|ann_2|0", "CAAA|ann_3|"],
                    "expectedANN": ["CAAA|ann_1|0", "CAAA|ann_2|0"]
                }),
                # Single nucleotide deletions (empty alternative written as "", "-" or with the previous nucleotide)
                VCFRecord("artificial_chr1", 14, "del_01", "G", [""], None, None, {
                    "expected": ["artificial_chr1:13=TG/T"],
                    "ANN": ["-|ann_1|0", "-|ann_2|0", "T|ann_3|"],
                    "expectedANN": ["T|ann_1|0", "T|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 14, "del_02", "G", ["-"], None, None, {
                    "expected": ["artificial_chr1:13=TG/T"],
                    "ANN": ["-|ann_1|0", "-|ann_2|0", "T|ann_3|"],
                    "expectedANN": ["T|ann_1|0", "T|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 13, "del_03", "TG", ["T"], None, None, {
                    "expected": ["artificial_chr1:13=TG/T"],
                    "ANN": ["T|ann_1|0", "T|ann_2|0", "A|ann_3|"],
                    "expectedANN": ["T|ann_1|0", "T|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 13, "del_04", "TG", ["T", "-"], None, None, {
                    "expected": ["artificial_chr1:13=TG/T", "artificial_chr1:12=ATG/A"],
                    "ANN": ["T|ann_1|0", "T|ann_2|0", "-|ann_3|1"],
                    "expectedANN": ["T|ann_1|0", "T|ann_2|0", "A|ann_3|1"]
                }),
                # Movable multi-nucleotide deletions
                VCFRecord("artificial_chr1", 11, "del_05", "TATG", ["T", "TA", "-"], None, None, {
                    "expected": ["artificial_chr1:11=TATG/T", "artificial_chr1:12=ATG/A", "artificial_chr1:7=CATGT/C"],
                    "ANN": ["T|ann_1|0", "T|ann_2|0", "TA|ann_3|1", "-|ann_4|2"],
                    "expectedANN": ["T|ann_1|0", "T|ann_2|0", "A|ann_3|1", "C|ann_4|2"]
                }),
            ]
            # Write every fixture record in the input VCF
            for idx, curr_var in enumerate(self.variants):
                FH_var.write(curr_var)
コード例 #12
0
    # Logger named after the script; the executed command is traced first
    log = logging.getLogger(os.path.basename(__file__))
    log.setLevel(logging.INFO)
    log.info("Command: {}".format(" ".join(sys.argv)))

    # Load the known fusions partners
    log.info("Load known partners from {}.".format(args.input_known_partners))
    sources_by_symbols = sourcesBySymbols(args.input_known_partners)

    # Annotate each pair of breakends and write it in the output
    log.info("Annotate known fusions partners.")
    with BreakendVCFIO(args.output_variants, "w", args.annotation_field) as writer, \
            BreakendVCFIO(args.input_variants, "r", args.annotation_field) as reader:
        # Header: the input header completed by the declaration of the new INFO tag
        writer.copyHeader(reader)
        known_partners_attr = HeaderInfoAttr(
            id="known_partners",
            type="String",
            number=".",
            description="Database containing the fusion of these gene. Format: 5primSymbol_@_3primSymbol=db1name:entryId,entryId|db2name:entryId (example: BCR_@_ABL1=cosmic_91:1743,1745|chimerdb_pub-V4:3427,3428)"
        )
        writer.info["known_partners"] = known_partners_attr
        writer.writeHeader()
        # Records: the reader yields the two breakends of each fusion together
        for first, second in reader:
            annotate(first, second, sources_by_symbols, args.annotation_field)
            writer.write(first, second)
    log.info("End of job")
コード例 #13
0
    logging.basicConfig(
        format=
        '%(asctime)s -- [%(filename)s][pid:%(process)d][%(levelname)s] -- %(message)s'
    )
    log = logging.getLogger(os.path.basename(__file__))
    log.setLevel(logging.INFO)
    log.info("Command: " + " ".join(sys.argv))

    # Process
    with IdxFastaIO(args.input_genome) as genome_reader:
        with BreakendVCFIO(args.input_variants) as reader:
            with BreakendVCFIO(args.output_variants, "w") as writer:
                writer.copyHeader(reader)
                writer.info["CIPOS"] = HeaderInfoAttr(
                    "CIPOS",
                    type="Integer",
                    number="2",
                    description="Confidence interval around POS")
                if args.trace_unstandard:
                    writer.info["UNSTD"] = HeaderInfoAttr(
                        "UNSTD",
                        type="String",
                        number="1",
                        description=
                        "Breakend id (chromosome:position=reference/alternative) before standardization"
                    )
                writer.writeHeader()
                for first, second in reader:
                    if args.trace_unstandard:
                        first.info["UNSTD"] = "{}:{}={}/{}".format(
                            first.chrom, first.pos, first.ref,
コード例 #14
0
def getNewHeaderAttr(args):
    """
    Build the header elements (renamed and new) of the merged VCF.

    FILTER that are not in args.shared_filters, INFO other than args.annotations_field and all FORMAT of each input are prefixed by "s<input index>_" and tagged with the corresponding calling source. The tags SRC and s<input index>_VCQUAL are added in INFO and AD, DP, ADSRC and DPSRC are added in FORMAT.

    :param args: The script's parameters.
    :type args: NameSpace
    :return: VCFHeader elements (filter, info, format, samples).
    :rtype: dict
    :raises Exception: When the samples are not the same in all the inputs.
    """
    # Tags shared by all the sources
    src_codes = {
        name: "s" + str(idx)
        for idx, name in enumerate(args.calling_sources)
    }
    merged_info = {
        "SRC": HeaderInfoAttr(
            "SRC",
            type="String",
            number=".",
            description="Variant callers where the variant is identified. Possible values: {}".format(src_codes)
        )
    }
    merged_format = {
        tag: HeaderFormatAttr(tag, type="Integer", number=number, description=description)
        for tag, number, description in (
            ("AD", "A", "Allele Depth"),
            ("DP", "1", "Total Depth"),
            ("ADSRC", ".", "Allele Depth by source"),
            ("DPSRC", ".", "Total Depth by source"),
        )
    }
    merged_filter = {}
    merged_samples = None

    # Tags coming from each source
    for idx_in, curr_in in enumerate(args.inputs_variants):

        def bySource(tag, data):
            """Prefix the header attribute with the source index, tag it with the source name and return its new id."""
            renamed = "s{}_{}".format(idx_in, tag)
            data.id = renamed
            data.source = args.calling_sources[idx_in]
            return renamed

        with VCFIO(curr_in) as FH_vcf:
            # Samples: they must be identical between inputs
            if merged_samples is None:
                merged_samples = FH_vcf.samples
            elif FH_vcf.samples != merged_samples:
                raise Exception(
                    "The samples in VCF are not the same: {} in {} and {} in {}."
                    .format(merged_samples, args.inputs_variants[0], FH_vcf.samples, curr_in)
                )
            # FILTER: shared filters keep their name, the others are renamed by source
            for tag, data in FH_vcf.filter.items():
                if tag in args.shared_filters:
                    merged_filter[tag] = data
                    continue
                merged_filter[bySource(tag, data)] = data
            # INFO
            for tag, data in FH_vcf.info.items():
                if tag != args.annotations_field:
                    merged_info[bySource(tag, data)] = data
                    continue
                # The annotation declaration with the longest description is kept: this manages
                # merge between callers with 0 variants (and 0 annotations) and callers with variants
                if tag not in merged_info or len(merged_info[tag].description) < len(data.description):
                    merged_info[tag] = data
            vcqual_tag = "s{}_VCQUAL".format(idx_in)
            merged_info[vcqual_tag] = HeaderInfoAttr(
                vcqual_tag,
                type="Float",
                number="1",
                description="The variant quality",
                source=args.calling_sources[idx_in]
            )
            # FORMAT: always renamed by source
            for tag, data in FH_vcf.format.items():
                merged_format[bySource(tag, data)] = data

    return {
        "filter": merged_filter,
        "info": merged_info,
        "format": merged_format,
        "samples": merged_samples
    }
コード例 #15
0
                                vcaller_curr_AF = vcaller_AF[alt_idx + 1]
                            record_allele.samples[curr_spl]["AF"] = [round(vcaller_curr_AF, args.AF_precision)]
                            record_allele.samples[curr_spl]["AD"] = [int(vcaller_curr_AF * vcaller_DP)]
                            record_allele.samples[curr_spl]["DP"] = vcaller_DP
                            # Store allele
                            allele_id = record_allele.getName()
                            if allele_id not in variants:
                                variants[allele_id] = record_allele
                            else:
                                variants[allele_id].samples[curr_spl] = record_allele.samples[curr_spl]

    # Completes and writes variants
    with VCFIO(args.output_variants, "w") as FH_out:
        # Header
        FH_out.copyHeader(FH_vcf)
        FH_out.info["AF"] = HeaderInfoAttr("AF", type="Float", number="A", description="The alleles frequencies for the group of samples.")
        FH_out.info["AD"] = HeaderInfoAttr("AD", type="Integer", number="A", description="The alleles depths for the group of samples.")
        FH_out.info["DP"] = HeaderInfoAttr("DP", type="Integer", number="1", description="Combined depth across samples.")
        FH_out.format["AF"] = HeaderFormatAttr("AF", type="Float", number="A", description="The alleles frequencies.")
        FH_out.format["AD"] = HeaderFormatAttr("AD", type="Integer", number="A", description="The alleles depths.")
        FH_out.format["DP"] = HeaderFormatAttr("DP", type="Integer", number="1", description="Depth.")
        FH_out.samples = [spl for spl in sorted(aln_by_samples)]
        FH_out.writeHeader()

        # Records
        for allele_id in variants:
            curr_var = variants[allele_id]
            # Add tag AF, AD and DP by sample
            if "AF" not in curr_var.format: curr_var.format.append("AF")
            if "AD" not in curr_var.format: curr_var.format.append("AD")
            if "DP" not in curr_var.format: curr_var.format.append("DP")
コード例 #16
0
 def setUp(self):
     """Create the fake VCF handle, the reference sequence and the records (two variants and their expected merge) used by the tests."""
     # Header declarations given to the fake VCF handle
     info_attr = {
         "AF": HeaderInfoAttr("AF", "Alternative alleles frequencies", "Float", "A")
     }
     format_attr = {
         "AD": HeaderFormatAttr("AD", "Alternative alleles depths", "Integer", "A"),
         "DP": HeaderFormatAttr("DP", "total depth", "Integer", "1")
     }
     self.vcfio = FakeVCFIO(info_attr, format_attr)

     # Reference sequence with a ruler of 1-based positions
     self.ref_seq = "ACGCAAATCTCGGCATGCCGATT"
     #               | | | | | |  |  |  |  |
     #               1 3 5 7 9 11 14 17 20 23

     # The records are created without pos, ref and alt (None)
     # Positional arguments: chrom, pos, id, ref, alt, qual, filter, info, format, values by sample
     self.variant_1 = VCFRecord(
         "chr1", None, "artificial_1", None, None,
         10,
         ["lowQual", "lowDP"],
         {"AF": [0.05]},
         ["DP", "AD"],
         {
             "splA": {"AD": [10], "DP": 100},
             "splB": {"AD": [40], "DP": 4900},
         }
     )
     self.variant_2 = VCFRecord(
         "chr1", None, None, None, None,
         30,
         ["PASS"],
         {"AF": [0.06]},
         ["DP", "AD"],
         {
             "splA": {"AD": [5], "DP": 50},
             "splB": {"AD": [31], "DP": 550},
         }
     )
     self.expected_merge = VCFRecord(
         "chr1", None, None, None, None,
         20,
         ["lowQual", "lowDP"],
         {
             "AF": [0.06],
             "MCO_QUAL": [10, 30],
             "MCO_VAR": ["chr1:5=A/T", "chr1:20=G/C"]
         },
         ["DP", "AD"],
         {
             "splA": {"AD": [5], "DP": 50},
             "splB": {"AD": [31], "DP": 550},
         }
     )
コード例 #17
0
ファイル: annotBND.py プロジェクト: bialimed/AnaCore-utils
    # Annotate each pair of breakends and write it in the output
    log.info("Annot variants in {}.".format(args.input_variants))
    with BreakendVCFIO(args.output_variants, "w", args.annotation_field) as writer, \
            BreakendVCFIO(args.input_variants) as reader:
        # Header: the input header completed by the annotation fields
        writer.copyHeader(reader)
        writer.ANN_titles = [
            "SYMBOL",
            "Gene",
            "Feature",
            "Feature_type",
            "Protein",
            "STRAND",
            "RNA_ELT_TYPE",
            "RNA_ELT_POS",
            "CDS_position",
            "Protein_position",
            "GENE_SHARD",
            "IN_FRAME"
        ]
        # The sub-fields listed in the declaration are read back from the writer
        writer.info[args.annotation_field] = HeaderInfoAttr(
            id=args.annotation_field,
            type="String",
            number=".",
            description="Consequence annotations. Format: {}".format("|".join(writer.ANN_titles))
        )
        writer.info["ANNOT_POS"] = HeaderInfoAttr(
            id="ANNOT_POS",
            type="Integer",
            number="1",
            description="Breakend position used in annotation. It take into account CIPOS to give priority to a breakend on exon boundaries."
        )
        writer.writeHeader()
        # Records: the reader yields the two breakends of each fusion together
        for first, second in reader:
            annot(first, second, genes_by_chr, args.annotation_field)
            writer.write(first, second)
    log.info("End of job")
コード例 #18
0
    def testTagMultipleValues(self):
        """Check the filter set on records when the strand depths are declared with number="A" and stored as lists."""
        # Write test data
        with VCFIO(self.tmp_variants, "w") as FH_var:
            header_info = {
                "expected": HeaderInfoAttr(
                    "expected", "Expected filter tag.", type="String", number="1"
                )
            }
            header_info["SAR"] = HeaderInfoAttr(
                "SAR",
                "Number of reads supporting the alternative allele in reverse strand.",
                type="Integer",
                number="A"
            )
            header_info["SAF"] = HeaderInfoAttr(
                "SAF",
                "Number of reads supporting the alternative allele in forward strand.",
                type="Integer",
                number="A"
            )
            header_info["SRR"] = HeaderInfoAttr(
                "SRR",
                "Number of reads supporting the reference allele in reverse strand.",
                type="Integer",
                number="A"
            )
            header_info["SRF"] = HeaderInfoAttr(
                "SRF",
                "Number of reads supporting the reference allele in forward strand.",
                type="Integer",
                number="A"
            )
            FH_var.info = header_info
            FH_var.writeHeader()
            # 0.5 alt, 0.5 ref, low DP, alt no bias, ref no bias
            balanced = VCFRecord(
                "artificial_chr1", 10, "sub_01", "G", ["T"], None, None,
                {"SAR": [5], "SAF": [5], "SRR": [5], "SRF": [5], "expected": "PASS"}
            )
            # 0.05 alt, 0.95 ref, good DP, alt strand bias, ref no bias
            alt_biased = VCFRecord(
                "artificial_chr1", 40, "sub_04", "G", ["T"], None, None,
                {"SAR": [9], "SAF": [1], "SRR": [95], "SRF": [95], "expected": "strandRatioBias"}
            )
            self.variants = [balanced, alt_biased]
            for curr_var in self.variants:
                FH_var.write(curr_var)

        # Execute command
        subprocess.check_call(self.cmd, stderr=subprocess.DEVNULL)

        # Validate results: one "<id>:<filter>" is expected by alternative allele of each input record
        expected = [
            record.id + ":" + record.info["expected"]
            for record in self.variants
            for _ in record.alt
        ]
        with VCFIO(self.tmp_output) as FH_results:
            observed = [
                record.id + ":" + record.filter[0] for record in FH_results
            ]
        self.assertEqual(expected, observed)
コード例 #19
0
    )
    log = logging.getLogger(os.path.basename(__file__))
    log.setLevel(args.logging_level)
    log.info("Command: " + " ".join(sys.argv))

    # Merge variants
    getIncludingReads = getIncludingReadsRNA if args.spliced_aln else getIncludingReadsDNA
    with IdxFastaIO(args.input_sequences) as FH_seq:
        with VCFIO(args.output_variants, "w") as FH_out:
            with pysam.AlignmentFile(args.input_aln, "rb") as FH_aln:
                with VCFIO(args.input_variants) as FH_vcf:
                    # Header
                    FH_out.copyHeader(FH_vcf)
                    FH_out.info["MCO_VAR"] = HeaderInfoAttr(
                        "MCO_VAR",
                        "Name of the variants merged because their occur on same reads.",
                        type="String",
                        number=".")
                    FH_out.info["MCO_QUAL"] = HeaderInfoAttr(
                        "MCO_QUAL",
                        "Qualities of the variants merged because their occur on same reads.",
                        type="String",
                        number=".")
                    FH_out.info["MCO_IR"] = HeaderInfoAttr(
                        "MCO_IR",
                        "Co-occurancy rate between pairs of variants.",
                        type="String",
                        number=".")
                    FH_out.info["MCO_IC"] = HeaderInfoAttr(
                        "MCO_IC",
                        "Co-occurancy count between pairs of variants.",
コード例 #20
0
    args = parser.parse_args()

    # Logger
    logging.basicConfig(format='%(asctime)s -- [%(filename)s][pid:%(process)d][%(levelname)s] -- %(message)s')
    log = logging.getLogger(os.path.basename(__file__))
    log.setLevel(logging.INFO)
    log.info("Command: " + " ".join(sys.argv))

    # Process
    nb_variants = 0
    nb_filtered = 0
    with VCFIO(args.input_variants) as handle_in:
        with VCFIO(args.output_variants, "w") as handle_out:
            # Header
            handle_out.copyHeader(handle_in)
            handle_out.info[args.SOR_tag] = HeaderInfoAttr(args.SOR_tag, "Strand bias estimated by the symmetric odds ratio test.", type="Float")
            handle_out.filter[args.bias_tag] = HeaderFilterAttr(args.bias_tag, "Strand ratio bias (estimated by the symmetric odds ratio test): substit SOR > {}, InDel SOR > {}.".format(args.substit_max_SOR, args.indel_max_SOR))
            handle_out.writeHeader()
            # Records
            for record in handle_in:
                if len(record.alt) > 1:
                    raise Exception("The multi-allelic variants cannot be processed: {}.".format(record.getName()))
                nb_variants += 1
                is_filtered = False
                # Compute SOR
                record.info[args.SOR_tag] = strandOddRatio(
                    record.info[args.ref_fwd_tag] if handle_in.info[args.ref_fwd_tag].number == "1" else record.info[args.ref_fwd_tag][0],
                    record.info[args.ref_rev_tag] if handle_in.info[args.ref_rev_tag].number == "1" else record.info[args.ref_rev_tag][0],
                    record.info[args.alt_fwd_tag] if handle_in.info[args.alt_fwd_tag].number == "1" else record.info[args.alt_fwd_tag][0],
                    record.info[args.alt_rev_tag] if handle_in.info[args.alt_rev_tag].number == "1" else record.info[args.alt_rev_tag][0]
                )
コード例 #21
0
def stdizeVCF(FH_ref, FH_in, FH_out, trace_unstandard=False, log=None):
    """
    Write in FH_out the standardized version of each variant coming from FH_in.

    Every alternative allele is placed in its own record, the record is standardized (position, reference and alternative allele) and, when the output is an annotated VCF, only the annotations declared on the same alternative allele are kept and their allele is updated.

    :param FH_ref: File handle to the reference file (format: fasta with faidx).
    :type FH_ref: anacore.sequenceIO.IdxFastaIO
    :param FH_in: File handle to the variants file (format: VCF).
    :type FH_in: anacore.vcf.VCFIO
    :param FH_out: File handle to the standardized variants file (format: VCF).
    :type FH_out: anacore.vcf.VCFIO
    :param trace_unstandard: True to store in INFO (tag UNSTD) the name of the variant before standardization.
    :type trace_unstandard: bool
    :param log: Logger used to report the annotations removed. No report is done if it is None.
    :type log: logging.Logger
    """
    nb_kept_annot = 0  # Annotations declared on the processed allele
    nb_dropped_annot = 0  # Annotations declared on another (collocated) allele
    out_is_annotated = issubclass(FH_out.__class__, AnnotVCFIO)

    # Header
    FH_out.copyHeader(FH_in)
    if trace_unstandard:
        FH_out.info["UNSTD"] = HeaderInfoAttr(
            "UNSTD",
            type="String",
            number="1",
            description="The variant id (chromosome:position=reference/alternative) before standardization."
        )
    FH_out.writeHeader()

    # Records
    for record in FH_in:
        std_records = []
        for alt_idx, _ in enumerate(record.alt):
            allele_record = getAlleleRecord(FH_in, record, alt_idx)
            if trace_unstandard:
                allele_record.info["UNSTD"] = allele_record.getName()
            # The alternative allele must be stored before standardization to find its annotations
            prev_alt = allele_record.alt[0]
            # Standardize pos, ref and alt
            allele_record.fastStandardize(FH_ref, 1000)
            # Update annotations
            if out_is_annotated and FH_in.annot_field in allele_record.info:
                kept_annot = []
                for annot in allele_record.info[FH_in.annot_field]:
                    if prev_alt != annot["Allele"]:  # Annotation of a collocated allele
                        nb_dropped_annot += 1
                        continue
                    nb_kept_annot += 1
                    annot["Allele"] = allele_record.alt[0]
                    kept_annot.append(annot)
                allele_record.info[FH_in.annot_field] = kept_annot
            std_records.append(allele_record)
        if len(std_records) == 1:
            FH_out.write(std_records[0])
        else:
            # The standardization can move the alleles: they are written sorted by start then end on reference
            std_records.sort(key=lambda elt: (elt.refStart(), elt.refEnd()))
            for allele_record in std_records:
                FH_out.write(allele_record)

    # Report
    if log is not None and nb_dropped_annot != 0:
        log.warning(
            "{}/{} annotations have been deleted because they concern collocated variant.".format(
                nb_dropped_annot,
                nb_kept_annot + nb_dropped_annot
            )
        )
コード例 #22
0
    def setUp(self):
        """
        Create the fixture: temporary path patterns, test cases and, for each caller, the initial, haplotyped and expected VCF files.

        The path patterns keep a "{}" placeholder that is replaced by the caller name. Each test case uses its own chromosome and provides, by caller, the records written in the three files.
        """
        tmp_folder = tempfile.gettempdir()
        unique_id = str(uuid.uuid1())  # Prefix shared by all the files of this fixture
        self.tmp_initial_pathes = os.path.join(tmp_folder, unique_id + "_{}_initial.vcf")
        self.tmp_haplotyped_pathes = os.path.join(tmp_folder, unique_id + "_{}_haplotyped.vcf")
        self.tmp_expected_pathes = os.path.join(tmp_folder, unique_id + "_{}_expected.vcf")
        self.tmp_out_pathes = os.path.join(tmp_folder, unique_id + "_{}_out.vcf")  # Not written here

        # test cases
        # The comment opening each case presumably describes, caller by caller (caller1 to caller4), how the
        # variants a, b and c are reported in "initial": "a-b" for one merged record, "a b" for distinct
        # records, "a'" or "b'" for an additional record at the same position and "/" for a caller without
        # record. The meaning of the leading "*" is not visible here - TODO confirm.
        self.test_cases = [
            {  # *a-b, a-b, a b, /
                "initial": {
                    "caller1": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                    "caller2": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                    "caller3": [
                        VCFRecord("chr1", 14, None, "G", ["C"], info={"AD": 100}),
                        VCFRecord("chr1", 18, None, "A", ["G"], info={"AD": 104})
                    ]
                },
                "haplotyped": {
                    "caller1": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                    "caller2": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                    "caller3": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"], info={"MCO_VAR": ["chr1:14=G/C", "chr1:18=A/G"], "AD": 100})]
                },
                "expected": {
                    "caller1": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                    "caller2": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                    "caller3": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"], info={"AD": 104})]
                }
            },
            {  # *a b, a b, a-b, /
                "initial": {
                    "caller1": [
                        VCFRecord("chr2", 14, None, "G", ["C"]),
                        VCFRecord("chr2", 18, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr2", 14, None, "G", ["C"]),
                        VCFRecord("chr2", 18, None, "A", ["G"])
                    ],
                    "caller3": [VCFRecord("chr2", 14, None, "GCGTA", ["CCGTG"])]
                },
                "haplotyped": {
                    "caller1": [VCFRecord("chr2", 14, None, "GCGTA", ["CCGTG"], info={"MCO_VAR": ["chr2:14=G/C", "chr2:18=A/G"]})],
                    "caller2": [VCFRecord("chr2", 14, None, "GCGTA", ["CCGTG"], info={"MCO_VAR": ["chr2:14=G/C", "chr2:18=A/G"]})],
                    "caller3": [VCFRecord("chr2", 14, None, "GCGTA", ["CCGTG"])]
                },
                "expected": {
                    "caller1": [
                        VCFRecord("chr2", 14, None, "G", ["C"]),
                        VCFRecord("chr2", 18, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr2", 14, None, "G", ["C"]),
                        VCFRecord("chr2", 18, None, "A", ["G"])
                    ],
                    "caller3": [
                        VCFRecord("chr2", 14, None, "G", ["C"]),
                        VCFRecord("chr2", 18, None, "A", ["G"])
                    ]
                }
            },
            {  # *a-b c, a-b c, a b c, /
                "initial": {
                    "caller1": [
                        VCFRecord("chr3", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr3", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr3", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr3", 20, None, "A", ["G"])
                    ],
                    "caller3": [
                        VCFRecord("chr3", 14, None, "G", ["C"], info={"AD": 104}),
                        VCFRecord("chr3", 18, None, "A", ["G"], info={"AD": 100}),
                        VCFRecord("chr3", 20, None, "A", ["G"], info={"AD": 98})
                    ]
                },
                "haplotyped": {
                    "caller1": [VCFRecord("chr3", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr3:14=GCGTA/CCGTG", "chr3:20=A/G"]})],
                    "caller2": [VCFRecord("chr3", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr3:14=GCGTA/CCGTG", "chr3:20=A/G"]})],
                    "caller3": [VCFRecord("chr3", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr3:14=G/C", "chr3:18=A/G", "chr3:20=A/G"], "AD": 98})]
                },
                "expected": {
                    "caller1": [
                        VCFRecord("chr3", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr3", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr3", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr3", 20, None, "A", ["G"])
                    ],
                    "caller3": [
                        VCFRecord("chr3", 14, None, "GCGTA", ["CCGTG"], info={"AD": 104}),
                        VCFRecord("chr3", 20, None, "A", ["G"], info={"AD": 98})
                    ]
                }
            },
            {  # *a-b c, a-b c, a b c, a-b-c
                "initial": {
                    "caller1": [
                        VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr4", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr4", 20, None, "A", ["G"])
                    ],
                    "caller3": [
                        VCFRecord("chr4", 14, None, "G", ["C"], info={"AD": 98}),
                        VCFRecord("chr4", 18, None, "A", ["G"], info={"AD": 104}),
                        VCFRecord("chr4", 20, None, "A", ["G"], info={"AD": 100})
                    ],
                    "caller4": [VCFRecord("chr4", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "haplotyped": {
                    "caller1": [VCFRecord("chr4", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr4:14=GCGTA/CCGTG", "chr4:20=A/G"]})],
                    "caller2": [VCFRecord("chr4", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr4:14=GCGTA/CCGTG", "chr4:20=A/G"]})],
                    "caller3": [VCFRecord("chr4", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr4:14=G/C", "chr4:18=A/G", "chr4:20=A/G"], "AD": 98})],
                    "caller4": [VCFRecord("chr4", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "expected": {
                    "caller1": [
                        VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr4", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr4", 20, None, "A", ["G"])
                    ],
                    "caller3": [
                        VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"], info={"AD": 104}),
                        VCFRecord("chr4", 20, None, "A", ["G"], info={"AD": 100})
                    ],
                    "caller4": [
                        VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr4", 20, None, "A", ["G"])
                    ]
                }
            },
            {  # *a-b c, a' a-b c, a b c, a-b-c
                "initial": {
                    "caller1": [
                        VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr5", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr5", 14, None, "G", ["C"], info={"AD": 3}),
                        VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                        VCFRecord("chr5", 20, None, "A", ["G"], info={"AD": 104})
                    ],
                    "caller3": [
                        VCFRecord("chr5", 14, None, "G", ["C"], info={"AD": 110}),
                        VCFRecord("chr5", 18, None, "A", ["G"], info={"AD": 105}),
                        VCFRecord("chr5", 20, None, "A", ["G"], info={"AD": 100})
                    ],
                    "caller4": [VCFRecord("chr5", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "haplotyped": {
                    "caller1": [VCFRecord("chr5", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr5:14=GCGTA/CCGTG", "chr5:20=A/G"]})],
                    "caller2": [
                        VCFRecord("chr5", 14, None, "G", ["C"], info={"AD": 3}),
                        VCFRecord("chr5", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr5:14=GCGTA/CCGTG", "chr5:20=A/G"], "AD": 100})
                    ],
                    "caller3": [VCFRecord("chr5", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr5:14=G/C", "chr5:18=A/G", "chr5:20=A/G"], "AD": 100})],
                    "caller4": [VCFRecord("chr5", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "expected": {
                    "caller1": [
                        VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr5", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr5", 14, None, "G", ["C"], info={"AD": 3}),
                        VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                        VCFRecord("chr5", 20, None, "A", ["G"], info={"AD": 104})
                    ],
                    "caller3": [
                        VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"], info={"AD": 110}),
                        VCFRecord("chr5", 20, None, "A", ["G"], info={"AD": 100})
                    ],
                    "caller4": [
                        VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr5", 20, None, "A", ["G"])
                    ]
                }
            },
            {  # *a b c, a' a-b c, a-b c, a-b-c
                "initial": {
                    "caller1": [
                        VCFRecord("chr6", 14, None, "G", ["C"]),
                        VCFRecord("chr6", 18, None, "A", ["G"]),
                        VCFRecord("chr6", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr6", 14, None, "G", ["C"], info={"AD": 3}),
                        VCFRecord("chr6", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                        VCFRecord("chr6", 20, None, "A", ["G"], info={"AD": 104})
                    ],
                    "caller3": [
                        VCFRecord("chr6", 14, None, "GCGTA", ["CCGTG"], info={"AD": 105}),
                        VCFRecord("chr6", 20, None, "A", ["G"], info={"AD": 101})
                    ],
                    "caller4": [VCFRecord("chr6", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "haplotyped": {
                    "caller1": [VCFRecord("chr6", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr6:14=G/C", "chr6:18=A/G", "chr6:20=A/G"]})],
                    "caller2": [
                        VCFRecord("chr6", 14, None, "G", ["C"], info={"AD": 3}),
                        VCFRecord("chr6", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr6:14=GCGTA/CCGTG", "chr6:20=A/G"], "AD": 100})
                    ],
                    "caller3": [VCFRecord("chr6", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr6:14=GCGTA/CCGTG", "chr6:20=A/G"], "AD": 101})],
                    "caller4": [VCFRecord("chr6", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "expected": {
                    "caller1": [
                        VCFRecord("chr6", 14, None, "G", ["C"]),
                        VCFRecord("chr6", 18, None, "A", ["G"]),
                        VCFRecord("chr6", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr6", 14, None, "G", ["C"], info={"AD": 100}),
                        VCFRecord("chr6", 18, None, "A", ["G"], info={"AD": 100}),
                        VCFRecord("chr6", 20, None, "A", ["G"], info={"AD": 104})
                    ],
                    "caller3": [
                        VCFRecord("chr6", 14, None, "G", ["C"], info={"AD": 105}),
                        VCFRecord("chr6", 18, None, "A", ["G"], info={"AD": 105}),
                        VCFRecord("chr6", 20, None, "A", ["G"], info={"AD": 101})
                    ],
                    "caller4": [
                        VCFRecord("chr6", 14, None, "G", ["C"]),
                        VCFRecord("chr6", 18, None, "A", ["G"]),
                        VCFRecord("chr6", 20, None, "A", ["G"])
                    ]
                }
            },
            {  # *a b c, a-b b' c, a-b c, a-b-c
                "initial": {
                    "caller1": [
                        VCFRecord("chr7", 14, None, "G", ["C"]),
                        VCFRecord("chr7", 18, None, "A", ["G"]),
                        VCFRecord("chr7", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr7", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                        VCFRecord("chr7", 18, None, "A", ["G"], info={"AD": 3}),
                        VCFRecord("chr7", 20, None, "A", ["G"], info={"AD": 104})
                    ],
                    "caller3": [
                        VCFRecord("chr7", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr7", 20, None, "A", ["G"])
                    ],
                    "caller4": [VCFRecord("chr7", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "haplotyped": {
                    "caller1": [VCFRecord("chr7", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr7:14=G/C", "chr7:18=A/G", "chr7:20=A/G"]})],
                    "caller2": [
                        VCFRecord("chr7", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr7:14=GCGTA/CCGTG", "chr7:20=A/G"], "AD": 100}),
                        # NOTE(review): the record at position 18 is A/G in "initial" but G/C here - confirm this is intended
                        VCFRecord("chr7", 18, None, "G", ["C"], info={"AD": 3})
                    ],
                    "caller3": [VCFRecord("chr7", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr7:14=GCGTA/CCGTG", "chr7:20=A/G"]})],
                    "caller4": [VCFRecord("chr7", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "expected": {
                    "caller1": [
                        VCFRecord("chr7", 14, None, "G", ["C"]),
                        VCFRecord("chr7", 18, None, "A", ["G"]),
                        VCFRecord("chr7", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr7", 14, None, "G", ["C"], info={"AD": 100}),
                        VCFRecord("chr7", 18, None, "A", ["G"], info={"AD": 100}),
                        VCFRecord("chr7", 20, None, "A", ["G"], info={"AD": 104})
                    ],
                    "caller3": [
                        VCFRecord("chr7", 14, None, "G", ["C"]),
                        VCFRecord("chr7", 18, None, "A", ["G"]),
                        VCFRecord("chr7", 20, None, "A", ["G"])
                    ],
                    "caller4": [
                        VCFRecord("chr7", 14, None, "G", ["C"]),
                        VCFRecord("chr7", 18, None, "A", ["G"]),
                        VCFRecord("chr7", 20, None, "A", ["G"])
                    ]
                }
            },
            {  # *a-b c, a-b b' c, a b c, a-b-c
                "initial": {
                    "caller1": [
                        VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr8", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                        VCFRecord("chr8", 18, None, "A", ["G"], info={"AD": 3}),
                        VCFRecord("chr8", 20, None, "A", ["G"], info={"AD": 104})
                    ],
                    "caller3": [
                        VCFRecord("chr8", 14, None, "G", ["C"], info={"AD": 110}),
                        VCFRecord("chr8", 18, None, "A", ["G"], info={"AD": 105}),
                        VCFRecord("chr8", 20, None, "A", ["G"], info={"AD": 100})
                    ],
                    "caller4": [VCFRecord("chr8", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "haplotyped": {
                    "caller1": [VCFRecord("chr8", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr8:14=GCGTA/CCGTG", "chr8:20=A/G"]})],
                    "caller2": [
                        VCFRecord("chr8", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr8:14=GCGTA/CCGTG", "chr8:20=A/G"], "AD": 100}),
                        # NOTE(review): the record at position 18 is A/G in "initial" but G/C here - confirm this is intended
                        VCFRecord("chr8", 18, None, "G", ["C"], info={"AD": 3})
                    ],
                    "caller3": [VCFRecord("chr8", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr8:14=G/C", "chr8:18=A/G", "chr8:20=A/G"], "AD": 100})],
                    "caller4": [VCFRecord("chr8", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "expected": {
                    "caller1": [
                        VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr8", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                        VCFRecord("chr8", 18, None, "A", ["G"], info={"AD": 3}),
                        VCFRecord("chr8", 20, None, "A", ["G"], info={"AD": 104})
                    ],
                    "caller3": [
                        VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"], info={"AD": 110}),
                        VCFRecord("chr8", 20, None, "A", ["G"], info={"AD": 100})
                    ],
                    "caller4": [
                        VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr8", 20, None, "A", ["G"])
                    ]
                }
            },
            {  # *a' a-b c, a-b b' c, a b c, a-b-c
                "initial": {
                    "caller1": [
                        VCFRecord("chr9", 14, None, "G", ["C"]),
                        VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr9", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                        VCFRecord("chr9", 18, None, "A", ["G"], info={"AD": 3}),
                        VCFRecord("chr9", 20, None, "A", ["G"], info={"AD": 104})
                    ],
                    "caller3": [
                        VCFRecord("chr9", 14, None, "G", ["C"], info={"AD": 110}),
                        VCFRecord("chr9", 18, None, "A", ["G"], info={"AD": 105}),
                        VCFRecord("chr9", 20, None, "A", ["G"], info={"AD": 100})
                    ],
                    "caller4": [VCFRecord("chr9", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "haplotyped": {
                    "caller1": [
                        VCFRecord("chr9", 14, None, "G", ["C"]),
                        VCFRecord("chr9", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr9:14=GCGTA/CCGTG", "chr9:20=A/G"]})
                    ],
                    "caller2": [
                        VCFRecord("chr9", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr9:14=GCGTA/CCGTG", "chr9:20=A/G"], "AD": 100}),
                        # NOTE(review): the record at position 18 is A/G in "initial" but G/C here - confirm this is intended
                        VCFRecord("chr9", 18, None, "G", ["C"], info={"AD": 3})
                    ],
                    "caller3": [VCFRecord("chr9", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr9:14=G/C", "chr9:18=A/G", "chr9:20=A/G"], "AD": 100})],
                    "caller4": [VCFRecord("chr9", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "expected": {
                    "caller1": [
                        VCFRecord("chr9", 14, None, "G", ["C"]),
                        VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr9", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                        VCFRecord("chr9", 18, None, "A", ["G"], info={"AD": 3}),
                        VCFRecord("chr9", 20, None, "A", ["G"], info={"AD": 104})
                    ],
                    "caller3": [
                        VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"], info={"AD": 110}),
                        VCFRecord("chr9", 20, None, "A", ["G"], info={"AD": 100})
                    ],
                    "caller4": [
                        VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr9", 20, None, "A", ["G"])
                    ]
                }
            }
        ]

        # Get callers: sorted names of all the callers present in the "initial" part of at least one test case
        self.callers = sorted({
            curr_caller
            for curr_test in self.test_cases
            for curr_caller in curr_test["initial"]
        })

        # Write files: for each caller the initial, the haplotyped then the expected VCF
        # Each step is described by its key in test cases, its path pattern and the need of the MCO_VAR declaration
        steps = [
            ("initial", self.tmp_initial_pathes, False),
            ("haplotyped", self.tmp_haplotyped_pathes, True),
            ("expected", self.tmp_expected_pathes, False)
        ]
        for curr_caller in self.callers:
            for step, path_pattern, has_mco_var in steps:
                with VCFIO(path_pattern.format(curr_caller), "w") as handle_out:
                    # Header
                    info_attr = {
                        "AD": HeaderInfoAttr("AD", "Alternative allele depth.", type="Integer", number="1")
                    }
                    if has_mco_var:  # Only the haplotyped records contain the names of the merged variants
                        info_attr["MCO_VAR"] = HeaderInfoAttr("MCO_VAR", "Name of the variants merged because their occur on same reads.", type="String", number=".")
                    handle_out.info = info_attr
                    handle_out.extra_header = ["##source={}".format(curr_caller)]
                    handle_out.writeHeader()
                    # Records: the test cases without this caller are skipped
                    for curr_test in self.test_cases:
                        for curr_var in curr_test[step].get(curr_caller, []):
                            handle_out.write(curr_var)
コード例 #23
0
    def setUp(self):
        tmp_folder = tempfile.gettempdir()
        unique_id = str(uuid.uuid1())

        # Temporary files
        self.tmp_sequences = os.path.join(tmp_folder, unique_id + ".fasta")
        self.tmp_regions = os.path.join(tmp_folder, unique_id + ".bed")
        self.tmp_variants = os.path.join(tmp_folder, unique_id + ".vcf")
        self.tmp_output = os.path.join(tmp_folder, unique_id + "_out.vcf")

        # Exec command
        self.cmd = [
            "filterVCFPrimers.py",
            "--input-variants", self.tmp_variants,
            "--input-regions", self.tmp_regions,
            "--input-sequences", self.tmp_sequences,
            "--output-variants", self.tmp_output
        ]

        # Create fasta
        with FastaIO(self.tmp_sequences, "w") as FH_seq:
            FH_seq.write(Sequence("artificial_chr1", "NNNAAAATTTGGGGGGGGGGTTTAAANNN"))
            #                                         123456789| | | | | | | | | |
            #                                                  10| 14| 18| 22| 26|
            #                                                    12  16  20  24  28
            FH_seq.write(Sequence("artificial_chr2", "CGATNNNCGAT"))
            #                                         123456789|
            #                                                  10

        # Create VCF
        with VCFIO(self.tmp_variants, "w") as FH_var:
            FH_var.info = {"ZOI": HeaderInfoAttr("ZOI", "If the variant can be in interest area.", type="String", number="1")}
            FH_var.writeHeader()
            self.variants = [
                VCFRecord("artificial_chr1", 6, "alt_0", "A", ["AA"], None, None, {"ZOI": "no"}),
                VCFRecord("artificial_chr1", 8, "alt_1", "TT", ["T"], None, None, {"ZOI": "no"}),
                VCFRecord("artificial_chr1", 8, "alt_2", "T", ["TT"], None, None, {"ZOI": "yes"}),
                VCFRecord("artificial_chr1", 9, "alt_3", "TTGG", ["TT"], None, None, {"ZOI": "yes"}),
                VCFRecord("artificial_chr1", 14, "alt_4", "G", ["GG"], None, None, {"ZOI": "yes"}),
                VCFRecord("artificial_chr1", 18, "alt_5", "GGG", ["G"], None, None, {"ZOI": "yes"}),  # ZOI downstream limit deletion
                VCFRecord("artificial_chr1", 22, "alt_6", "T", ["TT"], None, None, {"ZOI": "yes"}),

                VCFRecord("artificial_chr1", 9, "alt_7", "TT", ["TC"], None, None, {"ZOI": "no"}),  # Substitution before end of upstream primer
                VCFRecord("artificial_chr1", 10, "alt_8", "TG", ["TC"], None, None, {"ZOI": "yes"}),  # Substitution in upstream limit of ZOI
                VCFRecord("artificial_chr1", 15, "alt_9", "GG", ["GC"], None, None, {"ZOI": "yes"}),  # Substitution in dosnstream limit of ZOI
                VCFRecord("artificial_chr1", 20, "alt_10", "GT", ["GC"], None, None, {"ZOI": "no"}),  # Substitution after start of downstream primer
                VCFRecord("artificial_chr1", 21, "alt_11", "TT", ["TC"], None, None, {"ZOI": "no"}),  # Substitution in downstream primer

                VCFRecord("artificial_chr2", 1, "alt_12", "C", ["CTT"], None, None, {"ZOI": "no"}),  # Insertion before end of upstream primer
                VCFRecord("artificial_chr2", 2, "alt_13", "G", ["GCC"], None, None, {"ZOI": "yes"}),  # Insertion in upstream limit of ZOI
                VCFRecord("artificial_chr2", 3, "alt_14", "AT", ["CCGC"], None, None, {"ZOI": "yes"}),  # Insertion in upstream limit of ZOI and without standardization
                VCFRecord("artificial_chr2", 9, "alt_15", "G", ["GCC"], None, None, {"ZOI": "yes"}),  # Insertion in downstream limit of ZOI
                VCFRecord("artificial_chr2", 9, "alt_16", "G", ["NNN"], None, None, {"ZOI": "yes"}),  # Insertion in downstream limit of ZOI and without standardization
                VCFRecord("artificial_chr2", 10, "alt_17", "-", ["CC"], None, None, {"ZOI": "yes"}),  # Insertion in downstream limit of ZOI
                VCFRecord("artificial_chr2", 10, "alt_18", "A", ["ATT"], None, None, {"ZOI": "no"}),  # Insertion after start of downstream primer

                VCFRecord("artificial_chr2", 1, "alt_19", "CG", ["C"], None, None, {"ZOI": "no"}),  # Deletion before end of upstream primer
                VCFRecord("artificial_chr2", 2, "alt_20", "GA", ["G"], None, None, {"ZOI": "yes"}),  # Deletion in upstream limit of ZOI
                VCFRecord("artificial_chr2", 3, "alt_21", "AT", ["C"], None, None, {"ZOI": "yes"}),  # Deletion in upstream limit of ZOI and without standardization
                VCFRecord("artificial_chr2", 6, "alt_22", "NNCG", ["N"], None, None, {"ZOI": "yes"}),  # Deletion in downstream limit of ZOI
                VCFRecord("artificial_chr2", 8, "alt_23", "CG", ["C"], None, None, {"ZOI": "yes"}),  # Deletion in downstream limit of ZOI
                VCFRecord("artificial_chr2", 8, "alt_24", "CG", ["T"], None, None, {"ZOI": "yes"}),  # Deletion in downstream limit of ZOI and without standardization
                VCFRecord("artificial_chr2", 9, "alt_25", "GA", ["G"], None, None, {"ZOI": "no"}),  # Insertion after start of downstream primer
                VCFRecord("artificial_chr2", 10, "alt_26", "A", ["-"], None, None, {"ZOI": "no"}),  # Insertion after start of downstream primer
                VCFRecord("artificial_chr2", 10, "alt_27", "AT", ["A"], None, None, {"ZOI": "no"}),  # Insertion after start of downstream primer
            ]
            for idx, curr_var in enumerate(self.variants):
                FH_var.write(curr_var)