Ejemplo n.º 1
0
def setRefPos(variant, seq_handler, padding=200):
    """
    Add upstream_start, upstream_end, downstream_start and downstream_end attributes in VCFRecord. For insertions the start is defined on the first position before the insertion and the end on the last position affected by the insertion.

    For a normalized indel (ref or first alt equal to the empty allele marker) the upstream coordinates come from the variant as provided and the downstream coordinates from its most downstream equivalent. For any other variant the downstream coordinates are the upstream ones.

    :param variant: The variant to update.
    :type variant: anacore.vcf.VCFRecord
    :param seq_handler: The reference sequences accessor. It must provide getSub(chrom, start, end).
    :type seq_handler: presumably an indexed fasta reader - TODO confirm against callers
    :param padding: Number of reference positions retrieved after the variant to search the most downstream equivalent.
    :type padding: int
    """
    empty_marker = VCFRecord.getEmptyAlleleMarker()
    # Upstream coordinates are always the ones of the variant as provided
    variant.upstream_start, variant.upstream_end = getStartEnd(variant)
    if variant.ref == empty_marker or variant.alt[0] == empty_marker:  # Normalized indel
        # Most downstream
        sub_region = seq_handler.getSub(
            variant.chrom,
            variant.pos - 2,
            variant.pos + len(variant.ref) + padding
        )
        chrom_pos = variant.pos
        # The subregion starts 2 positions before the variant: the variant is
        # expected on the 3rd position of the subregion
        variant.pos = 3  # Switch position from chromosome to position from subregion
        try:
            downstream_var = variant.getMostDownstream(sub_region)
            # Store the subregion position before restoring variant.pos: this
            # stays valid even if getMostDownstream returns the variant itself
            subregion_pos = downstream_var.pos
        finally:
            variant.pos = chrom_pos  # Never leave the caller's variant on subregion coordinates
        # Switch position from subregion to position from chromosome. The shift
        # must come from the downstream record: applying it to variant.pos
        # (always 3 here) would cancel the move and report downstream == upstream
        downstream_var.pos = chrom_pos + subregion_pos - 3
        variant.downstream_start, variant.downstream_end = getStartEnd(downstream_var)
    else:
        variant.downstream_start = variant.upstream_start
        variant.downstream_end = variant.upstream_end
Ejemplo n.º 2
0
 def testHasLowSupport(self):
     """Check hasLowSupport() on breakend records with PR/SR values in one or two samples."""
     # (record ID, PR/SR values by sample, expected result with threshold 10)
     cases = [
         ("id_01", {"splA": {"PR": 9, "SR": 3}}, False),
         ("id_02", {"splA": {"PR": 9, "SR": 0}}, True),
         ("id_03", {"splA": {"PR": 4, "SR": 2}, "splB": {"PR": 3, "SR": 3}}, False),
         ("id_04", {"splA": {"PR": 1, "SR": 2}, "splB": {"PR": 3, "SR": 3}}, True),
     ]
     for record_id, support_by_spl, expect_low in cases:
         record = VCFRecord(
             "chr1", 140, record_id, "A", ["A[chr1:199["],
             pFormat=["PR", "SR"],
             samples=support_by_spl
         )
         is_low = hasLowSupport(record, 10)
         self.assertTrue(is_low if expect_low else not is_low)
     # Record without any sample and threshold 0
     no_sample_record = VCFRecord("chr1", 140, "id_05", "A", ["A[chr1:199["])
     self.assertTrue(not hasLowSupport(no_sample_record, 0))  # No test
Ejemplo n.º 3
0
def getSupportingReads(var, chrom_seq, FH_aln, log):
    """
    Return read ID of reads supporting the alternative variant.

    A read is kept when it is not a duplicate, it covers the region from var.upstream_start to var.downstream_end and its alignment carries the variant alleles on the most upstream coordinates or, if they differ, on the most downstream coordinates.

    :param var: The variant. It must have only one alternative allele and the attributes upstream_start, upstream_end, downstream_start and downstream_end (see setRefPos()).
    :type var: anacore.vcf.VCFRecord updated with iniVariant() and isIns
    :param chrom_seq: The sequence of the chromosome.
    :type chrom_seq: str
    :param FH_aln: The file handle to the alignments file. The variants must have been defined from this alignments file.
    :type FH_aln: pysam.AlignmentFile
    :param log: The logger object.
    :type log: logging.Logger
    :return: The set of supporting reads IDs.
    :rtype: set
    """
    supporting_reads = set()
    is_insertion = var.isInsertion()
    # Expected alleles do not depend on the read: they are prepared once
    empty_marker = VCFRecord.getEmptyAlleleMarker()
    var_alt = var.alt[0].upper().replace(empty_marker, "")
    var_ref = var.ref.upper().replace(empty_marker, "")
    # Coordinates to test: most upstream first then most downstream only if it
    # is a different location
    tested_coords = [(var.upstream_start, var.upstream_end)]
    if var.upstream_start != var.downstream_start:
        tested_coords.append((var.downstream_start, var.downstream_end))
    for read in FH_aln.fetch(var.chrom, var.upstream_start - 1, var.downstream_end):
        if read.is_duplicate:
            continue
        reads_pos = read.get_reference_positions()
        if not reads_pos:  # Skip alignment with problem
            continue
        ref_start = reads_pos[0] + 1  # 0-based to 1-based
        ref_end = reads_pos[-1] + 1  # 0-based to 1-based
        if ref_start > var.upstream_start or ref_end < var.downstream_end:
            continue  # The read does not cover all the variant region
        ref_aln, read_aln = getAlnCmp(read, chrom_seq[ref_start - 1:ref_end])
        for coord_start, coord_end in tested_coords:
            ref, alt = getReadRefAlt(ref_aln, read_aln, ref_start, is_insertion, coord_start, coord_end)
            read_ref = "".join(ref)
            read_alt = "".join(alt)
            if read_alt.upper() == var_alt and read_ref.upper() == var_ref:  # The alternative is present on tested coordinates
                log.debug("{}\t{}/{}\t'{}'\t'{}'\t{}".format(read.query_name, var.ref, var.alt[0], read_ref, read_alt, read.cigarstring))
                supporting_reads.add(read.query_name)  # Fragment is supporting if at least one of its reads is supporting
                break
    return supporting_reads
    def setUp(self):
        """Create the temporary selected RNA file and the annotated VCF used by the tests, and store the written records in self.variants."""
        tmp_prefix = os.path.join(tempfile.gettempdir(), str(uuid.uuid1()))

        # Temporary files
        self.tmp_selected_rna = tmp_prefix + "_rna.tsv"
        self.tmp_variants = tmp_prefix + ".vcf"
        self.tmp_output = tmp_prefix + "_out.vcf"

        # Create RNA ref
        with open(self.tmp_selected_rna, "w") as FH_rna:
            FH_rna.writelines([
                "#Gene\tTranscript\n",
                "Gene_1\tENST_selected1\n",
                "Gene_1\tENST_selected2\n"
            ])

        # Values shared by many annotations
        missense = "missense_variant"
        synonymous = "synonymous_variant"
        selected = "ENST_selected1"
        eur_rare = "0.001&0.001"
        eur_01 = "0.01&0.01"
        eur_05 = "0.05&0.05"
        colloc_popaf = "ANN.COLLOC&ANN.popAF"

        def annot(allele, consequence, feature, eur_af, gnomad_af, expected):
            """Return one annotation with its expected annotation filter."""
            return {
                "Allele": allele,
                "Consequence": consequence,
                "Feature": feature,
                "EUR_AF": eur_af,
                "gnomAD_AF": gnomad_af,
                "expected_filter": expected
            }

        def record(name, annots, expected, pos=14, ref="G", alt="T"):
            """Return one variant on artificial_chr1 with its expected variant filters."""
            info = {}
            if annots is not None:
                info["ANN"] = annots
            info["expected_filter"] = expected
            return VCFRecord("artificial_chr1", pos, name, ref, [alt], None, None, info)

        # Create VCF
        with AnnotVCFIO(self.tmp_variants, "w") as FH_var:
            FH_var.ANN_titles = ["Allele", "Consequence", "Feature", "EUR_AF", "gnomAD_AF", "expected_filter"]
            FH_var.info = {
                "ANN": HeaderInfoAttr(
                    "ANN",
                    "Consequence annotations from Ensembl VEP. Format: Allele|Consequence|Feature|gnomAD_AF|expected_filter.",
                    type="String",
                    number="."
                ),
                "expected_filter": HeaderInfoAttr(
                    "expected_filter",
                    "The expected filters.",
                    type="String",
                    number="."
                )
            }
            FH_var.writeHeader()
            self.variants = [
                record("alt_00", [
                    annot("T", missense, selected, eur_rare, 0.001, "PASS")
                ], ["PASS"]),
                record("alt_01", None, ["CSQ"]),  # Without annotation
                record("alt_02", [
                    annot("T", synonymous, selected, eur_rare, 0.001, "ANN.CSQ")
                ], ["CSQ"]),
                record("alt_03", [
                    annot("T", missense, selected, eur_rare, 0.01, "ANN.popAF")
                ], ["popAF"]),
                record("alt_04", [
                    annot("T", missense, "other", eur_rare, 0.001, "ANN.RNA")
                ], ["CSQ"]),
                record("alt_05", [
                    annot("G", missense, selected, eur_rare, 0.001, "ANN.COLLOC")
                ], ["CSQ"]),
                record("alt_06", [
                    annot("T", missense, selected, eur_rare, 0.001, "PASS"),
                    annot("C", missense, selected, eur_rare, 0.001, "ANN.COLLOC")
                ], ["PASS"]),
                record("alt_07", [
                    annot("T", missense, selected, eur_rare, 0.01, "ANN.popAF"),
                    annot("C", missense, selected, eur_rare, 0.001, "ANN.COLLOC")
                ], ["popAF"]),
                record("alt_08", [
                    annot("T", synonymous, selected, eur_rare, 0.001, "ANN.CSQ"),
                    annot("C", missense, selected, eur_rare, 0.001, "ANN.COLLOC")
                ], ["CSQ"]),
                record("alt_09", [
                    annot("T", missense, "other", eur_rare, 0.001, "ANN.RNA"),
                    annot("C", missense, selected, eur_rare, 0.001, "ANN.COLLOC")
                ], ["CSQ"]),
                record("alt_10", [
                    annot("T", synonymous, "other", eur_rare, 0.01, "ANN.CSQ&ANN.RNA&ANN.popAF"),
                    annot("C", missense, selected, eur_rare, 0.001, "ANN.COLLOC")
                ], ["CSQ", "popAF"]),
                record("alt_11", [
                    annot("T", synonymous, selected, eur_rare, 0.01, "ANN.CSQ&ANN.popAF"),
                    annot("C", missense, selected, eur_rare, 0.001, "ANN.COLLOC")
                ], ["CSQ", "popAF"]),
                record("alt_12", [
                    annot("T", synonymous, selected, eur_rare, 0.01, "ANN.CSQ&ANN.popAF"),
                    annot("T", missense, "other", eur_rare, 0.01, "ANN.RNA&ANN.popAF"),
                    annot("C", missense, selected, eur_rare, 0.001, "ANN.COLLOC")
                ], ["CSQ", "popAF"]),
                record("alt_13", [
                    annot("T", synonymous, selected, eur_01, 0.001, "ANN.CSQ&ANN.popAF"),
                    annot("C", missense, selected, eur_05, 0.001, colloc_popaf)
                ], ["CSQ", "popAF"]),
                record("alt_14", [
                    annot("GT", missense, selected, eur_01, 0.001, "ANN.popAF"),
                    annot("C", missense, selected, eur_05, 0.001, colloc_popaf),
                    annot("T", missense, selected, eur_05, 0.001, colloc_popaf)
                ], ["popAF"], alt="GT"),
                record("alt_15", [
                    annot("GT", missense, selected, eur_01, 0.001, colloc_popaf),
                    annot("T", missense, selected, eur_01, 0.001, "ANN.popAF"),
                    annot("C", missense, selected, eur_05, 0.001, colloc_popaf)
                ], ["popAF"], pos=15, ref="-"),
                record("alt_15", [  # Same ID as the previous record
                    annot("-", missense, selected, eur_01, 0.001, "ANN.popAF"),
                    annot("G", missense, selected, eur_01, 0.001, colloc_popaf),
                    annot("C", missense, selected, eur_05, 0.001, colloc_popaf)
                ], ["popAF"], alt="-"),
                record("alt_16", [
                    annot("-", missense, selected, eur_01, 0.001, colloc_popaf),
                    annot("G", missense, selected, eur_01, 0.001, "ANN.popAF"),
                    annot("C", missense, selected, eur_05, 0.001, colloc_popaf)
                ], ["popAF"], ref="GG", alt="G")
            ]
            for curr_var in self.variants:
                FH_var.write(curr_var)
Ejemplo n.º 5
0
    def setUp(self):
        tmp_folder = tempfile.gettempdir()
        unique_id = str(uuid.uuid1())

        # Temporary files
        self.tmp_var_filters = os.path.join(tmp_folder,
                                            unique_id + "_varFilters.json")
        self.tmp_annot_filters = os.path.join(tmp_folder,
                                              unique_id + "_annFilters.json")
        self.tmp_variants = os.path.join(tmp_folder, unique_id + ".vcf")
        self.tmp_output = os.path.join(tmp_folder, unique_id + "_out.vcf")

        # Command
        self.cmd = [
            "filterAnnotVCF.py", "--input-variants", self.tmp_variants,
            "--output-variants", self.tmp_output
        ]

        # Create filters
        with open(self.tmp_var_filters, "w") as FH_filter:
            FH_filter.write("""{
    "class": "FiltersCombiner",
    "operator": "or",
    "filters": [
        {
            "class": "Filter",
            "getter": "filter",
            "action": "select",
            "aggregator": "ratio:1",
            "operator": "!=",
            "values": "CSQ"
        }, {
            "class": "Filter",
            "getter": "chrom",
            "action": "select",
            "aggregator": "nb:1",
            "operator": "==",
            "values": "artificial_chr2"
        }
    ]
}""")
        with open(self.tmp_annot_filters, "w") as FH_filter:
            FH_filter.write("""{
    "class": "Filter",
    "getter": "FILTER",
    "action": "select",
    "aggregator": "ratio:1",
    "operator": "==",
    "values": "PASS"
}""")

        # Create VCF
        with AnnotVCFIO(self.tmp_variants, "w") as FH_var:
            FH_var.ANN_titles = ["Allele", "id", "is_filtered", "FILTER"]
            FH_var.info = {
                "ANN":
                HeaderInfoAttr(
                    "ANN",
                    "Consequence annotations from Ensembl VEP. Format: Allele|id|is_filtered|FILTER.",
                    type="String",
                    number="."),
                "is_filtered":
                HeaderInfoAttr("is_filtered",
                               "The expected result.",
                               type="Integer",
                               number="1")
            }
            FH_var.writeHeader()
            self.variants = [
                VCFRecord("artificial_chr1", 10, "alt_00", "G", ["T"], None,
                          ["PASS"], {"is_filtered": 0}),
                VCFRecord("artificial_chr1", 10, "alt_01", "G", ["T"], None,
                          ["CSQ"], {"is_filtered": 1}),
                VCFRecord(
                    "artificial_chr2",
                    10,
                    "alt_02",
                    "G",
                    ["T"],
                    None,
                    ["CSQ"],
                    {
                        "is_filtered": 0,  # Proctected
                    }),
                VCFRecord(
                    "artificial_chr1", 10, "alt_03", "G", ["T"], None,
                    ["PASS"], {
                        "ANN": [{
                            "Allele": "T",
                            "id": "ann_00",
                            "FILTER": "PASS",
                            "is_filtered": 0
                        }],
                        "is_filtered":
                        0
                    }),
                VCFRecord(
                    "artificial_chr1", 10, "alt_04", "G", ["T"], None,
                    ["PASS"], {
                        "ANN": [{
                            "Allele": "C",
                            "id": "ann_01",
                            "FILTER": "ANN.COLLOC",
                            "is_filtered": 1
                        }],
                        "is_filtered":
                        0
                    }),
                VCFRecord(
                    "artificial_chr1", 10, "alt_05", "G", ["T"], None, ["CSQ"],
                    {
                        "ANN": [{
                            "Allele": "C",
                            "id": "ann_02",
                            "FILTER": "ANN.COLLOC",
                            "is_filtered": 1
                        }],
                        "is_filtered":
                        1
                    }),
                VCFRecord(
                    "artificial_chr1", 10, "alt_06", "G", ["T"], None, ["CSQ"],
                    {
                        "ANN": [{
                            "Allele": "T",
                            "id": "ann_03",
                            "FILTER": "PASS",
                            "is_filtered": 0
                        }],
                        "is_filtered":
                        1
                    }),
                VCFRecord(
                    "artificial_chr1", 10, "alt_07", "G", ["T"], None,
                    ["PASS"], {
                        "ANN": [
                            {
                                "Allele": "T",
                                "id": "ann_04",
                                "FILTER": "PASS",
                                "is_filtered": 0
                            },
                            {
                                "Allele": "C",
                                "id": "ann_05",
                                "FILTER": "ANN.COLLOC",
                                "is_filtered": 1
                            },
                        ],
                        "is_filtered":
                        0
                    }),
                VCFRecord(
                    "artificial_chr1", 10, "alt_08", "G", ["T"], None,
                    ["PASS"], {
                        "ANN": [
                            {
                                "Allele": "T",
                                "id": "ann_06",
                                "FILTER": "ANN.popAF",
                                "is_filtered": 1
                            },
                            {
                                "Allele": "C",
                                "id": "ann_07",
                                "FILTER": "ANN.COLLOC&ANN.popAF",
                                "is_filtered": 1
                            },
                        ],
                        "is_filtered":
                        0
                    }),
                VCFRecord(
                    "artificial_chr2",
                    10,
                    "alt_09",
                    "G",
                    ["T"],
                    None,
                    ["CSQ"],
                    {
                        "ANN": [
                            {
                                "Allele": "T",
                                "id": "ann_08",
                                "FILTER": "ANN.popAF",
                                "is_filtered": 1
                            },
                            {
                                "Allele": "C",
                                "id": "ann_09",
                                "FILTER": "ANN.COLLOC&ANN.popAF",
                                "is_filtered": 1
                            },
                        ],
                        "is_filtered":
                        0  # Protected
                    }),
                VCFRecord(
                    "artificial_chr2",
                    10,
                    "alt_10",
                    "G",
                    ["T"],
                    None,
                    ["CSQ"],
                    {
                        "ANN": [
                            {
                                "Allele": "T",
                                "id": "ann_10",
                                "FILTER": "PASS",
                                "is_filtered": 0
                            },
                            {
                                "Allele": "C",
                                "id": "ann_11",
                                "FILTER": "ANN.COLLOC&ANN.popAF",
                                "is_filtered": 1
                            },
                        ],
                        "is_filtered":
                        0  # Protected
                    })
            ]
            for idx, curr_var in enumerate(self.variants):
                FH_var.write(curr_var)
    def setUp(self):
        """
        Write the input files of filterVCFTargets.py in the temporary folder.

        Creates a two-sequence FASTA and its faidx, a BED with three targets on
        artificial_chr1 and a VCF. Each VCF record stores in INFO/target the ID
        of the target it is annotated with (None when it has none); the trailing
        comment of each row explains the position of the variant relative to
        the targets. The records are kept in self.variants and the command line
        in self.cmd.
        """
        path_prefix = os.path.join(tempfile.gettempdir(), str(uuid.uuid1()))

        # Temporary files
        self.tmp_sequences = path_prefix + ".fasta"
        self.tmp_faidx = path_prefix + ".fasta.fai"
        self.tmp_regions = path_prefix + ".bed"
        self.tmp_variants = path_prefix + ".vcf"
        self.tmp_output = path_prefix + "_out.vcf"

        # Exec command
        self.cmd = [
            "filterVCFTargets.py",
            "--mode", "remove",
            "--input-variants", self.tmp_variants,
            "--input-targets", self.tmp_regions,
            "--input-reference", self.tmp_sequences,
            "--output-variants", self.tmp_output,
        ]

        # Create fasta (sequences are kept on one line to stay aligned with the rulers)
        with FastaIO(self.tmp_sequences, "w") as FH_seq:
            # Repeats:                                       ****....            ...***
            # Region:                                 |----|        |------------|         |------|
            FH_seq.write(Sequence("artificial_chr1", "CTCAGTCATGTATGTATGTGCTCACAAAGTAGTAGATCATGGCAC"))
            #                                         123456789| | | | | | | | | | | | | | | | | |
            #                                                  10| 14| 18| 22| 26| 30| 34| 38| 42|
            #                                                    12  16  20  24  28  32  36  40  44
            FH_seq.write(Sequence("artificial_chr2", "CGATNNNCGAT"))
            #                                         123456789|
            #                                                  10

        # Create faidx: one tab-separated row per sequence, no trailing newline
        with open(self.tmp_faidx, "w") as FH_fai:
            FH_fai.write(
                "artificial_chr1\t45\t17\t45\t46\n"
                "artificial_chr2\t11\t80\t11\t12"
            )

        # Create targets
        targets = (
            (1, 6, "target_1"),
            (15, 28, "target_2"),
            (38, 45, "target_3"),
        )
        with BEDIO(self.tmp_regions, "w", write_nb_col=4) as FH_bed:
            for start, end, name in targets:
                FH_bed.write(BEDRecord("artificial_chr1", start, end, name))

        # Variants description: (pos, id, ref, alt, INFO/target)
        variants_desc = [
            # Substit single nt
            (14, "alt_00", "G", "T", None),  # Before target ; first nt before target
            (15, "alt_01", "G", "T", "target_2"),  # On target ; first nt of target
            (21, "alt_02", "C", "G", "target_2"),  # On target
            (28, "alt_03", "A", "G", "target_2"),  # On target ; last nt
            (29, "alt_04", "G", "C", None),  # After target ; first nt after target
            # Substit multi nt
            (7, "alt_05", "CATGTATG", "GTACCCGC", None),  # Before target ; first nt before target
            (11, "alt_06", "TATGTATG", "GTACCCGC", "target_2"),  # Overlap target start
            (13, "alt_07", "TGTATGTGCTCACAAAGTA", "CCCGCCCCTACATTGCAGT", "target_2"),  # Include target
            (15, "alt_08", "TATGTGCTCACAAA", "CGCCCCTACATTGC", "target_2"),  # Exact target
            (21, "alt_09", "CTCACAA", "GTACCCG", "target_2"),  # Included by target
            (24, "alt_10", "ACAAAGTA", "GTACCCG", "target_2"),  # Overlap target end
            (29, "alt_11", "GTAGTAGAT", "GTACCCGA", None),  # After target ; first nt after target
            # Ins single nt
            (14, "alt_12", "G", "GA", None),  # Before target ; first nt before target
            (15, "alt_12.2", "-", "A", None),  # Before target ; first nt before target
            (15, "alt_13", "A", "TG", "target_2"),  # On target ; first nt of target
            (21, "alt_14", "C", "CG", "target_2"),  # On target
            (27, "alt_15", "A", "AT", "target_2"),  # On target ; last nt
            (28, "alt_15.2", "-", "T", "target_2"),  # On target ; last nt
            (28, "alt_16", "A", "AT", None),  # After target ; first nt after target
            # Movable ins single nt
            (14, "alt_17", "G", "GT", "target_2"),  # Movable to first nt of target
            (28, "alt_18", "A", "AA", "target_2"),  # Movable to last nt of target
            # Del single nt
            (14, "alt_19", "G", "", None),  # Before target ; first nt before target
            (15, "alt_20", "T", "", "target_2"),  # On target ; first nt of target
            (21, "alt_21", "C", "", "target_2"),  # On target
            (28, "alt_22", "A", "", "target_2"),  # On target ; last nt
            (29, "alt_23", "G", "", None),  # After target ; first nt after target
            # Del multi nt
            (11, "alt_24", "TATG", "T", None),  # Before target ; first nt before target
            (13, "alt_25", "TGTA", "T", "target_2"),  # On target ; first nt of target
            (20, "alt_26", "GCTC", "G", "target_2"),  # On target
            (27, "alt_27", "AAGT", "A", "target_2"),  # On target ; last nt
            (28, "alt_28", "AGT", "A", None),  # After target ; first nt after target
            # Movable del multi nt
            (7, "alt_29", "CATGT", "C", "target_2"),  # On repeat and movable to first nt of target
            (12, "alt_30", "ATG", "A", "target_2"),  # Movable to first nt of target
            (28, "alt_31", "AGTA", "A", "target_2"),  # Movable to last nt of target
            (30, "alt_32", "TAGT", "T", "target_2"),  # On repeat and movable to last nt of target
        ]

        # Create VCF
        with VCFIO(self.tmp_variants, "w") as FH_var:
            FH_var.info = {
                "target": HeaderInfoAttr(
                    "target",
                    "The ID of the overlapped target.",
                    type="String",
                    number="1"
                )
            }
            FH_var.writeHeader()
            # Every record gets its own alt list and info dict
            self.variants = [
                VCFRecord("artificial_chr1", pos, var_id, ref, [alt], None, None, {"target": target})
                for pos, var_id, ref, alt, target in variants_desc
            ]
            for curr_var in self.variants:
                FH_var.write(curr_var)
Ejemplo n.º 7
0
    def setUp(self):
        """
        Write the input VCF of filterVCFBySOR.py in the temporary folder.

        Every record is a G>T substitution on artificial_chr1 carrying the
        strand-specific read counts (SAR, SAF, SRR, SRF) and, in INFO/expected,
        the filter tag noted for it ("PASS" or "strandRatioBias"). The records
        are kept in self.variants and the command line in self.cmd.
        """
        path_prefix = os.path.join(tempfile.gettempdir(), str(uuid.uuid1()))

        # Temporary files
        self.tmp_variants = path_prefix + ".vcf"
        self.tmp_output = path_prefix + "_out.vcf"

        # Exec command
        self.cmd = [
            "filterVCFBySOR.py",
            "--input-variants", self.tmp_variants,
            "--output-variants", self.tmp_output,
        ]

        # INFO header
        info_header = {
            "expected": HeaderInfoAttr(
                "expected",
                "Expected filter tag.",
                type="String",
                number="1"
            ),
            "SAR": HeaderInfoAttr(
                "SAR",
                "Number of reads supporting the alternative allele in reverse strand.",
                type="Integer",
                number="1"
            ),
            "SAF": HeaderInfoAttr(
                "SAF",
                "Number of reads supporting the alternative allele in forward strand.",
                type="Integer",
                number="1"
            ),
            "SRR": HeaderInfoAttr(
                "SRR",
                "Number of reads supporting the reference allele in reverse strand.",
                type="Integer",
                number="1"
            ),
            "SRF": HeaderInfoAttr(
                "SRF",
                "Number of reads supporting the reference allele in forward strand.",
                type="Integer",
                number="1"
            ),
        }

        # Variants description: (pos, id, SAR, SAF, SRR, SRF, expected)
        variants_desc = [
            # 0.5 alt, 0.5 ref, low DP, alt no bias, ref no bias
            (10, "sub_01", 5, 5, 5, 5, "PASS"),
            # 0.05 alt, 0.95 ref, good DP, alt no bias, ref no bias
            (20, "sub_02", 5, 5, 95, 95, "PASS"),
            # 0.05 alt, 0.95 ref, good DP, alt no bias, ref strand bias
            (30, "sub_03", 5, 5, 150, 30, "PASS"),
            # 0.05 alt, 0.95 ref, good DP, alt strand bias, ref no bias
            (40, "sub_04", 9, 1, 95, 95, "strandRatioBias"),
            # 0.05 alt, 0.95 ref, good DP, alt strand bias, ref strand bias => no bias
            (50, "sub_05", 9, 1, 150, 30, "PASS"),
            # 0.5 alt, 0.5 ref, low DP, alt strand bias, ref no bias
            (60, "sub_06", 9, 1, 5, 5, "strandRatioBias"),
            # 0.29 alt, 0.71 ref, good DP, alt no bias, ref no bias
            (70, "sub_07", 400, 600, 1400, 1000, "PASS"),
            # 0.71 alt, 0.29 ref, good DP, alt no bias, ref no bias
            (80, "sub_08", 1400, 1000, 400, 600, "PASS"),
            # 1.0 alt, 0.0 ref, good DP, alt no bias, ref 0 DP
            (90, "sub_09", 1400, 1000, 0, 0, "PASS"),
            # 1.0 alt, 0.0 ref, good DP, alt no bias, ref 2 DP
            (100, "sub_10", 1400, 1000, 0, 2, "PASS"),
            # 1.0 alt, 0.0 ref, limit DP, alt no bias, ref 0 DP
            (110, "sub_11", 90, 30, 0, 0, "PASS"),
            # 1.0 alt, 0.0 ref, limit DP, alt no bias, ref 2 DP
            (120, "sub_12", 90, 30, 0, 2, "PASS"),
            # 1.0 alt, 0.0 ref, limit DP, alt strand bias, ref 0 DP
            (130, "sub_13", 90, 10, 0, 0, "strandRatioBias"),
            # 1.0 alt, 0.0 ref, limit DP, alt strand bias, ref 2 DP
            (140, "sub_14", 90, 10, 0, 2, "strandRatioBias"),
            # 1.0 alt, 0.0 ref, limit DP, alt strand bias, ref 1 DP ; expected tag is debatable: 2.89
            (150, "sub_15", 90, 10, 1, 0, "PASS"),
            # 0.04 alt, 0.96 ref, good DP, alt strand bias, ref no bias
            (160, "sub_16", 15, 2, 200, 200, "strandRatioBias"),
            # 0.04 alt, 0.96 ref, good DP, alt strand bias, ref no bias ; with SAR at 12 => PASS
            (170, "sub_17", 13, 2, 200, 200, "strandRatioBias"),
            # 0.04 alt, 0.96 ref, good DP, alt strand bias, ref strand bias => no bias
            (180, "sub_18", 13, 2, 350, 50, "PASS"),
            # 0.04 alt, 0.96 ref, good DP, alt strand bias, ref strand bias rev => bias
            (190, "sub_19", 13, 2, 50, 350, "strandRatioBias"),
            # 0.5 alt, 0.5 ref, low DP, alt strand bias, ref no bias
            (200, "sub_20", 14, 2, 8, 8, "strandRatioBias"),
        ]

        # Create VCF
        with VCFIO(self.tmp_variants, "w") as FH_var:
            FH_var.info = info_header
            FH_var.writeHeader()
            self.variants = [
                VCFRecord(
                    "artificial_chr1", pos, var_id, "G", ["T"], None, None,
                    {
                        "SAR": alt_rev,
                        "SAF": alt_fwd,
                        "SRR": ref_rev,
                        "SRF": ref_fwd,
                        "expected": expected
                    }
                )
                for pos, var_id, alt_rev, alt_fwd, ref_rev, ref_fwd, expected in variants_desc
            ]
            for curr_var in self.variants:
                FH_var.write(curr_var)
Ejemplo n.º 8
0
 def setUp(self):
     """
     Prepare the fake VCF handle, the reference sequence and the record templates.

     variant_1, variant_2 and expected_merge are created without position,
     reference allele and alternative alleles (None); presumably each test
     fills them before use.
     """
     # Header declarations given to the fake VCF handle
     info_header = {
         "AF": HeaderInfoAttr("AF", "Alternative alleles frequencies", "Float", "A")
     }
     format_header = {
         "AD": HeaderFormatAttr("AD", "Alternative alleles depths", "Integer", "A"),
         "DP": HeaderFormatAttr("DP", "total depth", "Integer", "1")
     }
     self.vcfio = FakeVCFIO(info_header, format_header)
     self.ref_seq = "ACGCAAATCTCGGCATGCCGATT"
     #               | | | | | |  |  |  |  |
     #               1 3 5 7 9 11 14 17 20 23
     # Positional arguments are: chrom, pos, id, ref, alt, qual, filter
     self.variant_1 = VCFRecord(
         "chr1", None, "artificial_1", None, None, 10, ["lowQual", "lowDP"],
         info={"AF": [0.05]},
         pFormat=["DP", "AD"],
         samples={
             "splA": {"AD": [10], "DP": 100},
             "splB": {"AD": [40], "DP": 4900}
         }
     )
     self.variant_2 = VCFRecord(
         "chr1", None, None, None, None, 30, ["PASS"],
         info={"AF": [0.06]},
         pFormat=["DP", "AD"],
         samples={
             "splA": {"AD": [5], "DP": 50},
             "splB": {"AD": [31], "DP": 550}
         }
     )
     self.expected_merge = VCFRecord(
         "chr1", None, None, None, None, 20, ["lowQual", "lowDP"],
         info={
             "AF": [0.06],
             "MCO_QUAL": [10, 30],
             "MCO_VAR": ["chr1:5=A/T", "chr1:20=G/C"]
         },
         pFormat=["DP", "AD"],
         samples={
             "splA": {"AD": [5], "DP": 50},
             "splB": {"AD": [31], "DP": 550}
         }
     )
Ejemplo n.º 9
0
 def testIsHLA(self):
     """
     Check isHLA() on pairs of breakends annotated under the "TESTANN" key.

     All pairs share the same coordinates (chr1:110 joined to chr1:200); only
     the gene symbols of the annotations change between cases.
     """
     def annotations(symbols):
         # One annotation per symbol, all on the forward strand
         return [{"SYMBOL": curr_symbol, "STRAND": "+"} for curr_symbol in symbols]

     # (symbols on first breakend, symbols on second breakend, expected isHLA)
     cases = [
         (["HLA-DRB1", "HLA-DMB"], ["GENE_N02", "GENE_N04"], True),
         (["HLA-DRB1", "HLA-DMB"], [], True),
         ([], [], False),  # No annotation on both sides
         (["GENE_N01", "GENE_N04"], ["HLAN02", "GENE_N04"], False),  # "HLAN02" must not be seen as HLA
     ]
     for up_symbols, down_symbols, expected in cases:
         up = VCFRecord(
             "chr1", 110, "id_01", "A", ["A[chr1:200["],
             info={
                 "RNA_FIRST": True,
                 "MATEID": "id_02",
                 "TESTANN": annotations(up_symbols)
             }
         )
         down = VCFRecord(
             "chr1", 200, "id_02", "A", ["]chr1:110]A"],
             info={
                 "MATEID": "id_01",
                 "TESTANN": annotations(down_symbols)
             }
         )
         if expected:
             self.assertTrue(isHLA(up, down, "TESTANN"))
         else:
             self.assertFalse(isHLA(up, down, "TESTANN"))
Ejemplo n.º 10
0
 def testInner(self):
     """
     Check isInner() on pairs of breakends annotated under the "TESTANN" key.

     Each case gives, for both breakends, the position, the breakend
     alternative allele and the (symbol, strand) of the overlapped genes.
     """
     def breakend(pos, rec_id, alt, mate_id, genes, is_first):
         # Build one breakend record on chr1; only the first one is tagged RNA_FIRST
         info = {}
         if is_first:
             info["RNA_FIRST"] = True
         info["MATEID"] = mate_id
         info["TESTANN"] = [
             {"SYMBOL": symbol, "STRAND": strand} for symbol, strand in genes
         ]
         return VCFRecord("chr1", pos, rec_id, "A", [alt], info=info)

     # ((pos, alt, genes) of first breakend, (pos, alt, genes) of second breakend, expected isInner)
     cases = [
         (  # +/+ not inner (starts on limit)
             (140, "A[chr1:299[", [("GENE_N01", "+"), ("GENE_N04", "+")]),
             (299, "]chr1:140]A", [("GENE_N03", "+"), ("GENE_N06", "-")]),
             False
         ),
         (  # +/+ inner gene 4 (starts on limit)
             (140, "A[chr1:199[", [("GENE_N01", "+"), ("GENE_N04", "+")]),
             (199, "]chr1:140]A", [("GENE_N02", "+"), ("GENE_N04", "+")]),
             True
         ),
         (  # -/- inner gene 6
             (298, "]chr1:320]A", [("GENE_N06", "-")]),
             (320, "A[chr1:298[", [("GENE_N06", "-"), ("GENE_N03", "+")]),
             True
         ),
         (  # +/+ inner gene 6 => not valid strand
             (298, "A[chr1:320[", [("GENE_N06", "-")]),
             (320, "]chr1:298]A", [("GENE_N06", "-"), ("GENE_N03", "+")]),
             False
         ),
         (  # +/- inner gene 6
             (298, "A[chr1:320[", [("GENE_N06", "-")]),
             (320, "A[chr1:298[", [("GENE_N06", "-"), ("GENE_N03", "+")]),
             True
         ),
     ]
     for (up_pos, up_alt, up_genes), (down_pos, down_alt, down_genes), expected in cases:
         up = breakend(up_pos, "id_01", up_alt, "id_02", up_genes, True)
         down = breakend(down_pos, "id_02", down_alt, "id_01", down_genes, False)
         # Comparison functions are created again for each call
         is_inner = isInner(
             up, down, "TESTANN", annCmpNameFct(False), regCmpNameFct(False)
         )
         if expected:
             self.assertTrue(is_inner)
         else:
             self.assertFalse(is_inner)
Ejemplo n.º 11
0
 def testInNormal(self):
     """
     Check inNormal() with gene pairs identified by ID and by symbol.

     The known pairs are stored as "<first gene>\\t<second gene>" strings. Each
     case is evaluated twice: against the IDs ("id") and against the symbols
     ("symbol").
     """
     normal_fusions_id = {"GENE_ID01\tGENE_ID02", "GENE_ID02\tGENE_ID03"}
     normal_fusions_symbol = {"GENE_N01\tGENE_N02", "GENE_N02\tGENE_N03"}

     def breakend(pos, rec_id, alt, mate_id, genes, is_first):
         # Build one breakend record on chr1; only the first one is tagged RNA_FIRST
         info = {}
         if is_first:
             info["RNA_FIRST"] = True
         info["MATEID"] = mate_id
         info["ANN"] = [
             {"SYMBOL": "GENE_N" + gene_nb, "Gene": "GENE_ID" + gene_nb, "STRAND": strand}
             for gene_nb, strand in genes
         ]
         return VCFRecord("chr1", pos, rec_id, "A", [alt], info=info)

     # ((pos, id, alt, mate id, genes) of first breakend, same for second breakend, expected inNormal)
     cases = [
         (  # Gene 1 joined to gene 2: known pair
             (140, "id_01", "A[chr1:199[", "id_02", [("01", "+"), ("04", "+")]),
             (199, "id_02", "]chr1:140]A", "id_01", [("02", "+")]),
             True
         ),
         (  # No known pair between the genes of the two breakends
             (140, "id_01", "A[chr1:299[", "id_02", [("01", "+"), ("04", "+")]),
             (299, "id_02", "]chr1:140]A", "id_01", [("03", "+"), ("06", "-")]),
             False
         ),
         (  # Same genes with the first breakend on the downstream position
             (299, "id_02", "A[chr1:140[", "id_01", [("03", "+"), ("06", "-")]),
             (140, "id_01", "]chr1:299]A", "id_02", [("01", "+"), ("04", "+")]),
             False
         ),
     ]
     for up_desc, down_desc, expected in cases:
         up = breakend(*up_desc, is_first=True)
         down = breakend(*down_desc, is_first=False)
         check = self.assertTrue if expected else self.assertFalse
         check(inNormal(up, down, "ANN", normal_fusions_id, "id"))
         check(inNormal(up, down, "ANN", normal_fusions_symbol, "symbol"))
Ejemplo n.º 12
0
 def setUp(self):
     """
     Prepare the fake VCF handle, the reference FASTA with its index and the record templates.

     The FASTA and faidx are written in the temporary folder under a unique
     name. variant_1, variant_2 and expected_merge are created without
     position, reference allele and alternative alleles (None); presumably
     each test fills them before use.
     """
     # VCF
     info_header = {
         "AF": HeaderInfoAttr("AF", "Alternative alleles frequencies", "Float", "A")
     }
     format_header = {
         "AD": HeaderFormatAttr("AD", "Alternative alleles depths", "Integer", "A"),
         "DP": HeaderFormatAttr("DP", "total depth", "Integer", "1")
     }
     self.vcfio = FakeVCFIO(info_header, format_header)
     # Ref seq
     path_prefix = os.path.join(tempfile.gettempdir(), str(uuid.uuid1()))
     self.tmp_fasta_path = path_prefix + ".fa"
     self.tmp_faidx_path = path_prefix + ".fa.fai"
     self.ref_seq = "ACGCAAATCTCGGCATGCCGATT"
     #               | | | | | |  |  |  |  |
     #               1 3 5 7 9 11 14 17 20 23
     with open(self.tmp_fasta_path, "w") as FH_seq:
         FH_seq.write(">chr1\n" + self.ref_seq)
     with open(self.tmp_faidx_path, "w") as FH_faidx:
         # Single tab-separated row; the second field is the sequence length
         faidx_fields = ["chr1", str(len(self.ref_seq)), "6", "60", "61"]
         FH_faidx.write("\t".join(faidx_fields))
     # Variants
     # Positional arguments are: chrom, pos, id, ref, alt, qual, filter
     self.variant_1 = VCFRecord(
         "chr1", None, "artificial_1", None, None, 10, ["lowQual", "lowDP"],
         info={"AF": [0.05]},
         pFormat=["DP", "AD"],
         samples={
             "splA": {"AD": [10], "DP": 100},
             "splB": {"AD": [40], "DP": 4900}
         }
     )
     self.variant_2 = VCFRecord(
         "chr1", None, None, None, None, 30, ["PASS"],
         info={"AF": [0.06]},
         pFormat=["DP", "AD"],
         samples={
             "splA": {"AD": [5], "DP": 50},
             "splB": {"AD": [31], "DP": 550}
         }
     )
     self.expected_merge = VCFRecord(
         "chr1", None, None, None, None, 20, ["lowQual", "lowDP"],
         info={
             "AF": [0.06],
             "MCO_QUAL": [10, 30],
             "MCO_VAR": ["chr1:5=A/T", "chr1:20=G/C"]
         },
         pFormat=["DP", "AD"],
         samples={
             "splA": {"AD": [5], "DP": 50},
             "splB": {"AD": [31], "DP": 550}
         }
     )
Ejemplo n.º 13
0
class MergeCoOccurVar(unittest.TestCase):
    """Unit tests of mergedRecord() with the reference read from a temporary indexed FASTA."""

    def setUp(self):
        """Create the fake VCF handler, the temporary reference files and the record templates."""
        # VCF
        info_attr = {
            "AF": HeaderInfoAttr("AF", "Alternative alleles frequencies",
                                 "Float", "A")
        }
        format_attr = {
            "AD": HeaderFormatAttr("AD", "Alternative alleles depths",
                                   "Integer", "A"),
            "DP": HeaderFormatAttr("DP", "total depth", "Integer", "1")
        }
        self.vcfio = FakeVCFIO(info_attr, format_attr)

        # Ref seq: FASTA and faidx share a unique prefix in the system temporary folder
        tmp_prefix = os.path.join(tempfile.gettempdir(), str(uuid.uuid1()))
        self.tmp_fasta_path = tmp_prefix + ".fa"
        self.tmp_faidx_path = tmp_prefix + ".fa.fai"
        self.ref_seq = "ACGCAAATCTCGGCATGCCGATT"
        #               | | | | | |  |  |  |  |
        #               1 3 5 7 9 11 14 17 20 23
        with open(self.tmp_fasta_path, "w") as fasta_writer:
            fasta_writer.write(">chr1\n{}".format(self.ref_seq))
        with open(self.tmp_faidx_path, "w") as faidx_writer:
            faidx_writer.write(
                "chr1\t{}\t6\t60\t61".format(len(self.ref_seq)))

        def samplesData(ad_spl_a, dp_spl_a, ad_spl_b, dp_spl_b):
            # Fresh per-sample AD/DP dictionaries for splA and splB
            return {
                "splA": {"AD": [ad_spl_a], "DP": dp_spl_a},
                "splB": {"AD": [ad_spl_b], "DP": dp_spl_b},
            }

        # Variants: pos, ref and alt stay None here, each test fills them in
        # Positional fields: chrom, pos, id, ref, alt, qual, filter, info, format, samples
        self.variant_1 = VCFRecord(
            "chr1", None, "artificial_1", None, None, 10,
            ["lowQual", "lowDP"], {"AF": [0.05]}, ["DP", "AD"],
            samplesData(10, 100, 40, 4900))
        self.variant_2 = VCFRecord(
            "chr1", None, None, None, None, 30,
            ["PASS"], {"AF": [0.06]}, ["DP", "AD"],
            samplesData(5, 50, 31, 550))
        self.expected_merge = VCFRecord(
            "chr1", None, None, None, None, 20,
            ["lowQual", "lowDP"],
            {
                "AF": [0.06],
                "MCO_QUAL": [10, 30],
                "MCO_VAR": ["chr1:5=A/T", "chr1:20=G/C"]
            },
            ["DP", "AD"],
            samplesData(5, 50, 31, 550))

    def tearDown(self):
        """Remove the temporary reference files that still exist."""
        for tmp_path in (self.tmp_fasta_path, self.tmp_faidx_path):
            if not os.path.exists(tmp_path):
                continue
            os.remove(tmp_path)

    def _checkMerge(self, first, second, merged, mco_var):
        """
        Configure the two variants and the expected record, then compare the observed merge with the expected one.

        :param first: Position, reference allele and alternative alleles for variant_1.
        :type first: tuple
        :param second: Position, reference allele and alternative alleles for variant_2.
        :type second: tuple
        :param merged: Position, reference allele and alternative alleles for the expected merge.
        :type merged: tuple
        :param mco_var: Value of the MCO_VAR info field of the expected merge.
        :type mco_var: list
        """
        templates = (self.variant_1, self.variant_2, self.expected_merge)
        for record, (pos, ref, alt) in zip(templates, (first, second, merged)):
            record.pos = pos
            record.ref = ref
            record.alt = alt
        self.expected_merge.info = {
            "AF": [0.06],
            "MCO_QUAL": [10, 30],
            "MCO_VAR": mco_var
        }
        # Eval
        with IdxFastaIO(self.tmp_fasta_path) as FH_ref:
            observed = mergedRecord(self.vcfio,
                                    self.variant_1, self.variant_1.getName(),
                                    self.variant_2, self.variant_2.getName(),
                                    FH_ref)
        self.assertEqual(strVariant(observed),
                         strVariant(self.expected_merge))

    def testMergedRecord_1_substit(self):
        self._checkMerge(
            first=(5, "A", ["T"]),
            second=(20, "G", ["C"]),
            merged=(5, "AAATCTCGGCATGCCG", ["TAATCTCGGCATGCCC"]),
            mco_var=["chr1:5=A/T", "chr1:20=G/C"])

    def testMergedRecord_2_largeSubstit(self):
        self._checkMerge(
            first=(5, "AAAT", ["TGCA"]),
            second=(10, "TC", ["GG"]),
            merged=(5, "AAATCTC", ["TGCACGG"]),
            mco_var=["chr1:5=AAAT/TGCA", "chr1:10=TC/GG"])

    def testMergedRecord_3_largeCloseSubstit(self):
        self._checkMerge(
            first=(5, "AAAT", ["TGCA"]),
            second=(9, "CT", ["GG"]),
            merged=(5, "AAATCT", ["TGCAGG"]),
            mco_var=["chr1:5=AAAT/TGCA", "chr1:9=CT/GG"])

    def testMergedRecord_4_delIns(self):
        self._checkMerge(
            first=(5, "AAAT", ["-"]),
            second=(10, "-", ["GGCATCT"]),
            merged=(5, "AAATC", ["CGGCATCT"]),
            mco_var=["chr1:5=AAAT/-", "chr1:10=-/GGCATCT"])

    def testMergedRecord_5_coDelIns(self):
        self._checkMerge(
            first=(5, "AAAT", ["-"]),
            second=(9, "-", ["AGG"]),
            merged=(5, "AAAT", ["AGG"]),
            mco_var=["chr1:5=AAAT/-", "chr1:9=-/AGG"])

    def testMergedRecord_6_insDel(self):
        self._checkMerge(
            first=(5, "-", ["GTGTG"]),
            second=(7, "ATC", ["-"]),
            merged=(5, "AAATC", ["GTGTGAA"]),
            mco_var=["chr1:5=-/GTGTG", "chr1:7=ATC/-"])

    def testMergedRecord_7_closeInsDel(self):
        self._checkMerge(
            first=(5, "-", ["GTGTG"]),
            second=(6, "AA", ["-"]),
            merged=(5, "AAA", ["GTGTGA"]),
            mco_var=["chr1:5=-/GTGTG", "chr1:6=AA/-"])

    def testMergedRecord_8_coInsDel(self):
        self._checkMerge(
            first=(5, "-", ["GTGTG"]),
            second=(5, "AA", ["-"]),
            merged=(5, "AA", ["GTGTG"]),
            mco_var=["chr1:5=-/GTGTG", "chr1:5=AA/-"])
Ejemplo n.º 14
0
        '-o',
        '--output-variants',
        required=True,
        help='The path to the outputted file (format: VCF).')
    args = parser.parse_args()

    # Process
    curr_chrom = {"name": "", "seq": None}
    with VCFIO(args.output_variants, "w") as FH_out_vcf:
        with VCFIO(args.input_variants) as FH_in_vcf:
            # Header
            FH_out_vcf.copyHeader(FH_in_vcf)
            FH_out_vcf.writeHeader()
            # Records
            for record in FH_in_vcf:
                if record.ref == VCFRecord.getEmptyAlleleMarker() or any([
                        alt == VCFRecord.getEmptyAlleleMarker()
                        for alt in record.alt
                ]):  # record is a standardized in/del
                    # Get previous nt
                    if record.chrom != curr_chrom["name"]:
                        curr_chrom["name"] = record.chrom
                        curr_chrom["seq"] = getChromSeq(
                            record.chrom, args.input_reference)
                    prev_nt = curr_chrom["seq"][record.pos - 2]
                    # Update record
                    record.pos -= 1
                    if record.ref == VCFRecord.getEmptyAlleleMarker(
                    ):  # Insertion
                        record.ref = prev_nt
                    else:  # Deletion
    def setUp(self):
        """
        Create the temporary reference (FASTA and faidx) and the input VCF, and store the filterVCFHomopolym.py command line in self.cmd.

        Each variant is also kept in self.variants; its "is_filtered" info value is 1 when the variant is annotated as adjacent to an homopolymer and 0 otherwise.
        """
        tmp_folder = tempfile.gettempdir()
        unique_id = str(uuid.uuid1())

        # Temporary files
        self.tmp_sequences = os.path.join(tmp_folder, unique_id + ".fasta")
        self.tmp_faidx = os.path.join(tmp_folder, unique_id + ".fasta.fai")
        self.tmp_variants = os.path.join(tmp_folder, unique_id + ".vcf")
        self.tmp_output = os.path.join(tmp_folder, unique_id + "_out.vcf")

        # Exec command
        self.cmd = [
            "filterVCFHomopolym.py", "--mode", "remove", "--homopolym-length",
            "4", "--input-variants", self.tmp_variants, "--input-reference",
            self.tmp_sequences, "--output-variants", self.tmp_output
        ]

        # Create fasta
        sequences = [
            # Ruler of 1-based positions, aligned with the sequences below
            #                               12  16  20  24  28  32  36  40  44  48  52  56  60  64  68  72  76  80  84  88  92  96  100
            #                     2 4 6 8 10| 14| 18| 22| 26| 30| 34| 38| 42| 46| 50| 54| 58| 62| 66| 70| 74| 78| 82| 86| 90| 94| 98| 102
            #                     | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
            ("artificial_chr1", "CGAATATGATCCAGCAATAAAAAGTTCCTACAGGAAAAAAGTAGAAAGAGAAACCTGTCTCTTGGATATTCTCGACACAGCAGGTCAAG"),
            ("artificial_chr2", "CGAATATGATCCAGCAATAAAAAGCTCCTACAGGCAAAAGTAGGCAAAGAGAAACCTGTCTCTTGGATATTCTCGACACAGCAGGTCAA"),
            ("artificial_chr3", "CGAATATGATCCAGCAATGAAAATTCCTACAGGTAAAACGTAGAAAGAGAAACCTGTCTCTTGGATATTCTCGACACAGCAGGTCAAG"),
            ("artificial_chr4", "CGAATATGATCCAGCAATAAAAAGTTCCTACAGGAAAAAAGTAGAAAGAGAAACCTGTCAAAAGGATATTCTCGACAAAACAGCAGAAAGTCAAG"),
            ("artificial_chr5", "CGAATATGATCCAGTAATAAAAAGTTCCTACAGGAAAAAAGTAGAAAGAGAAACCTGTCTCTTGGATATTCTCGACACAGCAGGTCAAG"),
            ("artificial_chr6", "CGAATATGATCCAGCAATAAAAAGTTCCTACAGGAAAAAAGTAGAAAGCACAACCTGTCTCTTGGAAAATCTCGACACAGCAGGTAAAACAATGCAGTAAAT"),
        ]
        with FastaIO(self.tmp_sequences, "w") as FH_seq:
            for seq_id, seq_str in sequences:
                FH_seq.write(Sequence(seq_id, seq_str))

        # Author's reference table of the 4-nt windows listed before and after each variant
        # (kept as comments: the original no-op string literal had no runtime effect).
        # Variant  before_start  before_end  before_seq  after_start  after_end  after_seq
        # alt_00   10            13          TCCA        15           18         CAAT
        # alt_01   20            23          AAAA        25           28         TTCC
        # alt_02   30            33          ACAG        35           38         AAAA
        # alt_03   40            43          AGTA        45           48         AAAG
        # alt_04   10            13          TCCA        16           19         AATA
        # alt_05   20            23          AAAA        26           29         TCCT
        # alt_06   30            33          ACAG        36           39         AAAA
        # alt_07   40            43          GTAG        46           49         AAAG
        # alt_08   11            14          CCAG        15           18         CAAT
        # alt_09   20            23          AAAA        24           27         TTCC
        # alt_10   31            34          AGGT        35           38         AAAA
        # alt_11   40            43          GTAG        44           47         AAAG
        # alt_12   11            14          CCAG        15           18         CAAT
        # alt_13   20            23          AAAA        24           27         GTTC
        # alt_14   31            34          CAGG        35           38         AAAA
        # alt_15   41            44          GTAG        45           48         AAAG
        # alt_16   50            53          GAAA        57           60         GTCA
        # alt_17   60            63          AAAA        67           70         TATT
        # alt_18   70            73          TCTC        77           80         AAAA
        # alt_19   80            83          ACAG        87           90         AAAG
        # alt_20   11            14          CCAG        16           19         AATA
        # alt_21   20            23          AAAA        25           28         TTCC
        # alt_22   31            34          CAGG        36           39         AAAA
        # alt_23   40            43          AGTA        45           48         AAAG
        # alt_24   11            14          CCAG        17           20         ATAA
        # alt_25   19            22          AAAA        26           29         TCCT
        # alt_26   29            32          TACA        35           38         AAAA
        # alt_27   38            41          AAAG        45           48         AAAG
        # alt_28   50            53          ACAA        61           64         CTTG
        # alt_29   66            69          AAAA        76           79         CACA
        # alt_30   76            79          CACA        86           89         AAAA
        # alt_31   88            91          AACA        99           102        AAAT

        # Create faidx
        # One row per sequence: five tab-separated fields, rows joined by newlines, no trailing newline
        faidx_rows = [
            ("artificial_chr1", 89, 17, 89, 90),
            ("artificial_chr2", 89, 124, 89, 90),
            ("artificial_chr3", 88, 231, 88, 89),
            ("artificial_chr4", 95, 337, 95, 96),
            ("artificial_chr5", 89, 450, 89, 90),
            ("artificial_chr6", 102, 557, 102, 103),
        ]
        with open(self.tmp_faidx, "w") as FH_fai:
            FH_fai.write("\n".join(
                "\t".join(str(field) for field in row) for row in faidx_rows))

        # Create VCF
        # Columns: chrom, pos, id, ref, alt, is_filtered
        variant_specs = [
            # Substit single nt
            ("artificial_chr1", 14, "alt_00", "G", "T", 0),  # Without adjacent homopolymers
            ("artificial_chr1", 24, "alt_01", "G", "T", 1),  # Adjacent homopolymers upstream
            ("artificial_chr1", 34, "alt_02", "G", "T", 1),  # Adjacent homopolymers downstream
            ("artificial_chr1", 44, "alt_03", "G", "T", 0),  # Adjacent too short homopolymers
            # Substit multi nt
            ("artificial_chr2", 14, "alt_04", "GC", "TA", 0),  # Without adjacent homopolymers
            ("artificial_chr2", 24, "alt_05", "GC", "TA", 1),  # Adjacent homopolymers upstream
            ("artificial_chr2", 34, "alt_06", "GC", "TA", 1),  # Adjacent homopolymers downstream
            ("artificial_chr2", 44, "alt_07", "GC", "TA", 0),  # Adjacent too short homopolymers
            # Ins single nt
            ("artificial_chr3", 14, "alt_08", "G", "GT", 0),  # Without adjacent homopolymers
            ("artificial_chr3", 23, "alt_09", "A", "AT", 1),  # Adjacent homopolymers upstream
            ("artificial_chr3", 34, "alt_10", "T", "TA", 1),  # Adjacent homopolymers downstream
            ("artificial_chr3", 43, "alt_11", "G", "GT", 0),  # Adjacent too short homopolymers
            # Ins multi nt
            ("artificial_chr4", 14, "alt_12", "G", "GTA", 0),  # Without adjacent homopolymers
            ("artificial_chr4", 23, "alt_13", "A", "ATA", 1),  # Adjacent homopolymers upstream
            ("artificial_chr4", 34, "alt_14", "G", "GTA", 1),  # Adjacent homopolymers downstream
            ("artificial_chr4", 44, "alt_15", "G", "GTC", 0),  # Adjacent too short homopolymer
            ("artificial_chr4", 54, "alt_16", "CCT", "ATCCAGA", 0),  # Without adjacent homopolymers
            ("artificial_chr4", 64, "alt_17", "GGA", "CTCCAGT", 1),  # Adjacent homopolymers upstream
            ("artificial_chr4", 74, "alt_18", "GAC", "ATCCAGT", 1),  # Adjacent homopolymers downstream
            ("artificial_chr4", 84, "alt_19", "CAG", "ATCCAGT", 0),  # Adjacent too short homopolymer
            # Del single nt
            ("artificial_chr5", 14, "alt_20", "GT", "G", 0),  # Without adjacent homopolymers
            ("artificial_chr5", 23, "alt_21", "AG", "A", 1),  # Adjacent homopolymers upstream
            ("artificial_chr5", 34, "alt_22", "GA", "G", 1),  # Adjacent homopolymers downstream
            ("artificial_chr5", 43, "alt_23", "AG", "A", 0),  # Adjacent too short homopolymers
            # Del multi nt
            ("artificial_chr6", 14, "alt_24", "GCA", "G", 0),  # Without adjacent homopolymers
            ("artificial_chr6", 23, "alt_25", "AGT", "C", 1),  # Adjacent homopolymers upstream
            ("artificial_chr6", 32, "alt_26", "AGG", "A", 1),  # Adjacent homopolymers downstream
            ("artificial_chr6", 42, "alt_27", "TAG", "C", 0),  # Adjacent too short homopolymer
            ("artificial_chr6", 54, "alt_28", "CCTGTCT", "GAA", 0),  # Without adjacent homopolymers
            ("artificial_chr6", 70, "alt_29", "TCTCGA", "CCC", 1),  # Adjacent homopolymers upstream
            ("artificial_chr6", 80, "alt_30", "GCAGGT", "CCC", 1),  # Adjacent homopolymers downstream
            ("artificial_chr6", 92, "alt_31", "ATGCAGT", "CCC", 0),  # Adjacent too short homopolymer
        ]
        with VCFIO(self.tmp_variants, "w") as FH_var:
            FH_var.info = {
                "is_filtered":
                HeaderInfoAttr(
                    "is_filtered",
                    "1 if the variant is adjacent to an homopolymer.",
                    type="Integer",
                    number="1")
            }
            FH_var.writeHeader()
            # qual and filter are left to None; every record gets its own info dictionary
            self.variants = [
                VCFRecord(chrom, pos, var_id, ref, [alt], None, None,
                          {"is_filtered": is_filtered})
                for chrom, pos, var_id, ref, alt, is_filtered in variant_specs
            ]
            for curr_var in self.variants:
                FH_var.write(curr_var)
Ejemplo n.º 16
0
class MergeCoOccurVar(unittest.TestCase):
    def setUp(self):
        self.vcfio = FakeVCFIO(
            {
                "AF":
                HeaderInfoAttr("AF", "Alternative alleles frequencies",
                               "Float", "A")
            }, {
                "AD":
                HeaderFormatAttr("AD", "Alternative alleles depths", "Integer",
                                 "A"),
                "DP":
                HeaderFormatAttr("DP", "total depth", "Integer", "1")
            })
        self.ref_seq = "ACGCAAATCTCGGCATGCCGATT"
        #               | | | | | |  |  |  |  |
        #               1 3 5 7 9 11 14 17 20 23
        self.variant_1 = VCFRecord(
            "chr1",  # chrom
            None,  # pos
            "artificial_1",  # id
            None,  # ref
            None,  # alt
            10,  # qual
            ["lowQual", "lowDP"],  # filter
            {"AF": [0.05]},  # info
            ["DP", "AD"],  # format
            {
                "splA": {
                    "AD": [10],
                    "DP": 100
                },
                "splB": {
                    "AD": [40],
                    "DP": 4900
                },
            })
        self.variant_2 = VCFRecord(
            "chr1",  # chrom
            None,  # pos
            None,  # id
            None,  # ref
            None,  # alt
            30,  # qual
            ["PASS"],  # filter
            {"AF": [0.06]},  # info
            ["DP", "AD"],  # format
            {
                "splA": {
                    "AD": [5],
                    "DP": 50
                },
                "splB": {
                    "AD": [31],
                    "DP": 550
                },
            })
        self.expected_merge = VCFRecord(
            "chr1",  # chrom
            None,  # pos
            None,  # id
            None,  # ref
            None,  # alt
            20,  # qual
            ["lowQual", "lowDP"],  # filter
            {
                "AF": [0.06],
                "MCO_QUAL": [10, 30],
                "MCO_VAR": ["chr1:5=A/T", "chr1:20=G/C"]
            },  # info
            ["DP", "AD"],  # format
            {
                "splA": {
                    "AD": [5],
                    "DP": 50
                },
                "splB": {
                    "AD": [31],
                    "DP": 550
                },
            })

    def testMergedRecord_1_substit(self):
        # Variant 1
        self.variant_1.pos = 5
        self.variant_1.ref = "A"
        self.variant_1.alt = ["T"]
        # Variant 2
        self.variant_2.pos = 20
        self.variant_2.ref = "G"
        self.variant_2.alt = ["C"]
        # Expected merge
        self.expected_merge.pos = 5
        self.expected_merge.ref = "AAATCTCGGCATGCCG"
        self.expected_merge.alt = ["TAATCTCGGCATGCCC"]
        self.expected_merge.info = {
            "AF": [0.06],
            "MCO_QUAL": [10, 30],
            "MCO_VAR": ["chr1:5=A/T", "chr1:20=G/C"]
        }
        # Eval
        observed_merge = mergedRecord(self.vcfio, self.variant_1,
                                      self.variant_1.getName(), self.variant_2,
                                      self.variant_2.getName(), self.ref_seq)
        self.assertEqual(strVariant(observed_merge),
                         strVariant(self.expected_merge))

    def testMergedRecord_2_largeSubstit(self):
        # Variant 1
        self.variant_1.pos = 5
        self.variant_1.ref = "AAAT"
        self.variant_1.alt = ["TGCA"]
        # Variant 2
        self.variant_2.pos = 10
        self.variant_2.ref = "TC"
        self.variant_2.alt = ["GG"]
        # Expected merge
        self.expected_merge.pos = 5
        self.expected_merge.ref = "AAATCTC"
        self.expected_merge.alt = ["TGCACGG"]
        self.expected_merge.info = {
            "AF": [0.06],
            "MCO_QUAL": [10, 30],
            "MCO_VAR": ["chr1:5=AAAT/TGCA", "chr1:10=TC/GG"]
        }
        # Eval
        observed_merge = mergedRecord(self.vcfio, self.variant_1,
                                      self.variant_1.getName(), self.variant_2,
                                      self.variant_2.getName(), self.ref_seq)
        self.assertEqual(strVariant(observed_merge),
                         strVariant(self.expected_merge))

    def testMergedRecord_3_largeCloseSubstit(self):
        # Variant 1
        self.variant_1.pos = 5
        self.variant_1.ref = "AAAT"
        self.variant_1.alt = ["TGCA"]
        # Variant 2
        self.variant_2.pos = 9
        self.variant_2.ref = "CT"
        self.variant_2.alt = ["GG"]
        # Expected merge
        self.expected_merge.pos = 5
        self.expected_merge.ref = "AAATCT"
        self.expected_merge.alt = ["TGCAGG"]
        self.expected_merge.info = {
            "AF": [0.06],
            "MCO_QUAL": [10, 30],
            "MCO_VAR": ["chr1:5=AAAT/TGCA", "chr1:9=CT/GG"]
        }
        # Eval
        observed_merge = mergedRecord(self.vcfio, self.variant_1,
                                      self.variant_1.getName(), self.variant_2,
                                      self.variant_2.getName(), self.ref_seq)
        self.assertEqual(strVariant(observed_merge),
                         strVariant(self.expected_merge))

    def testMergedRecord_4_delIns(self):
        # Variant 1
        self.variant_1.pos = 5
        self.variant_1.ref = "AAAT"
        self.variant_1.alt = ["-"]
        # Variant 2
        self.variant_2.pos = 10
        self.variant_2.ref = "-"
        self.variant_2.alt = ["GGCATCT"]
        # Expected merge
        self.expected_merge.pos = 5
        self.expected_merge.ref = "AAATC"
        self.expected_merge.alt = ["CGGCATCT"]
        self.expected_merge.info = {
            "AF": [0.06],
            "MCO_QUAL": [10, 30],
            "MCO_VAR": ["chr1:5=AAAT/-", "chr1:10=-/GGCATCT"]
        }
        # Eval
        observed_merge = mergedRecord(self.vcfio, self.variant_1,
                                      self.variant_1.getName(), self.variant_2,
                                      self.variant_2.getName(), self.ref_seq)
        self.assertEqual(strVariant(observed_merge),
                         strVariant(self.expected_merge))

    def testMergedRecord_5_coDelIns(self):
        # Variant 1
        self.variant_1.pos = 5
        self.variant_1.ref = "AAAT"
        self.variant_1.alt = ["-"]
        # Variant 2
        self.variant_2.pos = 9
        self.variant_2.ref = "-"
        self.variant_2.alt = ["AGG"]
        # Expected merge
        self.expected_merge.pos = 5
        self.expected_merge.ref = "AAAT"
        self.expected_merge.alt = ["AGG"]
        self.expected_merge.info = {
            "AF": [0.06],
            "MCO_QUAL": [10, 30],
            "MCO_VAR": ["chr1:5=AAAT/-", "chr1:9=-/AGG"]
        }
        # Eval
        observed_merge = mergedRecord(self.vcfio, self.variant_1,
                                      self.variant_1.getName(), self.variant_2,
                                      self.variant_2.getName(), self.ref_seq)
        self.assertEqual(strVariant(observed_merge),
                         strVariant(self.expected_merge))

    def testMergedRecord_6_insDel(self):
        # Scenario: insertion chr1:5=-/GTGTG and deletion chr1:7=ATC/- are
        # expected to be merged into the single record chr1:5=AAATC/GTGTGAA.
        insertion, deletion = self.variant_1, self.variant_2
        merged_model = self.expected_merge
        # Input variants
        insertion.pos, insertion.ref, insertion.alt = 5, "-", ["GTGTG"]
        deletion.pos, deletion.ref, deletion.alt = 7, "ATC", ["-"]
        # Expected record after the merge
        merged_model.pos, merged_model.ref, merged_model.alt = 5, "AAATC", ["GTGTGAA"]
        merged_model.info = {
            "AF": [0.06],
            "MCO_QUAL": [10, 30],
            "MCO_VAR": ["chr1:5=-/GTGTG", "chr1:7=ATC/-"]
        }
        # Merge then compare the string representations of the records
        merged = mergedRecord(
            self.vcfio,
            insertion, insertion.getName(),
            deletion, deletion.getName(),
            self.ref_seq
        )
        self.assertEqual(strVariant(merged), strVariant(merged_model))

    def testMergedRecord_7_closeInsDel(self):
        # Scenario: insertion chr1:5=-/GTGTG and deletion chr1:6=AA/- are
        # expected to be merged into the single record chr1:5=AAA/GTGTGA.
        insertion, deletion = self.variant_1, self.variant_2
        merged_model = self.expected_merge
        # Input variants
        insertion.pos, insertion.ref, insertion.alt = 5, "-", ["GTGTG"]
        deletion.pos, deletion.ref, deletion.alt = 6, "AA", ["-"]
        # Expected record after the merge
        merged_model.pos, merged_model.ref, merged_model.alt = 5, "AAA", ["GTGTGA"]
        merged_model.info = {
            "AF": [0.06],
            "MCO_QUAL": [10, 30],
            "MCO_VAR": ["chr1:5=-/GTGTG", "chr1:6=AA/-"]
        }
        # Merge then compare the string representations of the records
        merged = mergedRecord(
            self.vcfio,
            insertion, insertion.getName(),
            deletion, deletion.getName(),
            self.ref_seq
        )
        self.assertEqual(strVariant(merged), strVariant(merged_model))

    def testMergedRecord_8_coInsDel(self):
        # Scenario: insertion chr1:5=-/GTGTG and deletion chr1:5=AA/- share the
        # same position and are expected to be merged into chr1:5=AA/GTGTG.
        insertion, deletion = self.variant_1, self.variant_2
        merged_model = self.expected_merge
        # Input variants
        insertion.pos, insertion.ref, insertion.alt = 5, "-", ["GTGTG"]
        deletion.pos, deletion.ref, deletion.alt = 5, "AA", ["-"]
        # Expected record after the merge
        merged_model.pos, merged_model.ref, merged_model.alt = 5, "AA", ["GTGTG"]
        merged_model.info = {
            "AF": [0.06],
            "MCO_QUAL": [10, 30],
            "MCO_VAR": ["chr1:5=-/GTGTG", "chr1:5=AA/-"]
        }
        # Merge then compare the string representations of the records
        merged = mergedRecord(
            self.vcfio,
            insertion, insertion.getName(),
            deletion, deletion.getName(),
            self.ref_seq
        )
        self.assertEqual(strVariant(merged), strVariant(merged_model))
Ejemplo n.º 17
0
 def testIsReadthrough(self):
     # Evaluate isReadthrough() on six pairs of mate breakends. Every pair is
     # annotated under the "TESTANN" INFO key and is checked with the same
     # remaining arguments (annotation getter, 1000 and comparators built
     # with False).
     annot_getter = AnnotGetter(self.tmp_annot)

     def annotations(*symbol_strand):
         # One {"SYMBOL", "STRAND"} dict per (symbol, strand) pair, in order.
         return [
             {"SYMBOL": symbol, "STRAND": strand}
             for symbol, strand in symbol_strand
         ]

     def evaluate(first, second):
         # The comparators are rebuilt for each call.
         return isReadthrough(
             first, second, "TESTANN", annot_getter, 1000,
             annCmpNameFct(False), regCmpNameFct(False)
         )

     # Pair id_01/id_02 (chr1:110 and chr1:200): expected readthrough
     first = VCFRecord(
         "chr1", 110, "id_01", "A", ["A[chr1:200["],
         info={
             "RNA_FIRST": True,
             "MATEID": "id_02",
             "TESTANN": annotations(("GENE_N01", "+"), ("GENE_N04", "+"))
         }
     )
     second = VCFRecord(
         "chr1", 200, "id_02", "A", ["]chr1:110]A"],
         info={
             "MATEID": "id_01",
             "TESTANN": annotations(("GENE_N02", "+"), ("GENE_N04", "+"))
         }
     )
     self.assertTrue(evaluate(first, second))

     # Pair id_03/id_04 (chr1:110 and chr1:300): expected NOT readthrough
     first = VCFRecord(
         "chr1", 110, "id_03", "A", ["A[chr1:300["],
         info={
             "RNA_FIRST": True,
             "MATEID": "id_04",
             "TESTANN": annotations(("GENE_N01", "+"), ("GENE_N04", "+"))
         }
     )
     second = VCFRecord(
         "chr1", 300, "id_04", "A", ["]chr1:110]A"],
         info={
             "MATEID": "id_03",
             "TESTANN": annotations(("GENE_N03", "+"), ("GENE_N06", "-"))
         }
     )
     self.assertTrue(not evaluate(first, second))

     # Pair id_05/id_06 (chr1:140 and chr1:199): expected readthrough
     first = VCFRecord(
         "chr1", 140, "id_05", "A", ["A[chr1:199["],
         info={
             "RNA_FIRST": True,
             "MATEID": "id_06",
             "TESTANN": annotations(("GENE_N01", "+"), ("GENE_N04", "+"))
         }
     )
     second = VCFRecord(
         "chr1", 199, "id_06", "A", ["]chr1:140]A"],
         info={
             "MATEID": "id_05",
             "TESTANN": annotations(("GENE_N02", "+"))
         }
     )
     self.assertTrue(evaluate(first, second))

     # Pair id_07/id_08 (chr1:289 and chr1:148): expected readthrough
     first = VCFRecord(
         "chr1", 289, "id_07", "A", ["]chr1:148]A"],
         info={
             "RNA_FIRST": True,
             "MATEID": "id_08",
             "TESTANN": annotations(("GENE_N06", "-"))
         }
     )
     second = VCFRecord(
         "chr1", 148, "id_08", "A", ["A[chr1:289["],
         info={
             "MATEID": "id_07",
             "TESTANN": annotations(("GENE_N04", "+"), ("GENE_N05", "-"))
         }
     )
     self.assertTrue(evaluate(first, second))

     # Pair id_09/id_10 (chr1:180 and chr1:299): expected NOT readthrough
     first = VCFRecord(
         "chr1", 180, "id_09", "A", ["]chr1:299]A"],
         info={
             "RNA_FIRST": True,
             "MATEID": "id_10",
             "TESTANN": annotations(("GENE_N01", "+"), ("GENE_N04", "+"))
         }
     )
     second = VCFRecord(
         "chr1", 299, "id_10", "A", ["A[chr1:180["],
         info={
             "MATEID": "id_09",
             "TESTANN": annotations(("GENE_N03", "+"), ("GENE_N06", "-"))
         }
     )
     self.assertTrue(not evaluate(first, second))

     # Pair id_11/id_12 (chr1:285 and chr1:148), the first record carrying
     # CIPOS [0, 4]: expected readthrough
     first = VCFRecord(
         "chr1", 285, "id_11", "A", ["]chr1:148]A"],
         info={
             "RNA_FIRST": True,
             "CIPOS": [0, 4],
             "MATEID": "id_12",
             "TESTANN": annotations(("GENE_N06", "-"))
         }
     )
     second = VCFRecord(
         "chr1", 148, "id_12", "A", ["A[chr1:285["],
         info={
             "MATEID": "id_11",
             "TESTANN": annotations(("GENE_N04", "+"), ("GENE_N05", "-"))
         }
     )
     self.assertTrue(evaluate(first, second))
Ejemplo n.º 18
0
    def setUp(self):
        """
        Prepare the temporary paths, the reference files and the reads/alignments fixtures of the tests.

        Only the reference FASTA and its index are written in this method: the SAM and BAM paths are reserved but no file is created for them here.
        """
        # Temporary files: all paths share one UUID prefix in the system temporary directory
        tmp_folder = tempfile.gettempdir()
        unique_id = str(uuid.uuid1())
        self.tmp_sam_path = os.path.join(tmp_folder, unique_id + ".sam")
        self.tmp_bam_path = os.path.join(tmp_folder, unique_id + ".bam")
        self.tmp_fasta_path = os.path.join(tmp_folder, unique_id + ".fa")
        self.tmp_faidx_path = os.path.join(tmp_folder, unique_id + ".fa.fai")
        # Reference sequence of chr1 (the ruler below gives 1-based positions on it)
        self.ref_seq = "ggaagccctgatcACGCAAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgcattggggtg"
        #               | | | | | |  |  |  |  |
        #               1 3 5 7 9 11 14 17 20 23
        # The reference is written on a single line without trailing newline
        with open(self.tmp_fasta_path, "w") as FH_seq:
            FH_seq.write(">chr1\n{}".format(self.ref_seq))
        # Hand-written faidx line (NAME, LENGTH, OFFSET, LINEBASES, LINEWIDTH): the offset 6 is the length of the ">chr1\n" header
        with open(self.tmp_faidx_path, "w") as FH_faidx:
            FH_faidx.write("chr1\t{}\t6\t200\t201".format(len(self.ref_seq)))
        # Reads in FASTA format, five by scenario: *_1_alt and *_2_alt contain the two variants, *_3_ref none of them,
        # *_4_mixUp only the first variant and *_5_mixDown only the second one.
        # Presumably these are the reads aligned to produce the SAM texts below (their @PG line is "bwa mem ref.fa reads.fa") - TODO confirm
        self.reads_content = """>subtit_AAA/CAC_1_alt
ggaagccctgatcACGCCACTCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>subtit_AAA/CAC_2_alt
aagccctgatcACGCCACTCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>subtit_AAA/CAC_3_ref
gaagccctgatcACGCAAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgc
>subtit_AAA/CAC_4_mixUp
ggaagccctgatcACGCCAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>subtit_AAA/CAC_5_mixDown
ggaagccctgatcACGCAACTCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>subtitClose_AA/CC_1_alt
ggaagccctgatcACGCCCATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>subtitClose_AA/CC_2_alt
aagccctgatcACGCCCATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>subtitClose_AA/CC_3_ref
gaagccctgatcACGCAAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgc
>subtitClose_AA/CC_4_mixUp
ggaagccctgatcACGCCAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>subtitClose_AA/CC_5_mixDown
ggaagccctgatcACGCACATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>subtit_AAATCTC/CCTTCGG_1_alt
ggaagccctgatcACGCCCTTCGGGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>subtit_AAATCTC/CCTTCGG_2_alt
aagccctgatcACGCCCTTCGGGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>subtit_AAATCTC/CCTTCGG_3_ref
gaagccctgatcACGCAAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgc
>subtit_AAATCTC/CCTTCGG_4_mixUp
ggaagccctgatcACGCCCTTCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>subtit_AAATCTC/CCTTCGG_5_mixDown
ggaagccctgatcACGCAAATCGGGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>insertion_A/TGGAGG_1_alt
ggaagccctgatcACGCTGGAGGAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>insertion_A/TGGAGG_2_alt
aagccctgatcACGCTGGAGGAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>insertion_A/TGGAGG_3_ref
gaagccctgatcACGCAAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgc
>insertion_A/TGGAGG_4_mixUp
ggaagccctgatcACGCTGGAAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>insertion_A/TGGAGG_5_mixDown
ggaagccctgatcACGCAGGAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>deletion_AAATCTC/T_1_alt
ggaagccctgatcACGCTGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>deletion_AAATCTC/T_2_alt
aagccctgatcACGCTGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>deletion_AAATCTC/T_3_ref
gaagccctgatcACGCAAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgc
>deletion_AAATCTC/T_4_mixUp
ggaagccctgatcACGCTCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>deletion_AAATCTC/T_5_mixDown
ggaagccctgatcACGCAAATGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>delIns_AAAT/TGA_1_alt
ggaagccctgatcACGCTGACTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>delIns_AAAT/TGA_2_alt
aagccctgatcACGCTGACTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>delIns_AAAT/TGA_3_ref
gaagccctgatcACGCAAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgc
>delIns_AAAT/TGA_4_mixUp
ggaagccctgatcACGCTCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>delIns_AAAT/TGA_5_mixDown
ggaagccctgatcACGCAAATGACTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>insDel_AAA/GGGA_1_alt
ggaagccctgatcACGCGGGATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>insDel_AAA/GGGA_2_alt
aagccctgatcACGCGGGATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>insDel_AAA/GGGA_3_ref
gaagccctgatcACGCAAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgc
>insDel_AAA/GGGA_4_mixUp
ggaagccctgatcACGCGGGAAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>insDel_AAA/GGGA_5_mixDown
ggaagccctgatcACGCATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>delInsNoStd_AAATCTC/CTGGG_1_alt
ggaagccctgatcACGCCTGGGCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>delInsNoStd_AAATCTC/CTGGG_2_alt
aagccctgatcACGCCTGGGCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>delInsNoStd_AAATCTC/CTGGG_3_ref
gaagccctgatcACGCAAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgc
>delInsNoStd_AAATCTC/CTGGG_4_mixUp
ggaagccctgatcACGCCTCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>delInsNoStd_AAATCTC/CTGGG_5_mixDown
ggaagccctgatcACGCAAATGGGCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>insDelNoStd_AAAT/GTGA_1_alt
ggaagccctgatcACGCGTGACTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>insDelNoStd_AAAT/GTGA_2_alt
aagccctgatcACGCGTGACTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>insDelNoStd_AAAT/GTGA_3_ref
gaagccctgatcACGCAAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgc
>insDelNoStd_AAAT/GTGA_4_mixUp
ggaagccctgatcACGCGTGAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>insDelNoStd_AAAT/GTGA_5_mixDown
ggaagccctgatcACGCAACTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>insDelNoStd_CAAA/CGTGA_1_alt
ggaagccctgatcACGCGTGATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>insDelNoStd_CAAA/CGTGA_2_alt
aagccctgatcACGCGTGATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>insDelNoStd_CAAA/CGTGA_3_ref
gaagccctgatcACGCAAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgc
>insDelNoStd_CAAA/CGTGA_4_mixUp
ggaagccctgatcACGCGTGAAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>insDelNoStd_CAAA/CGTGA_5_mixDown
ggaagccctgatcACGCATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca"""
        # Test cases: each item is [first variant, second variant, SAM text].
        # The SAM text contains the header and the alignments of the five reads of the scenario (1_alt, 4_mixUp, 5_mixDown, 3_ref, 2_alt).
        self.test_cases = [
            # Two substitutions separated by one reference base
            [
                VCFRecord("chr1", 18, "subtit_AAA/CAC", "A", ["C"]),
                VCFRecord("chr1", 20, "subtit_AAA/CAC", "A", ["C"]),
                """@SQ	SN:chr1	LN:131
@PG	ID:bwa	PN:bwa	VN:0.7.17-r1188	CL:bwa mem ref.fa reads.fa
subtit_AAA/CAC_1_alt	0	chr1	1	60	123M	*	0	0	GGAAGCCCTGATCACGCCACTCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:2	MD:Z:17A1A103	AS:i:113	XS:i:0
subtit_AAA/CAC_4_mixUp	0	chr1	1	60	123M	*	0	0	GGAAGCCCTGATCACGCCAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:1	MD:Z:17A105	AS:i:118	XS:i:0
subtit_AAA/CAC_5_mixDown	0	chr1	1	60	123M	*	0	0	GGAAGCCCTGATCACGCAACTCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:1	MD:Z:19A103	AS:i:118	XS:i:0
subtit_AAA/CAC_3_ref	0	chr1	2	60	118M	*	0	0	GAAGCCCTGATCACGCAAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGC	*	NM:i:0	MD:Z:118	AS:i:118	XS:i:0
subtit_AAA/CAC_2_alt	0	chr1	3	60	121M	*	0	0	AAGCCCTGATCACGCCACTCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:2	MD:Z:15A1A103	AS:i:111	XS:i:0"""
            ],
            # Two adjacent substitutions
            [
                VCFRecord("chr1", 18, "subtitClose_AA/CC", "A", ["C"]),
                VCFRecord("chr1", 19, "subtitClose_AA/CC", "A", ["C"]),
                """@SQ	SN:chr1	LN:131
@PG	ID:bwa	PN:bwa	VN:0.7.17-r1188	CL:bwa mem ref.fa reads.fa
subtitClose_AA/CC_1_alt	0	chr1	1	60	123M	*	0	0	GGAAGCCCTGATCACGCCCATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:2	MD:Z:17A0A104	AS:i:113	XS:i:0
subtitClose_AA/CC_4_mixUp	0	chr1	1	60	123M	*	0	0	GGAAGCCCTGATCACGCCAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:1	MD:Z:17A105	AS:i:118	XS:i:0
subtitClose_AA/CC_5_mixDown	0	chr1	1	60	123M	*	0	0	GGAAGCCCTGATCACGCACATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:1	MD:Z:18A104	AS:i:118	XS:i:0
subtitClose_AA/CC_3_ref	0	chr1	2	60	118M	*	0	0	GAAGCCCTGATCACGCAAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGC	*	NM:i:0	MD:Z:118	AS:i:118	XS:i:0
subtitClose_AA/CC_2_alt	0	chr1	3	60	121M	*	0	0	AAGCCCTGATCACGCCCATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:2	MD:Z:15A0A104	AS:i:111	XS:i:0"""
            ],
            # Two multi-nucleotide substitutions
            [
                VCFRecord("chr1", 18, "subtit_AAATCTC/CCTTCGG", "AAA",
                          ["CCT"]),
                VCFRecord("chr1", 23, "subtit_AAATCTC/CCTTCGG", "TC", ["GG"]),
                """@SQ	SN:chr1	LN:131
@PG	ID:bwa	PN:bwa	VN:0.7.17-r1188	CL:bwa mem ref.fa reads.fa
subtit_AAATCTC/CCTTCGG_1_alt	0	chr1	1	60	123M	*	0	0	GGAAGCCCTGATCACGCCCTTCGGGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:5	MD:Z:17A0A0A2T0C99	AS:i:99	XS:i:0
subtit_AAATCTC/CCTTCGG_4_mixUp	0	chr1	1	60	123M	*	0	0	GGAAGCCCTGATCACGCCCTTCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:3	MD:Z:17A0A0A103	AS:i:108	XS:i:0
subtit_AAATCTC/CCTTCGG_5_mixDown	0	chr1	1	60	123M	*	0	0	GGAAGCCCTGATCACGCAAATCGGGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:2	MD:Z:22T0C99	AS:i:113	XS:i:0
subtit_AAATCTC/CCTTCGG_3_ref	0	chr1	2	60	118M	*	0	0	GAAGCCCTGATCACGCAAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGC	*	NM:i:0	MD:Z:118	AS:i:118	XS:i:0
subtit_AAATCTC/CCTTCGG_2_alt	0	chr1	3	60	121M	*	0	0	AAGCCCTGATCACGCCCTTCGGGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:5	MD:Z:15A0A0A2T0C99	AS:i:99	XS:i:0"""
            ],
            # Two insertions
            [
                VCFRecord("chr1", 18, "insertion_A/TGGAGG", "-", ["TGG"]),
                VCFRecord("chr1", 19, "insertion_A/TGGAGG", "-", ["GG"]),
                """@SQ	SN:chr1	LN:131
@PG	ID:bwa	PN:bwa	VN:0.7.17-r1188	CL:bwa mem ref.fa reads.fa
insertion_A/TGGAGG_1_alt	0	chr1	1	60	17M3I1M2I105M	*	0	0	GGAAGCCCTGATCACGCTGGAGGAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:5	MD:Z:123	AS:i:107	XS:i:0
insertion_A/TGGAGG_4_mixUp	0	chr1	1	60	17M3I106M	*	0	0	GGAAGCCCTGATCACGCTGGAAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:3	MD:Z:123	AS:i:114	XS:i:0
insertion_A/TGGAGG_5_mixDown	0	chr1	1	60	18M2I105M	*	0	0	GGAAGCCCTGATCACGCAGGAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:2	MD:Z:123	AS:i:115	XS:i:0
insertion_A/TGGAGG_3_ref	0	chr1	2	60	118M	*	0	0	GAAGCCCTGATCACGCAAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGC	*	NM:i:0	MD:Z:118	AS:i:118	XS:i:0
insertion_A/TGGAGG_2_alt	0	chr1	3	60	15M3I1M2I105M	*	0	0	AAGCCCTGATCACGCTGGAGGAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:5	MD:Z:121	AS:i:105	XS:i:0"""
            ],
            # Two deletions
            # NOTE(review): in the SAM text below the MD tag of deletion_AAATCTC/T_2_alt is "15^AAA1CTC99" whereas
            # deletion_AAATCTC/T_1_alt uses "17^AAA1^CTC99": the "^" before CTC looks missing - confirm that MD is not read by the tested code
            [
                VCFRecord("chr1", 18, "deletion_AAATCTC/T", "AAA", ["-"]),
                VCFRecord("chr1", 22, "deletion_AAATCTC/T", "CTC", ["-"]),
                """@SQ	SN:chr1	LN:131
@PG	ID:bwa	PN:bwa	VN:0.7.17-r1188	CL:bwa mem ref.fa reads.fa
deletion_AAATCTC/T_1_alt	0	chr1	1	60	17M3D1M3D99M	*	0	0	GGAAGCCCTGATCACGCTGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:5	MD:Z:17^AAA1^CTC99	AS:i:100	XS:i:0
deletion_AAATCTC/T_4_mixUp	0	chr1	1	60	17M3D103M	*	0	0	GGAAGCCCTGATCACGCTCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:3	MD:Z:17^AAA103	AS:i:111	XS:i:0
deletion_AAATCTC/T_5_mixDown	0	chr1	1	60	21M3D99M	*	0	0	GGAAGCCCTGATCACGCAAATGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:3	MD:Z:21^CTC99	AS:i:111	XS:i:0
deletion_AAATCTC/T_3_ref	0	chr1	2	60	118M	*	0	0	GAAGCCCTGATCACGCAAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGC	*	NM:i:0	MD:Z:118	AS:i:118	XS:i:0
deletion_AAATCTC/T_2_alt	0	chr1	3	60	15M3D1M3D99M	*	0	0	AAGCCCTGATCACGCTGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:5	MD:Z:15^AAA1CTC99	AS:i:99	XS:i:0"""
            ],
            # Deletion followed by an insertion
            [
                VCFRecord("chr1", 18, "delIns_AAAT/TGA", "AAA", ["-"]),
                VCFRecord("chr1", 22, "delIns_AAAT/TGA", "-", ["GA"]),
                """@SQ	SN:chr1	LN:131
@PG	ID:bwa	PN:bwa	VN:0.7.17-r1188	CL:bwa mem ref.fa reads.fa
delIns_AAAT/TGA_1_alt	0	chr1	1	60	17M3D1M2I102M	*	0	0	GGAAGCCCTGATCACGCTGACTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:5	MD:Z:17^AAA103	AS:i:105	XS:i:0
delIns_AAAT/TGA_4_mixUp	0	chr1	1	60	17M3D103M	*	0	0	GGAAGCCCTGATCACGCTCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:3	MD:Z:17^AAA103	AS:i:111	XS:i:0
delIns_AAAT/TGA_5_mixDown	0	chr1	1	60	21M2I102M	*	0	0	GGAAGCCCTGATCACGCAAATGACTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:2	MD:Z:123	AS:i:115	XS:i:0
delIns_AAAT/TGA_3_ref	0	chr1	2	60	118M	*	0	0	GAAGCCCTGATCACGCAAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGC	*	NM:i:0	MD:Z:118	AS:i:118	XS:i:0
delIns_AAAT/TGA_2_alt	0	chr1	3	60	15M3D1M2I102M	*	0	0	AAGCCCTGATCACGCTGACTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:5	MD:Z:15^AAA103	AS:i:103	XS:i:0"""
            ],
            # Insertion followed by a deletion
            [
                VCFRecord("chr1", 18, "insDel_AAA/GGGA", "-", ["GGG"]),
                VCFRecord("chr1", 19, "insDel_AAA/GGGA", "AA", ["-"]),
                """@SQ	SN:chr1	LN:131
@PG	ID:bwa	PN:bwa	VN:0.7.17-r1188	CL:bwa mem ref.fa reads.fa
insDel_AAA/GGGA_1_alt	0	chr1	1	60	17M3I1M2D103M	*	0	0	GGAAGCCCTGATCACGCGGGATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:5	MD:Z:18^AA103	AS:i:106	XS:i:0
insDel_AAA/GGGA_4_mixUp	0	chr1	1	60	17M3I106M	*	0	0	GGAAGCCCTGATCACGCGGGAAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:3	MD:Z:123	AS:i:114	XS:i:0
insDel_AAA/GGGA_5_mixDown	0	chr1	1	60	18M2D103M	*	0	0	GGAAGCCCTGATCACGCATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:2	MD:Z:18^AA103	AS:i:113	XS:i:0
insDel_AAA/GGGA_3_ref	0	chr1	2	60	118M	*	0	0	GAAGCCCTGATCACGCAAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGC	*	NM:i:0	MD:Z:118	AS:i:118	XS:i:0
insDel_AAA/GGGA_2_alt	0	chr1	3	60	15M3I1M2D103M	*	0	0	AAGCCCTGATCACGCGGGATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:5	MD:Z:16^AA103	AS:i:104	XS:i:0"""
            ],
            # Delins written as AAA/C followed by an insertion
            [
                VCFRecord("chr1", 18, "delInsNoStd_AAATCTC/CTGGG", "AAA",
                          ["C"]),
                VCFRecord("chr1", 22, "delInsNoStd_AAATCTC/CTGGG", "-",
                          ["GGG"]), """@SQ	SN:chr1	LN:131
@PG	ID:bwa	PN:bwa	VN:0.7.17-r1188	CL:bwa mem ref.fa reads.fa
delInsNoStd_AAATCTC/CTGGG_1_alt	0	chr1	1	60	17M2D2M3I102M	*	0	0	GGAAGCCCTGATCACGCCTGGGCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:6	MD:Z:17^AA0C103	AS:i:102	XS:i:0
delInsNoStd_AAATCTC/CTGGG_4_mixUp	0	chr1	1	60	17M2D104M	*	0	0	GGAAGCCCTGATCACGCCTCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:3	MD:Z:17^AA0C103	AS:i:108	XS:i:0
delInsNoStd_AAATCTC/CTGGG_5_mixDown	0	chr1	1	60	21M3I102M	*	0	0	GGAAGCCCTGATCACGCAAATGGGCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:3	MD:Z:123	AS:i:114	XS:i:0
delInsNoStd_AAATCTC/CTGGG_3_ref	0	chr1	2	60	118M	*	0	0	GAAGCCCTGATCACGCAAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGC	*	NM:i:0	MD:Z:118	AS:i:118	XS:i:0
delInsNoStd_AAATCTC/CTGGG_2_alt	0	chr1	3	60	15M2D2M3I102M	*	0	0	AAGCCCTGATCACGCCTGGGCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:6	MD:Z:15^AA0C103	AS:i:102	XS:i:0"""
            ],
            # Delins written as A/GTG followed by a deletion
            # NOTE(review): in the SAM text below the MD tag of insDelNoStd_AAAT/GTGA_2_alt ("15A0A0A0T102") does not follow the same
            # pattern as insDelNoStd_AAAT/GTGA_1_alt ("17^A103") although both CIGAR have the same operations - confirm that MD is not read by the tested code
            [
                VCFRecord("chr1", 18, "insDelNoStd_AAAT/GTGA", "A", ["GTG"]),
                VCFRecord("chr1", 20, "insDelNoStd_AAAT/GTGA", "AT", ["-"]),
                """@SQ	SN:chr1	LN:131
@PG	ID:bwa	PN:bwa	VN:0.7.17-r1188	CL:bwa mem ref.fa reads.fa
insDelNoStd_AAAT/GTGA_1_alt	0	chr1	1	60	17M1D3I1M2D102M	*	0	0	GGAAGCCCTGATCACGCGTGACTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:6	MD:Z:17^A103	AS:i:103	XS:i:0
insDelNoStd_AAAT/GTGA_4_mixUp	0	chr1	1	60	17M1D3I105M	*	0	0	GGAAGCCCTGATCACGCGTGAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:3	MD:Z:17^A105	AS:i:110	XS:i:0
insDelNoStd_AAAT/GTGA_5_mixDown	0	chr1	1	60	19M2D102M	*	0	0	GGAAGCCCTGATCACGCAACTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:2	MD:Z:19^AT102	AS:i:113	XS:i:0
insDelNoStd_AAAT/GTGA_3_ref	0	chr1	2	60	118M	*	0	0	GAAGCCCTGATCACGCAAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGC	*	NM:i:0	MD:Z:118	AS:i:118	XS:i:0
insDelNoStd_AAAT/GTGA_2_alt	0	chr1	3	60	15M1D3I1M2D102M	*	0	0	AAGCCCTGATCACGCGTGACTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:6	MD:Z:15A0A0A0T102	AS:i:102	XS:i:0"""
            ],
            # Insertion and deletion both written with a shared leading/trailing base (C/CGTG then AAA/A)
            [
                VCFRecord("chr1", 17, "insDelNoStd_CAAA/CGTGA", "C", ["CGTG"]),
                VCFRecord("chr1", 18, "insDelNoStd_CAAA/CGTGA", "AAA", ["A"]),
                """@SQ	SN:chr1	LN:131
@PG	ID:bwa	PN:bwa	VN:0.7.17-r1188	CL:bwa mem ref.fa reads.fa
insDelNoStd_CAAA/CGTGA_1_alt	0	chr1	1	60	17M3I1M2D103M	*	0	0	GGAAGCCCTGATCACGCGTGATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:5	MD:Z:18^AA103	AS:i:106	XS:i:0
insDelNoStd_CAAA/CGTGA_4_mixUp	0	chr1	1	60	17M3I106M	*	0	0	GGAAGCCCTGATCACGCGTGAAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:3	MD:Z:123	AS:i:114	XS:i:0
insDelNoStd_CAAA/CGTGA_5_mixDown	0	chr1	1	60	18M2D103M	*	0	0	GGAAGCCCTGATCACGCATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:2	MD:Z:18^AA103	AS:i:113	XS:i:0
insDelNoStd_CAAA/CGTGA_3_ref	0	chr1	2	60	118M	*	0	0	GAAGCCCTGATCACGCAAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGC	*	NM:i:0	MD:Z:118	AS:i:118	XS:i:0
insDelNoStd_CAAA/CGTGA_2_alt	0	chr1	3	60	15M3I1M2D103M	*	0	0	AAGCCCTGATCACGCGTGATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA	*	NM:i:5	MD:Z:16^AA103	AS:i:104	XS:i:0"""
            ]
        ]
Ejemplo n.º 19
0
    def setUp(self):
        """
        Prepare the temporary files and the command line used by the tests.

        Three input files are written in the system temporary folder: a
        reference FASTA, its faidx and a VCF. Each VCF record stores in INFO
        the standardized variants ("expected") and the standardized
        annotations ("expectedANN") that the tests compare the output against.
        """
        tmp_folder = tempfile.gettempdir()
        unique_id = str(uuid.uuid1())

        # Temporary files
        self.tmp_sequences = os.path.join(tmp_folder, unique_id + ".fasta")
        self.tmp_faidx = os.path.join(tmp_folder, unique_id + ".fasta.fai")
        self.tmp_variants = os.path.join(tmp_folder, unique_id + ".vcf")
        self.tmp_output = os.path.join(tmp_folder, unique_id + "_out.vcf")

        # Exec command
        self.cmd = [
            "standardizeVCF.py",
            "--trace-unstandard",
            "--input-reference", self.tmp_sequences,
            "--input-variants", self.tmp_variants,
            "--output-variants", self.tmp_output
        ]

        # Create fasta
        with FastaIO(self.tmp_sequences, "w") as FH_seq:
            # Repeats:                                       ****....            ...***
            # Region:                                 |----|        |------------|         |------|
            FH_seq.write(Sequence("artificial_chr1", "CTCAGTCATGTATGTATGTGCTCACAAAGTAGTAGATCATGGCAC"))
            #                                         123456789| | | | | | | | | | | | | | | | | |
            #                                                  10| 14| 18| 22| 26| 30| 34| 38| 42|
            #                                                    12  16  20  24  28  32  36  40  44
            FH_seq.write(Sequence("artificial_chr2", "CGATNNNCGAT"))
            #                                         123456789|
            #                                                  10

        # Create faidx
        # Columns follow the faidx layout: name, length, offset, line bases, line width.
        # Tabs are written as explicit escapes so that an editor converting
        # whitespace cannot silently break the index.
        with open(self.tmp_faidx, "w") as FH_fai:
            FH_fai.write(
                "artificial_chr1\t45\t17\t45\t46\n"
                "artificial_chr2\t11\t80\t11\t12"
            )

        # Create VCF
        with VCFIO(self.tmp_variants, "w") as FH_var:
            FH_var.info = {
                "expected": HeaderInfoAttr("expected", "Standardized version of {chrom}:{pos}={ref}/{alt}.", type="String", number="."),
                "ANN": HeaderInfoAttr("ANN", "Annotation of variants Format: Allele|Annotation_id|Alt_allele_idx", type="String", number="."),
                "expectedANN": HeaderInfoAttr("expectedANN", "Standardized version of annotations Format: Allele|Annotation_id|Alt_allele_idx", type="String", number=".")
            }
            FH_var.writeHeader()
            self.variants = [
                # Substit single nt
                VCFRecord("artificial_chr1", 14, "sub_01", "G", ["T"], None, None, {
                    "expected": ["artificial_chr1:14=G/T"],
                    "ANN": ["T|ann_1|0", "T|ann_2|0", "A|ann_3|"],
                    "expectedANN": ["T|ann_1|0", "T|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 19, "sub_02", "T", ["A", "C"], None, None, {
                    "expected": ["artificial_chr1:19=T/A", "artificial_chr1:19=T/C"],
                    "ANN": ["A|ann_1|0", "A|ann_2|0", "T|ann_3|"],
                    "expectedANN": ["A|ann_1|0", "A|ann_2|0"]
                }),
                # Substit multi nt
                VCFRecord("artificial_chr1", 7, "sub_03", "CATGTATG", ["GTACCCGC"], None, None, {
                    "expected": ["artificial_chr1:7=CATGTATG/GTACCCGC"],
                    "ANN": ["GTACCCGC|ann_1|0", "GTACCCGC|ann_2|0", "GTGT|ann_3|"],
                    "expectedANN": ["GTACCCGC|ann_1|0", "GTACCCGC|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 11, "sub_04", "TATGTATG", ["GTACCCGC", "GTACCCAA"], None, None, {
                    "expected": ["artificial_chr1:11=TATGTATG/GTACCCGC", "artificial_chr1:11=TATGTATG/GTACCCAA"],
                    "ANN": ["GTACCCGC|ann_1|0", "GTACCCGC|ann_2|0", "GTACCCAA|ann_3|1"],
                    "expectedANN": ["GTACCCGC|ann_1|0", "GTACCCGC|ann_2|0", "GTACCCAA|ann_3|1"]
                }),
                # Insertion single nt
                VCFRecord("artificial_chr1", 14, "ins_01", "G", ["GA"], None, None, {
                    "expected": ["artificial_chr1:14=G/GA"],
                    "ANN": ["GA|ann_1|0", "GA|ann_2|0", "GT|ann_3|"],
                    "expectedANN": ["GA|ann_1|0", "GA|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 20, "ins_02", "-", ["A"], None, None, {
                    "expected": ["artificial_chr1:19=T/TA"],
                    "ANN": ["A|ann_1|0", "A|ann_2|0", "T|ann_3|"],
                    "expectedANN": ["TA|ann_1|0", "TA|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 14, "ins_03", "G", ["GA", "GC"], None, None, {
                    "expected": ["artificial_chr1:14=G/GA", "artificial_chr1:14=G/GC"],
                    "ANN": ["GA|ann_1|0", "GA|ann_2|0", "GC|ann_3|1", "GT|ann_4|"],
                    "expectedANN": ["GA|ann_1|0", "GA|ann_2|0", "GC|ann_3|1"]
                }),
                VCFRecord("artificial_chr1", 20, "ins_04", "-", ["A", "C"], None, None, {
                    "expected": ["artificial_chr1:19=T/TA", "artificial_chr1:19=T/TC"],
                    "ANN": ["A|ann_1|0", "A|ann_2|0", "C|ann_3|1", "T|ann_4|"],
                    "expectedANN": ["TA|ann_1|0", "TA|ann_2|0", "TC|ann_3|1"]
                }),
                # Insertion multi nt
                VCFRecord("artificial_chr1", 14, "ins_05", "G", ["GATGC"], None, None, {
                    "expected": ["artificial_chr1:14=G/GATGC"],
                    "ANN": ["GATGC|ann_1|0", "GATGC|ann_2|0", "GAAAC|ann_3|"],
                    "expectedANN": ["GATGC|ann_1|0", "GATGC|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 20, "ins_06", "-", ["AAATC"], None, None, {
                    "expected": ["artificial_chr1:19=T/TAAATC"],
                    "ANN": ["AAATC|ann_1|0", "AAATC|ann_2|0", "GAAAC|ann_3|"],
                    "expectedANN": ["TAAATC|ann_1|0", "TAAATC|ann_2|0"]
                }),
                # Movable insertion multi nt
                VCFRecord("artificial_chr1", 14, "ins_07", "G", ["GTG"], None, None, {
                    "expected": ["artificial_chr1:12=A/ATG"],
                    "ANN": ["GTG|ann_1|0", "GTG|ann_2|0", "GAAAC|ann_3|"],
                    "expectedANN": ["ATG|ann_1|0", "ATG|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 27, "ins_08", "A", ["AAAA"], None, None, {
                    "expected": ["artificial_chr1:25=C/CAAA"],
                    "ANN": ["AAAA|ann_1|0", "AAAA|ann_2|0", "CAAA|ann_3|"],
                    "expectedANN": ["CAAA|ann_1|0", "CAAA|ann_2|0"]
                }),
                # Deletion single nt
                VCFRecord("artificial_chr1", 14, "del_01", "G", [""], None, None, {
                    "expected": ["artificial_chr1:13=TG/T"],
                    "ANN": ["-|ann_1|0", "-|ann_2|0", "T|ann_3|"],
                    "expectedANN": ["T|ann_1|0", "T|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 14, "del_02", "G", ["-"], None, None, {
                    "expected": ["artificial_chr1:13=TG/T"],
                    "ANN": ["-|ann_1|0", "-|ann_2|0", "T|ann_3|"],
                    "expectedANN": ["T|ann_1|0", "T|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 13, "del_03", "TG", ["T"], None, None, {
                    "expected": ["artificial_chr1:13=TG/T"],
                    "ANN": ["T|ann_1|0", "T|ann_2|0", "A|ann_3|"],
                    "expectedANN": ["T|ann_1|0", "T|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 13, "del_04", "TG", ["T", "-"], None, None, {
                    "expected": ["artificial_chr1:13=TG/T", "artificial_chr1:12=ATG/A"],
                    "ANN": ["T|ann_1|0", "T|ann_2|0", "-|ann_3|1"],
                    "expectedANN": ["T|ann_1|0", "T|ann_2|0", "A|ann_3|1"]
                }),
                # Movable deletion multi nt
                VCFRecord("artificial_chr1", 11, "del_05", "TATG", ["T", "TA", "-"], None, None, {
                    "expected": ["artificial_chr1:11=TATG/T", "artificial_chr1:12=ATG/A", "artificial_chr1:7=CATGT/C"],
                    "ANN": ["T|ann_1|0", "T|ann_2|0", "TA|ann_3|1", "-|ann_4|2"],
                    "expectedANN": ["T|ann_1|0", "T|ann_2|0", "A|ann_3|1", "C|ann_4|2"]
                }),
            ]
            for curr_var in self.variants:
                FH_var.write(curr_var)
Ejemplo n.º 20
0
    def testTagMultipleValues(self):
        """
        Run the command on records whose strand counts are stored as lists.

        Each input record carries in INFO the filter tag it should receive;
        the first filter of every output record is compared with it.
        """
        # Descriptions of the per-strand support counts, in header order
        strand_tags = (
            ("SAR", "Number of reads supporting the alternative allele in reverse strand."),
            ("SAF", "Number of reads supporting the alternative allele in forward strand."),
            ("SRR", "Number of reads supporting the reference allele in reverse strand."),
            ("SRF", "Number of reads supporting the reference allele in forward strand."),
        )

        # Write test data
        with VCFIO(self.tmp_variants, "w") as FH_var:
            header_info = {
                "expected": HeaderInfoAttr("expected", "Expected filter tag.", type="String", number="1")
            }
            for tag, description in strand_tags:
                header_info[tag] = HeaderInfoAttr(tag, description, type="Integer", number="A")
            FH_var.info = header_info
            FH_var.writeHeader()
            self.variants = [
                # 0.5 alt, 0.5 ref, low DP, alt no bias, ref no bias
                VCFRecord("artificial_chr1", 10, "sub_01", "G", ["T"], None, None,
                          {"SAR": [5], "SAF": [5], "SRR": [5], "SRF": [5], "expected": "PASS"}),
                # 0.05 alt, 0.95 ref, good DP, alt strand bias, ref no bias
                VCFRecord("artificial_chr1", 40, "sub_04", "G", ["T"], None, None,
                          {"SAR": [9], "SAF": [1], "SRR": [95], "SRF": [95], "expected": "strandRatioBias"}),
            ]
            for variant in self.variants:
                FH_var.write(variant)

        # Execute command
        subprocess.check_call(self.cmd, stderr=subprocess.DEVNULL)

        # Validate results: one expected entry per alternative allele
        expected = [
            variant.id + ":" + variant.info["expected"]
            for variant in self.variants
            for _ in variant.alt
        ]
        with VCFIO(self.tmp_output) as FH_results:
            observed = [
                result.id + ":" + result.filter[0]
                for result in FH_results
            ]
        self.assertEqual(expected, observed)
Ejemplo n.º 21
0
def mergedRecord(vcf, first, first_std_name, second, second_std_name, FH_seq):
    """
    Return the VCFRecord corresponding to the merge of first and second.

    The merged record starts at the position of first. Its alleles are the
    concatenation of the alleles of first, the reference bases located between
    the two variants and the alleles of second. Supporting metrics (DP, AD,
    AF) take the minimum of the two variants and the names and qualities of
    the merged variants are traced in INFO (MCO_VAR and MCO_QUAL).

    :param vcf: The file handle to VCF.
    :type vcf: anacore.vcf.VCFIO
    :param first: The upstream variant to merge.
    :type first: anacore.vcf.VCFRecord
    :param first_std_name: The initial name of the upstream variant to merge (before normalisation).
    :type first_std_name: str
    :param second: The downstream variant to merge.
    :type second: anacore.vcf.VCFRecord
    :param second_std_name: The initial name of the downstream variant to merge (before normalisation).
    :type second_std_name: str
    :param FH_seq: File handle to the reference sequence file.
    :type FH_seq: IdxFastaIO
    :return: The variant corresponding to the merge of first and second.
    :rtype: anacore.vcf.VCFRecord
    :todo: Keep INFO and format on strand from FreeBayes, VarDict, ...
    """
    merged = VCFRecord(
        first.chrom,  # chrom
        first.pos,  # pos
        pFormat=first.format)
    # Ref and Alt
    empty_marker = VCFRecord.getEmptyAlleleMarker()
    # NOTE(review): the +/-0.49 shift presumably snaps half positions returned
    # for insertions to the neighbouring base - confirm against VCFRecord.
    first_end = int(round(first.refEnd() - 0.49, 0))
    second_start = int(round(second.refStart() + 0.49, 0))
    ref_add = ""
    if second_start - first_end > 1:  # At least one reference base lies between the two variants
        ref_add = FH_seq.getSub(first.chrom, first_end + 1, second_start - 1)
    merged.ref = (first.ref + ref_add + second.ref).replace(empty_marker, "")
    merged.alt = [(first.alt[0] + ref_add + second.alt[0]).replace(empty_marker, "")]
    # Filter
    first_filters = [] if first.filter is None else first.filter
    second_filters = [] if second.filter is None else second.filter
    # Sorted to keep the order of the tags reproducible between executions
    merged.filter = sorted(set(first_filters + second_filters))
    if len(merged.filter) > 1 and "PASS" in merged.filter:
        merged.filter.remove("PASS")
    # Samples
    for spl in first.samples:
        merged.samples[spl] = {}
        if "DP" in first.format:
            merged.samples[spl]["DP"] = min(first.getDP(spl),
                                            second.getDP(spl))
        for tag in ("AD", "AF"):
            if tag not in first.format:
                continue
            first_val = first.samples[spl][tag]
            second_val = second.samples[spl][tag]
            if vcf.format[tag].number == "1":  # Contains one alt allele
                merged.samples[spl][tag] = min(first_val, second_val)
            else:  # One value by allele: keep the minimum allele by allele
                merged.samples[spl][tag] = [
                    min(first_elt, second_elt)
                    for first_elt, second_elt in zip(first_val, second_val)
                ]
    # INFO metrics
    if "AD" in first.info:
        if vcf.info["AD"].number == "1":  # Contains one alt allele
            merged.info["AD"] = merged.getPopAltAD()[0]
        elif vcf.info["AD"].number == "R":  # Contains ref and alt alleles
            merged.info["AD"] = [merged.getPopRefAD()] + merged.getPopAltAD()
        else:  # Contains only alt alleles
            merged.info["AD"] = merged.getPopAltAD()
    if "DP" in first.info:
        merged.info["DP"] = merged.getPopDP()
    if "AF" in first.info:
        if vcf.info["AF"].number == "1":  # Contains one alt allele
            merged.info["AF"] = merged.getPopAltAF()[0]
        elif vcf.info["AF"].number == "R":  # Contains ref and alt alleles
            merged.info["AF"] = [merged.getPopRefAF()] + merged.getPopAltAF()
        else:  # Contains only alt alleles
            merged.info["AF"] = merged.getPopAltAF()
    # INFO Parents: a variant already produced by a merge transmits its own parents
    merged.info["MCO_VAR"] = []
    for source, std_name in ((first, first_std_name), (second, second_std_name)):
        if "MCO_VAR" in source.info:
            merged.info["MCO_VAR"].extend(source.info["MCO_VAR"])
        else:
            merged.info["MCO_VAR"].append(std_name)
    # Quality
    merged.info["MCO_QUAL"] = []
    for source in (first, second):
        if "MCO_QUAL" in source.info:
            merged.info["MCO_QUAL"].extend(source.info["MCO_QUAL"])
        else:
            merged.info["MCO_QUAL"].append(source.qual)
    if None not in merged.info["MCO_QUAL"]:  # The mean is set only if every parent has a quality
        merged.qual = mean(merged.info["MCO_QUAL"])
    # Return
    return merged
    def setUp(self):
        tmp_folder = tempfile.gettempdir()
        unique_id = str(uuid.uuid1())
        self.tmp_initial_pathes = os.path.join(tmp_folder, unique_id + "_{}_initial.vcf")
        self.tmp_haplotyped_pathes = os.path.join(tmp_folder, unique_id + "_{}_haplotyped.vcf")
        self.tmp_expected_pathes = os.path.join(tmp_folder, unique_id + "_{}_expected.vcf")
        self.tmp_out_pathes = os.path.join(tmp_folder, unique_id + "_{}_out.vcf")

        # test cases
        self.test_cases = [
            {  # *a-b, a-b, a b, /
                "initial": {
                    "caller1": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                    "caller2": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                    "caller3": [
                        VCFRecord("chr1", 14, None, "G", ["C"], info={"AD": 100}),
                        VCFRecord("chr1", 18, None, "A", ["G"], info={"AD": 104})
                    ]
                },
                "haplotyped": {
                    "caller1": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                    "caller2": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                    "caller3": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"], info={"MCO_VAR": ["chr1:14=G/C", "chr1:18=A/G"], "AD": 100})]
                },
                "expected": {
                    "caller1": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                    "caller2": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                    "caller3": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"], info={"AD": 104})]
                }
            },
            {  # *a b, a b, a-b, /
                "initial": {
                    "caller1": [
                        VCFRecord("chr2", 14, None, "G", ["C"]),
                        VCFRecord("chr2", 18, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr2", 14, None, "G", ["C"]),
                        VCFRecord("chr2", 18, None, "A", ["G"])
                    ],
                    "caller3": [VCFRecord("chr2", 14, None, "GCGTA", ["CCGTG"])]
                },
                "haplotyped": {
                    "caller1": [VCFRecord("chr2", 14, None, "GCGTA", ["CCGTG"], info={"MCO_VAR": ["chr2:14=G/C", "chr2:18=A/G"]})],
                    "caller2": [VCFRecord("chr2", 14, None, "GCGTA", ["CCGTG"], info={"MCO_VAR": ["chr2:14=G/C", "chr2:18=A/G"]})],
                    "caller3": [VCFRecord("chr2", 14, None, "GCGTA", ["CCGTG"])]
                },
                "expected": {
                    "caller1": [
                        VCFRecord("chr2", 14, None, "G", ["C"]),
                        VCFRecord("chr2", 18, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr2", 14, None, "G", ["C"]),
                        VCFRecord("chr2", 18, None, "A", ["G"])
                    ],
                    "caller3": [
                        VCFRecord("chr2", 14, None, "G", ["C"]),
                        VCFRecord("chr2", 18, None, "A", ["G"])
                    ]
                }
            },
            {  # *a-b c, a-b c, a b c, /
                "initial": {
                    "caller1": [
                        VCFRecord("chr3", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr3", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr3", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr3", 20, None, "A", ["G"])
                    ],
                    "caller3": [
                        VCFRecord("chr3", 14, None, "G", ["C"], info={"AD": 104}),
                        VCFRecord("chr3", 18, None, "A", ["G"], info={"AD": 100}),
                        VCFRecord("chr3", 20, None, "A", ["G"], info={"AD": 98})
                    ]
                },
                "haplotyped": {
                    "caller1": [VCFRecord("chr3", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr3:14=GCGTA/CCGTG", "chr3:20=A/G"]})],
                    "caller2": [VCFRecord("chr3", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr3:14=GCGTA/CCGTG", "chr3:20=A/G"]})],
                    "caller3": [VCFRecord("chr3", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr3:14=G/C", "chr3:18=A/G", "chr3:20=A/G"], "AD": 98})]
                },
                "expected": {
                    "caller1": [
                        VCFRecord("chr3", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr3", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr3", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr3", 20, None, "A", ["G"])
                    ],
                    "caller3": [
                        VCFRecord("chr3", 14, None, "GCGTA", ["CCGTG"], info={"AD": 104}),
                        VCFRecord("chr3", 20, None, "A", ["G"], info={"AD": 98})
                    ]
                }
            },
            {  # *a-b c, a-b c, a b c, a-b-c
                "initial": {
                    "caller1": [
                        VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr4", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr4", 20, None, "A", ["G"])
                    ],
                    "caller3": [
                        VCFRecord("chr4", 14, None, "G", ["C"], info={"AD": 98}),
                        VCFRecord("chr4", 18, None, "A", ["G"], info={"AD": 104}),
                        VCFRecord("chr4", 20, None, "A", ["G"], info={"AD": 100})
                    ],
                    "caller4": [VCFRecord("chr4", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "haplotyped": {
                    "caller1": [VCFRecord("chr4", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr4:14=GCGTA/CCGTG", "chr4:20=A/G"]})],
                    "caller2": [VCFRecord("chr4", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr4:14=GCGTA/CCGTG", "chr4:20=A/G"]})],
                    "caller3": [VCFRecord("chr4", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr4:14=G/C", "chr4:18=A/G", "chr4:20=A/G"], "AD": 98})],
                    "caller4": [VCFRecord("chr4", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "expected": {
                    "caller1": [
                        VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr4", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr4", 20, None, "A", ["G"])
                    ],
                    "caller3": [
                        VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"], info={"AD": 104}),
                        VCFRecord("chr4", 20, None, "A", ["G"], info={"AD": 100})
                    ],
                    "caller4": [
                        VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr4", 20, None, "A", ["G"])
                    ]
                }
            },
            {  # *a-b c, a' a-b c, a b c, a-b-c
                "initial": {
                    "caller1": [
                        VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr5", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr5", 14, None, "G", ["C"], info={"AD": 3}),
                        VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                        VCFRecord("chr5", 20, None, "A", ["G"], info={"AD": 104})
                    ],
                    "caller3": [
                        VCFRecord("chr5", 14, None, "G", ["C"], info={"AD": 110}),
                        VCFRecord("chr5", 18, None, "A", ["G"], info={"AD": 105}),
                        VCFRecord("chr5", 20, None, "A", ["G"], info={"AD": 100})
                    ],
                    "caller4": [VCFRecord("chr5", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "haplotyped": {
                    "caller1": [VCFRecord("chr5", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr5:14=GCGTA/CCGTG", "chr5:20=A/G"]})],
                    "caller2": [
                        VCFRecord("chr5", 14, None, "G", ["C"], info={"AD": 3}),
                        VCFRecord("chr5", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr5:14=GCGTA/CCGTG", "chr5:20=A/G"], "AD": 100})
                    ],
                    "caller3": [VCFRecord("chr5", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr5:14=G/C", "chr5:18=A/G", "chr5:20=A/G"], "AD": 100})],
                    "caller4": [VCFRecord("chr5", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "expected": {
                    "caller1": [
                        VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr5", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr5", 14, None, "G", ["C"], info={"AD": 3}),
                        VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                        VCFRecord("chr5", 20, None, "A", ["G"], info={"AD": 104})
                    ],
                    "caller3": [
                        VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"], info={"AD": 110}),
                        VCFRecord("chr5", 20, None, "A", ["G"], info={"AD": 100})
                    ],
                    "caller4": [
                        VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr5", 20, None, "A", ["G"])
                    ]
                }
            },
            {  # *a b c, a' a-b c, a-b c, a-b-c
                "initial": {
                    "caller1": [
                        VCFRecord("chr6", 14, None, "G", ["C"]),
                        VCFRecord("chr6", 18, None, "A", ["G"]),
                        VCFRecord("chr6", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr6", 14, None, "G", ["C"], info={"AD": 3}),
                        VCFRecord("chr6", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                        VCFRecord("chr6", 20, None, "A", ["G"], info={"AD": 104})
                    ],
                    "caller3": [
                        VCFRecord("chr6", 14, None, "GCGTA", ["CCGTG"], info={"AD": 105}),
                        VCFRecord("chr6", 20, None, "A", ["G"], info={"AD": 101})
                    ],
                    "caller4": [VCFRecord("chr6", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "haplotyped": {
                    "caller1": [VCFRecord("chr6", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr6:14=G/C", "chr6:18=A/G", "chr6:20=A/G"]})],
                    "caller2": [
                        VCFRecord("chr6", 14, None, "G", ["C"], info={"AD": 3}),
                        VCFRecord("chr6", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr6:14=GCGTA/CCGTG", "chr6:20=A/G"], "AD": 100})
                    ],
                    "caller3": [VCFRecord("chr6", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr6:14=GCGTA/CCGTG", "chr6:20=A/G"], "AD": 101})],
                    "caller4": [VCFRecord("chr6", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "expected": {
                    "caller1": [
                        VCFRecord("chr6", 14, None, "G", ["C"]),
                        VCFRecord("chr6", 18, None, "A", ["G"]),
                        VCFRecord("chr6", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr6", 14, None, "G", ["C"], info={"AD": 100}),
                        VCFRecord("chr6", 18, None, "A", ["G"], info={"AD": 100}),
                        VCFRecord("chr6", 20, None, "A", ["G"], info={"AD": 104})
                    ],
                    "caller3": [
                        VCFRecord("chr6", 14, None, "G", ["C"], info={"AD": 105}),
                        VCFRecord("chr6", 18, None, "A", ["G"], info={"AD": 105}),
                        VCFRecord("chr6", 20, None, "A", ["G"], info={"AD": 101})
                    ],
                    "caller4": [
                        VCFRecord("chr6", 14, None, "G", ["C"]),
                        VCFRecord("chr6", 18, None, "A", ["G"]),
                        VCFRecord("chr6", 20, None, "A", ["G"])
                    ]
                }
            },
            {  # *a b c, a-b b' c, a-b c, a-b-c
                "initial": {
                    "caller1": [
                        VCFRecord("chr7", 14, None, "G", ["C"]),
                        VCFRecord("chr7", 18, None, "A", ["G"]),
                        VCFRecord("chr7", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr7", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                        VCFRecord("chr7", 18, None, "A", ["G"], info={"AD": 3}),
                        VCFRecord("chr7", 20, None, "A", ["G"], info={"AD": 104})
                    ],
                    "caller3": [
                        VCFRecord("chr7", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr7", 20, None, "A", ["G"])
                    ],
                    "caller4": [VCFRecord("chr7", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "haplotyped": {
                    "caller1": [VCFRecord("chr7", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr7:14=G/C", "chr7:18=A/G", "chr7:20=A/G"]})],
                    "caller2": [
                        VCFRecord("chr7", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr7:14=GCGTA/CCGTG", "chr7:20=A/G"], "AD": 100}),
                        VCFRecord("chr7", 18, None, "G", ["C"], info={"AD": 3})
                    ],
                    "caller3": [VCFRecord("chr7", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr7:14=GCGTA/CCGTG", "chr7:20=A/G"]})],
                    "caller4": [VCFRecord("chr7", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "expected": {
                    "caller1": [
                        VCFRecord("chr7", 14, None, "G", ["C"]),
                        VCFRecord("chr7", 18, None, "A", ["G"]),
                        VCFRecord("chr7", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr7", 14, None, "G", ["C"], info={"AD": 100}),
                        VCFRecord("chr7", 18, None, "A", ["G"], info={"AD": 100}),
                        VCFRecord("chr7", 20, None, "A", ["G"], info={"AD": 104})
                    ],
                    "caller3": [
                        VCFRecord("chr7", 14, None, "G", ["C"]),
                        VCFRecord("chr7", 18, None, "A", ["G"]),
                        VCFRecord("chr7", 20, None, "A", ["G"])
                    ],
                    "caller4": [
                        VCFRecord("chr7", 14, None, "G", ["C"]),
                        VCFRecord("chr7", 18, None, "A", ["G"]),
                        VCFRecord("chr7", 20, None, "A", ["G"])
                    ]
                }
            },
            {  # *a-b c, a-b b' c, a b c, a-b-c
                "initial": {
                    "caller1": [
                        VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr8", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                        VCFRecord("chr8", 18, None, "A", ["G"], info={"AD": 3}),
                        VCFRecord("chr8", 20, None, "A", ["G"], info={"AD": 104})
                    ],
                    "caller3": [
                        VCFRecord("chr8", 14, None, "G", ["C"], info={"AD": 110}),
                        VCFRecord("chr8", 18, None, "A", ["G"], info={"AD": 105}),
                        VCFRecord("chr8", 20, None, "A", ["G"], info={"AD": 100})
                    ],
                    "caller4": [VCFRecord("chr8", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "haplotyped": {
                    "caller1": [VCFRecord("chr8", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr8:14=GCGTA/CCGTG", "chr8:20=A/G"]})],
                    "caller2": [
                        VCFRecord("chr8", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr8:14=GCGTA/CCGTG", "chr8:20=A/G"], "AD": 100}),
                        VCFRecord("chr8", 18, None, "G", ["C"], info={"AD": 3})
                    ],
                    "caller3": [VCFRecord("chr8", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr8:14=G/C", "chr8:18=A/G", "chr8:20=A/G"], "AD": 100})],
                    "caller4": [VCFRecord("chr8", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "expected": {
                    "caller1": [
                        VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr8", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                        VCFRecord("chr8", 18, None, "A", ["G"], info={"AD": 3}),
                        VCFRecord("chr8", 20, None, "A", ["G"], info={"AD": 104})
                    ],
                    "caller3": [
                        VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"], info={"AD": 110}),
                        VCFRecord("chr8", 20, None, "A", ["G"], info={"AD": 100})
                    ],
                    "caller4": [
                        VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr8", 20, None, "A", ["G"])
                    ]
                }
            },
            {  # *a' a-b c, a-b b' c, a b c, a-b-c
                "initial": {
                    "caller1": [
                        VCFRecord("chr9", 14, None, "G", ["C"]),
                        VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr9", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                        VCFRecord("chr9", 18, None, "A", ["G"], info={"AD": 3}),
                        VCFRecord("chr9", 20, None, "A", ["G"], info={"AD": 104})
                    ],
                    "caller3": [
                        VCFRecord("chr9", 14, None, "G", ["C"], info={"AD": 110}),
                        VCFRecord("chr9", 18, None, "A", ["G"], info={"AD": 105}),
                        VCFRecord("chr9", 20, None, "A", ["G"], info={"AD": 100})
                    ],
                    "caller4": [VCFRecord("chr9", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "haplotyped": {
                    "caller1": [
                        VCFRecord("chr9", 14, None, "G", ["C"]),
                        VCFRecord("chr9", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr9:14=GCGTA/CCGTG", "chr9:20=A/G"]})
                    ],
                    "caller2": [
                        VCFRecord("chr9", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr9:14=GCGTA/CCGTG", "chr9:20=A/G"], "AD": 100}),
                        VCFRecord("chr9", 18, None, "G", ["C"], info={"AD": 3})
                    ],
                    "caller3": [VCFRecord("chr9", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr9:14=G/C", "chr9:18=A/G", "chr9:20=A/G"], "AD": 100})],
                    "caller4": [VCFRecord("chr9", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "expected": {
                    "caller1": [
                        VCFRecord("chr9", 14, None, "G", ["C"]),
                        VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr9", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                        VCFRecord("chr9", 18, None, "A", ["G"], info={"AD": 3}),
                        VCFRecord("chr9", 20, None, "A", ["G"], info={"AD": 104})
                    ],
                    "caller3": [
                        VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"], info={"AD": 110}),
                        VCFRecord("chr9", 20, None, "A", ["G"], info={"AD": 100})
                    ],
                    "caller4": [
                        VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr9", 20, None, "A", ["G"])
                    ]
                }
            }
        ]

        # Get callers
        callers = set()
        for curr_test in self.test_cases:
            for curr_caller in curr_test["initial"]:
                callers.add(curr_caller)
        self.callers = sorted(list(callers))

        # Write files
        for curr_caller in self.callers:
            # Initial
            with VCFIO(self.tmp_initial_pathes.format(curr_caller), "w") as handle_out:
                handle_out.info = {
                    "AD": HeaderInfoAttr("AD", "Alternative allele depth.", type="Integer", number="1")
                }
                handle_out.extra_header = ["##source={}".format(curr_caller)]
                handle_out.writeHeader()
                for curr_test in self.test_cases:
                    if curr_caller in curr_test["initial"]:
                        for curr_var in curr_test["initial"][curr_caller]:
                            handle_out.write(curr_var)
            # Haplotyped
            with VCFIO(self.tmp_haplotyped_pathes.format(curr_caller), "w") as handle_out:
                handle_out.info = {
                    "AD": HeaderInfoAttr("AD", "Alternative allele depth.", type="Integer", number="1"),
                    "MCO_VAR": HeaderInfoAttr("MCO_VAR", "Name of the variants merged because their occur on same reads.", type="String", number=".")
                }
                handle_out.extra_header = ["##source={}".format(curr_caller)]
                handle_out.writeHeader()
                for curr_test in self.test_cases:
                    if curr_caller in curr_test["haplotyped"]:
                        for curr_var in curr_test["haplotyped"][curr_caller]:
                            handle_out.write(curr_var)
            # Expected
            with VCFIO(self.tmp_expected_pathes.format(curr_caller), "w") as handle_out:
                handle_out.info = {
                    "AD": HeaderInfoAttr("AD", "Alternative allele depth.", type="Integer", number="1")
                }
                handle_out.extra_header = ["##source={}".format(curr_caller)]
                handle_out.writeHeader()
                for curr_test in self.test_cases:
                    if curr_caller in curr_test["expected"]:
                        for curr_var in curr_test["expected"][curr_caller]:
                            handle_out.write(curr_var)
Ejemplo n.º 23
0
    def setUp(self):
        """
        Prepare the temporary inputs and the command line used by the filterVCFPrimers.py tests.

        Reserve unique temporary paths (sequences, regions, variants and output), build the command line, then write the reference FASTA and the VCF containing the test variants. Every variant is kept in self.variants and carries a ZOI info field ("yes"/"no"); presumably this is the expected status of the variant relative to the zone of interest (to confirm against the test methods).
        """
        tmp_folder = tempfile.gettempdir()
        unique_id = str(uuid.uuid1())  # Shared prefix so that the files of concurrent runs never collide

        # Temporary files
        self.tmp_sequences = os.path.join(tmp_folder, unique_id + ".fasta")
        self.tmp_regions = os.path.join(tmp_folder, unique_id + ".bed")
        self.tmp_variants = os.path.join(tmp_folder, unique_id + ".vcf")
        self.tmp_output = os.path.join(tmp_folder, unique_id + "_out.vcf")

        # Exec command
        self.cmd = [
            "filterVCFPrimers.py",
            "--input-variants", self.tmp_variants,
            "--input-regions", self.tmp_regions,
            "--input-sequences", self.tmp_sequences,
            "--output-variants", self.tmp_output
        ]

        # Create fasta (the rulers under each sequence give the 1-based positions)
        with FastaIO(self.tmp_sequences, "w") as FH_seq:
            FH_seq.write(Sequence("artificial_chr1", "NNNAAAATTTGGGGGGGGGGTTTAAANNN"))
            #                                         123456789| | | | | | | | | |
            #                                                  10| 14| 18| 22| 26|
            #                                                    12  16  20  24  28
            FH_seq.write(Sequence("artificial_chr2", "CGATNNNCGAT"))
            #                                         123456789|
            #                                                  10

        # Create VCF
        with VCFIO(self.tmp_variants, "w") as FH_var:
            FH_var.info = {"ZOI": HeaderInfoAttr("ZOI", "If the variant can be in interest area.", type="String", number="1")}
            FH_var.writeHeader()
            self.variants = [
                # Indels on artificial_chr1
                VCFRecord("artificial_chr1", 6, "alt_0", "A", ["AA"], None, None, {"ZOI": "no"}),
                VCFRecord("artificial_chr1", 8, "alt_1", "TT", ["T"], None, None, {"ZOI": "no"}),
                VCFRecord("artificial_chr1", 8, "alt_2", "T", ["TT"], None, None, {"ZOI": "yes"}),
                VCFRecord("artificial_chr1", 9, "alt_3", "TTGG", ["TT"], None, None, {"ZOI": "yes"}),
                VCFRecord("artificial_chr1", 14, "alt_4", "G", ["GG"], None, None, {"ZOI": "yes"}),
                VCFRecord("artificial_chr1", 18, "alt_5", "GGG", ["G"], None, None, {"ZOI": "yes"}),  # ZOI downstream limit deletion
                VCFRecord("artificial_chr1", 22, "alt_6", "T", ["TT"], None, None, {"ZOI": "yes"}),

                # Substitutions on artificial_chr1
                VCFRecord("artificial_chr1", 9, "alt_7", "TT", ["TC"], None, None, {"ZOI": "no"}),  # Substitution before end of upstream primer
                VCFRecord("artificial_chr1", 10, "alt_8", "TG", ["TC"], None, None, {"ZOI": "yes"}),  # Substitution in upstream limit of ZOI
                VCFRecord("artificial_chr1", 15, "alt_9", "GG", ["GC"], None, None, {"ZOI": "yes"}),  # Substitution in downstream limit of ZOI
                VCFRecord("artificial_chr1", 20, "alt_10", "GT", ["GC"], None, None, {"ZOI": "no"}),  # Substitution after start of downstream primer
                VCFRecord("artificial_chr1", 21, "alt_11", "TT", ["TC"], None, None, {"ZOI": "no"}),  # Substitution in downstream primer

                # Insertions on artificial_chr2
                VCFRecord("artificial_chr2", 1, "alt_12", "C", ["CTT"], None, None, {"ZOI": "no"}),  # Insertion before end of upstream primer
                VCFRecord("artificial_chr2", 2, "alt_13", "G", ["GCC"], None, None, {"ZOI": "yes"}),  # Insertion in upstream limit of ZOI
                VCFRecord("artificial_chr2", 3, "alt_14", "AT", ["CCGC"], None, None, {"ZOI": "yes"}),  # Insertion in upstream limit of ZOI and without standardization
                VCFRecord("artificial_chr2", 9, "alt_15", "G", ["GCC"], None, None, {"ZOI": "yes"}),  # Insertion in downstream limit of ZOI
                VCFRecord("artificial_chr2", 9, "alt_16", "G", ["NNN"], None, None, {"ZOI": "yes"}),  # Insertion in downstream limit of ZOI and without standardization
                VCFRecord("artificial_chr2", 10, "alt_17", "-", ["CC"], None, None, {"ZOI": "yes"}),  # Insertion in downstream limit of ZOI
                VCFRecord("artificial_chr2", 10, "alt_18", "A", ["ATT"], None, None, {"ZOI": "no"}),  # Insertion after start of downstream primer

                # Deletions on artificial_chr2
                VCFRecord("artificial_chr2", 1, "alt_19", "CG", ["C"], None, None, {"ZOI": "no"}),  # Deletion before end of upstream primer
                VCFRecord("artificial_chr2", 2, "alt_20", "GA", ["G"], None, None, {"ZOI": "yes"}),  # Deletion in upstream limit of ZOI
                VCFRecord("artificial_chr2", 3, "alt_21", "AT", ["C"], None, None, {"ZOI": "yes"}),  # Deletion in upstream limit of ZOI and without standardization
                VCFRecord("artificial_chr2", 6, "alt_22", "NNCG", ["N"], None, None, {"ZOI": "yes"}),  # Deletion in downstream limit of ZOI
                VCFRecord("artificial_chr2", 8, "alt_23", "CG", ["C"], None, None, {"ZOI": "yes"}),  # Deletion in downstream limit of ZOI
                VCFRecord("artificial_chr2", 8, "alt_24", "CG", ["T"], None, None, {"ZOI": "yes"}),  # Deletion in downstream limit of ZOI and without standardization
                VCFRecord("artificial_chr2", 9, "alt_25", "GA", ["G"], None, None, {"ZOI": "no"}),  # Deletion after start of downstream primer
                VCFRecord("artificial_chr2", 10, "alt_26", "A", ["-"], None, None, {"ZOI": "no"}),  # Deletion after start of downstream primer
                VCFRecord("artificial_chr2", 10, "alt_27", "AT", ["A"], None, None, {"ZOI": "no"}),  # Deletion after start of downstream primer
            ]
            # Records are written in declaration order; no index is needed
            for curr_var in self.variants:
                FH_var.write(curr_var)