def changeAndWrite(in_vcf, out_vcf, change_by_rec):
    """
    Write uniformised variants.

    Records whose name is not a key of change_by_rec are kept as they are. Each record whose name is a key is copied once per uniformised name listed for it, and the copy takes the position, the reference allele and the first alternative allele encoded in this name (format: chrom:pos=ref/alt). When several records compete for the same name, the one with the highest first value of getPopAltAD() is retained (the first seen is kept on ties). The retained records are written sorted by chromosome, position, reference end and first alternative allele.

    :param in_vcf: Path to the variants file before haplotyping (format: VCF).
    :type in_vcf: str
    :param out_vcf: Path to the output variants file.
    :type out_vcf: str
    :param change_by_rec: Uniformised variants(s) by previous variant (example: {"chr1:1235448=A/G": {"chr1:1235448=ATAG/GTAC"}, "chr1:1235451=G/C": {"chr1:1235448=ATAG/GTAC"}}).
    :type change_by_rec: dict
    """
    haplotype_tags = ["PGT", "PID", "PS"]
    retained_by_name = {}

    def isBetter(name, candidate):
        """Return True if nothing is retained under name or if candidate has a higher alternative AD than the retained record."""
        if name not in retained_by_name:
            return True
        return candidate.getPopAltAD()[0] > retained_by_name[name].getPopAltAD()[0]

    def sortKey(rec):
        """Return the tuple used to order the records in the output."""
        return (rec.chrom, rec.pos, rec.refEnd(), rec.alt[0])

    with VCFIO(out_vcf, "w") as handle_out, VCFIO(in_vcf) as handle_in:
        # Manage header
        handle_out.copyHeader(handle_in)
        handle_out.writeHeader()
        # Split/Merge variants
        for record in handle_in:
            name = record.getName()
            if name not in change_by_rec:  # Kept without modification
                if isBetter(name, record):
                    retained_by_name[name] = record
                continue
            # Must be uniformised
            if all(tag in record.format for tag in haplotype_tags):
                # Remove mutect haplotype information: it describes the
                # variant before its replacement
                for spl_info in record.samples.values():
                    for tag in haplotype_tags:
                        del spl_info[tag]
                record.format = [
                    tag for tag in record.format if tag not in haplotype_tags
                ]
            # Change variant
            for new_name in change_by_rec[name]:
                if not isBetter(new_name, record):
                    continue
                new_record = deepcopy(record)
                retained_by_name[new_name] = new_record
                # new_name is formatted as chrom:pos=ref/alt
                name_fields = new_name.split("=")
                new_record.ref, new_record.alt[0] = name_fields[1].split("/")
                new_record.pos = int(name_fields[0].split(":")[1])
        # Write records
        for record in sorted(retained_by_name.values(), key=sortKey):
            handle_out.write(record)
def getVariantsByName(callers, haplotyped_variants_files):
    """
    Return variants detection and haplotype by variant name.

    Each file is read with the caller at the same index in callers. For every record, the entry of the caller contains "merged" (True if the record has the INFO tag MCO_VAR) and "sub" (the set of values stored in MCO_VAR, or an empty set when the tag is missing).

    :param callers: List of variants callers corresponding to haplotyped_variants_files.
    :type callers: list
    :param haplotyped_variants_files: Paths to the variants files after merging by haplotype (format: VCF).
    :type haplotyped_variants_files: list
    :return: Variants detection and haplotype by variant name (example: {"chr1:1235448=ATAG/GTAC": {"mutect2": {"merged": True, "sub": {"chr1:1235448=A/G", "chr1:1235451=G/C"}}, "freebayes": {"merged": False, "sub": {"chr1:1235448=ATAG/GTAC"}}}}).
    :rtype: dict
    """
    detection_by_name = {}
    for caller, vcf_path in zip(callers, haplotyped_variants_files):
        with VCFIO(vcf_path) as reader:
            for record in reader:
                is_merged = "MCO_VAR" in record.info
                sub_variants = set(record.info["MCO_VAR"]) if is_merged else set()
                by_caller = detection_by_name.setdefault(record.getName(), {})
                by_caller[caller] = {"merged": is_merged, "sub": sub_variants}
    return detection_by_name
Ejemplo n.º 3
0
def normAndMove(genome_path, in_variant_file, out_variant_file,
                trace_unstandard):
    """
    Write in a new file the normalized version of each variant. The normalization consists in three steps:
      1- The variants with multiple alternative alleles are split in one record by alternative allele.
      2- In each allele the empty allele marker is replaced by a dot and alternative and reference allele are reduced to the minimal string (example: ATG/A becomes TG/. ; AAGC/ATAC becomes AG/TA.).
      3- The allele is replaced by the most upstream allele that can have the same alternative sequence (example: a deletion in homopolymer is moved to first nucleotide of this homopolymer).

    :param genome_path: Path to the genome file (format: fasta).
    :type genome_path: str
    :param in_variant_file: Path to the variants file (format: VCF).
    :type in_variant_file: str
    :param out_variant_file: Path to the normalized variants file (format: VCF).
    :type out_variant_file: str
    :param trace_unstandard: True if you want to keep the trace of the variant before standardization in INFO.
    :type trace_unstandard: bool
    """
    seq_by_chr = getSeqByChr(genome_path)
    with VCFIO(out_variant_file, "w") as writer, VCFIO(in_variant_file) as reader:
        # Header: same as input plus the optional trace tag
        writer.copyHeader(reader)
        if trace_unstandard:
            unstd_attr = HeaderInfoAttr(
                "UNSTD",
                type="String",
                number="1",
                description="The variant id (chromosome:position=reference/alternative) before standardization."
            )
            writer.info["UNSTD"] = unstd_attr
        writer.writeHeader()
        # Records: one output record by alternative allele
        for record in reader:
            chrom_seq = seq_by_chr[record.chrom]
            for alt_idx, _ in enumerate(record.alt):
                allele = getAlleleRecord(reader, record, alt_idx)
                if trace_unstandard:
                    # The trace is stored before the allele is moved
                    allele.info["UNSTD"] = "{}:{}={}/{}".format(
                        allele.chrom,
                        allele.pos,
                        allele.ref,
                        "/".join(allele.alt)
                    )
                writer.write(allele.getMostUpstream(chrom_seq))
Ejemplo n.º 4
0
    def testTag(self):
        """Run the command and check that each output record carries, as first filter, the tag expected for its variant (one entry by alternative allele, in input order)."""
        # Execute command
        subprocess.check_call(self.cmd, stderr=subprocess.DEVNULL)

        # Validate results
        expected = [
            variant.id + ":" + variant.info["expected"]
            for variant in self.variants
            for _ in variant.alt
        ]
        with VCFIO(self.tmp_output) as reader:
            observed = [res.id + ":" + res.filter[0] for res in reader]
        self.assertEqual(expected, observed)
    def testResults(self):
        """Run the command and check that the output contains exactly the variants whose INFO target is "target_2" (order is ignored)."""
        # Execute command
        subprocess.check_call(self.cmd, stderr=subprocess.DEVNULL)

        # Validate results
        expected = sorted(
            variant.id for variant in self.variants
            if variant.info["target"] == "target_2"
        )
        with VCFIO(self.tmp_output) as reader:
            observed = sorted(res.id for res in reader)
        self.assertEqual(expected, observed)
Ejemplo n.º 6
0
def normOnly(in_variant_file, out_variant_file, trace_unstandard):
    """
    Write in a new file the normalized version of each variant. The normalization consists in two steps:
      1- The variants with multiple alternative alleles are split in one record by alternative allele.
      2- In each allele the empty allele marker is replaced by a dot and alternative and reference allele are reduced to the minimal string (example: ATG/A becomes TG/. ; AAGC/ATAC becomes AG/TA.).

    :param in_variant_file: Path to the variants file (format: VCF).
    :type in_variant_file: str
    :param out_variant_file: Path to the normalized variants file (format: VCF).
    :type out_variant_file: str
    :param trace_unstandard: True if you want to keep the trace of the variant before standardization in INFO.
    :type trace_unstandard: bool
    """
    with VCFIO(out_variant_file, "w") as writer, VCFIO(in_variant_file) as reader:
        # Header: same as input plus the optional trace tag
        writer.copyHeader(reader)
        if trace_unstandard:
            unstd_attr = HeaderInfoAttr(
                "UNSTD",
                type="String",
                number="1",
                description="The variant id (chromosome:position=reference/alternative) before standardization."
            )
            writer.info["UNSTD"] = unstd_attr
        writer.writeHeader()
        # Records: one output record by alternative allele
        for record in reader:
            for alt_idx, _ in enumerate(record.alt):
                allele = getAlleleRecord(reader, record, alt_idx)
                if trace_unstandard:
                    # The trace is stored before the allele is normalized
                    allele.info["UNSTD"] = "{}:{}={}/{}".format(
                        allele.chrom,
                        allele.pos,
                        allele.ref,
                        "/".join(allele.alt)
                    )
                allele.normalizeSingleAllele()
                writer.write(allele)
Ejemplo n.º 7
0
    def testAnnotVCFIO(self):
        """Run the command with --annotations-field ANN, then check the standardized name and the ANN values of each output record against the expectations stored in the input variants."""
        def alleleKey(variant, alt):
            """Return the key "<id> <chrom>:<pos>=<ref>/<alt>" identifying one input allele."""
            return "{} {}:{}={}/{}".format(variant.id, variant.chrom, variant.pos, variant.ref, alt)

        # Execute command
        subprocess.check_call(self.cmd + ["--annotations-field", "ANN"], stderr=subprocess.DEVNULL)

        # Validate results
        expected_names = {}
        for variant in self.variants:
            for alt_idx, alt in enumerate(variant.alt):
                expected_names[alleleKey(variant, alt)] = variant.info["expected"][alt_idx]
        observed_names = {}
        with VCFIO(self.tmp_output) as reader:
            for res in reader:
                # UNSTD holds the variant id before standardization
                observed_names[res.id + " " + res.info["UNSTD"]] = res.getName()
        self.assertEqual(expected_names, observed_names)

        # Validate annotations
        expected_annot = {}
        for variant in self.variants:
            for alt_idx, alt in enumerate(variant.alt):
                # Third field of an annotation is the index of its alternative allele
                expected_annot[alleleKey(variant, alt)] = sorted(
                    ann for ann in variant.info["expectedANN"]
                    if ann.split("|")[2] == str(alt_idx)
                )
        observed_annot = {}
        with VCFIO(self.tmp_output) as reader:
            for res in reader:
                res_key = res.id + " " + res.info["UNSTD"]
                observed_annot[res_key] = sorted(res.info["ANN"]) if "ANN" in res.info else []
        self.assertEqual(expected_annot, observed_annot)
Ejemplo n.º 8
0
    def testRemove(self):
        """Run the command with --mode remove and check that only the variants expected as PASS are in the output (one entry by alternative allele, in input order)."""
        # Execute command
        subprocess.check_call(
            self.cmd + ["--mode", "remove"],
            stderr=subprocess.DEVNULL
        )

        # Validate results
        expected = [
            variant.id
            for variant in self.variants
            for _ in variant.alt
            if variant.info["expected"] == "PASS"
        ]
        with VCFIO(self.tmp_output) as reader:
            observed = [res.id for res in reader]
        self.assertEqual(expected, observed)
Ejemplo n.º 9
0
    def testVCFIO(self):
        """Run the command and check that the name of each output record is the standardized name expected for the input allele it comes from."""
        # Execute command
        subprocess.check_call(self.cmd, stderr=subprocess.DEVNULL)

        # Validate results
        expected = {}
        for variant in self.variants:
            for alt_idx, alt in enumerate(variant.alt):
                allele_key = "{} {}:{}={}/{}".format(variant.id, variant.chrom, variant.pos, variant.ref, alt)
                expected[allele_key] = variant.info["expected"][alt_idx]
        with VCFIO(self.tmp_output) as reader:
            # UNSTD holds the variant id before standardization
            observed = {
                res.id + " " + res.info["UNSTD"]: res.getName()
                for res in reader
            }
        self.assertEqual(expected, observed)
Ejemplo n.º 10
0
    def testResults(self):
        """Write the regions BED, run the command and check that the output contains exactly the variants whose INFO ZOI is "yes" (order is ignored)."""
        # Create BED
        amplicons = [
            ("artificial_chr1", 5, 25, "ampl1", None, "+", 11, 20),
            ("artificial_chr2", 1, 11, "ampl2", None, "+", 3, 9)
        ]
        with BEDIO(self.tmp_regions, "w", 8) as writer:
            for ampl_fields in amplicons:
                writer.write(BEDRecord(*ampl_fields))

        # Execute command
        subprocess.check_call(self.cmd, stderr=subprocess.DEVNULL)

        # Validate results
        expected = sorted(
            variant.id for variant in self.variants
            if variant.info["ZOI"] == "yes"
        )
        with VCFIO(self.tmp_output) as reader:
            observed = sorted(res.id for res in reader)
        self.assertEqual(expected, observed)
Ejemplo n.º 11
0
def loadBNDByID(in_vcf):
    """
    Return breakend by ID from a VCF file.

    Only the records with SVTYPE equal to BND are kept. If the header declares SR or PR in INFO with the number attribute ".", an exception is raised before reading the records.

    :param in_vcf: Path to the VCF containing BND coming from one fusion caller (format: VCF).
    :type in_vcf: str
    :return: Breakend by ID.
    :rtype: dict
    :raises Exception: If the number attribute of the INFO tag SR or PR is ".".
    """
    error_by_tag = (
        ("SR", 'The number attribute for SR must be "A" or "R" or "1".'),
        ("PR", 'The number attribute for PR must be "A" or "R" or "1".')
    )
    with VCFIO(in_vcf) as reader:
        # Validate the declaration of the support tags
        for tag, error_msg in error_by_tag:
            if tag in reader.info and reader.info[tag].number == ".":
                raise Exception(error_msg)
        # Load breakends
        bnd_by_id = {
            record.id: record
            for record in reader
            if record.info["SVTYPE"] == "BND"
        }
    return bnd_by_id
Ejemplo n.º 12
0
def variantsRegionFromVCF(vcf_path, min_count=1, symbol="GENE", hgvsc="CDS", hgvsp="AA", count="CNT"):
    """
    Return the region object corresponding to the known variants in a VCF.

    A record is skipped when its symbol contains "_ENST" or when its count is lower than min_count. A record without the symbol tag or without the count tag is never skipped on that criterion.

    :param vcf_path: Path to the variants file (format: VCF).
    :type vcf_path: str
    :param min_count: Minimum number of samples where the variant is known in the databases to use its information.
    :type min_count: int
    :param symbol: Tag used in VCF.info to store the symbol of the gene.
    :type symbol: str
    :param hgvsc: Tag used in VCF.info to store the HGVSc.
    :type hgvsc: str
    :param hgvsp: Tag used in VCF.info to store the HGVSp.
    :type hgvsp: str
    :param count: Tag used in VCF.info to store the number of database's samples with this variant.
    :type count: str
    :return: List of variants regions.
    :rtype: anacore.region.RegionList
    """
    kept_regions = []
    with VCFIO(vcf_path) as reader:
        for record in reader:
            info = record.info
            # Skip the records where the symbol contains "_ENST"
            if symbol in info and "_ENST" in info[symbol]:
                continue
            # Skip the variants known in too few samples
            nb_samples = int(info[count]) if count in info else None
            is_frequent = nb_samples is None or nb_samples >= min_count
            if not is_frequent:
                continue
            annotations = {
                "id": record.id,
                "gene": info[symbol] if symbol in info else "",
                "HGVSp": info[hgvsp] if hgvsp in info else "",
                "HGVSc": info[hgvsc] if hgvsc in info else "",
                "count": nb_samples
            }
            # NOTE(review): the end is pos + len(ref); if Region ends are
            # inclusive this is one base after the reference allele - confirm.
            kept_regions.append(
                Region(
                    record.pos,
                    record.pos + len(record.ref),
                    None,
                    record.chrom,
                    record.id,
                    annotations
                )
            )
    return RegionList(kept_regions)
Ejemplo n.º 13
0
    # Logger: root logger at INFO level; the command line and the version are traced
    logging.basicConfig(format='%(asctime)s -- [%(filename)s][pid:%(process)d][%(levelname)s] -- %(message)s')
    log = logging.getLogger()
    log.setLevel(logging.INFO)
    log.info("Command: " + " ".join(sys.argv))
    log.info("Version: " + str(__version__))

    # Get identified variants from VCF
    variants = dict()
    aln_by_samples = dict()  # By sample name: the alignment file of its VCF (None if completion is deactivated)
    for vcf_idx, current_vcf in enumerate(args.input_variants):
        # The alignment file is the one at the same index as the VCF
        current_aln = None
        if not args.deactivate_completion:
            current_aln = args.input_aln[vcf_idx]
        with VCFIO(current_vcf) as FH_vcf:
            # Manage samples
            for curr_spl in FH_vcf.samples:  # For each sample in VCF
                aln_by_samples[curr_spl] = current_aln
            # Manage records
            for record in FH_vcf:  # For each variant
                # Without selected region all the records are processed
                if args.selected_region is None or record.chrom == args.selected_region:
                    for curr_spl in FH_vcf.samples:  # For each sample in VCF
                        # AF and DP as returned by the record for this sample
                        vcaller_AF = record.getAltAF(curr_spl)
                        vcaller_DP = record.getDP(curr_spl)
                        for alt_idx, curr_alt in enumerate(record.alt):  # For each alternative allele in the variant
                            record_allele = getAlleleRecord(FH_vcf, record, alt_idx)
                            # Get allele frequency from the variant caller
                            vcaller_curr_AF = vcaller_AF[alt_idx]
                            if len(vcaller_AF) == len(record.alt) + 1:  # The AF contains the reference AF: alternative alleles are shifted by one
                                vcaller_curr_AF = vcaller_AF[alt_idx + 1]
Ejemplo n.º 14
0
def addVCFVariants(variants, vcf_path, vcf_idx, spl_name=None):
    """
    Add variant from VCF in dict.

    Each alternative allele of each record is normalized and stored under its name. After the call, the list of frequencies of every variant in the dict has at least vcf_idx + 1 elements: the slots not filled by a VCF are set to 0.

    :param variants: By uniq ID the variants. The content of this variable is set by the call of this function.
                     Content example:
                     {
                       "chr1:10=A/T":{
                         "chrom":"chr1",
                         "pos":10,
                         "ref":"A",
                         "alt":"T",
                         "freq":[0.2, 0.5] },
                       "chr1:10=A/G":{
                         "chrom":"chr1",
                         "pos":10,
                         "ref":"A",
                         "alt":"G",
                         "freq":[0.01, 0] },
                       "chr3:20=G/T":{
                         "chrom":"chr3",
                         "pos":20,
                         "ref":"G",
                         "alt":"T",
                         "freq":[0, 0.4] }
                     }
                     The list of frequencies is appended by each call of the function with a vcf_idx different.
    :type variants: dict
    :param vcf_path: Path to the VCF file to add.
    :type vcf_path: str
    :param vcf_idx: Index used to store the frequency of each variant of the VCF in frequencies list (start from 0).
    :type vcf_idx: int
    :param spl_name: The frequency of the variants came from this sample. This parameter is optional when the VCF file contain 0 to 1 sample.
    :type spl_name: str
    """
    def padFrequencies(frequencies):
        """Append 0 to the list until it has an element at the index vcf_idx."""
        while len(frequencies) <= vcf_idx:
            frequencies.append(0)

    with VCFIO(vcf_path) as reader:
        if spl_name is None:
            # NOTE(review): this indexes the first sample, so a VCF without
            # sample cannot be used without spl_name - confirm against the
            # "0 to 1 sample" wording of the documentation.
            spl_name = reader.samples[0]
        for record in reader:
            freq_by_alt = record.getAltAF(spl_name)
            # For each alternative allele
            for alt_idx, _ in enumerate(record.alt):
                allele = getAlleleRecord(reader, record, alt_idx)
                allele.normalizeSingleAllele()
                allele_name = allele.getName()
                if allele_name not in variants:
                    variants[allele_name] = {
                        "chrom": allele.chrom,
                        "pos": allele.pos,
                        "ref": allele.ref,
                        "alt": allele.alt[0],
                        "freq": []
                    }
                frequencies = variants[allele_name]["freq"]
                # Complete variants missing in previous VCF
                padFrequencies(frequencies)
                # Add allele frequency
                frequencies[vcf_idx] = freq_by_alt[alt_idx]
    # Complete variants missing in current VCF
    for variant in variants.values():
        padFrequencies(variant["freq"])
Ejemplo n.º 15
0
    def testTagMultipleValues(self):
        """Rewrite the input VCF with strand counts declared with number="A" (values stored as lists), run the command and check the first filter of each output record."""
        strand_tags = [
            ("SAR", "Number of reads supporting the alternative allele in reverse strand."),
            ("SAF", "Number of reads supporting the alternative allele in forward strand."),
            ("SRR", "Number of reads supporting the reference allele in reverse strand."),
            ("SRF", "Number of reads supporting the reference allele in forward strand.")
        ]

        # Write test data
        with VCFIO(self.tmp_variants, "w") as writer:
            header_info = {
                "expected": HeaderInfoAttr(
                    "expected",
                    "Expected filter tag.",
                    type="String",
                    number="1"
                )
            }
            for tag, description in strand_tags:
                header_info[tag] = HeaderInfoAttr(
                    tag,
                    description,
                    type="Integer",
                    number="A"
                )
            writer.info = header_info
            writer.writeHeader()
            self.variants = [
                # 0.5 alt, 0.5 ref, low DP, alt no bias, ref no bias
                VCFRecord(
                    "artificial_chr1", 10, "sub_01", "G", ["T"], None, None,
                    {
                        "SAR": [5],
                        "SAF": [5],
                        "SRR": [5],
                        "SRF": [5],
                        "expected": "PASS"
                    }
                ),
                # 0.05 alt, 0.95 ref, good DP, alt strand bias, ref no bias
                VCFRecord(
                    "artificial_chr1", 40, "sub_04", "G", ["T"], None, None,
                    {
                        "SAR": [9],
                        "SAF": [1],
                        "SRR": [95],
                        "SRF": [95],
                        "expected": "strandRatioBias"
                    }
                )
            ]
            for variant in self.variants:
                writer.write(variant)

        # Execute command
        subprocess.check_call(self.cmd, stderr=subprocess.DEVNULL)

        # Validate results
        expected = [
            variant.id + ":" + variant.info["expected"]
            for variant in self.variants
            for _ in variant.alt
        ]
        with VCFIO(self.tmp_output) as reader:
            observed = [res.id + ":" + res.filter[0] for res in reader]
        self.assertEqual(expected, observed)
Ejemplo n.º 16
0
    def setUp(self):
        """
        Create the fixtures used by the tests of standardizeVCF.py.

        Writes in the temporary folder a reference (fasta and its faidx) and an input VCF, and stores in self.cmd the command line to execute. The variants written in the VCF are kept in self.variants: their INFO contains the standardized names ("expected") and the standardized annotations ("expectedANN") used by the tests to validate the outputs.
        """
        tmp_folder = tempfile.gettempdir()
        unique_id = str(uuid.uuid1())  # Prefix shared by all the temporary files of this test

        # Temporary files
        self.tmp_sequences = os.path.join(tmp_folder, unique_id + ".fasta")
        self.tmp_faidx = os.path.join(tmp_folder, unique_id + ".fasta.fai")
        self.tmp_variants = os.path.join(tmp_folder, unique_id + ".vcf")
        self.tmp_output = os.path.join(tmp_folder, unique_id + "_out.vcf")

        # Exec command
        self.cmd = [
            "standardizeVCF.py",
            "--trace-unstandard",
            "--input-reference", self.tmp_sequences,
            "--input-variants", self.tmp_variants,
            "--output-variants", self.tmp_output
        ]

        # Create fasta
        with FastaIO(self.tmp_sequences, "w") as FH_seq:
            # Repeats:                                       ****....            ...***
            # Region:                                 |----|        |------------|         |------|
            FH_seq.write(Sequence("artificial_chr1", "CTCAGTCATGTATGTATGTGCTCACAAAGTAGTAGATCATGGCAC"))
            #                                         123456789| | | | | | | | | | | | | | | | | |
            #                                                  10| 14| 18| 22| 26| 30| 34| 38| 42|
            #                                                    12  16  20  24  28  32  36  40  44
            FH_seq.write(Sequence("artificial_chr2", "CGATNNNCGAT"))
            #                                         123456789|
            #                                                  10

        # Create faidx
        # Columns: name, sequence length, offset of the first base, bases by line, bytes by line
        with open(self.tmp_faidx, "w") as FH_fai:
            FH_fai.write("""artificial_chr1	45	17	45	46
artificial_chr2	11	80	11	12""")

        # Create VCF
        with VCFIO(self.tmp_variants, "w") as FH_var:
            FH_var.info = {
                "expected": HeaderInfoAttr("expected", "Standardized version of {chrom}:{pos}={ref}/{alt}.", type="String", number="."),
                "ANN": HeaderInfoAttr("ANN", "Annotation of variants Format: Allele|Annotation_id|Alt_allele_idx", type="String", number="."),
                "expectedANN": HeaderInfoAttr("expectedANN", "Standardized version of annotations Format: Allele|Annotation_id|Alt_allele_idx", type="String", number=".")
            }
            FH_var.writeHeader()
            # "expected" contains one standardized name by alternative allele (same order as alt)
            # The annotations with an empty Alt_allele_idx are absent from "expectedANN"
            self.variants = [
                # Substit single nt
                VCFRecord("artificial_chr1", 14, "sub_01", "G", ["T"], None, None, {
                    "expected": ["artificial_chr1:14=G/T"],
                    "ANN": ["T|ann_1|0", "T|ann_2|0", "A|ann_3|"],
                    "expectedANN": ["T|ann_1|0", "T|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 19, "sub_02", "T", ["A", "C"], None, None, {
                    "expected": ["artificial_chr1:19=T/A", "artificial_chr1:19=T/C"],
                    "ANN": ["A|ann_1|0", "A|ann_2|0", "T|ann_3|"],
                    "expectedANN": ["A|ann_1|0", "A|ann_2|0"]
                }),
                # Substit multi nt
                VCFRecord("artificial_chr1", 7, "sub_03", "CATGTATG", ["GTACCCGC"], None, None, {
                    "expected": ["artificial_chr1:7=CATGTATG/GTACCCGC"],
                    "ANN": ["GTACCCGC|ann_1|0", "GTACCCGC|ann_2|0", "GTGT|ann_3|"],
                    "expectedANN": ["GTACCCGC|ann_1|0", "GTACCCGC|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 11, "sub_04", "TATGTATG", ["GTACCCGC", "GTACCCAA"], None, None, {
                    "expected": ["artificial_chr1:11=TATGTATG/GTACCCGC", "artificial_chr1:11=TATGTATG/GTACCCAA"],
                    "ANN": ["GTACCCGC|ann_1|0", "GTACCCGC|ann_2|0", "GTACCCAA|ann_3|1"],
                    "expectedANN": ["GTACCCGC|ann_1|0", "GTACCCGC|ann_2|0", "GTACCCAA|ann_3|1"]
                }),
                # Insertion single nt
                VCFRecord("artificial_chr1", 14, "ins_01", "G", ["GA"], None, None, {
                    "expected": ["artificial_chr1:14=G/GA"],
                    "ANN": ["GA|ann_1|0", "GA|ann_2|0", "GT|ann_3|"],
                    "expectedANN": ["GA|ann_1|0", "GA|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 20, "ins_02", "-", ["A"], None, None, {
                    "expected": ["artificial_chr1:19=T/TA"],
                    "ANN": ["A|ann_1|0", "A|ann_2|0", "T|ann_3|"],
                    "expectedANN": ["TA|ann_1|0", "TA|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 14, "ins_03", "G", ["GA", "GC"], None, None, {
                    "expected": ["artificial_chr1:14=G/GA", "artificial_chr1:14=G/GC"],
                    "ANN": ["GA|ann_1|0", "GA|ann_2|0", "GC|ann_3|1", "GT|ann_4|"],
                    "expectedANN": ["GA|ann_1|0", "GA|ann_2|0", "GC|ann_3|1"]
                }),
                VCFRecord("artificial_chr1", 20, "ins_04", "-", ["A", "C"], None, None, {
                    "expected": ["artificial_chr1:19=T/TA", "artificial_chr1:19=T/TC"],
                    "ANN": ["A|ann_1|0", "A|ann_2|0", "C|ann_3|1", "T|ann_4|"],
                    "expectedANN": ["TA|ann_1|0", "TA|ann_2|0", "TC|ann_3|1"]
                }),
                # Insertion multi nt
                VCFRecord("artificial_chr1", 14, "ins_05", "G", ["GATGC"], None, None, {
                    "expected": ["artificial_chr1:14=G/GATGC"],
                    "ANN": ["GATGC|ann_1|0", "GATGC|ann_2|0", "GAAAC|ann_3|"],
                    "expectedANN": ["GATGC|ann_1|0", "GATGC|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 20, "ins_06", "-", ["AAATC"], None, None, {
                    "expected": ["artificial_chr1:19=T/TAAATC"],
                    "ANN": ["AAATC|ann_1|0", "AAATC|ann_2|0", "GAAAC|ann_3|"],
                    "expectedANN": ["TAAATC|ann_1|0", "TAAATC|ann_2|0"]
                }),
                # Movable insertion multi nt
                VCFRecord("artificial_chr1", 14, "ins_07", "G", ["GTG"], None, None, {
                    "expected": ["artificial_chr1:12=A/ATG"],
                    "ANN": ["GTG|ann_1|0", "GTG|ann_2|0", "GAAAC|ann_3|"],
                    "expectedANN": ["ATG|ann_1|0", "ATG|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 27, "ins_08", "A", ["AAAA"], None, None, {
                    "expected": ["artificial_chr1:25=C/CAAA"],
                    "ANN": ["AAAA|ann_1|0", "AAAA|ann_2|0", "CAAA|ann_3|"],
                    "expectedANN": ["CAAA|ann_1|0", "CAAA|ann_2|0"]
                }),
                # Deletion single nt
                VCFRecord("artificial_chr1", 14, "del_01", "G", [""], None, None, {
                    "expected": ["artificial_chr1:13=TG/T"],
                    "ANN": ["-|ann_1|0", "-|ann_2|0", "T|ann_3|"],
                    "expectedANN": ["T|ann_1|0", "T|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 14, "del_02", "G", ["-"], None, None, {
                    "expected": ["artificial_chr1:13=TG/T"],
                    "ANN": ["-|ann_1|0", "-|ann_2|0", "T|ann_3|"],
                    "expectedANN": ["T|ann_1|0", "T|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 13, "del_03", "TG", ["T"], None, None, {
                    "expected": ["artificial_chr1:13=TG/T"],
                    "ANN": ["T|ann_1|0", "T|ann_2|0", "A|ann_3|"],
                    "expectedANN": ["T|ann_1|0", "T|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 13, "del_04", "TG", ["T", "-"], None, None, {
                    "expected": ["artificial_chr1:13=TG/T", "artificial_chr1:12=ATG/A"],
                    "ANN": ["T|ann_1|0", "T|ann_2|0", "-|ann_3|1"],
                    "expectedANN": ["T|ann_1|0", "T|ann_2|0", "A|ann_3|1"]
                }),
                # Movable deletion multi nt
                VCFRecord("artificial_chr1", 11, "del_05", "TATG", ["T", "TA", "-"], None, None, {
                    "expected": ["artificial_chr1:11=TATG/T", "artificial_chr1:12=ATG/A", "artificial_chr1:7=CATGT/C"],
                    "ANN": ["T|ann_1|0", "T|ann_2|0", "TA|ann_3|1", "-|ann_4|2"],
                    "expectedANN": ["T|ann_1|0", "T|ann_2|0", "A|ann_3|1", "C|ann_4|2"]
                }),
            ]
            for idx, curr_var in enumerate(self.variants):
                FH_var.write(curr_var)
Ejemplo n.º 17
0
        '-i',
        '--input-variants',
        required=True,
        help='The path to the variants file (format: VCF).')
    group_output = parser.add_argument_group('Outputs')  # Outputs
    group_output.add_argument(
        '-o',
        '--output-variants',
        required=True,
        help=
        'The path to the outputted file containing the constitutive variants (format: TSV).'
    )
    args = parser.parse_args()

    # Process
    # Read the input VCF and write a tab-separated report to the output path.
    with VCFIO(args.input_variants) as FH_in:
        with open(args.output_variants, "w") as FH_out:
            # Header
            # Two "##" lines trace the command line and the script version,
            # then one "#"-prefixed line gives the column titles.
            FH_out.write("## PARAMETERS: {}\n".format(" ".join(sys.argv)))
            FH_out.write("## VERSION: {}\n".format(__version__))
            FH_out.write("\t".join([
                "#Chromosome", "Position", "Reference_allele",
                "Alternative_allele", "Noise_rate", "Nb_input_spl",
                "Nb_usable_spl", "Nb_support_spl", "Nb_constit_spl",
                "Constit_spl", "Constit_AF"
            ]) + "\n")
            # Records
            for record in FH_in:
                # Each alternative allele of the record is handled separately
                for idx in range(len(record.alt)):
                    # presumably a record restricted to the allele idx -- TODO confirm against getAlleleRecord
                    curr_allele = getAlleleRecord(FH_in, record, idx)
                    # Number of samples declared in the input VCF
                    nb_spl = len(FH_in.samples)
Ejemplo n.º 18
0
    def setUp(self):
        """
        Create the temporary paths, the command line and the input VCF used to test filterVCFBySOR.py.

        Each variant written in the VCF stores its stranded read counts (SAR, SAF, SRR, SRF) and, in the INFO tag "expected", the filter tag declared as expected for it.
        """
        # Temporary files: they share the same unique prefix in the temporary folder
        work_prefix = os.path.join(tempfile.gettempdir(), str(uuid.uuid1()))
        self.tmp_variants = work_prefix + ".vcf"
        self.tmp_output = work_prefix + "_out.vcf"

        # Exec command
        self.cmd = ["filterVCFBySOR.py"]
        self.cmd += ["--input-variants", self.tmp_variants]
        self.cmd += ["--output-variants", self.tmp_output]

        # Stranded counts declared in the header: (tag, description)
        count_tags = (
            ("SAR", "Number of reads supporting the alternative allele in reverse strand."),
            ("SAF", "Number of reads supporting the alternative allele in forward strand."),
            ("SRR", "Number of reads supporting the reference allele in reverse strand."),
            ("SRF", "Number of reads supporting the reference allele in forward strand."),
        )

        # Test cases: (position, id, SAR, SAF, SRR, SRF, expected filter tag)
        strand_cases = (
            # 0.5 alt, 0.5 ref, low DP, alt no bias, ref no bias
            (10, "sub_01", 5, 5, 5, 5, "PASS"),
            # 0.05 alt, 0.95 ref, good DP, alt no bias, ref no bias
            (20, "sub_02", 5, 5, 95, 95, "PASS"),
            # 0.05 alt, 0.95 ref, good DP, alt no bias, ref strand bias
            (30, "sub_03", 5, 5, 150, 30, "PASS"),
            # 0.05 alt, 0.95 ref, good DP, alt strand bias, ref no bias
            (40, "sub_04", 9, 1, 95, 95, "strandRatioBias"),
            # 0.05 alt, 0.95 ref, good DP, alt strand bias, ref strand bias => no bias
            (50, "sub_05", 9, 1, 150, 30, "PASS"),
            # 0.5 alt, 0.5 ref, low DP, alt strand bias, ref no bias
            (60, "sub_06", 9, 1, 5, 5, "strandRatioBias"),
            # 0.29 alt, 0.71 ref, good DP, alt no bias, ref no bias
            (70, "sub_07", 400, 600, 1400, 1000, "PASS"),
            # 0.71 alt, 0.29 ref, good DP, alt no bias, ref no bias
            (80, "sub_08", 1400, 1000, 400, 600, "PASS"),
            # 1.0 alt, 0.0 ref, good DP, alt no bias, ref 0 DP
            (90, "sub_09", 1400, 1000, 0, 0, "PASS"),
            # 1.0 alt, 0.0 ref, good DP, alt no bias, ref 2 DP
            (100, "sub_10", 1400, 1000, 0, 2, "PASS"),
            # 1.0 alt, 0.0 ref, limit DP, alt no bias, ref 0 DP
            (110, "sub_11", 90, 30, 0, 0, "PASS"),
            # 1.0 alt, 0.0 ref, limit DP, alt no bias, ref 2 DP
            (120, "sub_12", 90, 30, 0, 2, "PASS"),
            # 1.0 alt, 0.0 ref, limit DP, alt strand bias, ref 0 DP
            (130, "sub_13", 90, 10, 0, 0, "strandRatioBias"),
            # 1.0 alt, 0.0 ref, limit DP, alt strand bias, ref 2 DP
            (140, "sub_14", 90, 10, 0, 2, "strandRatioBias"),
            # 1.0 alt, 0.0 ref, limit DP, alt strand bias, ref 1 DP
            (150, "sub_15", 90, 10, 1, 0, "PASS"),  # It can be discussed: 2.89
            # 0.04 alt, 0.96 ref, good DP, alt strand bias, ref no bias
            (160, "sub_16", 15, 2, 200, 200, "strandRatioBias"),
            # 0.04 alt, 0.96 ref, good DP, alt strand bias, ref no bias
            (170, "sub_17", 13, 2, 200, 200, "strandRatioBias"),  # SAR 12 => PASS
            # 0.04 alt, 0.96 ref, good DP, alt strand bias, ref strand bias => no bias
            (180, "sub_18", 13, 2, 350, 50, "PASS"),
            # 0.04 alt, 0.96 ref, good DP, alt strand bias, ref strand bias rev => bias
            (190, "sub_19", 13, 2, 50, 350, "strandRatioBias"),
            # 0.5 alt, 0.5 ref, low DP, alt strand bias, ref no bias
            (200, "sub_20", 14, 2, 8, 8, "strandRatioBias"),
        )

        # Create VCF
        with VCFIO(self.tmp_variants, "w") as FH_var:
            header_info = {
                "expected": HeaderInfoAttr(
                    "expected", "Expected filter tag.", type="String", number="1")
            }
            for tag, description in count_tags:
                header_info[tag] = HeaderInfoAttr(
                    tag, description, type="Integer", number="1")
            FH_var.info = header_info
            FH_var.writeHeader()
            # All the cases are G>T substitutions on artificial_chr1
            self.variants = [
                VCFRecord(
                    "artificial_chr1", pos, name, "G", ["T"], None, None, {
                        "SAR": alt_rev,
                        "SAF": alt_fwd,
                        "SRR": ref_rev,
                        "SRF": ref_fwd,
                        "expected": expected
                    })
                for pos, name, alt_rev, alt_fwd, ref_rev, ref_fwd, expected in strand_cases
            ]
            for variant in self.variants:
                FH_var.write(variant)
Ejemplo n.º 19
0
    )
    log = logging.getLogger(os.path.basename(__file__))
    log.setLevel(logging.INFO)
    log.info("Command: " + " ".join(sys.argv))

    # Process
    # Stays None unless --input-cosmic is provided.
    # NOTE(review): the reader is opened outside a "with"; its closing is not visible in this excerpt -- confirm it is closed afterwards.
    cosmic_reader = None
    with AnnotVCFIO(args.output_variants,
                    "w",
                    annot_field=args.annotations_field) as FH_out:
        with AnnotVCFIO(args.input_variants,
                        annot_field=args.annotations_field) as FH_in:
            # Header
            FH_out.copyHeader(FH_in)
            if args.input_cosmic:
                # Trace the version of the COSMIC databank in the output header
                cosmic_reader = VCFIO(args.input_cosmic, "i")
                cosmic_version = getDatabankVersion(cosmic_reader)
                FH_out.extra_header.append(
                    "##COSMIC={}".format(cosmic_version))
            FH_out.writeHeader()
            # Records
            for record in FH_in:
                # To upper
                # Reference, alternatives and annotated alleles are upper-cased in place
                record.ref = record.ref.upper()
                record.alt = [alt.upper() for alt in record.alt]
                for annot in record.info[FH_in.annot_field]:
                    annot["Allele"] = annot["Allele"].upper()
                # Change alternative representation
                for alt_idx, alt in enumerate(record.alt):
                    alt_record = getAlleleRecord(FH_in, record, alt_idx)
                    # Only the first element returned by getVEPAlt is kept
                    vep_alt = getVEPAlt(alt_record.ref, alt_record.alt)[0]
Ejemplo n.º 20
0
    # Both paths are mandatory: argparse stops with a usage error when one is missing
    group_input = parser.add_argument_group('Inputs')  # Inputs
    group_input.add_argument(
        '-i',
        '--input-variants',
        required=True,
        help='The path to the variants file (format: VCF).')
    group_output = parser.add_argument_group('Outputs')  # Outputs
    group_output.add_argument(
        '-o',
        '--output-variants',
        required=True,
        help='The path to the outputted variants file (format: VCF).')
    args = parser.parse_args()

    # Process
    with VCFIO(args.output_variants, "w") as FH_out:
        with VCFIO(args.input_variants) as FH_in:
            # Header
            FH_out.copyHeader(FH_in)
            FH_out.writeHeader()
            # Records
            # All the records are kept in memory, grouped by chromosome, before any sorting
            records_by_chr = dict()
            for record in FH_in:
                if record.chrom not in records_by_chr:
                    records_by_chr[record.chrom] = list()
                records_by_chr[record.chrom].append(record)
            # Chromosomes are processed in the order given by sorted() on their names
            for chrom in sorted(records_by_chr):
                # Within a chromosome: by position, then by refEnd(), then by
                # the first alternative allele (x.chrom is constant in the group)
                sorted_records = sorted(records_by_chr[chrom],
                                        key=lambda x:
                                        (x.chrom, x.pos, x.refEnd(), x.alt[0]))
                for record in sorted_records:
Ejemplo n.º 21
0
    def setUp(self):
        """
        Create the temporary paths, the command line and the input files (reference, faidx, targets, variants) used to test filterVCFTargets.py.

        Each variant written in the VCF stores, in the INFO tag "target", the ID of the target declared as overlapped (None for none).
        """
        # Temporary files: they share the same unique prefix in the temporary folder
        work_prefix = os.path.join(tempfile.gettempdir(), str(uuid.uuid1()))
        self.tmp_sequences = work_prefix + ".fasta"
        self.tmp_faidx = work_prefix + ".fasta.fai"
        self.tmp_regions = work_prefix + ".bed"
        self.tmp_variants = work_prefix + ".vcf"
        self.tmp_output = work_prefix + "_out.vcf"

        # Exec command
        self.cmd = ["filterVCFTargets.py", "--mode", "remove"]
        self.cmd += ["--input-variants", self.tmp_variants]
        self.cmd += ["--input-targets", self.tmp_regions]
        self.cmd += ["--input-reference", self.tmp_sequences]
        self.cmd += ["--output-variants", self.tmp_output]

        # Create fasta
        # Tandem repeats of artificial_chr1: ATGT from 8 to 19 (around the
        # start of target_2) and AGT from 28 to 33 (around the end of target_2)
        # Region:   |----|        |------------|         |------|
        chr1_seq = "CTCAGTCATGTATGTATGTGCTCACAAAGTAGTAGATCATGGCAC"
        # Position: 1        10        20        30        40
        chr2_seq = "CGATNNNCGAT"
        # Position: 1        10
        with FastaIO(self.tmp_sequences, "w") as FH_seq:
            FH_seq.write(Sequence("artificial_chr1", chr1_seq))
            FH_seq.write(Sequence("artificial_chr2", chr2_seq))

        # Create faidx: name, length, offset, line bases and line width separated by tabulations
        faidx_content = "\n".join([
            "artificial_chr1\t45\t17\t45\t46",
            "artificial_chr2\t11\t80\t11\t12"
        ])
        with open(self.tmp_faidx, "w") as FH_fai:
            FH_fai.write(faidx_content)

        # Create targets
        targets = ((1, 6, "target_1"), (15, 28, "target_2"), (38, 45, "target_3"))
        with BEDIO(self.tmp_regions, "w", write_nb_col=4) as FH_bed:
            for start, end, target_name in targets:
                FH_bed.write(BEDRecord("artificial_chr1", start, end, target_name))

        # Test cases: (position, id, reference allele, alternative allele, expected target)
        target_cases = (
            # Substit single nt
            (14, "alt_00", "G", "T", None),  # Before target ; first nt before target
            (15, "alt_01", "G", "T", "target_2"),  # On target ; first nt of target
            (21, "alt_02", "C", "G", "target_2"),  # On target
            (28, "alt_03", "A", "G", "target_2"),  # On target ; last nt
            (29, "alt_04", "G", "C", None),  # After target ; first nt after target
            # Substit multi nt
            (7, "alt_05", "CATGTATG", "GTACCCGC", None),  # Before target ; first nt before target
            (11, "alt_06", "TATGTATG", "GTACCCGC", "target_2"),  # Overlap target start
            (13, "alt_07", "TGTATGTGCTCACAAAGTA", "CCCGCCCCTACATTGCAGT", "target_2"),  # Include target
            (15, "alt_08", "TATGTGCTCACAAA", "CGCCCCTACATTGC", "target_2"),  # Exact target
            (21, "alt_09", "CTCACAA", "GTACCCG", "target_2"),  # Included by target
            (24, "alt_10", "ACAAAGTA", "GTACCCG", "target_2"),  # Overlap target end
            (29, "alt_11", "GTAGTAGAT", "GTACCCGA", None),  # After target ; first nt after target
            # Ins single nt
            (14, "alt_12", "G", "GA", None),  # Before target ; first nt before target
            (15, "alt_12.2", "-", "A", None),  # Before target ; first nt before target
            (15, "alt_13", "A", "TG", "target_2"),  # On target ; first nt of target
            (21, "alt_14", "C", "CG", "target_2"),  # On target
            (27, "alt_15", "A", "AT", "target_2"),  # On target ; last nt
            (28, "alt_15.2", "-", "T", "target_2"),  # On target ; last nt
            (28, "alt_16", "A", "AT", None),  # After target ; first nt after target
            # Movable ins single nt
            (14, "alt_17", "G", "GT", "target_2"),  # Movable to first nt of target
            (28, "alt_18", "A", "AA", "target_2"),  # Movable to last nt of target
            # Del single nt
            (14, "alt_19", "G", "", None),  # Before target ; first nt before target
            (15, "alt_20", "T", "", "target_2"),  # On target ; first nt of target
            (21, "alt_21", "C", "", "target_2"),  # On target
            (28, "alt_22", "A", "", "target_2"),  # On target ; last nt
            (29, "alt_23", "G", "", None),  # After target ; first nt after target
            # Del multi nt
            (11, "alt_24", "TATG", "T", None),  # Before target ; first nt before target
            (13, "alt_25", "TGTA", "T", "target_2"),  # On target ; first nt of target
            (20, "alt_26", "GCTC", "G", "target_2"),  # On target
            (27, "alt_27", "AAGT", "A", "target_2"),  # On target ; last nt
            (28, "alt_28", "AGT", "A", None),  # After target ; first nt after target
            # Movable del multi nt
            (7, "alt_29", "CATGT", "C", "target_2"),  # On repeat and movable to first nt of target
            (12, "alt_30", "ATG", "A", "target_2"),  # Movable to first nt of target
            (28, "alt_31", "AGTA", "A", "target_2"),  # Movable to last nt of target
            (30, "alt_32", "TAGT", "T", "target_2"),  # On repeat and movable to last nt of target
        )

        # Create VCF
        with VCFIO(self.tmp_variants, "w") as FH_var:
            FH_var.info = {
                "target": HeaderInfoAttr(
                    "target",
                    "The ID of the overlapped target.",
                    type="String",
                    number="1")
            }
            FH_var.writeHeader()
            # All the cases are mono-allelic variants on artificial_chr1
            self.variants = [
                VCFRecord("artificial_chr1", pos, name, ref, [alt], None, None, {"target": target})
                for pos, name, ref, alt, target in target_cases
            ]
            for variant in self.variants:
                FH_var.write(variant)
Ejemplo n.º 22
0
        'chr19': '19',
        'chr20': '20',
        'chr21': '21',
        'chr22': '22',
        'chrX': 'X',
        'chrY': 'Y',
        'chrM': 'MT'
    }
    if args.input_names:
        # Each row of the title-less TSV maps a name (1st column) to its
        # replacement (2nd column); these rows are added to new_names and
        # replace the entries already stored under the same name.
        with SVIO(args.input_names, "r", separator="\t", has_title=False) as names_reader:
            new_names.update((row[0], row[1]) for row in names_reader)

    # Process
    with VCFIO(args.output_variants, "w") as writer:
        with VCFIO(args.input_variants, "r") as reader:
            # Header
            writer.copyHeader(reader)
            # Rename the contigs declared in the header with the names stored in new_names.
            for idx, curr_header in enumerate(writer.extra_header):
                if not curr_header.startswith("##contig"):
                    continue
                old_id = uGetHeaderAttr(curr_header).id
                if old_id not in new_names:
                    continue
                new_id = new_names[old_id]
                # The ID value is followed by "," when other attributes come
                # after it and by ">" when it is the last (or only) attribute
                # of the line (e.g. "##contig=<ID=chr1>"): both forms must be
                # renamed, otherwise the header keeps the old contig name.
                for closing_char in (",", ">"):
                    curr_header = curr_header.replace(
                        "ID={}{}".format(old_id, closing_char),
                        "ID={}{}".format(new_id, closing_char))
                writer.extra_header[idx] = curr_header
            writer.writeHeader()
            # Variants
            for record in reader:
                if record.chrom in new_names:
Ejemplo n.º 23
0
    # Declare the file arguments in their own help sections and parse the command line.
    group_input = parser.add_argument_group('Inputs')  # Inputs
    group_input.add_argument('-i', '--input-variants', help='Path to the variants file (format: VCF).')
    group_output = parser.add_argument_group('Outputs')  # Outputs
    # The output path is registered on the "Outputs" group (it was previously
    # added to the "Inputs" group, which left "Outputs" empty in the help).
    group_output.add_argument('-o', '--output-variants', help='Path to the file outputted file (format: VCF).')
    args = parser.parse_args()

    # Logger
    logging.basicConfig(format='%(asctime)s -- [%(filename)s][pid:%(process)d][%(levelname)s] -- %(message)s')
    log = logging.getLogger(os.path.basename(__file__))
    log.setLevel(logging.INFO)
    log.info("Command: " + " ".join(sys.argv))

    # Process
    # Counters of the processed records (incremented in the loop below; the
    # update of nb_filtered is not visible in this excerpt)
    nb_variants = 0
    nb_filtered = 0
    with VCFIO(args.input_variants) as handle_in:
        with VCFIO(args.output_variants, "w") as handle_out:
            # Header
            # The input header is copied, then the SOR INFO tag and the bias FILTER tag are declared
            handle_out.copyHeader(handle_in)
            handle_out.info[args.SOR_tag] = HeaderInfoAttr(args.SOR_tag, "Strand bias estimated by the symmetric odds ratio test.", type="Float")
            handle_out.filter[args.bias_tag] = HeaderFilterAttr(args.bias_tag, "Strand ratio bias (estimated by the symmetric odds ratio test): substit SOR > {}, InDel SOR > {}.".format(args.substit_max_SOR, args.indel_max_SOR))
            handle_out.writeHeader()
            # Records
            for record in handle_in:
                # Records with several alternative alleles stop the process
                if len(record.alt) > 1:
                    raise Exception("The multi-allelic variants cannot be processed: {}.".format(record.getName()))
                nb_variants += 1
                is_filtered = False
                # Compute SOR
                # A count declared with number "1" in the header is read as
                # is; otherwise the first element of the stored value is used
                record.info[args.SOR_tag] = strandOddRatio(
                    record.info[args.ref_fwd_tag] if handle_in.info[args.ref_fwd_tag].number == "1" else record.info[args.ref_fwd_tag][0],
    def setUp(self):
        tmp_folder = tempfile.gettempdir()
        unique_id = str(uuid.uuid1())

        # Temporary files
        self.tmp_sequences = os.path.join(tmp_folder, unique_id + ".fasta")
        self.tmp_faidx = os.path.join(tmp_folder, unique_id + ".fasta.fai")
        self.tmp_variants = os.path.join(tmp_folder, unique_id + ".vcf")
        self.tmp_output = os.path.join(tmp_folder, unique_id + "_out.vcf")

        # Exec command
        self.cmd = [
            "filterVCFHomopolym.py", "--mode", "remove", "--homopolym-length",
            "4", "--input-variants", self.tmp_variants, "--input-reference",
            self.tmp_sequences, "--output-variants", self.tmp_output
        ]

        # Create fasta
        with FastaIO(self.tmp_sequences, "w") as FH_seq:
            #                                                    12  16  20  24  28  32  36  40  44  48  52  56  60  64  68  72  76  80  84  88  92  96  100
            #                                          2 4 6 8 10| 14| 18| 22| 26| 30| 34| 38| 42| 46| 50| 54| 58| 62| 66| 70| 74| 78| 82| 86| 90| 94| 98| 102
            #                                          | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
            FH_seq.write(
                Sequence(
                    "artificial_chr1",
                    "CGAATATGATCCAGCAATAAAAAGTTCCTACAGGAAAAAAGTAGAAAGAGAAACCTGTCTCTTGGATATTCTCGACACAGCAGGTCAAG"
                ))
            FH_seq.write(
                Sequence(
                    "artificial_chr2",
                    "CGAATATGATCCAGCAATAAAAAGCTCCTACAGGCAAAAGTAGGCAAAGAGAAACCTGTCTCTTGGATATTCTCGACACAGCAGGTCAA"
                ))
            FH_seq.write(
                Sequence(
                    "artificial_chr3",
                    "CGAATATGATCCAGCAATGAAAATTCCTACAGGTAAAACGTAGAAAGAGAAACCTGTCTCTTGGATATTCTCGACACAGCAGGTCAAG"
                ))
            FH_seq.write(
                Sequence(
                    "artificial_chr4",
                    "CGAATATGATCCAGCAATAAAAAGTTCCTACAGGAAAAAAGTAGAAAGAGAAACCTGTCAAAAGGATATTCTCGACAAAACAGCAGAAAGTCAAG"
                ))
            FH_seq.write(
                Sequence(
                    "artificial_chr5",
                    "CGAATATGATCCAGTAATAAAAAGTTCCTACAGGAAAAAAGTAGAAAGAGAAACCTGTCTCTTGGATATTCTCGACACAGCAGGTCAAG"
                ))
            FH_seq.write(
                Sequence(
                    "artificial_chr6",
                    "CGAATATGATCCAGCAATAAAAAGTTCCTACAGGAAAAAAGTAGAAAGCACAACCTGTCTCTTGGAAAATCTCGACACAGCAGGTAAAACAATGCAGTAAAT"
                ))
        """
        Variant	before_start	before_end	before_seq	after_start	after_end	after_seq
        alt_00	10	13	TCCA	15	18	CAAT
        alt_01	20	23	AAAA	25	28	TTCC
        alt_02	30	33	ACAG	35	38	AAAA
        alt_03	40	43	AGTA	45	48	AAAG
        alt_04	10	13	TCCA	16	19	AATA
        alt_05	20	23	AAAA	26	29	TCCT
        alt_06	30	33	ACAG	36	39	AAAA
        alt_07	40	43	GTAG	46	49	AAAG
        alt_08	11	14	CCAG	15	18	CAAT
        alt_09	20	23	AAAA	24	27	TTCC
        alt_10	31	34	AGGT	35	38	AAAA
        alt_11	40	43	GTAG	44	47	AAAG
        alt_12	11	14	CCAG	15	18	CAAT
        alt_13	20	23	AAAA	24	27	GTTC
        alt_14	31	34	CAGG	35	38	AAAA
        alt_15	41	44	GTAG	45	48	AAAG
        alt_16	50	53	GAAA	57	60	GTCA
        alt_17	60	63	AAAA	67	70	TATT
        alt_18	70	73	TCTC	77	80	AAAA
        alt_19	80	83	ACAG	87	90	AAAG
        alt_20	11	14	CCAG	16	19	AATA
        alt_21	20	23	AAAA	25	28	TTCC
        alt_22	31	34	CAGG	36	39	AAAA
        alt_23	40	43	AGTA	45	48	AAAG
        alt_24	11	14	CCAG	17	20	ATAA
        alt_25	19	22	AAAA	26	29	TCCT
        alt_26	29	32	TACA	35	38	AAAA
        alt_27	38	41	AAAG	45	48	AAAG
        alt_28	50	53	ACAA	61	64	CTTG
        alt_29	66	69	AAAA	76	79	CACA
        alt_30	76	79	CACA	86	89	AAAA
        alt_31	88	91	AACA	99	102	AAAT
        """

        # Create faidx
        with open(self.tmp_faidx, "w") as FH_fai:
            FH_fai.write("""artificial_chr1	89	17	89	90
artificial_chr2	89	124	89	90
artificial_chr3	88	231	88	89
artificial_chr4	95	337	95	96
artificial_chr5	89	450	89	90
artificial_chr6	102	557	102	103""")

        # Create VCF
        with VCFIO(self.tmp_variants, "w") as FH_var:
            FH_var.info = {
                "is_filtered":
                HeaderInfoAttr(
                    "is_filtered",
                    "1 if the variant is adjacent to an homopolymer.",
                    type="Integer",
                    number="1")
            }
            FH_var.writeHeader()
            self.variants = [
                # Substit single nt
                VCFRecord("artificial_chr1", 14, "alt_00", "G", ["T"], None,
                          None,
                          {"is_filtered": 0}),  # Without adjacent homopolymers
                VCFRecord(
                    "artificial_chr1", 24, "alt_01", "G", ["T"], None, None,
                    {"is_filtered": 1}),  # Adjacent homopolymers upstream
                VCFRecord(
                    "artificial_chr1", 34, "alt_02", "G", ["T"], None, None,
                    {"is_filtered": 1}),  # Adjacent homopolymers downstream
                VCFRecord(
                    "artificial_chr1", 44, "alt_03", "G", ["T"], None, None,
                    {"is_filtered": 0}),  # Adjacent too short homopolymers
                # Substit multi nt
                VCFRecord("artificial_chr2", 14, "alt_04", "GC", ["TA"], None,
                          None,
                          {"is_filtered": 0}),  # Without adjacent homopolymers
                VCFRecord(
                    "artificial_chr2", 24, "alt_05", "GC", ["TA"], None, None,
                    {"is_filtered": 1}),  # Adjacent homopolymers upstream
                VCFRecord(
                    "artificial_chr2", 34, "alt_06", "GC", ["TA"], None, None,
                    {"is_filtered": 1}),  # Adjacent homopolymers downstream
                VCFRecord(
                    "artificial_chr2", 44, "alt_07", "GC", ["TA"], None, None,
                    {"is_filtered": 0}),  # Adjacent too short homopolymers
                # Ins single nt
                VCFRecord("artificial_chr3", 14, "alt_08", "G", ["GT"], None,
                          None,
                          {"is_filtered": 0}),  # Without adjacent homopolymers
                VCFRecord(
                    "artificial_chr3", 23, "alt_09", "A", ["AT"], None, None,
                    {"is_filtered": 1}),  # Adjacent homopolymers upstream
                VCFRecord(
                    "artificial_chr3", 34, "alt_10", "T", ["TA"], None, None,
                    {"is_filtered": 1}),  # Adjacent homopolymers downstream
                VCFRecord(
                    "artificial_chr3", 43, "alt_11", "G", ["GT"], None, None,
                    {"is_filtered": 0}),  # Adjacent too short homopolymers
                # Ins multi nt
                VCFRecord("artificial_chr4", 14, "alt_12", "G", ["GTA"], None,
                          None,
                          {"is_filtered": 0}),  # Without adjacent homopolymers
                VCFRecord(
                    "artificial_chr4", 23, "alt_13", "A", ["ATA"], None, None,
                    {"is_filtered": 1}),  # Adjacent homopolymers upstream
                VCFRecord(
                    "artificial_chr4", 34, "alt_14", "G", ["GTA"], None, None,
                    {"is_filtered": 1}),  # Adjacent homopolymers downstream
                VCFRecord(
                    "artificial_chr4", 44, "alt_15", "G", ["GTC"], None, None,
                    {"is_filtered": 0}),  # Adjacent too short homopolymer
                VCFRecord("artificial_chr4", 54, "alt_16", "CCT", ["ATCCAGA"],
                          None, None,
                          {"is_filtered": 0}),  # Without adjacent homopolymers
                VCFRecord(
                    "artificial_chr4", 64, "alt_17", "GGA", ["CTCCAGT"], None,
                    None,
                    {"is_filtered": 1}),  # Adjacent homopolymers upstream
                VCFRecord(
                    "artificial_chr4", 74, "alt_18", "GAC", ["ATCCAGT"], None,
                    None,
                    {"is_filtered": 1}),  # Adjacent homopolymers downstream
                VCFRecord(
                    "artificial_chr4", 84, "alt_19", "CAG", ["ATCCAGT"], None,
                    None,
                    {"is_filtered": 0}),  # Adjacent too short homopolymer
                # Del single nt
                VCFRecord("artificial_chr5", 14, "alt_20", "GT", ["G"], None,
                          None,
                          {"is_filtered": 0}),  # Without adjacent homopolymers
                VCFRecord(
                    "artificial_chr5", 23, "alt_21", "AG", ["A"], None, None,
                    {"is_filtered": 1}),  # Adjacent homopolymers upstream
                VCFRecord(
                    "artificial_chr5", 34, "alt_22", "GA", ["G"], None, None,
                    {"is_filtered": 1}),  # Adjacent homopolymers downstream
                VCFRecord(
                    "artificial_chr5", 43, "alt_23", "AG", ["A"], None, None,
                    {"is_filtered": 0}),  # Adjacent too short homopolymers
                # # Del multi nt
                VCFRecord("artificial_chr6", 14, "alt_24", "GCA", ["G"], None,
                          None,
                          {"is_filtered": 0}),  # Without adjacent homopolymers
                VCFRecord(
                    "artificial_chr6", 23, "alt_25", "AGT", ["C"], None, None,
                    {"is_filtered": 1}),  # Adjacent homopolymers upstream
                VCFRecord(
                    "artificial_chr6", 32, "alt_26", "AGG", ["A"], None, None,
                    {"is_filtered": 1}),  # Adjacent homopolymers downstream
                VCFRecord(
                    "artificial_chr6", 42, "alt_27", "TAG", ["C"], None, None,
                    {"is_filtered": 0}),  # Adjacent too short homopolymer
                VCFRecord("artificial_chr6", 54, "alt_28", "CCTGTCT", ["GAA"],
                          None, None,
                          {"is_filtered": 0}),  # Without adjacent homopolymers
                VCFRecord(
                    "artificial_chr6", 70, "alt_29", "TCTCGA", ["CCC"], None,
                    None,
                    {"is_filtered": 1}),  # Adjacent homopolymers upstream
                VCFRecord(
                    "artificial_chr6", 80, "alt_30", "GCAGGT", ["CCC"], None,
                    None,
                    {"is_filtered": 1}),  # Adjacent homopolymers downstream
                VCFRecord(
                    "artificial_chr6", 92, "alt_31", "ATGCAGT", ["CCC"], None,
                    None,
                    {"is_filtered": 0}),  # Adjacent too short homopolymer
            ]
            for idx, curr_var in enumerate(self.variants):
                FH_var.write(curr_var)
Ejemplo n.º 25
0
        '%(asctime)s -- [%(filename)s][pid:%(process)d][%(levelname)s] -- %(message)s'
    )
    # Logger named after the script file; its level is taken from args.logging_level
    log = logging.getLogger(os.path.basename(__file__))
    log.setLevel(args.logging_level)
    # Trace the full command line in the log
    log.info("Command: " + " ".join(sys.argv))

    # Get merged records
    fusions = getMergedRecords(args.inputs_variants, args.calling_sources,
                               args.annotation_field, args.shared_filters)

    # Log differences in SR and PR
    logSupportVariance(fusions, log)

    # Write
    # Flatten the merged fusions: each element of fusions is itself iterated and
    # all the yielded records are gathered in one list
    breakends = list(itertools.chain.from_iterable(fusions))
    with VCFIO(args.output_variants, "w") as writer:
        # Header: every element comes from getNewHeaderAttr()
        new_header = getNewHeaderAttr(args)
        writer.samples = new_header["samples"]
        writer.info = new_header["info"]
        writer.format = new_header["format"]
        writer.filter = new_header["filter"]
        writer.writeHeader()
        # Records: written ordered by chromosome name, then reference start, then reference end
        for record in sorted(
                breakends,
                key=lambda record:
            (record.chrom, record.refStart(), record.refEnd())):
            # A record left without any filter tag is written as PASS
            if len(record.filter) == 0:
                record.filter = ["PASS"]
            writer.write(record)
Ejemplo n.º 26
0
        format=
        '%(asctime)s -- [%(filename)s][pid:%(process)d][%(levelname)s] -- %(message)s'
    )
    # Logger named after the script file; its level is fixed to INFO
    log = logging.getLogger(os.path.basename(__file__))
    log.setLevel(logging.INFO)
    # Trace the full command line in the log
    log.info("Command: " + " ".join(sys.argv))

    # Get merged records
    variants = getMergedRecords(args.inputs_variants, args.calling_sources,
                                args.annotations_field, args.shared_filters)

    # Log differences in AF and AD
    logACVariance(variants, log)

    # Write
    with VCFIO(args.output_variants, "w") as FH_out:
        # Header: every element comes from getNewHeaderAttr()
        new_header = getNewHeaderAttr(args)
        FH_out.samples = new_header["samples"]
        FH_out.info = new_header["info"]
        FH_out.format = new_header["format"]
        FH_out.filter = new_header["filter"]
        FH_out.writeHeader()
        # Records: written ordered by chromosome name, then reference start, then reference end
        for record in sorted(
                variants,
                key=lambda record:
            (record.chrom, record.refStart(), record.refEnd())):
            # An empty filter list is written as PASS; a filter equal to None is left unchanged
            if record.filter is not None and len(record.filter) == 0:
                record.filter = ["PASS"]
            FH_out.write(record)
Ejemplo n.º 27
0
def getNewHeaderAttr(args):
    """
    Build the header elements of the merged VCF from the headers of all the inputs.

    The tags coming from the inputs are prefixed by "s<idx>_" (<idx> is the index of the input in args.inputs_variants) and their source is set to the matching name in args.calling_sources. Exceptions: the FILTER tags listed in args.shared_filters and the INFO tags MATEID, RNA_FIRST, SVTYPE and IMPRECISE keep their name. One INFO "s<idx>_VCQUAL" is added for each input.

    :param args: The script's parameters (inputs_variants, calling_sources and shared_filters are read).
    :type args: NameSpace
    :return: VCFHeader elements by category ("filter", "info", "format" and "samples").
    :rtype: dict
    :raises Exception: When the samples are not the same in all the inputs.
    """

    def tagBySource(attr, tag, idx_src):
        """Prefix the header attribute with its source index, set its source and return the new tag."""
        src_tag = "s{}_{}".format(idx_src, tag)
        attr.id = src_tag
        attr.source = args.calling_sources[idx_src]
        return src_tag

    # INFO tags kept without source prefix
    shared_info_tags = {"MATEID", "RNA_FIRST", "SVTYPE", "IMPRECISE"}
    # Identifier used in SRC description for each caller
    id_by_source = {
        name: "s" + str(idx)
        for idx, name in enumerate(args.calling_sources)
    }

    # Elements created by the merge itself
    filter_attr = {}
    info_attr = {}
    info_attr["CIPOS"] = HeaderInfoAttr(
        "CIPOS",
        type="Integer",
        number="2",
        description="Confidence interval around POS")
    info_attr["IDSRC"] = HeaderInfoAttr(
        "IDSRC",
        type="String",
        number=".",
        description="ID of breakend by source")
    info_attr["REFSRC"] = HeaderInfoAttr(
        "REFSRC",
        type="String",
        number="1",
        description="Selected support data (SR, PR) come from this source")
    info_attr["SRC"] = HeaderInfoAttr(
        "SRC",
        type="String",
        number=".",
        description=
        "Fusions callers where the breakend is identified. Possible values: {}"
        .format(id_by_source))
    format_attr = {}
    format_attr["SR"] = HeaderFormatAttr(
        "SR",
        type="Integer",
        number="1",
        description="Count of reads mapping on the fusion junction")
    format_attr["PR"] = HeaderFormatAttr(
        "PR",
        type="Integer",
        number="1",
        description="Count of pairs of reads supporting the fusion")
    format_attr["SRSRC"] = HeaderFormatAttr(
        "SRSRC",
        type="Integer",
        number=".",
        description="Count of reads mapping on the fusion junction by source")
    format_attr["PRSRC"] = HeaderFormatAttr(
        "PRSRC",
        type="Integer",
        number=".",
        description="Count of pairs of reads supporting the fusion by source")

    # Elements coming from the inputs
    samples = None
    for idx_in, curr_in in enumerate(args.inputs_variants):
        with VCFIO(curr_in) as reader:
            # Samples: must be identical in all the inputs
            if samples is None:
                samples = reader.samples
            elif reader.samples != samples:
                raise Exception(
                    "The samples in VCF are not the same: {} in {} and {} in {}."
                    .format(samples, args.inputs_variants[0], reader.samples,
                            curr_in))
            # FILTER: only the filters specific to the caller are renamed
            for tag, attr in reader.filter.items():
                if tag in args.shared_filters:
                    filter_attr[tag] = attr
                else:
                    filter_attr[tagBySource(attr, tag, idx_in)] = attr
            # INFO
            for tag, attr in reader.info.items():
                if tag not in shared_info_tags:
                    info_attr[tagBySource(attr, tag, idx_in)] = attr
                    continue
                # Shared tag: the declaration with the longest description is kept
                # (manage merge between callers with 0 variants and 0 annotations
                # and callers with variants)
                known = info_attr.get(tag)
                if known is None or len(known.description) < len(
                        attr.description):
                    info_attr[tag] = attr
            qual_tag = "s{}_VCQUAL".format(idx_in)
            info_attr[qual_tag] = HeaderInfoAttr(
                qual_tag,
                type="Float",
                number="1",
                description="The variant quality",
                source=args.calling_sources[idx_in])
            # FORMAT: always renamed
            for tag, attr in reader.format.items():
                format_attr[tagBySource(attr, tag, idx_in)] = attr

    return {
        "filter": filter_attr,
        "info": info_attr,
        "format": format_attr,
        "samples": samples
    }
Ejemplo n.º 28
0
def getMergedRecords(inputs_variants, calling_sources, annotations_field,
                     shared_filters):
    """
    Merge VCFRecords coming from several variant callers.

    For each record, the filters not listed in shared_filters, the INFO keys (except annotations_field), the FORMAT keys and the keys of samples data are prefixed by "s<idx>_" where <idx> is the index of the caller. The records with the same name (record.getName()) are merged in one record: the AD and DP of each sample come from the first caller finding the variant and the values of all the callers are stored in ADSRC and DPSRC.

    :param inputs_variants: Pathes to the variants files.
    :type inputs_variants: list
    :param calling_sources: Names of the variants callers (in same order as inputs_variants).
    :type calling_sources: list
    :param annotations_field: Field used to store annotations.
    :type annotations_field: str
    :param shared_filters: Filters tags applying to the variant and independent of caller like filters on annotations. These filters are not renamed to add caller ID as suffix.
    :type shared_filters: set
    :return: Merged VCF records.
    :rtype: list
    """
    variant_by_name = {}
    for idx_in, curr_in in enumerate(inputs_variants):
        curr_caller = calling_sources[idx_in]
        with VCFIO(curr_in) as FH_in:
            log.info("Process {}".format(curr_caller))
            for record in FH_in:
                variant_name = record.getName()
                # Extract AD and DP (must be done before the renaming of FORMAT keys)
                support_by_spl = {}
                for spl in FH_in.samples:
                    support_by_spl[spl] = {
                        "AD": record.getAltAD(spl)[0],
                        "DP": record.getDP(spl)
                    }
                # Rename filters: PASS is removed, shared filters are kept as is
                # and filters specific to the caller are prefixed by its index
                if record.filter is not None:
                    record.filter = [
                        tag if tag in shared_filters else "s{}_{}".format(
                            idx_in, tag) for tag in record.filter
                        if tag != "PASS"
                    ]
                # Rename INFO (the annotations are shared between callers)
                new_info = {}
                for key, val in record.info.items():
                    if key == annotations_field:
                        new_info[key] = val
                    else:
                        new_info["s{}_{}".format(idx_in, key)] = val
                record.info = new_info
                # Backup quality
                if record.qual is not None:
                    record.info["s{}_VCQUAL".format(idx_in)] = record.qual
                # Rename FORMAT
                record.format = [
                    "s{}_{}".format(idx_in, curr_format)
                    for curr_format in record.format
                ]
                for spl_name, spl_info in record.samples.items():
                    renamed_info = {}
                    for key, val in spl_info.items():
                        renamed_info["s{}_{}".format(idx_in, key)] = val
                    record.samples[spl_name] = renamed_info
                # Add to storage
                if variant_name not in variant_by_name:
                    variant_by_name[variant_name] = record
                    # Data source
                    record.info["SRC"] = [curr_caller]
                    # Quality
                    if idx_in != 0:
                        record.qual = None  # For consistency, the quality of the variant comes only from the first caller of the variant
                    # AD and DP by sample (from the first caller finding the variant: callers are in user order)
                    record.format[:0] = ["DP", "AD", "DPSRC", "ADSRC"]
                    for spl_name, spl_data in record.samples.items():
                        spl_data["AD"] = [support_by_spl[spl_name]["AD"]]
                        spl_data["DP"] = support_by_spl[spl_name]["DP"]
                        spl_data["ADSRC"] = [support_by_spl[spl_name]["AD"]]
                        spl_data["DPSRC"] = [support_by_spl[spl_name]["DP"]]
                else:
                    prev_variant = variant_by_name[variant_name]
                    prev_variant.info["SRC"].append(curr_caller)
                    # IDs: the previous callers may have no ID for this variant
                    if record.id is not None:
                        merged_ids = set(record.id.split(";"))
                        if prev_variant.id is not None:
                            merged_ids.update(prev_variant.id.split(";"))
                        prev_variant.id = ";".join(sorted(merged_ids))
                    # FILTERS: union of the filters of all the callers (sorted to
                    # produce a reproducible output)
                    if record.filter is not None:
                        if prev_variant.filter is None:
                            prev_variant.filter = record.filter
                        else:
                            prev_variant.filter = sorted(
                                set(prev_variant.filter) | set(record.filter))
                    # FORMAT
                    prev_variant.format.extend(record.format)
                    # INFO
                    prev_variant.info.update(record.info)
                    for spl_name, spl_data in prev_variant.samples.items():
                        spl_data.update(record.samples[spl_name])
                        spl_data["ADSRC"].append(
                            support_by_spl[spl_name]["AD"])
                        spl_data["DPSRC"].append(
                            support_by_spl[spl_name]["DP"])
    # Return a real list as documented (not a view on the internal dict)
    return list(variant_by_name.values())
    def setUp(self):
        tmp_folder = tempfile.gettempdir()
        unique_id = str(uuid.uuid1())
        self.tmp_initial_pathes = os.path.join(tmp_folder, unique_id + "_{}_initial.vcf")
        self.tmp_haplotyped_pathes = os.path.join(tmp_folder, unique_id + "_{}_haplotyped.vcf")
        self.tmp_expected_pathes = os.path.join(tmp_folder, unique_id + "_{}_expected.vcf")
        self.tmp_out_pathes = os.path.join(tmp_folder, unique_id + "_{}_out.vcf")

        # test cases
        self.test_cases = [
            {  # *a-b, a-b, a b, /
                "initial": {
                    "caller1": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                    "caller2": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                    "caller3": [
                        VCFRecord("chr1", 14, None, "G", ["C"], info={"AD": 100}),
                        VCFRecord("chr1", 18, None, "A", ["G"], info={"AD": 104})
                    ]
                },
                "haplotyped": {
                    "caller1": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                    "caller2": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                    "caller3": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"], info={"MCO_VAR": ["chr1:14=G/C", "chr1:18=A/G"], "AD": 100})]
                },
                "expected": {
                    "caller1": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                    "caller2": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                    "caller3": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"], info={"AD": 104})]
                }
            },
            {  # *a b, a b, a-b, /
                "initial": {
                    "caller1": [
                        VCFRecord("chr2", 14, None, "G", ["C"]),
                        VCFRecord("chr2", 18, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr2", 14, None, "G", ["C"]),
                        VCFRecord("chr2", 18, None, "A", ["G"])
                    ],
                    "caller3": [VCFRecord("chr2", 14, None, "GCGTA", ["CCGTG"])]
                },
                "haplotyped": {
                    "caller1": [VCFRecord("chr2", 14, None, "GCGTA", ["CCGTG"], info={"MCO_VAR": ["chr2:14=G/C", "chr2:18=A/G"]})],
                    "caller2": [VCFRecord("chr2", 14, None, "GCGTA", ["CCGTG"], info={"MCO_VAR": ["chr2:14=G/C", "chr2:18=A/G"]})],
                    "caller3": [VCFRecord("chr2", 14, None, "GCGTA", ["CCGTG"])]
                },
                "expected": {
                    "caller1": [
                        VCFRecord("chr2", 14, None, "G", ["C"]),
                        VCFRecord("chr2", 18, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr2", 14, None, "G", ["C"]),
                        VCFRecord("chr2", 18, None, "A", ["G"])
                    ],
                    "caller3": [
                        VCFRecord("chr2", 14, None, "G", ["C"]),
                        VCFRecord("chr2", 18, None, "A", ["G"])
                    ]
                }
            },
            {  # *a-b c, a-b c, a b c, /
                "initial": {
                    "caller1": [
                        VCFRecord("chr3", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr3", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr3", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr3", 20, None, "A", ["G"])
                    ],
                    "caller3": [
                        VCFRecord("chr3", 14, None, "G", ["C"], info={"AD": 104}),
                        VCFRecord("chr3", 18, None, "A", ["G"], info={"AD": 100}),
                        VCFRecord("chr3", 20, None, "A", ["G"], info={"AD": 98})
                    ]
                },
                "haplotyped": {
                    "caller1": [VCFRecord("chr3", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr3:14=GCGTA/CCGTG", "chr3:20=A/G"]})],
                    "caller2": [VCFRecord("chr3", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr3:14=GCGTA/CCGTG", "chr3:20=A/G"]})],
                    "caller3": [VCFRecord("chr3", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr3:14=G/C", "chr3:18=A/G", "chr3:20=A/G"], "AD": 98})]
                },
                "expected": {
                    "caller1": [
                        VCFRecord("chr3", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr3", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr3", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr3", 20, None, "A", ["G"])
                    ],
                    "caller3": [
                        VCFRecord("chr3", 14, None, "GCGTA", ["CCGTG"], info={"AD": 104}),
                        VCFRecord("chr3", 20, None, "A", ["G"], info={"AD": 98})
                    ]
                }
            },
            {  # *a-b c, a-b c, a b c, a-b-c
                "initial": {
                    "caller1": [
                        VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr4", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr4", 20, None, "A", ["G"])
                    ],
                    "caller3": [
                        VCFRecord("chr4", 14, None, "G", ["C"], info={"AD": 98}),
                        VCFRecord("chr4", 18, None, "A", ["G"], info={"AD": 104}),
                        VCFRecord("chr4", 20, None, "A", ["G"], info={"AD": 100})
                    ],
                    "caller4": [VCFRecord("chr4", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "haplotyped": {
                    "caller1": [VCFRecord("chr4", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr4:14=GCGTA/CCGTG", "chr4:20=A/G"]})],
                    "caller2": [VCFRecord("chr4", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr4:14=GCGTA/CCGTG", "chr4:20=A/G"]})],
                    "caller3": [VCFRecord("chr4", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr4:14=G/C", "chr4:18=A/G", "chr4:20=A/G"], "AD": 98})],
                    "caller4": [VCFRecord("chr4", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "expected": {
                    "caller1": [
                        VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr4", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr4", 20, None, "A", ["G"])
                    ],
                    "caller3": [
                        VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"], info={"AD": 104}),
                        VCFRecord("chr4", 20, None, "A", ["G"], info={"AD": 100})
                    ],
                    "caller4": [
                        VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr4", 20, None, "A", ["G"])
                    ]
                }
            },
            {  # *a-b c, a' a-b c, a b c, a-b-c
                "initial": {
                    "caller1": [
                        VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr5", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr5", 14, None, "G", ["C"], info={"AD": 3}),
                        VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                        VCFRecord("chr5", 20, None, "A", ["G"], info={"AD": 104})
                    ],
                    "caller3": [
                        VCFRecord("chr5", 14, None, "G", ["C"], info={"AD": 110}),
                        VCFRecord("chr5", 18, None, "A", ["G"], info={"AD": 105}),
                        VCFRecord("chr5", 20, None, "A", ["G"], info={"AD": 100})
                    ],
                    "caller4": [VCFRecord("chr5", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "haplotyped": {
                    "caller1": [VCFRecord("chr5", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr5:14=GCGTA/CCGTG", "chr5:20=A/G"]})],
                    "caller2": [
                        VCFRecord("chr5", 14, None, "G", ["C"], info={"AD": 3}),
                        VCFRecord("chr5", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr5:14=GCGTA/CCGTG", "chr5:20=A/G"], "AD": 100})
                    ],
                    "caller3": [VCFRecord("chr5", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr5:14=G/C", "chr5:18=A/G", "chr5:20=A/G"], "AD": 100})],
                    "caller4": [VCFRecord("chr5", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "expected": {
                    "caller1": [
                        VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr5", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr5", 14, None, "G", ["C"], info={"AD": 3}),
                        VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                        VCFRecord("chr5", 20, None, "A", ["G"], info={"AD": 104})
                    ],
                    "caller3": [
                        VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"], info={"AD": 110}),
                        VCFRecord("chr5", 20, None, "A", ["G"], info={"AD": 100})
                    ],
                    "caller4": [
                        VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr5", 20, None, "A", ["G"])
                    ]
                }
            },
            {  # *a b c, a' a-b c, a-b c, a-b-c
                "initial": {
                    "caller1": [
                        VCFRecord("chr6", 14, None, "G", ["C"]),
                        VCFRecord("chr6", 18, None, "A", ["G"]),
                        VCFRecord("chr6", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr6", 14, None, "G", ["C"], info={"AD": 3}),
                        VCFRecord("chr6", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                        VCFRecord("chr6", 20, None, "A", ["G"], info={"AD": 104})
                    ],
                    "caller3": [
                        VCFRecord("chr6", 14, None, "GCGTA", ["CCGTG"], info={"AD": 105}),
                        VCFRecord("chr6", 20, None, "A", ["G"], info={"AD": 101})
                    ],
                    "caller4": [VCFRecord("chr6", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "haplotyped": {
                    "caller1": [VCFRecord("chr6", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr6:14=G/C", "chr6:18=A/G", "chr6:20=A/G"]})],
                    "caller2": [
                        VCFRecord("chr6", 14, None, "G", ["C"], info={"AD": 3}),
                        VCFRecord("chr6", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr6:14=GCGTA/CCGTG", "chr6:20=A/G"], "AD": 100})
                    ],
                    "caller3": [VCFRecord("chr6", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr6:14=GCGTA/CCGTG", "chr6:20=A/G"], "AD": 101})],
                    "caller4": [VCFRecord("chr6", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "expected": {
                    "caller1": [
                        VCFRecord("chr6", 14, None, "G", ["C"]),
                        VCFRecord("chr6", 18, None, "A", ["G"]),
                        VCFRecord("chr6", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr6", 14, None, "G", ["C"], info={"AD": 100}),
                        VCFRecord("chr6", 18, None, "A", ["G"], info={"AD": 100}),
                        VCFRecord("chr6", 20, None, "A", ["G"], info={"AD": 104})
                    ],
                    "caller3": [
                        VCFRecord("chr6", 14, None, "G", ["C"], info={"AD": 105}),
                        VCFRecord("chr6", 18, None, "A", ["G"], info={"AD": 105}),
                        VCFRecord("chr6", 20, None, "A", ["G"], info={"AD": 101})
                    ],
                    "caller4": [
                        VCFRecord("chr6", 14, None, "G", ["C"]),
                        VCFRecord("chr6", 18, None, "A", ["G"]),
                        VCFRecord("chr6", 20, None, "A", ["G"])
                    ]
                }
            },
            {  # *a b c, a-b b' c, a-b c, a-b-c
                "initial": {
                    "caller1": [
                        VCFRecord("chr7", 14, None, "G", ["C"]),
                        VCFRecord("chr7", 18, None, "A", ["G"]),
                        VCFRecord("chr7", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr7", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                        VCFRecord("chr7", 18, None, "A", ["G"], info={"AD": 3}),
                        VCFRecord("chr7", 20, None, "A", ["G"], info={"AD": 104})
                    ],
                    "caller3": [
                        VCFRecord("chr7", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr7", 20, None, "A", ["G"])
                    ],
                    "caller4": [VCFRecord("chr7", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "haplotyped": {
                    "caller1": [VCFRecord("chr7", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr7:14=G/C", "chr7:18=A/G", "chr7:20=A/G"]})],
                    "caller2": [
                        VCFRecord("chr7", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr7:14=GCGTA/CCGTG", "chr7:20=A/G"], "AD": 100}),
                        VCFRecord("chr7", 18, None, "G", ["C"], info={"AD": 3})
                    ],
                    "caller3": [VCFRecord("chr7", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr7:14=GCGTA/CCGTG", "chr7:20=A/G"]})],
                    "caller4": [VCFRecord("chr7", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "expected": {
                    "caller1": [
                        VCFRecord("chr7", 14, None, "G", ["C"]),
                        VCFRecord("chr7", 18, None, "A", ["G"]),
                        VCFRecord("chr7", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr7", 14, None, "G", ["C"], info={"AD": 100}),
                        VCFRecord("chr7", 18, None, "A", ["G"], info={"AD": 100}),
                        VCFRecord("chr7", 20, None, "A", ["G"], info={"AD": 104})
                    ],
                    "caller3": [
                        VCFRecord("chr7", 14, None, "G", ["C"]),
                        VCFRecord("chr7", 18, None, "A", ["G"]),
                        VCFRecord("chr7", 20, None, "A", ["G"])
                    ],
                    "caller4": [
                        VCFRecord("chr7", 14, None, "G", ["C"]),
                        VCFRecord("chr7", 18, None, "A", ["G"]),
                        VCFRecord("chr7", 20, None, "A", ["G"])
                    ]
                }
            },
            {  # *a-b c, a-b b' c, a b c, a-b-c
                "initial": {
                    "caller1": [
                        VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr8", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                        VCFRecord("chr8", 18, None, "A", ["G"], info={"AD": 3}),
                        VCFRecord("chr8", 20, None, "A", ["G"], info={"AD": 104})
                    ],
                    "caller3": [
                        VCFRecord("chr8", 14, None, "G", ["C"], info={"AD": 110}),
                        VCFRecord("chr8", 18, None, "A", ["G"], info={"AD": 105}),
                        VCFRecord("chr8", 20, None, "A", ["G"], info={"AD": 100})
                    ],
                    "caller4": [VCFRecord("chr8", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "haplotyped": {
                    "caller1": [VCFRecord("chr8", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr8:14=GCGTA/CCGTG", "chr8:20=A/G"]})],
                    "caller2": [
                        VCFRecord("chr8", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr8:14=GCGTA/CCGTG", "chr8:20=A/G"], "AD": 100}),
                        VCFRecord("chr8", 18, None, "G", ["C"], info={"AD": 3})
                    ],
                    "caller3": [VCFRecord("chr8", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr8:14=G/C", "chr8:18=A/G", "chr8:20=A/G"], "AD": 100})],
                    "caller4": [VCFRecord("chr8", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "expected": {
                    "caller1": [
                        VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr8", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                        VCFRecord("chr8", 18, None, "A", ["G"], info={"AD": 3}),
                        VCFRecord("chr8", 20, None, "A", ["G"], info={"AD": 104})
                    ],
                    "caller3": [
                        VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"], info={"AD": 110}),
                        VCFRecord("chr8", 20, None, "A", ["G"], info={"AD": 100})
                    ],
                    "caller4": [
                        VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr8", 20, None, "A", ["G"])
                    ]
                }
            },
            {  # *a' a-b c, a-b b' c, a b c, a-b-c
                "initial": {
                    "caller1": [
                        VCFRecord("chr9", 14, None, "G", ["C"]),
                        VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr9", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                        VCFRecord("chr9", 18, None, "A", ["G"], info={"AD": 3}),
                        VCFRecord("chr9", 20, None, "A", ["G"], info={"AD": 104})
                    ],
                    "caller3": [
                        VCFRecord("chr9", 14, None, "G", ["C"], info={"AD": 110}),
                        VCFRecord("chr9", 18, None, "A", ["G"], info={"AD": 105}),
                        VCFRecord("chr9", 20, None, "A", ["G"], info={"AD": 100})
                    ],
                    "caller4": [VCFRecord("chr9", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "haplotyped": {
                    "caller1": [
                        VCFRecord("chr9", 14, None, "G", ["C"]),
                        VCFRecord("chr9", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr9:14=GCGTA/CCGTG", "chr9:20=A/G"]})
                    ],
                    "caller2": [
                        VCFRecord("chr9", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr9:14=GCGTA/CCGTG", "chr9:20=A/G"], "AD": 100}),
                        VCFRecord("chr9", 18, None, "G", ["C"], info={"AD": 3})
                    ],
                    "caller3": [VCFRecord("chr9", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr9:14=G/C", "chr9:18=A/G", "chr9:20=A/G"], "AD": 100})],
                    "caller4": [VCFRecord("chr9", 14, None, "GCGTATCA", ["CCGTGTCG"])]
                },
                "expected": {
                    "caller1": [
                        VCFRecord("chr9", 14, None, "G", ["C"]),
                        VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr9", 20, None, "A", ["G"])
                    ],
                    "caller2": [
                        VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                        VCFRecord("chr9", 18, None, "A", ["G"], info={"AD": 3}),
                        VCFRecord("chr9", 20, None, "A", ["G"], info={"AD": 104})
                    ],
                    "caller3": [
                        VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"], info={"AD": 110}),
                        VCFRecord("chr9", 20, None, "A", ["G"], info={"AD": 100})
                    ],
                    "caller4": [
                        VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"]),
                        VCFRecord("chr9", 20, None, "A", ["G"])
                    ]
                }
            }
        ]

        # Get callers
        callers = set()
        for curr_test in self.test_cases:
            for curr_caller in curr_test["initial"]:
                callers.add(curr_caller)
        self.callers = sorted(list(callers))

        # Write files
        for curr_caller in self.callers:
            # Initial
            with VCFIO(self.tmp_initial_pathes.format(curr_caller), "w") as handle_out:
                handle_out.info = {
                    "AD": HeaderInfoAttr("AD", "Alternative allele depth.", type="Integer", number="1")
                }
                handle_out.extra_header = ["##source={}".format(curr_caller)]
                handle_out.writeHeader()
                for curr_test in self.test_cases:
                    if curr_caller in curr_test["initial"]:
                        for curr_var in curr_test["initial"][curr_caller]:
                            handle_out.write(curr_var)
            # Haplotyped
            with VCFIO(self.tmp_haplotyped_pathes.format(curr_caller), "w") as handle_out:
                handle_out.info = {
                    "AD": HeaderInfoAttr("AD", "Alternative allele depth.", type="Integer", number="1"),
                    "MCO_VAR": HeaderInfoAttr("MCO_VAR", "Name of the variants merged because their occur on same reads.", type="String", number=".")
                }
                handle_out.extra_header = ["##source={}".format(curr_caller)]
                handle_out.writeHeader()
                for curr_test in self.test_cases:
                    if curr_caller in curr_test["haplotyped"]:
                        for curr_var in curr_test["haplotyped"][curr_caller]:
                            handle_out.write(curr_var)
            # Expected
            with VCFIO(self.tmp_expected_pathes.format(curr_caller), "w") as handle_out:
                handle_out.info = {
                    "AD": HeaderInfoAttr("AD", "Alternative allele depth.", type="Integer", number="1")
                }
                handle_out.extra_header = ["##source={}".format(curr_caller)]
                handle_out.writeHeader()
                for curr_test in self.test_cases:
                    if curr_caller in curr_test["expected"]:
                        for curr_var in curr_test["expected"][curr_caller]:
                            handle_out.write(curr_var)
Ejemplo n.º 30
0
def getNewHeaderAttr(args):
    """
    Build the header declarations (FILTER, INFO, FORMAT, samples) of the merged VCF.

    Every input VCF is read in the order of ``args.inputs_variants``. Its
    declarations are prefixed with ``s<input index>_`` and tagged with the
    corresponding entry of ``args.calling_sources``, except for:

    * filters listed in ``args.shared_filters``, which keep their name;
    * the INFO field named ``args.annotations_field``, which keeps its name
      (the declaration with the longest description is retained).

    A ``s<input index>_VCQUAL`` INFO field is also declared for each input.

    :param args: The script's parameters.
    :type args: NameSpace
    :return: VCFHeader elements (filter, info, format, samples).
    :rtype: dict
    :raises Exception: When an input VCF does not declare the same samples as the first one.
    """
    def relabel(attr, src_idx, tag):
        """Prefix the declaration ID with its source index, record its caller and return the new ID."""
        prefixed_tag = "s{}_{}".format(src_idx, tag)
        attr.id = prefixed_tag
        attr.source = args.calling_sources[src_idx]
        return prefixed_tag

    merged_filter = {}

    # INFO: the SRC description lists the short code given to each caller
    code_by_source = {}
    for src_idx, src_name in enumerate(args.calling_sources):
        code_by_source[src_name] = "s" + str(src_idx)
    merged_info = {
        "SRC": HeaderInfoAttr(
            "SRC",
            type="String",
            number=".",
            description="Variant callers where the variant is identified. Possible values: {}".format(code_by_source)
        )
    }

    # FORMAT: fields computed by the merge itself (all are integers)
    merged_format = {}
    for fmt_id, fmt_number, fmt_desc in (
        ("AD", "A", "Allele Depth"),
        ("DP", "1", "Total Depth"),
        ("ADSRC", ".", "Allele Depth by source"),
        ("DPSRC", ".", "Total Depth by source"),
    ):
        merged_format[fmt_id] = HeaderFormatAttr(
            fmt_id,
            type="Integer",
            number=fmt_number,
            description=fmt_desc
        )

    # Declarations coming from each caller
    merged_samples = None
    for src_idx, in_path in enumerate(args.inputs_variants):
        with VCFIO(in_path) as reader:
            # Samples: all the inputs must describe the same ones
            if merged_samples is None:
                merged_samples = reader.samples
            elif reader.samples != merged_samples:
                msg = "The samples in VCF are not the same: {} in {} and {} in {}.".format(
                    merged_samples, args.inputs_variants[0], reader.samples, in_path
                )
                raise Exception(msg)
            # FILTER: only the filters that are not shared are renamed by caller
            for tag, attr in reader.filter.items():
                if tag in args.shared_filters:
                    merged_filter[tag] = attr
                else:
                    prefixed_tag = relabel(attr, src_idx, tag)
                    merged_filter[prefixed_tag] = attr
            # INFO
            for tag, attr in reader.info.items():
                if tag != args.annotations_field:
                    prefixed_tag = relabel(attr, src_idx, tag)
                    merged_info[prefixed_tag] = attr
                    continue
                # Keep the longest annotations description: manage merge
                # between callers with 0 variants (and 0 annotations) and
                # callers with variants
                if tag in merged_info and len(merged_info[tag].description) >= len(attr.description):
                    continue
                merged_info[tag] = attr
            vcqual_tag = "s{}_VCQUAL".format(src_idx)
            merged_info[vcqual_tag] = HeaderInfoAttr(
                vcqual_tag,
                type="Float",
                number="1",
                description="The variant quality",
                source=args.calling_sources[src_idx]
            )
            # FORMAT: every caller field is renamed
            for tag, attr in reader.format.items():
                prefixed_tag = relabel(attr, src_idx, tag)
                merged_format[prefixed_tag] = attr

    header_attr = {}
    header_attr["filter"] = merged_filter
    header_attr["info"] = merged_info
    header_attr["format"] = merged_format
    header_attr["samples"] = merged_samples
    return header_attr