Example #1
    def to_sv_interval(self):
        sv_type = PINDEL_TO_SV_TYPE[self.sv_type]
        if sv_type not in PindelReader.svs_supported:
            return None

        if sv_type != "INS":
            return SVInterval(self.chromosome,
                              self.start_pos,
                              self.end_pos,
                              name=self.name,
                              sv_type=sv_type,
                              length=self.sv_len,
                              sources=pindel_source,
                              info=self.info,
                              native_sv=self)
        else:
            return SVInterval(self.chromosome,
                              self.start_pos,
                              self.start_pos,
                              self.name,
                              sv_type=sv_type,
                              length=self.sv_len,
                              sources=pindel_source,
                              native_sv=self,
                              wiggle=100,
                              info=self.info,
                              gt=self.gt)
Example #2
    def to_sv_interval(self):
        if self.sv_type not in valid_breakdancer_svs:
            return None

        if self.sv_type == "DEL" or self.sv_type == "INV":
            return SVInterval(
                self.chr1,
                self.pos1 + 1,
                self.pos2,  # fudge
                name=self.name,
                sv_type=self.sv_type,
                length=self.sv_len,
                sources=breakdancer_source,
                cipos=[0, self.pos2 - self.pos1 - abs(self.sv_len)],
                info=self.info,
                native_sv=self)
        elif self.sv_type == "INS":
            return SVInterval(
                self.chr1,
                self.pos1 + 1,
                self.pos2,  # fudge
                name=self.name,
                sv_type=self.sv_type,
                length=self.sv_len,
                sources=breakdancer_source,
                cipos=[0, self.pos2 - self.pos1],
                info=self.info,
                native_sv=self)
        else:
            logger.error("Bad SV type: " + repr(self))
Example #3
    def to_sv_interval(self):
        if self.sv_type not in BreakDancerReader.svs_supported:
            return None

        if (self.chr1 != self.chr2) and (self.sv_type != "CTX"):
            logger.error("Bad entry: " + repr(self))
            return None

        if self.sv_type == "DEL" or self.sv_type == "INV":
            return SVInterval(
                self.chr1,
                self.pos1 + 1,
                self.pos2,  # fudge
                name=self.name,
                sv_type=self.sv_type,
                length=self.sv_len,
                sources=breakdancer_source,
                cipos=[0, self.pos2 - self.pos1 - abs(self.sv_len)],
                info=self.info,
                native_sv=self)
        elif self.sv_type == "INS":
            return SVInterval(
                self.chr1,
                self.pos1 + 1,
                self.pos2,  # fudge
                name=self.name,
                sv_type=self.sv_type,
                length=self.sv_len,
                sources=breakdancer_source,
                cipos=[0, self.pos2 - self.pos1],
                info=self.info,
                native_sv=self)
        elif self.sv_type == "ITX" or self.sv_type == "CTX":
            # As in BreakDancer native output, we always assume that:
            #   For CTX: chr2 >= chr1
            #   For ITX: pos2 >= pos1
            return SVInterval(
                self.chr1,
                self.pos1 + 1,
                self.pos2,  # fudge
                name=self.name,
                sv_type=self.sv_type,
                length=0,
                sources=breakdancer_source,
                cipos=[0, 0],
                info=self.info,
                native_sv=self,
                chrom2=self.chr2)
        else:
            logger.error("Bad SV type: " + repr(self))

        return None
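
The converters above share one contract: a native record either produces an SVInterval or returns None, which tells the caller to skip it. Below is a minimal sketch of driving a reader this way, mirroring the loops in the run_metasv examples further down; the helper function and the file path are illustrative, not part of the original code.

# Sketch only: gather intervals per SV type from one native file, skipping records
# whose to_sv_interval() returns None. BreakDancerReader usage matches Example #9;
# the path below is hypothetical.
from collections import defaultdict

def collect_intervals(reader_class, native_file):
    intervals_by_type = defaultdict(list)
    for record in reader_class(native_file):
        interval = record.to_sv_interval()
        if not interval:
            continue  # unsupported or malformed SV types are dropped here
        intervals_by_type[interval.sv_type].append(interval)
    return intervals_by_type

# intervals = collect_intervals(BreakDancerReader, "sample.breakdancer.out")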
Example #4
    def to_sv_interval(self):
        return SVInterval(self.chromosome,
                          self.start,
                          self.end,
                          name=self.name,
                          sv_type=self.sv_type,
                          length=self.sv_len,
                          sources=cnvnator_source,
                          info=self.info,
                          native_sv=self)
Example #5
    def to_sv_interval(self):
        if self.sv_type not in CNVnatorReader.svs_supported:
            return None

        return SVInterval(self.chromosome,
                          self.start,
                          self.end,
                          name=self.name,
                          sv_type=self.sv_type,
                          length=self.sv_len,
                          sources=cnvnator_source,
                          info=self.info,
                          native_sv=self)
Example #6
    def to_sv_interval(self):
        if self.sv_type not in BreakSeqReader.svs_supported:
            return None

        return SVInterval(self.chromosome,
                          self.start,
                          self.end,
                          name=self.name,
                          sv_type=self.sv_type,
                          length=self.sv_len,
                          sources=source,
                          cipos=[],
                          info=self.info,
                          native_sv=self)
Example #7
    def to_sv_interval(self):
        if self.sv_type not in valid_svs:
            return None

        return SVInterval(self.chromosome,
                          self.start,
                          self.end,
                          name=self.name,
                          sv_type=self.sv_type,
                          length=self.sv_len,
                          sources=source,
                          cipos=[],
                          info=self.info,
                          native_sv=self)
Example #8
def run_metasv(args):
    logger.info("Running MetaSV %s" % __version__)
    logger.info("Arguments are " + str(args))

    # Check if there is work to do
    if not (args.pindel_vcf + args.breakdancer_vcf + args.breakseq_vcf +
            args.cnvnator_vcf + args.pindel_native + args.breakdancer_native +
            args.breakseq_native + args.cnvnator_native + args.manta_vcf +
            args.lumpy_vcf + args.cnvkit_vcf + args.wham_vcf):
        logger.warning("Nothing to merge since no SV file specified")

    # Simple check for arguments
    if not args.disable_assembly:
        if not args.spades:
            logger.error("Spades executable not specified")
            return os.EX_USAGE

        if not args.age:
            logger.error("AGE executable not specified")
            return os.EX_USAGE

    # Create the directories for working
    bedtools_tmpdir = os.path.join(args.workdir, "bedtools")
    create_dirs([args.workdir, args.outdir, bedtools_tmpdir])

    # Reference handling
    if not os.path.isfile(args.reference + ".fai"):
        logger.error("Reference file %s is not indexed" % (args.reference))
        return 1

    fasta_handle = pysam.Fastafile(args.reference) if os.path.isfile(
        args.reference) else None
    contigs = get_contigs(args.reference)
    include_intervals = sorted([
        SVInterval(contig.name,
                   0,
                   contig.length,
                   contig.name,
                   "include",
                   length=contig.length) for contig in contigs
    ])

    # Generate the list of contigs to process
    contig_whitelist = set(args.chromosomes) if args.chromosomes else set(
        [contig.name for contig in contigs])
    if args.keep_standard_contigs:
        contig_whitelist &= set([str(i) for i in xrange(1, 23)] +
                                ["chr%d" % (i) for i in xrange(1, 23)] +
                                ["X", "Y", "MT", "chrX", "chrY", "chrM"])
    logger.info("Only SVs on the following contigs will be reported: %s" %
                (sorted(list(contig_whitelist))))

    # Load the intervals from different files
    vcf_name_list = [("CNVnator", args.cnvnator_vcf),
                     ("Pindel", args.pindel_vcf),
                     ("BreakDancer", args.breakdancer_vcf),
                     ("BreakSeq", args.breakseq_vcf),
                     ("HaplotypeCaller", args.gatk_vcf),
                     ("Lumpy", args.lumpy_vcf), ("Manta", args.manta_vcf),
                     ("CNVkit", args.cnvkit_vcf), ("WHAM", args.wham_vcf)]
    native_name_list = [("CNVnator", args.cnvnator_native, CNVnatorReader),
                        ("Pindel", args.pindel_native, PindelReader),
                        ("BreakSeq", args.breakseq_native, BreakSeqReader),
                        ("BreakDancer", args.breakdancer_native,
                         BreakDancerReader)]

    tools = []
    intervals = {}
    sv_types = set()

    gap_intervals = []
    if args.filter_gaps:
        gaps = args.gaps if args.gaps else get_gaps_file(contig_whitelist)
        gap_intervals = sorted(load_gap_intervals(gaps))

    # Handles native input
    logger.info("Load native files")
    for toolname, nativename, svReader in native_name_list:
        # If no native file is given, ignore the tool
        if not nativename: continue

        tools.append(toolname)
        intervals[toolname] = defaultdict(list)

        for native_file in nativename:
            for record in svReader(native_file,
                                   svs_to_report=args.svs_to_report):
                interval = record.to_sv_interval()
                if not interval:
                    # This is the case for SVs we want to skip
                    continue
                # Filter BreakDancer artifact INVs with size < read_length + 4 * isize_sd
                BD_min_inv_len = args.mean_read_length + 4 * args.isize_sd
                if (toolname == "BreakDancer" and interval.sv_type == "INV"
                        and abs(interval.length) < BD_min_inv_len):
                    continue
                if not interval_overlaps_interval_list(
                        interval,
                        gap_intervals) and interval.chrom in contig_whitelist:

                    # Check length
                    if interval.length < args.minsvlen and interval.sv_type not in [
                            "ITX", "CTX"
                    ]:
                        continue

                    # Set wiggle
                    if interval.sv_type not in ["ITX", "CTX"]:
                        interval.wiggle = max(
                            args.inswiggle if interval.sv_type == "INS" else 0,
                            args.wiggle)
                    else:
                        interval.wiggle = TX_WIGGLE

                    intervals[toolname][interval.sv_type].append(interval)
        sv_types |= set(intervals[toolname].keys())

    # Handles the VCF input cases, we will just deal with these cases
    logger.info("Load VCF files")
    for toolname, vcfname in vcf_name_list:
        # If no VCF is given, ignore the tool
        if not vcfname:
            continue

        tools.append(toolname)
        intervals[toolname] = {}

        vcf_list = []
        for vcffile in vcfname:
            if os.path.isdir(vcffile):
                logger.info(
                    "Will load from per-chromosome VCFs from directory %s for tool %s"
                    % (vcffile, toolname))
                vcf_list += [
                    os.path.join(vcffile, "%s.vcf.gz" % contig.name)
                    for contig in contigs
                    if (not contig_whitelist or contig.name in contig_whitelist
                        )
                ]
            else:
                vcf_list.append(vcffile)

        for vcffile in vcf_list:
            load_intervals(vcffile,
                           intervals[toolname],
                           gap_intervals,
                           include_intervals,
                           toolname,
                           contig_whitelist,
                           minsvlen=args.minsvlen,
                           wiggle=args.wiggle,
                           inswiggle=args.inswiggle,
                           svs_to_report=args.svs_to_report,
                           maxsvlen=args.maxsvlen)
        sv_types |= set(intervals[toolname].keys())

    logger.info("SV types are %s" % (str(sv_types)))
    tool_merged_intervals = {}
    final_intervals = []

    # This will just output per-tool VCFs, no intra-tool merging is done yet
    if args.enable_per_tool_output:
        logger.info("Output per-tool VCFs")
        for toolname in intervals:
            tool_out = os.path.join(args.outdir, "%s.vcf" % (toolname.lower()))

            logger.info("Outputting single tool VCF for %s" % (str(toolname)))
            vcf_template_reader = vcf.Reader(
                open(os.path.join(mydir, "resources/template.vcf"), "r"))
            vcf_template_reader.samples = [args.sample]

            intervals_tool = []
            tool_out_fd = open(tool_out, "w")
            vcf_writer = vcf.Writer(tool_out_fd, vcf_template_reader)
            chr_intervals_tool = {contig.name: [] for contig in contigs}
            for sv_type in sv_types:
                if sv_type in intervals[toolname]:
                    intervals_tool.extend([
                        copy.deepcopy(interval)
                        for interval in intervals[toolname][sv_type]
                    ])
            for interval in intervals_tool:
                # Marghoob says that this is just to fill-in some metadata
                interval.do_validation(args.overlap_ratio)

                interval.fix_pos()
                chr_intervals_tool[interval.chrom].append(interval)

            for contig in contigs:
                chr_intervals_tool[contig.name].sort()
                for interval in chr_intervals_tool[contig.name]:
                    vcf_record = interval.to_vcf_record(
                        fasta_handle, args.sample)
                    if vcf_record is not None:
                        vcf_writer.write_record(vcf_record)
            tool_out_fd.close()
            vcf_writer.close()
            logger.info("Indexing single tool VCF for %s" % (str(toolname)))
            pysam.tabix_index(tool_out, force=True, preset="vcf")

    # Do merging here
    logger.info("Do merging")
    for sv_type in sv_types:
        logger.info("Processing SVs of type %s" % sv_type)
        tool_merged_intervals[sv_type] = []

        # Do the intra-tool merging
        logger.info("Intra-tool Merging SVs of type %s" % sv_type)
        for tool in tools:
            logger.debug("Is %s in tool keys? %s" %
                         (sv_type, str(intervals[tool].keys())))
            if sv_type not in intervals[tool]:
                logger.debug("%s not in tool %s" % (sv_type, tool))
                continue
            logger.info("First level merging for %s for tool %s" %
                        (sv_type, tool))
            tool_merged_intervals[sv_type] += merge_intervals(
                intervals[tool][sv_type])

        # Do the inter-tool merging
        logger.info("Inter-tool Merging SVs of type %s" % sv_type)
        final_intervals.extend(
            merge_intervals_recursively(tool_merged_intervals[sv_type],
                                        args.overlap_ratio))

    final_chr_intervals = {contig.name: [] for contig in contigs}
    for interval in final_intervals:
        interval.do_validation(args.overlap_ratio)
        interval.fix_pos()
        if args.minsvlen <= interval.length <= args.maxsvlen or interval.sv_type in [
                "ITX", "CTX"
        ]:
            final_chr_intervals[interval.chrom].append(interval)

    # This is the merged VCF without assembly, ok for deletions at this point
    logger.info("Output merged VCF without assembly ")
    vcf_template_reader = vcf.Reader(
        open(os.path.join(mydir, "resources/template.vcf"), "r"))
    vcf_template_reader.samples = [args.sample]
    preasm_vcf = os.path.join(args.workdir, "pre_asm.vcf")
    vcf_fd = open(preasm_vcf, "w")
    vcf_writer = vcf.Writer(vcf_fd, vcf_template_reader)

    final_stats = {}

    bed_intervals = []
    for contig in contigs:
        final_chr_intervals[contig.name].sort()
        for interval in final_chr_intervals[contig.name]:
            vcf_record = interval.to_vcf_record(fasta_handle)
            if vcf_record is not None:
                key = (interval.sv_type,
                       "PASS" if interval.is_validated else "LowQual",
                       "PRECISE" if interval.is_precise else "IMPRECISE",
                       tuple(sorted(list(interval.sources))))
                if key not in final_stats:
                    final_stats[key] = 0
                final_stats[key] += 1
                vcf_writer.write_record(vcf_record)
            bed_interval = interval.to_bed_interval(args.sample)
            if bed_interval is not None:
                bed_intervals.append(bed_interval)
    vcf_fd.close()
    vcf_writer.close()

    # Also save a BED file representation of the merged variants without assembly
    merged_bed = None
    if bed_intervals:
        merged_bed = os.path.join(args.workdir, "metasv.bed")
        pybedtools.BedTool(bed_intervals).saveas(merged_bed)

    for key in sorted(final_stats.keys()):
        logger.info(str(key) + ":" + str(final_stats[key]))

    final_vcf = os.path.join(args.outdir, "variants.vcf")

    # Run assembly here
    if not args.disable_assembly:
        logger.info("Running assembly")

        spades_tmpdir = os.path.join(args.workdir, "spades")
        age_tmpdir = os.path.join(args.workdir, "age")

        create_dirs([spades_tmpdir, age_tmpdir])

        assembly_bed = merged_bed

        # this does the improved assembly location finder with softclipped reads
        if args.boost_sc:
            logger.info("Generating Soft-Clipping intervals.")
            assembly_bed = parallel_generate_sc_intervals(
                args.bams,
                list(contig_whitelist),
                merged_bed,
                args.workdir,
                num_threads=args.num_threads,
                min_support_ins=args.min_support_ins,
                min_support_frac_ins=args.min_support_frac_ins,
                max_intervals=args.max_ins_intervals,
                min_mapq=args.min_mapq,
                min_avg_base_qual=args.min_avg_base_qual,
                min_soft_clip=args.min_soft_clip,
                max_nm=args.max_nm,
                min_matches=args.min_matches,
                isize_mean=args.isize_mean,
                isize_sd=args.isize_sd,
                svs_to_softclip=args.svs_to_softclip,
                overlap_ratio=args.overlap_ratio,
                mean_read_length=args.mean_read_length,
                mean_read_coverage=args.mean_read_coverage,
                min_ins_cov_frac=args.min_ins_cov_frac,
                max_ins_cov_frac=args.max_ins_cov_frac,
                assembly_max_tools=args.assembly_max_tools)
            logger.info("Generated intervals for assembly in %s" %
                        assembly_bed)

        logger.info("Will run assembly now")

        assembled_fasta, ignored_bed = run_spades_parallel(
            bams=args.bams,
            spades=args.spades,
            spades_options=args.spades_options,
            bed=assembly_bed,
            work=spades_tmpdir,
            pad=args.assembly_pad,
            nthreads=args.num_threads,
            chrs=list(contig_whitelist),
            max_interval_size=args.spades_max_interval_size,
            timeout=args.spades_timeout,
            svs_to_assemble=args.svs_to_assemble,
            stop_on_fail=args.stop_spades_on_fail,
            max_read_pairs=args.extraction_max_read_pairs,
            assembly_max_tools=args.assembly_max_tools)
        breakpoints_bed = run_age_parallel(
            intervals_bed=assembly_bed,
            reference=args.reference,
            assembly=assembled_fasta,
            pad=args.assembly_pad,
            age=args.age,
            timeout=args.age_timeout,
            chrs=list(contig_whitelist),
            nthreads=args.num_threads,
            min_contig_len=AGE_MIN_CONTIG_LENGTH,
            min_del_subalign_len=args.min_del_subalign_len,
            min_inv_subalign_len=args.min_inv_subalign_len,
            age_window=args.age_window,
            age_workdir=age_tmpdir)

        final_bed = os.path.join(args.workdir, "final.bed")
        if breakpoints_bed:
            if ignored_bed:
                pybedtools.BedTool(breakpoints_bed) \
                    .cat(pybedtools.BedTool(ignored_bed), postmerge=False) \
                    .sort().saveas(final_bed)
            else:
                pybedtools.BedTool(breakpoints_bed).saveas(final_bed)
        elif ignored_bed:
            pybedtools.BedTool(ignored_bed).sort().saveas(final_bed)
        else:
            final_bed = None

        genotyped_bed = parallel_genotype_intervals(
            final_bed,
            args.bams,
            workdir=os.path.join(args.workdir, "genotyping"),
            nthreads=args.num_threads,
            chromosomes=list(contig_whitelist),
            window=args.gt_window,
            isize_mean=args.isize_mean,
            isize_sd=args.isize_sd,
            normal_frac_threshold=args.gt_normal_frac)

        logger.info("Output final VCF file")

        convert_metasv_bed_to_vcf(bedfile=genotyped_bed,
                                  vcf_out=final_vcf,
                                  workdir=args.workdir,
                                  sample=args.sample,
                                  reference=args.reference,
                                  pass_calls=False)
    else:
        shutil.copy(preasm_vcf, final_vcf)
        pysam.tabix_index(final_vcf, force=True, preset="vcf")

    logger.info("Clean up pybedtools")

    pybedtools.cleanup(remove_all=True)

    logger.info("All Done!")

    return os.EX_OK
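
run_metasv(args) in this example pulls every input from a single argparse-style namespace: per-tool VCF and native file lists, the reference, working directories, and the assembly and genotyping settings. Below is a partial sketch of building such a namespace by hand; the values are hypothetical and many required attributes are omitted, so the final call is left commented out.

# Sketch only: a hand-built namespace with a few of the attributes read above.
# Paths and the sample name are placeholders; a real call needs the full
# MetaSV argument set (minsvlen, wiggle, overlap_ratio, svs_to_report, ...).
import argparse

args = argparse.Namespace(
    sample="NA12878",
    reference="ref.fa",            # ref.fa.fai must exist alongside it
    workdir="work", outdir="out",
    pindel_native=["pindel.out"],  # every per-tool input is a list of files
    breakdancer_native=[], breakseq_native=[], cnvnator_native=[],
    pindel_vcf=[], breakdancer_vcf=[], breakseq_vcf=[], cnvnator_vcf=[],
    gatk_vcf=[], manta_vcf=[], lumpy_vcf=[], cnvkit_vcf=[], wham_vcf=[],
    disable_assembly=True,         # skip the SPAdes/AGE stages
    chromosomes=[], keep_standard_contigs=False)

# run_metasv(args)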
Example #9
def run_metasv(sample,
               reference,
               pindel_vcf=[],
               pindel_native=[],
               breakdancer_vcf=[],
               breakdancer_native=[],
               breakseq_vcf=[],
               breakseq_native=[],
               cnvnator_vcf=[],
               cnvnator_native=[],
               gatk_vcf=[],
               gaps=None,
               filter_gaps=False,
               keep_standard_contigs=False,
               wiggle=WIGGLE,
               overlap_ratio=OVERLAP_RATIO,
               workdir="work",
               outdir="out",
               boost_ins=False,
               bam=None,
               chromosomes=[],
               num_threads=1,
               spades=None,
               age=None,
               disable_assembly=True,
               minsvlen=MIN_SV_LENGTH,
               inswiggle=INS_WIGGLE,
               enable_per_tool_output=False,
               min_support=MIN_SUPPORT,
               min_support_frac=MIN_SUPPORT_FRAC,
               max_intervals=MAX_INTERVALS,
               disable_deletion_assembly=False,
               stop_spades_on_fail=False):
    """Invoke the MetaSV workflow.

    Positional arguments:
    sample -- Sample name
    reference -- Path to a samtools indexed reference FASTA
    
    Keyword arguments:
    pindel_vcf -- List of Pindel VCFs generated by SVGenotyper
    pindel_native -- List of Pindel native output files
    breakdancer_vcf -- List of BreakDancer VCFs generated by SVGenotyper
    breakdancer_native -- List of BreakDancer native output files
    breakseq_vcf -- List of BreakSeq2 VCFs
    breakseq_native -- List of BreakSeq native GFF outputs
    cnvnator_vcf -- List of CNVnator VCFs generated by cnvnator2VCF.pl
    cnvnator_native -- List of CNVnator native output files
    gatk_vcf -- List of Indel VCFs generated by GATK's HaplotypeCaller
    gaps -- Gaps BED file
    filter_gaps -- Flag to filter out SVs overlapping gaps (default False)
    keep_standard_contigs -- Flag to only generate SVs for the major contigs 1, 2, ..., 22, X, Y, MT (default False)
    wiggle -- Wiggle for SV interval comparison (default 100)
    overlap_ratio -- Reciprocal overlap ratio for SV interval comparison (default 0.5)
    workdir -- Scratch directory for MetaSV (default "work")
    outdir -- Output directory for MetaSV (default "out")
    boost_ins -- Enable MetaSV's soft-clip based insertion detection (default False)
    bam -- Alignment BAM for assembly and insertion detection (default None)
    chromosomes -- If specified, indicates the list of chromosomes to process (default [])
    num_threads -- Number of worker threads to use for assembly steps (default 1)
    spades -- Path for the SPAdes executable (default None)
    age -- Path for the AGE executable (default None)
    disable_assembly -- Flag to disable assembly (default True)
    enable_per_tool_output -- Flag to also output merged calls for each tool (default False)

    """

    # Check if there is work to do
    if not (pindel_vcf + breakdancer_vcf + breakseq_vcf + cnvnator_vcf +
            pindel_native + breakdancer_native + breakseq_native +
            cnvnator_native):
        logger.error("Nothing to do since no SV file specified")
        return 1

    # Create the directories for working
    bedtools_tmpdir = os.path.join(workdir, "bedtools")
    create_dirs([workdir, outdir, bedtools_tmpdir])

    # Reference handling
    if not os.path.isfile(reference + ".fai"):
        logger.error("Reference file %s is not indexed" % (reference))
        return 1

    fasta_handle = pysam.Fastafile(reference) if os.path.isfile(
        reference) else None
    contigs = get_contigs(reference)
    include_intervals = sorted([
        SVInterval(contig.name,
                   0,
                   contig.length,
                   contig.name,
                   "include",
                   length=contig.length) for contig in contigs
    ])

    # Generate the list of contigs to process
    contig_whitelist = set(chromosomes) if chromosomes else set(
        [contig.name for contig in contigs])
    if keep_standard_contigs:
        contig_whitelist &= set([str(i) for i in xrange(1, 23)] +
                                ["chr%d" % (i) for i in xrange(1, 23)] +
                                ["X", "Y", "MT", "chrX", "chrY", "chrM"])
    logger.info("Only SVs on the following contigs will be reported: %s" %
                (sorted(list(contig_whitelist))))

    # Load the intervals from different files
    vcf_name_list = [("CNVnator", cnvnator_vcf), ("Pindel", pindel_vcf),
                     ("BreakDancer", breakdancer_vcf),
                     ("BreakSeq", breakseq_vcf), ("HaplotypeCaller", gatk_vcf)]
    native_name_list = [("CNVnator", cnvnator_native, CNVnatorReader),
                        ("Pindel", pindel_native, PindelReader),
                        ("BreakSeq", breakseq_native, BreakSeqReader),
                        ("BreakDancer", breakdancer_native, BreakDancerReader)]

    tools = []
    intervals = {}
    sv_types = set()

    gap_intervals = []
    if filter_gaps:
        if not gaps: gaps = get_gaps_file(contig_whitelist)
        gap_intervals = sorted(load_gap_intervals(gaps))

    # Handles native input
    logger.info("Load native files")
    for toolname, nativename, svReader in native_name_list:
        # If no native file is given, ignore the tool
        if not nativename: continue

        tools.append(toolname)
        intervals[toolname] = defaultdict(list)

        for native_file in nativename:
            for record in svReader(native_file):
                interval = record.to_sv_interval()

                if not interval:
                    # This is the case for SVs we want to skip
                    continue
                if not interval_overlaps_interval_list(
                        interval,
                        gap_intervals) and interval.chrom in contig_whitelist:

                    # Check length
                    if interval.length < minsvlen:
                        continue

                    # Set wiggle
                    if interval.sv_type == "INS":
                        interval.wiggle = max(inswiggle, wiggle)
                    else:
                        interval.wiggle = wiggle

                    intervals[toolname][interval.sv_type].append(interval)

        sv_types |= set(intervals[toolname].keys())

    # Handles the VCF input cases, we will just deal with these cases
    logger.info("Load VCF files")
    for toolname, vcfname in vcf_name_list:
        # If no VCF is given, ignore the tool
        if not vcfname:
            continue

        tools.append(toolname)
        intervals[toolname] = {}

        vcf_list = []
        for vcffile in vcfname:
            if os.path.isdir(vcffile):
                logger.info(
                    "Will load from per-chromosome VCFs from directory %s for tool %s"
                    % (vcffile, toolname))
                vcf_list += [
                    os.path.join(vcffile, "%s.vcf.gz" % contig.name)
                    for contig in contigs
                    if (not contig_whitelist or contig.name in contig_whitelist
                        )
                ]
            else:
                vcf_list.append(vcffile)

        for vcffile in vcf_list:
            load_intervals(vcffile,
                           intervals[toolname],
                           gap_intervals,
                           include_intervals,
                           toolname,
                           contig_whitelist,
                           minsvlen=minsvlen,
                           wiggle=wiggle,
                           inswiggle=inswiggle)
        sv_types |= set(intervals[toolname].keys())

    logger.info("SV types are %s" % (str(sv_types)))
    tool_merged_intervals = {}
    final_intervals = []

    bd_out = os.path.join(outdir, "breakdancer.vcf")
    pindel_out = os.path.join(outdir, "pindel.vcf")
    cnvnator_out = os.path.join(outdir, "cnvnator.vcf")
    breakseq_out = os.path.join(outdir, "breakseq.vcf")

    vcf_out_list = [("BreakDancer", bd_out), ("Pindel", pindel_out),
                    ("CNVnator", cnvnator_out), ("BreakSeq", breakseq_out)]

    # This will just output per-tool VCFs, no intra-tool merging is done yet
    if enable_per_tool_output:
        logger.info("Output per-tool VCFs")
        for toolname, tool_out in vcf_out_list:
            if tool_out is None or toolname not in intervals:
                continue

            logger.info("Outputting single tool VCF for %s" % (str(toolname)))
            vcf_template_reader = vcf.Reader(
                open(os.path.join(mydir, "resources/template.vcf"), "r"))
            vcf_template_reader.samples = [sample]

            intervals_tool = []
            tool_out_fd = open(tool_out, "w")
            vcf_writer = vcf.Writer(tool_out_fd, vcf_template_reader)
            chr_intervals_tool = {contig.name: [] for contig in contigs}
            for sv_type in sv_types:
                if sv_type in intervals[toolname]:
                    intervals_tool.extend([
                        copy.deepcopy(interval)
                        for interval in intervals[toolname][sv_type]
                    ])
            for interval in intervals_tool:
                # Marghoob says that this is just to fill-in some metadata
                interval.do_validation(overlap_ratio)

                interval.fix_pos()
                chr_intervals_tool[interval.chrom].append(interval)

            for contig in contigs:
                chr_intervals_tool[contig.name].sort()
                for interval in chr_intervals_tool[contig.name]:
                    vcf_record = interval.to_vcf_record(fasta_handle, sample)
                    if vcf_record is not None:
                        vcf_writer.write_record(vcf_record)
            tool_out_fd.close()
            vcf_writer.close()
            logger.info("Indexing single tool VCF for %s" % (str(toolname)))
            pysam.tabix_index(tool_out, force=True, preset="vcf")

    # Do merging here
    logger.info("Do merging")
    for sv_type in sv_types:
        logger.info("Processing SVs of type %s" % sv_type)
        tool_merged_intervals[sv_type] = []

        # Do the intra-tool merging
        logger.info("Intra-tool Merging SVs of type %s" % sv_type)
        for tool in tools:
            logger.debug("Is %s in tool keys? %s" %
                         (sv_type, str(intervals[tool].keys())))
            if sv_type not in intervals[tool]:
                logger.debug("%s not in tool %s" % (sv_type, tool))
                continue
            logger.info("First level merging for %s for tool %s" %
                        (sv_type, tool))
            tool_merged_intervals[sv_type] += merge_intervals(
                intervals[tool][sv_type])

        # Do the inter-tool merging
        logger.info("Inter-tool Merging SVs of type %s" % sv_type)
        merged_intervals = merge_intervals(tool_merged_intervals[sv_type])

        # Intervals which overlap well with merged_intervals
        intervals1 = []
        # Intervals which do not overlap well with merged_intervals.
        # Used to filter out small intervals which got merged with large intervals
        intervals2 = []

        logger.info("Checking overlaps SVs of type %s" % sv_type)
        for interval in tool_merged_intervals[sv_type]:
            if interval_overlaps_interval_list(interval, merged_intervals,
                                               overlap_ratio, overlap_ratio):
                intervals2.append(interval)
            else:
                intervals1.append(interval)
        final_intervals.extend(
            merge_intervals(intervals1) + merge_intervals(intervals2))

    final_chr_intervals = {contig.name: [] for contig in contigs}
    for interval in final_intervals:
        interval.do_validation(overlap_ratio)
        interval.fix_pos()
        final_chr_intervals[interval.chrom].append(interval)

    # This is the merged VCF without assembly, ok for deletions at this point
    logger.info("Output merged VCF without assembly ")
    vcf_template_reader = vcf.Reader(
        open(os.path.join(mydir, "resources/template.vcf"), "r"))
    vcf_template_reader.samples = [sample]
    preasm_vcf = os.path.join(workdir, "pre_asm.vcf")
    vcf_fd = open(preasm_vcf, "w")
    vcf_writer = vcf.Writer(vcf_fd, vcf_template_reader)

    final_stats = {}

    bed_intervals = []
    merged_bed = os.path.join(workdir, "metasv.bed")
    for contig in contigs:
        final_chr_intervals[contig.name].sort()
        for interval in final_chr_intervals[contig.name]:
            vcf_record = interval.to_vcf_record(fasta_handle)
            if vcf_record is not None:
                key = (interval.sv_type,
                       "PASS" if interval.is_validated else "LowQual",
                       "PRECISE" if interval.is_precise else "IMPRECISE",
                       tuple(sorted(list(interval.sources))))
                if key not in final_stats:
                    final_stats[key] = 0
                final_stats[key] += 1
                vcf_writer.write_record(vcf_record)
            bed_interval = interval.to_bed_interval(sample)
            if bed_interval is not None:
                bed_intervals.append(bed_interval)

    # Also save a BED file representation of the merged variants without assembly
    pybedtools.BedTool(bed_intervals).saveas(merged_bed)
    vcf_fd.close()
    vcf_writer.close()

    for key in sorted(final_stats.keys()):
        logger.info(str(key) + ":" + str(final_stats[key]))

    final_vcf = os.path.join(outdir, "variants.vcf")

    # Run assembly here
    if not disable_assembly:
        logger.info("Running assembly")
        if spades is None:
            logger.error("Spades executable not specified")
            return 1

        if age is None:
            logger.error("AGE executable not specified")
            return 1

        spades_tmpdir = os.path.join(workdir, "spades")
        age_tmpdir = os.path.join(workdir, "age")

        create_dirs([spades_tmpdir, age_tmpdir])

        assembly_bed = merged_bed

        # this does the improved assembly location finder with softclipped reads
        if boost_ins:
            logger.info("Generating intervals for insertions")
            assembly_bed = parallel_generate_sc_intervals(
                [bam.name],
                list(contig_whitelist),
                merged_bed,
                workdir,
                num_threads=num_threads,
                min_support=min_support,
                min_support_frac=min_support_frac,
                max_intervals=max_intervals)
            logger.info("Generated intervals for assembly in %s" %
                        assembly_bed)

        logger.info("Will run assembly now")

        assembled_fasta, ignored_bed = run_spades_parallel(
            bam=bam.name,
            spades=spades,
            bed=assembly_bed,
            work=spades_tmpdir,
            pad=SPADES_PAD,
            nthreads=num_threads,
            chrs=list(contig_whitelist),
            disable_deletion_assembly=disable_deletion_assembly,
            stop_on_fail=stop_spades_on_fail)
        breakpoints_bed = run_age_parallel(
            intervals_bed=assembly_bed,
            reference=reference,
            assembly=assembled_fasta,
            pad=AGE_PAD,
            age=age,
            chrs=list(contig_whitelist),
            nthreads=num_threads,
            min_contig_len=AGE_MIN_CONTIG_LENGTH,
            age_workdir=age_tmpdir)

        final_bed = os.path.join(workdir, "final.bed")
        if ignored_bed:
            pybedtools.BedTool(breakpoints_bed) \
                .cat(pybedtools.BedTool(ignored_bed), postmerge=False) \
                .sort().saveas(final_bed)
        else:
            pybedtools.BedTool(breakpoints_bed).saveas(final_bed)

        logger.info("Output final VCF file")

        convert_metasv_bed_to_vcf(bedfile=final_bed,
                                  vcf_out=final_vcf,
                                  sample=sample,
                                  pass_calls=False)
    else:
        shutil.copy(preasm_vcf, final_vcf)
        pysam.tabix_index(final_vcf, force=True, preset="vcf")

    logger.info("Clean up pybedtools")

    pybedtools.cleanup(remove_all=True)

    logger.info("All Done!")