def run_metasv(args):
    """Merge SV calls from multiple callers and optionally refine them with assembly.

    Loads SV intervals from per-tool VCFs and native caller output, filters them
    against gap intervals and a contig whitelist, merges intra-tool then
    inter-tool, writes a pre-assembly merged VCF, and — unless assembly is
    disabled — runs SPAdes assembly, AGE breakpoint refinement, genotyping, and
    final VCF generation.

    args -- argparse-style namespace; the attributes read throughout this
            function define the expected fields.
    Returns os.EX_USAGE on bad arguments, 1 when the reference is unindexed,
    os.EX_OK on success.
    """
    logger.info("Running MetaSV %s" % __version__)
    logger.info("Arguments are " + str(args))

    # Check if there is work to do.
    # BUG FIX: the original expression ended in ", args.wham_vcf)" -- the comma
    # built a two-element tuple, which is always truthy, so the warning below
    # could never fire. The wham list is now summed like the others.
    # NOTE(review): args.gatk_vcf is not part of this check in the original
    # either -- possibly intentional, left unchanged.
    if not (args.pindel_vcf + args.breakdancer_vcf + args.breakseq_vcf + args.cnvnator_vcf +
            args.pindel_native + args.breakdancer_native + args.breakseq_native +
            args.cnvnator_native + args.manta_vcf + args.lumpy_vcf + args.cnvkit_vcf +
            args.wham_vcf):
        logger.warning("Nothing to merge since no SV file specified")

    # Simple check for arguments: assembly needs both external executables.
    if not args.disable_assembly:
        if not args.spades:
            logger.error("Spades executable not specified")
            return os.EX_USAGE
        if not args.age:
            logger.error("AGE executable not specified")
            return os.EX_USAGE

    # Create the directories for working
    bedtools_tmpdir = os.path.join(args.workdir, "bedtools")
    create_dirs([args.workdir, args.outdir, bedtools_tmpdir])

    # Reference handling: require a samtools .fai index next to the FASTA.
    if not os.path.isfile(args.reference + ".fai"):
        logger.error("Reference file %s is not indexed" % (args.reference))
        return 1

    fasta_handle = pysam.Fastafile(args.reference) if os.path.isfile(args.reference) else None
    contigs = get_contigs(args.reference)
    include_intervals = sorted(
        [SVInterval(contig.name, 0, contig.length, contig.name, "include", length=contig.length)
         for contig in contigs])

    # Generate the list of contigs to process
    contig_whitelist = set(args.chromosomes) if args.chromosomes else set(
        [contig.name for contig in contigs])
    if args.keep_standard_contigs:
        # Restrict to autosomes 1-22, sex chromosomes and mitochondrion, in
        # both plain and "chr"-prefixed naming styles.
        contig_whitelist &= set(
            [str(i) for i in xrange(1, 23)] + ["chr%d" % (i) for i in xrange(1, 23)] +
            ["X", "Y", "MT", "chrX", "chrY", "chrM"])
    logger.info("Only SVs on the following contigs will be reported: %s" %
                (sorted(list(contig_whitelist))))

    # Load the intervals from different files
    vcf_name_list = [("CNVnator", args.cnvnator_vcf), ("Pindel", args.pindel_vcf),
                     ("BreakDancer", args.breakdancer_vcf), ("BreakSeq", args.breakseq_vcf),
                     ("HaplotypeCaller", args.gatk_vcf), ("Lumpy", args.lumpy_vcf),
                     ("Manta", args.manta_vcf), ("CNVkit", args.cnvkit_vcf),
                     ("WHAM", args.wham_vcf)]
    native_name_list = [("CNVnator", args.cnvnator_native, CNVnatorReader),
                        ("Pindel", args.pindel_native, PindelReader),
                        ("BreakSeq", args.breakseq_native, BreakSeqReader),
                        ("BreakDancer", args.breakdancer_native, BreakDancerReader)]

    tools = []
    intervals = {}
    sv_types = set()

    gap_intervals = []
    if args.filter_gaps:
        gaps = args.gaps if args.gaps else get_gaps_file(contig_whitelist)
        gap_intervals = sorted(load_gap_intervals(gaps))

    # Handles native input
    logger.info("Load native files")
    for toolname, nativename, svReader in native_name_list:
        # If no native file is given, ignore the tool
        if not nativename:
            continue
        tools.append(toolname)
        intervals[toolname] = defaultdict(list)
        for native_file in nativename:
            for record in svReader(native_file, svs_to_report=args.svs_to_report):
                interval = record.to_sv_interval()
                if not interval:
                    # This is the case for SVs we want to skip.
                    # BUG FIX: this guard now precedes the BreakDancer INV
                    # check below, which dereferenced interval.sv_type and
                    # crashed on a None interval (matches the fixed ordering
                    # in the later revision of this function).
                    continue
                BD_min_inv_len = args.mean_read_length + 4 * args.isize_sd
                if toolname == "BreakDancer" and interval.sv_type == "INV" and \
                        abs(interval.length) < BD_min_inv_len:
                    # Filter BreakDancer artifact INVs with size < readlength+4*isize_sd
                    continue
                if not interval_overlaps_interval_list(interval, gap_intervals) and \
                        interval.chrom in contig_whitelist:
                    # Check length (translocations are exempt from the minimum)
                    if interval.length < args.minsvlen and interval.sv_type not in ["ITX", "CTX"]:
                        continue
                    # Set wiggle: insertions may get a larger wiggle; translocations fixed
                    if interval.sv_type not in ["ITX", "CTX"]:
                        interval.wiggle = max(
                            args.inswiggle if interval.sv_type == "INS" else 0, args.wiggle)
                    else:
                        interval.wiggle = TX_WIGGLE
                    intervals[toolname][interval.sv_type].append(interval)
        sv_types |= set(intervals[toolname].keys())

    # Handles the VCF input cases, we will just deal with these cases
    logger.info("Load VCF files")
    for toolname, vcfname in vcf_name_list:
        # If no VCF is given, ignore the tool
        if not vcfname:
            continue
        tools.append(toolname)
        intervals[toolname] = {}
        vcf_list = []
        for vcffile in vcfname:
            if os.path.isdir(vcffile):
                # A directory means one bgzipped VCF per chromosome.
                logger.info("Will load from per-chromosome VCFs from directory %s for tool %s" %
                            (vcffile, toolname))
                vcf_list += [os.path.join(vcffile, "%s.vcf.gz" % contig.name)
                             for contig in contigs
                             if (not contig_whitelist or contig.name in contig_whitelist)]
            else:
                vcf_list.append(vcffile)
        for vcffile in vcf_list:
            load_intervals(vcffile, intervals[toolname], gap_intervals, include_intervals,
                           toolname, contig_whitelist, minsvlen=args.minsvlen,
                           wiggle=args.wiggle, inswiggle=args.inswiggle,
                           svs_to_report=args.svs_to_report, maxsvlen=args.maxsvlen)
        sv_types |= set(intervals[toolname].keys())

    logger.info("SV types are %s" % (str(sv_types)))
    tool_merged_intervals = {}
    final_intervals = []

    # This will just output per-tool VCFs, no intra-tool merging is done yet
    if args.enable_per_tool_output:
        logger.info("Output per-tool VCFs")
        for toolname in intervals:
            tool_out = os.path.join(args.outdir, "%s.vcf" % (toolname.lower()))
            logger.info("Outputting single tool VCF for %s" % (str(toolname)))
            vcf_template_reader = vcf.Reader(open(os.path.join(mydir, "resources/template.vcf"), "r"))
            vcf_template_reader.samples = [args.sample]
            intervals_tool = []
            tool_out_fd = open(tool_out, "w")
            vcf_writer = vcf.Writer(tool_out_fd, vcf_template_reader)
            chr_intervals_tool = {contig.name: [] for contig in contigs}
            for sv_type in sv_types:
                if sv_type in intervals[toolname]:
                    # Deep-copy so per-tool post-processing does not mutate the
                    # intervals used for merging below.
                    intervals_tool.extend([copy.deepcopy(interval)
                                           for interval in intervals[toolname][sv_type]])
            for interval in intervals_tool:
                # Marghoob says that this is just to fill-in some metadata
                interval.do_validation(args.overlap_ratio)
                interval.fix_pos()
                chr_intervals_tool[interval.chrom].append(interval)
            for contig in contigs:
                chr_intervals_tool[contig.name].sort()
                for interval in chr_intervals_tool[contig.name]:
                    vcf_record = interval.to_vcf_record(fasta_handle, args.sample)
                    if vcf_record is not None:
                        vcf_writer.write_record(vcf_record)
            tool_out_fd.close()
            vcf_writer.close()
            logger.info("Indexing single tool VCF for %s" % (str(toolname)))
            pysam.tabix_index(tool_out, force=True, preset="vcf")

    # Do merging here
    logger.info("Do merging")
    for sv_type in sv_types:
        logger.info("Processing SVs of type %s" % sv_type)
        tool_merged_intervals[sv_type] = []

        # Do the intra-tool merging
        logger.info("Intra-tool Merging SVs of type %s" % sv_type)
        for tool in tools:
            logger.debug("Is %s in tool keys? %s" % (sv_type, str(intervals[tool].keys())))
            if sv_type not in intervals[tool]:
                logger.debug("%s not in tool %s" % (sv_type, tool))
                continue
            logger.info("First level merging for %s for tool %s" % (sv_type, tool))
            tool_merged_intervals[sv_type] += merge_intervals(intervals[tool][sv_type])

        # Do the inter-tool merging
        logger.info("Inter-tool Merging SVs of type %s" % sv_type)
        final_intervals.extend(
            merge_intervals_recursively(tool_merged_intervals[sv_type], args.overlap_ratio))

    final_chr_intervals = {contig.name: [] for contig in contigs}
    for interval in final_intervals:
        interval.do_validation(args.overlap_ratio)
        interval.fix_pos()
        # Length bounds apply to all SV types except translocations.
        if args.minsvlen <= interval.length <= args.maxsvlen or interval.sv_type in ["ITX", "CTX"]:
            final_chr_intervals[interval.chrom].append(interval)

    # This is the merged VCF without assembly, ok for deletions at this point
    logger.info("Output merged VCF without assembly ")
    vcf_template_reader = vcf.Reader(open(os.path.join(mydir, "resources/template.vcf"), "r"))
    vcf_template_reader.samples = [args.sample]
    preasm_vcf = os.path.join(args.workdir, "pre_asm.vcf")
    vcf_fd = open(preasm_vcf, "w")
    vcf_writer = vcf.Writer(vcf_fd, vcf_template_reader)

    final_stats = {}
    bed_intervals = []
    for contig in contigs:
        final_chr_intervals[contig.name].sort()
        for interval in final_chr_intervals[contig.name]:
            vcf_record = interval.to_vcf_record(fasta_handle)
            if vcf_record is not None:
                # Tally counts keyed by (type, filter, precision, calling tools)
                key = (interval.sv_type,
                       "PASS" if interval.is_validated else "LowQual",
                       "PRECISE" if interval.is_precise else "IMPRECISE",
                       tuple(sorted(list(interval.sources))))
                if key not in final_stats:
                    final_stats[key] = 0
                final_stats[key] += 1
                vcf_writer.write_record(vcf_record)
                bed_interval = interval.to_bed_interval(args.sample)
                if bed_interval is not None:
                    bed_intervals.append(bed_interval)
    vcf_fd.close()
    vcf_writer.close()

    # Also save a BED file representation of the merged variants without assembly
    merged_bed = None
    if bed_intervals:
        merged_bed = os.path.join(args.workdir, "metasv.bed")
        pybedtools.BedTool(bed_intervals).saveas(merged_bed)

    for key in sorted(final_stats.keys()):
        logger.info(str(key) + ":" + str(final_stats[key]))

    final_vcf = os.path.join(args.outdir, "variants.vcf")

    # Run assembly here
    if not args.disable_assembly:
        logger.info("Running assembly")
        spades_tmpdir = os.path.join(args.workdir, "spades")
        age_tmpdir = os.path.join(args.workdir, "age")
        create_dirs([spades_tmpdir, age_tmpdir])
        assembly_bed = merged_bed

        # this does the improved assembly location finder with softclipped reads
        if args.boost_sc:
            logger.info("Generating Soft-Clipping intervals.")
            assembly_bed = parallel_generate_sc_intervals(
                [args.bam.name], list(contig_whitelist), merged_bed, args.workdir,
                num_threads=args.num_threads,
                min_support_ins=args.min_support_ins,
                min_support_frac_ins=args.min_support_frac_ins,
                max_intervals=args.max_ins_intervals,
                min_mapq=args.min_mapq,
                min_avg_base_qual=args.min_avg_base_qual,
                min_soft_clip=args.min_soft_clip,
                max_nm=args.max_nm,
                min_matches=args.min_matches,
                isize_mean=args.isize_mean,
                isize_sd=args.isize_sd,
                svs_to_softclip=args.svs_to_softclip,
                overlap_ratio=args.overlap_ratio,
                mean_read_length=args.mean_read_length,
                mean_read_coverage=args.mean_read_coverage,
                min_ins_cov_frac=args.min_ins_cov_frac,
                max_ins_cov_frac=args.max_ins_cov_frac)
            logger.info("Generated intervals for assembly in %s" % assembly_bed)

        logger.info("Will run assembly now")
        assembled_fasta, ignored_bed = run_spades_parallel(
            bam=args.bam.name, spades=args.spades, bed=assembly_bed, work=spades_tmpdir,
            pad=args.assembly_pad, nthreads=args.num_threads, chrs=list(contig_whitelist),
            max_interval_size=args.spades_max_interval_size,
            svs_to_assemble=args.svs_to_assemble,
            stop_on_fail=args.stop_spades_on_fail,
            max_read_pairs=args.extraction_max_read_pairs,
            assembly_max_tools=args.assembly_max_tools)
        breakpoints_bed = run_age_parallel(
            intervals_bed=assembly_bed, reference=args.reference, assembly=assembled_fasta,
            pad=args.assembly_pad, age=args.age, chrs=list(contig_whitelist),
            nthreads=args.num_threads, min_contig_len=AGE_MIN_CONTIG_LENGTH,
            min_del_subalign_len=args.min_del_subalign_len,
            min_inv_subalign_len=args.min_inv_subalign_len,
            age_workdir=age_tmpdir)

        # Combine the refined breakpoints with the intervals skipped by assembly.
        final_bed = os.path.join(args.workdir, "final.bed")
        if breakpoints_bed:
            if ignored_bed:
                pybedtools.BedTool(breakpoints_bed) \
                    .cat(pybedtools.BedTool(ignored_bed), postmerge=False) \
                    .sort().saveas(final_bed)
            else:
                pybedtools.BedTool(breakpoints_bed).saveas(final_bed)
        elif ignored_bed:
            pybedtools.BedTool(ignored_bed).sort().saveas(final_bed)
        else:
            final_bed = None

        genotyped_bed = parallel_genotype_intervals(
            final_bed, args.bam.name,
            workdir=os.path.join(args.workdir, "genotyping"),
            nthreads=args.num_threads,
            chromosomes=list(contig_whitelist),
            window=args.gt_window,
            isize_mean=args.isize_mean,
            isize_sd=args.isize_sd,
            normal_frac_threshold=args.gt_normal_frac)

        logger.info("Output final VCF file")
        convert_metasv_bed_to_vcf(bedfile=genotyped_bed, vcf_out=final_vcf,
                                  workdir=args.workdir, sample=args.sample, pass_calls=False)
    else:
        # No assembly: the pre-assembly merged VCF is the final output.
        shutil.copy(preasm_vcf, final_vcf)
        pysam.tabix_index(final_vcf, force=True, preset="vcf")

    logger.info("Clean up pybedtools")
    pybedtools.cleanup(remove_all=True)
    logger.info("All Done!")
    return os.EX_OK
def run_metasv(args):
    """Merge SV calls from multiple callers and optionally refine them with assembly.

    Revised variant of the workflow: accepts multiple BAMs (args.bams), passes
    SPAdes/AGE options and timeouts through, and forwards the reference to the
    final VCF conversion. Loads SV intervals from per-tool VCFs and native
    caller output, filters against gaps and a contig whitelist, merges
    intra-tool then inter-tool, writes a pre-assembly merged VCF, and — unless
    disabled — runs assembly, breakpoint refinement, genotyping, and final VCF
    generation.

    args -- argparse-style namespace; the attributes read throughout this
            function define the expected fields.
    Returns os.EX_USAGE on bad arguments, 1 when the reference is unindexed,
    os.EX_OK on success.
    """
    logger.info("Running MetaSV %s" % __version__)
    logger.info("Arguments are " + str(args))

    # Check if there is work to do.
    # BUG FIX: the original expression ended in ", args.wham_vcf)" -- the comma
    # built a two-element tuple, which is always truthy, so the warning below
    # could never fire. The wham list is now summed like the others.
    if not (args.pindel_vcf + args.breakdancer_vcf + args.breakseq_vcf + args.cnvnator_vcf +
            args.pindel_native + args.breakdancer_native + args.breakseq_native +
            args.cnvnator_native + args.manta_vcf + args.lumpy_vcf + args.cnvkit_vcf +
            args.wham_vcf):
        logger.warning("Nothing to merge since no SV file specified")

    # Simple check for arguments: assembly needs both external executables.
    if not args.disable_assembly:
        if not args.spades:
            logger.error("Spades executable not specified")
            return os.EX_USAGE
        if not args.age:
            logger.error("AGE executable not specified")
            return os.EX_USAGE

    # Create the directories for working
    bedtools_tmpdir = os.path.join(args.workdir, "bedtools")
    create_dirs([args.workdir, args.outdir, bedtools_tmpdir])

    # Reference handling: require a samtools .fai index next to the FASTA.
    if not os.path.isfile(args.reference + ".fai"):
        logger.error("Reference file %s is not indexed" % (args.reference))
        return 1

    fasta_handle = pysam.Fastafile(args.reference) if os.path.isfile(args.reference) else None
    contigs = get_contigs(args.reference)
    include_intervals = sorted([
        SVInterval(contig.name, 0, contig.length, contig.name, "include", length=contig.length)
        for contig in contigs
    ])

    # Generate the list of contigs to process
    contig_whitelist = set(args.chromosomes) if args.chromosomes else set(
        [contig.name for contig in contigs])
    if args.keep_standard_contigs:
        # Restrict to autosomes 1-22, sex chromosomes and mitochondrion, in
        # both plain and "chr"-prefixed naming styles.
        contig_whitelist &= set([str(i) for i in xrange(1, 23)] +
                                ["chr%d" % (i) for i in xrange(1, 23)] +
                                ["X", "Y", "MT", "chrX", "chrY", "chrM"])
    logger.info("Only SVs on the following contigs will be reported: %s" %
                (sorted(list(contig_whitelist))))

    # Load the intervals from different files
    vcf_name_list = [("CNVnator", args.cnvnator_vcf), ("Pindel", args.pindel_vcf),
                     ("BreakDancer", args.breakdancer_vcf), ("BreakSeq", args.breakseq_vcf),
                     ("HaplotypeCaller", args.gatk_vcf), ("Lumpy", args.lumpy_vcf),
                     ("Manta", args.manta_vcf), ("CNVkit", args.cnvkit_vcf),
                     ("WHAM", args.wham_vcf)]
    native_name_list = [("CNVnator", args.cnvnator_native, CNVnatorReader),
                        ("Pindel", args.pindel_native, PindelReader),
                        ("BreakSeq", args.breakseq_native, BreakSeqReader),
                        ("BreakDancer", args.breakdancer_native, BreakDancerReader)]

    tools = []
    intervals = {}
    sv_types = set()

    gap_intervals = []
    if args.filter_gaps:
        gaps = args.gaps if args.gaps else get_gaps_file(contig_whitelist)
        gap_intervals = sorted(load_gap_intervals(gaps))

    # Handles native input
    logger.info("Load native files")
    for toolname, nativename, svReader in native_name_list:
        # If no native file is given, ignore the tool
        if not nativename:
            continue
        tools.append(toolname)
        intervals[toolname] = defaultdict(list)
        for native_file in nativename:
            for record in svReader(native_file, svs_to_report=args.svs_to_report):
                interval = record.to_sv_interval()
                if not interval:
                    # This is the case for SVs we want to skip
                    continue
                BD_min_inv_len = args.mean_read_length + 4 * args.isize_sd
                if toolname == "BreakDancer" and interval.sv_type == "INV" and \
                        abs(interval.length) < BD_min_inv_len:
                    # Filter BreakDancer artifact INVs with size < readlength+4*isize_sd
                    continue
                if not interval_overlaps_interval_list(interval, gap_intervals) and \
                        interval.chrom in contig_whitelist:
                    # Check length (translocations are exempt from the minimum)
                    if interval.length < args.minsvlen and interval.sv_type not in ["ITX", "CTX"]:
                        continue
                    # Set wiggle: insertions may get a larger wiggle; translocations fixed
                    if interval.sv_type not in ["ITX", "CTX"]:
                        interval.wiggle = max(
                            args.inswiggle if interval.sv_type == "INS" else 0, args.wiggle)
                    else:
                        interval.wiggle = TX_WIGGLE
                    intervals[toolname][interval.sv_type].append(interval)
        sv_types |= set(intervals[toolname].keys())

    # Handles the VCF input cases, we will just deal with these cases
    logger.info("Load VCF files")
    for toolname, vcfname in vcf_name_list:
        # If no VCF is given, ignore the tool
        if not vcfname:
            continue
        tools.append(toolname)
        intervals[toolname] = {}
        vcf_list = []
        for vcffile in vcfname:
            if os.path.isdir(vcffile):
                # A directory means one bgzipped VCF per chromosome.
                logger.info("Will load from per-chromosome VCFs from directory %s for tool %s" %
                            (vcffile, toolname))
                vcf_list += [
                    os.path.join(vcffile, "%s.vcf.gz" % contig.name)
                    for contig in contigs
                    if (not contig_whitelist or contig.name in contig_whitelist)
                ]
            else:
                vcf_list.append(vcffile)
        for vcffile in vcf_list:
            load_intervals(vcffile, intervals[toolname], gap_intervals, include_intervals,
                           toolname, contig_whitelist, minsvlen=args.minsvlen,
                           wiggle=args.wiggle, inswiggle=args.inswiggle,
                           svs_to_report=args.svs_to_report, maxsvlen=args.maxsvlen)
        sv_types |= set(intervals[toolname].keys())

    logger.info("SV types are %s" % (str(sv_types)))
    tool_merged_intervals = {}
    final_intervals = []

    # This will just output per-tool VCFs, no intra-tool merging is done yet
    if args.enable_per_tool_output:
        logger.info("Output per-tool VCFs")
        for toolname in intervals:
            tool_out = os.path.join(args.outdir, "%s.vcf" % (toolname.lower()))
            logger.info("Outputting single tool VCF for %s" % (str(toolname)))
            vcf_template_reader = vcf.Reader(
                open(os.path.join(mydir, "resources/template.vcf"), "r"))
            vcf_template_reader.samples = [args.sample]
            intervals_tool = []
            tool_out_fd = open(tool_out, "w")
            vcf_writer = vcf.Writer(tool_out_fd, vcf_template_reader)
            chr_intervals_tool = {contig.name: [] for contig in contigs}
            for sv_type in sv_types:
                if sv_type in intervals[toolname]:
                    # Deep-copy so per-tool post-processing does not mutate the
                    # intervals used for merging below.
                    intervals_tool.extend([
                        copy.deepcopy(interval)
                        for interval in intervals[toolname][sv_type]
                    ])
            for interval in intervals_tool:
                # Marghoob says that this is just to fill-in some metadata
                interval.do_validation(args.overlap_ratio)
                interval.fix_pos()
                chr_intervals_tool[interval.chrom].append(interval)
            for contig in contigs:
                chr_intervals_tool[contig.name].sort()
                for interval in chr_intervals_tool[contig.name]:
                    vcf_record = interval.to_vcf_record(fasta_handle, args.sample)
                    if vcf_record is not None:
                        vcf_writer.write_record(vcf_record)
            tool_out_fd.close()
            vcf_writer.close()
            logger.info("Indexing single tool VCF for %s" % (str(toolname)))
            pysam.tabix_index(tool_out, force=True, preset="vcf")

    # Do merging here
    logger.info("Do merging")
    for sv_type in sv_types:
        logger.info("Processing SVs of type %s" % sv_type)
        tool_merged_intervals[sv_type] = []

        # Do the intra-tool merging
        logger.info("Intra-tool Merging SVs of type %s" % sv_type)
        for tool in tools:
            logger.debug("Is %s in tool keys? %s" % (sv_type, str(intervals[tool].keys())))
            if sv_type not in intervals[tool]:
                logger.debug("%s not in tool %s" % (sv_type, tool))
                continue
            logger.info("First level merging for %s for tool %s" % (sv_type, tool))
            tool_merged_intervals[sv_type] += merge_intervals(intervals[tool][sv_type])

        # Do the inter-tool merging
        logger.info("Inter-tool Merging SVs of type %s" % sv_type)
        final_intervals.extend(
            merge_intervals_recursively(tool_merged_intervals[sv_type], args.overlap_ratio))

    final_chr_intervals = {contig.name: [] for contig in contigs}
    for interval in final_intervals:
        interval.do_validation(args.overlap_ratio)
        interval.fix_pos()
        # Length bounds apply to all SV types except translocations.
        if args.minsvlen <= interval.length <= args.maxsvlen or interval.sv_type in ["ITX", "CTX"]:
            final_chr_intervals[interval.chrom].append(interval)

    # This is the merged VCF without assembly, ok for deletions at this point
    logger.info("Output merged VCF without assembly ")
    vcf_template_reader = vcf.Reader(open(os.path.join(mydir, "resources/template.vcf"), "r"))
    vcf_template_reader.samples = [args.sample]
    preasm_vcf = os.path.join(args.workdir, "pre_asm.vcf")
    vcf_fd = open(preasm_vcf, "w")
    vcf_writer = vcf.Writer(vcf_fd, vcf_template_reader)

    final_stats = {}
    bed_intervals = []
    for contig in contigs:
        final_chr_intervals[contig.name].sort()
        for interval in final_chr_intervals[contig.name]:
            vcf_record = interval.to_vcf_record(fasta_handle)
            if vcf_record is not None:
                # Tally counts keyed by (type, filter, precision, calling tools)
                key = (interval.sv_type,
                       "PASS" if interval.is_validated else "LowQual",
                       "PRECISE" if interval.is_precise else "IMPRECISE",
                       tuple(sorted(list(interval.sources))))
                if key not in final_stats:
                    final_stats[key] = 0
                final_stats[key] += 1
                vcf_writer.write_record(vcf_record)
                bed_interval = interval.to_bed_interval(args.sample)
                if bed_interval is not None:
                    bed_intervals.append(bed_interval)
    vcf_fd.close()
    vcf_writer.close()

    # Also save a BED file representation of the merged variants without assembly
    merged_bed = None
    if bed_intervals:
        merged_bed = os.path.join(args.workdir, "metasv.bed")
        pybedtools.BedTool(bed_intervals).saveas(merged_bed)

    for key in sorted(final_stats.keys()):
        logger.info(str(key) + ":" + str(final_stats[key]))

    final_vcf = os.path.join(args.outdir, "variants.vcf")

    # Run assembly here
    if not args.disable_assembly:
        logger.info("Running assembly")
        spades_tmpdir = os.path.join(args.workdir, "spades")
        age_tmpdir = os.path.join(args.workdir, "age")
        create_dirs([spades_tmpdir, age_tmpdir])
        assembly_bed = merged_bed

        # this does the improved assembly location finder with softclipped reads
        if args.boost_sc:
            logger.info("Generating Soft-Clipping intervals.")
            assembly_bed = parallel_generate_sc_intervals(
                args.bams, list(contig_whitelist), merged_bed, args.workdir,
                num_threads=args.num_threads,
                min_support_ins=args.min_support_ins,
                min_support_frac_ins=args.min_support_frac_ins,
                max_intervals=args.max_ins_intervals,
                min_mapq=args.min_mapq,
                min_avg_base_qual=args.min_avg_base_qual,
                min_soft_clip=args.min_soft_clip,
                max_nm=args.max_nm,
                min_matches=args.min_matches,
                isize_mean=args.isize_mean,
                isize_sd=args.isize_sd,
                svs_to_softclip=args.svs_to_softclip,
                overlap_ratio=args.overlap_ratio,
                mean_read_length=args.mean_read_length,
                mean_read_coverage=args.mean_read_coverage,
                min_ins_cov_frac=args.min_ins_cov_frac,
                max_ins_cov_frac=args.max_ins_cov_frac,
                assembly_max_tools=args.assembly_max_tools)
            logger.info("Generated intervals for assembly in %s" % assembly_bed)

        logger.info("Will run assembly now")
        assembled_fasta, ignored_bed = run_spades_parallel(
            bams=args.bams, spades=args.spades, spades_options=args.spades_options,
            bed=assembly_bed, work=spades_tmpdir, pad=args.assembly_pad,
            nthreads=args.num_threads, chrs=list(contig_whitelist),
            max_interval_size=args.spades_max_interval_size,
            timeout=args.spades_timeout,
            svs_to_assemble=args.svs_to_assemble,
            stop_on_fail=args.stop_spades_on_fail,
            max_read_pairs=args.extraction_max_read_pairs,
            assembly_max_tools=args.assembly_max_tools)
        breakpoints_bed = run_age_parallel(
            intervals_bed=assembly_bed, reference=args.reference, assembly=assembled_fasta,
            pad=args.assembly_pad, age=args.age, timeout=args.age_timeout,
            chrs=list(contig_whitelist), nthreads=args.num_threads,
            min_contig_len=AGE_MIN_CONTIG_LENGTH,
            min_del_subalign_len=args.min_del_subalign_len,
            min_inv_subalign_len=args.min_inv_subalign_len,
            age_window=args.age_window,
            age_workdir=age_tmpdir)

        # Combine the refined breakpoints with the intervals skipped by assembly.
        final_bed = os.path.join(args.workdir, "final.bed")
        if breakpoints_bed:
            if ignored_bed:
                pybedtools.BedTool(breakpoints_bed) \
                    .cat(pybedtools.BedTool(ignored_bed), postmerge=False) \
                    .sort().saveas(final_bed)
            else:
                pybedtools.BedTool(breakpoints_bed).saveas(final_bed)
        elif ignored_bed:
            pybedtools.BedTool(ignored_bed).sort().saveas(final_bed)
        else:
            final_bed = None

        genotyped_bed = parallel_genotype_intervals(
            final_bed, args.bams,
            workdir=os.path.join(args.workdir, "genotyping"),
            nthreads=args.num_threads,
            chromosomes=list(contig_whitelist),
            window=args.gt_window,
            isize_mean=args.isize_mean,
            isize_sd=args.isize_sd,
            normal_frac_threshold=args.gt_normal_frac)

        logger.info("Output final VCF file")
        convert_metasv_bed_to_vcf(bedfile=genotyped_bed, vcf_out=final_vcf,
                                  workdir=args.workdir, sample=args.sample,
                                  reference=args.reference, pass_calls=False)
    else:
        # No assembly: the pre-assembly merged VCF is the final output.
        shutil.copy(preasm_vcf, final_vcf)
        pysam.tabix_index(final_vcf, force=True, preset="vcf")

    logger.info("Clean up pybedtools")
    pybedtools.cleanup(remove_all=True)
    logger.info("All Done!")
    return os.EX_OK
def convert_metasv_bed_to_vcf(bedfile=None, vcf_out=None, vcf_template_file=vcf_template,
                              sample=None, reference=None, pass_calls=True):
    """Convert a MetaSV merged/genotyped BED file into a sorted, tabix-indexed VCF.

    Each BED interval's name field encodes colon-separated sub-calls of the
    form "<base64-json-info>,<sv_type>,<length>,<methods>". One VCF record is
    emitted per interval; DEL/INV/INS sub-calls decide the reported type,
    position and PASS/LowQual filter.

    bedfile -- input BED file (pybedtools-readable)
    vcf_out -- path of the VCF to write (tabix-indexed in place afterwards)
    vcf_template_file -- VCF used as a header template
    sample -- sample name placed in the VCF header
    reference -- reference FASTA; when given, contigs are added to the header
                 and records are sorted in reference contig order
    pass_calls -- when True, skip INS calls whose end != pos + 1
    """
    func_logger = logging.getLogger("%s" % (convert_metasv_bed_to_vcf.__name__))
    vcf_template_reader = vcf.Reader(open(vcf_template_file, "r"))

    # The following are hacks to ensure sample name and contig names are put in the VCF header
    vcf_template_reader.samples = [sample]
    contigs = []
    if reference:
        contigs = fasta_utils.get_contigs(reference)
        contigs_order_dict = {contig.name: index for (index, contig) in enumerate(contigs)}
        vcf_template_reader.contigs = OrderedDict(
            [(contig.name, (contig.name, contig.length)) for contig in contigs])
        vcf_template_reader.metadata["reference"] = reference
    vcf_template_reader.metadata["fileDate"] = str(datetime.date.today())
    vcf_template_reader.metadata["source"] = [" ".join(sys.argv)]

    vcf_writer = vcf.Writer(open(vcf_out, "w"), vcf_template_reader)
    vcf_records = []
    for interval in pybedtools.BedTool(bedfile):
        chrom = interval.chrom
        pos = interval.start
        end = interval.end
        # Column 11 (if present) carries the genotype; default to unknown.
        genotype = "./." if len(interval.fields) < 11 else interval.fields[10]
        if genotype == "0/0":
            func_logger.info("Skipping homozygous reference %s" % str(interval))
            continue

        sub_names = interval.name.split(":")
        # NOTE: lists (not lazy map objects) so .index()/[] work on Py2 and Py3.
        sub_lengths = [int(x.split(",")[2]) for x in sub_names]
        sub_types = [x.split(",")[1] for x in sub_names]
        sub_methods = [name.split(",")[3] for name in sub_names]
        svmethods = (";".join([name.split(",")[3] for name in sub_names])).split(";")
        try:
            # BUG FIX: the original read the bare variable `name`, which only
            # existed via Python-2 list-comprehension leakage and equalled the
            # last sub-call; made explicit (and Py3-safe) as sub_names[-1].
            info = json.loads(base64.b64decode(sub_names[-1].split(",")[0]))
        except TypeError:
            info = dict()
        if len(interval.fields) > 9:
            info.update(json.loads(base64.b64decode(interval.fields[9])))

        # Pick which sub-call to report: DEL, then INV, then soft-clip INS.
        index_to_use = 0
        is_pass = False
        svlen = -1
        if "DEL" in sub_types:
            index_to_use = sub_types.index("DEL")
            svmethods_s = set(svmethods) - {"SC"}
            # PASS requires support from more than one non-soft-clip method.
            is_pass = len(svmethods_s) > 1
        elif "INV" in sub_types:
            index_to_use = sub_types.index("INV")
            svmethods_s = set(svmethods) - {"SC"}
            is_pass = len(svmethods_s) > 1
        elif "INS" in sub_types and "SC" in sub_methods:
            # Soft-clip insertions carry refined pos/end/len in columns 7-9.
            index_to_use = sub_methods.index("SC")
            pos = int(interval.fields[6])
            end = int(interval.fields[7])
            svlen = int(interval.fields[8])

        if svlen < 0:
            svlen = sub_lengths[index_to_use]
        if sub_types[index_to_use] == "DEL":
            # Deletions get a negative SVLEN by VCF convention.
            svlen = -svlen

        sv_type = sub_types[index_to_use]
        if sv_type == "INS":
            if pass_calls and end != pos + 1:
                continue
            end = pos
            is_pass = (int(interval.fields[8]) != -1) and (svlen == 0 or svlen >= 100)

        sv_id = "."
        ref = "."
        alt = ["<%s>" % sv_type]
        qual = "."
        sv_filter = ["PASS" if is_pass else "LowQual"]
        info.update({"END": end, "SVLEN": svlen, "SVTYPE": sv_type,
                     "SVMETHOD": svmethods, "NUM_SVMETHODS": len(svmethods)})
        sv_format = "GT"
        sample_indexes = [0]
        vcf_record = vcf.model._Record(chrom, pos, sv_id, ref, alt, qual, sv_filter, info,
                                       sv_format, sample_indexes)
        vcf_record.samples = vcf_template_reader._parse_samples([genotype], "GT", vcf_record)
        vcf_records.append(vcf_record)

    # Sort records by reference contig order when available, else lexically.
    if contigs:
        vcf_records.sort(key=lambda x: (contigs_order_dict[x.CHROM], x.POS))
    else:
        vcf_records.sort(key=lambda x: (x.CHROM, x.POS))

    for vcf_record in vcf_records:
        vcf_writer.write_record(vcf_record)
    vcf_writer.close()

    func_logger.info("Tabix compressing and indexing %s" % vcf_out)
    pysam.tabix_index(vcf_out, force=True, preset="vcf")
def convert_metasv_bed_to_vcf(bedfile=None, vcf_out=None, workdir=None,
                              vcf_template_file=vcf_template, sample=None, reference=None,
                              pass_calls=True):
    """Convert a MetaSV merged/genotyped BED file into a sorted, tabix-indexed VCF.

    Revised variant: re-encodes each input interval via get_interval_info,
    merges adjacent duplicates, writes intermediate BEDs under workdir, filters
    confused INS calls, and resolves IDP/ITX/CTX records before writing.

    bedfile -- input BED file (pybedtools-readable); may be None/empty
    vcf_out -- path of the VCF to write (tabix-indexed in place afterwards)
    workdir -- scratch directory for intermediate BED files (created if absent)
    vcf_template_file -- VCF used as a header template
    sample -- sample name placed in the VCF header
    reference -- reference FASTA; when given, contigs go into the header, REF
                 bases are fetched from it, and records sort in contig order
    pass_calls -- forwarded to get_interval_info
    """
    func_logger = logging.getLogger("%s" % (convert_metasv_bed_to_vcf.__name__))
    if not os.path.exists(workdir):
        os.makedirs(workdir)

    intervals = []
    if bedfile:
        for interval in pybedtools.BedTool(bedfile):
            interval_info = get_interval_info(interval, pass_calls)
            if interval_info:
                # Re-encode the interval: name packs base64(JSON info), SV type,
                # length and methods; genotype and filter ride in otherfields.
                updated_interval = pybedtools.Interval(interval.chrom, interval_info["pos"],
                                                       interval_info["end"],
                                                       name="%s,%s,%d,%s" % (
                                                           base64.b64encode(json.dumps(interval_info["info"])),
                                                           interval_info["sv_type"],
                                                           interval_info["sv_length"],
                                                           ";".join(interval_info["svmethods"])),
                                                       score=interval.score,
                                                       otherfields=[interval_info["genotype"],
                                                                    interval_info["sv_filter"]])
                if not intervals:
                    intervals.append(updated_interval)
                else:
                    # Collapse with the previous interval when check_duplicates
                    # deems them the same call (input is positionally ordered).
                    merged_interval = check_duplicates(updated_interval, intervals[-1])
                    if merged_interval:
                        func_logger.info("Merging intervals: %s and %s" %
                                         (updated_interval, intervals[-1]))
                        intervals.pop()
                        intervals.append(merged_interval)
                    else:
                        intervals.append(updated_interval)
            else:
                func_logger.info("Skip interval: %s" % (interval))

    # Persist the sorted, pre-filter intervals, then filter confused INS calls.
    nonfilterd_bed = os.path.join(workdir, "final_nonfilterd.bed")
    filterd_bed = os.path.join(workdir, "final_filterd.bed")
    bedtool = pybedtools.BedTool(intervals).sort().moveto(nonfilterd_bed)
    filterd_bed = filter_confused_INS_calls(nonfilterd_bed, filterd_bed)

    vcf_template_reader = vcf.Reader(open(vcf_template_file, "r"))
    # The following are hacks to ensure sample name and contig names are put in the VCF header
    vcf_template_reader.samples = [sample]
    contigs = []
    fasta_file = None
    if reference:
        contigs = fasta_utils.get_contigs(reference)
        contigs_order_dict = {contig.name: index for (index, contig) in enumerate(contigs)}
        vcf_template_reader.contigs = OrderedDict(
            [(contig.name, (contig.name, contig.length)) for contig in contigs])
        vcf_template_reader.metadata["reference"] = reference
        fasta_file = pysam.Fastafile(reference)
    vcf_template_reader.metadata["fileDate"] = str(datetime.date.today())
    vcf_template_reader.metadata["source"] = [" ".join(sys.argv)]

    vcf_writer = vcf.Writer(open(vcf_out, "w"), vcf_template_reader)
    vcf_records = []
    if filterd_bed:
        bedtool = pybedtools.BedTool(filterd_bed)
        for interval in bedtool:
            # Unpack the name encoded above: base64(JSON info), then SV type.
            name_split = interval.name.split(",")
            info = json.loads(base64.b64decode(name_split[0]))
            sv_type = name_split[1]
            sv_id = "."
            # REF is the reference base at the start position when available.
            ref = fasta_file.fetch(str(interval.chrom), interval.start,
                                   interval.start + 1) if fasta_file else "."
            alt = [vcf.model._SV(sv_type)]
            qual = "."
            # Columns 7/8 carry genotype and filter (otherfields set above).
            sv_filter = [interval.fields[7]]
            genotype = interval.fields[6]
            sv_format = "GT"
            sample_indexes = [0]
            vcf_record = vcf.model._Record(interval.chrom, interval.start, sv_id, ref, alt,
                                           qual, sv_filter, info, sv_format, sample_indexes)
            vcf_record.samples = vcf_template_reader._parse_samples([genotype], "GT", vcf_record)
            vcf_records.append(vcf_record)

    # Sort records by reference contig order when available, else lexically.
    if contigs:
        vcf_records.sort(key=lambda x: (contigs_order_dict[x.CHROM], x.POS))
    else:
        vcf_records.sort(key=lambda x: (x.CHROM, x.POS))

    resolved_vcf_records = resolve_for_IDP_ITX_CTX(vcf_records, fasta_file)
    for vcf_record in resolved_vcf_records:
        vcf_writer.write_record(vcf_record)
    vcf_writer.close()

    func_logger.info("Tabix compressing and indexing %s" % vcf_out)
    pysam.tabix_index(vcf_out, force=True, preset="vcf")
# NOTE(review): this function is redefined verbatim further down in this file (a
# re-formatted duplicate); at import time the later definition wins. Likely a
# merge/auto-format artifact -- one copy should be removed.
def run_metasv(sample, reference, pindel_vcf=[], pindel_native=[], breakdancer_vcf=[], breakdancer_native=[],
               breakseq_vcf=[], breakseq_native=[], cnvnator_vcf=[], cnvnator_native=[], gatk_vcf=[], gaps=None,
               filter_gaps=False, keep_standard_contigs=False, wiggle=WIGGLE, overlap_ratio=OVERLAP_RATIO,
               workdir="work", outdir="out", boost_ins=False, bam=None, chromosomes=[], num_threads=1, spades=None,
               age=None, disable_assembly=True, minsvlen=MIN_SV_LENGTH, inswiggle=INS_WIGGLE,
               enable_per_tool_output=False, min_support=MIN_SUPPORT, min_support_frac=MIN_SUPPORT_FRAC,
               max_intervals=MAX_INTERVALS, disable_deletion_assembly=False, stop_spades_on_fail=False):
    # NOTE(review): the mutable list defaults are shared across calls; safe as written
    # because they are only read/concatenated, never mutated in this function.
    """Invoke the MetaSV workflow.

    Positional arguments:
    sample -- Sample name
    reference -- Path to a samtools indexed reference FASTA

    Keyword arguments:
    pindel_vcf -- List of Pindel VCFs generated by SVGenotyper
    pindel_native -- List of Pindel native output files
    breakdancer_vcf -- List of BreakDancer VCFs generated by SVGenotyper
    breakdancer_native -- List of BreakDancer native output files
    breakseq_vcf -- List of BreakSeq2 VCFs
    breakseq_native -- List of BreakSeq native GFF outputs
    cnvnator_vcf -- List of CNVnator VCFs generated by cnvnator2VCF.pl
    cnvnator_native -- List of CNVnator native output files
    gatk_vcf -- List of Indel VCFs generated by GATK's HaplotypeCaller
    gaps -- Gaps BED file
    filter_gaps -- Flag to filter out SVs overlapping gaps (default False)
    keep_standard_contigs -- Flag to only generate SVs for the major contigs 1, 2, ..., 22, X, Y, MT (default False)
    wiggle -- Wiggle for SV interval comparison (default 100)
    overlap_ratio -- Reciprocal overlap ratio for SV interval comparison (default 0.5)
    workdir -- Scratch directory for MetaSV (default "work")
    outdir -- Output directory for MetaSV (default "out")
    boost_ins -- Enable MetaSV's soft-clip based insertion detection (default False)
    bam -- Alignment BAM for assembly and insertion detection (default None)
    chromosomes -- If specified, indicates the list of chromosomes to process (default [])
    num_threads -- Number of worker threads to use for assembly steps (default 1)
    spades -- Path for the SPAdes executable (default None)
    age -- Path for the AGE executable (default None)
    disable_assembly -- Flag to disable assembly (default True)
    minsvlen -- Minimum SV length; shorter intervals are dropped (default MIN_SV_LENGTH)
    inswiggle -- Wiggle for insertions; max(inswiggle, wiggle) is applied to INS intervals
    enable_per_tool_output -- Flag to also output merged calls for each tool (default False)
    min_support -- Forwarded to parallel_generate_sc_intervals for insertion boosting
    min_support_frac -- Forwarded to parallel_generate_sc_intervals for insertion boosting
    max_intervals -- Forwarded to parallel_generate_sc_intervals for insertion boosting
    disable_deletion_assembly -- Forwarded to run_spades_parallel (default False)
    stop_spades_on_fail -- Forwarded to run_spades_parallel as stop_on_fail (default False)
    """
    # Check if there is work to do
    if not (
            pindel_vcf + breakdancer_vcf + breakseq_vcf + cnvnator_vcf + pindel_native + breakdancer_native + breakseq_native + cnvnator_native):
        logger.error("Nothing to do since no SV file specified")
        return 1

    # Create the directories for working
    bedtools_tmpdir = os.path.join(workdir, "bedtools")
    create_dirs([workdir, outdir, bedtools_tmpdir])

    # Reference handling: a samtools .fai index is required
    if not os.path.isfile(reference + ".fai"):
        logger.error("Reference file %s is not indexed" % (reference))
        return 1

    fasta_handle = pysam.Fastafile(reference) if os.path.isfile(reference) else None
    contigs = get_contigs(reference)
    # One whole-contig "include" interval per contig, used when loading VCFs below
    include_intervals = sorted(
        [SVInterval(contig.name, 0, contig.length, contig.name, "include", length=contig.length) for contig in
         contigs])

    # Generate the list of contigs to process
    contig_whitelist = set(chromosomes) if chromosomes else set([contig.name for contig in contigs])
    if keep_standard_contigs:
        # Accept both plain ("1", "X") and "chr"-prefixed naming conventions
        contig_whitelist &= set(
            [str(i) for i in xrange(1, 23)] + ["chr%d" % (i) for i in xrange(1, 23)] + ["X", "Y", "MT", "chrX", "chrY",
                                                                                        "chrM"])
    logger.info("Only SVs on the following contigs will be reported: %s" % (sorted(list(contig_whitelist))))

    # Load the intervals from different files
    vcf_name_list = [("CNVnator", cnvnator_vcf), ("Pindel", pindel_vcf), ("BreakDancer", breakdancer_vcf),
                     ("BreakSeq", breakseq_vcf), ("HaplotypeCaller", gatk_vcf)]
    native_name_list = [("CNVnator", cnvnator_native, CNVnatorReader), ("Pindel", pindel_native, PindelReader),
                        ("BreakSeq", breakseq_native, BreakSeqReader),
                        ("BreakDancer", breakdancer_native, BreakDancerReader)]

    tools = []
    # tool name -> {sv_type: [SVInterval, ...]}
    intervals = {}
    sv_types = set()

    gap_intervals = []
    if filter_gaps:
        if not gaps:
            gaps = get_gaps_file(contig_whitelist)
        gap_intervals = sorted(load_gap_intervals(gaps))

    # Handles native input
    logger.info("Load native files")
    for toolname, nativename, svReader in native_name_list:
        # If no native file is given, ignore the tool
        if not nativename:
            continue

        tools.append(toolname)
        intervals[toolname] = defaultdict(list)

        for native_file in nativename:
            for record in svReader(native_file):
                interval = record.to_sv_interval()

                if not interval:
                    # This is the case for SVs we want to skip
                    continue
                # Keep only whitelisted contigs, and drop gap-overlapping calls when filtering
                if not interval_overlaps_interval_list(interval, gap_intervals) and interval.chrom in contig_whitelist:
                    # Check length
                    if interval.length < minsvlen:
                        continue

                    # Set wiggle: insertions get the larger of the two wiggle values
                    if interval.sv_type == "INS":
                        interval.wiggle = max(inswiggle, wiggle)
                    else:
                        interval.wiggle = wiggle

                    intervals[toolname][interval.sv_type].append(interval)

        sv_types |= set(intervals[toolname].keys())

    # Handles the VCF input cases, we will just deal with these cases
    logger.info("Load VCF files")
    for toolname, vcfname in vcf_name_list:
        # If no VCF is given, ignore the tool
        if not vcfname:
            continue

        tools.append(toolname)
        intervals[toolname] = {}

        # A directory argument expands to its per-chromosome <contig>.vcf.gz files
        vcf_list = []
        for vcffile in vcfname:
            if os.path.isdir(vcffile):
                logger.info("Will load from per-chromosome VCFs from directory %s for tool %s" % (vcffile, toolname))
                vcf_list += [os.path.join(vcffile, "%s.vcf.gz" % contig.name) for contig in contigs if
                             (not contig_whitelist or contig.name in contig_whitelist)]
            else:
                vcf_list.append(vcffile)

        for vcffile in vcf_list:
            load_intervals(vcffile, intervals[toolname], gap_intervals, include_intervals, toolname, contig_whitelist,
                           minsvlen=minsvlen, wiggle=wiggle, inswiggle=inswiggle)
        sv_types |= set(intervals[toolname].keys())

    logger.info("SV types are %s" % (str(sv_types)))

    tool_merged_intervals = {}
    final_intervals = []

    bd_out = os.path.join(outdir, "breakdancer.vcf")
    pindel_out = os.path.join(outdir, "pindel.vcf")
    cnvnator_out = os.path.join(outdir, "cnvnator.vcf")
    breakseq_out = os.path.join(outdir, "breakseq.vcf")

    vcf_out_list = [("BreakDancer", bd_out), ("Pindel", pindel_out), ("CNVnator", cnvnator_out),
                    ("BreakSeq", breakseq_out)]

    # This will just output per-tool VCFs, no intra-tool merging is done yet
    if enable_per_tool_output:
        logger.info("Output per-tool VCFs")
        for toolname, tool_out in vcf_out_list:
            if tool_out is None or toolname not in intervals:
                continue

            logger.info("Outputting single tool VCF for %s" % (str(toolname)))
            vcf_template_reader = vcf.Reader(open(os.path.join(mydir, "resources/template.vcf"), "r"))
            vcf_template_reader.samples = [sample]

            intervals_tool = []
            tool_out_fd = open(tool_out, "w")
            vcf_writer = vcf.Writer(tool_out_fd, vcf_template_reader)
            chr_intervals_tool = {contig.name: [] for contig in contigs}
            # Deep-copy so per-tool validation/fix-ups don't affect the merge below
            for sv_type in sv_types:
                if sv_type in intervals[toolname]:
                    intervals_tool.extend([copy.deepcopy(interval) for interval in intervals[toolname][sv_type]])
            for interval in intervals_tool:
                # Marghoob says that this is just to fill-in some metadata
                interval.do_validation(overlap_ratio)

                interval.fix_pos()
                chr_intervals_tool[interval.chrom].append(interval)

            for contig in contigs:
                chr_intervals_tool[contig.name].sort()
                for interval in chr_intervals_tool[contig.name]:
                    vcf_record = interval.to_vcf_record(fasta_handle, sample)
                    if vcf_record is not None:
                        vcf_writer.write_record(vcf_record)
            tool_out_fd.close()
            vcf_writer.close()
            logger.info("Indexing single tool VCF for %s" % (str(toolname)))
            pysam.tabix_index(tool_out, force=True, preset="vcf")

    # Do merging here
    logger.info("Do merging")
    for sv_type in sv_types:
        logger.info("Processing SVs of type %s" % sv_type)
        tool_merged_intervals[sv_type] = []

        # Do the intra-tool merging
        logger.info("Intra-tool Merging SVs of type %s" % sv_type)
        for tool in tools:
            logger.debug("Is %s in tool keys? %s" % (sv_type, str(intervals[tool].keys())))
            if sv_type not in intervals[tool]:
                logger.debug("%s not in tool %s" % (sv_type, tool))
                continue
            logger.info("First level merging for %s for tool %s" % (sv_type, tool))
            tool_merged_intervals[sv_type] += merge_intervals(intervals[tool][sv_type])

        # Do the inter-tool merging
        logger.info("Inter-tool Merging SVs of type %s" % sv_type)
        merged_intervals = merge_intervals(tool_merged_intervals[sv_type])

        # Intervals which overlap well with merged_intervals
        intervals1 = []
        # Intervals which do not overlap well with merged_intervals.
        # Used to filter out small intervals which got merged with large intervals
        intervals2 = []
        logger.info("Checking overlaps SVs of type %s" % sv_type)
        for interval in tool_merged_intervals[sv_type]:
            if interval_overlaps_interval_list(interval, merged_intervals, overlap_ratio, overlap_ratio):
                intervals2.append(interval)
            else:
                intervals1.append(interval)
        final_intervals.extend(merge_intervals(intervals1) + merge_intervals(intervals2))

    final_chr_intervals = {contig.name: [] for contig in contigs}
    for interval in final_intervals:
        interval.do_validation(overlap_ratio)
        interval.fix_pos()
        final_chr_intervals[interval.chrom].append(interval)

    # This is the merged VCF without assembly, ok for deletions at this point
    logger.info("Output merged VCF without assembly ")
    vcf_template_reader = vcf.Reader(open(os.path.join(mydir, "resources/template.vcf"), "r"))
    vcf_template_reader.samples = [sample]
    preasm_vcf = os.path.join(workdir, "pre_asm.vcf")
    vcf_fd = open(preasm_vcf, "w")
    vcf_writer = vcf.Writer(vcf_fd, vcf_template_reader)

    # key: (sv_type, PASS/LowQual, PRECISE/IMPRECISE, sources) -> count, for the summary log
    final_stats = {}

    bed_intervals = []
    merged_bed = os.path.join(workdir, "metasv.bed")
    for contig in contigs:
        final_chr_intervals[contig.name].sort()
        for interval in final_chr_intervals[contig.name]:
            vcf_record = interval.to_vcf_record(fasta_handle)
            if vcf_record is not None:
                key = (interval.sv_type, "PASS" if interval.is_validated else "LowQual",
                       "PRECISE" if interval.is_precise else "IMPRECISE", tuple(sorted(list(interval.sources))))
                if key not in final_stats:
                    final_stats[key] = 0
                final_stats[key] += 1
                vcf_writer.write_record(vcf_record)
            bed_interval = interval.to_bed_interval(sample)
            if bed_interval is not None:
                bed_intervals.append(bed_interval)

    # Also save a BED file representation of the merged variants without assembly
    pybedtools.BedTool(bed_intervals).saveas(merged_bed)
    vcf_fd.close()
    vcf_writer.close()

    for key in sorted(final_stats.keys()):
        logger.info(str(key) + ":" + str(final_stats[key]))

    final_vcf = os.path.join(outdir, "variants.vcf")

    # Run assembly here
    if not disable_assembly:
        logger.info("Running assembly")
        if spades is None:
            logger.error("Spades executable not specified")
            return 1
        if age is None:
            logger.error("AGE executable not specified")
            return 1

        spades_tmpdir = os.path.join(workdir, "spades")
        age_tmpdir = os.path.join(workdir, "age")

        create_dirs([spades_tmpdir, age_tmpdir])

        assembly_bed = merged_bed

        # this does the improved assembly location finder with softclipped reads
        if boost_ins:
            logger.info("Generating intervals for insertions")
            assembly_bed = parallel_generate_sc_intervals([bam.name], list(contig_whitelist), merged_bed, workdir,
                                                          num_threads=num_threads, min_support=min_support,
                                                          min_support_frac=min_support_frac,
                                                          max_intervals=max_intervals)
            logger.info("Generated intervals for assembly in %s" % assembly_bed)

        logger.info("Will run assembly now")

        # SPAdes local assembly, then AGE breakpoint refinement on the assembled contigs
        assembled_fasta, ignored_bed = run_spades_parallel(bam=bam.name, spades=spades, bed=assembly_bed,
                                                           work=spades_tmpdir, pad=SPADES_PAD, nthreads=num_threads,
                                                           chrs=list(contig_whitelist),
                                                           disable_deletion_assembly=disable_deletion_assembly,
                                                           stop_on_fail=stop_spades_on_fail)
        breakpoints_bed = run_age_parallel(intervals_bed=assembly_bed, reference=reference, assembly=assembled_fasta,
                                           pad=AGE_PAD, age=age, chrs=list(contig_whitelist), nthreads=num_threads,
                                           min_contig_len=AGE_MIN_CONTIG_LENGTH, age_workdir=age_tmpdir)

        final_bed = os.path.join(workdir, "final.bed")
        # Fold intervals skipped by assembly back in alongside the refined breakpoints
        if ignored_bed:
            pybedtools.BedTool(breakpoints_bed) \
                .cat(pybedtools.BedTool(ignored_bed), postmerge=False) \
                .sort().saveas(final_bed)
        else:
            pybedtools.BedTool(breakpoints_bed).saveas(final_bed)

        logger.info("Output final VCF file")

        convert_metasv_bed_to_vcf(bedfile=final_bed, vcf_out=final_vcf, sample=sample, pass_calls=False)
    else:
        # No assembly: the pre-assembly merged VCF is the final output
        shutil.copy(preasm_vcf, final_vcf)
        pysam.tabix_index(final_vcf, force=True, preset="vcf")

    logger.info("Clean up pybedtools")
    pybedtools.cleanup(remove_all=True)

    logger.info("All Done!")
# NOTE(review): this is a token-for-token duplicate of the run_metasv defined
# earlier in this file, only re-wrapped by an auto-formatter. Because it appears
# later, THIS definition is the one bound at import time. Likely a merge
# artifact -- one of the two copies should be removed.
def run_metasv(sample, reference, pindel_vcf=[], pindel_native=[], breakdancer_vcf=[], breakdancer_native=[],
               breakseq_vcf=[], breakseq_native=[], cnvnator_vcf=[], cnvnator_native=[], gatk_vcf=[], gaps=None,
               filter_gaps=False, keep_standard_contigs=False, wiggle=WIGGLE, overlap_ratio=OVERLAP_RATIO,
               workdir="work", outdir="out", boost_ins=False, bam=None, chromosomes=[], num_threads=1, spades=None,
               age=None, disable_assembly=True, minsvlen=MIN_SV_LENGTH, inswiggle=INS_WIGGLE,
               enable_per_tool_output=False, min_support=MIN_SUPPORT, min_support_frac=MIN_SUPPORT_FRAC,
               max_intervals=MAX_INTERVALS, disable_deletion_assembly=False, stop_spades_on_fail=False):
    # NOTE(review): the mutable list defaults are shared across calls; safe as written
    # because they are only read/concatenated, never mutated in this function.
    """Invoke the MetaSV workflow.

    Positional arguments:
    sample -- Sample name
    reference -- Path to a samtools indexed reference FASTA

    Keyword arguments:
    pindel_vcf -- List of Pindel VCFs generated by SVGenotyper
    pindel_native -- List of Pindel native output files
    breakdancer_vcf -- List of BreakDancer VCFs generated by SVGenotyper
    breakdancer_native -- List of BreakDancer native output files
    breakseq_vcf -- List of BreakSeq2 VCFs
    breakseq_native -- List of BreakSeq native GFF outputs
    cnvnator_vcf -- List of CNVnator VCFs generated by cnvnator2VCF.pl
    cnvnator_native -- List of CNVnator native output files
    gatk_vcf -- List of Indel VCFs generated by GATK's HaplotypeCaller
    gaps -- Gaps BED file
    filter_gaps -- Flag to filter out SVs overlapping gaps (default False)
    keep_standard_contigs -- Flag to only generate SVs for the major contigs 1, 2, ..., 22, X, Y, MT (default False)
    wiggle -- Wiggle for SV interval comparison (default 100)
    overlap_ratio -- Reciprocal overlap ratio for SV interval comparison (default 0.5)
    workdir -- Scratch directory for MetaSV (default "work")
    outdir -- Output directory for MetaSV (default "out")
    boost_ins -- Enable MetaSV's soft-clip based insertion detection (default False)
    bam -- Alignment BAM for assembly and insertion detection (default None)
    chromosomes -- If specified, indicates the list of chromosomes to process (default [])
    num_threads -- Number of worker threads to use for assembly steps (default 1)
    spades -- Path for the SPAdes executable (default None)
    age -- Path for the AGE executable (default None)
    disable_assembly -- Flag to disable assembly (default True)
    minsvlen -- Minimum SV length; shorter intervals are dropped (default MIN_SV_LENGTH)
    inswiggle -- Wiggle for insertions; max(inswiggle, wiggle) is applied to INS intervals
    enable_per_tool_output -- Flag to also output merged calls for each tool (default False)
    min_support -- Forwarded to parallel_generate_sc_intervals for insertion boosting
    min_support_frac -- Forwarded to parallel_generate_sc_intervals for insertion boosting
    max_intervals -- Forwarded to parallel_generate_sc_intervals for insertion boosting
    disable_deletion_assembly -- Forwarded to run_spades_parallel (default False)
    stop_spades_on_fail -- Forwarded to run_spades_parallel as stop_on_fail (default False)
    """
    # Check if there is work to do
    if not (pindel_vcf + breakdancer_vcf + breakseq_vcf + cnvnator_vcf + pindel_native + breakdancer_native +
            breakseq_native + cnvnator_native):
        logger.error("Nothing to do since no SV file specified")
        return 1

    # Create the directories for working
    bedtools_tmpdir = os.path.join(workdir, "bedtools")
    create_dirs([workdir, outdir, bedtools_tmpdir])

    # Reference handling: a samtools .fai index is required
    if not os.path.isfile(reference + ".fai"):
        logger.error("Reference file %s is not indexed" % (reference))
        return 1

    fasta_handle = pysam.Fastafile(reference) if os.path.isfile(reference) else None
    contigs = get_contigs(reference)
    # One whole-contig "include" interval per contig, used when loading VCFs below
    include_intervals = sorted([
        SVInterval(contig.name, 0, contig.length, contig.name, "include", length=contig.length) for contig in contigs
    ])

    # Generate the list of contigs to process
    contig_whitelist = set(chromosomes) if chromosomes else set([contig.name for contig in contigs])
    if keep_standard_contigs:
        # Accept both plain ("1", "X") and "chr"-prefixed naming conventions
        contig_whitelist &= set([str(i) for i in xrange(1, 23)] + ["chr%d" % (i) for i in xrange(1, 23)] +
                                ["X", "Y", "MT", "chrX", "chrY", "chrM"])
    logger.info("Only SVs on the following contigs will be reported: %s" % (sorted(list(contig_whitelist))))

    # Load the intervals from different files
    vcf_name_list = [("CNVnator", cnvnator_vcf), ("Pindel", pindel_vcf), ("BreakDancer", breakdancer_vcf),
                     ("BreakSeq", breakseq_vcf), ("HaplotypeCaller", gatk_vcf)]
    native_name_list = [("CNVnator", cnvnator_native, CNVnatorReader), ("Pindel", pindel_native, PindelReader),
                        ("BreakSeq", breakseq_native, BreakSeqReader),
                        ("BreakDancer", breakdancer_native, BreakDancerReader)]

    tools = []
    # tool name -> {sv_type: [SVInterval, ...]}
    intervals = {}
    sv_types = set()

    gap_intervals = []
    if filter_gaps:
        if not gaps:
            gaps = get_gaps_file(contig_whitelist)
        gap_intervals = sorted(load_gap_intervals(gaps))

    # Handles native input
    logger.info("Load native files")
    for toolname, nativename, svReader in native_name_list:
        # If no native file is given, ignore the tool
        if not nativename:
            continue

        tools.append(toolname)
        intervals[toolname] = defaultdict(list)

        for native_file in nativename:
            for record in svReader(native_file):
                interval = record.to_sv_interval()

                if not interval:
                    # This is the case for SVs we want to skip
                    continue
                # Keep only whitelisted contigs, and drop gap-overlapping calls when filtering
                if not interval_overlaps_interval_list(interval, gap_intervals) and interval.chrom in contig_whitelist:
                    # Check length
                    if interval.length < minsvlen:
                        continue

                    # Set wiggle: insertions get the larger of the two wiggle values
                    if interval.sv_type == "INS":
                        interval.wiggle = max(inswiggle, wiggle)
                    else:
                        interval.wiggle = wiggle

                    intervals[toolname][interval.sv_type].append(interval)

        sv_types |= set(intervals[toolname].keys())

    # Handles the VCF input cases, we will just deal with these cases
    logger.info("Load VCF files")
    for toolname, vcfname in vcf_name_list:
        # If no VCF is given, ignore the tool
        if not vcfname:
            continue

        tools.append(toolname)
        intervals[toolname] = {}

        # A directory argument expands to its per-chromosome <contig>.vcf.gz files
        vcf_list = []
        for vcffile in vcfname:
            if os.path.isdir(vcffile):
                logger.info("Will load from per-chromosome VCFs from directory %s for tool %s" % (vcffile, toolname))
                vcf_list += [os.path.join(vcffile, "%s.vcf.gz" % contig.name) for contig in contigs
                             if (not contig_whitelist or contig.name in contig_whitelist)]
            else:
                vcf_list.append(vcffile)

        for vcffile in vcf_list:
            load_intervals(vcffile, intervals[toolname], gap_intervals, include_intervals, toolname, contig_whitelist,
                           minsvlen=minsvlen, wiggle=wiggle, inswiggle=inswiggle)
        sv_types |= set(intervals[toolname].keys())

    logger.info("SV types are %s" % (str(sv_types)))

    tool_merged_intervals = {}
    final_intervals = []

    bd_out = os.path.join(outdir, "breakdancer.vcf")
    pindel_out = os.path.join(outdir, "pindel.vcf")
    cnvnator_out = os.path.join(outdir, "cnvnator.vcf")
    breakseq_out = os.path.join(outdir, "breakseq.vcf")

    vcf_out_list = [("BreakDancer", bd_out), ("Pindel", pindel_out), ("CNVnator", cnvnator_out),
                    ("BreakSeq", breakseq_out)]

    # This will just output per-tool VCFs, no intra-tool merging is done yet
    if enable_per_tool_output:
        logger.info("Output per-tool VCFs")
        for toolname, tool_out in vcf_out_list:
            if tool_out is None or toolname not in intervals:
                continue

            logger.info("Outputting single tool VCF for %s" % (str(toolname)))
            vcf_template_reader = vcf.Reader(open(os.path.join(mydir, "resources/template.vcf"), "r"))
            vcf_template_reader.samples = [sample]

            intervals_tool = []
            tool_out_fd = open(tool_out, "w")
            vcf_writer = vcf.Writer(tool_out_fd, vcf_template_reader)
            chr_intervals_tool = {contig.name: [] for contig in contigs}
            # Deep-copy so per-tool validation/fix-ups don't affect the merge below
            for sv_type in sv_types:
                if sv_type in intervals[toolname]:
                    intervals_tool.extend([copy.deepcopy(interval) for interval in intervals[toolname][sv_type]])
            for interval in intervals_tool:
                # Marghoob says that this is just to fill-in some metadata
                interval.do_validation(overlap_ratio)

                interval.fix_pos()
                chr_intervals_tool[interval.chrom].append(interval)

            for contig in contigs:
                chr_intervals_tool[contig.name].sort()
                for interval in chr_intervals_tool[contig.name]:
                    vcf_record = interval.to_vcf_record(fasta_handle, sample)
                    if vcf_record is not None:
                        vcf_writer.write_record(vcf_record)
            tool_out_fd.close()
            vcf_writer.close()
            logger.info("Indexing single tool VCF for %s" % (str(toolname)))
            pysam.tabix_index(tool_out, force=True, preset="vcf")

    # Do merging here
    logger.info("Do merging")
    for sv_type in sv_types:
        logger.info("Processing SVs of type %s" % sv_type)
        tool_merged_intervals[sv_type] = []

        # Do the intra-tool merging
        logger.info("Intra-tool Merging SVs of type %s" % sv_type)
        for tool in tools:
            logger.debug("Is %s in tool keys? %s" % (sv_type, str(intervals[tool].keys())))
            if sv_type not in intervals[tool]:
                logger.debug("%s not in tool %s" % (sv_type, tool))
                continue
            logger.info("First level merging for %s for tool %s" % (sv_type, tool))
            tool_merged_intervals[sv_type] += merge_intervals(intervals[tool][sv_type])

        # Do the inter-tool merging
        logger.info("Inter-tool Merging SVs of type %s" % sv_type)
        merged_intervals = merge_intervals(tool_merged_intervals[sv_type])

        # Intervals which overlap well with merged_intervals
        intervals1 = []
        # Intervals which do not overlap well with merged_intervals.
        # Used to filter out small intervals which got merged with large intervals
        intervals2 = []
        logger.info("Checking overlaps SVs of type %s" % sv_type)
        for interval in tool_merged_intervals[sv_type]:
            if interval_overlaps_interval_list(interval, merged_intervals, overlap_ratio, overlap_ratio):
                intervals2.append(interval)
            else:
                intervals1.append(interval)
        final_intervals.extend(merge_intervals(intervals1) + merge_intervals(intervals2))

    final_chr_intervals = {contig.name: [] for contig in contigs}
    for interval in final_intervals:
        interval.do_validation(overlap_ratio)
        interval.fix_pos()
        final_chr_intervals[interval.chrom].append(interval)

    # This is the merged VCF without assembly, ok for deletions at this point
    logger.info("Output merged VCF without assembly ")
    vcf_template_reader = vcf.Reader(open(os.path.join(mydir, "resources/template.vcf"), "r"))
    vcf_template_reader.samples = [sample]
    preasm_vcf = os.path.join(workdir, "pre_asm.vcf")
    vcf_fd = open(preasm_vcf, "w")
    vcf_writer = vcf.Writer(vcf_fd, vcf_template_reader)

    # key: (sv_type, PASS/LowQual, PRECISE/IMPRECISE, sources) -> count, for the summary log
    final_stats = {}

    bed_intervals = []
    merged_bed = os.path.join(workdir, "metasv.bed")
    for contig in contigs:
        final_chr_intervals[contig.name].sort()
        for interval in final_chr_intervals[contig.name]:
            vcf_record = interval.to_vcf_record(fasta_handle)
            if vcf_record is not None:
                key = (interval.sv_type, "PASS" if interval.is_validated else "LowQual",
                       "PRECISE" if interval.is_precise else "IMPRECISE", tuple(sorted(list(interval.sources))))
                if key not in final_stats:
                    final_stats[key] = 0
                final_stats[key] += 1
                vcf_writer.write_record(vcf_record)
            bed_interval = interval.to_bed_interval(sample)
            if bed_interval is not None:
                bed_intervals.append(bed_interval)

    # Also save a BED file representation of the merged variants without assembly
    pybedtools.BedTool(bed_intervals).saveas(merged_bed)
    vcf_fd.close()
    vcf_writer.close()

    for key in sorted(final_stats.keys()):
        logger.info(str(key) + ":" + str(final_stats[key]))

    final_vcf = os.path.join(outdir, "variants.vcf")

    # Run assembly here
    if not disable_assembly:
        logger.info("Running assembly")
        if spades is None:
            logger.error("Spades executable not specified")
            return 1
        if age is None:
            logger.error("AGE executable not specified")
            return 1

        spades_tmpdir = os.path.join(workdir, "spades")
        age_tmpdir = os.path.join(workdir, "age")

        create_dirs([spades_tmpdir, age_tmpdir])

        assembly_bed = merged_bed

        # this does the improved assembly location finder with softclipped reads
        if boost_ins:
            logger.info("Generating intervals for insertions")
            assembly_bed = parallel_generate_sc_intervals([bam.name], list(contig_whitelist), merged_bed, workdir,
                                                          num_threads=num_threads, min_support=min_support,
                                                          min_support_frac=min_support_frac,
                                                          max_intervals=max_intervals)
            logger.info("Generated intervals for assembly in %s" % assembly_bed)

        logger.info("Will run assembly now")

        # SPAdes local assembly, then AGE breakpoint refinement on the assembled contigs
        assembled_fasta, ignored_bed = run_spades_parallel(bam=bam.name, spades=spades, bed=assembly_bed,
                                                           work=spades_tmpdir, pad=SPADES_PAD, nthreads=num_threads,
                                                           chrs=list(contig_whitelist),
                                                           disable_deletion_assembly=disable_deletion_assembly,
                                                           stop_on_fail=stop_spades_on_fail)
        breakpoints_bed = run_age_parallel(intervals_bed=assembly_bed, reference=reference, assembly=assembled_fasta,
                                           pad=AGE_PAD, age=age, chrs=list(contig_whitelist), nthreads=num_threads,
                                           min_contig_len=AGE_MIN_CONTIG_LENGTH, age_workdir=age_tmpdir)

        final_bed = os.path.join(workdir, "final.bed")
        # Fold intervals skipped by assembly back in alongside the refined breakpoints
        if ignored_bed:
            pybedtools.BedTool(breakpoints_bed) \
                .cat(pybedtools.BedTool(ignored_bed), postmerge=False) \
                .sort().saveas(final_bed)
        else:
            pybedtools.BedTool(breakpoints_bed).saveas(final_bed)

        logger.info("Output final VCF file")

        convert_metasv_bed_to_vcf(bedfile=final_bed, vcf_out=final_vcf, sample=sample, pass_calls=False)
    else:
        # No assembly: the pre-assembly merged VCF is the final output
        shutil.copy(preasm_vcf, final_vcf)
        pysam.tabix_index(final_vcf, force=True, preset="vcf")

    logger.info("Clean up pybedtools")
    pybedtools.cleanup(remove_all=True)

    logger.info("All Done!")