# Code example #1 (score: 0) — File: main.py, Project: chapmanb/metasv
def run_metasv(args):
    """Run the full MetaSV SV-merging pipeline.

    Loads SV calls from per-tool VCF and native-format files, filters them
    (gaps, contig whitelist, length), merges intra-tool then inter-tool,
    writes a pre-assembly merged VCF/BED, and optionally refines breakpoints
    via SPAdes local assembly + AGE alignment and genotyping.

    Args:
        args: argparse.Namespace carrying all command-line options (input
            file lists per tool, reference, workdir/outdir, assembly and
            filtering parameters).

    Returns:
        os.EX_OK on success, os.EX_USAGE for missing executables, or 1 when
        the reference FASTA is not indexed.
    """
    logger.info("Running MetaSV %s" % __version__)
    logger.info("Arguments are " + str(args))

    # Check if there is work to do. Every *_vcf/*_native argument is a list,
    # so concatenating them all is empty only when no SV input was given.
    # BUG FIX: the original had a comma before args.wham_vcf, which built a
    # 2-tuple that is always truthy, so this warning could never fire.
    if not (args.pindel_vcf + args.breakdancer_vcf + args.breakseq_vcf + args.cnvnator_vcf +
            args.pindel_native + args.breakdancer_native + args.breakseq_native + args.cnvnator_native +
            args.manta_vcf + args.lumpy_vcf + args.cnvkit_vcf + args.wham_vcf):
        logger.warning("Nothing to merge since no SV file specified")

    # Simple check for arguments: assembly needs both SPAdes and AGE.
    if not args.disable_assembly:
        if not args.spades:
            logger.error("Spades executable not specified")
            return os.EX_USAGE

        if not args.age:
            logger.error("AGE executable not specified")
            return os.EX_USAGE

    # Create the directories for working
    bedtools_tmpdir = os.path.join(args.workdir, "bedtools")
    create_dirs([args.workdir, args.outdir, bedtools_tmpdir])

    # Reference handling: a .fai index is required downstream (pysam/AGE).
    if not os.path.isfile(args.reference + ".fai"):
        logger.error("Reference file %s is not indexed" % (args.reference))
        return 1

    fasta_handle = pysam.Fastafile(args.reference) if os.path.isfile(args.reference) else None
    contigs = get_contigs(args.reference)
    include_intervals = sorted(
        [SVInterval(contig.name, 0, contig.length, contig.name, "include", length=contig.length) for contig in contigs])

    # Generate the list of contigs to process
    contig_whitelist = set(args.chromosomes) if args.chromosomes else set([contig.name for contig in contigs])
    if args.keep_standard_contigs:
        # Restrict to 1-22/X/Y/MT with and without the "chr" prefix.
        contig_whitelist &= set(
            [str(i) for i in xrange(1, 23)] + ["chr%d" % (i) for i in xrange(1, 23)] + ["X", "Y", "MT", "chrX", "chrY",
                                                                                        "chrM"])
    logger.info("Only SVs on the following contigs will be reported: %s" % (sorted(list(contig_whitelist))))

    # Load the intervals from different files
    vcf_name_list = [("CNVnator", args.cnvnator_vcf), ("Pindel", args.pindel_vcf),
                     ("BreakDancer", args.breakdancer_vcf),
                     ("BreakSeq", args.breakseq_vcf), ("HaplotypeCaller", args.gatk_vcf),
                     ("Lumpy", args.lumpy_vcf), ("Manta", args.manta_vcf), ("CNVkit", args.cnvkit_vcf),
                     ("WHAM", args.wham_vcf)]
    native_name_list = [("CNVnator", args.cnvnator_native, CNVnatorReader),
                        ("Pindel", args.pindel_native, PindelReader),
                        ("BreakSeq", args.breakseq_native, BreakSeqReader),
                        ("BreakDancer", args.breakdancer_native, BreakDancerReader)]

    tools = []
    intervals = {}
    sv_types = set()

    gap_intervals = []
    if args.filter_gaps:
        gaps = args.gaps if args.gaps else get_gaps_file(contig_whitelist)
        gap_intervals = sorted(load_gap_intervals(gaps))

    # Handles native input
    logger.info("Load native files")
    for toolname, nativename, svReader in native_name_list:
        # If no native file is given, ignore the tool
        if not nativename: continue

        tools.append(toolname)
        intervals[toolname] = defaultdict(list)

        for native_file in nativename:
            for record in svReader(native_file, svs_to_report=args.svs_to_report):
                interval = record.to_sv_interval()
                if not interval:
                    # This is the case for SVs we want to skip
                    continue
                # BUG FIX: the None check above must run before the
                # BreakDancer filter below; the original dereferenced
                # interval.sv_type before checking interval for None.
                BD_min_inv_len = args.mean_read_length + 4 * args.isize_sd
                if toolname == "BreakDancer" and interval.sv_type == "INV" and abs(interval.length) < BD_min_inv_len:
                    # Filter BreakDancer artifact INVs with size < readlength+4*isize_sd
                    continue

                if not interval_overlaps_interval_list(interval, gap_intervals) and interval.chrom in contig_whitelist:

                    # Check length (translocations are exempt from the minimum)
                    if interval.length < args.minsvlen and interval.sv_type not in ["ITX", "CTX"]:
                        continue

                    # Set wiggle (merge slack); insertions get the larger inswiggle
                    if interval.sv_type not in ["ITX", "CTX"]:
                        interval.wiggle = max(args.inswiggle if interval.sv_type == "INS" else 0, args.wiggle)
                    else:
                        interval.wiggle = TX_WIGGLE

                    intervals[toolname][interval.sv_type].append(interval)
        sv_types |= set(intervals[toolname].keys())

    # Handles the VCF input cases, we will just deal with these cases
    logger.info("Load VCF files")
    for toolname, vcfname in vcf_name_list:
        # If no VCF is given, ignore the tool
        if not vcfname:
            continue

        tools.append(toolname)
        intervals[toolname] = {}

        vcf_list = []
        for vcffile in vcfname:
            if os.path.isdir(vcffile):
                # A directory means per-chromosome VCFs named <contig>.vcf.gz
                logger.info("Will load from per-chromosome VCFs from directory %s for tool %s" % (vcffile, toolname))
                vcf_list += [os.path.join(vcffile, "%s.vcf.gz" % contig.name) for contig in contigs if
                             (not contig_whitelist or contig.name in contig_whitelist)]
            else:
                vcf_list.append(vcffile)

        for vcffile in vcf_list:
            load_intervals(vcffile, intervals[toolname], gap_intervals, include_intervals, toolname, contig_whitelist,
                           minsvlen=args.minsvlen, wiggle=args.wiggle, inswiggle=args.inswiggle,
                           svs_to_report=args.svs_to_report, maxsvlen=args.maxsvlen)
        sv_types |= set(intervals[toolname].keys())

    logger.info("SV types are %s" % (str(sv_types)))
    tool_merged_intervals = {}
    final_intervals = []

    # This will just output per-tool VCFs, no intra-tool merging is done yet
    if args.enable_per_tool_output:
        logger.info("Output per-tool VCFs")
        for toolname in intervals:
            tool_out = os.path.join(args.outdir, "%s.vcf" % (toolname.lower()))

            logger.info("Outputting single tool VCF for %s" % (str(toolname)))
            vcf_template_reader = vcf.Reader(open(os.path.join(mydir, "resources/template.vcf"), "r"))
            vcf_template_reader.samples = [args.sample]

            intervals_tool = []
            tool_out_fd = open(tool_out, "w")
            vcf_writer = vcf.Writer(tool_out_fd, vcf_template_reader)
            chr_intervals_tool = {contig.name: [] for contig in contigs}
            for sv_type in sv_types:
                if sv_type in intervals[toolname]:
                    # Deep-copy so validation/fix_pos below doesn't mutate the
                    # intervals that are merged later.
                    intervals_tool.extend([copy.deepcopy(interval) for interval in intervals[toolname][sv_type]])
            for interval in intervals_tool:
                # Marghoob says that this is just to fill-in some metadata
                interval.do_validation(args.overlap_ratio)

                interval.fix_pos()
                chr_intervals_tool[interval.chrom].append(interval)

            for contig in contigs:
                chr_intervals_tool[contig.name].sort()
                for interval in chr_intervals_tool[contig.name]:
                    vcf_record = interval.to_vcf_record(fasta_handle, args.sample)
                    if vcf_record is not None:
                        vcf_writer.write_record(vcf_record)
            tool_out_fd.close()
            vcf_writer.close()
            logger.info("Indexing single tool VCF for %s" % (str(toolname)))
            pysam.tabix_index(tool_out, force=True, preset="vcf")

    # Do merging here
    logger.info("Do merging")
    for sv_type in sv_types:
        logger.info("Processing SVs of type %s" % sv_type)
        tool_merged_intervals[sv_type] = []

        # Do the intra-tool merging
        logger.info("Intra-tool Merging SVs of type %s" % sv_type)
        for tool in tools:
            logger.debug("Is %s in tool keys? %s" % (sv_type, str(intervals[tool].keys())))
            if sv_type not in intervals[tool]:
                logger.debug("%s not in tool %s" % (sv_type, tool))
                continue
            logger.info("First level merging for %s for tool %s" % (sv_type, tool))
            tool_merged_intervals[sv_type] += merge_intervals(intervals[tool][sv_type])

        # Do the inter-tool merging
        logger.info("Inter-tool Merging SVs of type %s" % sv_type)
        final_intervals.extend(merge_intervals_recursively(tool_merged_intervals[sv_type], args.overlap_ratio))

    final_chr_intervals = {contig.name: [] for contig in contigs}
    for interval in final_intervals:
        interval.do_validation(args.overlap_ratio)
        interval.fix_pos()
        # Length bounds apply to everything except translocations.
        if args.minsvlen <= interval.length <= args.maxsvlen or interval.sv_type in ["ITX", "CTX"]:
            final_chr_intervals[interval.chrom].append(interval)

    # This is the merged VCF without assembly, ok for deletions at this point
    logger.info("Output merged VCF without assembly ")
    vcf_template_reader = vcf.Reader(open(os.path.join(mydir, "resources/template.vcf"), "r"))
    vcf_template_reader.samples = [args.sample]
    preasm_vcf = os.path.join(args.workdir, "pre_asm.vcf")
    vcf_fd = open(preasm_vcf, "w")
    vcf_writer = vcf.Writer(vcf_fd, vcf_template_reader)

    # Counts per (sv_type, filter, precision, sources) for the log summary.
    final_stats = {}

    bed_intervals = []
    for contig in contigs:
        final_chr_intervals[contig.name].sort()
        for interval in final_chr_intervals[contig.name]:
            vcf_record = interval.to_vcf_record(fasta_handle)
            if vcf_record is not None:
                key = (interval.sv_type, "PASS" if interval.is_validated else "LowQual",
                       "PRECISE" if interval.is_precise else "IMPRECISE", tuple(sorted(list(interval.sources))))
                if key not in final_stats:
                    final_stats[key] = 0
                final_stats[key] += 1
                vcf_writer.write_record(vcf_record)
            bed_interval = interval.to_bed_interval(args.sample)
            if bed_interval is not None:
                bed_intervals.append(bed_interval)
    vcf_fd.close()
    vcf_writer.close()

    # Also save a BED file representation of the merged variants without assembly
    merged_bed = None
    if bed_intervals:
        merged_bed = os.path.join(args.workdir, "metasv.bed")
        pybedtools.BedTool(bed_intervals).saveas(merged_bed)

    for key in sorted(final_stats.keys()):
        logger.info(str(key) + ":" + str(final_stats[key]))

    final_vcf = os.path.join(args.outdir, "variants.vcf")

    # Run assembly here
    if not args.disable_assembly:
        logger.info("Running assembly")

        spades_tmpdir = os.path.join(args.workdir, "spades")
        age_tmpdir = os.path.join(args.workdir, "age")

        create_dirs([spades_tmpdir, age_tmpdir])

        assembly_bed = merged_bed

        # this does the improved assembly location finder with softclipped reads
        if args.boost_sc:
            logger.info("Generating Soft-Clipping intervals.")
            assembly_bed = parallel_generate_sc_intervals([args.bam.name], list(contig_whitelist), merged_bed,
                                                          args.workdir,
                                                          num_threads=args.num_threads,
                                                          min_support_ins=args.min_support_ins,
                                                          min_support_frac_ins=args.min_support_frac_ins,
                                                          max_intervals=args.max_ins_intervals, min_mapq=args.min_mapq,
                                                          min_avg_base_qual=args.min_avg_base_qual,
                                                          min_soft_clip=args.min_soft_clip,
                                                          max_nm=args.max_nm, min_matches=args.min_matches,
                                                          isize_mean=args.isize_mean, isize_sd=args.isize_sd,
                                                          svs_to_softclip=args.svs_to_softclip,
                                                          overlap_ratio=args.overlap_ratio,
                                                          mean_read_length=args.mean_read_length,
                                                          mean_read_coverage=args.mean_read_coverage,
                                                          min_ins_cov_frac=args.min_ins_cov_frac,
                                                          max_ins_cov_frac=args.max_ins_cov_frac)
            logger.info("Generated intervals for assembly in %s" % assembly_bed)

        logger.info("Will run assembly now")

        assembled_fasta, ignored_bed = run_spades_parallel(bam=args.bam.name, spades=args.spades, bed=assembly_bed,
                                                           work=spades_tmpdir, pad=args.assembly_pad,
                                                           nthreads=args.num_threads,
                                                           chrs=list(contig_whitelist),
                                                           max_interval_size=args.spades_max_interval_size,
                                                           svs_to_assemble=args.svs_to_assemble,
                                                           stop_on_fail=args.stop_spades_on_fail,
                                                           max_read_pairs=args.extraction_max_read_pairs,
                                                           assembly_max_tools=args.assembly_max_tools)
        breakpoints_bed = run_age_parallel(intervals_bed=assembly_bed, reference=args.reference,
                                           assembly=assembled_fasta,
                                           pad=args.assembly_pad, age=args.age, chrs=list(contig_whitelist),
                                           nthreads=args.num_threads,
                                           min_contig_len=AGE_MIN_CONTIG_LENGTH, min_del_subalign_len=args.min_del_subalign_len,
                                           min_inv_subalign_len=args.min_inv_subalign_len,
                                           age_workdir=age_tmpdir)

        # Combine AGE-refined breakpoints with intervals skipped by assembly.
        final_bed = os.path.join(args.workdir, "final.bed")
        if breakpoints_bed:
            if ignored_bed:
                pybedtools.BedTool(breakpoints_bed) \
                    .cat(pybedtools.BedTool(ignored_bed), postmerge=False) \
                    .sort().saveas(final_bed)
            else:
                pybedtools.BedTool(breakpoints_bed).saveas(final_bed)
        elif ignored_bed:
            pybedtools.BedTool(ignored_bed).sort().saveas(final_bed)
        else:
            final_bed = None

        genotyped_bed = parallel_genotype_intervals(final_bed, args.bam.name,
                                                    workdir=os.path.join(args.workdir, "genotyping"),
                                                    nthreads=args.num_threads, chromosomes=list(contig_whitelist),
                                                    window=args.gt_window, isize_mean=args.isize_mean,
                                                    isize_sd=args.isize_sd,
                                                    normal_frac_threshold=args.gt_normal_frac)

        logger.info("Output final VCF file")

        convert_metasv_bed_to_vcf(bedfile=genotyped_bed, vcf_out=final_vcf, workdir=args.workdir, sample=args.sample, pass_calls=False)
    else:
        # No assembly: the pre-assembly merged VCF is the final output.
        shutil.copy(preasm_vcf, final_vcf)
        pysam.tabix_index(final_vcf, force=True, preset="vcf")

    logger.info("Clean up pybedtools")

    pybedtools.cleanup(remove_all=True)

    logger.info("All Done!")

    return os.EX_OK
# Code example #2 (score: 0) — File: main.py, Project: yyxql/Genomics_Docker
def run_metasv(args):
    """Run the full MetaSV SV-merging pipeline (multi-BAM variant).

    Loads SV calls from per-tool VCF and native-format files, filters them
    (gaps, contig whitelist, length), merges intra-tool then inter-tool,
    writes a pre-assembly merged VCF/BED, and optionally refines breakpoints
    via SPAdes local assembly + AGE alignment and genotyping.

    Args:
        args: argparse.Namespace carrying all command-line options (input
            file lists per tool, reference, BAM list, workdir/outdir,
            assembly and filtering parameters).

    Returns:
        os.EX_OK on success, os.EX_USAGE for missing executables, or 1 when
        the reference FASTA is not indexed.
    """
    logger.info("Running MetaSV %s" % __version__)
    logger.info("Arguments are " + str(args))

    # Check if there is work to do. Every *_vcf/*_native argument is a list,
    # so concatenating them all is empty only when no SV input was given.
    # BUG FIX: the original had a comma before args.wham_vcf, which built a
    # 2-tuple that is always truthy, so this warning could never fire.
    if not (args.pindel_vcf + args.breakdancer_vcf + args.breakseq_vcf +
            args.cnvnator_vcf + args.pindel_native + args.breakdancer_native +
            args.breakseq_native + args.cnvnator_native + args.manta_vcf +
            args.lumpy_vcf + args.cnvkit_vcf + args.wham_vcf):
        logger.warning("Nothing to merge since no SV file specified")

    # Simple check for arguments: assembly needs both SPAdes and AGE.
    if not args.disable_assembly:
        if not args.spades:
            logger.error("Spades executable not specified")
            return os.EX_USAGE

        if not args.age:
            logger.error("AGE executable not specified")
            return os.EX_USAGE

    # Create the directories for working
    bedtools_tmpdir = os.path.join(args.workdir, "bedtools")
    create_dirs([args.workdir, args.outdir, bedtools_tmpdir])

    # Reference handling: a .fai index is required downstream (pysam/AGE).
    if not os.path.isfile(args.reference + ".fai"):
        logger.error("Reference file %s is not indexed" % (args.reference))
        return 1

    fasta_handle = pysam.Fastafile(args.reference) if os.path.isfile(
        args.reference) else None
    contigs = get_contigs(args.reference)
    include_intervals = sorted([
        SVInterval(contig.name,
                   0,
                   contig.length,
                   contig.name,
                   "include",
                   length=contig.length) for contig in contigs
    ])

    # Generate the list of contigs to process
    contig_whitelist = set(args.chromosomes) if args.chromosomes else set(
        [contig.name for contig in contigs])
    if args.keep_standard_contigs:
        # Restrict to 1-22/X/Y/MT with and without the "chr" prefix.
        contig_whitelist &= set([str(i) for i in xrange(1, 23)] +
                                ["chr%d" % (i) for i in xrange(1, 23)] +
                                ["X", "Y", "MT", "chrX", "chrY", "chrM"])
    logger.info("Only SVs on the following contigs will be reported: %s" %
                (sorted(list(contig_whitelist))))

    # Load the intervals from different files
    vcf_name_list = [("CNVnator", args.cnvnator_vcf),
                     ("Pindel", args.pindel_vcf),
                     ("BreakDancer", args.breakdancer_vcf),
                     ("BreakSeq", args.breakseq_vcf),
                     ("HaplotypeCaller", args.gatk_vcf),
                     ("Lumpy", args.lumpy_vcf), ("Manta", args.manta_vcf),
                     ("CNVkit", args.cnvkit_vcf), ("WHAM", args.wham_vcf)]
    native_name_list = [("CNVnator", args.cnvnator_native, CNVnatorReader),
                        ("Pindel", args.pindel_native, PindelReader),
                        ("BreakSeq", args.breakseq_native, BreakSeqReader),
                        ("BreakDancer", args.breakdancer_native,
                         BreakDancerReader)]

    tools = []
    intervals = {}
    sv_types = set()

    gap_intervals = []
    if args.filter_gaps:
        gaps = args.gaps if args.gaps else get_gaps_file(contig_whitelist)
        gap_intervals = sorted(load_gap_intervals(gaps))

    # Handles native input
    logger.info("Load native files")
    for toolname, nativename, svReader in native_name_list:
        # If no native file is given, ignore the tool
        if not nativename: continue

        tools.append(toolname)
        intervals[toolname] = defaultdict(list)

        for native_file in nativename:
            for record in svReader(native_file,
                                   svs_to_report=args.svs_to_report):
                interval = record.to_sv_interval()
                if not interval:
                    # This is the case for SVs we want to skip
                    continue
                BD_min_inv_len = args.mean_read_length + 4 * args.isize_sd
                if toolname == "BreakDancer" and interval.sv_type == "INV" and abs(
                        interval.length) < BD_min_inv_len:
                    # Filter BreakDancer artifact INVs with size < readlength+4*isize_sd
                    continue
                if not interval_overlaps_interval_list(
                        interval,
                        gap_intervals) and interval.chrom in contig_whitelist:

                    # Check length (translocations are exempt from the minimum)
                    if interval.length < args.minsvlen and interval.sv_type not in [
                            "ITX", "CTX"
                    ]:
                        continue

                    # Set wiggle (merge slack); insertions get the larger inswiggle
                    if interval.sv_type not in ["ITX", "CTX"]:
                        interval.wiggle = max(
                            args.inswiggle if interval.sv_type == "INS" else 0,
                            args.wiggle)
                    else:
                        interval.wiggle = TX_WIGGLE

                    intervals[toolname][interval.sv_type].append(interval)
        sv_types |= set(intervals[toolname].keys())

    # Handles the VCF input cases, we will just deal with these cases
    logger.info("Load VCF files")
    for toolname, vcfname in vcf_name_list:
        # If no VCF is given, ignore the tool
        if not vcfname:
            continue

        tools.append(toolname)
        intervals[toolname] = {}

        vcf_list = []
        for vcffile in vcfname:
            if os.path.isdir(vcffile):
                # A directory means per-chromosome VCFs named <contig>.vcf.gz
                logger.info(
                    "Will load from per-chromosome VCFs from directory %s for tool %s"
                    % (vcffile, toolname))
                vcf_list += [
                    os.path.join(vcffile, "%s.vcf.gz" % contig.name)
                    for contig in contigs
                    if (not contig_whitelist or contig.name in contig_whitelist
                        )
                ]
            else:
                vcf_list.append(vcffile)

        for vcffile in vcf_list:
            load_intervals(vcffile,
                           intervals[toolname],
                           gap_intervals,
                           include_intervals,
                           toolname,
                           contig_whitelist,
                           minsvlen=args.minsvlen,
                           wiggle=args.wiggle,
                           inswiggle=args.inswiggle,
                           svs_to_report=args.svs_to_report,
                           maxsvlen=args.maxsvlen)
        sv_types |= set(intervals[toolname].keys())

    logger.info("SV types are %s" % (str(sv_types)))
    tool_merged_intervals = {}
    final_intervals = []

    # This will just output per-tool VCFs, no intra-tool merging is done yet
    if args.enable_per_tool_output:
        logger.info("Output per-tool VCFs")
        for toolname in intervals:
            tool_out = os.path.join(args.outdir, "%s.vcf" % (toolname.lower()))

            logger.info("Outputting single tool VCF for %s" % (str(toolname)))
            vcf_template_reader = vcf.Reader(
                open(os.path.join(mydir, "resources/template.vcf"), "r"))
            vcf_template_reader.samples = [args.sample]

            intervals_tool = []
            tool_out_fd = open(tool_out, "w")
            vcf_writer = vcf.Writer(tool_out_fd, vcf_template_reader)
            chr_intervals_tool = {contig.name: [] for contig in contigs}
            for sv_type in sv_types:
                if sv_type in intervals[toolname]:
                    # Deep-copy so validation/fix_pos below doesn't mutate
                    # the intervals that are merged later.
                    intervals_tool.extend([
                        copy.deepcopy(interval)
                        for interval in intervals[toolname][sv_type]
                    ])
            for interval in intervals_tool:
                # Marghoob says that this is just to fill-in some metadata
                interval.do_validation(args.overlap_ratio)

                interval.fix_pos()
                chr_intervals_tool[interval.chrom].append(interval)

            for contig in contigs:
                chr_intervals_tool[contig.name].sort()
                for interval in chr_intervals_tool[contig.name]:
                    vcf_record = interval.to_vcf_record(
                        fasta_handle, args.sample)
                    if vcf_record is not None:
                        vcf_writer.write_record(vcf_record)
            tool_out_fd.close()
            vcf_writer.close()
            logger.info("Indexing single tool VCF for %s" % (str(toolname)))
            pysam.tabix_index(tool_out, force=True, preset="vcf")

    # Do merging here
    logger.info("Do merging")
    for sv_type in sv_types:
        logger.info("Processing SVs of type %s" % sv_type)
        tool_merged_intervals[sv_type] = []

        # Do the intra-tool merging
        logger.info("Intra-tool Merging SVs of type %s" % sv_type)
        for tool in tools:
            logger.debug("Is %s in tool keys? %s" %
                         (sv_type, str(intervals[tool].keys())))
            if sv_type not in intervals[tool]:
                logger.debug("%s not in tool %s" % (sv_type, tool))
                continue
            logger.info("First level merging for %s for tool %s" %
                        (sv_type, tool))
            tool_merged_intervals[sv_type] += merge_intervals(
                intervals[tool][sv_type])

        # Do the inter-tool merging
        logger.info("Inter-tool Merging SVs of type %s" % sv_type)
        final_intervals.extend(
            merge_intervals_recursively(tool_merged_intervals[sv_type],
                                        args.overlap_ratio))

    final_chr_intervals = {contig.name: [] for contig in contigs}
    for interval in final_intervals:
        interval.do_validation(args.overlap_ratio)
        interval.fix_pos()
        # Length bounds apply to everything except translocations.
        if args.minsvlen <= interval.length <= args.maxsvlen or interval.sv_type in [
                "ITX", "CTX"
        ]:
            final_chr_intervals[interval.chrom].append(interval)

    # This is the merged VCF without assembly, ok for deletions at this point
    logger.info("Output merged VCF without assembly ")
    vcf_template_reader = vcf.Reader(
        open(os.path.join(mydir, "resources/template.vcf"), "r"))
    vcf_template_reader.samples = [args.sample]
    preasm_vcf = os.path.join(args.workdir, "pre_asm.vcf")
    vcf_fd = open(preasm_vcf, "w")
    vcf_writer = vcf.Writer(vcf_fd, vcf_template_reader)

    # Counts per (sv_type, filter, precision, sources) for the log summary.
    final_stats = {}

    bed_intervals = []
    for contig in contigs:
        final_chr_intervals[contig.name].sort()
        for interval in final_chr_intervals[contig.name]:
            vcf_record = interval.to_vcf_record(fasta_handle)
            if vcf_record is not None:
                key = (interval.sv_type,
                       "PASS" if interval.is_validated else "LowQual",
                       "PRECISE" if interval.is_precise else "IMPRECISE",
                       tuple(sorted(list(interval.sources))))
                if key not in final_stats:
                    final_stats[key] = 0
                final_stats[key] += 1
                vcf_writer.write_record(vcf_record)
            bed_interval = interval.to_bed_interval(args.sample)
            if bed_interval is not None:
                bed_intervals.append(bed_interval)
    vcf_fd.close()
    vcf_writer.close()

    # Also save a BED file representation of the merged variants without assembly
    merged_bed = None
    if bed_intervals:
        merged_bed = os.path.join(args.workdir, "metasv.bed")
        pybedtools.BedTool(bed_intervals).saveas(merged_bed)

    for key in sorted(final_stats.keys()):
        logger.info(str(key) + ":" + str(final_stats[key]))

    final_vcf = os.path.join(args.outdir, "variants.vcf")

    # Run assembly here
    if not args.disable_assembly:
        logger.info("Running assembly")

        spades_tmpdir = os.path.join(args.workdir, "spades")
        age_tmpdir = os.path.join(args.workdir, "age")

        create_dirs([spades_tmpdir, age_tmpdir])

        assembly_bed = merged_bed

        # this does the improved assembly location finder with softclipped reads
        if args.boost_sc:
            logger.info("Generating Soft-Clipping intervals.")
            assembly_bed = parallel_generate_sc_intervals(
                args.bams,
                list(contig_whitelist),
                merged_bed,
                args.workdir,
                num_threads=args.num_threads,
                min_support_ins=args.min_support_ins,
                min_support_frac_ins=args.min_support_frac_ins,
                max_intervals=args.max_ins_intervals,
                min_mapq=args.min_mapq,
                min_avg_base_qual=args.min_avg_base_qual,
                min_soft_clip=args.min_soft_clip,
                max_nm=args.max_nm,
                min_matches=args.min_matches,
                isize_mean=args.isize_mean,
                isize_sd=args.isize_sd,
                svs_to_softclip=args.svs_to_softclip,
                overlap_ratio=args.overlap_ratio,
                mean_read_length=args.mean_read_length,
                mean_read_coverage=args.mean_read_coverage,
                min_ins_cov_frac=args.min_ins_cov_frac,
                max_ins_cov_frac=args.max_ins_cov_frac,
                assembly_max_tools=args.assembly_max_tools)
            logger.info("Generated intervals for assembly in %s" %
                        assembly_bed)

        logger.info("Will run assembly now")

        assembled_fasta, ignored_bed = run_spades_parallel(
            bams=args.bams,
            spades=args.spades,
            spades_options=args.spades_options,
            bed=assembly_bed,
            work=spades_tmpdir,
            pad=args.assembly_pad,
            nthreads=args.num_threads,
            chrs=list(contig_whitelist),
            max_interval_size=args.spades_max_interval_size,
            timeout=args.spades_timeout,
            svs_to_assemble=args.svs_to_assemble,
            stop_on_fail=args.stop_spades_on_fail,
            max_read_pairs=args.extraction_max_read_pairs,
            assembly_max_tools=args.assembly_max_tools)
        breakpoints_bed = run_age_parallel(
            intervals_bed=assembly_bed,
            reference=args.reference,
            assembly=assembled_fasta,
            pad=args.assembly_pad,
            age=args.age,
            timeout=args.age_timeout,
            chrs=list(contig_whitelist),
            nthreads=args.num_threads,
            min_contig_len=AGE_MIN_CONTIG_LENGTH,
            min_del_subalign_len=args.min_del_subalign_len,
            min_inv_subalign_len=args.min_inv_subalign_len,
            age_window=args.age_window,
            age_workdir=age_tmpdir)

        # Combine AGE-refined breakpoints with intervals skipped by assembly.
        final_bed = os.path.join(args.workdir, "final.bed")
        if breakpoints_bed:
            if ignored_bed:
                pybedtools.BedTool(breakpoints_bed) \
                    .cat(pybedtools.BedTool(ignored_bed), postmerge=False) \
                    .sort().saveas(final_bed)
            else:
                pybedtools.BedTool(breakpoints_bed).saveas(final_bed)
        elif ignored_bed:
            pybedtools.BedTool(ignored_bed).sort().saveas(final_bed)
        else:
            final_bed = None

        genotyped_bed = parallel_genotype_intervals(
            final_bed,
            args.bams,
            workdir=os.path.join(args.workdir, "genotyping"),
            nthreads=args.num_threads,
            chromosomes=list(contig_whitelist),
            window=args.gt_window,
            isize_mean=args.isize_mean,
            isize_sd=args.isize_sd,
            normal_frac_threshold=args.gt_normal_frac)

        logger.info("Output final VCF file")

        convert_metasv_bed_to_vcf(bedfile=genotyped_bed,
                                  vcf_out=final_vcf,
                                  workdir=args.workdir,
                                  sample=args.sample,
                                  reference=args.reference,
                                  pass_calls=False)
    else:
        # No assembly: the pre-assembly merged VCF is the final output.
        shutil.copy(preasm_vcf, final_vcf)
        pysam.tabix_index(final_vcf, force=True, preset="vcf")

    logger.info("Clean up pybedtools")

    pybedtools.cleanup(remove_all=True)

    logger.info("All Done!")

    return os.EX_OK
# Code example #3 (score: 0)
def convert_metasv_bed_to_vcf(bedfile=None, vcf_out=None, vcf_template_file=vcf_template, sample=None, reference=None, pass_calls=True):
    """Convert a MetaSV BED file of merged SV intervals into a sorted, tabix-indexed VCF.

    Each BED interval's name column packs one sub-call per ":"-separated entry,
    each entry being "<b64(json info)>,<svtype>,<length>,<methods>".  One
    representative sub-call is chosen (DEL first, then INV, then soft-clip
    supported INS) and emitted as a symbolic-ALT VCF record.

    Keyword arguments:
    bedfile -- input BED file produced by the MetaSV assembly stage
    vcf_out -- path of the VCF to write (bgzip-compressed and tabix-indexed in place)
    vcf_template_file -- VCF whose header is used as a template
    sample -- sample name placed in the VCF header
    reference -- reference FASTA; used to put contigs in the header and order records
    pass_calls -- if True, skip INS calls whose breakpoints were not resolved
    """
    func_logger = logging.getLogger("%s" % (convert_metasv_bed_to_vcf.__name__))

    vcf_template_reader = vcf.Reader(open(vcf_template_file, "r"))

    # The following are hacks to ensure sample name and contig names are put in the VCF header
    vcf_template_reader.samples = [sample]
    contigs = []
    if reference:
        contigs = fasta_utils.get_contigs(reference)
        contigs_order_dict = {contig.name: index for (index, contig) in enumerate(contigs)}
        vcf_template_reader.contigs = OrderedDict([(contig.name, (contig.name, contig.length)) for contig in contigs])
        vcf_template_reader.metadata["reference"] = reference

    vcf_template_reader.metadata["fileDate"] = str(datetime.date.today())
    vcf_template_reader.metadata["source"] = [" ".join(sys.argv)]

    vcf_writer = vcf.Writer(open(vcf_out, "w"), vcf_template_reader)

    vcf_records = []
    for interval in pybedtools.BedTool(bedfile):
        chrom = interval.chrom
        pos = interval.start
        end = interval.end
        # Column 11 (if present) carries the genotype; default to unknown.
        genotype = "./." if len(interval.fields) < 11 else interval.fields[10]

        if genotype == "0/0":
            func_logger.info("Skipping homozygous reference %s" % str(interval))
            continue

        sub_names = interval.name.split(":")
        # Use list comprehensions (not map) so .index()/indexing below also works on Python 3.
        sub_lengths = [int(x.split(",")[2]) for x in sub_names]
        sub_types = [x.split(",")[1] for x in sub_names]
        sub_methods = [x.split(",")[3] for x in sub_names]
        svmethods = (";".join(sub_methods)).split(";")
        try:
            # BUGFIX: the original referenced "name", which was only bound via
            # Python 2 list-comprehension variable leakage (NameError on Python 3);
            # that leaked value is the last sub-call, made explicit here.
            info = json.loads(base64.b64decode(sub_names[-1].split(",")[0]))
        except TypeError:
            # Malformed/absent base64 payload: start with an empty INFO dict.
            info = dict()
        if len(interval.fields) > 9:
            info.update(json.loads(base64.b64decode(interval.fields[9])))

        # Pick one representative sub-call: deletions first, then inversions,
        # then soft-clip-supported insertions; default to the first entry.
        index_to_use = 0
        is_pass = False
        svlen = -1
        if "DEL" in sub_types:
            index_to_use = sub_types.index("DEL")
            svmethods_s = set(svmethods) - {"SC"}
            # PASS requires support from more than one non-soft-clip method.
            is_pass = len(svmethods_s) > 1
        elif "INV" in sub_types:
            index_to_use = sub_types.index("INV")
            svmethods_s = set(svmethods) - {"SC"}
            is_pass = len(svmethods_s) > 1
        elif "INS" in sub_types and "SC" in sub_methods:
            # Soft-clip insertions carry refined pos/end/length in columns 7-9.
            index_to_use = sub_methods.index("SC")
            pos = int(interval.fields[6])
            end = int(interval.fields[7])
            svlen = int(interval.fields[8])

        if svlen < 0:
            svlen = sub_lengths[index_to_use]
        if sub_types[index_to_use] == "DEL":
            # Deletions are reported with negative SVLEN per VCF convention.
            svlen = -svlen

        sv_type = sub_types[index_to_use]
        if sv_type == "INS":
            # With pass_calls, only emit insertions with a resolved single-base breakpoint.
            if pass_calls and end != pos + 1:
                continue
            end = pos
            is_pass = (int(interval.fields[8]) != -1) and (svlen == 0 or svlen >= 100)
        sv_id = "."
        ref = "."
        alt = ["<%s>" % sv_type]
        qual = "."
        sv_filter = ["PASS" if is_pass else "LowQual"]
        info.update({"END": end, "SVLEN": svlen, "SVTYPE": sv_type, "SVMETHOD": svmethods, "NUM_SVMETHODS": len(svmethods)})
        sv_format = "GT"
        sample_indexes = [0]
        vcf_record = vcf.model._Record(chrom, pos, sv_id, ref, alt, qual, sv_filter, info, sv_format, sample_indexes)
        vcf_record.samples = vcf_template_reader._parse_samples([genotype], "GT", vcf_record)
        vcf_records.append(vcf_record)

    # Sort in reference contig order when known, else lexicographically by chromosome.
    if contigs:
        vcf_records.sort(key=lambda x: (contigs_order_dict[x.CHROM], x.POS))
    else:
        vcf_records.sort(key=lambda x: (x.CHROM, x.POS))

    for vcf_record in vcf_records:
        vcf_writer.write_record(vcf_record)
    vcf_writer.close()

    func_logger.info("Tabix compressing and indexing %s" % vcf_out)
    pysam.tabix_index(vcf_out, force=True, preset="vcf")
コード例 #4
0
def convert_metasv_bed_to_vcf(bedfile=None, vcf_out=None, workdir=None, vcf_template_file=vcf_template, sample=None, reference=None,
                              pass_calls=True):
    """Convert a MetaSV BED file of merged SV intervals into a sorted, tabix-indexed VCF.

    Keyword arguments:
    bedfile -- input BED file from the MetaSV assembly stage (may be None)
    vcf_out -- path of the VCF to write (tabix-indexed in place afterwards)
    workdir -- scratch directory for intermediate BED files (created if missing; must not be None)
    vcf_template_file -- VCF whose header is used as a template
    sample -- sample name placed in the VCF header
    reference -- reference FASTA; used for header contig lines and REF bases
    pass_calls -- forwarded to get_interval_info() to control breakpoint filtering
    """
    func_logger = logging.getLogger("%s" % (convert_metasv_bed_to_vcf.__name__))
    if not os.path.exists(workdir):
        os.makedirs(workdir)

    intervals = []
    if bedfile:

        for interval in pybedtools.BedTool(bedfile):
            # get_interval_info() returns a falsy value for intervals to drop.
            interval_info = get_interval_info(interval,pass_calls)            
            if interval_info:
                # Re-pack into name = "<b64(json info)>,<svtype>,<length>,<methods>";
                # genotype and filter travel in extra columns 7 and 8.
                updated_interval = pybedtools.Interval(interval.chrom, interval_info["pos"], 
                                                       interval_info["end"], name="%s,%s,%d,%s" % (
                                                       base64.b64encode(json.dumps(interval_info["info"])), 
                                                       interval_info["sv_type"], interval_info["sv_length"],
                                                       ";".join(interval_info["svmethods"])), 
                                                       score = interval.score, 
                                                       otherfields=[interval_info["genotype"]
                                                                    , interval_info["sv_filter"]])
                if not intervals:
                    intervals.append(updated_interval)
                else:
                    # Only the previous interval is checked — assumes duplicates are
                    # adjacent in the (position-sorted) input; merged pair replaces it.
                    merged_interval=check_duplicates(updated_interval,intervals[-1])
                    if merged_interval:
                        func_logger.info("Merging intervals: %s and %s" % (updated_interval,intervals[-1]))
                        intervals.pop()
                        intervals.append(merged_interval)
                    else:
                        intervals.append(updated_interval)
            else: 
                func_logger.info("Skip interval: %s" % (interval))

    # Persist the sorted intervals, then filter out confused INS calls into a second BED.
    nonfilterd_bed = os.path.join(workdir, "final_nonfilterd.bed")
    filterd_bed = os.path.join(workdir, "final_filterd.bed")
    bedtool = pybedtools.BedTool(intervals).sort().moveto(nonfilterd_bed)
    filterd_bed = filter_confused_INS_calls(nonfilterd_bed,filterd_bed)    

    vcf_template_reader = vcf.Reader(open(vcf_template_file, "r"))
    # The following are hacks to ensure sample name and contig names are put in the VCF header
    vcf_template_reader.samples = [sample]
    contigs = []
    fasta_file = None
    if reference:
        contigs = fasta_utils.get_contigs(reference)
        contigs_order_dict = {contig.name: index for (index, contig) in enumerate(contigs)}
        vcf_template_reader.contigs = OrderedDict([(contig.name, (contig.name, contig.length)) for contig in contigs])
        vcf_template_reader.metadata["reference"] = reference
        fasta_file = pysam.Fastafile(reference)

    vcf_template_reader.metadata["fileDate"] = str(datetime.date.today())
    vcf_template_reader.metadata["source"] = [" ".join(sys.argv)]
    vcf_writer = vcf.Writer(open(vcf_out, "w"), vcf_template_reader)
    vcf_records = []
    if filterd_bed:
        bedtool = pybedtools.BedTool(filterd_bed)
        for interval in bedtool:
            # Unpack the name column written above: b64(json info), SV type, ...
            name_split=interval.name.split(",")
            info = json.loads(base64.b64decode(name_split[0]))
            sv_type = name_split[1]
            sv_id = "."
            # REF is the reference base at the interval start when a FASTA is available.
            ref = fasta_file.fetch(str(interval.chrom), interval.start, interval.start + 1) if fasta_file else "."
            alt = [vcf.model._SV(sv_type)]
            qual = "."
            sv_filter = [interval.fields[7]]
            genotype = interval.fields[6]
            sv_format = "GT"
            sample_indexes = [0]
            vcf_record = vcf.model._Record(interval.chrom, interval.start, sv_id, ref, alt, qual,
                                           sv_filter, info, sv_format, sample_indexes)
            vcf_record.samples = vcf_template_reader._parse_samples([genotype], "GT", vcf_record)
            vcf_records.append(vcf_record)
            
    # Sort in reference contig order when known, else lexicographically by chromosome.
    if contigs:
        vcf_records.sort(key=lambda x: (contigs_order_dict[x.CHROM], x.POS))
    else:
        vcf_records.sort(key=lambda x: (x.CHROM, x.POS))

    # NOTE(review): presumably rewrites paired records into IDP/ITX/CTX calls --
    # confirm against resolve_for_IDP_ITX_CTX's definition.
    resolved_vcf_records = resolve_for_IDP_ITX_CTX(vcf_records,fasta_file)

    for vcf_record in resolved_vcf_records:
        vcf_writer.write_record(vcf_record)
    vcf_writer.close()

    func_logger.info("Tabix compressing and indexing %s" % vcf_out)
    pysam.tabix_index(vcf_out, force=True, preset="vcf")
コード例 #5
0
ファイル: main.py プロジェクト: BioinformaticsArchive/metasv
def run_metasv(sample, reference, pindel_vcf=[], pindel_native=[], breakdancer_vcf=[], breakdancer_native=[],
               breakseq_vcf=[], breakseq_native=[], cnvnator_vcf=[], cnvnator_native=[], gatk_vcf=[], gaps=None,
               filter_gaps=False,
               keep_standard_contigs=False,
               wiggle=WIGGLE, overlap_ratio=OVERLAP_RATIO, workdir="work", outdir="out", boost_ins=False, bam=None, chromosomes=[],
               num_threads=1, spades=None, age=None, disable_assembly=True, minsvlen=MIN_SV_LENGTH, inswiggle=INS_WIGGLE,
               enable_per_tool_output=False, min_support=MIN_SUPPORT,
               min_support_frac=MIN_SUPPORT_FRAC, max_intervals=MAX_INTERVALS, disable_deletion_assembly=False, stop_spades_on_fail=False):
    """Invoke the MetaSV workflow.

    Returns 1 on configuration errors (no input, unindexed reference, missing
    assembler paths); returns None on success.

    NOTE(review): the list-typed keyword defaults ([]) are shared across calls --
    confirm no caller-visible mutation of these defaults occurs downstream.

    Positional arguments:
    sample -- Sample name
    reference -- Path to a samtools indexed reference FASTA
    
    Keyword arguments:
    pindel_vcf -- List of Pindel VCFs generated by SVGenotyper
    pindel_native -- List of Pindel native output files
    breakdancer_vcf -- List of BreakDancer VCFs generated by SVGenotyper
    breakdancer_native -- List of BreakDancer native output files
    breakseq_vcf -- List of BreakSeq2 VCFs
    breakseq_native -- List of BreakSeq native GFF outputs
    cnvnator_vcf -- List of CNVnator VCFs generated by cnvnator2VCF.pl
    cnvnator_native -- List of CNVnator native output files
    gatk_vcf -- List of Indel VCFs generated by GATK's HaplotypeCaller
    gaps -- Gaps BED file
    filter_gaps -- Flag to filter out SVs overlapping gaps (default False)
    keep_standard_contigs -- Flag to only generate SVs for the major contigs 1, 2, ..., 22, X, Y, MT (default False)
    wiggle -- Wiggle for SV interval comparision (default 100)
    overlap_ratio -- Reciprocal overlap ratio for SV interval comparison (default 0.5)
    workdir -- Scratch directory for MetaSV (default "work")
    outdir -- Output directory for MetaSV (default "out")
    boost_ins -- Enable MetaSV's soft-clip based insertion detection (default False)
    bam -- Alignment BAM (opened file-like object with a .name) for assembly and insertion detection (default None)
    chromosomes -- If specified, indicates the list of chromosomes to process (default [])
    num_threads -- Number of worker threads to use for assembly steps (default 1)
    spades -- Path for the SPAdes executable (default None)
    age -- Path for the AGE executable (default None)
    disable_assembly -- Flag to disable assembly (default True)
    minsvlen -- Minimum SV length to report (default MIN_SV_LENGTH)
    inswiggle -- Wiggle used for insertion interval comparison (default INS_WIGGLE)
    enable_per_tool_output -- Flag to also output merged calls for each tool (default False)
    min_support -- Minimum soft-clip read support for insertion intervals (default MIN_SUPPORT)
    min_support_frac -- Minimum soft-clip support fraction (default MIN_SUPPORT_FRAC)
    max_intervals -- Maximum number of soft-clip intervals to generate (default MAX_INTERVALS)
    disable_deletion_assembly -- Skip assembly of deletion intervals (default False)
    stop_spades_on_fail -- Abort when a SPAdes run fails (default False)

    """

    # Check if there is work to do
    if not (
                                        pindel_vcf + breakdancer_vcf + breakseq_vcf + cnvnator_vcf + pindel_native + breakdancer_native + breakseq_native + cnvnator_native):
        logger.error("Nothing to do since no SV file specified")
        return 1

    # Create the directories for working
    bedtools_tmpdir = os.path.join(workdir, "bedtools")
    create_dirs([workdir, outdir, bedtools_tmpdir])

    # Reference handling
    if not os.path.isfile(reference + ".fai"):
        logger.error("Reference file %s is not indexed" % (reference))
        return 1

    fasta_handle = pysam.Fastafile(reference) if os.path.isfile(reference) else None
    contigs = get_contigs(reference)
    include_intervals = sorted(
        [SVInterval(contig.name, 0, contig.length, contig.name, "include", length=contig.length) for contig in contigs])

    # Generate the list of contigs to process
    contig_whitelist = set(chromosomes) if chromosomes else set([contig.name for contig in contigs])
    if keep_standard_contigs:
        # Restrict to autosomes, sex chromosomes and mitochondrion in both naming styles.
        contig_whitelist &= set(
            [str(i) for i in xrange(1, 23)] + ["chr%d" % (i) for i in xrange(1, 23)] + ["X", "Y", "MT", "chrX", "chrY",
                                                                                        "chrM"])
    logger.info("Only SVs on the following contigs will be reported: %s" % (sorted(list(contig_whitelist))))

    # Load the intervals from different files
    vcf_name_list = [("CNVnator", cnvnator_vcf), ("Pindel", pindel_vcf), ("BreakDancer", breakdancer_vcf),
                     ("BreakSeq", breakseq_vcf), ("HaplotypeCaller", gatk_vcf)]
    native_name_list = [("CNVnator", cnvnator_native, CNVnatorReader),
                        ("Pindel", pindel_native, PindelReader),
                        ("BreakSeq", breakseq_native, BreakSeqReader),
                        ("BreakDancer", breakdancer_native, BreakDancerReader)]

    tools = []
    # intervals: tool name -> {sv_type -> [SVInterval, ...]}
    intervals = {}
    sv_types = set()

    gap_intervals = []
    if filter_gaps:
        if not gaps: gaps = get_gaps_file(contig_whitelist)
        gap_intervals = sorted(load_gap_intervals(gaps))

    # Handles native input
    logger.info("Load native files")
    for toolname, nativename, svReader in native_name_list:
        # If no native file is given, ignore the tool
        if not nativename: continue

        tools.append(toolname)
        intervals[toolname] = defaultdict(list)

        for native_file in nativename:
            for record in svReader(native_file):
                interval = record.to_sv_interval()

                if not interval:
                    # This is the case for SVs we want to skip
                    continue
                if not interval_overlaps_interval_list(interval, gap_intervals) and interval.chrom in contig_whitelist:

                    # Check length
                    if interval.length < minsvlen:
                        continue

                    # Set wiggle
                    if interval.sv_type == "INS":
                        interval.wiggle = max(inswiggle, wiggle)
                    else:
                        interval.wiggle = wiggle

                    intervals[toolname][interval.sv_type].append(interval)

        sv_types |= set(intervals[toolname].keys())

    # Handles the VCF input cases, we will just deal with these cases
    logger.info("Load VCF files")
    for toolname, vcfname in vcf_name_list:
        # If no VCF is given, ignore the tool
        if not vcfname:
            continue

        tools.append(toolname)
        intervals[toolname] = {}

        vcf_list = []
        for vcffile in vcfname:
            if os.path.isdir(vcffile):
                # A directory is expected to hold one bgzipped VCF per contig.
                logger.info("Will load from per-chromosome VCFs from directory %s for tool %s" % (vcffile, toolname))
                vcf_list += [os.path.join(vcffile, "%s.vcf.gz" % contig.name) for contig in contigs if
                             (not contig_whitelist or contig.name in contig_whitelist)]
            else:
                vcf_list.append(vcffile)

        for vcffile in vcf_list:
            load_intervals(vcffile, intervals[toolname], gap_intervals, include_intervals, toolname, contig_whitelist,
                           minsvlen=minsvlen, wiggle=wiggle, inswiggle=inswiggle)
        sv_types |= set(intervals[toolname].keys())

    logger.info("SV types are %s" % (str(sv_types)))
    tool_merged_intervals = {}
    final_intervals = []

    bd_out = os.path.join(outdir, "breakdancer.vcf")
    pindel_out = os.path.join(outdir, "pindel.vcf")
    cnvnator_out = os.path.join(outdir, "cnvnator.vcf")
    breakseq_out = os.path.join(outdir, "breakseq.vcf")

    vcf_out_list = [("BreakDancer", bd_out),
                    ("Pindel", pindel_out),
                    ("CNVnator", cnvnator_out),
                    ("BreakSeq", breakseq_out)]

    # This will just output per-tool VCFs, no intra-tool merging is done yet
    if enable_per_tool_output:
        logger.info("Output per-tool VCFs")
        for toolname, tool_out in vcf_out_list:
            if tool_out is None or toolname not in intervals:
                continue

            logger.info("Outputting single tool VCF for %s" % (str(toolname)))
            vcf_template_reader = vcf.Reader(open(os.path.join(mydir, "resources/template.vcf"), "r"))
            vcf_template_reader.samples = [sample]

            intervals_tool = []
            tool_out_fd = open(tool_out, "w")
            vcf_writer = vcf.Writer(tool_out_fd, vcf_template_reader)
            chr_intervals_tool = {contig.name: [] for contig in contigs}
            for sv_type in sv_types:
                if sv_type in intervals[toolname]:
                    # deepcopy so do_validation/fix_pos below don't mutate the shared intervals
                    intervals_tool.extend([copy.deepcopy(interval) for interval in intervals[toolname][sv_type]])
            for interval in intervals_tool:
                # Marghoob says that this is just to fill-in some metadata
                interval.do_validation(overlap_ratio)

                interval.fix_pos()
                chr_intervals_tool[interval.chrom].append(interval)

            for contig in contigs:
                chr_intervals_tool[contig.name].sort()
                for interval in chr_intervals_tool[contig.name]:
                    vcf_record = interval.to_vcf_record(fasta_handle, sample)
                    if vcf_record is not None:
                        vcf_writer.write_record(vcf_record)
            tool_out_fd.close()
            vcf_writer.close()
            logger.info("Indexing single tool VCF for %s" % (str(toolname)))
            pysam.tabix_index(tool_out, force=True, preset="vcf")


    # Do merging here
    logger.info("Do merging")
    for sv_type in sv_types:
        logger.info("Processing SVs of type %s" % sv_type)
        tool_merged_intervals[sv_type] = []

        # Do the intra-tool merging
        logger.info("Intra-tool Merging SVs of type %s" % sv_type)
        for tool in tools:
            logger.debug("Is %s in tool keys? %s" % (sv_type, str(intervals[tool].keys())))
            if sv_type not in intervals[tool]:
                logger.debug("%s not in tool %s" % (sv_type, tool))
                continue
            logger.info("First level merging for %s for tool %s" % (sv_type, tool))
            tool_merged_intervals[sv_type] += merge_intervals(intervals[tool][sv_type])

        # Do the inter-tool merging
        logger.info("Inter-tool Merging SVs of type %s" % sv_type)
        merged_intervals = merge_intervals(tool_merged_intervals[sv_type])

        # Intervals which overlap well with merged_intervals
        intervals1 = []
        # Intervals which do not overlap well with merged_intervals.
        # Used to filter out small intervals which got merged with large intervals
        intervals2 = []

        logger.info("Checking overlaps SVs of type %s" % sv_type)
        for interval in tool_merged_intervals[sv_type]:
            if interval_overlaps_interval_list(interval, merged_intervals, overlap_ratio, overlap_ratio):
                intervals2.append(interval)
            else:
                intervals1.append(interval)
        final_intervals.extend(merge_intervals(intervals1) + merge_intervals(intervals2))

    final_chr_intervals = {contig.name: [] for contig in contigs}
    for interval in final_intervals:
        interval.do_validation(overlap_ratio)
        interval.fix_pos()
        final_chr_intervals[interval.chrom].append(interval)

    # This is the merged VCF without assembly, ok for deletions at this point
    logger.info("Output merged VCF without assembly ")
    vcf_template_reader = vcf.Reader(open(os.path.join(mydir, "resources/template.vcf"), "r"))
    vcf_template_reader.samples = [sample]
    preasm_vcf = os.path.join(workdir, "pre_asm.vcf")
    vcf_fd = open(preasm_vcf, "w")
    vcf_writer = vcf.Writer(vcf_fd, vcf_template_reader)

    # final_stats: (sv_type, filter, precision, sources) -> record count, for the summary log below
    final_stats = {}

    bed_intervals = []
    merged_bed = os.path.join(workdir, "metasv.bed")
    for contig in contigs:
        final_chr_intervals[contig.name].sort()
        for interval in final_chr_intervals[contig.name]:
            vcf_record = interval.to_vcf_record(fasta_handle)
            if vcf_record is not None:
                key = (interval.sv_type, "PASS" if interval.is_validated else "LowQual",
                       "PRECISE" if interval.is_precise else "IMPRECISE", tuple(sorted(list(interval.sources))))
                if key not in final_stats:
                    final_stats[key] = 0
                final_stats[key] += 1
                vcf_writer.write_record(vcf_record)
            bed_interval = interval.to_bed_interval(sample)
            if bed_interval is not None:
                bed_intervals.append(bed_interval)

    # Also save a BED file representation of the merged variants without assembly
    pybedtools.BedTool(bed_intervals).saveas(merged_bed)
    vcf_fd.close()
    vcf_writer.close()

    for key in sorted(final_stats.keys()):
        logger.info(str(key) + ":" + str(final_stats[key]))

    final_vcf = os.path.join(outdir, "variants.vcf")

    # Run assembly here
    if not disable_assembly:
        logger.info("Running assembly")
        if spades is None:
            logger.error("Spades executable not specified")
            return 1

        if age is None:
            logger.error("AGE executable not specified")
            return 1

        spades_tmpdir = os.path.join(workdir, "spades")
        age_tmpdir = os.path.join(workdir, "age")

        create_dirs([spades_tmpdir, age_tmpdir])

        assembly_bed = merged_bed

        # this does the improved assembly location finder with softclipped reads
        if boost_ins:
            logger.info("Generating intervals for insertions")
            assembly_bed = parallel_generate_sc_intervals([bam.name], list(contig_whitelist), merged_bed, workdir,
                                                          num_threads=num_threads, min_support=min_support,
                                                          min_support_frac=min_support_frac, max_intervals=max_intervals)
            logger.info("Generated intervals for assembly in %s" % assembly_bed)

        logger.info("Will run assembly now")

        # SPAdes assembles the intervals; intervals it could not assemble come back in ignored_bed.
        assembled_fasta, ignored_bed = run_spades_parallel(bam=bam.name, spades=spades, bed=assembly_bed,
                                                           work=spades_tmpdir, pad=SPADES_PAD, nthreads=num_threads,
                                                           chrs=list(contig_whitelist), disable_deletion_assembly=disable_deletion_assembly, stop_on_fail=stop_spades_on_fail)
        # AGE aligns the assembled contigs back to the reference to refine breakpoints.
        breakpoints_bed = run_age_parallel(intervals_bed=assembly_bed, reference=reference, assembly=assembled_fasta,
                                           pad=AGE_PAD, age=age, chrs=list(contig_whitelist), nthreads=num_threads,
                                           min_contig_len=AGE_MIN_CONTIG_LENGTH, age_workdir=age_tmpdir)

        # Final BED = breakpoint-refined intervals plus the unassembled (ignored) ones.
        final_bed = os.path.join(workdir, "final.bed")
        if ignored_bed:
            pybedtools.BedTool(breakpoints_bed) \
                .cat(pybedtools.BedTool(ignored_bed), postmerge=False) \
                .sort().saveas(final_bed)
        else:
            pybedtools.BedTool(breakpoints_bed).saveas(final_bed)

        logger.info("Output final VCF file")

        # NOTE(review): this call omits the reference/workdir/template arguments that other
        # convert_metasv_bed_to_vcf definitions require -- confirm it matches the
        # signature actually imported here.
        convert_metasv_bed_to_vcf(bedfile=final_bed, vcf_out=final_vcf, sample=sample, pass_calls=False)
    else:
        # No assembly: the pre-assembly VCF is the final output.
        shutil.copy(preasm_vcf, final_vcf)
        pysam.tabix_index(final_vcf, force=True, preset="vcf")

    logger.info("Clean up pybedtools")

    pybedtools.cleanup(remove_all=True)

    logger.info("All Done!")
コード例 #6
0
ファイル: main.py プロジェクト: BioinformaticsArchive/metasv
def run_metasv(sample,
               reference,
               pindel_vcf=[],
               pindel_native=[],
               breakdancer_vcf=[],
               breakdancer_native=[],
               breakseq_vcf=[],
               breakseq_native=[],
               cnvnator_vcf=[],
               cnvnator_native=[],
               gatk_vcf=[],
               gaps=None,
               filter_gaps=False,
               keep_standard_contigs=False,
               wiggle=WIGGLE,
               overlap_ratio=OVERLAP_RATIO,
               workdir="work",
               outdir="out",
               boost_ins=False,
               bam=None,
               chromosomes=[],
               num_threads=1,
               spades=None,
               age=None,
               disable_assembly=True,
               minsvlen=MIN_SV_LENGTH,
               inswiggle=INS_WIGGLE,
               enable_per_tool_output=False,
               min_support=MIN_SUPPORT,
               min_support_frac=MIN_SUPPORT_FRAC,
               max_intervals=MAX_INTERVALS,
               disable_deletion_assembly=False,
               stop_spades_on_fail=False):
    """Invoke the MetaSV workflow.

    Positional arguments:
    sample -- Sample name
    reference -- Path to a samtools indexed reference FASTA
    
    Keyword arguments:
    pindel_vcf -- List of Pindel VCFs generated by SVGenotyper
    pindel_native -- List of Pindel native output files
    breakdancer_vcf -- List of BreakDancer VCFs generated by SVGenotyper
    breakdancer_native -- List of BreakDancer native output files
    breakseq_vcf -- List of BreakSeq2 VCFs
    breakseq_native -- List of BreakSeq native GFF outputs
    cnvnator_vcf -- List of CNVnator VCFs generated by cnvnator2VCF.pl
    cnvnator_native -- List of CNVnator native output files
    gatk_vcf -- List of Indel VCFs generated by GATK's HaplotypeCaller
    gaps -- Gaps BED file
    filter_gaps -- Flag to filter out SVs overlapping gaps (default False)
    keep_standard_contigs -- Flag to only generate SVs for the major contigs 1, 2, ..., 22, X, Y, MT (default False)
    wiggle -- Wiggle for SV interval comparision (default 100)
    overlap_ratio -- Reciprocal overlap ratio for SV interval comparison (default 0.5)
    workdir -- Scratch directory for MetaSV (default "work")
    outdir -- Output directory for MetaSV (default "out")
    boost_ins -- Enable MetaSV's soft-clip based insertion detection (default False)
    bam -- Alignment BAM for assembly and insertion detection (default None)
    chromosomes -- If specified, indicates the list of chromosomes to process (default [])
    num_threads -- Number of worker threads to use for assembly steps (default 1)
    spades -- Path for the SPAdes executable (default None)
    age -- Path for the AGE executable (default None)
    disable_assembly -- Flag to disable assembly (default False)
    enable_per_tool_output -- Flag to also output merged calls for each tool (default False)

    """

    # Check if there is work to do
    if not (pindel_vcf + breakdancer_vcf + breakseq_vcf + cnvnator_vcf +
            pindel_native + breakdancer_native + breakseq_native +
            cnvnator_native):
        logger.error("Nothing to do since no SV file specified")
        return 1

    # Create the directories for working
    bedtools_tmpdir = os.path.join(workdir, "bedtools")
    create_dirs([workdir, outdir, bedtools_tmpdir])

    # Reference handling
    if not os.path.isfile(reference + ".fai"):
        logger.error("Reference file %s is not indexed" % (reference))
        return 1

    fasta_handle = pysam.Fastafile(reference) if os.path.isfile(
        reference) else None
    contigs = get_contigs(reference)
    include_intervals = sorted([
        SVInterval(contig.name,
                   0,
                   contig.length,
                   contig.name,
                   "include",
                   length=contig.length) for contig in contigs
    ])

    # Generate the list of contigs to process
    contig_whitelist = set(chromosomes) if chromosomes else set(
        [contig.name for contig in contigs])
    if keep_standard_contigs:
        contig_whitelist &= set([str(i) for i in xrange(1, 23)] +
                                ["chr%d" % (i) for i in xrange(1, 23)] +
                                ["X", "Y", "MT", "chrX", "chrY", "chrM"])
    logger.info("Only SVs on the following contigs will be reported: %s" %
                (sorted(list(contig_whitelist))))

    # Load the intervals from different files
    vcf_name_list = [("CNVnator", cnvnator_vcf), ("Pindel", pindel_vcf),
                     ("BreakDancer", breakdancer_vcf),
                     ("BreakSeq", breakseq_vcf), ("HaplotypeCaller", gatk_vcf)]
    native_name_list = [("CNVnator", cnvnator_native, CNVnatorReader),
                        ("Pindel", pindel_native, PindelReader),
                        ("BreakSeq", breakseq_native, BreakSeqReader),
                        ("BreakDancer", breakdancer_native, BreakDancerReader)]

    tools = []
    intervals = {}
    sv_types = set()

    gap_intervals = []
    if filter_gaps:
        if not gaps: gaps = get_gaps_file(contig_whitelist)
        gap_intervals = sorted(load_gap_intervals(gaps))

    # Handles native input
    logger.info("Load native files")
    for toolname, nativename, svReader in native_name_list:
        # If no native file is given, ignore the tool
        if not nativename: continue

        tools.append(toolname)
        intervals[toolname] = defaultdict(list)

        for native_file in nativename:
            for record in svReader(native_file):
                interval = record.to_sv_interval()

                if not interval:
                    # This is the case for SVs we want to skip
                    continue
                if not interval_overlaps_interval_list(
                        interval,
                        gap_intervals) and interval.chrom in contig_whitelist:

                    # Check length
                    if interval.length < minsvlen:
                        continue

                    # Set wiggle
                    if interval.sv_type == "INS":
                        interval.wiggle = max(inswiggle, wiggle)
                    else:
                        interval.wiggle = wiggle

                    intervals[toolname][interval.sv_type].append(interval)

        sv_types |= set(intervals[toolname].keys())

    # Handles the VCF input cases, we will just deal with these cases
    logger.info("Load VCF files")
    for toolname, vcfname in vcf_name_list:
        # If no VCF is given, ignore the tool
        if not vcfname:
            continue

        tools.append(toolname)
        intervals[toolname] = {}

        vcf_list = []
        for vcffile in vcfname:
            if os.path.isdir(vcffile):
                logger.info(
                    "Will load from per-chromosome VCFs from directory %s for tool %s"
                    % (vcffile, toolname))
                vcf_list += [
                    os.path.join(vcffile, "%s.vcf.gz" % contig.name)
                    for contig in contigs
                    if (not contig_whitelist or contig.name in contig_whitelist
                        )
                ]
            else:
                vcf_list.append(vcffile)

        for vcffile in vcf_list:
            load_intervals(vcffile,
                           intervals[toolname],
                           gap_intervals,
                           include_intervals,
                           toolname,
                           contig_whitelist,
                           minsvlen=minsvlen,
                           wiggle=wiggle,
                           inswiggle=inswiggle)
        sv_types |= set(intervals[toolname].keys())

    logger.info("SV types are %s" % (str(sv_types)))
    # tool_merged_intervals: per-SV-type results of the intra-tool merge;
    # final_intervals: the overall merged call set accumulated below.
    tool_merged_intervals = {}
    final_intervals = []

    # Output paths for the optional single-tool VCFs.
    bd_out = os.path.join(outdir, "breakdancer.vcf")
    pindel_out = os.path.join(outdir, "pindel.vcf")
    cnvnator_out = os.path.join(outdir, "cnvnator.vcf")
    breakseq_out = os.path.join(outdir, "breakseq.vcf")

    # NOTE(review): only these four tools get per-tool output; VCF-only tools
    # (Manta, Lumpy, CNVkit, WHAM) are intentionally(?) absent — confirm.
    vcf_out_list = [("BreakDancer", bd_out), ("Pindel", pindel_out),
                    ("CNVnator", cnvnator_out), ("BreakSeq", breakseq_out)]

    # This will just output per-tool VCFs, no intra-tool merging is done yet
    if enable_per_tool_output:
        logger.info("Output per-tool VCFs")
        for toolname, tool_out in vcf_out_list:
            # Skip tools with no configured output path or no loaded calls.
            if tool_out is None or toolname not in intervals:
                continue

            logger.info("Outputting single tool VCF for %s" % (str(toolname)))
            # Fresh template reader per tool: PyVCF writers consume the
            # reader's header state, so it is not reused across files.
            vcf_template_reader = vcf.Reader(
                open(os.path.join(mydir, "resources/template.vcf"), "r"))
            vcf_template_reader.samples = [sample]

            intervals_tool = []
            tool_out_fd = open(tool_out, "w")
            vcf_writer = vcf.Writer(tool_out_fd, vcf_template_reader)
            chr_intervals_tool = {contig.name: [] for contig in contigs}
            # Deep-copy so do_validation/fix_pos below cannot mutate the
            # intervals that the merging stage will consume later.
            for sv_type in sv_types:
                if sv_type in intervals[toolname]:
                    intervals_tool.extend([
                        copy.deepcopy(interval)
                        for interval in intervals[toolname][sv_type]
                    ])
            for interval in intervals_tool:
                # Marghoob says that this is just to fill-in some metadata
                interval.do_validation(overlap_ratio)

                interval.fix_pos()
                chr_intervals_tool[interval.chrom].append(interval)

            # Emit records sorted per contig, in the contigs' reference order,
            # so the output can be tabix-indexed below.
            for contig in contigs:
                chr_intervals_tool[contig.name].sort()
                for interval in chr_intervals_tool[contig.name]:
                    vcf_record = interval.to_vcf_record(fasta_handle, sample)
                    if vcf_record is not None:
                        vcf_writer.write_record(vcf_record)
            # NOTE(review): the underlying fd is closed before
            # vcf_writer.close(); PyVCF's close() flushes/closes its stream,
            # so this order looks fragile — verify it is intentional.
            tool_out_fd.close()
            vcf_writer.close()
            logger.info("Indexing single tool VCF for %s" % (str(toolname)))
            # Compresses tool_out to .vcf.gz and builds a .tbi index.
            pysam.tabix_index(tool_out, force=True, preset="vcf")

    # Do merging here
    logger.info("Do merging")
    for sv_type in sv_types:
        logger.info("Processing SVs of type %s" % sv_type)
        tool_merged_intervals[sv_type] = []

        # Do the intra-tool merging: collapse overlapping calls made by the
        # same tool before comparing calls across tools.
        logger.info("Intra-tool Merging SVs of type %s" % sv_type)
        for tool in tools:
            logger.debug("Is %s in tool keys? %s" %
                         (sv_type, str(intervals[tool].keys())))
            if sv_type not in intervals[tool]:
                logger.debug("%s not in tool %s" % (sv_type, tool))
                continue
            logger.info("First level merging for %s for tool %s" %
                        (sv_type, tool))
            tool_merged_intervals[sv_type] += merge_intervals(
                intervals[tool][sv_type])

        # Do the inter-tool merging across all tools' first-level results.
        logger.info("Inter-tool Merging SVs of type %s" % sv_type)
        merged_intervals = merge_intervals(tool_merged_intervals[sv_type])

        # Intervals which do NOT overlap well with merged_intervals.
        intervals1 = []
        # Intervals which DO overlap well with merged_intervals.
        # Used to filter out small intervals which got merged with large intervals
        # NOTE(review): the original comments here described intervals1/intervals2
        # the other way around; the code below clearly routes overlapping
        # intervals to intervals2 — comments fixed to match the code.
        intervals2 = []

        logger.info("Checking overlaps SVs of type %s" % sv_type)
        for interval in tool_merged_intervals[sv_type]:
            if interval_overlaps_interval_list(interval, merged_intervals,
                                               overlap_ratio, overlap_ratio):
                intervals2.append(interval)
            else:
                intervals1.append(interval)
        # Re-merge the two partitions independently so small calls are not
        # swallowed by large ones, then add both to the final call set.
        final_intervals.extend(
            merge_intervals(intervals1) + merge_intervals(intervals2))

    # Validate and position-fix the merged calls, bucketed by chromosome.
    final_chr_intervals = {contig.name: [] for contig in contigs}
    for interval in final_intervals:
        interval.do_validation(overlap_ratio)
        interval.fix_pos()
        final_chr_intervals[interval.chrom].append(interval)

    # This is the merged VCF without assembly, ok for deletions at this point
    logger.info("Output merged VCF without assembly ")
    vcf_template_reader = vcf.Reader(
        open(os.path.join(mydir, "resources/template.vcf"), "r"))
    vcf_template_reader.samples = [sample]
    preasm_vcf = os.path.join(workdir, "pre_asm.vcf")
    vcf_fd = open(preasm_vcf, "w")
    vcf_writer = vcf.Writer(vcf_fd, vcf_template_reader)

    # Counts of emitted records keyed by
    # (sv_type, PASS/LowQual, PRECISE/IMPRECISE, contributing tools).
    final_stats = {}

    bed_intervals = []
    merged_bed = os.path.join(workdir, "metasv.bed")
    for contig in contigs:
        final_chr_intervals[contig.name].sort()
        for interval in final_chr_intervals[contig.name]:
            vcf_record = interval.to_vcf_record(fasta_handle)
            if vcf_record is not None:
                key = (interval.sv_type,
                       "PASS" if interval.is_validated else "LowQual",
                       "PRECISE" if interval.is_precise else "IMPRECISE",
                       tuple(sorted(list(interval.sources))))
                if key not in final_stats:
                    final_stats[key] = 0
                final_stats[key] += 1
                vcf_writer.write_record(vcf_record)
            # BED intervals are collected even when the VCF record is None,
            # so the assembly stage may see more intervals than the VCF does.
            bed_interval = interval.to_bed_interval(sample)
            if bed_interval is not None:
                bed_intervals.append(bed_interval)

    # Also save a BED file representation of the merged variants without assembly
    pybedtools.BedTool(bed_intervals).saveas(merged_bed)
    # NOTE(review): as above, the fd is closed before vcf_writer.close();
    # verify this ordering is intentional.
    vcf_fd.close()
    vcf_writer.close()

    # Log the per-category record counts gathered above.
    for key in sorted(final_stats.keys()):
        logger.info(str(key) + ":" + str(final_stats[key]))

    final_vcf = os.path.join(outdir, "variants.vcf")

    # Run assembly here: local assembly (SPAdes) + breakpoint refinement (AGE)
    # to turn imprecise merged calls into precise ones.
    if not disable_assembly:
        logger.info("Running assembly")
        # NOTE(review): these executable checks appear to duplicate the ones
        # performed earlier in run_metasv before any work started.
        if spades is None:
            logger.error("Spades executable not specified")
            return 1

        if age is None:
            logger.error("AGE executable not specified")
            return 1

        spades_tmpdir = os.path.join(workdir, "spades")
        age_tmpdir = os.path.join(workdir, "age")

        create_dirs([spades_tmpdir, age_tmpdir])

        assembly_bed = merged_bed

        # this does the improved assembly location finder with softclipped reads
        if boost_ins:
            logger.info("Generating intervals for insertions")
            assembly_bed = parallel_generate_sc_intervals(
                [bam.name],
                list(contig_whitelist),
                merged_bed,
                workdir,
                num_threads=num_threads,
                min_support=min_support,
                min_support_frac=min_support_frac,
                max_intervals=max_intervals)
            logger.info("Generated intervals for assembly in %s" %
                        assembly_bed)

        logger.info("Will run assembly now")

        # Assemble reads around each interval; intervals skipped by assembly
        # are returned in ignored_bed so they are not lost from the output.
        assembled_fasta, ignored_bed = run_spades_parallel(
            bam=bam.name,
            spades=spades,
            bed=assembly_bed,
            work=spades_tmpdir,
            pad=SPADES_PAD,
            nthreads=num_threads,
            chrs=list(contig_whitelist),
            disable_deletion_assembly=disable_deletion_assembly,
            stop_on_fail=stop_spades_on_fail)
        # Align assembled contigs back to the reference with AGE to refine
        # breakpoint positions.
        breakpoints_bed = run_age_parallel(
            intervals_bed=assembly_bed,
            reference=reference,
            assembly=assembled_fasta,
            pad=AGE_PAD,
            age=age,
            chrs=list(contig_whitelist),
            nthreads=num_threads,
            min_contig_len=AGE_MIN_CONTIG_LENGTH,
            age_workdir=age_tmpdir)

        # Combine refined breakpoints with the intervals assembly ignored.
        final_bed = os.path.join(workdir, "final.bed")
        if ignored_bed:
            pybedtools.BedTool(breakpoints_bed) \
                .cat(pybedtools.BedTool(ignored_bed), postmerge=False) \
                .sort().saveas(final_bed)
        else:
            pybedtools.BedTool(breakpoints_bed).saveas(final_bed)

        logger.info("Output final VCF file")

        convert_metasv_bed_to_vcf(bedfile=final_bed,
                                  vcf_out=final_vcf,
                                  sample=sample,
                                  pass_calls=False)
    else:
        # Assembly disabled: the pre-assembly VCF is the final result.
        # NOTE(review): tabix indexing happens only on this branch; the
        # assembly branch presumably indexes inside convert_metasv_bed_to_vcf
        # — confirm.
        shutil.copy(preasm_vcf, final_vcf)
        pysam.tabix_index(final_vcf, force=True, preset="vcf")

    logger.info("Clean up pybedtools")

    # Remove all temp files pybedtools created during this run.
    pybedtools.cleanup(remove_all=True)

    logger.info("All Done!")